简体   繁体   中英

Unable to download images in Instagram posts of target user using selenium webdriver python

I am trying to download all instagram posts of a particular person by searching username in searchbox. After I located to target profile using selenium webdriver, I am using

driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")

to scroll down to all posts present in that profile. Later I am trying to find all urls of those images. Unfortunately, i am able to get 30 image urls only of 37 images. But, I can see the remaining 7 images in browser by inspecting the browser.

My code

from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import requests
import shutil
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

class App:
    def __init__(self,username='*******',password='*****',target_username='******',
                                                                     path="C:\\Users\\ranga\\Music\\Selenium\\photos"):

        self.username=username
        self.password=password
        self.target_username=target_username
        self.path=path
        self.driver=webdriver.Firefox(executable_path="C:\\Users\\ranga\\Music\\Selenium\\geckodriver.exe")
        self.main_url="https://www.instagram.com"
        self.driver.get(self.main_url)
        self.error=False
        sleep(3)
        # login function
        self.log_in()
        sleep(2)
        if self.error is False:
            self.close_dialogbox_if_there()
            self.open_target_profile()
        if self.error is False:
            self.scroll_down()
        if self.error is False:
            if not os.path.exists(path):
                os.mkdir(path)
            self.downloading_images()
        self.driver.close()

    def downloading_images(self):
        #sleep(5)
        #print(self.driver.page_source)
        soup=BeautifulSoup(self.driver.page_source,'html.parser')
        self.driver.set_page_load_timeout(3)
        #print(soup.prettify())
        all_images=soup.find_all('img')
        print('length of all images:',len(all_images))
        for index,img in enumerate(all_images):
            #print('img:{0}'.format(img))
            #input('Stop for now')
            file_name='image_'+str(index)+'.jpg'
            image_path=os.path.join(self.path,file_name)
            link=img['src']
            print('Downloading Image..',index,':',link)
            response=requests.get(link,stream=True)
            try:
                with open(image_path,'wb') as file:
                    shutil.copyfileobj(response.raw,file)
            except Exception as e:
                print(e)
                print('Could not download image number',index)

    def close_dialogbox_if_there(self):
        try:
            sleep(2)
            close_button=self.driver.find_element_by_xpath("/html/body/div[3]/div/div/div[3]/button[2]")
            close_button.click()
        except Exception:
            pass
    def open_target_profile(self):
        try:
            search_bar=self.driver.find_element_by_xpath('//input[@placeholder="Search"]')
            search_bar.send_keys(self.target_username)
            target_profile=self.main_url + '/' + self.target_username +'/'
            self.driver.get(target_profile)
            sleep(2)
        except Exception:
            self.error=True
            print('Could not find Search bar')

    def scroll_down(self):
        try:
            no_of_posts=self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/div/header/section/ul/li[1]/span/span')
            no_of_posts=str(no_of_posts.text).replace(',','')
            self.no_of_posts=int(no_of_posts)
            if self.no_of_posts>12:
                no_of_scrolls= int(self.no_of_posts/12) + 1
                for value in range(no_of_scrolls):
                    self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
                    sleep(2)
        except Exception:
            self.error=True
            print('Could not find number of posts while scroll down')

    def log_in(self):
        try:
            login_button=self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/article/div[2]/div[2]/p/a')
            login_button.click()
            sleep(2)
            try:
                user_name_input=self.driver.find_element_by_xpath('//input[@name="username"]')
                user_name_input.send_keys(self.username)
                password_input=self.driver.find_element_by_xpath('//input[@name="password"]')
                password_input.send_keys(self.password)
                password_input.submit()
            except Exception:
                print('Some exception occured while trying to find username or password')
                self.error=True
        except Exception:
            self.error=True
            print('Unable to find login button')


if __name__ == '__main__':
    app=App()

Below are my logs


DevTools listening on ws://127.0.0.1:59120/devtools/browser/8310d943-619d-4278-9d52-4ae4aa68047f
length of all images: 30
Downloading Image.. 0
Downloading Image.. 1
Downloading Image.. 2
Downloading Image.. 3
Downloading Image.. 4
Downloading Image.. 5
Downloading Image.. 6
Downloading Image.. 7
Downloading Image.. 8
Downloading Image.. 9
Downloading Image.. 10
Downloading Image.. 11
Downloading Image.. 12
Downloading Image.. 13
Downloading Image.. 14
Downloading Image.. 15
Downloading Image.. 16
Downloading Image.. 17
Downloading Image.. 18
Downloading Image.. 19
Downloading Image.. 20
Downloading Image.. 21
Downloading Image.. 22
Downloading Image.. 23
Downloading Image.. 24
Downloading Image.. 25
Downloading Image.. 26
Downloading Image.. 27
Downloading Image.. 28
Downloading Image.. 29
Traceback (most recent call last):
  File "part_1_login.py", line 119, in <module>
    app=App()
  File "part_1_login.py", line 32, in __init__
    self.downloading_images()
  File "part_1_login.py", line 50, in downloading_images
    response=requests.get(link,stream=True)
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 60, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 519, in request
    prep = self.prepare_request(req)
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 462, in prepare_request
    hooks=merge_hooks(request.hooks, self.hooks),
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\models.py", line 313, in prepare
    self.prepare_url(url, params)
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\models.py", line 387, in prepare_url
    raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '/static/images/web/mobile_nav_type_logo.png/735145cfe0a4.png': No schema supplied. Perhaps you meant http:///static/images/web/mobile_nav_type_logo.png/735145cfe0a4.png?

It couldn't download the instagram logo image as it doesn't have the absolute/full image path.so I just added if loop to download only images which has the full path. Please check and let me know if it helps.

from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import requests
import shutil
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

class App:
    def __init__(self,username='*******',password='*****',target_username='******',
                                                                     path="C:\\Users\\ranga\\Music\\Selenium\\photos"):

        self.username=username
        self.password=password
        self.target_username=target_username
        self.path=path
        self.driver=webdriver.Firefox(executable_path="C:\\Users\\ranga\\Music\\Selenium\\geckodriver.exe")
        self.driver.implicitly_wait(15)        
        self.main_url="https://www.instagram.com"
        self.driver.get(self.main_url)
        self.error=False
        sleep(3)
        # login function
        self.log_in()
        sleep(2)
        if self.error is False:
            self.close_dialogbox_if_there()
            self.open_target_profile()
        if self.error is False:
            self.scroll_down()
        if self.error is False:
            if not os.path.exists(path):
                os.mkdir(path)
            self.downloading_images()
        self.driver.close()

    def downloading_images(self):
        #sleep(5)
        #print(self.driver.page_source)
        soup=BeautifulSoup(self.driver.page_source,'html.parser')
        self.driver.set_page_load_timeout(3)
        #print(soup.prettify())
        all_images=soup.find_all('img')
        print('length of all images:',len(all_images))
        for index,img in enumerate(all_images):
            #print('img:{0}'.format(img))
            #input('Stop for now')
            file_name='image_'+str(index)+'.jpg'
            image_path=os.path.join(self.path,file_name)
            link=img['src']
            print link
            if "http" in link:
                print('Downloading Image..',index,':',link)
                response=requests.get(link,stream=True)
                try:
                    with open(image_path,'wb') as file:
                        shutil.copyfileobj(response.raw,file)
                except Exception as e:
                    print(e)
                    print('Could not download image number',index)

    def close_dialogbox_if_there(self):
        try:
            sleep(2)
            close_button=self.driver.find_element_by_xpath("/html/body/div[3]/div/div/div[3]/button[2]")
            close_button.click()
        except Exception:
            pass
    def open_target_profile(self):
        try:
            search_bar=self.driver.find_element_by_xpath('//input[@placeholder="Search"]')
            search_bar.send_keys(self.target_username)
            target_profile=self.main_url + '/' + self.target_username +'/'
            self.driver.get(target_profile)
            sleep(2)
        except Exception:
            self.error=True
            print('Could not find Search bar')

    def scroll_down(self):
        try:
            no_of_posts=self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/div/header/section/ul/li[1]/span/span')
            no_of_posts=str(no_of_posts.text).replace(',','')
            self.no_of_posts=int(no_of_posts)
            if self.no_of_posts>12:
                no_of_scrolls= int(self.no_of_posts/12) + 1
                for value in range(no_of_scrolls):
                    self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
                    sleep(2)
        except Exception:
            self.error=True
            print('Could not find number of posts while scroll down')

    def log_in(self):
        try:
            login_button=self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/article/div[2]/div[2]/p/a')
            login_button.click()
            sleep(2)
            try:
                user_name_input=self.driver.find_element_by_xpath('//input[@name="username"]')
                user_name_input.send_keys(self.username)
                password_input=self.driver.find_element_by_xpath('//input[@name="password"]')
                password_input.send_keys(self.password)
                password_input.submit()
            except Exception:
                print('Some exception occured while trying to find username or password')
                self.error=True
        except Exception:
            self.error=True
            print('Unable to find login button')


if __name__ == '__main__':
    app=App()

Update:

Looks like few images are getting removed from the DOM/html page if you scroll to the bottom of the page.So we need to find some other logic to extract all the image. I am looking into it. I will let you know if i get something for your problem.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM