简体   繁体   English

无法使用 selenium webdriver python 下载目标用户的 Instagram 帖子中的图像

[英]Unable to download images in Instagram posts of target user using selenium webdriver python

I am trying to download all instagram posts of a particular person by searching username in searchbox.我正在尝试通过在搜索框中搜索用户名来下载特定人的所有 Instagram 帖子。 After I located to target profile using selenium webdriver, I am using在我使用 selenium webdriver 定位到目标配置文件后,我正在使用

driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")

to scroll down to all posts present in that profile.向下滚动到该个人资料中的所有帖子。 Later I am trying to find all urls of those images.后来我试图找到这些图像的所有网址。 Unfortunately, i am able to get 30 image urls only of 37 images.不幸的是,我只能获得 37 张图片中的 30 张图片网址。 But, I can see the remaining 7 images in browser by inspecting the browser.但是,我可以通过检查浏览器在浏览器中看到剩余的 7 张图像。

My code我的代码

from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import requests
import shutil
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

class App:
    def __init__(self,username='*******',password='*****',target_username='******',
                                                                     path="C:\\Users\\ranga\\Music\\Selenium\\photos"):

        self.username=username
        self.password=password
        self.target_username=target_username
        self.path=path
        self.driver=webdriver.Firefox(executable_path="C:\\Users\\ranga\\Music\\Selenium\\geckodriver.exe")
        self.main_url="https://www.instagram.com"
        self.driver.get(self.main_url)
        self.error=False
        sleep(3)
        # login function
        self.log_in()
        sleep(2)
        if self.error is False:
            self.close_dialogbox_if_there()
            self.open_target_profile()
        if self.error is False:
            self.scroll_down()
        if self.error is False:
            if not os.path.exists(path):
                os.mkdir(path)
            self.downloading_images()
        self.driver.close()

    def downloading_images(self):
        #sleep(5)
        #print(self.driver.page_source)
        soup=BeautifulSoup(self.driver.page_source,'html.parser')
        self.driver.set_page_load_timeout(3)
        #print(soup.prettify())
        all_images=soup.find_all('img')
        print('length of all images:',len(all_images))
        for index,img in enumerate(all_images):
            #print('img:{0}'.format(img))
            #input('Stop for now')
            file_name='image_'+str(index)+'.jpg'
            image_path=os.path.join(self.path,file_name)
            link=img['src']
            print('Downloading Image..',index,':',link)
            response=requests.get(link,stream=True)
            try:
                with open(image_path,'wb') as file:
                    shutil.copyfileobj(response.raw,file)
            except Exception as e:
                print(e)
                print('Could not download image number',index)

    def close_dialogbox_if_there(self):
        try:
            sleep(2)
            close_button=self.driver.find_element_by_xpath("/html/body/div[3]/div/div/div[3]/button[2]")
            close_button.click()
        except Exception:
            pass
    def open_target_profile(self):
        try:
            search_bar=self.driver.find_element_by_xpath('//input[@placeholder="Search"]')
            search_bar.send_keys(self.target_username)
            target_profile=self.main_url + '/' + self.target_username +'/'
            self.driver.get(target_profile)
            sleep(2)
        except Exception:
            self.error=True
            print('Could not find Search bar')

    def scroll_down(self):
        try:
            no_of_posts=self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/div/header/section/ul/li[1]/span/span')
            no_of_posts=str(no_of_posts.text).replace(',','')
            self.no_of_posts=int(no_of_posts)
            if self.no_of_posts>12:
                no_of_scrolls= int(self.no_of_posts/12) + 1
                for value in range(no_of_scrolls):
                    self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
                    sleep(2)
        except Exception:
            self.error=True
            print('Could not find number of posts while scroll down')

    def log_in(self):
        try:
            login_button=self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/article/div[2]/div[2]/p/a')
            login_button.click()
            sleep(2)
            try:
                user_name_input=self.driver.find_element_by_xpath('//input[@name="username"]')
                user_name_input.send_keys(self.username)
                password_input=self.driver.find_element_by_xpath('//input[@name="password"]')
                password_input.send_keys(self.password)
                password_input.submit()
            except Exception:
                print('Some exception occured while trying to find username or password')
                self.error=True
        except Exception:
            self.error=True
            print('Unable to find login button')


if __name__ == '__main__':
    app=App()

Below are my logs以下是我的日志


DevTools listening on ws://127.0.0.1:59120/devtools/browser/8310d943-619d-4278-9d52-4ae4aa68047f
length of all images: 30
Downloading Image.. 0
Downloading Image.. 1
Downloading Image.. 2
Downloading Image.. 3
Downloading Image.. 4
Downloading Image.. 5
Downloading Image.. 6
Downloading Image.. 7
Downloading Image.. 8
Downloading Image.. 9
Downloading Image.. 10
Downloading Image.. 11
Downloading Image.. 12
Downloading Image.. 13
Downloading Image.. 14
Downloading Image.. 15
Downloading Image.. 16
Downloading Image.. 17
Downloading Image.. 18
Downloading Image.. 19
Downloading Image.. 20
Downloading Image.. 21
Downloading Image.. 22
Downloading Image.. 23
Downloading Image.. 24
Downloading Image.. 25
Downloading Image.. 26
Downloading Image.. 27
Downloading Image.. 28
Downloading Image.. 29
Traceback (most recent call last):
  File "part_1_login.py", line 119, in <module>
    app=App()
  File "part_1_login.py", line 32, in __init__
    self.downloading_images()
  File "part_1_login.py", line 50, in downloading_images
    response=requests.get(link,stream=True)
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 60, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 519, in request
    prep = self.prepare_request(req)
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 462, in prepare_request
    hooks=merge_hooks(request.hooks, self.hooks),
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\models.py", line 313, in prepare
    self.prepare_url(url, params)
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\models.py", line 387, in prepare_url
    raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '/static/images/web/mobile_nav_type_logo.png/735145cfe0a4.png': No schema supplied. Perhaps you meant http:///static/images/web/mobile_nav_type_logo.png/735145cfe0a4.png?

It couldn't download the instagram logo image as it doesn't have the absolute/full image path.so I just added if loop to download only images which has the full path.它无法下载 instagram 徽标图像,因为它没有绝对/完整图像路径。所以我只是添加了 if 循环以仅下载具有完整路径的图像。 Please check and let me know if it helps.请检查并让我知道它是否有帮助。

from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import requests
import shutil
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

class App:
    def __init__(self,username='*******',password='*****',target_username='******',
                                                                     path="C:\\Users\\ranga\\Music\\Selenium\\photos"):

        self.username=username
        self.password=password
        self.target_username=target_username
        self.path=path
        self.driver=webdriver.Firefox(executable_path="C:\\Users\\ranga\\Music\\Selenium\\geckodriver.exe")
        self.driver.implicitly_wait(15)        
        self.main_url="https://www.instagram.com"
        self.driver.get(self.main_url)
        self.error=False
        sleep(3)
        # login function
        self.log_in()
        sleep(2)
        if self.error is False:
            self.close_dialogbox_if_there()
            self.open_target_profile()
        if self.error is False:
            self.scroll_down()
        if self.error is False:
            if not os.path.exists(path):
                os.mkdir(path)
            self.downloading_images()
        self.driver.close()

    def downloading_images(self):
        #sleep(5)
        #print(self.driver.page_source)
        soup=BeautifulSoup(self.driver.page_source,'html.parser')
        self.driver.set_page_load_timeout(3)
        #print(soup.prettify())
        all_images=soup.find_all('img')
        print('length of all images:',len(all_images))
        for index,img in enumerate(all_images):
            #print('img:{0}'.format(img))
            #input('Stop for now')
            file_name='image_'+str(index)+'.jpg'
            image_path=os.path.join(self.path,file_name)
            link=img['src']
            print link
            if "http" in link:
                print('Downloading Image..',index,':',link)
                response=requests.get(link,stream=True)
                try:
                    with open(image_path,'wb') as file:
                        shutil.copyfileobj(response.raw,file)
                except Exception as e:
                    print(e)
                    print('Could not download image number',index)

    def close_dialogbox_if_there(self):
        try:
            sleep(2)
            close_button=self.driver.find_element_by_xpath("/html/body/div[3]/div/div/div[3]/button[2]")
            close_button.click()
        except Exception:
            pass
    def open_target_profile(self):
        try:
            search_bar=self.driver.find_element_by_xpath('//input[@placeholder="Search"]')
            search_bar.send_keys(self.target_username)
            target_profile=self.main_url + '/' + self.target_username +'/'
            self.driver.get(target_profile)
            sleep(2)
        except Exception:
            self.error=True
            print('Could not find Search bar')

    def scroll_down(self):
        try:
            no_of_posts=self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/div/header/section/ul/li[1]/span/span')
            no_of_posts=str(no_of_posts.text).replace(',','')
            self.no_of_posts=int(no_of_posts)
            if self.no_of_posts>12:
                no_of_scrolls= int(self.no_of_posts/12) + 1
                for value in range(no_of_scrolls):
                    self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
                    sleep(2)
        except Exception:
            self.error=True
            print('Could not find number of posts while scroll down')

    def log_in(self):
        try:
            login_button=self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/article/div[2]/div[2]/p/a')
            login_button.click()
            sleep(2)
            try:
                user_name_input=self.driver.find_element_by_xpath('//input[@name="username"]')
                user_name_input.send_keys(self.username)
                password_input=self.driver.find_element_by_xpath('//input[@name="password"]')
                password_input.send_keys(self.password)
                password_input.submit()
            except Exception:
                print('Some exception occured while trying to find username or password')
                self.error=True
        except Exception:
            self.error=True
            print('Unable to find login button')


if __name__ == '__main__':
    app=App()

Update:更新:

Looks like few images are getting removed from the DOM/html page if you scroll to the bottom of the page.So we need to find some other logic to extract all the image.如果滚动到页面底部,似乎很少有图像会从 DOM/html 页面中删除。所以我们需要找到一些其他逻辑来提取所有图像。 I am looking into it.我正在调查它。 I will let you know if i get something for your problem.如果我能解决您的问题,我会通知您。

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM