簡體   English   中英

無法使用 selenium webdriver python 下載目標用戶的 Instagram 帖子中的圖像

[英]Unable to download images in Instagram posts of target user using selenium webdriver python

我正在嘗試通過在搜索框中搜索用戶名來下載特定人的所有 Instagram 帖子。 在我使用 selenium webdriver 定位到目標配置文件后,我正在使用

driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")

向下滾動到該個人資料中的所有帖子。 后來我試圖找到這些圖像的所有網址。 不幸的是,我只能獲得 37 張圖片中的 30 張圖片網址。 但是,我可以通過檢查瀏覽器在瀏覽器中看到剩余的 7 張圖像。

我的代碼

from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import requests
import shutil
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

class App:
    def __init__(self,username='*******',password='*****',target_username='******',
                                                                     path="C:\\Users\\ranga\\Music\\Selenium\\photos"):

        self.username=username
        self.password=password
        self.target_username=target_username
        self.path=path
        self.driver=webdriver.Firefox(executable_path="C:\\Users\\ranga\\Music\\Selenium\\geckodriver.exe")
        self.main_url="https://www.instagram.com"
        self.driver.get(self.main_url)
        self.error=False
        sleep(3)
        # login function
        self.log_in()
        sleep(2)
        if self.error is False:
            self.close_dialogbox_if_there()
            self.open_target_profile()
        if self.error is False:
            self.scroll_down()
        if self.error is False:
            if not os.path.exists(path):
                os.mkdir(path)
            self.downloading_images()
        self.driver.close()

    def downloading_images(self):
        #sleep(5)
        #print(self.driver.page_source)
        soup=BeautifulSoup(self.driver.page_source,'html.parser')
        self.driver.set_page_load_timeout(3)
        #print(soup.prettify())
        all_images=soup.find_all('img')
        print('length of all images:',len(all_images))
        for index,img in enumerate(all_images):
            #print('img:{0}'.format(img))
            #input('Stop for now')
            file_name='image_'+str(index)+'.jpg'
            image_path=os.path.join(self.path,file_name)
            link=img['src']
            print('Downloading Image..',index,':',link)
            response=requests.get(link,stream=True)
            try:
                with open(image_path,'wb') as file:
                    shutil.copyfileobj(response.raw,file)
            except Exception as e:
                print(e)
                print('Could not download image number',index)

    def close_dialogbox_if_there(self):
        try:
            sleep(2)
            close_button=self.driver.find_element_by_xpath("/html/body/div[3]/div/div/div[3]/button[2]")
            close_button.click()
        except Exception:
            pass
    def open_target_profile(self):
        try:
            search_bar=self.driver.find_element_by_xpath('//input[@placeholder="Search"]')
            search_bar.send_keys(self.target_username)
            target_profile=self.main_url + '/' + self.target_username +'/'
            self.driver.get(target_profile)
            sleep(2)
        except Exception:
            self.error=True
            print('Could not find Search bar')

    def scroll_down(self):
        try:
            no_of_posts=self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/div/header/section/ul/li[1]/span/span')
            no_of_posts=str(no_of_posts.text).replace(',','')
            self.no_of_posts=int(no_of_posts)
            if self.no_of_posts>12:
                no_of_scrolls= int(self.no_of_posts/12) + 1
                for value in range(no_of_scrolls):
                    self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
                    sleep(2)
        except Exception:
            self.error=True
            print('Could not find number of posts while scroll down')

    def log_in(self):
        try:
            login_button=self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/article/div[2]/div[2]/p/a')
            login_button.click()
            sleep(2)
            try:
                user_name_input=self.driver.find_element_by_xpath('//input[@name="username"]')
                user_name_input.send_keys(self.username)
                password_input=self.driver.find_element_by_xpath('//input[@name="password"]')
                password_input.send_keys(self.password)
                password_input.submit()
            except Exception:
                print('Some exception occured while trying to find username or password')
                self.error=True
        except Exception:
            self.error=True
            print('Unable to find login button')


if __name__ == '__main__':
    app=App()

以下是我的日志


DevTools listening on ws://127.0.0.1:59120/devtools/browser/8310d943-619d-4278-9d52-4ae4aa68047f
length of all images: 30
Downloading Image.. 0
Downloading Image.. 1
Downloading Image.. 2
Downloading Image.. 3
Downloading Image.. 4
Downloading Image.. 5
Downloading Image.. 6
Downloading Image.. 7
Downloading Image.. 8
Downloading Image.. 9
Downloading Image.. 10
Downloading Image.. 11
Downloading Image.. 12
Downloading Image.. 13
Downloading Image.. 14
Downloading Image.. 15
Downloading Image.. 16
Downloading Image.. 17
Downloading Image.. 18
Downloading Image.. 19
Downloading Image.. 20
Downloading Image.. 21
Downloading Image.. 22
Downloading Image.. 23
Downloading Image.. 24
Downloading Image.. 25
Downloading Image.. 26
Downloading Image.. 27
Downloading Image.. 28
Downloading Image.. 29
Traceback (most recent call last):
  File "part_1_login.py", line 119, in <module>
    app=App()
  File "part_1_login.py", line 32, in __init__
    self.downloading_images()
  File "part_1_login.py", line 50, in downloading_images
    response=requests.get(link,stream=True)
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 60, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 519, in request
    prep = self.prepare_request(req)
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 462, in prepare_request
    hooks=merge_hooks(request.hooks, self.hooks),
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\models.py", line 313, in prepare
    self.prepare_url(url, params)
  File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\models.py", line 387, in prepare_url
    raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '/static/images/web/mobile_nav_type_logo.png/735145cfe0a4.png': No schema supplied. Perhaps you meant http:///static/images/web/mobile_nav_type_logo.png/735145cfe0a4.png?

它無法下載 instagram 徽標圖像,因為它沒有絕對/完整圖像路徑。所以我只是添加了 if 循環以僅下載具有完整路徑的圖像。 請檢查並讓我知道它是否有幫助。

from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import requests
import shutil
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

class App:
    def __init__(self,username='*******',password='*****',target_username='******',
                                                                     path="C:\\Users\\ranga\\Music\\Selenium\\photos"):

        self.username=username
        self.password=password
        self.target_username=target_username
        self.path=path
        self.driver=webdriver.Firefox(executable_path="C:\\Users\\ranga\\Music\\Selenium\\geckodriver.exe")
        self.driver.implicitly_wait(15)        
        self.main_url="https://www.instagram.com"
        self.driver.get(self.main_url)
        self.error=False
        sleep(3)
        # login function
        self.log_in()
        sleep(2)
        if self.error is False:
            self.close_dialogbox_if_there()
            self.open_target_profile()
        if self.error is False:
            self.scroll_down()
        if self.error is False:
            if not os.path.exists(path):
                os.mkdir(path)
            self.downloading_images()
        self.driver.close()

    def downloading_images(self):
        #sleep(5)
        #print(self.driver.page_source)
        soup=BeautifulSoup(self.driver.page_source,'html.parser')
        self.driver.set_page_load_timeout(3)
        #print(soup.prettify())
        all_images=soup.find_all('img')
        print('length of all images:',len(all_images))
        for index,img in enumerate(all_images):
            #print('img:{0}'.format(img))
            #input('Stop for now')
            file_name='image_'+str(index)+'.jpg'
            image_path=os.path.join(self.path,file_name)
            link=img['src']
            print link
            if "http" in link:
                print('Downloading Image..',index,':',link)
                response=requests.get(link,stream=True)
                try:
                    with open(image_path,'wb') as file:
                        shutil.copyfileobj(response.raw,file)
                except Exception as e:
                    print(e)
                    print('Could not download image number',index)

    def close_dialogbox_if_there(self):
        try:
            sleep(2)
            close_button=self.driver.find_element_by_xpath("/html/body/div[3]/div/div/div[3]/button[2]")
            close_button.click()
        except Exception:
            pass
    def open_target_profile(self):
        try:
            search_bar=self.driver.find_element_by_xpath('//input[@placeholder="Search"]')
            search_bar.send_keys(self.target_username)
            target_profile=self.main_url + '/' + self.target_username +'/'
            self.driver.get(target_profile)
            sleep(2)
        except Exception:
            self.error=True
            print('Could not find Search bar')

    def scroll_down(self):
        try:
            no_of_posts=self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/div/header/section/ul/li[1]/span/span')
            no_of_posts=str(no_of_posts.text).replace(',','')
            self.no_of_posts=int(no_of_posts)
            if self.no_of_posts>12:
                no_of_scrolls= int(self.no_of_posts/12) + 1
                for value in range(no_of_scrolls):
                    self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
                    sleep(2)
        except Exception:
            self.error=True
            print('Could not find number of posts while scroll down')

    def log_in(self):
        try:
            login_button=self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/article/div[2]/div[2]/p/a')
            login_button.click()
            sleep(2)
            try:
                user_name_input=self.driver.find_element_by_xpath('//input[@name="username"]')
                user_name_input.send_keys(self.username)
                password_input=self.driver.find_element_by_xpath('//input[@name="password"]')
                password_input.send_keys(self.password)
                password_input.submit()
            except Exception:
                print('Some exception occured while trying to find username or password')
                self.error=True
        except Exception:
            self.error=True
            print('Unable to find login button')


if __name__ == '__main__':
    app=App()

更新:

如果滾動到頁面底部,似乎很少有圖像會從 DOM/html 頁面中刪除。所以我們需要找到一些其他邏輯來提取所有圖像。 我正在調查它。 如果我能解決您的問題,我會通知您。

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM