![](/img/trans.png)
[英]how to download posts data(images/videos) from instagram with python
[英]Unable to download images in Instagram posts of target user using selenium webdriver python
我正在嘗試通過在搜索框中搜索用戶名來下載特定用戶的所有 Instagram 帖子。在使用 selenium webdriver 打開目標用戶的個人資料頁面後,我使用
driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
向下滾動以加載該個人資料中的所有帖子。之後我嘗試找出這些圖像的所有網址。不幸的是,我只能獲得 37 張圖片中 30 張的網址。但是,通過瀏覽器的檢查元素功能,我可以在瀏覽器中看到其餘的 7 張圖像。
我的代碼
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import requests
import shutil
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
class App:
    """Drive Firefox through Instagram and save a profile's <img> sources.

    The whole workflow runs from the constructor: log in, open the target
    profile, scroll the page, then download every <img> found in the page
    source. Any step that fails sets ``self.error`` to True, which makes the
    remaining steps get skipped.
    """

    def __init__(self,username='*******',password='*****',target_username='******',
                 path="C:\\Users\\ranga\\Music\\Selenium\\photos"):
        """Store the settings, start the browser and run the workflow.

        Args:
            username: Account name typed into the login form.
            password: Password typed into the login form.
            target_username: Profile whose page is opened and scraped.
            path: Directory the images are written to; created if missing.
        """
        self.username = username
        self.password = password
        self.target_username = target_username
        self.path = path
        self.driver = webdriver.Firefox(executable_path="C:\\Users\\ranga\\Music\\Selenium\\geckodriver.exe")
        self.main_url = "https://www.instagram.com"
        self.driver.get(self.main_url)
        self.error = False
        sleep(3)
        self.log_in()
        sleep(2)
        self._run_after_login()
        # Reached only when no step raised; the window is closed even if a
        # step merely flagged self.error.
        self.driver.close()

    def _run_after_login(self):
        """Run the post-login steps, stopping at the first flagged error."""
        if self.error is not False:
            return
        self.close_dialogbox_if_there()
        self.open_target_profile()
        if self.error is not False:
            return
        self.scroll_down()
        if self.error is not False:
            return
        if not os.path.exists(self.path):
            os.mkdir(self.path)
        self.downloading_images()

    def downloading_images(self):
        """Save the ``src`` of every <img> in the current page to ``self.path``.

        Files are named ``image_<index>.jpg`` by position in the page source.
        """
        page = BeautifulSoup(self.driver.page_source, 'html.parser')
        self.driver.set_page_load_timeout(3)
        img_tags = page.find_all('img')
        print('length of all images:', len(img_tags))
        for position, tag in enumerate(img_tags):
            destination = os.path.join(self.path, 'image_{}.jpg'.format(position))
            link = tag['src']
            print('Downloading Image..', position, ':', link)
            # NOTE: the request is made outside the try block, so an invalid
            # URL (e.g. a relative src) raises out of this method.
            response = requests.get(link, stream=True)
            try:
                with open(destination, 'wb') as out_file:
                    shutil.copyfileobj(response.raw, out_file)
            except Exception as exc:
                print(exc)
                print('Could not download image number', position)

    def close_dialogbox_if_there(self):
        """Click the dialog button at a fixed XPath; ignore any failure."""
        try:
            sleep(2)
            self.driver.find_element_by_xpath(
                "/html/body/div[3]/div/div/div[3]/button[2]").click()
        except Exception:
            # Best effort: presumably the dialog is not always shown.
            pass

    def open_target_profile(self):
        """Type the target name into the search box, then load its profile URL.

        Sets ``self.error`` if any step raises.
        """
        try:
            search_box = self.driver.find_element_by_xpath('//input[@placeholder="Search"]')
            search_box.send_keys(self.target_username)
            profile_url = '/'.join([self.main_url, self.target_username, ''])
            self.driver.get(profile_url)
            sleep(2)
        except Exception:
            self.error = True
            print('Could not find Search bar')

    def scroll_down(self):
        """Scroll to the page bottom repeatedly, based on the post count.

        The count is read from a fixed XPath (assumed to be the profile's
        post counter) and stored in ``self.no_of_posts``. Sets ``self.error``
        if reading or scrolling raises.
        """
        try:
            counter = self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/div/header/section/ul/li[1]/span/span')
            # Thousands separators are stripped before converting to int.
            self.no_of_posts = int(str(counter.text).replace(',', ''))
            if self.no_of_posts > 12:
                # One scroll per 12 posts plus one extra; presumably 12 posts
                # are loaded per batch.
                for _ in range(int(self.no_of_posts / 12) + 1):
                    self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
                    sleep(2)
        except Exception:
            self.error = True
            print('Could not find number of posts while scroll down')

    def log_in(self):
        """Click the login link, then fill in and submit the credentials.

        Sets ``self.error`` if either stage raises.
        """
        try:
            self.driver.find_element_by_xpath(
                '//*[@id="react-root"]/section/main/article/div[2]/div[2]/p/a').click()
            sleep(2)
        except Exception:
            self.error = True
            print('Unable to find login button')
            return

        try:
            name_field = self.driver.find_element_by_xpath('//input[@name="username"]')
            name_field.send_keys(self.username)
            secret_field = self.driver.find_element_by_xpath('//input[@name="password"]')
            secret_field.send_keys(self.password)
            secret_field.submit()
        except Exception:
            print('Some exception occured while trying to find username or password')
            self.error = True
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    app = App()
以下是我的日志
DevTools listening on ws://127.0.0.1:59120/devtools/browser/8310d943-619d-4278-9d52-4ae4aa68047f
length of all images: 30
Downloading Image.. 0
Downloading Image.. 1
Downloading Image.. 2
Downloading Image.. 3
Downloading Image.. 4
Downloading Image.. 5
Downloading Image.. 6
Downloading Image.. 7
Downloading Image.. 8
Downloading Image.. 9
Downloading Image.. 10
Downloading Image.. 11
Downloading Image.. 12
Downloading Image.. 13
Downloading Image.. 14
Downloading Image.. 15
Downloading Image.. 16
Downloading Image.. 17
Downloading Image.. 18
Downloading Image.. 19
Downloading Image.. 20
Downloading Image.. 21
Downloading Image.. 22
Downloading Image.. 23
Downloading Image.. 24
Downloading Image.. 25
Downloading Image.. 26
Downloading Image.. 27
Downloading Image.. 28
Downloading Image.. 29
Traceback (most recent call last):
File "part_1_login.py", line 119, in <module>
app=App()
File "part_1_login.py", line 32, in __init__
self.downloading_images()
File "part_1_login.py", line 50, in downloading_images
response=requests.get(link,stream=True)
File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 519, in request
prep = self.prepare_request(req)
File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 462, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\models.py", line 313, in prepare
self.prepare_url(url, params)
File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\models.py", line 387, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '/static/images/web/mobile_nav_type_logo.png/735145cfe0a4.png': No schema supplied. Perhaps you meant http:///static/images/web/mobile_nav_type_logo.png/735145cfe0a4.png?
它無法下載 Instagram 徽標圖像,因為該圖像的 src 不是絕對(完整)網址。所以我添加了一個 if 條件判斷,只下載具有完整網址的圖像。請檢查並告訴我這是否有幫助。
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import requests
import shutil
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
class App:
    """Log in to Instagram with Firefox and download a profile's images.

    The whole workflow runs from the constructor: log in, open the target
    profile, scroll the page, then download every <img> whose ``src`` is an
    absolute http(s) URL. Any step that fails sets ``self.error`` to True,
    which makes the remaining steps get skipped.
    """

    def __init__(self,username='*******',password='*****',target_username='******',
                 path="C:\\Users\\ranga\\Music\\Selenium\\photos"):
        """Store the settings, start the browser and run the workflow.

        Args:
            username: Account name typed into the login form.
            password: Password typed into the login form.
            target_username: Profile whose page is opened and scraped.
            path: Directory the images are written to; created if missing.
        """
        self.username = username
        self.password = password
        self.target_username = target_username
        self.path = path
        self.driver = webdriver.Firefox(executable_path="C:\\Users\\ranga\\Music\\Selenium\\geckodriver.exe")
        self.driver.implicitly_wait(15)
        self.main_url = "https://www.instagram.com"
        self.error = False
        try:
            self.driver.get(self.main_url)
            sleep(3)
            self.log_in()
            sleep(2)
            if not self.error:
                self.close_dialogbox_if_there()
                self.open_target_profile()
            if not self.error:
                self.scroll_down()
            if not self.error:
                # makedirs also creates missing parent directories and does
                # not fail when the directory already exists.
                os.makedirs(self.path, exist_ok=True)
                self.downloading_images()
        finally:
            # Close the window even when a step raises, so a crash does not
            # leave the browser open.
            self.driver.close()

    @staticmethod
    def _is_absolute_url(link):
        """Return True if ``link`` is a string starting with http:// or https://."""
        if not isinstance(link, str):
            return False
        return link.lower().startswith(('http://', 'https://'))

    def downloading_images(self):
        """Download every <img> in the current page that has an absolute URL.

        Files are named ``image_<index>.jpg`` by position in the page source.
        Images with a missing or relative ``src`` are skipped; a failed
        download is reported and the loop continues with the next image.
        """
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        self.driver.set_page_load_timeout(3)
        all_images = soup.find_all('img')
        print('length of all images:', len(all_images))
        for index, img in enumerate(all_images):
            # .get() returns None for tags without a src attribute instead of
            # raising KeyError.
            link = img.get('src')
            if not self._is_absolute_url(link):
                # requests raises MissingSchema on relative paths, so they
                # are skipped here.
                print('Skipping image', index, ': not an absolute URL:', link)
                continue
            image_path = os.path.join(self.path, 'image_' + str(index) + '.jpg')
            print('Downloading Image..', index, ':', link)
            try:
                with requests.get(link, stream=True) as response:
                    # Check the status first so an HTTP error page is not
                    # saved as a .jpg file.
                    response.raise_for_status()
                    with open(image_path, 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)
            except Exception as e:
                print(e)
                print('Could not download image number', index)

    def close_dialogbox_if_there(self):
        """Click the dialog button at a fixed XPath; ignore any failure."""
        try:
            sleep(2)
            close_button = self.driver.find_element_by_xpath("/html/body/div[3]/div/div/div[3]/button[2]")
            close_button.click()
        except Exception:
            # Deliberate best effort: presumably the dialog is not always shown.
            pass

    def open_target_profile(self):
        """Type the target name into the search box, then load its profile URL.

        Sets ``self.error`` if any step raises.
        """
        try:
            search_bar = self.driver.find_element_by_xpath('//input[@placeholder="Search"]')
            search_bar.send_keys(self.target_username)
            target_profile = self.main_url + '/' + self.target_username + '/'
            self.driver.get(target_profile)
            sleep(2)
        except Exception:
            self.error = True
            print('Could not find Search bar')

    def scroll_down(self):
        """Scroll to the page bottom repeatedly, based on the post count.

        The count is read from a fixed XPath (assumed to be the profile's
        post counter) and stored in ``self.no_of_posts``. Sets ``self.error``
        if reading or scrolling raises.
        """
        try:
            no_of_posts = self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/div/header/section/ul/li[1]/span/span')
            # Thousands separators are stripped before converting to int.
            self.no_of_posts = int(str(no_of_posts.text).replace(',', ''))
            if self.no_of_posts > 12:
                # One scroll per 12 posts plus one extra; presumably 12 posts
                # are loaded per batch.
                no_of_scrolls = self.no_of_posts // 12 + 1
                for _ in range(no_of_scrolls):
                    self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
                    sleep(2)
        except Exception:
            self.error = True
            print('Could not find number of posts while scroll down')

    def log_in(self):
        """Click the login link, then fill in and submit the credentials.

        Sets ``self.error`` if either stage raises.
        """
        try:
            login_button = self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/article/div[2]/div[2]/p/a')
            login_button.click()
            sleep(2)
        except Exception:
            self.error = True
            print('Unable to find login button')
            return

        try:
            user_name_input = self.driver.find_element_by_xpath('//input[@name="username"]')
            user_name_input.send_keys(self.username)
            password_input = self.driver.find_element_by_xpath('//input[@name="password"]')
            password_input.send_keys(self.password)
            password_input.submit()
        except Exception:
            print('Some exception occured while trying to find username or password')
            self.error = True
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    app = App()
更新:
如果滾動到頁面底部,似乎有少數圖像會從 DOM/HTML 頁面中被移除。所以我們需要找到其他邏輯來提取所有圖像。我正在調查此問題,如果能解決您的問題,我會通知您。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.