I am trying to download all instagram posts of a particular person by searching username in searchbox. After I located to target profile using selenium webdriver, I am using
driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
to scroll down to all posts present in that profile. Later I try to collect the URLs of all those images. Unfortunately, I am only able to get 30 image URLs out of the 37 images, even though I can see the remaining 7 images when inspecting the page in the browser.
My code
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import requests
import shutil
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
class App:
    """Log in to Instagram, open a target user's profile, scroll to load
    all posts, and download every image found on the page.

    NOTE(review): the whole workflow runs from __init__, so merely
    constructing an App has heavy side effects (browser automation,
    network requests, file writes).
    """

    def __init__(self, username='*******', password='*****', target_username='******',
                 path="C:\\Users\\ranga\\Music\\Selenium\\photos"):
        """Drive the full login -> profile -> scroll -> download flow.

        :param username: Instagram login name.
        :param password: Instagram password.
        :param target_username: profile whose images are downloaded.
        :param path: local directory where images are saved.
        """
        self.username = username
        self.password = password
        self.target_username = target_username
        self.path = path
        self.driver = webdriver.Firefox(executable_path="C:\\Users\\ranga\\Music\\Selenium\\geckodriver.exe")
        self.main_url = "https://www.instagram.com"
        self.driver.get(self.main_url)
        # self.error is the shared "abort" flag: each step sets it on
        # failure and the following steps are skipped.
        self.error = False
        sleep(3)
        # login function
        self.log_in()
        sleep(2)
        if self.error is False:
            self.close_dialogbox_if_there()
            self.open_target_profile()
        if self.error is False:
            self.scroll_down()
        if self.error is False:
            if not os.path.exists(path):
                os.mkdir(path)
            self.downloading_images()
        self.driver.close()

    def downloading_images(self):
        """Parse the current page source and download each <img> with an
        absolute URL.

        Fixes vs. the original: relative URLs (e.g. the site logo at
        /static/images/...) are skipped instead of crashing requests with
        MissingSchema; the HTTP GET itself is inside the try so network
        errors are reported per-image; the raw stream is decoded so
        gzip-compressed responses are written as valid image files.
        """
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        self.driver.set_page_load_timeout(3)
        all_images = soup.find_all('img')
        print('length of all images:', len(all_images))
        for index, img in enumerate(all_images):
            # .get avoids a KeyError on <img> tags without a src attribute.
            link = img.get('src', '')
            # requests cannot fetch scheme-less paths; skip them.
            if not link.startswith('http'):
                continue
            file_name = 'image_' + str(index) + '.jpg'
            image_path = os.path.join(self.path, file_name)
            print('Downloading Image..', index, ':', link)
            try:
                response = requests.get(link, stream=True)
                # Ensure urllib3 decodes gzip/deflate while streaming.
                response.raw.decode_content = True
                with open(image_path, 'wb') as file:
                    shutil.copyfileobj(response.raw, file)
            except Exception as e:
                print(e)
                print('Could not download image number', index)

    def close_dialogbox_if_there(self):
        """Dismiss the optional post-login dialog; absence is not an error."""
        try:
            sleep(2)
            close_button = self.driver.find_element_by_xpath("/html/body/div[3]/div/div/div[3]/button[2]")
            close_button.click()
        except Exception:
            # Dialog not shown -- nothing to do.
            pass

    def open_target_profile(self):
        """Navigate directly to the target profile URL; flag failure."""
        try:
            search_bar = self.driver.find_element_by_xpath('//input[@placeholder="Search"]')
            search_bar.send_keys(self.target_username)
            # Navigation uses the direct profile URL rather than the
            # search suggestions, which are fragile to automate.
            target_profile = self.main_url + '/' + self.target_username + '/'
            self.driver.get(target_profile)
            sleep(2)
        except Exception:
            self.error = True
            print('Could not find Search bar')

    def scroll_down(self):
        """Scroll once per expected page of posts (12 posts per page)."""
        try:
            no_of_posts = self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/div/header/section/ul/li[1]/span/span')
            # The post count may contain thousands separators, e.g. "1,234".
            no_of_posts = str(no_of_posts.text).replace(',', '')
            self.no_of_posts = int(no_of_posts)
            if self.no_of_posts > 12:
                no_of_scrolls = int(self.no_of_posts / 12) + 1
                for value in range(no_of_scrolls):
                    self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
                    sleep(2)
        except Exception:
            self.error = True
            print('Could not find number of posts while scroll down')

    def log_in(self):
        """Click the login link and submit the credentials; flag failure."""
        try:
            login_button = self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/article/div[2]/div[2]/p/a')
            login_button.click()
            sleep(2)
            try:
                user_name_input = self.driver.find_element_by_xpath('//input[@name="username"]')
                user_name_input.send_keys(self.username)
                password_input = self.driver.find_element_by_xpath('//input[@name="password"]')
                password_input.send_keys(self.password)
                password_input.submit()
            except Exception:
                print('Some exception occured while trying to find username or password')
                self.error = True
        except Exception:
            self.error = True
            print('Unable to find login button')
# Script entry point: constructing App runs the whole
# login -> scroll -> download workflow.
if __name__ == '__main__':
    app = App()
Below are my logs
DevTools listening on ws://127.0.0.1:59120/devtools/browser/8310d943-619d-4278-9d52-4ae4aa68047f
length of all images: 30
Downloading Image.. 0
Downloading Image.. 1
Downloading Image.. 2
Downloading Image.. 3
Downloading Image.. 4
Downloading Image.. 5
Downloading Image.. 6
Downloading Image.. 7
Downloading Image.. 8
Downloading Image.. 9
Downloading Image.. 10
Downloading Image.. 11
Downloading Image.. 12
Downloading Image.. 13
Downloading Image.. 14
Downloading Image.. 15
Downloading Image.. 16
Downloading Image.. 17
Downloading Image.. 18
Downloading Image.. 19
Downloading Image.. 20
Downloading Image.. 21
Downloading Image.. 22
Downloading Image.. 23
Downloading Image.. 24
Downloading Image.. 25
Downloading Image.. 26
Downloading Image.. 27
Downloading Image.. 28
Downloading Image.. 29
Traceback (most recent call last):
File "part_1_login.py", line 119, in <module>
app=App()
File "part_1_login.py", line 32, in __init__
self.downloading_images()
File "part_1_login.py", line 50, in downloading_images
response=requests.get(link,stream=True)
File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 519, in request
prep = self.prepare_request(req)
File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 462, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\models.py", line 313, in prepare
self.prepare_url(url, params)
File "C:\Users\ranga\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\models.py", line 387, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '/static/images/web/mobile_nav_type_logo.png/735145cfe0a4.png': No schema supplied. Perhaps you meant http:///static/images/web/mobile_nav_type_logo.png/735145cfe0a4.png?
It couldn't download the Instagram logo image because that image does not have an absolute/full URL. So I just added an `if` check to download only the images that have a full URL. Please check and let me know if it helps.
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import requests
import shutil
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
class App:
    """Log in to Instagram, open a target user's profile, scroll to load
    all posts, and download every image found on the page.

    NOTE(review): the whole workflow runs from __init__, so merely
    constructing an App has heavy side effects (browser automation,
    network requests, file writes).
    """

    def __init__(self, username='*******', password='*****', target_username='******',
                 path="C:\\Users\\ranga\\Music\\Selenium\\photos"):
        """Drive the full login -> profile -> scroll -> download flow.

        :param username: Instagram login name.
        :param password: Instagram password.
        :param target_username: profile whose images are downloaded.
        :param path: local directory where images are saved.
        """
        self.username = username
        self.password = password
        self.target_username = target_username
        self.path = path
        self.driver = webdriver.Firefox(executable_path="C:\\Users\\ranga\\Music\\Selenium\\geckodriver.exe")
        # Implicit wait gives the dynamically rendered page time to
        # produce the elements the XPath lookups below expect.
        self.driver.implicitly_wait(15)
        self.main_url = "https://www.instagram.com"
        self.driver.get(self.main_url)
        # self.error is the shared "abort" flag: each step sets it on
        # failure and the following steps are skipped.
        self.error = False
        sleep(3)
        # login function
        self.log_in()
        sleep(2)
        if self.error is False:
            self.close_dialogbox_if_there()
            self.open_target_profile()
        if self.error is False:
            self.scroll_down()
        if self.error is False:
            if not os.path.exists(path):
                os.mkdir(path)
            self.downloading_images()
        self.driver.close()

    def downloading_images(self):
        """Parse the current page source and download each <img> with an
        absolute URL.

        Fixes vs. the original: `print link` was Python 2 syntax (a
        SyntaxError under Python 3) and is now `print(link)`; the
        substring test `"http" in link` is replaced by a proper
        startswith check; the HTTP GET is inside the try so network
        errors are reported per-image; the raw stream is decoded so
        gzip-compressed responses are written as valid image files.
        """
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        self.driver.set_page_load_timeout(3)
        all_images = soup.find_all('img')
        print('length of all images:', len(all_images))
        for index, img in enumerate(all_images):
            # .get avoids a KeyError on <img> tags without a src attribute.
            link = img.get('src', '')
            print(link)
            # requests cannot fetch scheme-less paths such as
            # /static/images/... ; only absolute URLs are downloadable.
            if not link.startswith('http'):
                continue
            file_name = 'image_' + str(index) + '.jpg'
            image_path = os.path.join(self.path, file_name)
            print('Downloading Image..', index, ':', link)
            try:
                response = requests.get(link, stream=True)
                # Ensure urllib3 decodes gzip/deflate while streaming.
                response.raw.decode_content = True
                with open(image_path, 'wb') as file:
                    shutil.copyfileobj(response.raw, file)
            except Exception as e:
                print(e)
                print('Could not download image number', index)

    def close_dialogbox_if_there(self):
        """Dismiss the optional post-login dialog; absence is not an error."""
        try:
            sleep(2)
            close_button = self.driver.find_element_by_xpath("/html/body/div[3]/div/div/div[3]/button[2]")
            close_button.click()
        except Exception:
            # Dialog not shown -- nothing to do.
            pass

    def open_target_profile(self):
        """Navigate directly to the target profile URL; flag failure."""
        try:
            search_bar = self.driver.find_element_by_xpath('//input[@placeholder="Search"]')
            search_bar.send_keys(self.target_username)
            # Navigation uses the direct profile URL rather than the
            # search suggestions, which are fragile to automate.
            target_profile = self.main_url + '/' + self.target_username + '/'
            self.driver.get(target_profile)
            sleep(2)
        except Exception:
            self.error = True
            print('Could not find Search bar')

    def scroll_down(self):
        """Scroll once per expected page of posts (12 posts per page)."""
        try:
            no_of_posts = self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/div/header/section/ul/li[1]/span/span')
            # The post count may contain thousands separators, e.g. "1,234".
            no_of_posts = str(no_of_posts.text).replace(',', '')
            self.no_of_posts = int(no_of_posts)
            if self.no_of_posts > 12:
                no_of_scrolls = int(self.no_of_posts / 12) + 1
                for value in range(no_of_scrolls):
                    self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
                    sleep(2)
        except Exception:
            self.error = True
            print('Could not find number of posts while scroll down')

    def log_in(self):
        """Click the login link and submit the credentials; flag failure."""
        try:
            login_button = self.driver.find_element_by_xpath('//*[@id="react-root"]/section/main/article/div[2]/div[2]/p/a')
            login_button.click()
            sleep(2)
            try:
                user_name_input = self.driver.find_element_by_xpath('//input[@name="username"]')
                user_name_input.send_keys(self.username)
                password_input = self.driver.find_element_by_xpath('//input[@name="password"]')
                password_input.send_keys(self.password)
                password_input.submit()
            except Exception:
                print('Some exception occured while trying to find username or password')
                self.error = True
        except Exception:
            self.error = True
            print('Unable to find login button')
# Script entry point: constructing App runs the whole
# login -> scroll -> download workflow.
if __name__ == '__main__':
    app = App()
Update:
Looks like a few images get removed from the DOM/HTML page as you scroll towards the bottom of the page (Instagram virtualizes the feed), so we need different logic to extract all of the images — for example, collecting the image URLs incrementally after each scroll instead of parsing the page source once at the end. I am looking into it and will let you know if I find a solution for your problem.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.