[英]How to get all download links from unsplash using Selenium?
尝试从 Unsplash 下载一组图像。
当我尝试查看 len(links) 时,我只得到 29,而它应该是 63。
不确定是什么问题:
from selenium import webdriver
def driver_download(location_for_download):
    """Build a Chrome driver that saves downloads into *location_for_download*."""
    opts = webdriver.ChromeOptions()
    # Route automatic downloads into the requested directory (no save dialog).
    opts.add_experimental_option(
        'prefs', {'download.default_directory': location_for_download})
    # chromedriver binary shipped with the distro's chromium package.
    return webdriver.Chrome("/usr/lib/chromium-browser/chromedriver",
                            chrome_options=opts)
# Open the collection page and count the anchors that expose a photo download.
url = 'https://unsplash.com/collections/10927848/thestockmarketinvestor'
driver = driver_download('/home/xxx/Documents/xxxxx/pictures_from_unsplash/')
# I have clicked "Load more images"; all images are showing on the page.
driver.get(url)
x = driver.find_elements_by_tag_name('a')
# Tally only the anchors whose title marks them as photo-download links.
count = sum(1 for anchor in x if anchor.get_attribute('title') == 'Download photo')
我试过滚动到页面底部和中间。 仍然是相同数量的结果。
本站使用GET方式每10张图片获取JSON数据。 我不熟悉 Python,但我会给你 R 脚本让你翻译成 Python。 不过,此站点不需要 Selenium。
# Collect every download link of the collection via Unsplash's JSON API
# (no browser automation needed).
library(rvest)
library(stringr)
library(rjson)
all_links <- character()
# The API pages its results 10 photos at a time; 7 pages cover the collection.
for (i in 1:7) {
url = str_c("https://unsplash.com/napi/collections/10927848/photos?page=", i, "&per_page=10&order_by=latest")
pg <- fromJSON(file = url)
links <- character()
# Each item's download URL lives at the $links$download node.
for (j in 1:length(pg)) links[j] <- pg[[j]]$links$download[1]
all_links <- c(all_links, links)
}
基本上，思路是先获取 JSON 数据，每个条目的下载链接位于其 $links$download 节点。
所以我做了更多的工作,下面是工作脚本。
这未必是解决该问题的最佳方式。
有 1 个步骤仍需要用户点击。 这可以自动化吗?
import os
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
def driver_download(location_for_download):
    """Create a Chrome webdriver configured to download files into
    *location_for_download* without prompting."""
    chrome_options = webdriver.ChromeOptions()
    prefs = {'download.default_directory': location_for_download}
    chrome_options.add_experimental_option('prefs', prefs)
    # Use the chromedriver that ships with the chromium-browser package.
    driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver",
                              chrome_options=chrome_options)
    return driver
def get_picture_links(url, location):
    """Open *url* in Chrome, collect the href of every anchor titled
    'Download photo', and return the deduplicated list of links.

    Fixes vs. the posted version:
    * ``driver_download()`` was called without its required argument,
      which raises TypeError before anything runs — now passes *location*.
    * The outer loop index and the inner anchor variable were both ``i``
      (shadowing); they are now distinct names.
    * ``count`` counted *links found*, not iterations, so the
      ``count == 1`` branch (prompting the user to click "Load More")
      effectively never executed; the flow now keys off the page index.
    """
    # Ensure the download directory exists before the browser needs it.
    if not os.path.isdir(location):
        os.mkdir(location)
    driver = driver_download(location)  # was: driver_download() -> TypeError
    driver.maximize_window()
    driver.get(url)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2)")
    links = []
    for page in range(7):
        if page == 0:
            # First pass: scrape the initially rendered anchors.
            time.sleep(4)
        else:
            if page == 1:
                # The site reveals more photos only after "Load More Photos"
                # is clicked; this step still needs the user (an automated
                # click via By.XPATH is shown further down the page).
                time.sleep(4)
                input('Please click Load More Photos')
            # Scroll so lazily-loaded photos get rendered, then re-scan.
            body = driver.find_element_by_css_selector('body')
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(5)
        for anchor in driver.find_elements_by_tag_name('a'):
            if anchor.get_attribute('title') == 'Download photo':
                links.append(anchor.get_attribute('href'))
    # Repeated scans revisit the same anchors; deduplicate before returning.
    links = list(set(links))
    print('Found: %s Pictures to Download.' % str(len(links)))
    driver.quit()
    return links
def get_pictures(location, links=None):
    """Visit each download link so Chrome saves the photo into *location*.

    *links* defaults to the module-level ``links`` list for backward
    compatibility — the original implicitly read the global that the
    script assigns from get_picture_links().
    """
    if links is None:
        links = globals()['links']  # original behavior: use the global list
    print('Downloading....{} files, it should take around {} seconds'.format(
        len(links), len(links) * 4))
    driver = driver_download(location)
    for link in links:
        # Pace the requests; each GET triggers a browser-side download.
        time.sleep(4)
        driver.get(link)
    # Give the final download time to complete before closing the browser.
    time.sleep(20)
    driver.quit()
    print('Pictures have been downloaded..Renaming now')
def rename_pictures(location):
    """Rename every .jpg/.jpeg file in *location* to 1.jpg, 2.jpg, ...

    Fixes the posted filter ``'.jpg' or '.jpeg' in i``, which tests the
    truthiness of the literal ``'.jpg'`` (always True), so *every* file
    in the directory — picture or not — was being renamed.
    """
    os.chdir(location)
    # Sort so the numbering is deterministic across runs/filesystems.
    pictures = sorted(f for f in os.listdir()
                      if f.endswith(('.jpg', '.jpeg')))
    for number, name in enumerate(pictures, start=1):
        # NOTE(review): os.rename clobbers an existing '<number>.jpg';
        # fine for a freshly downloaded folder, but beware on re-runs.
        os.rename(name, str(number) + '.jpg')
    print('Everything done! Open the folder to see the files')
# --- Script entry: scrape the links, download, then rename ----------------
location = 'Blah'  # TODO: set to a real download directory path
url = 'https://unsplash.com/xxxx/xxxx' # Change to the required url
# Collect the per-photo download links; the result is also left in the
# module-global ``links`` that get_pictures() reads.
links = get_picture_links(url=url, location=location)
# Download the files
get_pictures(location=location)
# Rename the files
rename_pictures(location=location)
您可以把需要用户点击的那段 else 改成下面的写法。我使用了 from selenium.webdriver.common.by import By，但你也可以改成你惯用的形式：driver.find_element_by_xpath('/html/body/div/div/div[2]/div[5]/div[3]/div[1]/button').click()
# Replacement for the manual-click branch in get_picture_links: clicks the
# "Load More Photos" button itself instead of waiting on input().
# (Fragment as posted in the answer; requires
#  from selenium.webdriver.common.by import By)
else:
if count == 1:
# Automate the click via an absolute XPath to the button.
driver.find_element(By.XPATH, '/html/body/div/div/div[2]/div[5]/div[3]/div[1]/button').click()
time.sleep(4)
body = driver.find_element_by_css_selector('body')
body.send_keys(Keys.PAGE_DOWN)
time.sleep(5)
x = driver.find_elements_by_tag_name('a')
for i in x:
if i.get_attribute('title') == 'Download photo':
links.append(i.get_attribute('href'))
count += 1
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.