繁体   English   中英

如何使用 Selenium 从 unsplash 获取所有下载链接?

[英]How to get all download links from unsplash using Selenium?

尝试从 Unsplash 下载一组图像。

当我尝试查看 len(links) 时,我只得到 29,而它应该是 63。

不确定是什么问题:

from selenium import webdriver


def driver_download(location_for_download):
    """Create a Chrome WebDriver that saves downloads to a given folder.

    Args:
        location_for_download: Directory handed to Chrome as its
            ``download.default_directory`` preference.

    Returns:
        The ``webdriver.Chrome`` instance, started with the chromedriver
        binary at ``/usr/lib/chromium-browser/chromedriver``.
    """
    download_prefs = {'download.default_directory': location_for_download}

    options = webdriver.ChromeOptions()
    options.add_experimental_option('prefs', download_prefs)

    return webdriver.Chrome(
        "/usr/lib/chromium-browser/chromedriver",
        chrome_options=options,
    )

# Collection page whose "Download photo" anchors are being counted.
url = 'https://unsplash.com/collections/10927848/thestockmarketinvestor'

driver = driver_download('/home/xxx/Documents/xxxxx/pictures_from_unsplash/')
driver.get(url)

# Author's note: "Load more images" was clicked by hand, so all images are
# showing on the page when the anchors are read.
x = driver.find_elements_by_tag_name('a')

# Tally only the anchors whose title marks them as a download link.
count = 0
for i in x:
    if i.get_attribute('title') != 'Download photo':
        continue
    count += 1

我试过滚动到页面底部和中间。 仍然是相同数量的结果。

该网站(Unsplash)通过 GET 请求以 JSON 形式加载数据,每次请求返回 10 张图片。 我不熟悉 Python,所以给出 R 脚本,您可以自行翻译成 Python。 另外,抓取这个站点并不需要 Selenium。

library(rvest)
library(stringr)
library(rjson)

# Collect the download link of every photo in the collection by walking the
# paginated JSON endpoint (7 pages of 10 photos each).
all_links <- character()
for (i in 1:7) {
  url = str_c("https://unsplash.com/napi/collections/10927848/photos?page=", i, "&per_page=10&order_by=latest")
  pg <- fromJSON(file = url)

  # seq_along() yields an empty sequence for an empty page, whereas
  # 1:length(pg) would iterate over c(1, 0) and fail on pg[[1]].
  links <- character(length(pg))
  for (j in seq_along(pg)) links[j] <- pg[[j]]$links$download[1]

  all_links <- c(all_links, links)
}

基本思路是:先获取 JSON 数据,每个条目的下载链接位于其 $links$download 节点。

所以我做了更多的工作,下面是工作脚本。

这并不是最佳的实现方式。

有 1 个步骤仍需要用户点击。 这可以自动化吗?

import os
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys


def driver_download(location_for_download):
    """Start Chrome with its download directory pointed at a chosen folder.

    Args:
        location_for_download: Path stored in Chrome's
            ``download.default_directory`` preference.

    Returns:
        A ``webdriver.Chrome`` instance driven by the chromedriver binary at
        ``/usr/lib/chromium-browser/chromedriver``.
    """
    browser_options = webdriver.ChromeOptions()
    browser_options.add_experimental_option(
        'prefs',
        {'download.default_directory': location_for_download},
    )
    return webdriver.Chrome(
        "/usr/lib/chromium-browser/chromedriver",
        chrome_options=browser_options,
    )


def _collect_download_links(driver):
    """Return the href of every anchor currently titled 'Download photo'."""
    return [
        anchor.get_attribute('href')
        for anchor in driver.find_elements_by_tag_name('a')
        if anchor.get_attribute('title') == 'Download photo'
    ]


def get_picture_links(url, location):
    """Collect the unique photo download links shown on a collection page.

    The page is scanned seven times. The first scan happens after an initial
    scroll to the middle of the page; before the second scan the user is
    prompted to click "Load More Photos" by hand; every later scan is
    preceded by a PAGE_DOWN key press so further anchors can appear.

    Args:
        url: Address of the page to scan.
        location: Download folder. It is created (including missing parent
            directories) when it does not exist and is passed to
            ``driver_download``.

    Returns:
        A list of distinct download URLs in the order they were first seen.
    """
    # makedirs also handles nested paths and an already existing folder.
    os.makedirs(location, exist_ok=True)

    # driver_download requires the download folder; calling it without an
    # argument raised TypeError.
    driver = driver_download(location)
    try:
        driver.maximize_window()
        driver.get(url)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2)")

        links = []
        for scan_number in range(7):
            if scan_number == 0:
                time.sleep(4)
            else:
                if scan_number == 1:
                    # This click is still manual; the script waits for Enter.
                    time.sleep(4)
                    input('Please click Load More Photos')
                body = driver.find_element_by_css_selector('body')
                body.send_keys(Keys.PAGE_DOWN)
                time.sleep(5)
            links.extend(_collect_download_links(driver))
    finally:
        # Always close the browser, even when a step above fails.
        driver.quit()

    # The same anchors are seen on several scans; drop duplicates while
    # keeping first-seen order.
    links = list(dict.fromkeys(links))
    print('Found: %s Pictures to Download.' % str(len(links)))
    return links


def get_pictures(location, picture_links=None):
    """Download every picture link by visiting it in a Chrome session.

    Args:
        location: Folder passed to ``driver_download`` as the download
            directory.
        picture_links: Iterable of download URLs. When omitted, the
            module-level ``links`` variable is used, which keeps existing
            ``get_pictures(location=...)`` calls working.
    """
    if picture_links is None:
        # Backward-compatible fallback to the global the script assigns.
        picture_links = links
    picture_links = list(picture_links)

    print('Downloading....{} files, it should take around {} seconds'.format(
        len(picture_links), len(picture_links) * 4))
    driver = driver_download(location)
    try:
        for link in picture_links:
            # Pause between requests so each download can start.
            time.sleep(4)
            driver.get(link)
        # Grace period for the last downloads to finish before closing.
        time.sleep(20)
    finally:
        # Close the browser even if a navigation fails part-way through.
        driver.quit()
    print('Pictures have been downloaded..Renaming now')


def rename_pictures(location):
    """Rename the JPEG files in *location* to ``1.jpg``, ``2.jpg``, ...

    Only regular files whose name ends in ``.jpg`` or ``.jpeg`` (any case)
    are renamed; every other entry is left untouched. Files are numbered in
    sorted-name order so the result is deterministic. The process working
    directory is not changed.

    Args:
        location: Directory containing the downloaded pictures.
    """
    pictures = sorted(
        name for name in os.listdir(location)
        if name.lower().endswith(('.jpg', '.jpeg'))
        and os.path.isfile(os.path.join(location, name))
    )

    # Rename in two passes through temporary names: renaming straight to
    # "<n>.jpg" could overwrite a not-yet-processed file that already has
    # that name (for example when the folder is processed a second time).
    staged = []
    for index, name in enumerate(pictures, start=1):
        temp_name = '.rename_tmp_{}_{}'.format(os.getpid(), index)
        os.rename(os.path.join(location, name),
                  os.path.join(location, temp_name))
        staged.append(temp_name)

    for index, temp_name in enumerate(staged, start=1):
        os.rename(os.path.join(location, temp_name),
                  os.path.join(location, '{}.jpg'.format(index)))

    print('Everything done! Open the folder to see the files')

# Settings: target folder and the page to scrape (change url as required).
location = 'Blah'
url = 'https://unsplash.com/xxxx/xxxx'

# Step 1: gather the download links from the page.
links = get_picture_links(url, location)
# Step 2: fetch every picture into the folder.
get_pictures(location)
# Step 3: give the downloaded files sequential names.
rename_pictures(location)

您可以把需要用户手动点击的那个 else 分支改成下面这样,让脚本自动点击按钮。 我使用了 "from selenium.webdriver.common.by import By",但您也可以改成您自己的写法: driver.find_element_by_xpath('/html/body/div/div/div[2]/div[5]/div[3]/div[1]/button').click()

# Fragment: replacement for the `else` branch of the scan loop. It clicks the
# button through Selenium instead of prompting the user to click it.
else:
        if count == 1:
            # `By` is not defined in this fragment; per the surrounding text it
            # comes from `selenium.webdriver.common.by`.
            # NOTE(review): the absolute XPath is tied to the page layout at the
            # time of writing - confirm it still matches the button.
            driver.find_element(By.XPATH, '/html/body/div/div/div[2]/div[5]/div[3]/div[1]/button').click()
            time.sleep(4)
        # Send PAGE_DOWN to the page body, then wait before reading anchors.
        body = driver.find_element_by_css_selector('body')
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(5)
        x = driver.find_elements_by_tag_name('a')
        for i in x:
            # Keep only anchors whose title marks them as a download link.
            if i.get_attribute('title') == 'Download photo':
                links.append(i.get_attribute('href'))
        count += 1

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM