简体   繁体   中英

How to get all download links from unsplash using Selenium?

Trying to download a collection of images from Unsplash.

When I check len(links), I only get 29, while it should be 63.

Not sure what the issue is:

from selenium import webdriver


def driver_download(location_for_download):
    """Create a Chrome WebDriver that saves downloads into *location_for_download*.

    Parameters
    ----------
    location_for_download : str
        Directory path Chrome should use instead of the default Downloads folder.

    Returns
    -------
    selenium.webdriver.Chrome
        A ready-to-use driver instance; the caller is responsible for quitting it.
    """
    chrome_options = webdriver.ChromeOptions()
    # Redirect Chrome's download target via the browser preferences.
    prefs = {'download.default_directory': location_for_download}
    chrome_options.add_experimental_option('prefs', prefs)
    # Selenium 4 removed both the positional executable-path argument and the
    # `chrome_options=` keyword; the driver path now goes through Service and
    # the options through `options=`.
    from selenium.webdriver.chrome.service import Service
    service = Service("/usr/lib/chromium-browser/chromedriver")
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

# Open the collection page and count how many anchors expose a photo download.
url = 'https://unsplash.com/collections/10927848/thestockmarketinvestor'

driver = driver_download('/home/xxx/Documents/xxxxx/pictures_from_unsplash/')

# I have clicked Load more images, all images are showing on page.

driver.get(url)

anchors = driver.find_elements_by_tag_name('a')

# An anchor is a download link exactly when its title attribute says so.
count = sum(
    1 for anchor in anchors
    if anchor.get_attribute('title') == 'Download photo'
)

I have tried scrolling to bottom of the page and middle. Still the same number of results.

This website issues a GET request that returns JSON data for every 10 pictures. I'm not familiar with Python, so I'll give you an R script that you can translate to Python. You don't need Selenium for this site, though.

# Fetch the collection's private API page by page (10 photos per request)
# and collect the download URL of every photo.
library(rvest)
library(stringr)
library(rjson)

all_links <- character()
for (i in 1:7) {
  # Each page of the napi endpoint returns JSON metadata for 10 photos.
  url = str_c("https://unsplash.com/napi/collections/10927848/photos?page=", i, "&per_page=10&order_by=latest")
  pg <- fromJSON(file = url)
  # Pre-size the vector and use seq_along(): unlike 1:length(pg), it yields
  # an empty sequence (not 1, 0) when a page comes back with no photos.
  links <- character(length(pg))
  for (j in seq_along(pg)) links[j] <- pg[[j]]$links$download[1]

  all_links <- c(all_links, links)
}

Basically, the idea is you get the JSON file and the download links would be at the $link$download nodes of each item.

So I worked on it a bit more, and below is the working script.

It is not the best way to go about it.

There is 1 step which still requires a click from user. Can this be automated?

import os
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys


def driver_download(location_for_download):
    """Create a Chrome WebDriver that saves downloads into *location_for_download*.

    Parameters
    ----------
    location_for_download : str
        Directory path Chrome should use instead of the default Downloads folder.

    Returns
    -------
    selenium.webdriver.Chrome
        A ready-to-use driver instance; the caller is responsible for quitting it.
    """
    chrome_options = webdriver.ChromeOptions()
    # Redirect Chrome's download target via the browser preferences.
    prefs = {'download.default_directory': location_for_download}
    chrome_options.add_experimental_option('prefs', prefs)
    # Selenium 4 removed both the positional executable-path argument and the
    # `chrome_options=` keyword; the driver path now goes through Service and
    # the options through `options=`.
    from selenium.webdriver.chrome.service import Service
    service = Service("/usr/lib/chromium-browser/chromedriver")
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver


def get_picture_links(url, location):
    """Open *url* in Chrome and collect the href of every 'Download photo' link.

    Parameters
    ----------
    url : str
        Unsplash collection URL to scrape.
    location : str
        Download directory passed to the driver; created if it does not exist.

    Returns
    -------
    list of str
        Unique download-link hrefs found across all scroll passes.
    """
    # Create the destination directory if needed (no-op when it exists).
    os.makedirs(location, exist_ok=True)
    # Bug fix: the original called driver_download() with no argument, which
    # raises TypeError because the function requires the download directory.
    driver = driver_download(location)
    driver.maximize_window()

    driver.get(url)

    def _collect_links():
        # Hrefs of all anchors whose title marks them as photo downloads.
        return [a.get_attribute('href')
                for a in driver.find_elements_by_tag_name('a')
                if a.get_attribute('title') == 'Download photo']

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2)")
    links = []
    for step in range(7):
        if step == 0:
            time.sleep(4)  # wait for the initial batch of images to render
            links.extend(_collect_links())
        else:
            if step == 1:
                # The 'Load More Photos' click is still manual at this point.
                time.sleep(4)
                input('Please click Load More Photos')
            # Page-down to trigger lazy-loading of the next batch of images.
            body = driver.find_element_by_css_selector('body')
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(5)
            links.extend(_collect_links())
    links = list(set(links))  # drop duplicates seen across scroll passes
    print('Found: %s Pictures to Download.' % str(len(links)))
    driver.quit()
    return links


def get_pictures(location, picture_links=None):
    """Download every link by visiting it in a Chrome session.

    Parameters
    ----------
    location : str
        Directory Chrome saves the downloaded files into.
    picture_links : list of str, optional
        Links to fetch. Defaults to the module-level ``links`` list, which the
        original script populated before calling this function (kept for
        backward compatibility).
    """
    if picture_links is None:
        # Original behaviour: read the global `links` created by the caller.
        picture_links = links
    print('Downloading....{} files, it should take around {} seconds'.format(
        len(picture_links), len(picture_links) * 4))
    driver = driver_download(location)
    try:
        for link in picture_links:
            time.sleep(4)  # pace the requests so each download can start
            driver.get(link)
        time.sleep(20)  # give the final downloads time to complete
    finally:
        driver.quit()  # always release the browser, even if a request fails
    print('Pictures have been downloaded..Renaming now')


def rename_pictures(location):
    """Rename every .jpg/.jpeg file in *location* to 1.jpg, 2.jpg, ...

    Parameters
    ----------
    location : str
        Directory containing the downloaded pictures. The process's working
        directory is changed to it as a side effect.
    """
    os.chdir(location)
    # Bug fix: the original filter `'.jpg' or '.jpeg' in i` is always True
    # (a non-empty string literal is truthy), so *every* file was renamed,
    # including non-images. Match the extensions properly, and sort so the
    # numbering order is deterministic.
    files = sorted(f for f in os.listdir() if f.endswith(('.jpg', '.jpeg')))

    for count, name in enumerate(files, start=1):
        os.rename(name, str(count) + '.jpg')
    print('Everything done! Open the folder to see the files')

# Script entry point: collect the download links, fetch each picture, then
# give the files sequential numeric names.
location = 'Blah'  # destination directory for the downloads
url = 'https://unsplash.com/xxxx/xxxx' # Change to the required url
# NOTE(review): get_pictures reads the module-level `links` bound here, not a
# parameter — keep this name unchanged.
links = get_picture_links(url=url, location=location)
# Download the files
get_pictures(location=location)
# Rename the files
rename_pictures(location=location)

You can replace the `else` branch that waits for the user's click with the following. I use `from selenium.webdriver.common.by import By`, but you can change it to your locator style: driver.find_element_by_xpath('/html/body/div/div/div[2]/div[5]/div[3]/div[1]/button').click()

# Fragment meant to replace the `else` branch of get_picture_links above;
# it is not runnable on its own (requires the surrounding loop and
# `from selenium.webdriver.common.by import By`).
else:
        if count == 1:
            # Automate the 'Load More Photos' click instead of prompting the user.
            driver.find_element(By.XPATH, '/html/body/div/div/div[2]/div[5]/div[3]/div[1]/button').click()
            time.sleep(4)
        # Page-down to trigger lazy-loading of the next batch of images.
        body = driver.find_element_by_css_selector('body')
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(5)
        # Collect the href of every 'Download photo' anchor currently rendered.
        x = driver.find_elements_by_tag_name('a')
        for i in x:
            if i.get_attribute('title') == 'Download photo':
                links.append(i.get_attribute('href'))
        count += 1

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM