简体   繁体   中英

How to get all download links from unsplash using Selenium?

Trying to download a collection of images from Unsplash.

When I check len(links), I only get 29, while it should be 63.

Not sure what the issue is:

from selenium import webdriver


def driver_download(location_for_download):
    """Create a Chrome WebDriver that saves downloads into *location_for_download*.

    Parameters
    ----------
    location_for_download : str
        Directory path Chrome should use instead of the default Downloads folder.

    Returns
    -------
    selenium.webdriver.Chrome
        A ready-to-use driver instance; the caller is responsible for quitting it.
    """
    chrome_options = webdriver.ChromeOptions()
    # Redirect Chrome's download target via the browser preferences.
    prefs = {'download.default_directory': location_for_download}
    chrome_options.add_experimental_option('prefs', prefs)
    # Selenium 4 removed both the positional executable-path argument and the
    # `chrome_options=` keyword; the driver path now goes through Service and
    # the options through `options=`.
    from selenium.webdriver.chrome.service import Service
    service = Service("/usr/lib/chromium-browser/chromedriver")
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

# Open the collection page and count how many anchors expose a photo download.
url = 'https://unsplash.com/collections/10927848/thestockmarketinvestor'

driver = driver_download('/home/xxx/Documents/xxxxx/pictures_from_unsplash/')

# I have clicked Load more images, all images are showing on page.

driver.get(url)

anchors = driver.find_elements_by_tag_name('a')

# An anchor is a download link exactly when its title attribute says so.
count = sum(
    1 for anchor in anchors
    if anchor.get_attribute('title') == 'Download photo'
)

I have tried scrolling to bottom of the page and middle. Still the same number of results.

This website issues a GET request that returns JSON data for every 10 pictures. I'm not familiar with Python, so I'll give you an R script that you can translate to Python. You don't need Selenium for this site, though.

# Fetch the collection's private API page by page (10 photos per request)
# and collect the download URL of every photo.
library(rvest)
library(stringr)
library(rjson)

all_links <- character()
for (i in 1:7) {
  # Each page of the napi endpoint returns JSON metadata for 10 photos.
  url = str_c("https://unsplash.com/napi/collections/10927848/photos?page=", i, "&per_page=10&order_by=latest")
  pg <- fromJSON(file = url)
  # Pre-size the vector and use seq_along(): unlike 1:length(pg), it yields
  # an empty sequence (not 1, 0) when a page comes back with no photos.
  links <- character(length(pg))
  for (j in seq_along(pg)) links[j] <- pg[[j]]$links$download[1]

  all_links <- c(all_links, links)
}

Basically, the idea is you get the JSON file and the download links would be at the $link$download nodes of each item.

So I worked on it a bit more, and below is the working script.

It is not the best way to go about it.

There is 1 step which still requires a click from user. Can this be automated?

import os
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys


def driver_download(location_for_download):
    """Create a Chrome WebDriver that saves downloads into *location_for_download*.

    Parameters
    ----------
    location_for_download : str
        Directory path Chrome should use instead of the default Downloads folder.

    Returns
    -------
    selenium.webdriver.Chrome
        A ready-to-use driver instance; the caller is responsible for quitting it.
    """
    chrome_options = webdriver.ChromeOptions()
    # Redirect Chrome's download target via the browser preferences.
    prefs = {'download.default_directory': location_for_download}
    chrome_options.add_experimental_option('prefs', prefs)
    # Selenium 4 removed both the positional executable-path argument and the
    # `chrome_options=` keyword; the driver path now goes through Service and
    # the options through `options=`.
    from selenium.webdriver.chrome.service import Service
    service = Service("/usr/lib/chromium-browser/chromedriver")
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver


def get_picture_links(url, location):
    """Open *url* in Chrome and collect the href of every 'Download photo' link.

    Parameters
    ----------
    url : str
        Unsplash collection URL to scrape.
    location : str
        Download directory passed to the driver; created if it does not exist.

    Returns
    -------
    list of str
        Unique download-link hrefs found across all scroll passes.
    """
    # Create the destination directory if needed (no-op when it exists).
    os.makedirs(location, exist_ok=True)
    # Bug fix: the original called driver_download() with no argument, which
    # raises TypeError because the function requires the download directory.
    driver = driver_download(location)
    driver.maximize_window()

    driver.get(url)

    def _collect_links():
        # Hrefs of all anchors whose title marks them as photo downloads.
        return [a.get_attribute('href')
                for a in driver.find_elements_by_tag_name('a')
                if a.get_attribute('title') == 'Download photo']

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2)")
    links = []
    for step in range(7):
        if step == 0:
            time.sleep(4)  # wait for the initial batch of images to render
            links.extend(_collect_links())
        else:
            if step == 1:
                # The 'Load More Photos' click is still manual at this point.
                time.sleep(4)
                input('Please click Load More Photos')
            # Page-down to trigger lazy-loading of the next batch of images.
            body = driver.find_element_by_css_selector('body')
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(5)
            links.extend(_collect_links())
    links = list(set(links))  # drop duplicates seen across scroll passes
    print('Found: %s Pictures to Download.' % str(len(links)))
    driver.quit()
    return links


def get_pictures(location, picture_links=None):
    """Download every link by visiting it in a Chrome session.

    Parameters
    ----------
    location : str
        Directory Chrome saves the downloaded files into.
    picture_links : list of str, optional
        Links to fetch. Defaults to the module-level ``links`` list, which the
        original script populated before calling this function (kept for
        backward compatibility).
    """
    if picture_links is None:
        # Original behaviour: read the global `links` created by the caller.
        picture_links = links
    print('Downloading....{} files, it should take around {} seconds'.format(
        len(picture_links), len(picture_links) * 4))
    driver = driver_download(location)
    try:
        for link in picture_links:
            time.sleep(4)  # pace the requests so each download can start
            driver.get(link)
        time.sleep(20)  # give the final downloads time to complete
    finally:
        driver.quit()  # always release the browser, even if a request fails
    print('Pictures have been downloaded..Renaming now')


def rename_pictures(location):
    """Rename every .jpg/.jpeg file in *location* to 1.jpg, 2.jpg, ...

    Parameters
    ----------
    location : str
        Directory containing the downloaded pictures. The process's working
        directory is changed to it as a side effect.
    """
    os.chdir(location)
    # Bug fix: the original filter `'.jpg' or '.jpeg' in i` is always True
    # (a non-empty string literal is truthy), so *every* file was renamed,
    # including non-images. Match the extensions properly, and sort so the
    # numbering order is deterministic.
    files = sorted(f for f in os.listdir() if f.endswith(('.jpg', '.jpeg')))

    for count, name in enumerate(files, start=1):
        os.rename(name, str(count) + '.jpg')
    print('Everything done! Open the folder to see the files')

# Script entry point: collect the download links, fetch each picture, then
# give the files sequential numeric names.
location = 'Blah'  # destination directory for the downloads
url = 'https://unsplash.com/xxxx/xxxx' # Change to the required url
# NOTE(review): get_pictures reads the module-level `links` bound here, not a
# parameter — keep this name unchanged.
links = get_picture_links(url=url, location=location)
# Download the files
get_pictures(location=location)
# Rename the files
rename_pictures(location=location)

You can replace the `else` branch that waits for the user's click with the following. I use `from selenium.webdriver.common.by import By`, but you can change it to your locator style: driver.find_element_by_xpath('/html/body/div/div/div[2]/div[5]/div[3]/div[1]/button').click()

# Fragment meant to replace the `else` branch of get_picture_links above;
# it is not runnable on its own (requires the surrounding loop and
# `from selenium.webdriver.common.by import By`).
else:
        if count == 1:
            # Automate the 'Load More Photos' click instead of prompting the user.
            driver.find_element(By.XPATH, '/html/body/div/div/div[2]/div[5]/div[3]/div[1]/button').click()
            time.sleep(4)
        # Page-down to trigger lazy-loading of the next batch of images.
        body = driver.find_element_by_css_selector('body')
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(5)
        # Collect the href of every 'Download photo' anchor currently rendered.
        x = driver.find_elements_by_tag_name('a')
        for i in x:
            if i.get_attribute('title') == 'Download photo':
                links.append(i.get_attribute('href'))
        count += 1

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM