I want to scrape a google scholar page with 'show more' button. I understand from my previous question that it is not a html but a javascript and there are several ways to scrape such pages. I tries selenium and tried the following code.
from selenium import webdriver
from bs4 import BeautifulSoup
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
chrome_path = r"....path....."
driver = webdriver.Chrome(chrome_path)
driver.get("https://scholar.google.com/citations?user=TBcgGIIAAAAJ&hl=en")
driver.find_element_by_xpath('/html/body/div/div[13]/div[2]/div/div[4]/form/div[2]/div/button/span/span[2]').click()
soup = BeautifulSoup(driver.page_source,'html.parser')
papers = soup.find_all('tr',{'class':'gsc_a_tr'})
for paper in papers:
title = paper.find('a',{'class':'gsc_a_at'}).text
author = paper.find('div',{'class':'gs_gray'}).text
journal = [a.text for a in paper.select("td:nth-child(1) > div:nth-child(3)")]
print('Paper Title:', title, '\nAuthor:', author, '\nJournal:', journal)
The browser now clicks the 'show more' button and displays the entire page. But, I am still getting the information only for the first 20 papers. I dont understand why. Please help!
Thanks!
I believe your problem is that the new elements haven't completely loaded in when your program checks the website. Try importing time and then sleeping for a few minutes. Like this (I removed the headless features so you can see the program work):
from selenium import webdriver
import time
from bs4 import BeautifulSoup
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
driver = webdriver.Chrome()
driver.get("https://scholar.google.com/citations?user=TBcgGIIAAAAJ&hl=en")
time.sleep(3)
driver.find_element_by_id("gsc_bpf_more").click()
time.sleep(4)
soup = BeautifulSoup(driver.page_source, 'html.parser')
papers = soup.find_all('tr', {'class': 'gsc_a_tr'})
for paper in papers:
title = paper.find('a', {'class': 'gsc_a_at'}).text
author = paper.find('div', {'class': 'gs_gray'}).text
journal = [a.text for a in paper.select("td:nth-child(1) > div:nth-child(3)")]
print('Paper Title:', title, '\nAuthor:', author, '\nJournal:', journal)
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.page_load_strategy = 'normal'
driver = webdriver.Chrome(options=options)
driver.get("https://scholar.google.com/citations?user=TBcgGIIAAAAJ&hl=en")
# Awkward method
# Loading all available articles and then iterating over them
for i in range(1, 3):
driver.find_element_by_css_selector('#gsc_bpf_more').click()
# waits until elements are loaded
time.sleep(3)
# Container where all data located
for result in driver.find_elements_by_css_selector('#gsc_a_b .gsc_a_t'):
title = result.find_element_by_css_selector('.gsc_a_at').text
authors = result.find_element_by_css_selector('.gsc_a_at+ .gs_gray').text
publication = result.find_element_by_css_selector('.gs_gray+ .gs_gray').text
print(title)
print(authors)
print(publication)
# just for separating purpose
print()
Part of the output:
Tax/subsidy policies in the presence of environmentally aware consumers
S Bansal, S Gangopadhyay
Journal of Environmental Economics and Management 45 (2), 333-355
Choice and design of regulatory instruments in the presence of green consumers
S Bansal
Resource and Energy economics 30 (3), 345-368
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.