I'm trying to scrape https://arxiv.org/search/?query=healthcare&searchtype=all with Selenium and Python. The for loop takes too long to execute. I tried scraping with headless browsers and PhantomJS, but they don't scrape the abstract field (I need the abstract field expanded, i.e. with the "More" button clicked).
# Selenium scraper for arXiv "healthcare" search results.
# Bug fixes vs. the original:
#   * CSS selectors ('li[class="..."]') were passed to find_element(s)_by_tag_name,
#     which matches on tag name only and never finds anything — they must go
#     through find_element(s)_by_css_selector.
#   * The '▽ More' link was searched on `browser` (whole page) instead of the
#     current `article`, so the wrong abstract could be expanded.
#   * `browser.close` was missing the call parentheses, so the browser never closed.
#   * Bare `except:` clauses narrowed to NoSuchElementException.
import pandas as pd
import selenium
import re
import time
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Firefox

browser = Firefox()
url_healthcare = 'https://arxiv.org/search/?query=healthcare&searchtype=all'
browser.get(url_healthcare)

dfs = []  # one row per article: [title, id, link, pdf, authors, abstract, date, tag, doi]
for i in range(1, 39):
    articles = browser.find_elements_by_css_selector('li[class="arxiv-result"]')
    for article in articles:
        title = article.find_element_by_css_selector('p[class="title is-5 mathjax"]').text
        links = article.find_elements_by_tag_name('a')
        arxiv_id = links[0].text.replace('arXiv:', '')
        arxiv_link = links[0].get_attribute('href')
        pdf_link = links[1].get_attribute('href')
        authors = article.find_element_by_css_selector('p[class="authors"]').text.replace('Authors:', '')
        # Expand the abstract of THIS article before reading it; some articles
        # have no 'More' link (short abstracts), which is fine.
        try:
            article.find_element_by_link_text('▽ More').click()
        except NoSuchElementException:
            pass
        abstract = article.find_element_by_css_selector('p[class="abstract mathjax"]').text
        date = article.find_element_by_css_selector('p[class="is-size-7"]').text
        date = re.split(r"Submitted|;", date)[1]
        tag = article.find_element_by_css_selector('div[class="tags is-inline-block"]').text.replace('\n', ',')
        # DOI tags are optional — default to the string 'None' when absent.
        try:
            doi = article.find_element_by_css_selector('div[class="tags has-addons"]').text
            doi = re.split(r'\s', doi)[1]
        except NoSuchElementException:
            doi = 'None'
        dfs.append([title, arxiv_id, arxiv_link, pdf_link, authors, abstract, date, tag, doi])
    print('Finished Extracting Page:', i)
    # Advance to the next results page; stop cleanly when there is none.
    try:
        browser.find_element_by_class_name('pagination-next').click()
    except NoSuchElementException:
        browser.close()  # fix: `browser.close` without () was a no-op
        break
    time.sleep(0.1)  # give the next page a moment to load
The following implementation achieves this in 16 seconds.
To speed up the execution process I have taken the following measures:
- Dropped Selenium entirely (no clicking required) — the expanded abstract is already present in the page source, so BeautifulSoup can read it directly.
- Saved BeautifulSoup's output and processed it later.
- Used multiprocessing to speed up the process significantly.
from multiprocessing import Process, Manager
import requests
from bs4 import BeautifulSoup
import re
import time
start_time = time.time()
def get_no_of_pages(showing_text):
    """Parse the arXiv results header (e.g. "... 1,890 results for all ...")
    and return the number of 200-result pages needed to cover them.

    Fixes vs. the original:
      * regex `\d+,*\d+` required at least two digits, so a single-digit
        result count ("5 results for all") raised IndexError; `\d[\d,]*`
        also matches one digit.
      * `// 200 + 1` produced a spurious empty page when the count was an
        exact multiple of 200; ceiling division does not.
    """
    no_of_results = int(
        re.findall(r"(\d[\d,]*) results for all", showing_text)[0].replace(',', '')
    )
    pages = -(-no_of_results // 200)  # ceiling division without math.ceil
    print("total pages:", pages)
    return pages
def clean(text):
    """Return *text* with newline and space characters stripped out."""
    for junk in ("\n", " "):
        text = text.replace(junk, '')
    return text
def get_data_from_page(url, page_number, data):
    """Fetch one 200-result arXiv search page and append one dict per paper to *data*.

    Parameters:
        url: search URL ending in '&', so that 'start=N' can be appended directly.
        page_number: zero-based page index (each page holds 200 results).
        data: shared list (e.g. a multiprocessing.Manager().list()) that
              collects the per-paper dicts across worker processes.
    """
    print("getting page", page_number)
    response = requests.get(url + "start=" + str(page_number * 200))
    soup = BeautifulSoup(response.content, "lxml")
    # Fix: bs4's attrs argument must be a dict; the original passed set
    # literals like {"class", "arxiv-result"} in several places.
    arxiv_results = soup.find_all("li", {"class": "arxiv-result"})
    for arxiv_result in arxiv_results:
        paper = {}
        paper["titles"] = clean(arxiv_result.find("p", {"class": "title is-5 mathjax"}).text)
        links = arxiv_result.find_all("a")
        paper["arxiv_ids"] = links[0].text.replace('arXiv:', '')
        paper["arxiv_links"] = links[0].get('href')
        paper["pdf_link"] = links[1].get('href')
        paper["authors"] = clean(arxiv_result.find("p", {"class": "authors"}).text.replace('Authors:', ''))
        # The expanded abstract follows the '▽ More' marker when one exists;
        # short abstracts have no marker and the whole text is the abstract.
        split_abstract = arxiv_result.find("p", {"class": "abstract mathjax"}).text.split("▽ More\n\n\n", 1)
        if len(split_abstract) == 2:
            paper["abstract"] = clean(split_abstract[1].replace("△ Less", ''))
        else:
            paper["abstract"] = clean(split_abstract[0].replace("△ Less", ''))
        # Fix: the original indexed arxiv_results[0] for the three fields
        # below, so EVERY paper got the first result's date/tag/doi.
        paper["date"] = re.split(r"Submitted|;", arxiv_result.find("p", {"class": "is-size-7"}).text)[1]
        paper["tag"] = clean(arxiv_result.find("div", {"class": "tags is-inline-block"}).text)
        doi = arxiv_result.find("div", {"class": "tags has-addons"})
        if doi is None:
            paper["doi"] = "None"
        else:
            paper["doi"] = re.split(r'\s', doi.text)[1]
        data.append(paper)
    print(f"page {page_number} done")
if __name__ == "__main__":
    # Trailing '&' lets get_data_from_page append 'start=N' directly.
    url = ('https://arxiv.org/search/?searchtype=all&query=healthcare'
           '&abstracts=show&size=200&order=-announced_date_first&')
    # One probe request just to read the "N results" header.
    response = requests.get(url + "start=0")
    soup = BeautifulSoup(response.content, "lxml")
    showing_text = soup.find("h1", {"class": "title is-clearfix"}).text
    with Manager() as manager:
        data = manager.list()  # shared across worker processes
        processes = []
        # Improvement over the original: page 0 is scraped by a worker like
        # every other page instead of serially up front (the serial page-0
        # pass left all other processes idle and cost ~8 seconds).
        for i in range(get_no_of_pages(showing_text)):
            p = Process(target=get_data_from_page, args=(url, i, data))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
        # Fix: output string typo 'entires' -> 'entries'.
        print("Number of entries scraped:", len(data))
        stop_time = time.time()
        print("Time taken:", stop_time - start_time, "seconds")
Output:
>>> python test.py
getting page 0
page 0 done
total pages: 10
getting page 1
getting page 4
getting page 2
getting page 6
getting page 5
getting page 3
getting page 7
getting page 9
getting page 8
page 9 done
page 4 done
page 1 done
page 6 done
page 2 done
page 7 done
page 3 done
page 5 done
page 8 done
Number of entires scraped: 1890
Time taken: 15.911492586135864 seconds
Note:
- The above code is meant to be run as a .py file. For a Jupyter notebook, refer to the notebook-specific multiprocessing guidance.
- The ordering of the data list won't match the ordering on the website, as Manager will append dictionaries into it as they come.
- The code gets page 0 first, then calculates the number of pages, and then goes multiprocess for the remaining pages. This has the disadvantage that while the 0th page was being worked on, no other process was running. So if you remove that part and simply run the loop for all 10 pages, the time taken should fall to around 8 seconds.
You can also try the requests and BeautifulSoup approach. No need to click the "More" link.
# Simple requests + BeautifulSoup scraper — no Selenium, no clicking needed:
# the expanded abstract is already present in the page source.
from requests import get
from bs4 import BeautifulSoup

# You can change `size` to retrieve more results in one shot.
url = ('https://arxiv.org/search/?query=healthcare&searchtype=all'
       '&abstracts=show&order=-announced_date_first&size=50&start=0')
# Fix: the original passed verify=False, disabling TLS certificate
# verification — never do that for a public HTTPS endpoint.
response = get(url)
soup = BeautifulSoup(response.content, "lxml")

queryresults = soup.find_all("li", attrs={"class": "arxiv-result"})

for result in queryresults:
    title = result.find("p", attrs={"class": "title is-5 mathjax"})
    if title is not None:  # find() returns None when the tag is missing
        print(title.text)

# Full abstract content — no need to click the 'More' button: the page
# already ships it in this hidden <span>.
for result in queryresults:
    abstract_full = result.find("span", attrs={"class": "abstract-full has-text-grey-dark mathjax"})
    if abstract_full is not None:
        print(abstract_full.text)
Output:
Interpretable Deep Learning for Automatic Diagnosis of 12-lead Electrocardiogram
Leveraging Technology for Healthcare and Retaining Access to Personal Health Data to Enhance Personal Health and Well-being
Towards new forms of particle sensing and manipulation and 3D imaging on a smartphone for healthcare applications
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.