
Is there a way to optimize the for loop? Selenium is taking too long to scrape 38 pages

I'm trying to scrape https://arxiv.org/search/?query=healthcare&searchtype=all with Selenium and Python. The for loop takes too long to execute. I tried scraping with headless browsers and PhantomJS, but they don't scrape the abstract field (I need the abstract expanded, which requires clicking the "More" button).

import pandas as pd
import selenium
import re
import time
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Firefox

browser = Firefox()
url_healthcare = 'https://arxiv.org/search/?query=healthcare&searchtype=all'
browser.get(url_healthcare)

dfs = []
for i in range(1, 39):
    articles = browser.find_elements_by_css_selector('li[class="arxiv-result"]')

    for article in articles:
        title = article.find_element_by_css_selector('p[class="title is-5 mathjax"]').text
        arxiv_id = article.find_element_by_tag_name('a').text.replace('arXiv:','')
        arxiv_link = article.find_elements_by_tag_name('a')[0].get_attribute('href') 
        pdf_link = article.find_elements_by_tag_name('a')[1].get_attribute('href')
        authors = article.find_element_by_css_selector('p[class="authors"]').text.replace('Authors:','')

        try:
            link1 = article.find_element_by_link_text('▽ More')
            link1.click()
        except NoSuchElementException:
            pass

        abstract = article.find_element_by_css_selector('p[class="abstract mathjax"]').text
        date = article.find_element_by_css_selector('p[class="is-size-7"]').text
        date = re.split(r"Submitted|;",date)[1]
        tag = article.find_element_by_css_selector('div[class="tags is-inline-block"]').text.replace('\n', ',')
        
        try:
            doi = article.find_element_by_css_selector('div[class="tags has-addons"]').text
            doi = re.split(r'\s', doi)[1] 
        except NoSuchElementException:
            doi = 'None'

        all_combined = [title, arxiv_id, arxiv_link, pdf_link, authors, abstract, date, tag, doi]
        dfs.append(all_combined)

    print('Finished Extracting Page:', i)

    try:
        link2 = browser.find_element_by_class_name('pagination-next')
        link2.click()
    except NoSuchElementException:
        browser.close()
        
    time.sleep(0.1)


The following implementation achieves this in 16 seconds.

To speed up the execution process I have taken the following measures:

  • Removed Selenium entirely (no clicking required)
  • For the abstract, used BeautifulSoup's output and processed it later
  • Added multiprocessing to speed up the process significantly

from multiprocessing import Process, Manager
import requests 
from bs4 import BeautifulSoup
import re
import time

start_time = time.time()

def get_no_of_pages(showing_text):
    no_of_results = int((re.findall(r"(\d+,*\d+) results for all",showing_text)[0].replace(',','')))
    pages = no_of_results//200 + 1
    print("total pages:",pages)
    return pages 

def clean(text):
    return text.replace("\n", '').replace("  ",'')

def get_data_from_page(url,page_number,data):
    print("getting page",page_number)
    response = requests.get(url+"start="+str(page_number*200))
    soup = BeautifulSoup(response.content, "lxml")
    
    arxiv_results = soup.find_all("li", {"class": "arxiv-result"})

    for arxiv_result in arxiv_results:
        paper = {} 
        paper["titles"]= clean(arxiv_result.find("p",{"class","title is-5 mathjax"}).text)
        links = arxiv_result.find_all("a")
        paper["arxiv_ids"]= links[0].text.replace('arXiv:','')
        paper["arxiv_links"]= links[0].get('href')
        paper["pdf_link"]= links[1].get('href')
        paper["authors"]= clean(arxiv_result.find("p",{"class","authors"}).text.replace('Authors:',''))

        split_abstract = arxiv_result.find("p",{"class":"abstract mathjax"}).text.split("▽ More\n\n\n",1)
        if len(split_abstract) == 2:
            paper["abstract"] = clean(split_abstract[1].replace("△ Less",''))
        else: 
            paper["abstract"] = clean(split_abstract[0].replace("△ Less",''))

        paper["date"] = re.split(r"Submitted|;",arxiv_results[0].find("p",{"class":"is-size-7"}).text)[1]
        paper["tag"] = clean(arxiv_results[0].find("div",{"class":"tags is-inline-block"}).text) 
        doi = arxiv_results[0].find("div",{"class":"tags has-addons"})       
        if doi is None:
            paper["doi"] = "None"
        else:
            paper["doi"] = re.split(r'\s', doi.text)[1] 

        data.append(paper)
    
    print(f"page {page_number} done")


if __name__ == "__main__":
    url = 'https://arxiv.org/search/?searchtype=all&query=healthcare&abstracts=show&size=200&order=-announced_date_first&'

    response = requests.get(url+"start=0")
    soup = BeautifulSoup(response.content, "lxml")

    with Manager() as manager:
        data = manager.list()  
        processes = []
        get_data_from_page(url,0,data)


        showing_text = soup.find("h1",{"class":"title is-clearfix"}).text
        for i in range(1,get_no_of_pages(showing_text)):
            p = Process(target=get_data_from_page, args=(url,i,data))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()

        print("Number of entires scraped:",len(data))

        stop_time = time.time()

        print("Time taken:", stop_time-start_time,"seconds")

Output:

$ python test.py
getting page 0
page 0 done
total pages: 10
getting page 1
getting page 4
getting page 2
getting page 6
getting page 5
getting page 3
getting page 7
getting page 9
getting page 8
page 9 done
page 4 done
page 1 done
page 6 done
page 2 done
page 7 done
page 3 done
page 5 done
page 8 done
Number of entries scraped: 1890
Time taken: 15.911492586135864 seconds

Note:

  • Please write the above code in a .py file; running it from a Jupyter notebook needs a different multiprocessing setup.
  • The multiprocessing pattern (Process + Manager) is adapted from an external example.
  • The ordering of entries in the data list won't match the ordering on the website, since each process appends its dictionaries to the Manager list as it finishes; see the sketch after this list for one way to restore the order.
  • The above code finds the number of pages on its own and is therefore generalized to work on any arXiv search result. Unfortunately, to do this it first scrapes page 0, then calculates the number of pages, and only then starts multiprocessing for the remaining pages. The disadvantage is that while page 0 is being processed, no other worker is running; if you remove that part and simply run the loop for all 10 pages, the time taken should fall to around 8 seconds.
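To address the last two notes together, here is a minimal sketch (not part of the original answer) of an alternative __main__ block: it makes one request just to read the result count instead of scraping page 0 serially, dispatches every page (including page 0) to a worker process, and then sorts the collected dictionaries back into site order before building the pandas DataFrame the question was after. It assumes get_data_from_page above is extended with one extra line inside its loop, paper["page"] = page_number, so each record remembers which page it came from.

import pandas as pd

if __name__ == "__main__":
    url = 'https://arxiv.org/search/?searchtype=all&query=healthcare&abstracts=show&size=200&order=-announced_date_first&'

    # One request just to read the result count; page 0 is not scraped here,
    # so every page can be handed to a worker process from the start.
    first_page = BeautifulSoup(requests.get(url + "start=0").content, "lxml")
    pages = get_no_of_pages(first_page.find("h1", {"class": "title is-clearfix"}).text)

    with Manager() as manager:
        data = manager.list()
        processes = [Process(target=get_data_from_page, args=(url, i, data))
                     for i in range(pages)]
        for p in processes:
            p.start()
        for p in processes:
            p.join()

        # sorted() is stable, so sorting by page number alone restores the
        # website's ordering (each worker appends its own page in order).
        # Assumes get_data_from_page stores paper["page"] as described above.
        df = pd.DataFrame(sorted(list(data), key=lambda paper: paper["page"]))
        print(df.shape)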

You can try a requests and BeautifulSoup approach; there is no need to click the "More" link.

from requests import get
from bs4 import BeautifulSoup

# you can increase the size parameter (e.g. size=200, as used in the answer above) to retrieve more results per request.

url = 'https://arxiv.org/search/?query=healthcare&searchtype=all&abstracts=show&order=-announced_date_first&size=50&start=0'
response = get(url, verify=False)
soup = BeautifulSoup(response.content, "lxml")
#print(soup)
queryresults = soup.find_all("li", attrs={"class": "arxiv-result"})

for result in queryresults:
    title = result.find("p",attrs={"class": "title is-5 mathjax"})
    print(title.text)

# If you need the full abstract content, try this (you do not need to click on the more button):
for result in queryresults:
    abstractFullContent = result.find("span", attrs={"class": "abstract-full has-text-grey-dark mathjax"})
    print(abstractFullContent.text)

Output:

Interpretable Deep Learning for Automatic Diagnosis of 12-lead Electrocardiogram
Leveraging Technology for Healthcare and Retaining Access to Personal Health Data to Enhance Personal Health and Well-being
Towards new forms of particle sensing and manipulation and 3D imaging on a smartphone for healthcare applications
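
The snippet above only covers the first 50 results. As a rough sketch along the same lines (reusing the size/start parameters and CSS classes from the code above; the stopping condition of an empty result page is an assumption), the start offset can be stepped in increments of size to walk through all result pages without Selenium:

from requests import get
from bs4 import BeautifulSoup

# Page through every result by stepping the start offset in increments of size.
url_template = 'https://arxiv.org/search/?query=healthcare&searchtype=all&abstracts=show&order=-announced_date_first&size=50&start={}'

titles = []
start = 0
while True:
    soup = BeautifulSoup(get(url_template.format(start)).content, "lxml")
    results = soup.find_all("li", attrs={"class": "arxiv-result"})
    if not results:  # an empty page means we have walked past the last result
        break
    for result in results:
        titles.append(result.find("p", attrs={"class": "title is-5 mathjax"}).text.strip())
    start += 50

print(len(titles), "titles scraped")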
