
Multithreading with requests-html

After a couple of weeks of trying to figure this out on my own, I've given up. Please be aware that I'm using requests_html and not requests, since the pages I'm trying to scrape are JS-rendered. The script runs fine without multithreading, but it's painfully slow. When I add multithreading, I get the following error when I run the script:

Exception in thread Thread-1:
Traceback (most recent call last):
  File "selena_multi.py", line 79, in selena_parse
    r.html.render()
  File "/home/qorka/.local/lib/python3.6/site-packages/requests_html.py", line 572, in render
    self.session.browser  # Automatycally create a event loop and browser
  File "/home/qorka/.local/lib/python3.6/site-packages/requests_html.py", line 679, in browser
    self.loop = asyncio.get_event_loop()
  File "/usr/lib/python3.6/asyncio/events.py", line 694, in get_event_loop
    return get_event_loop_policy().get_event_loop()
  File "/usr/lib/python3.6/asyncio/events.py", line 602, in get_event_loop
    % threading.current_thread().name)
RuntimeError: There is no current event loop in thread 'Thread-1'.

Here's the script:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from requests_html import HTMLSession
import time, sys, re, json, threading


class Selena:
    def __init__(self):
        global shoe_names

        self.shoe_model = {}
        self.reduced_specs_list = {}
        self.shoe_specs_individual = []
        self.master_list = []
        self.specs_master_list = []
        self.counter = 0
        self.shoe_names = []


    def selena_main(self):
        # selenium params
        chrome_options = Options()
        chrome_options.add_argument("--headless")

        driver = webdriver.Chrome(options=chrome_options)
        driver.get("http://www.goat.com/sneakers")

        # # click 'see more'
        # click_error_count = 0
        # while True:
        #     try:
        #         see_more = driver.find_element_by_xpath('//*[@id="root"]/div/div/div[2]/div[1]/div[2]/div[2]/div/div[2]/button/span').click()
        #         time.sleep(1)

        #     except Exception as e:
        #         if click_error_count == 3:
        #             break
        #         print("\nFind element exited.\n{}\n".format(e))
        #         click_error_count += 1
        #         time.sleep(3)
        #         pass

        # get the html & find the links
        goat_html = driver.page_source
        shoes = re.findall('<a class="cell" (.+?)</a>', goat_html, re.S)
        shoe_string = " ".join(shoes)
        self.shoe_names = re.findall('truncate">(.+?)</p>', shoe_string, re.S)

        # convert shoes list to string and find links
        links = re.findall('href="(.+?)"', shoe_string, re.S)

        if len(links) == 0 or len(links) == 12:
            print(len(links))
            print("No links returned.")
            return

        print("{} links found.".format(len(links)))

        threads = [threading.Thread(target=self.selena_parse, args=(l,)) for l in links]
        for thr in threads:
            thr.start()

        for thr in threads:
            thr.join()


    def selena_parse(self, links):
        print(links)

        try:
            self.shoe_model.clear()
            del self.shoe_specs_individual[:]
            self.reduced_specs_list.clear()
            session = HTMLSession()
            link_string = 'https://www.goat.com{}/available-sizes'.format(links)
            r = session.get(link_string, stream=True, timeout=3)
            r.raise_for_status()
            r.html.render()

            goat2 = r.html.find("#root", first=True)
            goat2_shoe_data = re.findall('fXQURg">(.+?)</div>', goat2.html, re.S)
            goat2_other_data = re.findall('hUbwah">(.+?)</span>', goat2.html, re.S)

            # get brand
            goat2_brands = re.findall('"nutr-link-brand"(.+?)/a>', goat2.html, re.S)
            goat2_brand = re.findall('>(.+?)<', goat2_brands[0], re.S)

            # get sku from url
            goat2_dash_sep = [u for u in link_string.split('-')]
            goat2_dash_sep_pt2 = goat2_dash_sep[-2].split('/')
            sku = goat2_dash_sep_pt2[0]

            # get image url
            goat2_imgs = re.findall('<img(.+?)style', goat2.html, re.S)
            goat2_img_url = re.findall('src="(.+?)"', goat2_imgs[0], re.S)
            img = goat2_img_url[0]

            for n in range(len(goat2_shoe_data) - 1):
                if n % 2 == 0:
                    g2_cleaned = re.findall('<span>(.+?)</span>', goat2_shoe_data[n+1], re.S)
                    self.shoe_specs_individual.append({goat2_shoe_data[n]: g2_cleaned[0]})

            for new in self.shoe_specs_individual:
                self.reduced_specs_list.update(new)

            self.specs_master_list.append(self.reduced_specs_list)

            self.shoe_model.update({"id": self.counter, "url": link_string,
            "style": sku, "name": self.shoe_names[self.counter],
            "release_date": goat2_other_data[0], "color": goat2_other_data[1], "brand": goat2_brand[0],
            "nickname": goat2_other_data[2],
            "image_url": img, "specs": self.specs_master_list[self.counter]})

            self.master_list.append(json.loads(json.dumps(self.shoe_model)))

            self.selena_dump()
            r.close()

        except Exception as e:
            print("No good on {}.\nError: {}.\n".format(links, e))
            time.sleep(1)



    def selena_dump(self):
        # dump into json
        all_shoes = {"shoes": self.master_list}
        with open('goat.json', 'w') as output:
            o = json.dumps(all_shoes, indent=4)
            output.write(o)


if __name__ == '__main__':
    with open('goat.json', 'w') as new_file:
        pass

    sel = Selena()
    sel.selena_main()

According to this issue on the project's GitHub page (https://github.com/psf/requests-html/issues/155), requests_html does not work when run across multiple threads. The author suggests using AsyncHTMLSession instead of HTMLSession.
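
For what it's worth, here is a minimal sketch of what that change could look like with the GOAT URLs above. The example links and the fetch helper are placeholders I made up, not part of the original script; arender() is the asynchronous counterpart of render():

from requests_html import AsyncHTMLSession

asession = AsyncHTMLSession()

# hypothetical relative links; in the real script these come from the
# regex in selena_main()
links = ['/sneakers/example-shoe-1', '/sneakers/example-shoe-2']

async def fetch(link):
    # same URL pattern as selena_parse(), but awaited instead of run in a thread
    url = 'https://www.goat.com{}/available-sizes'.format(link)
    r = await asession.get(url, timeout=3)
    r.raise_for_status()
    await r.html.arender()  # async counterpart of r.html.render()
    return r

# run() takes callables that each return a coroutine; the default argument
# binds a different link to each callable
results = asession.run(*[lambda l=link: fetch(l) for link in links])

for r in results:
    root = r.html.find('#root', first=True)
    # ...parse root.html with the same regexes as in selena_parse()...

This sidesteps the error because everything runs on a single event loop in the main thread, instead of asyncio trying (and failing) to find an event loop inside each worker thread.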
