繁体   English   中英

创建Selenium Scraper类的多个实例并并行运行

[英]Creating Multiple Instances of a Selenium Scraper Class and running the in Parallel

因此,我用硒创建了一个网络抓取器,可以无限地抓取一个网页。 我正在尝试创建此刮板的两个实例并并行运行它们,以便将站点的两个不同部分(或整个两个不同的站点)同时刮掉。 使用我当前的代码,两个进程都启动,两个chrome实例启动,但是实际上只有一个开始抓取。 另一个仅位于目标网页上,永不移动。 我目前的刮板课程看起来像这样

class clBot(Scraper):

def __init__(self, light_or_dark):
    light_side_xpaths = ['//*[@id="hhh"]/h4/a', '//*[@id="sss"]/h4/a/', '//*[@id="jjj"]/h4/a',
                              '//*[@id="bbb"]/h4/a', '//*[@id="ggg"]/h4/a']
    dark_side_xpaths = ['//*[@id="ccc"]/h4/a', '//*[@id="ppp"]/h4', '//*[@id="forums"]/h4/a']
    if light_or_dark == "light":
        self.xpaths_to_scrape = light_side_xpaths
        self.csv_file = "lightside.csv"
    elif light_or_dark == "dark":
        self.xpaths_to_scrape = dark_side_xpaths
        self.csv_file = "darkside.csv"
    else:
        print('Incorrect variable entered. Please enter "light" or "dark" when initializing this class')
        quit()
    self.user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
    self.options = webdriver.ChromeOptions()
    #self.options.add_argument('--headless')
    self.options.add_argument('user-agent={self.user_agent}')
    self.current_region = ''
    self.driver = webdriver.Chrome(chrome_options=self.options)
    self.driver.get('https://craigslist.org')

    def run(self):
        self.navigate_pages()


def identify_phone_number(self, string, phone_number_list):
    reg = re.findall(".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", string)
    if len(reg) > 0:
        for r in reg:
            if r.strip() not in phone_number_list:
                with open(self.csv_file, 'a') as csv:
                    csv.write("{}\n".format(r.strip()))
                print("Extracted {} from listing".format(r.strip()))
            else:
                print('Phone number already in list.')


def extract_phone_number(self):
    try:
        with open(self.csv_file, 'r') as csv:
            current_phone_numbers = csv.read()
        posting_body = self.driver.find_element_by_id('postingbody')
        self.scraper_wait_class_until_all(self.driver, 'showcontact', seconds_to_wait=5)
        contact_info = self.driver.find_element_by_class_name('showcontact')
        contact_info.click()
        time.sleep(1)
        self.identify_phone_number(posting_body.text, current_phone_numbers)
    except TimeoutException:
        self.identify_phone_number(posting_body.text, current_phone_numbers)
        print('There is no phone number in this listing.')



def scrape_pages(self):
    i=1
    while True:
        try:
            self.scraper_wait_class_until_all(self.driver, 'result-row')
            results = self.driver.find_elements_by_class_name('result-row')
            print("clicking result {}".format(i))
            results[i].find_element_by_class_name('result-title').click()
            self.extract_phone_number()
            self.driver.back()
            i+=1
        except IndexError:
            self.scraper_wait_xpath_until_any(self.driver, '//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
            next_button = self.driver.find_element_by_xpath('//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
            print('Navigating to next page.')
            next_button.click()
            i=1

def choose_xpath_to_scrape(self, list_of_xpaths):
    xpath_index = randint(0, len(list_of_xpaths)-1)
    xpath = list_of_xpaths[xpath_index]
    return xpath
def navigate_pages(self):
    try:
        while True:
            try:
                self.scraper_wait_xpath_until_any(self.driver, '//*[@id="rightbar"]')
                rightbar = self.driver.find_element_by_xpath('//*[@id="rightbar"]')
                nearby_cl = rightbar.find_element_by_xpath('//*[@id="rightbar"]/ul/li[1]')
                child_items = nearby_cl.find_elements_by_class_name('s')
                random = randint(1, len(child_items)-1)
                time.sleep(3)
                print("Clicking {}".format(child_items[random].text))
                child_items[random].click()
                for xpath in self.xpaths_to_scrape:
                    area_to_scrape = self.driver.find_element_by_xpath(self.choose_xpath_to_scrape(self.xpaths_to_scrape))
                    area_to_scrape.click()
                    self.scrape_pages()
                    self.driver.back()
                    time.sleep(1)
            except WebDriverException:
                continue
    except Exception as e:
        print(e)
        return
    finally:
        self.driver.quit()

打开两个进程并对其进行初始化的main.py文件如下:

import scraper

from multiprocessing import Process, Manager



if __name__ == "__main__":
    manager = Manager()
    d = manager.dict()
    l = manager.list(range(10))
    darksideScraper = scraper.clBot('light')
    lightsideScraper = scraper.clBot('dark')

    darkside = Process(target=darksideScraper.navigate_pages())
    lightside = Process(target=lightsideScraper.navigate_pages())
    darkside.start()
    lightside.start()
    darkside.join()
    lightside.join()

任何帮助,将不胜感激!

尝试将目标作为函数的引用而不是调用它,例如Process(target=darksideScraper.navigate_pages) 有关如何使用多重处理的另一个示例,也请参考此内容

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM