[英]Creating Multiple Instances of a Selenium Scraper Class and running the in Parallel
因此,我用硒创建了一个网络抓取器,可以无限地抓取一个网页。 我正在尝试创建此刮板的两个实例并并行运行它们,以便将站点的两个不同部分(或整个两个不同的站点)同时刮掉。 使用我当前的代码,两个进程都启动,两个chrome实例启动,但是实际上只有一个开始抓取。 另一个仅位于目标网页上,永不移动。 我目前的刮板课程看起来像这样
class clBot(Scraper):
def __init__(self, light_or_dark):
light_side_xpaths = ['//*[@id="hhh"]/h4/a', '//*[@id="sss"]/h4/a/', '//*[@id="jjj"]/h4/a',
'//*[@id="bbb"]/h4/a', '//*[@id="ggg"]/h4/a']
dark_side_xpaths = ['//*[@id="ccc"]/h4/a', '//*[@id="ppp"]/h4', '//*[@id="forums"]/h4/a']
if light_or_dark == "light":
self.xpaths_to_scrape = light_side_xpaths
self.csv_file = "lightside.csv"
elif light_or_dark == "dark":
self.xpaths_to_scrape = dark_side_xpaths
self.csv_file = "darkside.csv"
else:
print('Incorrect variable entered. Please enter "light" or "dark" when initializing this class')
quit()
self.user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
self.options = webdriver.ChromeOptions()
#self.options.add_argument('--headless')
self.options.add_argument('user-agent={self.user_agent}')
self.current_region = ''
self.driver = webdriver.Chrome(chrome_options=self.options)
self.driver.get('https://craigslist.org')
def run(self):
self.navigate_pages()
def identify_phone_number(self, string, phone_number_list):
reg = re.findall(".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", string)
if len(reg) > 0:
for r in reg:
if r.strip() not in phone_number_list:
with open(self.csv_file, 'a') as csv:
csv.write("{}\n".format(r.strip()))
print("Extracted {} from listing".format(r.strip()))
else:
print('Phone number already in list.')
def extract_phone_number(self):
try:
with open(self.csv_file, 'r') as csv:
current_phone_numbers = csv.read()
posting_body = self.driver.find_element_by_id('postingbody')
self.scraper_wait_class_until_all(self.driver, 'showcontact', seconds_to_wait=5)
contact_info = self.driver.find_element_by_class_name('showcontact')
contact_info.click()
time.sleep(1)
self.identify_phone_number(posting_body.text, current_phone_numbers)
except TimeoutException:
self.identify_phone_number(posting_body.text, current_phone_numbers)
print('There is no phone number in this listing.')
def scrape_pages(self):
i=1
while True:
try:
self.scraper_wait_class_until_all(self.driver, 'result-row')
results = self.driver.find_elements_by_class_name('result-row')
print("clicking result {}".format(i))
results[i].find_element_by_class_name('result-title').click()
self.extract_phone_number()
self.driver.back()
i+=1
except IndexError:
self.scraper_wait_xpath_until_any(self.driver, '//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
next_button = self.driver.find_element_by_xpath('//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
print('Navigating to next page.')
next_button.click()
i=1
def choose_xpath_to_scrape(self, list_of_xpaths):
xpath_index = randint(0, len(list_of_xpaths)-1)
xpath = list_of_xpaths[xpath_index]
return xpath
def navigate_pages(self):
try:
while True:
try:
self.scraper_wait_xpath_until_any(self.driver, '//*[@id="rightbar"]')
rightbar = self.driver.find_element_by_xpath('//*[@id="rightbar"]')
nearby_cl = rightbar.find_element_by_xpath('//*[@id="rightbar"]/ul/li[1]')
child_items = nearby_cl.find_elements_by_class_name('s')
random = randint(1, len(child_items)-1)
time.sleep(3)
print("Clicking {}".format(child_items[random].text))
child_items[random].click()
for xpath in self.xpaths_to_scrape:
area_to_scrape = self.driver.find_element_by_xpath(self.choose_xpath_to_scrape(self.xpaths_to_scrape))
area_to_scrape.click()
self.scrape_pages()
self.driver.back()
time.sleep(1)
except WebDriverException:
continue
except Exception as e:
print(e)
return
finally:
self.driver.quit()
打开两个进程并对其进行初始化的main.py文件如下:
import scraper
from multiprocessing import Process, Manager
if __name__ == "__main__":
manager = Manager()
d = manager.dict()
l = manager.list(range(10))
darksideScraper = scraper.clBot('light')
lightsideScraper = scraper.clBot('dark')
darkside = Process(target=darksideScraper.navigate_pages())
lightside = Process(target=lightsideScraper.navigate_pages())
darkside.start()
lightside.start()
darkside.join()
lightside.join()
任何帮助,将不胜感激!
尝试将目标作为函数的引用而不是调用它,例如Process(target=darksideScraper.navigate_pages)
。 有关如何使用多重处理的另一个示例,也请参考此内容 。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.