繁体   English   中英

使用 Selenium 抓取多个页面

[英]Scrape multiple pages with selenium

如何使用 Selenium 抓取多个页面?我尝试通过点击按钮来抓取多个页面,但程序报错了。有没有人可以分享解决方法?页面链接如下:https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx

import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Chrome launch configuration: stable, fixed-size, extension-free session.
options = webdriver.ChromeOptions()

# options.add_argument("--headless")
for flag in (
    "--no-sandbox",
    "--disable-gpu",
    "--window-size=1920x1080",
    "--disable-extensions",
):
    options.add_argument(flag)

# webdriver-manager downloads a chromedriver binary matching the local Chrome.
chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)

# Collected lawyer-profile URLs (module-level, as in the original script).
productlink = []


def supplyvan_scraper():
    """Scrape lawyer profiles from ifep.ro and print their details.

    Phase 1 walks listing pages 1-5, collecting profile links into the
    module-level ``productlink`` list; phase 2 visits each collected link
    and prints the title plus four detail fields.

    Fixes over the original:
      * ``find_element(s)_by_xpath`` was removed in Selenium 4 — use
        ``find_element(s)(By.XPATH, ...)`` instead (this was the error
        the original code raised).
      * The pager button only exists on the listing page, so all paging
        is done *before* navigating to individual profile pages (the
        original clicked the pager while on a profile page).
    """
    with chrome_driver as driver:
        driver.implicitly_wait(15)
        url = 'https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx'
        driver.get(url)
        time.sleep(3)

        # Phase 1: collect profile links from listing pages 1..5.
        for k in range(1, 6):
            links = driver.find_elements(By.XPATH, "//div[@class='list-group']//a")
            for link in links:
                link_href = link.get_attribute("href")
                # get_attribute may return None for anchors without href.
                if link_href and link_href.startswith("https://www.ifep.ro/"):
                    productlink.append(link_href)
            if k < 5:
                # Jump to the next listing page; the pager renders one
                # NavToPage<N> element per page number.
                WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located(
                        (By.ID, f"MainContent_PagerTop_NavToPage{k + 1}"))
                ).click()
                time.sleep(2)  # let the next listing page render

        # Phase 2: visit every collected profile and print its details.
        for product in productlink:
            driver.get(product)
            time.sleep(2)
            title = driver.find_element(
                By.CSS_SELECTOR, '#HeadingContent_lblTitle').text
            d1 = driver.find_element(
                By.XPATH, "//div[@class='col-md-10']//p[1]").text.strip()
            d2 = driver.find_element(
                By.XPATH, "//div[@class='col-md-10']//p[2]").text.strip()
            d3 = driver.find_element(
                By.XPATH, "//div[@class='col-md-10']//p[3]//span").text.strip()
            d4 = driver.find_element(
                By.XPATH, "//div[@class='col-md-10']//p[4]").text.strip()
            print(title, d1, d2, d3, d4)

        driver.quit()


supplyvan_scraper()

你有一些错误:

  1. 您在遍历链接的循环内部点击了(下一页)按钮。
  2. 访问完所有链接后,才需要回到列表页点击下一页。如果您访问了 15 个链接,就需要后退 15 次,或者先保存列表页的 URL,再直接跳转回该 URL。

更好的解决方案:

  1. 仅通过单击下一页来抓取所有链接。
  2. 访问所有链接,抓取所有数据并打印它们。

在这里,您有一些与playwright一起编写的代码:

import time
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    # Launch a visible WebKit browser for the scrape.
    browser = p.webkit.launch(headless=False)
    baseurl = "https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx"
    page = browser.new_page()
    page.goto(baseurl)

    # Pass 1: walk the pager and collect every lawyer-profile URL.
    productlinks = []
    for k in range(1, 10):
        anchors = page.query_selector_all("//div[@class='list-group']//a")
        for anchor in anchors:
            href = anchor.get_attribute("href")
            if href.startswith("LawyerFile.aspx"):
                productlinks.append("https://www.ifep.ro/justice/lawyers/" + href)
        page.wait_for_selector("#MainContent_PagerTop_NavNext").click()
        time.sleep(2)  # wait for load the page

    # Pass 2: open each collected profile and print its details.
    for product in productlinks:
        page.goto(product)

        def field_text(selector):
            # Wait for the element, then return its stripped text content.
            return page.wait_for_selector(selector).text_content().strip()

        title = page.wait_for_selector('#HeadingContent_lblTitle').text_content()
        d1 = field_text("//div[@class='col-md-10']//p[1]")
        d2 = field_text("//div[@class='col-md-10']//p[2]")
        d3 = field_text("//div[@class='col-md-10']//p[3]//span")
        d4 = field_text("//div[@class='col-md-10']//p[4]")
        print(title, d1, d2, d3, d4)
    browser.close()

输出:

ILIE Marius-Constantin, Baroul Ilfov Avocat Definitiv, Baroul Ilfov Dată înscriere: 14-03-2011 ACTIV Instanţe cu drept de concluzii: Toate instanţele
DIN GEORGIANA-CLAUDIA, Baroul Bucureşti Avocat Definitiv, Baroul Bucureşti Dată înscriere: 15-05-2008 ACTIV Instanţe cu drept de concluzii: Toate instanţele
MOLDANSCHI ANDREEA-IOANA, Baroul Bucureşti Avocat Stagiar, Baroul Bucureşti Dată înscriere: 30-05-2022 ACTIV Instanţe cu drept de concluzii: Judecătorii

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM