[英]Scrape multiple pages with selenium
如何使用 Selenium 抓取多个页面?我正在尝试通过单击按钮来抓取多个页面,但运行时向我显示错误。有没有什么方法可以解决?这是页面链接:https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
options = webdriver.ChromeOptions()
# Uncomment to run without a visible browser window.
# options.add_argument("--headless")
for flag in (
    "--no-sandbox",
    "--disable-gpu",
    "--window-size=1920x1080",
    "--disable-extensions",
):
    options.add_argument(flag)
# webdriver-manager downloads a chromedriver matching the installed Chrome.
chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)
productlink = []


def supplyvan_scraper():
    """Scrape lawyer profiles from ifep.ro, paging through the listing.

    First walks the listing pages with the pager's "next" button and
    collects every profile URL into the module-level ``productlink`` list,
    then visits each profile and prints its title and detail paragraphs.

    Side effects: drives the module-level ``chrome_driver``, appends to
    ``productlink``, prints to stdout, and quits the driver when done.
    """
    # webdriver.Chrome is a context manager in Selenium 4: the browser is
    # quit automatically when the block exits (even on error).
    with chrome_driver as driver:
        driver.implicitly_wait(15)
        url = 'https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx'
        driver.get(url)
        time.sleep(3)
        # Harvest links page-by-page BEFORE visiting any profile: the pager
        # buttons only exist on the listing page, so clicking them after
        # driver.get(product) (as the original code did) cannot work.
        for _ in range(1, 5):
            # Selenium 4 removed find_elements_by_xpath; use find_elements + By.
            links = driver.find_elements(By.XPATH, "//div[@class='list-group']//a")
            for link in links:
                link_href = link.get_attribute("href")
                if link_href and link_href.startswith("https://www.ifep.ro/"):
                    productlink.append(link_href)
            # Advance to the next listing page.
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located(
                    (By.ID, "MainContent_PagerTop_NavNext")
                )
            ).click()
            time.sleep(2)  # let the next listing page render
        # Now visit every collected profile and print its details.
        for product in productlink:
            driver.get(product)
            time.sleep(2)
            title = driver.find_element(
                By.CSS_SELECTOR, '#HeadingContent_lblTitle').text
            d1 = driver.find_element(
                By.XPATH, "//div[@class='col-md-10']//p[1]").text.strip()
            d2 = driver.find_element(
                By.XPATH, "//div[@class='col-md-10']//p[2]").text.strip()
            d3 = driver.find_element(
                By.XPATH, "//div[@class='col-md-10']//p[3]//span").text.strip()
            d4 = driver.find_element(
                By.XPATH, "//div[@class='col-md-10']//p[4]").text.strip()
            print(title, d1, d2, d3, d4)


supplyvan_scraper()
你的代码有一些错误:在 Selenium 4 中,`find_element_by_xpath` / `find_elements_by_xpath` 这类方法已被移除,需要改用 `find_element(By.XPATH, ...)` / `find_elements(By.XPATH, ...)`;另外,翻页按钮只存在于列表页上,在访问详情页之后再点击分页器是找不到该元素的。
更好的解决方案:
在这里,你有一段使用 Playwright 编写的等效代码:
import time
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
    browser = p.webkit.launch(headless=False)
    baseurl = "https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx"
    page = browser.new_page()
    page.goto(baseurl)

    # Harvest the relative profile links from the first nine listing pages,
    # clicking the pager's "next" button between pages.
    productlinks = []
    for _ in range(1, 10):
        for anchor in page.query_selector_all("//div[@class='list-group']//a"):
            href = anchor.get_attribute("href")
            if href.startswith("LawyerFile.aspx"):
                productlinks.append("https://www.ifep.ro/justice/lawyers/" + href)
        page.wait_for_selector("#MainContent_PagerTop_NavNext").click()
        time.sleep(2)  # wait for load the page

    # Visit each collected profile and print its title plus four detail fields.
    detail_selectors = (
        "//div[@class='col-md-10']//p[1]",
        "//div[@class='col-md-10']//p[2]",
        "//div[@class='col-md-10']//p[3]//span",
        "//div[@class='col-md-10']//p[4]",
    )
    for url in productlinks:
        page.goto(url)
        title = page.wait_for_selector('#HeadingContent_lblTitle').text_content()
        details = [
            page.wait_for_selector(sel).text_content().strip()
            for sel in detail_selectors
        ]
        print(title, *details)
    browser.close()
输出:
ILIE Marius-Constantin, Baroul Ilfov Avocat Definitiv, Baroul Ilfov Dată înscriere: 14-03-2011 ACTIV Instanţe cu drept de concluzii: Toate instanţele
DIN GEORGIANA-CLAUDIA, Baroul Bucureşti Avocat Definitiv, Baroul Bucureşti Dată înscriere: 15-05-2008 ACTIV Instanţe cu drept de concluzii: Toate instanţele
MOLDANSCHI ANDREEA-IOANA, Baroul Bucureşti Avocat Stagiar, Baroul Bucureşti Dată înscriere: 30-05-2022 ACTIV Instanţe cu drept de concluzii: Judecătorii
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.