[英]Goes to Next page but it does not scrape its elements using Selenium and Scrapy
我正在嘗試使用 Selenium 點擊「下一頁」按鈕來抓取所有頁面。但是,當我轉到下一頁時,URL 並沒有改變。我可以移動到所有頁面,但只能從第一頁抓取項目,不知道如何讓它適用於所有頁面。關於我應該怎麼做,有任何建議嗎?
先感謝您!
程式碼:
class MilieuProperties(scrapy.Spider):
    """Spider that collects property links from milieuproperties.com.

    The listing pages are paginated by a "Next" link that does not change
    the URL, so a headless Chrome instance driven by Selenium is used to
    step through the pages. Each rendered page is parsed with a Scrapy
    selector and every listing found is scheduled for ``parse_property``.
    """

    name = 'milieu_properties'
    start_urls = [
        # FOR SALE
        'https://www.milieuproperties.com/search-results.aspx?paramb=ADVANCE%20SEARCH:%20Province%20(Western%20Cape),%20%20Area%20(Cape%20Town)',
        'https://www.milieuproperties.com/RentalByCategory.aspx'
    ]

    def __init__(self, *args, **kwargs):
        """Create the spider and the headless Chrome driver it paginates with.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so that spider arguments passed by Scrapy keep working.
        """
        super().__init__(*args, **kwargs)
        # headless options
        options = Options()
        options.add_argument('--no-sandbox')
        options.add_argument("--headless")
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        self.driver = webdriver.Chrome('path',options=options)

    def closed(self, reason):
        """Quit the browser when the spider closes so Chrome is not leaked."""
        self.driver.quit()

    def parse(self, response):
        """Walk every result page in Selenium and yield one request per listing.

        The listings are read from ``self.driver.page_source`` on each page.
        Reading them from ``response`` would only ever see the first page,
        because clicking "Next" updates the browser but not the Scrapy
        response object.

        NOTE(review): ``self.driver`` is shared by all start URLs; confirm
        the two ``parse`` generators cannot interleave in your Scrapy setup.
        """
        pager_css = '#ContentPlaceHolder1_lvDataPager1>span'
        next_xpath = '//*[@id="ContentPlaceHolder1_lvDataPager1"]/a[text()="Next" and not(@class)]'

        self.driver.get(response.url)

        # A missing breadcrumb (None) is treated as a "buy" listing page.
        breadcrumb = response.css('span#ContentPlaceHolder1_lblbreadcum::text').get()
        offering = 'rent' if breadcrumb and 'Rental' in breadcrumb else 'buy'

        current_page_number = self.driver.find_element(By.CSS_SELECTOR, pager_css).text
        while True:
            # Parse what the browser is showing right now, not the page-1 response.
            page = scrapy.Selector(text=self.driver.page_source)
            for p in page.xpath('//div[@class="ct-itemProducts ct-u-marginBottom30 ct-hover"]'):
                href = p.css('a::attr(href)').get()
                if not href:
                    # Listing block without a link: nothing to follow.
                    continue
                link = response.urljoin(href)
                yield scrapy.Request(
                    link,
                    callback=self.parse_property,
                    meta={'item': {
                        'url': link,
                        'offering': offering,
                    }},
                )

            try:
                elem = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, next_xpath))
                )
                elem.click()
            except TimeoutException:
                # No clickable "Next" link within 10 s: this was the last page.
                break

            # Wait until the pager shows a different page before parsing again.
            WebDriverWait(self.driver, 10).until(
                lambda driver: driver.find_element(By.CSS_SELECTOR, pager_css).text != current_page_number
            )
            current_page_number = self.driver.find_element(By.CSS_SELECTOR, pager_css).text

    def parse_property(self, response):
        """Handle a single property page; receives the partial item via ``meta``."""
        item = response.meta.get('item')
        ...  # remainder of this callback is elided in the original post
您可以在不使用 Scrapy 的情況下獲取數據。 試試這個代碼:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Collect the detail-page links from every result page by clicking "Next"
# until no clickable "Next" link is left.
PAGER_CSS = '#ContentPlaceHolder1_lvDataPager1>span'
NEXT_XPATH = '//*[@id="ContentPlaceHolder1_lvDataPager1"]/a[text()="Next" and not(@class)]'

driver = webdriver.Chrome()
links = []
url = 'https://www.milieuproperties.com/search-results.aspx?paramb=ADVANCE%20SEARCH:%20Province%20(Western%20Cape),%20%20Area%20(Cape%20Town)'
try:
    driver.get(url)
    current_page_number = driver.find_element(By.CSS_SELECTOR, PAGER_CSS).text
    while True:
        # Harvest the links of the page currently shown before moving on.
        links.extend(
            link.get_attribute('href')
            for link in driver.find_elements(By.CSS_SELECTOR, '.hoverdetail a')
        )
        try:
            elem = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, NEXT_XPATH))
            )
            elem.click()
        except TimeoutException:
            # No clickable "Next" link within 10 s: last page reached.
            break
        # The URL does not change, so wait for the pager label to change instead.
        WebDriverWait(driver, 10).until(
            lambda d: d.find_element(By.CSS_SELECTOR, PAGER_CSS).text != current_page_number
        )
        current_page_number = driver.find_element(By.CSS_SELECTOR, PAGER_CSS).text
finally:
    # Always release the browser, even if a wait or lookup fails.
    driver.quit()
print(links)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.