[英]Scraping an asp.net site does not work when using a function in Selenium in Python
我想抓取一個 ASP.NET 網站,我寫了以下代碼:
from scrapy import Selector
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
class BoursakuwaitSpider(scrapy.Spider):
    """Spider from the question: drives Chrome from inside parse().

    NOTE(review): as quoted, this snippet references `scrapy` and `time`
    without importing them (the second snippet below adds those imports),
    and the Selector built from the Selenium page source is assigned to a
    local and never yielded -- this is the broken version the question
    asks about.
    """
    name = 'boursakuwait'
    # Feed export settings: write scraped items to second.json as JSON.
    custom_settings = {
        'FEED_URI': 'second.json',
        'FEED_FORMAT': 'json',
    }
    start_urls = ['https://casierjudiciaire.justice.gov.ma/verification.aspx']

    def parse(self, no_response):
        # The driver is a local variable, so it cannot be shared with other
        # callbacks (the accepted answer fixes this via `self.browser`).
        browser = webdriver.Chrome(executable_path=ChromeDriverManager().install())
        browser.get('https://casierjudiciaire.justice.gov.ma/verification.aspx')
        time.sleep(10)  # crude fixed wait for the ASP.NET page to render
        # Result is discarded: nothing is yielded back to Scrapy.
        response = Selector(text=browser.page_source)
當我把 Selenium 代碼放在 parse 函數(方法)裡時,代碼不起作用;但如果我把它直接寫在 class 主體中,像這樣:
import time
import scrapy
from scrapy import Selector
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
class BoursakuwaitSpider(scrapy.Spider):
    """Spider from the question: Selenium statements live in the class body.

    NOTE(review): this variant only appears to work because the browser
    statements below execute once as class-definition side effects when the
    module is imported, not inside any Scrapy callback.
    """
    name = 'boursakuwait'
    # Feed export settings: write scraped items to second.json as JSON.
    custom_settings = {
        'FEED_URI': 'second.json',
        'FEED_FORMAT': 'json',
    }
    start_urls = ['https://casierjudiciaire.justice.gov.ma/verification.aspx']
    # Everything below runs at import time -- a side effect of defining
    # the class, outside Scrapy's request/response cycle.
    browser = webdriver.Chrome(executable_path=ChromeDriverManager().install())
    browser.get('https://casierjudiciaire.justice.gov.ma/verification.aspx')
    time.sleep(10)  # crude fixed wait for the page to render
    response = Selector(text=browser.page_source)
代碼就能正確運行。但我想使用函數的寫法(第一段代碼),我不知道問題出在哪裡,請大家幫忙。
首先,創建 def start_requests(self): 方法,然後在這個方法裡完成所有 Selenium 的初始化。
要把 browser/driver 從一個 def 方法傳遞到另一個方法,
必須透過 self 關鍵字把它綁定為實例屬性。以下代碼可以正常工作:
例子:
import time
import scrapy
from scrapy import Selector
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from scrapy.crawler import CrawlerProcess
from selenium.webdriver.chrome.options import Options
class BoursakuwaitSpider(scrapy.Spider):
    """Answer's working spider.

    Selenium is initialised once in start_requests(), and the driver is
    bound to the instance as `self.browser` so that parse() (and any other
    callback) can reuse the same browser session.
    """
    name = 'boursakuwait'
    # Feed export left disabled in the answer's example:
    # custom_settings = {
    #     'FEED_URI': 'second.json',
    #     'FEED_FORMAT': 'json',
    # }

    def start_requests(self):
        options = webdriver.ChromeOptions()
        options.add_argument("start-maximized")
        # "detach" keeps the Chrome window open after the driver script ends.
        options.add_experimental_option("detach", True)
        url = 'https://stackoverflow.com'
        # Selenium 4 style: pass the driver path via a Service object, and
        # bind the driver to the instance so other callbacks can reach it.
        self.browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=options)
        self.browser.get(url)
        time.sleep(5)  # crude fixed wait; demo only
        # Hand control back to Scrapy; parse() runs with the same instance.
        yield scrapy.Request(
            url='https://stackoverflow.com',
            callback=self.parse
        )

    def parse(self,response):
        # Reuse the driver created in start_requests() via self.
        self.browser.get(response.url)
        time.sleep(5)
        # Left commented in the answer: build a Selector from the rendered
        # page source when you actually want to extract data.
        #response = Selector(text=self.browser.page_source)
if __name__ == "__main__":
    # Run the spider as a standalone script instead of `scrapy crawl`.
    process =CrawlerProcess()
    process.crawl(BoursakuwaitSpider)
    process.start()
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.