[英]Python Scrapy 503 Service Unavailable
当我尝试抓取 checkatrade 网站时,不断收到“503 Service Unavailable(服务不可用)”错误。我已经尝试将并发请求数设置为 1、将 download_delay 设置为 10,并设置了 user_agent,但在第一次请求 start_url 时就被拦截了。
下面的代码是我的尝试。我也尝试过 Selenium(相关的 Selenium 代码以注释的形式保留在下面的代码中),但即便如此,仍然无法成功访问 start_url:
import scrapy
from urllib.parse import urljoin
#from scrapy.http import TextResponse
#from selenium import webdriver
from checkatrade.items import CheckatradeItem
class checkatradeSpider(scrapy.Spider):
    """Spider that reads the member table of the Checkatrade "A" directory page.

    For every row of the table whose class is ``directory`` it yields one
    ``CheckatradeItem`` holding the member name, member URL, location,
    membership duration, report count and rating.

    The commented-out lines are an alternative attempt that renders the page
    with Selenium before parsing; they are kept for reference only.
    """

    name = "checkatrade"
    allowed_domains = ["checkatrade.com"]
    start_urls = ["https://www.checkatrade.com/Directory/A"]

    # Selenium variant (disabled): open a Chrome driver when the spider starts.
    # def __init__(self):
    #     try:
    #         self.driver = webdriver.Chrome("C:/Users/andrew/Downloads/chromedriver_win32/chromedriver.exe")
    #     except:
    #         self.driver = webdriver.Chrome("C:/Users/andre/Downloads/chromedriver_win32/chromedriver.exe")

    def parse(self, response):
        """Yield one ``CheckatradeItem`` per row of the directory table.

        Each field is stored exactly as returned by ``.extract()``, i.e. as a
        list of strings, with whitespace collapsed by XPath ``normalize-space``.
        """
        # Selenium variant (disabled): re-fetch the page in the browser and
        # parse the rendered source instead of the Scrapy response.
        # self.driver.get(response.url)
        # response1 = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
        # for sel in response1.xpath('//*[@class="directory"]/tbody/tr'):
        for sel in response.xpath('//*[@class="directory"]/tbody/tr'):
            # normalize-space() on a node-set uses the first matching node.
            member = sel.xpath('normalize-space(.//td/a/text())').extract()
            memberurl = sel.xpath('normalize-space(.//td/a/@href)').extract()
            basedin = sel.xpath('normalize-space(.//td[2]/text())').extract()
            memberfor = sel.xpath('normalize-space(.//td[3]/text())').extract()
            reports = sel.xpath('normalize-space(.//td[4]/text())').extract()
            rating = sel.xpath('normalize-space(.//td[5]/text())').extract()

            item = CheckatradeItem()
            item['member'] = member
            item['memberurl'] = memberurl
            item['basedin'] = basedin
            item['memberfor'] = memberfor
            item['reports'] = reports
            item['rating'] = rating
            yield item

        # Selenium variant (disabled): close the browser and open a fresh one.
        # self.driver.close()
        # try:
        #     self.driver = webdriver.Chrome("C:/Users/andrew/Downloads/chromedriver_win32/chromedriver.exe")
        # except:
        #     self.driver = webdriver.Chrome("C:/Users/andre/Downloads/chromedriver_win32/chromedriver.exe")
不过,使用 Selenium 抓取是可以正常工作的,没有任何问题:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
#options to add as arguments
from selenium.webdriver.chrome.options import Options
# Configure Chrome: start with a maximized window.
option = webdriver.ChromeOptions()
option.add_argument("start-maximized")
# "detach" keeps Chrome open after the script finishes.
option.add_experimental_option("detach", True)
# webdriver_manager installs a chromedriver and returns its path for the Service.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=option)
driver.get('https://www.checkatrade.com/Directory/A')
# Fixed pause before the table rows are queried.
time.sleep(2)
# Print the link text of the first anchor found in each directory row.
for sel in driver.find_elements(By.XPATH, '//*[@class="directory"]/tbody/tr'):
    member = sel.find_element(By.XPATH, './/td/a').text
    print(member)
Output:
A B Smith & Sons (ASmithAndSons)
A - Klass Carpentry and Joinery Ltd (AklassCarpentryAndJoinery)
A Plaster (APlaster)
A - Z Asbestos Management Ltd (AZAsbestosManagement)
A - Z Property Services (AZPropertyServices1024583)
A - Z Repairs & Groundworks (AZRepairsGroundworks)
A & A Building Construction (AandAConstructionLondon)
A & A Brown Joiners Cabinet Makers (AABrownJoinersCabinetMakers)
A & A Builders (Surrey) Ltd (AABuildersLtd)
A & A Building Services (AAndABuildingServices)
A & A Cctv & Security Ltd (AACctvSecurityLtd)
A & A Concrete Repairs (AAConcreteRepairs)
A & A Decorator (AandADecorator)
A & A Domestics (AandADomestics)
A & A Double Glazing (AADoubleGlazing)
A & A Drain Services Ltd (AADrainServicesLtd)
A & A Electrical (AAElectrical978398)
A & A Electrics (AAElectrics)
A & A Fire Protection (AAFireProtection)
A & A Insulation Services Ltd (AandAInsulationServicesLtd)
A & A King Building Contractors (AAKingBuildingContractors)
A & A Lamb Ltd (AALambLtd)
A & A Landscape and Building (AAndALandscapeandBuilding)
A & A Landscaping & Paving (AALandscapingPaving)
……等等(以下省略)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.