[英]Scrapy post request does not work, but normal python request does
我正在嘗試從 WizzAir 獲取有關某些航班的一些數據。 我開始使用 scrapy,因為我想將爬蟲擴展到其他公司和域。 問題是,當我向 WizzAir API 發出 API 請求時,由於 akamai 機器人檢測器,我總是得到 403,但是對於正常的 python 請求,整個過程都有效。
我總是使用 scrapy 方法得到 403 和 404。
我在多個地方看過:
但我無法使 scrapy 方法起作用。
正常請求:
import requests
from datetime import datetime, timedelta
import json
import sys
# This is to set the payload to each price type.
def alter_price(price_type, flights):
if price_type == "wdc":
[flight.update({"priceType": "wdc"}) for flight in flights]
else:
[flight.update({"priceType": "regular"}) for flight in flights]
return flights
headers = {
'authority': 'be.wizzair.com',
'accept': 'application/json, text/plain, */*',
'origin': 'https://wizzair.com',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
'content-type': 'application/json;charset=UTF-8',
'sec-fetch-site': 'same-site',
'sec-fetch-mode': 'cors',
'referer': 'https://wizzair.com/en-gb/flights/timetable',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-GB,en;q=0.9,hu-HU;q=0.8,hu;q=0.7,en-US;q=0.6'
}
# If you need data other than Budapest:
data = {"flightList":[{"departureStation":"", # Change this
"arrivalStation":"",
"from":"",
"to":""},
{"departureStation":"",
"arrivalStation":"", # and this
"from":"",
"to":""}],"priceType":"","adultCount":1,"childCount":0,"infantCount":0}
# These were collected by hand from the wizzair website, because I couldn't download them with code.
# The other airport is always Budapest as defined in the payload.
destinations = ["CRL",]
data_list = []
base = datetime.today()
# Here you can set how many periods you want to download (period = 42 days)
for period in range(6):
# Only a maximum of 42 days is supported by wizzair.
data["flightList"][0]["from"] = (base + timedelta(days = period * 42)).strftime("%Y-%m-%d")
data["flightList"][1]["from"] = (base + timedelta(days = period * 42)).strftime("%Y-%m-%d")
data["flightList"][0]["to"] = (base + timedelta(days = (period + 1) * 42)).strftime("%Y-%m-%d")
data["flightList"][1]["to"] = (base + timedelta(days = (period + 1) * 42)).strftime("%Y-%m-%d")
for price_type in ["regular"]:
data["priceType"] = price_type
print(f"Downloading started with the following params for all destinations: {period}, {price_type}")
for destination in destinations:
data["flightList"][0]["arrivalStation"] = destination
data["flightList"][1]["departureStation"] = destination
response = requests.post('https://be.wizzair.com/14.3.0/Api/search/timetable', headers=headers, json={
"flightList": [
{
"departureStation": "GVA",
"arrivalStation": "OTP",
"from": "2022-12-16",
"to": "2023-01-01"
}
],
"priceType": "regular",
"adultCount": 1,
"childCount": 0,
"infantCount": 0
})
if response.status_code == 200:
data_list.append(alter_price(price_type, response.json()["outboundFlights"]))
else:
print("HTTP status: ", response.status_code)
print("Something went wrong with this payload: ", data)
flat_list = [item for sublist in data_list for item in sublist]
print(flat_list)
Scrapy蜘蛛:
import json
import sys
import time
from typing import List, Dict
import requests
from scrapy import Spider, Request, FormRequest
from datetime import date, timedelta
from copy import deepcopy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError
from scrapy.http import HtmlResponse
class WizzairSpider(Spider):
name = 'WizzAir'
allowed_domains = ['wizzair.com']
start_url = 'https://wizzair.com'
# 42 is supported by WizzAir -> 30 just to be safe
MAX_DELTA = timedelta(days=30)
PRICE_TYPES = [{'priceType': 'regular'}]
def __init__(self, *args, **kwargs):
super().__init__(args, **kwargs)
self.sources = ['GVA']
self.destinations = ['OTP']
self.intervals = [2]
def _prepare_request(self, source: str, destination: str, departure_date: date) -> List[Dict]:
def apply_extras(base_template: dict, extras: dict) -> dict:
base_template.update(extras)
return base_template
arrival_date = departure_date + WizzairSpider.MAX_DELTA
base_request = {
"flightList": [
{
"departureStation": source,
"arrivalStation": destination,
"from": departure_date.strftime("%Y-%m-%d"),
"to": arrival_date.strftime("%Y-%m-%d")
}
],
"priceType": "",
"adultCount": 1,
"childCount": 0,
"infantCount": 0
}
return list(map(lambda extra: apply_extras(deepcopy(base_request), extra), WizzairSpider.PRICE_TYPES))
def start_requests(self):
today = date.today()
for time_distance in self.intervals:
departure_date = today + timedelta(days=time_distance)
for source in self.sources:
for destination in self.destinations:
if source == destination:
continue
for payload in self._prepare_request(source, destination, departure_date):
yield Request(
url='https://wizzair.com/14.3.0/Api/search/timetable',
method='POST',
callback=self.parse,
body=json.dumps(payload),
headers={
"authority": "be.wizzair.com",
"accept": "application/json, text/plain, */*",
"origin": "https://wizzair.com",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
"content-type": "application/json;charset=UTF-8",
"sec-fetch-site": "same-site",
"sec-fetch-mode": "cors",
"referer": "https://wizzair.com/en-gb/flights/timetable",
"accept-encoding": "gzip, deflate, br",
"accept-language": "en-GB,en;q=0.9,hu-HU;q=0.8,hu;q=0.7,en-US;q=0.6"
}
)
def errback_httpbin(self, failure):
print("got error")
# log all failures
self.logger.error(repr(failure))
# in case you want to do something special for some errors,
# you may need the failure's type:
if failure.check(HttpError):
# these exceptions come from HttpError spider middleware
# you can get the non-200 response
response = failure.value.response
self.logger.error('HttpError on %s', response.url)
elif failure.check(DNSLookupError):
# this is the original request
request = failure.request
self.logger.error('DNSLookupError on %s', request.url)
elif failure.check(TimeoutError, TCPTimedOutError):
request = failure.request
self.logger.error('TimeoutError on %s', request.url)
def parse(self, response: HtmlResponse, **kwargs):
print('status code', response.status)
print('------', response.body)
嘗試使用scrapy.FormReqeust
並將您的數據傳遞給 formdata 屬性
yield FormRequest(
url='https://wizzair.com/14.3.0/Api/search/timetable',
callback=self.parse,
formdata=payload,
headers={
"authority": "be.wizzair.com",
"accept": "application/json, text/plain, */*",
"origin": "https://wizzair.com",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
"content-type": "application/json;charset=UTF-8",
"sec-fetch-site": "same-site",
"sec-fetch-mode": "cors",
"referer": "https://wizzair.com/en-gb/flights/timetable",
"accept-encoding": "gzip, deflate, br",
"accept-language": "en-GB,en;q=0.9,hu-HU;q=0.8,hu;q=0.7,en-US;q=0.6"
}
)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.