簡體   English   中英

Scrapy post 請求不起作用,但正常的 python 請求可以

[英]Scrapy post request does not work, but normal python request does

我正在嘗試從 WizzAir 獲取有關某些航班的一些數據。 我開始使用 scrapy,因為我想將爬蟲擴展到其他公司和域。 問題是,當我向 WizzAir API 發出 API 請求時,由於 akamai 機器人檢測器,我總是得到 403,但是對於正常的 python 請求,整個過程都有效。

我總是使用 scrapy 方法得到 403 和 404。

我在多個地方看過:

但我無法使 scrapy 方法起作用。

正常的請求方法有效,但 scrapy方法無效。

正常請求:

import requests
from datetime import datetime, timedelta
import json
import sys

# This is to set the payload to each price type.
def alter_price(price_type, flights):
    """Tag every flight dict with the price type used for the request.

    Any ``price_type`` other than ``"wdc"`` is normalized to ``"regular"``.
    Mutates the dicts in *flights* in place and returns the same list so the
    call can be used inline when appending to ``data_list``.
    """
    # The original used list comprehensions purely for their side effect,
    # allocating throwaway lists; a plain loop states the intent directly.
    label = "wdc" if price_type == "wdc" else "regular"
    for flight in flights:
        flight["priceType"] = label
    return flights

# Browser-like request headers copied from the WizzAir web client.  These are
# what lets the plain `requests` version through the Akamai bot detection on
# be.wizzair.com; the order and values mirror a real Chrome request.
headers = {
    'authority': 'be.wizzair.com',
    'accept': 'application/json, text/plain, */*',
    'origin': 'https://wizzair.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'content-type': 'application/json;charset=UTF-8',
    'sec-fetch-site': 'same-site',
    'sec-fetch-mode': 'cors',
    'referer': 'https://wizzair.com/en-gb/flights/timetable',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-GB,en;q=0.9,hu-HU;q=0.8,hu;q=0.7,en-US;q=0.6'
}

# Payload template for the timetable API.  The two flightList entries are the
# outbound and the return leg; the loop below fills in the empty station and
# date fields before each request.
# If you need data other than Budapest:
data = {"flightList":[{"departureStation":"", # Change this
                       "arrivalStation":"",
                       "from":"",
                       "to":""},
                      {"departureStation":"",
                       "arrivalStation":"", # and this
                       "from":"",
                       "to":""}],"priceType":"","adultCount":1,"childCount":0,"infantCount":0}

# These were collected by hand from the wizzair website, because I couldn't download them with code.
# The other airport is always Budapest as defined in the payload.
destinations = ["CRL",]

data_list = []
base = datetime.today()
# Here you can set how many periods you want to download (period = 42 days).
# The WizzAir API caps a single timetable query at 42 days, so the full range
# is split into consecutive 42-day windows.
for period in range(6):
    # Only a maximum of 42 days is supported by wizzair.
    window_start = (base + timedelta(days=period * 42)).strftime("%Y-%m-%d")
    window_end = (base + timedelta(days=(period + 1) * 42)).strftime("%Y-%m-%d")
    data["flightList"][0]["from"] = window_start
    data["flightList"][1]["from"] = window_start
    data["flightList"][0]["to"] = window_end
    data["flightList"][1]["to"] = window_end

    for price_type in ["regular"]:
        data["priceType"] = price_type
        print(f"Downloading started with the following params for all destinations: {period}, {price_type}")
        for destination in destinations:
            data["flightList"][0]["arrivalStation"] = destination
            data["flightList"][1]["departureStation"] = destination

            # BUG FIX: the original posted a hard-coded GVA->OTP payload here,
            # so the period/price_type/destination loops had no effect and the
            # error message below printed a payload that was never sent.
            # Send the payload that was just built instead.
            response = requests.post(
                'https://be.wizzair.com/14.3.0/Api/search/timetable',
                headers=headers,
                json=data,
            )

            if response.status_code == 200:
                data_list.append(alter_price(price_type, response.json()["outboundFlights"]))
            else:
                print("HTTP status: ", response.status_code)
                print("Something went wrong with this payload: ", data)

# Flatten the per-request flight lists into one list of flight dicts.
flat_list = [item for sublist in data_list for item in sublist]
print(flat_list)

Scrapy蜘蛛:

import json
import sys
import time
from typing import List, Dict

import requests
from scrapy import Spider, Request, FormRequest
from datetime import date, timedelta
from copy import deepcopy

from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError
from scrapy.http import HtmlResponse


class WizzairSpider(Spider):
    """Spider that POSTs JSON timetable queries to the WizzAir search API.

    One request is generated per (interval, source, destination, price type)
    combination; responses are handled in :meth:`parse` and transport/HTTP
    failures in :meth:`errback_httpbin`.
    """

    name = 'WizzAir'
    allowed_domains = ['wizzair.com']  # also covers the be.wizzair.com subdomain
    start_url = 'https://wizzair.com'

    # 42 is supported by WizzAir -> 30 just to be safe
    MAX_DELTA = timedelta(days=30)

    PRICE_TYPES = [{'priceType': 'regular'}]

    def __init__(self, *args, **kwargs):
        # BUG FIX: was `super().__init__(args, **kwargs)`, which passed the
        # whole args tuple as a single positional argument to Spider.__init__.
        super().__init__(*args, **kwargs)
        self.sources = ['GVA']
        self.destinations = ['OTP']
        # Days from today at which each search window starts.
        self.intervals = [2]

    def _prepare_request(self, source: str, destination: str, departure_date: date) -> List[Dict]:
        """Build one JSON payload per entry in PRICE_TYPES for this route."""
        def apply_extras(base_template: dict, extras: dict) -> dict:
            base_template.update(extras)
            return base_template

        arrival_date = departure_date + WizzairSpider.MAX_DELTA
        base_request = {
            "flightList": [
                {
                    "departureStation": source,
                    "arrivalStation": destination,
                    "from": departure_date.strftime("%Y-%m-%d"),
                    "to": arrival_date.strftime("%Y-%m-%d")
                }
            ],
            "priceType": "",
            "adultCount": 1,
            "childCount": 0,
            "infantCount": 0
        }

        # Deep-copy the template so each price type gets its own payload.
        return list(map(lambda extra: apply_extras(deepcopy(base_request), extra), WizzairSpider.PRICE_TYPES))

    def start_requests(self):
        today = date.today()

        for time_distance in self.intervals:
            departure_date = today + timedelta(days=time_distance)

            for source in self.sources:
                for destination in self.destinations:
                    if source == destination:
                        continue

                    for payload in self._prepare_request(source, destination, departure_date):
                        yield Request(
                            # BUG FIX: the API lives on be.wizzair.com (the
                            # host the working `requests` script uses); posting
                            # to wizzair.com returned 404.
                            url='https://be.wizzair.com/14.3.0/Api/search/timetable',
                            method='POST',
                            callback=self.parse,
                            # BUG FIX: errback_httpbin was defined but never
                            # attached, so failures were silently dropped.
                            errback=self.errback_httpbin,
                            body=json.dumps(payload),
                            headers={
                                "authority": "be.wizzair.com",
                                "accept": "application/json, text/plain, */*",
                                "origin": "https://wizzair.com",
                                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
                                "content-type": "application/json;charset=UTF-8",
                                "sec-fetch-site": "same-site",
                                "sec-fetch-mode": "cors",
                                "referer": "https://wizzair.com/en-gb/flights/timetable",
                                "accept-encoding": "gzip, deflate, br",
                                "accept-language": "en-GB,en;q=0.9,hu-HU;q=0.8,hu;q=0.7,en-US;q=0.6"
                            }
                        )

    def errback_httpbin(self, failure):
        """Log any request failure, distinguishing the common failure types."""
        print("got error")
        # log all failures
        self.logger.error(repr(failure))

        # in case you want to do something special for some errors,
        # you may need the failure's type:

        if failure.check(HttpError):
            # these exceptions come from HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)

        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError on %s', request.url)

        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)

    def parse(self, response: HtmlResponse, **kwargs):
        """Dump status and raw body; replace with real extraction as needed."""
        print('status code', response.status)
        print('------', response.body)

嘗試使用 scrapy.FormRequest 並將您的數據傳遞給 formdata 屬性

# NOTE(review): answer fragment, not standalone code — `yield` is only valid
# inside start_requests/parse.  Also note FormRequest with `formdata` sends an
# urlencoded form body, which conflicts with the JSON content-type header
# below — TODO confirm against the API before using this approach.
yield FormRequest(
url='https://wizzair.com/14.3.0/Api/search/timetable',
callback=self.parse,
formdata=payload,
headers={
    "authority": "be.wizzair.com",
    "accept": "application/json, text/plain, */*",
    "origin": "https://wizzair.com",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "content-type": "application/json;charset=UTF-8",
    "sec-fetch-site": "same-site",
    "sec-fetch-mode": "cors",
    "referer": "https://wizzair.com/en-gb/flights/timetable",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-GB,en;q=0.9,hu-HU;q=0.8,hu;q=0.7,en-US;q=0.6"
}

)

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM