繁体   English   中英

Python Scraping asp.net with requests“基于会话的SearchQueue为空”

[英]Python Scraping asp.net with requests "session based SearchQueue is empty"

我正在尝试使用 Python 的 requests 库,分多个步骤抓取该县网站的搜索结果。 基本思路是先执行搜索,再过滤结果(这部分代码尚未编写),然后进入某条结果的详情页面。 但我得到了 ASP.NET 错误消息:The session based SearchQueue is empty. 下面的代码看起来可能很长,那是因为我把请求中用到的所有表单数据都包含了进来。 这里只是尝试搜索姓名“SMITH”。

基本上,我先发出一个空请求,获取__VIEWSTATE及其他隐藏字段的值,然后发出搜索请求,这一步工作正常。 接着我再次从搜索结果页面获取__VIEWSTATE等字段,并尝试用我认为对应于搜索结果的hdLink值发起请求,不过我对此并不确定。 你认为我可能缺少了某个__EVENTTARGET吗? 我不知道该从哪里入手排查,快被逼疯了。 另附错误页面的截图。 感谢任何能提供帮助的人。

测试文件

import CountyFormDataList

import requests
import json

from scrapy import Selector

# Reproduction script for the "session based SearchQueue is empty" error.
#
# Flow: (1) fetch the search page to read the ASP.NET hidden form fields,
# (2) post an owner search for "SMITH", (3) post a "Link" action to open the
# first result.
#
# NOTE: every request below goes through the module-level requests.post(),
# so no cookies returned by the server are carried over to the next call;
# only the explicit DISCLAIMER cookie is sent. Presumably the server keeps
# the search queue in per-session state, which would explain why step (3)
# reports an empty queue — reusing one requests.Session resolves it.
url = "http://property.franklincountyauditor.com/_web/search/CommonSearch.aspx?mode=OWNER"

# Step 1: empty POST, used only to obtain the initial hidden field values.
r = requests.post(url)

# Hidden inputs that must be echoed back on the next postback.
scriptManager = Selector(text=r.text).xpath('//*[@id="ScriptManager1_TSM"]/@value').get()
viewState = Selector(text=r.text).xpath('//*[@id="__VIEWSTATE"]/@value').get()
viewStateGenerator = Selector(text=r.text).xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value').get()
eventValidation = Selector(text=r.text).xpath('//*[@id="__EVENTVALIDATION"]/@value').get()

# The template is a JSON object body without braces; fill its positional
# placeholders, wrap it in "{...}" and parse it into a dict of form fields.
payload = json.loads(
    "{" + CountyFormDataList.formDataList["CommonSearchASPX"]["search"]["ownerSearch"].format(
        scriptManager,
        viewState,
        viewStateGenerator,
        eventValidation,
        "SMITH"
    ) + "}"
)
cookies = CountyFormDataList.formDataList["CommonSearchASPX"]["cookies"]
headers = CountyFormDataList.formDataList["CommonSearchASPX"]["headers"]

# Step 2: submit the owner search.
r = requests.post(url, data=payload, cookies=cookies, headers=headers)

# Re-read the hidden fields from the search-results page.
scriptManager = Selector(text=r.text).xpath('//*[@id="ScriptManager1_TSM"]/@value').get()
viewState = Selector(text=r.text).xpath('//*[@id="__VIEWSTATE"]/@value').get()
viewStateGenerator = Selector(text=r.text).xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value').get()
eventValidation = Selector(text=r.text).xpath('//*[@id="__EVENTVALIDATION"]/@value').get()

# The last argument becomes the query string of the hdLink field
# ("../Datalets/Datalet.aspx?sIndex=0&idx=1").
payload = json.loads(
    "{" + CountyFormDataList.formDataList["CommonSearchASPX"]["result"]["resultJSON"].format(
        scriptManager,
        viewState,
        viewStateGenerator,
        eventValidation,
        "SMITH",
        "sIndex=0&idx=1"
    ) + "}"
)

# Step 3: request the result page; this is the response that contains the error.
r = requests.post(url, data=payload, cookies=cookies, headers=headers)

# Save the final response body for inspection.
f = open("ohioOutput.html", "w")
f.write(r.text)
f.close()

CountyFormDataList.py:

# Cookies, headers and form payload templates for county sites that expose a
# "CommonSearch.aspx" search page.
#
# The two payload templates are JSON object *bodies* (no surrounding braces)
# with positional ``{}`` placeholders. Callers fill them with ``str.format``,
# wrap the result in "{" ... "}" and parse it with ``json.loads``.
formDataList = {
    "CommonSearchASPX" : { #from commonsearch aspx websites, example: http://property.franklincountyauditor.com/_web/search/CommonSearch.aspx?mode=OWNER
        "cookies" : { #cookies for search to accept disclaimer
            'DISCLAIMER': '1'
        },
        # Browser-like request headers.
        #
        # "Content-Length" and "Host" are intentionally absent: they depend on
        # the request body and the target URL, and the HTTP client derives
        # them per request. The previously hard-coded values ("4348" and
        # "auditor.ashtabulacounty.us") did not match requests sent to other
        # county hosts.
        #
        # "br" is not advertised in Accept-Encoding because requests can only
        # decode Brotli responses when an optional Brotli package is installed.
        #
        # NOTE(review): Origin and Referer still point at the Ashtabula County
        # site; confirm whether they should follow the site being scraped.
        "headers" : {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "en-US,en;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Content-Type": "application/x-www-form-urlencoded",
            "Origin": "https://auditor.ashtabulacounty.us",
            "Referer": "https://auditor.ashtabulacounty.us/PT/search/CommonSearch.aspx?mode=OWNER",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "same-origin",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
        },
        "search" : {
            # Owner-name search postback. Placeholders, in order:
            #   1. ScriptManager1_TSM   2. __VIEWSTATE
            #   3. __VIEWSTATEGENERATOR 4. __EVENTVALIDATION
            #   5. inpOwner (the owner name to search for)
            "ownerSearch" : """
                "ScriptManager1_TSM" : "{}",
                "__EVENTTARGET" : "btSearch",
                "__EVENTARGUMENT" : "",
                "__VIEWSTATE" : "{}",
                "__VIEWSTATEGENERATOR" : "{}",
                "__EVENTVALIDATION" : "{}",
                "PageNum": 1,
                "SortBy" : "PARID",
                "SortDir":  "asc",
                "PageSize": 100,
                "hdAction" : "Search",
                "hdIndex": 0,
                "sIndex": -1,
                "hdListType" : "PA",
                "hdJur" : "",
                "hdSelectAllChecked" : "false",
                "inpOwner" : "{}",
                "selSortBy" : "PARID",
                "selSortDir":  "asc",
                "selPageSize": 100,
                "searchOptions$hdBeta" : "",
                "btSearch" : "",
                "hdLink" : "",
                "AkaCfgResults$hdPins" : "",
                "ReportsListParIDs" : "",
                "RadWindow_NavigateUrl_ClientState" : "",
                "mode" : "OWNER",
                "mask" : "",
                "param1" : "",
                "searchimmediate" : ""
            """
        },
        "result" : { #result page, found by clicking a result item on search page
            # "Link" postback that opens one search result. Placeholders, in
            # order: the same five as "ownerSearch", followed by
            #   6. the query string appended to "../Datalets/Datalet.aspx?"
            #      in hdLink.
            #
            # NOTE(review): "+asc" looks like a URL-encoded " asc" copied from
            # a browser capture; the "+" will be encoded again when the form
            # is sent — confirm the server accepts this value.
            "resultJSON" : """
                "ScriptManager1_TSM" : "{}",
                "__EVENTTARGET" : "",
                "__EVENTARGUMENT" : "",
                "__VIEWSTATE" : "{}",
                "__VIEWSTATEGENERATOR" : "{}",
                "__EVENTVALIDATION" : "{}",
                "PageNum":1,
                "SortBy" : "TAXID",
                "SortDir" : "+asc",
                "PageSize":100,
                "hdAction" : "Link",
                "hdIndex":1,
                "sIndex":-1,
                "hdListType" : "PA",
                "hdJur" : "",
                "hdSelectAllChecked" : "false",
                "inpOwner" : "{}",
                "selSortBy" : "TAXID",
                "selSortDir" : "+asc",
                "selPageSize":100,
                "searchOptions$hdBeta" : "",
                "hdLink" : "../Datalets/Datalet.aspx?{}",
                "AkaCfgResults$hdPins" : "",
                "ReportsListParIDs" : "",
                "RadWindow_NavigateUrl_ClientState" : "",
                "mode" : "OWNER",
                "mask" : "",
                "param1" : "",
                "searchimmediate" : ""
            """
        }
    }
}

错误页面结果

我最终查阅了Microsoft Docs ,随后又找到了 Alex Ronquillo 的一篇文章,其中介绍了 Python requests 的会话(Session)对象,这正是我需要的信息。 我将代码修改为以下内容:

import CountyFormDataList

import requests
import json

from scrapy import Selector

def _hidden_fields(html):
    """Return the ASP.NET hidden form values found in *html*.

    The page is parsed once and the ``value`` attribute of each hidden input
    is returned as a tuple in the order
    (ScriptManager1_TSM, __VIEWSTATE, __VIEWSTATEGENERATOR, __EVENTVALIDATION),
    which matches the leading placeholders of the payload templates.
    An element that is not present yields ``None`` in its position.
    """
    page = Selector(text=html)
    return tuple(
        page.xpath('//*[@id="{}"]/@value'.format(field_id)).get()
        for field_id in (
            "ScriptManager1_TSM",
            "__VIEWSTATE",
            "__VIEWSTATEGENERATOR",
            "__EVENTVALIDATION",
        )
    )


def _build_payload(template, *values):
    """Fill a brace-less JSON template with *values* and parse it to a dict."""
    return json.loads("{" + template.format(*values) + "}")


url = "http://property.franklincountyauditor.com/_web/search/CommonSearch.aspx?mode=OWNER"

common = CountyFormDataList.formDataList["CommonSearchASPX"]
cookies = common["cookies"]
headers = common["headers"]

# All three requests share one Session so that cookies set by the server are
# sent back on the following requests; issuing them as independent requests
# is what produced "The session based SearchQueue is empty".
with requests.Session() as session:
    # Step 1: empty POST, used only to obtain the initial hidden field values.
    r = session.post(url)

    # Step 2: submit the owner search for "SMITH".
    payload = _build_payload(
        common["search"]["ownerSearch"],
        *_hidden_fields(r.text),
        "SMITH",
    )
    r = session.post(url, data=payload, cookies=cookies, headers=headers)

    # Step 3: follow the first search result, using the hidden fields of the
    # search-results page; the last value is the hdLink query string.
    payload = _build_payload(
        common["result"]["resultJSON"],
        *_hidden_fields(r.text),
        "SMITH",
        "sIndex=0&idx=1",
    )
    r = session.post(url, data=payload, cookies=cookies, headers=headers)

# Save the final response body. The context manager closes the file even if
# the write fails, and an explicit encoding avoids depending on the platform
# default when the page contains non-ASCII characters.
with open("ohioOutput.html", "w", encoding="utf-8") as f:
    f.write(r.text)

像这样简单地调整为在多次请求之间保持同一个会话,就解决了问题,网页看起来返回了正确的信息。 我还不完全理解其背后的细节,但会继续研究。 希望这对处于类似情况的人有所帮助。

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM