[英]Multiple post requests within same session is not working while web scraping ASP.NET site
[英]Python Scraping asp.net with requests "session based SearchQueue is empty"
尝试使用 Python 的 requests 库,分多个阶段抓取该县网站的搜索功能。 基本思路是先执行搜索,再过滤结果(这部分代码尚未编写),然后进入所选结果的页面。 但是收到了 ASP.NET 错误消息:“The session based SearchQueue is empty.”
下面的代码看起来可能很长,因为我包含了请求中用到的全部表单数据。 目前只是尝试搜索所有者姓名“SMITH”。
基本上,我先发出一个空请求,获取 __VIEWSTATE 等隐藏字段的值,然后发出搜索请求,这一步工作正常。 接着我再从搜索结果页面获取 __VIEWSTATE 等字段,并尝试用我认为是 hdLink 值的内容打开某条搜索结果,不过我并不确定这样做是否正确。 你认为我是不是漏掉了 __EVENTTARGET? 我快被逼疯了,因为不知道该从哪里入手排查。 另外还附上了错误页面的截图。 感谢任何能够提供帮助的人。
测试文件
import CountyFormDataList
import requests
import json
from scrapy import Selector
# Reproduction script from the question: search by owner name, then try to
# open one result. Every HTTP call below is a standalone `requests.post`;
# the only state passed between them is the explicit `cookies` dict and the
# hidden form fields copied from the previous response. The follow-up in this
# post reports that routing the same calls through one `requests.Session`
# made the "session based SearchQueue is empty" error go away.
url = "http://property.franklincountyauditor.com/_web/search/CommonSearch.aspx?mode=OWNER"
# Step 1: POST with no form data, only to read the page's hidden form fields.
r = requests.post(url)
# Each lookup re-parses the response text and returns the element's `value`
# attribute.
scriptManager = Selector(text=r.text).xpath('//*[@id="ScriptManager1_TSM"]/@value').get()
viewState = Selector(text=r.text).xpath('//*[@id="__VIEWSTATE"]/@value').get()
viewStateGenerator = Selector(text=r.text).xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value').get()
eventValidation = Selector(text=r.text).xpath('//*[@id="__EVENTVALIDATION"]/@value').get()
# Step 2: build the search form. The template is the inside of a JSON object
# (no surrounding braces); its "{}" placeholders are filled positionally, so
# the argument order must match the order the placeholders appear in it.
payload = json.loads(
"{" + CountyFormDataList.formDataList["CommonSearchASPX"]["search"]["ownerSearch"].format(
scriptManager,
viewState,
viewStateGenerator,
eventValidation,
"SMITH"
) + "}"
)
cookies = CountyFormDataList.formDataList["CommonSearchASPX"]["cookies"]
headers = CountyFormDataList.formDataList["CommonSearchASPX"]["headers"]
r = requests.post(url, data=payload, cookies=cookies, headers=headers)
# Step 3: the results page carries fresh hidden-field values; read them again
# before posting back.
scriptManager = Selector(text=r.text).xpath('//*[@id="ScriptManager1_TSM"]/@value').get()
viewState = Selector(text=r.text).xpath('//*[@id="__VIEWSTATE"]/@value').get()
viewStateGenerator = Selector(text=r.text).xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value').get()
eventValidation = Selector(text=r.text).xpath('//*[@id="__EVENTVALIDATION"]/@value').get()
# Step 4: build the "open a result" form. The sixth value is the query string
# appended to "../Datalets/Datalet.aspx?" in the template's hdLink field.
payload = json.loads(
"{" + CountyFormDataList.formDataList["CommonSearchASPX"]["result"]["resultJSON"].format(
scriptManager,
viewState,
viewStateGenerator,
eventValidation,
"SMITH",
"sIndex=0&idx=1"
) + "}"
)
r = requests.post(url, data=payload, cookies=cookies, headers=headers)
# Save whatever came back (the result page or the error page) for inspection.
f = open("ohioOutput.html", "w")
f.write(r.text)
f.close()
CountyFormDataList.py:
# Form-body template for the owner-name search POST.
# The text is the inside of a JSON object (callers add the surrounding braces)
# and is filled with str.format(); the five "{}" placeholders are, in order:
# ScriptManager1_TSM, __VIEWSTATE, __VIEWSTATEGENERATOR, __EVENTVALIDATION
# and inpOwner (the owner name to search for).
_OWNER_SEARCH_TEMPLATE = """
"ScriptManager1_TSM" : "{}",
"__EVENTTARGET" : "btSearch",
"__EVENTARGUMENT" : "",
"__VIEWSTATE" : "{}",
"__VIEWSTATEGENERATOR" : "{}",
"__EVENTVALIDATION" : "{}",
"PageNum": 1,
"SortBy" : "PARID",
"SortDir": "asc",
"PageSize": 100,
"hdAction" : "Search",
"hdIndex": 0,
"sIndex": -1,
"hdListType" : "PA",
"hdJur" : "",
"hdSelectAllChecked" : "false",
"inpOwner" : "{}",
"selSortBy" : "PARID",
"selSortDir": "asc",
"selPageSize": 100,
"searchOptions$hdBeta" : "",
"btSearch" : "",
"hdLink" : "",
"AkaCfgResults$hdPins" : "",
"ReportsListParIDs" : "",
"RadWindow_NavigateUrl_ClientState" : "",
"mode" : "OWNER",
"mask" : "",
"param1" : "",
"searchimmediate" : ""
"""

# Form-body template for the result page, captured by clicking a result item
# on the search page. Same conventions as the search template, with a sixth
# "{}" placeholder: the query string appended to the hdLink datalet URL.
_RESULT_LINK_TEMPLATE = """
"ScriptManager1_TSM" : "{}",
"__EVENTTARGET" : "",
"__EVENTARGUMENT" : "",
"__VIEWSTATE" : "{}",
"__VIEWSTATEGENERATOR" : "{}",
"__EVENTVALIDATION" : "{}",
"PageNum":1,
"SortBy" : "TAXID",
"SortDir" : "+asc",
"PageSize":100,
"hdAction" : "Link",
"hdIndex":1,
"sIndex":-1,
"hdListType" : "PA",
"hdJur" : "",
"hdSelectAllChecked" : "false",
"inpOwner" : "{}",
"selSortBy" : "TAXID",
"selSortDir" : "+asc",
"selPageSize":100,
"searchOptions$hdBeta" : "",
"hdLink" : "../Datalets/Datalet.aspx?{}",
"AkaCfgResults$hdPins" : "",
"ReportsListParIDs" : "",
"RadWindow_NavigateUrl_ClientState" : "",
"mode" : "OWNER",
"mask" : "",
"param1" : "",
"searchimmediate" : ""
"""

# Browser-like request headers sent with the form posts.
# NOTE(review): Host, Origin and Referer name auditor.ashtabulacounty.us while
# the example URL below is on property.franklincountyauditor.com, and
# Content-Length is a fixed number -- presumably copied from one captured
# browser request; confirm they are appropriate for every site this is used on.
_BROWSER_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Content-Length": "4348",
    "Content-Type": "application/x-www-form-urlencoded",
    "Host": "auditor.ashtabulacounty.us",
    "Origin": "https://auditor.ashtabulacounty.us",
    "Referer": "https://auditor.ashtabulacounty.us/PT/search/CommonSearch.aspx?mode=OWNER",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
}

# Request data keyed by site family, then by purpose.
formDataList = {
    # Sites built on CommonSearch.aspx, for example:
    # http://property.franklincountyauditor.com/_web/search/CommonSearch.aspx?mode=OWNER
    "CommonSearchASPX": {
        # Cookie that marks the search disclaimer as accepted.
        "cookies": {"DISCLAIMER": "1"},
        "headers": _BROWSER_HEADERS,
        "search": {"ownerSearch": _OWNER_SEARCH_TEMPLATE},
        "result": {"resultJSON": _RESULT_LINK_TEMPLATE},
    },
}
我最终查阅了 Microsoft Docs,随后找到了 Alex Ronquillo 的一篇文章,其中介绍了 Python requests 的会话(Session)对象,正好涵盖了我需要的信息。 我将代码修改为以下内容:
import CountyFormDataList
import requests
import json
from scrapy import Selector
# IDs of the hidden ASP.NET inputs that are copied from each response into the
# next form post, in the order the form templates' leading "{}" placeholders
# consume them.
_HIDDEN_FIELD_IDS = (
    "ScriptManager1_TSM",
    "__VIEWSTATE",
    "__VIEWSTATEGENERATOR",
    "__EVENTVALIDATION",
)


def _hidden_fields(html):
    """Return the `value` of each hidden form input found in *html*.

    The page is parsed once. Values come back in `_HIDDEN_FIELD_IDS` order;
    an input that is not present on the page yields ``None``.
    """
    page = Selector(text=html)
    return [
        page.xpath('//*[@id="{}"]/@value'.format(field_id)).get()
        for field_id in _HIDDEN_FIELD_IDS
    ]


def _build_payload(template, *values):
    """Fill a form template positionally and parse it into a dict.

    The templates hold the inside of a JSON object, so the surrounding
    braces are added here before parsing.
    """
    return json.loads("{" + template.format(*values) + "}")


form_data = CountyFormDataList.formDataList["CommonSearchASPX"]
cookies = form_data["cookies"]
headers = form_data["headers"]
url = "http://property.franklincountyauditor.com/_web/search/CommonSearch.aspx?mode=OWNER"

# All three requests go through one Session so that cookies set by the server
# are sent back on the following requests. Presumably the server keys its
# search queue on such a cookie, which is why separate requests.post() calls
# produced "The session based SearchQueue is empty".
with requests.Session() as session:
    # Step 1: POST with no form data, only to obtain the hidden form fields.
    r = session.post(url)

    # Step 2: run the owner search.
    payload = _build_payload(
        form_data["search"]["ownerSearch"],
        *_hidden_fields(r.text),
        "SMITH"
    )
    r = session.post(url, data=payload, cookies=cookies, headers=headers)

    # Step 3: open a result. The hidden fields must be re-read from the
    # results page; the last value is the query string for the hdLink URL.
    payload = _build_payload(
        form_data["result"]["resultJSON"],
        *_hidden_fields(r.text),
        "SMITH",
        "sIndex=0&idx=1"
    )
    r = session.post(url, data=payload, cookies=cookies, headers=headers)

    # The context manager closes the file even if the write fails; an explicit
    # encoding avoids platform-dependent encode errors on non-ASCII page text.
    with open("ohioOutput.html", "w", encoding="utf-8") as f:
        f.write(r.text)
像这样保留会话的简单调整可以解决问题,并且网页似乎返回了正确的信息。 我不完全理解幕后的复杂性,但我将继续努力。 希望这对处于类似情况的人有所帮助。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.