Can't scrape names from next pages using requests
I'm trying to scrape names that are spread across multiple pages of a website using a Python script. With my current attempt I can get the names from the landing page, but I can't figure out how to fetch the names from the subsequent pages using requests and BeautifulSoup.

My attempt so far:
import requests
from bs4 import BeautifulSoup

url = "https://proximity.niceic.com/mainform.aspx?PostCode=YO95"

with requests.Session() as s:
    r = s.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    for elem in soup.select("table#gvContractors tr:has([id*='_lblName'])"):
        name = elem.select_one("span[id*='_lblName']").get_text(strip=True)
        print(name)
I tried to modify my script to grab the content of the second page only, to make sure it works when the next-page button is involved, but unfortunately it still fetches the data from the first page:
import requests
from bs4 import BeautifulSoup

url = "https://proximity.niceic.com/mainform.aspx?PostCode=YO95"

with requests.Session() as s:
    r = s.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    payload = {i['name']: i.get('value', '') for i in soup.select('input[name]')}
    payload['__EVENTARGUMENT'] = 'Page$Next'
    payload.pop('btnClose')
    payload.pop('btnMapClose')
    res = s.post(url, data=payload, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'https://proximity.niceic.com/mainform.aspx?PostCode=YO95',
    })
    sauce = BeautifulSoup(res.text, "lxml")
    for elem in sauce.select("table#gvContractors tr:has([id*='_lblName'])"):
        name = elem.select_one("span[id*='_lblName']").get_text(strip=True)
        print(name)
Navigating to the next page is performed via a POST request with a __VIEWSTATE cursor.

How to do it with requests:

Make a GET request to the first page;
Parse the required data and the __VIEWSTATE cursor;
Prepare a POST request for the next page using the received cursor;
Run it, then parse all the data and the new cursor for the next page.

I won't provide any code, because it amounts to writing down almost the whole scraper.
==== ADDED ====
You were almost there, but you missed two important things.

It is necessary to send headers with the very first GET request. If no headers are sent, we get corrupted tokens (easy to detect visually: they don't end with ==).

We need to add __ASYNCPOST to the payload we send. (A funny thing: it is not the boolean True but the string 'true'.)
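As a quick illustration of the first point, here is a hypothetical sanity check (my own helper, not part of the original answer) that flags tokens whose base64 padding is missing, which is the corruption symptom described above:

import base64

def token_looks_valid(token):
    """Rough check: the corrupted tokens observed on this site lack their
    trailing base64 '=' padding, so a strict decode of them fails."""
    try:
        base64.b64decode(token)
        return True
    except Exception:
        return False

# e.g. token_looks_valid(payload['__VIEWSTATE'])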
Here's the code. I dropped bs4 and used lxml instead (I don't like bs4: it is slow). We know exactly which fields we need to send, so let's parse just a few inputs.
import re
import requests
from lxml import etree


def get_nextpage_tokens(response_body):
    """ Parse tokens from the XMLHttpRequest response and build the payload for the request to the next page """
    try:
        payload = dict()
        payload['ToolkitScriptManager1'] = 'UpdatePanel1|gvContractors'
        payload['__EVENTTARGET'] = 'gvContractors'
        payload['__EVENTARGUMENT'] = 'Page$Next'
        payload['__VIEWSTATEENCRYPTED'] = ''
        payload['__VIEWSTATE'] = re.search(r'__VIEWSTATE\|([^\|]+)', response_body).group(1)
        payload['__VIEWSTATEGENERATOR'] = re.search(r'__VIEWSTATEGENERATOR\|([^\|]+)', response_body).group(1)
        payload['__EVENTVALIDATION'] = re.search(r'__EVENTVALIDATION\|([^\|]+)', response_body).group(1)
        payload['__ASYNCPOST'] = 'true'
        return payload
    except AttributeError:
        # re.search returned None: no tokens in the response
        return None


if __name__ == '__main__':
    url = "https://proximity.niceic.com/mainform.aspx?PostCode=YO95"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'https://proximity.niceic.com/mainform.aspx?PostCode=YO95',
    }

    with requests.Session() as s:
        page_num = 1
        r = s.get(url, headers=headers)
        parser = etree.HTMLParser()
        tree = etree.fromstring(r.text, parser)

        # Creating payload from the hidden inputs of the landing page
        payload = dict()
        payload['ToolkitScriptManager1'] = 'UpdatePanel1|gvContractors'
        payload['__EVENTTARGET'] = 'gvContractors'
        payload['__EVENTARGUMENT'] = 'Page$Next'
        payload['__VIEWSTATE'] = tree.xpath("//input[@name='__VIEWSTATE']/@value")[0]
        payload['__VIEWSTATEENCRYPTED'] = ''
        payload['__VIEWSTATEGENERATOR'] = tree.xpath("//input[@name='__VIEWSTATEGENERATOR']/@value")[0]
        payload['__EVENTVALIDATION'] = tree.xpath("//input[@name='__EVENTVALIDATION']/@value")[0]
        payload['__ASYNCPOST'] = 'true'
        headers['X-Requested-With'] = 'XMLHttpRequest'

        while True:
            page_num += 1
            res = s.post(url, data=payload, headers=headers)
            print(f'page {page_num} data: {res.text}')  # FIXME: Parse data
            payload = get_nextpage_tokens(res.text)  # Creating payload for the next page
            if not payload:
                # Break if we got no tokens - maybe it was the last page (it must be checked)
                break
Important

The response is not well-formed HTML, so you will have to deal with it: cut out the table or something like that. Good luck!
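To fill in the FIXME above, here is a minimal sketch of one way to deal with it (my own approach, not from the original answer): the ASP.NET AJAX response is a pipe-delimited "delta", so you can cut the gvContractors table out of it with a regex and hand just that fragment to lxml's recovering HTML parser:

import re
from lxml import etree

def parse_names(response_body):
    """Extract contractor names from the pipe-delimited AJAX delta response.
    Assumes the updated gvContractors table is embedded somewhere in the body."""
    match = re.search(r'<table[^>]*gvContractors.*</table>', response_body, re.S)
    if not match:
        return []
    # lxml's HTML parser recovers even if the fragment we cut out is unbalanced
    tree = etree.fromstring(match.group(0), etree.HTMLParser())
    return [span.text.strip()
            for span in tree.xpath("//span[contains(@id, '_lblName')]")
            if span.text]

# e.g. inside the while loop above: print(parse_names(res.text))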