[英]How to scrape data and move to the next page using Python and BeautifulSoup
我想使用 python 3.5 和 BeautifulSoup 刮取數據https://www.arduinothai.com/category/2/arduino-compatible-board 。 我可以成功抓取第一頁上的數據,但我無法從其他頁面抓取數據。 這是我的代碼
import re
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
from requests import get

# Scrape every page of the arduinothai.com "arduino-compatible-board"
# category and collect per-product fields into a DataFrame.
#
# NOTE(review): the original paste had lost all indentation, opened an
# unterminated triple-quoted string ("''' Find All page in website"),
# and used several names (ConvertListToStr, the accumulator lists,
# NameOfProduct / PriceOfProduct / ...) that were never defined in the
# visible code.  This reconstruction restores a runnable structure and
# marks the lost scraping steps with TODOs.

URL = 'https://www.arduinothai.com/category/2/arduino-compatible-board'


def ConvertListToStr(items):
    """Join a list of regex matches into one comma-separated string.

    The original snippet called this helper without defining it; a plain
    join is the behavior its call sites imply.
    """
    return ','.join(items)


# Accumulators (used but never initialised in the original snippet).
ProductIDAll = []
StockOfProduct = []
ListOfProduct = []
Productname = []
Productprice = []
OldProductPrice = []
LinkProduct = []
CategoryProduct = []

# --- Find how many pages the category has -------------------------------
Request = requests.get(URL)
soups = BeautifulSoup(Request.text, 'lxml')
Count_Next_Pages = soups.find_all('span', 'tsk-all')
TotalProduct = int(Count_Next_Pages[1].text)
TotalProductPerPage = 40
# Ceiling division: the original round() undercounts whenever the last
# page is less than half full (e.g. 81 products -> round(2.025) == 2,
# silently dropping page 3).
TotalPages = -(-TotalProduct // TotalProductPerPage)

# Compile the product-ID pattern once instead of per product.
ID_PATTERN = re.compile(r'[A-Z]{2}\d{5}|\d{5}|....\d{5}')

# --- Walk every page via the ?tskp= query parameter ---------------------
for page in range(1, TotalPages + 1):
    page_url = 'https://www.arduinothai.com/category/2/arduino-compatible-board?tskp=' + str(page)
    Request_Data = requests.get(page_url)
    Soups_Data = BeautifulSoup(Request_Data.text, 'lxml')
    AllProduct = Soups_Data.find_all('div', class_='productDetail')
    for x in AllProduct:
        # The product id is embedded as JSON in the anchor's "gaeepd"
        # attribute.
        AllProductDeatil = x.find('a').get("gaeepd")
        IDProductLink = json.loads(AllProductDeatil)["id"]

        # Scrape ProductID
        ProductID = x.find('span', 'code').get_text(strip=True)
        ProDuctIDResult = ID_PATTERN.findall(ProductID)
        ProductIDStr = ConvertListToStr(ProDuctIDResult)
        ProductIDAll.append(ProductIDStr)

        # Scrape stock from the product's own page.
        URL_Prefix = requests.get('https://www.arduinothai.com/product/' + str(IDProductLink))
        SoupStock = BeautifulSoup(URL_Prefix.text, 'lxml')
        ChkStock = SoupStock.find('span', class_='num').text
        StockOfProduct.append(ChkStock)

        # TODO(review): the original code referenced NameOfProduct,
        # PriceOfProduct, OldProPricesStr, Link_URL and
        # ProductCategory_jsonData here without ever assigning them --
        # those scraping steps were lost from the paste.  Restore them
        # before relying on this filter.
        ProductCategory_jsonData = None  # placeholder -- see TODO above
        if ProductCategory_jsonData in ('Single Set', 'Triple Set', 'STM32'):
            ListOfProduct.append((ProductIDStr, NameOfProduct, PriceOfProduct,
                                  OldProPricesStr, ChkStock, Link_URL,
                                  ProductCategory_jsonData))

# Column-wise frame from the accumulators (empty columns until the lost
# name/price/link/category scraping steps above are restored).
data_df = pd.DataFrame({
    'ProductID': ProductIDAll,
    'ProdcutName': Productname,
    'Productprice': Productprice,
    'OldProductPrice': OldProductPrice,
    'StockOfProduct': StockOfProduct,
    'Link': LinkProduct,
    'Category': CategoryProduct,
})

df = pd.DataFrame(ListOfProduct,
                  columns=['ProductID', 'ProductName', 'Discount', 'Price',
                           'Stock', 'Link', 'TypeOfProduct'])
pd.set_option('display.max_rows', df.shape[0] + 1)
df
(答)只需對兩個頁面網址運行同樣的代碼即可:
import re
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
from requests import get

# Fetch the first two category pages via the ?tskp= query parameter.
# NOTE(review): in the original paste the loop body had lost its
# indentation, so only the URL assignment would have run per iteration;
# the requests/BeautifulSoup lines must sit inside the ``for``.
for page in [1, 2]:
    URL = 'https://www.arduinothai.com/category/2/arduino-compatible-board?tskp=' + str(page)
    Request = requests.get(URL)
    soups = BeautifulSoup(Request.text, 'lxml')
    # your scrape here
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.