[英]Getting table data from web page using python beautifulsoup
我有一個顯示一些產品的網頁，我需要進入這些產品中的每一個，並獲取名為「技術細節」的選項卡下的表數據，再將這些數據放入 Excel 中的一個大表中。我寫了以下代碼，但得到的卻是一個空白的 Excel 文件。它哪里出錯了？
import requests
import xlsxwriter
from bs4 import BeautifulSoup
def cpap_spider(url):
    """Scrape the product listing at *url*.

    Writes each product title into column 0 of the module-level
    ``worksheet`` and follows the product link via ``each_item`` to
    collect its spec-table rows.  Uses the module-level ``row_i`` as
    the current output row.
    """
    global row_i
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    for cell in soup.findAll('td', {'class': 'name name2_padd'}):
        # BUG FIX: the href attribute lives on the <a> nested inside the
        # <td>; calling .get('href') on the <td> itself always returned
        # None, so each_item() fetched nothing and the sheet stayed blank.
        anchor = cell.find('a')
        if anchor is None:
            continue
        href = anchor.get('href')
        title = anchor.get_text(strip=True)
        worksheet.write(row_i, 0, title)
        each_item(href)
        print(href)
def each_item(item_url):
    """Fetch one product page and copy its ``width_table`` spec table
    into the module-level ``worksheet``.

    In each table row the first <td> is the attribute label and the
    remaining cells are values.  Labels are registered in the
    module-level ``cols_names`` list, which maps a label to its
    spreadsheet column; new labels are also written to header row 0.
    Advances the module-level ``row_i`` output-row counter.
    """
    global cols_names, row_i
    response = requests.get(item_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'class': 'width_table'})
    if table is None:
        # Product page has no technical-details table; nothing to record.
        return
    for table_row in table.find_all('tr'):
        cells = table_row.select('td')
        # FIX: was an unbound local (UnboundLocalError) whenever the
        # first cell of a row was empty; track it explicitly per row.
        col_index = None
        for position, cell in enumerate(cells):
            text = cell.text.strip()
            if not text:
                continue
            # Labels come as "Weight:" — drop the trailing colon.
            if text.endswith(':'):
                text = text[:-1]
            if position == 0:
                # First cell is the attribute name; register it as a
                # new spreadsheet column if we have not seen it before.
                # (Replaces the original bare `except:` lookup.)
                if text in cols_names:
                    col_index = cols_names.index(text)
                else:
                    cols_names.append(text)
                    col_index = len(cols_names) - 1
                    worksheet.write(0, col_index + 1, text)
                continue
            if col_index is not None:
                worksheet.write(row_i, col_index + 1, text)
    row_i += 1
# Module-level state shared (as globals) with cpap_spider / each_item.
cols_names = []   # spec labels seen so far; list index -> worksheet column - 1
cols_names_i = 0  # NOTE(review): appears unused at module level — the
                  # functions rebind a local of the same name; verify
row_i = 1         # next worksheet row to write (row 0 is the header)
workbook = xlsxwriter.Workbook('st.xlsx')
worksheet = workbook.add_worksheet()
worksheet.write(0, 0, 'Title')
cpap_spider('https://www.respshop.com/cpap-machines/manual/')
# Must close, or the .xlsx file is never flushed to disk.
workbook.close()
產品信息通過 Ajax 從另一個 URL 加載。
此腳本將沿產品的名稱/網址加載所有技術參數:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'https://www.respshop.com/cpap-masks/nasal/'
product_info_url = 'https://www.respshop.com/product_info.php'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0'}

# The listing page only carries names/links; each product's
# "Technical details" tab is loaded separately by an Ajax POST (tab=3).
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')

all_data = []
for item in soup.select('td.name a'):
    # Product detail page — used only to read the SKU (itemprop="mpn").
    s = BeautifulSoup(requests.get(item['href'], headers=headers).content, 'html.parser')
    mpn = s.select_one('[itemprop="mpn"]')
    # FIX: select_one returns None when the element is absent; the
    # original `.text` access would raise AttributeError.
    sku = mpn.text if mpn is not None else ''
    print(item.text, sku)

    # The numeric product id is embedded in the URL: ...-p-<id>.html
    match = re.search(r'p-(\d+)\.html', item['href'])
    if match is None:
        # FIX: re.search returns None for unexpected URLs; subscripting
        # None raised TypeError in the original.
        continue
    products_id = match[1]

    # tab=3 selects the "Technical details" fragment for this product.
    s = BeautifulSoup(requests.post(product_info_url,
                                    data={'products_id': products_id, 'tab': 3},
                                    headers=headers).content, 'html.parser')

    row = {'Name': item.text, 'SKU': sku, 'URL': item['href']}
    # Spec table layout: first td.main is the label, second the value.
    for k, v in zip(s.select('#cont_3 td.main:nth-child(1)'),
                    s.select('#cont_3 td.main:nth-child(2)')):
        row[k.get_text(strip=True)] = v.get_text(strip=True)
    all_data.append(row)

df = pd.DataFrame(all_data)
df.to_csv('data.csv')
輸出：
ComfortGel Blue Nasal CPAP Mask - Philips Respironics 1070038, 1070037, 1070039, 1070040, 1070050, 1070051, 1070052, 1070049
Wisp Nasal Mask - Philips Respironics 1094051, 1094050, 1109298
Dreamwear Nasal Mask - Philips Respironics 1116700, 1116680, 1116681, 1116682, 1116683, 1116685, 1116686, 1116687, 1116688, 1116690, 1116691, 1116692, 1116693
Airfit N20 Nasal CPAP Mask by ResMed w/ 5 Free Cushions 63536, 63538, 63539
Airfit N30i - ResMed Nasal Mask 63800, 63801
New Respironics DreamWear Nasal Mask With Headgear Arm FitPack 1142376
ResMed AirFit N30 CPAP Nasal Cradle Mask 64222, 64223, 64224
...etc.
創建data.csv
(來自 LibreOffice 的屏幕截圖):
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.