[英]Getting table data from web page using python beautifulsoup
我有一個顯示一些產品的網頁，我需要進入這些產品中的每一個，並獲取名為「技術細節」的選項卡下的表數據，再將這些數據放入 Excel 中的一個大表中。我寫了以下代碼，但得到的卻是一個空白的 Excel 文件。它哪里出錯了？
import requests
import xlsxwriter
from bs4 import BeautifulSoup
def cpap_spider(url):
    """Scrape the product listing at *url*.

    Writes each product title into column 0 of the module-level
    ``worksheet`` and follows the product link via ``each_item`` to
    collect its spec-table rows.  Uses the module-level ``row_i`` as
    the current output row.
    """
    global row_i
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    for cell in soup.findAll('td', {'class': 'name name2_padd'}):
        # BUG FIX: the href attribute lives on the <a> nested inside the
        # <td>; calling .get('href') on the <td> itself always returned
        # None, so each_item() fetched nothing and the sheet stayed blank.
        anchor = cell.find('a')
        if anchor is None:
            continue
        href = anchor.get('href')
        title = anchor.get_text(strip=True)
        worksheet.write(row_i, 0, title)
        each_item(href)
        print(href)
def each_item(item_url):
    """Fetch one product page and copy its ``width_table`` spec table
    into the module-level ``worksheet``.

    In each table row the first <td> is the attribute label and the
    remaining cells are values.  Labels are registered in the
    module-level ``cols_names`` list, which maps a label to its
    spreadsheet column; new labels are also written to header row 0.
    Advances the module-level ``row_i`` output-row counter.
    """
    global cols_names, row_i
    response = requests.get(item_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'class': 'width_table'})
    if table is None:
        # Product page has no technical-details table; nothing to record.
        return
    for table_row in table.find_all('tr'):
        cells = table_row.select('td')
        # FIX: was an unbound local (UnboundLocalError) whenever the
        # first cell of a row was empty; track it explicitly per row.
        col_index = None
        for position, cell in enumerate(cells):
            text = cell.text.strip()
            if not text:
                continue
            # Labels come as "Weight:" — drop the trailing colon.
            if text.endswith(':'):
                text = text[:-1]
            if position == 0:
                # First cell is the attribute name; register it as a
                # new spreadsheet column if we have not seen it before.
                # (Replaces the original bare `except:` lookup.)
                if text in cols_names:
                    col_index = cols_names.index(text)
                else:
                    cols_names.append(text)
                    col_index = len(cols_names) - 1
                    worksheet.write(0, col_index + 1, text)
                continue
            if col_index is not None:
                worksheet.write(row_i, col_index + 1, text)
    row_i += 1
# Module-level state shared (as globals) with cpap_spider / each_item.
cols_names = []   # spec labels seen so far; list index -> worksheet column - 1
cols_names_i = 0  # NOTE(review): appears unused at module level — the
                  # functions rebind a local of the same name; verify
row_i = 1         # next worksheet row to write (row 0 is the header)
workbook = xlsxwriter.Workbook('st.xlsx')
worksheet = workbook.add_worksheet()
worksheet.write(0, 0, 'Title')
cpap_spider('https://www.respshop.com/cpap-machines/manual/')
# Must close, or the .xlsx file is never flushed to disk.
workbook.close()
產品信息通過 Ajax 從另一個 URL 加載。
此腳本將沿產品的名稱/網址加載所有技術參數:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'https://www.respshop.com/cpap-masks/nasal/'
product_info_url = 'https://www.respshop.com/product_info.php'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0'}

# The listing page only carries names/links; each product's
# "Technical details" tab is loaded separately by an Ajax POST (tab=3).
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')

all_data = []
for item in soup.select('td.name a'):
    # Product detail page — used only to read the SKU (itemprop="mpn").
    s = BeautifulSoup(requests.get(item['href'], headers=headers).content, 'html.parser')
    mpn = s.select_one('[itemprop="mpn"]')
    # FIX: select_one returns None when the element is absent; the
    # original `.text` access would raise AttributeError.
    sku = mpn.text if mpn is not None else ''
    print(item.text, sku)

    # The numeric product id is embedded in the URL: ...-p-<id>.html
    match = re.search(r'p-(\d+)\.html', item['href'])
    if match is None:
        # FIX: re.search returns None for unexpected URLs; subscripting
        # None raised TypeError in the original.
        continue
    products_id = match[1]

    # tab=3 selects the "Technical details" fragment for this product.
    s = BeautifulSoup(requests.post(product_info_url,
                                    data={'products_id': products_id, 'tab': 3},
                                    headers=headers).content, 'html.parser')

    row = {'Name': item.text, 'SKU': sku, 'URL': item['href']}
    # Spec table layout: first td.main is the label, second the value.
    for k, v in zip(s.select('#cont_3 td.main:nth-child(1)'),
                    s.select('#cont_3 td.main:nth-child(2)')):
        row[k.get_text(strip=True)] = v.get_text(strip=True)
    all_data.append(row)

df = pd.DataFrame(all_data)
df.to_csv('data.csv')
輸出：
ComfortGel Blue Nasal CPAP Mask - Philips Respironics 1070038, 1070037, 1070039, 1070040, 1070050, 1070051, 1070052, 1070049
Wisp Nasal Mask - Philips Respironics 1094051, 1094050, 1109298
Dreamwear Nasal Mask - Philips Respironics 1116700, 1116680, 1116681, 1116682, 1116683, 1116685, 1116686, 1116687, 1116688, 1116690, 1116691, 1116692, 1116693
Airfit N20 Nasal CPAP Mask by ResMed w/ 5 Free Cushions 63536, 63538, 63539
Airfit N30i - ResMed Nasal Mask 63800, 63801
New Respironics DreamWear Nasal Mask With Headgear Arm FitPack 1142376
ResMed AirFit N30 CPAP Nasal Cradle Mask 64222, 64223, 64224
...etc.
創建data.csv
(來自 LibreOffice 的屏幕截圖):
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.