[英]Can't turn scraped data into csv file
import csv
import requests
from bs4 import BeautifulSoup
# Question's script: scrape product details from a category page, then try to write them to CSV.
# NOTE(review): indentation was lost in this listing, so loop/with bodies are inferred from
# context — confirm against the original post.
# NOTE(review): the URL literal below is split across two lines, which looks like a
# line-wrap artefact; the answers further down show the same URL on a single line.
html = requests.get("https://www.medplusmedicalsupply.com/exam-and-
diagnostic").text
soup = BeautifulSoup(html, "lxml")
# Collect every <div> whose class attribute is "product details product-item-details".
products = soup.findAll('div', {"class": "product details product-item-details"})
for product in products:
# A new dict is created for each product, keyed in lowercase ('name', 'brand', ...).
product_details = dict()
# Repeating .strip() with the same character set has no further effect after the first call.
product_details['name'] = product.find('a').text.strip('\n\r\t": ').strip('\n\r\t": ').strip('\n\r\t": ').strip('\n\r\t": ')
product_details['brand'] = product.find('div', {'class': 'value'}).text.strip('\n\r\t": ').strip('\n\r\t": ').strip('\n\r\t": ')
# The sample output below shows this value as 'Brand: \n\n<brand>': strip() only trims the
# ends, so the label and the embedded newlines remain.
product_details['packaging'] = product.find('div', {'class': 'pack'}).text.strip('\n\r\t": ').strip('\n\r\t": ').strip('\n\r\t": ')
# Likewise the sample output shows 'Availability: \n\n1-3 Days' for this value.
product_details['availability'] = product.find('div', {'class': 'avail pack'}).text.strip('\n\r\t": ').strip('\n\r\t": ').strip('\n\r\t": ')
product_details['price'] = product.find('span', {'class': 'price'}).text.strip('\n\r\t": ').strip('\n\r\t": ').strip('\n\r\t": ')
print(product_details)
# `prod` is not referenced again anywhere in this snippet.
prod = product_details
# Nothing above accumulates the per-product dicts; each one is only printed.
with open('../../www/products.csv', 'w+', newline='') as csvFile:
writer = csv.writer(csvFile)
writer.writerow(['Name', 'Brand', 'Packaging', 'Availability', 'Price'])
# This loop iterates `products` (the findAll() result), not the dicts built above, and
# indexes with capitalised keys. The traceback below shows the first lookup raising
# KeyError: 'Name' from bs4's Tag.__getitem__, which looks up tag attributes.
for product in products:
writer.writerow([product['Name'], product['Brand'],product['Packaging'], product['Availability'], product['Price']])
結果:
{'name': 'Kimberly Clark Blue Nitrile Exam Gloves -100/BX', 'brand': 'Kimberly Clark', 'packaging': 'Brand: \n\nKimberly Clark', 'availability': 'Availability: \n\n1-3 Days', 'price': '$26.94'}
{'name': 'Pro Advantage Gloves - 200/BX', 'brand': 'Pro Advantage', 'packaging': 'Brand: \n\nPro Advantage', 'availability': 'Availability: \n\n1-3 Days', 'price': '$28.94'}
{'name': 'Kimberly Clark Powder Free Exam Gloves - 500/CS', 'brand': 'Kimberly Clark', 'packaging': 'Brand: \n\nKimberly Clark', 'availability': 'Availability: \n\n1-3 Days', 'price': '$136.99'}
{'name': 'Kimberly Clark Sterile Pairs Exam Gloves - 400/CS', 'brand': 'Kimberly Clark', 'packaging': 'Brand: \n\nKimberly Clark', 'availability': 'Availability: \n\n1-3 Days', 'price': '$205.99'}
{'name': 'Dynarex Multi Care Vinyl Exam Gloves Non-Latex - 1000/CS', 'brand': 'Dynarex', 'packaging': 'Brand: \n\nDynarex', 'availability': 'Availability: \n\n1-3 Days', 'price': '$61.99'}
{'name': 'Dynarex Tillotson Powder Free Plus Latex Exam Gloves - 1000/CS', 'brand': 'Dynarex', 'packaging': 'Brand: \n\nDynarex', 'availability': 'Availability: \n\n1-3 Days', 'price': '$109.99'}
{'name': 'Dynarex 6512 True Advantage High Risk Nitrile Exam Gloves, 8 mil - 500/CS', 'brand': 'Dynarex', 'packaging': 'Brand: \n\nDynarex', 'availability': 'Availability: \n\n1-3 Days', 'price': '$105.99'}
{'name': 'Dynarex 6512 True Advantage High Risk Nitrile Exam Gloves, 8 mil - 50/BX', 'brand': 'Dynarex', 'packaging': 'Brand: \n\nDynarex', 'availability': 'Availability: \n\n1-3 Days', 'price': '$20.99'}
{'name': 'Dynarex Tillotson Tru Advantage PF Nitrile Exam Gloves - 1000/CS', 'brand': 'Dynarex', 'packaging': 'Brand: \n\nDynarex', 'availability': 'Availability: \n\n1-3 Days', 'price': '$101.99'}
{'name': 'Dynarex Tillotson Tru Advantage PF Nitrile Exam Gloves - 100/BX', 'brand': 'Dynarex', 'packaging': 'Brand: \n\nDynarex', 'availability': 'Availability: \n\n1-3 Days', 'price': '$18.99'}
KeyError
Traceback (most recent call last)
<ipython-input-10-668855af45c8> in <module>()
24 writer.writerow(['Name', 'Brand', 'Packaging', 'Availability', 'Price'])
25 for product in products:
---> 26 writer.writerow([product['Name'], product['Brand'],product['Packaging'], product['Availability'], product['Price']])
27
28
~\Anaconda3\lib\site-packages\bs4\element.py in __getitem__(self, key)
1009 """tag[key] returns the value of the 'key' attribute for the tag,
1010 and throws an exception if it's not there."""
-> 1011 return self.attrs[key]
1012
1013 def __iter__(self):
KeyError: 'Name'
實際上 Martineau 是對的:Name 應該改成 name,並且 product_details 字典中用到的每個鍵都要做同樣的修改(全部改用小寫)。
但這並不是這裡唯一的問題:您的抓取結果也不會如您所願,因為 product_details['packaging'] 和 product_details['availability'] 的字符串中間含有換行符(例如 'Brand: \n\nKimberly Clark'),所以它們也需要清洗。
另外,按照您目前的流程也得不到結果:您把每一行數據收集到 product_details 字典中,寫入 csv 時卻是在遍歷 products 對象(其中保存的是帶有全部標簽內容的 BeautifulSoup 標簽)。而且 product_details 在每次迭代時都會被覆蓋,寫輸出文件時根本沒有用到這個字典。
因此,在寫入文件的循環中,您需要把 products 換成 product_details。
下面是一個可以運行的示例:
import csv
import requests
from bs4 import BeautifulSoup
# Characters trimmed from both ends of every scraped text fragment.
STRIP_CHARS = '\n\r\t": '


def clean_text(text, label=''):
    """Return *text* without surrounding whitespace, quotes and colons.

    If *label* is given and the trimmed text starts with it (for example
    the "Brand" in front of a brand name), the label and the separator
    characters that follow it are removed as well.  Text that does not
    start with *label* is returned trimmed but otherwise unchanged, so a
    missing label never eats real data the way a fixed-width slice would.
    """
    cleaned = text.strip(STRIP_CHARS)
    if label and cleaned.startswith(label):
        cleaned = cleaned[len(label):].strip(STRIP_CHARS)
    return cleaned


def product_row(product):
    """Build [name, brand, packaging, availability, price] from one product tag."""
    return [
        clean_text(product.find('a').text),
        clean_text(product.find('div', {'class': 'value'}).text),
        # The question's sample output shows this div's text starting with "Brand:".
        clean_text(product.find('div', {'class': 'pack'}).text, 'Brand'),
        clean_text(product.find('div', {'class': 'avail pack'}).text, 'Availability'),
        clean_text(product.find('span', {'class': 'price'}).text),
    ]


def write_products(csv_file, rows):
    """Write a header line plus *rows* to *csv_file* as tab-separated values.

    Each row is passed to the writer as a list of fields; the tab delimiter
    is configured on the writer itself.  Joining the fields into one string
    by hand would make the csv module treat the whole line as a single cell.
    """
    writer = csv.writer(csv_file, delimiter='\t')
    writer.writerow(['Name', 'Brand', 'Packaging', 'Availability', 'Price'])
    writer.writerows(rows)


def main():
    """Scrape the category page and save one tab-separated row per product."""
    html = requests.get("https://www.medplusmedicalsupply.com/exam-and-diagnostic").text
    soup = BeautifulSoup(html, "lxml")
    products = soup.find_all('div', {"class": "product details product-item-details"})
    # NOTE(review): '.../products.csv' looks like a placeholder path kept from the
    # original answer — replace it with a real directory before running.
    # newline='' lets the csv module control line endings itself.
    with open('.../products.csv', 'w', newline='') as csvFile:
        write_products(csvFile, (product_row(product) for product in products))


if __name__ == "__main__":
    main()
我把 csv 文件的分隔符改成了製表符(tab),因為有些產品的 name 中包含逗號;例如在 Excel 中,這些逗號會被誤當作列分隔符,於是某些行會多出額外的列。
您的鍵名(key name)有誤,應該全部使用小寫;並且在下面的代碼中,
for product in products:
writer.writerow......
products 是 BeautifulSoup 對象,而不是上一個循環中生成的產品列表,因此不能用它來取 csv 的值。
最後,您可以試試這個解決方案:
import csv
import requests
from bs4 import BeautifulSoup
# Characters trimmed from both ends of every scraped text fragment.
STRIP_CHARS = '\n\r\t": '

# Dict keys in the order their values are written to the CSV file.
CSV_COLUMNS = ['name', 'brand', 'packaging', 'availability', 'price']


def clean_text(text, label=''):
    """Return *text* without surrounding whitespace, quotes and colons.

    If *label* is given and the trimmed text starts with it (for example
    the "Availability" in front of a delivery estimate), the label and the
    separator characters after it are removed too, so the value no longer
    carries the label or the newlines that followed it.
    """
    cleaned = text.strip(STRIP_CHARS)
    if label and cleaned.startswith(label):
        cleaned = cleaned[len(label):].strip(STRIP_CHARS)
    return cleaned


def scrape_products(url):
    """Download *url* and return one dict (keys: CSV_COLUMNS) per product found."""
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    products = soup.find_all('div', {"class": "product details product-item-details"})
    all_product = []
    for product in products:
        all_product.append({
            'name': clean_text(product.find('a').text),
            'brand': clean_text(product.find('div', {'class': 'value'}).text),
            # The question's sample output shows this div's text starting with "Brand:".
            'packaging': clean_text(product.find('div', {'class': 'pack'}).text, 'Brand'),
            'availability': clean_text(
                product.find('div', {'class': 'avail pack'}).text, 'Availability'),
            'price': clean_text(product.find('span', {'class': 'price'}).text),
        })
    return all_product


def write_products(csv_file, all_product):
    """Write the header row and one row per product dict to *csv_file*."""
    writer = csv.writer(csv_file)
    writer.writerow(['Name', 'Brand', 'Packaging', 'Availability', 'Price'])
    for product in all_product:
        writer.writerow([product[column] for column in CSV_COLUMNS])


if __name__ == "__main__":
    all_product = scrape_products("https://www.medplusmedicalsupply.com/exam-and-diagnostic")
    print(all_product)
    # newline='' is required by the csv module; without it every row is
    # followed by a blank line when the script runs on Windows.
    with open('products.csv', 'w', newline='') as csvFile:
        write_products(csvFile, all_product)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.