[英]Scraping e-commerce in python - cannot fetch product categories and total amounts
到目前為止,我的代碼可以檢索“Charms”(吊飾)類別中正在特價出售的商品數量。 但是我無法使其打印出類別名稱。
該站點使用無限滾動加載——但是我設法找出了各分頁對應的URL,因此URL中包含佔位符{},並由while循環依次填入起始位置。
import requests
from bs4 import BeautifulSoup
url = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"
def fetch_items(link, page):
    """Count matching price tiles over the paginated listing and read its breadcrumbs.

    Args:
        link: URL template with one ``{}`` placeholder that receives the
            ``start`` offset of each request.
        page: Offset of the first request. It is advanced by 30 after every
            request, and requests continue while it is at most 1000.

    Returns:
        A ``(total_items, category_names)`` tuple. ``total_items`` is the
        number of elements matching ``.grid-tile .price-standard`` summed over
        every fetched page (the script treats these as the items on sale).
        ``category_names`` holds the stripped text of each
        ``span.breadcrumb-element`` on the last fetched page; it is empty when
        no page was requested or the page has no breadcrumb.
    """
    total_items = 0
    soup = None
    while page <= 1000:
        res = requests.get(link.format(page), headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(res.text, "lxml")
        total_items += len(soup.select('.grid-tile .price-standard'))
        page += 30
    if soup is None:
        # The loop never ran (start offset already past the limit), so there
        # is no parsed page to read breadcrumbs from.
        return total_items, []
    # A function stops at its first ``return``; both results therefore travel
    # back together in one tuple instead of two consecutive return statements.
    category_names = [tag.text.strip() for tag in soup.select('span.breadcrumb-element')]
    return total_items, category_names


if __name__ == '__main__':
    page = 0
    product_list = []
    total_items, category_names = fetch_items(url, page)
    # Number of items on sale.
    print(total_items)
    # Category names taken from the breadcrumb; these are now returned by
    # fetch_items rather than read from a name local to that function.
    print(category_names)
這是我需要的:我需要打印出所抓取商品的類別,該類別可以通過下面這一行獲得:
category_tags = soup.select('span.breadcrumb-element')
但是我不能以某種方式打印它。
順便問一下,如何使代碼打印出所有商品,而不僅僅是特價商品?
謝謝。
編輯:在其中一位回答者的代碼基礎上,我最終寫出了下面的代碼。
import requests
from bs4 import BeautifulSoup
import re
url1 = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"
url2 = "https://us.pandora.net/en/bracelets/?sz=30&start={}&format=page-element"
url3 = "https://us.pandora.net/en/rings/?sz=30&start={}&format=page-element"
url4 = "https://us.pandora.net/en/necklaces/?sz=30&start={}&format=page-element"
url5 = "https://us.pandora.net/en/earrings/?sz=30&start={}&format=page-element"
#res = requests.get(link.format(url1),headers={"User-Agent":"Mozilla/5.0"})
soup1 = BeautifulSoup(requests.get(url1.format(0)).text, 'lxml')
soup2 = BeautifulSoup(requests.get(url2.format(0)).text, 'lxml')
soup3 = BeautifulSoup(requests.get(url3.format(0)).text, 'lxml')
soup4 = BeautifulSoup(requests.get(url4.format(0)).text, 'lxml')
soup5 = BeautifulSoup(requests.get(url5.format(0)).text, 'lxml')
total_items1 = ''.join(re.findall(r'\d', soup1.select_one('span.products-count').text))
total_items2 = ''.join(re.findall(r'\d', soup2.select_one('span.products-count').text))
total_items3 = ''.join(re.findall(r'\d', soup3.select_one('span.products-count').text))
total_items4 = ''.join(re.findall(r'\d', soup4.select_one('span.products-count').text))
total_items5 = ''.join(re.findall(r'\d', soup5.select_one('span.products-count').text))
#categories = [tag['title'].strip() for tag in soup.select('.refinement-link[title]')
#total_items_sale1 = ''.join(re.findall(r'\d', soup1.select_one('.grid-tile .price-standard')))
#total_items_sale1
#total_items_sale1
#total_items_sale1
#total_items_sale1
#print('Categories:')
#for category in categories:
#print('\t{}'.format(category))
print('\nTotal Charms: {}'.format(total_items1))
print('\nTotal Bracelets: {}'.format(total_items2))
print('\nTotal Rings: {}'.format(total_items3))
print('\nTotal Necklaces: {}'.format(total_items4))
print('\nTotal Earrings: {}'.format(total_items5))
我知道這看起來很可怕。 我們如何縮短它?
那里不能有2個return語句。 該函數在第一次return之后就會停止,因此,如果要返回多個對象,可以將它們放在同一行中返回。 您還需要在循環內將類別附加到列表中,而您原來的代碼把它放在了循環之外。 注意,為了便於測試,我將上限從1000更改為300。
其次,我認為您想要的是文本。
要打印所有商品,您需要獲取每個商品,而不僅僅是具有'price-standard'的商品:
import requests
from bs4 import BeautifulSoup
url = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"
def fetch_items(link, page):
    """Count matching price tiles and record one category label per fetched page.

    Args:
        link: URL template with one ``{}`` placeholder that receives the
            ``start`` offset of each request.
        page: Offset of the first request. It is advanced by 30 after every
            request, and requests continue while it is at most 300.

    Returns:
        A ``(total_items, categories)`` tuple. ``total_items`` is the number
        of elements matching ``.grid-tile .price-standard`` summed over all
        fetched pages. ``categories`` has one entry per fetched page: the text
        of that page's first ``span.breadcrumb-element``, or ``'N/A'`` when
        the page has none.
    """
    total_items = 0
    categories = []
    while page <= 300:
        res = requests.get(link.format(page), headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(res.text, "lxml")
        total_items += len(soup.select('.grid-tile .price-standard'))
        page += 30
        # Progress output: the offset of the next request.
        print(page)
        # select_one yields None when nothing matches, so a page without a
        # breadcrumb takes the 'N/A' branch. Indexing ``select(...)[0]``
        # would raise IndexError before any fallback could apply.
        breadcrumb = soup.select_one('span.breadcrumb-element')
        categories.append(breadcrumb.text if breadcrumb is not None else 'N/A')
    return total_items, categories
# Begin at the first listing offset.
page = 0
total_items = fetch_items(url, page)
# Position 0 of the result is the on-sale count and position 1 the collected
# categories; print each on its own line.
for index in (0, 1):
    print(total_items[index])
以下是獲取完整產品的方法:
def _text_or_na(tag):
    """Return the stripped text of *tag*, or 'N/A' when the lookup found nothing."""
    return tag.text.strip() if tag is not None else 'N/A'


def fetch_items(link, page):
    """Collect name, category, price and sale price for every product tile.

    Args:
        link: URL template with one ``{}`` placeholder that receives the
            ``start`` offset of each request.
        page: Offset of the first request. It is advanced by 30 after every
            request, and requests continue while it is at most 300.

    Returns:
        A ``(total_items, names, categories, prices, sales)`` tuple. The four
        lists are parallel, one entry per ``li`` element whose class starts
        with ``grid-tile``. A field whose element or attribute is missing from
        a tile is recorded as ``'N/A'``, so one incomplete tile neither aborts
        the scrape nor misaligns the lists.
    """
    total_items = 0
    names, categories, prices, sales = [], [], [], []
    while page <= 300:
        res = requests.get(link.format(page), headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(res.text, "lxml")
        products = soup.find_all("li", class_=lambda value: value and value.startswith("grid-tile"))
        for each in products:
            total_items += 1
            # The category id is an attribute of the product-tile div rather
            # than text, so it needs its own missing-value handling.
            tile = each.find('div', {'class': 'product-tile'})
            category = tile.get('data-cgid', 'N/A') if tile is not None else 'N/A'
            names.append(_text_or_na(each.find('div', {'class': 'product-name'})))
            categories.append(category)
            prices.append(_text_or_na(each.find('div', {'class': 'product-pricing'})))
            sales.append(_text_or_na(each.find('span', {'class': 'price-sales'})))
        # Progress output: the offset that was just processed.
        print(page)
        page += 30
    return total_items, names, categories, prices, sales
# Run the scrape and keep the whole returned tuple.
# NOTE(review): `url` and `page` are not defined in this snippet; presumably
# they come from the preceding one (where page = 0) — confirm before running
# this standalone.
results = fetch_items(url,page)
雖然不能確定您希望這些結果如何。 但是,如果您願意,可以將其轉儲到表中:
import pandas as pd
# Build the table from `results`: positions 1-4 supply the columns in the
# order listed here (position 0 is not used).
df = pd.DataFrame({
    label: results[position]
    for position, label in enumerate(('name', 'category', 'price', 'sale'), start=1)
})
輸出:
# Print the first ten rows of the table rendered as plain text.
print (df.head(10).to_string())
name category price sale
0 American Icons Dangle Charm charms $60.00 $60.00
1 Disney Pixar, Toy Story, Buzz Lightyear Dangle... charms $70.00 $70.00
2 Disney Pixar, Toy Story, Woody Dangle Charm charms $60.00 $60.00
3 Spinning Globe Dangle Charm charms $60.00 $60.00
4 Elephant Charm charms $45.00 $45.00
5 Canada Dangle Charm, Pandora Rose™ charms $65.00 $65.00
6 Sparkling Monkey Charm charms $70.00 $70.00
7 Propeller Plane Dangle Charm charms $55.00 $55.00
8 Spotted Heart Charm charms $50.00 $50.00
9 Pink Travel Bag Charm charms $50.00 $50.00
查看服務器返回的結果可知,您不必遍歷所有頁面。 您需要的所有信息在一頁上就有:
import requests
from bs4 import BeautifulSoup
url = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"
sale_url = "https://us.pandora.net/en/sale/sale-charms/?sz=30&start={}&format=page-element"


def _first_page(template):
    """Download the listing at offset 0 of *template* and parse it with lxml."""
    response = requests.get(template.format(0))
    return BeautifulSoup(response.text, 'lxml')


# Only the first page of each listing is requested; both totals are read
# from the `value` attribute of the element with id `products_count`.
soup = _first_page(url)
sale_soup = _first_page(sale_url)

total_items = soup.select_one('#products_count')['value']
total_sale_items = sale_soup.select_one('#products_count')['value']

# Category names are the stripped `title` attributes of the refinement links.
categories = []
for tag in soup.select('.refinement-link[title]'):
    categories.append(tag['title'].strip())

print('Categories:')
for category in categories:
    print('\t{}'.format(category))

print('\nTotal items: {}'.format(total_items))
print('Total sale items: {}'.format(total_sale_items))
輸出:
Categories:
Charms
New Arrivals
Best Sellers
Clips
Spacers
Dangles
Safety Chains
Alphabet & Symbols
Animals & Pets
Birthday
Touch of Color
Disney
Family
Holidays
Christmas
Inspirational
Symbols of Love
Nature
Passions
Vacation & Travel
Wedding & Anniversary
Last Chance
Pandora Reflexions™
$0 - $50
$50 - $100
$100 - $150
$150 & Over
Charms
New Arrivals
Best Sellers
Clips
Spacers
Dangles
Safety Chains
Alphabet & Symbols
Animals & Pets
Birthday
Touch of Color
Disney
Family
Holidays
Christmas
Inspirational
Symbols of Love
Nature
Passions
Vacation & Travel
Wedding & Anniversary
Last Chance
Pandora Reflexions™
Total items: 959
Total sale items: 376
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.