I'm pretty new to Python and mainly need it for getting information from websites.
def spider(max_pages):
    """Crawl the listing page and process every product link found on it.

    The listing URL is downloaded once per pass, every ``<a class="c5">``
    anchor is collected, and ``single_item`` is called with each anchor's
    ``href``.

    Args:
        max_pages: Number of passes to make over the listing.
    """
    page = 1
    while page <= max_pages:
        # NOTE: the URL does not depend on ``page``, so every pass fetches
        # the same listing.
        url = 'https://www.example.com'
        source_code = requests.get(url)
        soup = BeautifulSoup(source_code.text, "html.parser")
        for link in soup.find_all('a', {'class': 'c5'}):
            href = link.get('href')
            if not href:
                # An anchor without an href gives None, which cannot be
                # fetched; skip it instead of failing inside single_item.
                continue
            # Pause between item requests to avoid hammering the server.
            time.sleep(0.3)
            single_item(href)
        page += 1
def single_item(item_url):
    """Fetch one product page and append its UPC, SKU and price to a file.

    Each matching element's value is printed, and the last match for each
    field is kept. Three lines (UPC, SKU, price) are then appended to
    ``C:\\Users\\abc.txt``. Only the plain values are written, never the
    surrounding ``<span>``/``<meta>`` markup.

    Args:
        item_url: URL of the product page to scrape.
    """
    response = requests.get(item_url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Start from empty strings so a page that lacks a field produces a blank
    # line in the output rather than leftover placeholder data.
    upc = ""
    sku = ""
    price = ""

    for tag in soup.find_all('span', {'class': 'upcNum'}):
        upc = tag.get_text(strip=True)
        print(upc)

    for tag in soup.find_all('span', {'class': 'fs16 c28'}):
        sku = tag.get_text(strip=True)
        print(sku)

    for tag in soup.find_all('meta', {'itemprop': 'price'}):
        # A <meta> element has no text; its value is in the content attribute.
        price = tag.get('content', '')
        print(price)

    # The context manager closes the file even if a write fails.
    with open(r'C:\Users\abc.txt', 'a') as out_file:
        out_file.write("\n".join([upc, sku, price]) + "\n")
# Run a one-pass crawl only when executed as a script, so importing this
# module does not trigger network requests.
if __name__ == "__main__":
    spider(1)
What I want to get is "UPC:813066012487, price:26.45 and SKU:KBPTMCC2" without any span, meta or content attributes. I attached my output below. Here is my output: screenshot
What am I doing wrong? I hope someone can figure it out. Thanks!
The data you want is in the div's data-itemdata attribute. You can call json.loads on it,
and it will give you a dict that you can access to get what you want:
from bs4 import BeautifulSoup
import requests
import json
# Download the listing page and parse it.
listing_url = "https://www.bhphotovideo.com/c/buy/accessories/ipp/100/mnp/25/Ns/p_PRICE_2%7c0/ci/20861/pn/1/N/4005352853+35"
listing_response = requests.get(listing_url)
soup = BeautifulSoup(listing_response.content, "html.parser")

# Each matched div carries a JSON string in its data-itemdata attribute;
# decode it and print the resulting object.
for item_div in soup.select("div[data-selenium=itemDetail]"):
    raw_item_json = item_div["data-itemdata"]
    data = json.loads(raw_item_json)
    print(data)
Each data dict will look like:
{u'catagoryId': u'20861',
u'inCart': False,
u'inWish': False,
u'is': u'REG',
u'itemCode': u'KBPTMCC2',
u'li': [],
u'price': u'26.45',
u'searchTerm': u'',
u'sku': u'890522'}
So just access the values by key, e.g. price = data["price"].
To get the UPC we need to visit each item's page; we can get the URL from the h3 with the data-selenium attribute:
for d in soup.select("div[data-selenium=itemDetail]"):
    # The link inside the h3 that has a data-selenium attribute points to
    # the item's own page.
    item_anchor = d.select_one("h3[data-selenium] a")
    url = item_anchor["href"]

    # Fetch that page and read the text of its span.upcNum element.
    item_response = requests.get(url)
    item_soup = BeautifulSoup(item_response.content, "html.parser")
    upc_tag = item_soup.select_one("span.upcNum")
    # NOTE: if the page has no span.upcNum, upc_tag is None and this fails.
    upc = upc_tag.text.strip()

    data = json.loads(d["data-itemdata"])
Not all pages have a UPC value, so you will have to decide what to do. If you only want products with UPCs, first check whether the select finds anything:
for d in soup.select("div[data-selenium=itemDetail]"):
    # Follow the link in the item's h3 to the item's own page.
    url = d.select_one("h3[data-selenium] a")["href"]
    upc = BeautifulSoup(requests.get(url).content, "html.parser").select_one("span.upcNum")
    # Only handle items whose page actually contains a span.upcNum element.
    if upc:
        data = json.loads(d["data-itemdata"])
        text = upc.text.strip()
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.