Can you please help me? I have been thinking about this for a long time but could not work out what to write :(
I need two values: asin and price.
asin = # I need the value that is between `<div data-asin="` and `"` in the webpage source code
price = # I need the value that is between `<span class="a-price" data-a-size="l" data-a-color="base"><span class="a-offscreen">SAR` and `</span>` in the webpage source code
from bs4 import BeautifulSoup as soup
from concurrent.futures import ThreadPoolExecutor
import requests
import time
# Number of worker threads used to fetch the URLs concurrently.
number_of_threads = 6
# Output file name: a fixed prefix followed by the script's start timestamp.
out_filename = 'soldbysouq-shopanddodandsupermarket' + time.strftime("%Y%m%d-%H%M%S")
# CSV header row. It has exactly the two columns that every data row carries
# ("price,asin"); a trailing comma here would declare a third, empty column.
headers = "price,asin\n"
def extract_data_from_url_func(url):
    """Download one search-results page and return its products as CSV rows.

    Args:
        url: Address of the page to fetch.

    Returns:
        A string with one "price,asin" line per product found on the page
        (each line terminated by a newline), or an empty string when the
        page contains no product with both an ASIN and a price.
    """
    print(url)
    # A timeout keeps one stalled connection from blocking a worker forever.
    response = requests.get(url, timeout=30)
    page_soup = soup(response.text, "html.parser")
    # Every product sits in a <div data-asin="..."> element.
    containers = page_soup.select('div[data-asin]')
    rows = []
    for container in containers:
        # The ASIN is the value of the data-asin attribute itself.
        asin = container['data-asin']
        if not asin:
            # The page also contains placeholder divs with an empty data-asin.
            continue
        # The price text lives in
        # <span class="a-price"><span class="a-offscreen">SAR ...</span>.
        price_tag = container.select_one('span.a-price span.a-offscreen')
        if price_tag is None:
            continue
        # Drop the currency prefix and the thousands separator; a comma left
        # inside the price would split it into two CSV fields.
        price = price_tag.get_text(strip=True).replace('SAR', '').replace(',', '').strip()
        rows.append(",".join([price, asin]))
    output = "".join(row + "\n" for row in rows)
    print(output)
    return output
# Read the list of page URLs, one per line. Blank lines are skipped because
# an empty string is not a valid URL to request.
with open("amazonlist1.csv", "r") as fr:
    URLS = [line.strip() for line in fr if line.strip()]

# Fetch and parse the pages concurrently; map() yields results in input order.
with ThreadPoolExecutor(max_workers=number_of_threads) as executor:
    results = executor.map(extract_data_from_url_func, URLS)
    responses = list(results)

with open(out_filename, "w", encoding='utf-8-sig') as fw:
    fw.write(headers)
    for response in responses:
        # Every row in a response already ends with its own newline, so
        # nothing is appended here (that would insert blank lines).
        fw.write(response)
This will solve your data-asin problem:
# Select every <div> that carries a data-asin attribute and print its value.
items = page_soup.select('div[data-asin]')
for item in items:
    print(item['data-asin'])
B0887QV8BB
B0856Q8K78
B01ISO92V2
B07N3XNX77
B07XQXPF21
B07VKNWD5P
.
.
.
.
# Text of every <span class="a-offscreen"> on the page (the price labels).
# find_all is the current name of the legacy findAll alias.
price = [i.text for i in page_soup.find_all('span', class_="a-offscreen")]
Edit:
import time

from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

url = 'https://www.amazon.sa/s?i=electronics&bbn=16966387031&rh=n:12463162031%2Cn:12463163031%2Cn:16966387031%2Cp_6:A2XPWB6MYN7ZDK&s=price-asc-rank&dc&fst=as:off&qid=1592532915&rnid=16641811031&ref=sr_st_price-asc-rank&dc&page=1'

# NOTE(review): newer Selenium 4 releases expect the driver path wrapped in a
# Service object (webdriver.Chrome(service=Service(path))) instead of a
# positional argument - confirm against the installed Selenium version.
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.set_window_size(1024, 600)
    driver.maximize_window()
    driver.get(url)
    # Wait *before* taking the snapshot: page_source only reflects what has
    # been rendered at the moment it is read.
    time.sleep(3)
    soup = bs(driver.page_source, 'html.parser')
finally:
    # Always close the browser, even if loading the page fails.
    driver.quit()

# Print the data-asin value of every <div> that has the attribute.
items = soup.select('div[data-asin]')
for item in items:
    print(item['data-asin'])
B07N7CPZ5V
B0856YYBC6
B0856WVLDH
B07N6PYYZL
B07P9WKRHD
B07WCVVBLT
B07Q1D8XWP
.
.
.
Don't forget to install the required package: pip install webdriver-manager
Full Code:
import time

from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

url = 'https://www.amazon.sa/s?i=electronics&bbn=16966387031&rh=n:12463162031%2Cn:12463163031%2Cn:16966387031%2Cp_6:A2XPWB6MYN7ZDK&s=price-asc-rank&dc&fst=as:off&qid=1592532915&rnid=16641811031&ref=sr_st_price-asc-rank&dc&page=1'

# NOTE(review): newer Selenium 4 releases expect the driver path wrapped in a
# Service object (webdriver.Chrome(service=Service(path))) instead of a
# positional argument - confirm against the installed Selenium version.
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.set_window_size(1024, 600)
    driver.maximize_window()
    driver.get(url)
    # Wait *before* taking the snapshot: page_source only reflects what has
    # been rendered at the moment it is read.
    time.sleep(3)
    soup = bs(driver.page_source, 'html.parser')
finally:
    # Always close the browser, even if loading the page fails.
    driver.quit()

# Pair each ASIN with the price found inside the same product container.
# Collecting ASINs and prices in two separate page-wide lists and zipping
# them misaligns the pairs as soon as one container has an empty data-asin
# or no price.
info = []
for item in soup.select('div[data-asin]'):
    asin = item['data-asin']
    price_tag = item.find('span', class_='a-price')
    if not asin or price_tag is None:
        continue
    # Remove the thousands separator so values like "1,699.00" parse.
    amount = float(price_tag.text.split()[0].replace(',', ''))
    info.append((asin, amount))

# Keep the two parallel lists available under their original names.
dasin = [asin for asin, _ in info]
price = [amount for _, amount in info]
Example contents of `info`:
[('B07N7CPZ5V', 3.24),
('B0856YYBC6', 3.97),
('B0856WVLDH', 3.97),
('B07N6PYYZL', 3.97),
('B07P9WKRHD', 4.0),
('B07WCVVBLT', 39.0),
('B07Q1D8XWP', 5.0),
('B07NDY31Q2', 111.93),
('B07N78MQM2', 5.29),
('B083B1K2NF', 5.29),
('B0856T4P15', 5.29),
('B07MMK4KQT', 6.04),
('B083X1YGWK', 6.48),
('B07PKHR3ZZ', 6.61),
('B07NF1C183', 6.61),
('B07P9W46JT', 6.61),
('B07P5HWJK7', 6.61),
('B0857BQGJP', 6.74),
('B085714YYH', 7.39),
('B0856Y5G3J', 7.94),
('B0856XLCQD', 7.94),
('B0856QQWCY', 7.94),
('B07NF3RZN5', 7.94),
('B07NF17XYP', 7.94),
('', 7.94)]
from bs4 import BeautifulSoup as soup
import requests
import re
def function(url):
    """Print the ASIN and the price of every product on the page at *url*.

    For each product the ASIN is printed on one line and its price text on
    the next. Products without a price element are skipped.

    Args:
        url: Address of the search-results page to fetch.
    """
    print(url)
    # A timeout keeps a stalled connection from hanging the call forever.
    response = requests.get(url, timeout=30)
    page_soup = soup(response.text, "html.parser")
    # All elements whose data-asin attribute is non-empty (the page also
    # contains an empty data-asin element without any price or item).
    containers = page_soup.find_all(attrs={'data-asin': re.compile(r'.+')})
    for container in containers:
        # Attributes of a Tag are accessed like dictionary keys.
        asin = container['data-asin']
        # Search only inside this container. find_next() keeps scanning the
        # rest of the document, so for a product without a price it would
        # return the next product's price (or None at the end of the page).
        price_tag = container.find(attrs={'class': 'a-price'})
        if price_tag is None:
            continue
        print(asin)
        # The first child of the a-price element holds the price text, e.g.
        # '1,699.00 ريال'; later children have no single string.
        first_child = next(iter(price_tag.children), None)
        price = first_child.string if first_child is not None else None
        print(price)
# example output for your URL (for me is Egypt region for some reason so it's Egyptian pounds)
# https://www.amazon.sa/s?i=electronics&bbn=16966387031&rh=n:12463162031%2Cn:12463163031%2Cn:16966387031%2Cp_6:A2XPWB6MYN7ZDK&s=price-asc-rank&dc&fst=as:off&qid=1592532915&rnid=16641811031&ref=sr_st_price-asc-rank&dc&page=1
# B0887QV8BB
# 939.00 ريال
# B0856Q8K78
# 445.36 ريال
# B01ISO92V2
# 85.82 ريال
# B07N3XNX77
# 92.00 ريال
# B07XQXPF21
# 1,794.00 ريال
# B07VKNWD5P
# 262.45 ريال
# B07RCG7VSB
# 84.53 ريال
# B083TK2W87
# 15.87 ريال
# B07CXGPW8G
# 118.00 ريال
# B07ZRCT1VH
# 54.75 ريال
# B07P6YKSQP
# 69.00 ريال
# B085JVBMTT
# 322.20 ريال
# B089G813SB
# 499.00 ريال
# B07B88KQZ8
# 82.95 ريال
# B08521415W
# 995.00 ريال
# B06XGBC43Z
# 18.50 ريال
# B07Y3KGRL8
# 3,104.00 ريال
# B084ZTCM1G
# 1,399.00 ريال
# B0856Y5G11
# 995.00 ريال
# B075THDT3M
# 129.00 ريال
# B01GGKYKQM
# 11.00 ريال
# B0851ZWC87
# 399.00 ريال
# B084FFWSLH
# 799.00 ريال
# B0871TDTLV
# 1,699.00 ريال
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.