简体   繁体   中英

In Python, how do I get the values between specific strings in a web page?

Can you please help me? I have been thinking about this for a long time but do not know what to write :(

I need two values: asin and price

asin = # I need the value that appears between <div data-asin=" and " in the webpage source code

price = # I need the value that appears between <span class="a-price" data-a-size="l" data-a-color="base"><span class="a-offscreen">SAR and </span> in the webpage source code

Example web page https://www.amazon.sa/s?i=electronics&bbn=16966387031&rh=n:12463162031%2Cn:12463163031%2Cn:16966387031%2Cp_6:A2XPWB6MYN7ZDK&s=price-asc-rank&dc&fst=as:off&qid=1592532915&rnid=16641811031&ref=sr_st_price-asc-rank&dc&page=1

from bs4 import BeautifulSoup as soup
from concurrent.futures import ThreadPoolExecutor
import requests
import time
   
# Size of the thread pool used to process the URLs concurrently.
number_of_threads = 6

# Output file name: a fixed prefix followed by the current local timestamp.
out_filename = time.strftime("soldbysouq-shopanddodandsupermarket%Y%m%d-%H%M%S")

# Header row written once at the top of the output file.
headers = "price,asin,\n"

# Fetch `url`, parse the response with BeautifulSoup and build a text block
# with one "price,asin" line per container, which is returned to the caller.
# NOTE: this is the unfinished snippet from the question -- the right-hand
# sides of `containers`, `asin` and `price` are still placeholders, so the
# function does not parse as written.
def extract_data_from_url_func(url):
    print(url)
    response = requests.get(url)
    page_soup = soup(response.text, "html.parser")

    containers = # tried a lot of things without luck
    # accumulated output text for this page
    output = ''
    for container in containers:
        asin = # I need the values that is between **<div data-asin="** and **"** in the webpage source code 
        price = # I need the values that is between **<span class="a-price" data-a-size="l" data-a-color="base"><span class="a-offscreen">SAR**  and **</span>**  in the webpage source code 
       
          
        output_list = [price,asin,]
        # append one comma-joined line for this container
        output = output + ",".join(output_list) + "\n"
        # prints everything accumulated so far, not just the latest line
        print(output)

    return output

# Load the list of page URLs, one per line, trimming surrounding whitespace.
with open("amazonlist1.csv", "r") as url_file:
    URLS = [line.strip() for line in url_file.readlines()]

# Run the extraction function over every URL on a thread pool and collect
# the per-URL results into a list.
with ThreadPoolExecutor(max_workers=number_of_threads) as executor:
    results = executor.map(extract_data_from_url_func, URLS)
    responses = list(results)


# Write the header row, then each URL's text block followed by a newline.
with open(out_filename, "w", encoding='utf-8-sig') as out_file:
    out_file.write(headers)
    for page_rows in responses:
        out_file.write(page_rows + "\n")

This will solve your data-asin thing:

# Select every <div> that carries a data-asin attribute and print its value.
items = page_soup.select('div[data-asin]')
for asin_div in items:
    asin_value = asin_div['data-asin']
    print(asin_value)

B0887QV8BB
B0856Q8K78
B01ISO92V2
B07N3XNX77
B07XQXPF21
B07VKNWD5P
.
.
.
.

# Text of every <span class="a-offscreen"> element on the page.
price = [span.text for span in page_soup.find_all('span', class_="a-offscreen")]

Edit:

from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time

url = 'https://www.amazon.sa/s?i=electronics&bbn=16966387031&rh=n:12463162031%2Cn:12463163031%2Cn:16966387031%2Cp_6:A2XPWB6MYN7ZDK&s=price-asc-rank&dc&fst=as:off&qid=1592532915&rnid=16641811031&ref=sr_st_price-asc-rank&dc&page=1'
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.set_window_size(1024, 600)
    driver.maximize_window()
    driver.get(url)

    # Wait *before* reading page_source: sleeping after the HTML has already
    # been parsed cannot change what was captured.
    time.sleep(3)
    soup = bs(driver.page_source, 'html.parser')
finally:
    # Always shut the browser down, even if loading the page failed.
    driver.quit()

# Print the data-asin attribute of every <div> that has one.
items = soup.select('div[data-asin]')
for item in items:
    print(item['data-asin'])

B07N7CPZ5V
B0856YYBC6
B0856WVLDH
B07N6PYYZL
B07P9WKRHD
B07WCVVBLT
B07Q1D8XWP
.
.
.

Don't forget to install the required package: pip install webdriver-manager

Edit2:

Full Code:

from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time

url = 'https://www.amazon.sa/s?i=electronics&bbn=16966387031&rh=n:12463162031%2Cn:12463163031%2Cn:16966387031%2Cp_6:A2XPWB6MYN7ZDK&s=price-asc-rank&dc&fst=as:off&qid=1592532915&rnid=16641811031&ref=sr_st_price-asc-rank&dc&page=1'
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.set_window_size(1024, 600)
    driver.maximize_window()
    driver.get(url)

    # Wait *before* reading page_source: sleeping after the HTML has already
    # been parsed cannot change what was captured.
    time.sleep(3)
    soup = bs(driver.page_source, 'html.parser')
finally:
    # Always shut the browser down, even if loading the page failed.
    driver.quit()

# Pair each ASIN with the price found inside the *same* container. Collecting
# the two lists independently and zipping them pairs by position only, which
# goes wrong as soon as one container has an empty data-asin or no price.
info = []
for item in soup.select('div[data-asin]'):
    asin = item['data-asin']
    price_tag = item.find('span', class_='a-price')
    if not asin or price_tag is None:
        continue
    tokens = price_tag.text.split()
    if not tokens:
        continue
    # Drop thousands separators ("1,699.00") so float() can parse the amount.
    info.append((asin, float(tokens[0].replace(',', ''))))

# Kept for compatibility with the original variable names.
dasin = [asin for asin, _ in info]
price = [amount for _, amount in info]

info:

[('B07N7CPZ5V', 3.24),
 ('B0856YYBC6', 3.97),
 ('B0856WVLDH', 3.97),
 ('B07N6PYYZL', 3.97),
 ('B07P9WKRHD', 4.0),
 ('B07WCVVBLT', 39.0),
 ('B07Q1D8XWP', 5.0),
 ('B07NDY31Q2', 111.93),
 ('B07N78MQM2', 5.29),
 ('B083B1K2NF', 5.29),
 ('B0856T4P15', 5.29),
 ('B07MMK4KQT', 6.04),
 ('B083X1YGWK', 6.48),
 ('B07PKHR3ZZ', 6.61),
 ('B07NF1C183', 6.61),
 ('B07P9W46JT', 6.61),
 ('B07P5HWJK7', 6.61),
 ('B0857BQGJP', 6.74),
 ('B085714YYH', 7.39),
 ('B0856Y5G3J', 7.94),
 ('B0856XLCQD', 7.94),
 ('B0856QQWCY', 7.94),
 ('B07NF3RZN5', 7.94),
 ('B07NF17XYP', 7.94),
 ('', 7.94)]
from bs4 import BeautifulSoup as soup
import requests
import re

def function(url):
    """Print the ASIN and price of every product container on the page at *url*.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup.
    For each element that has a non-empty ``data-asin`` attribute the ASIN is
    printed, followed by the string of the first child of the ``a-price``
    element found inside that element (``None`` when there is no price).

    Args:
        url: Address of the page to download.

    Returns:
        None. Results are written to standard output.
    """
    print(url)
    response = requests.get(url)
    page_soup = soup(response.text, "html.parser")

    # all objects with data-asin attr with any value aside from empty (there is empty data-asin on page without price or item)
    containers = page_soup.find_all(attrs={'data-asin': re.compile(r'.+')})
    for container in containers:
        asin = container['data-asin']  # accessing attrs of Tag is simple as that
        print(asin)
        # Search inside this container only. find_next() keeps scanning the
        # rest of the document, so for a product without a price it would
        # return the next product's price (or None at the end of the page,
        # which then raised AttributeError).
        price_tag = container.find(attrs={'class': 'a-price'})
        first_child = None
        if price_tag is not None:
            # the children's strings look like ['1,699.00 ريال', None]; only the first is wanted
            first_child = next(iter(price_tag.children), None)
        price = first_child.string if first_child is not None else None
        print(price)

# example output for your URL (for me is Egypt region for some reason so it's Egyptian pounds)
# https://www.amazon.sa/s?i=electronics&bbn=16966387031&rh=n:12463162031%2Cn:12463163031%2Cn:16966387031%2Cp_6:A2XPWB6MYN7ZDK&s=price-asc-rank&dc&fst=as:off&qid=1592532915&rnid=16641811031&ref=sr_st_price-asc-rank&dc&page=1
# B0887QV8BB
# 939.00 ريال
# B0856Q8K78
# 445.36 ريال
# B01ISO92V2
# 85.82 ريال
# B07N3XNX77
# 92.00 ريال
# B07XQXPF21
# 1,794.00 ريال
# B07VKNWD5P
# 262.45 ريال
# B07RCG7VSB
# 84.53 ريال
# B083TK2W87
# 15.87 ريال
# B07CXGPW8G
# 118.00 ريال
# B07ZRCT1VH
# 54.75 ريال
# B07P6YKSQP
# 69.00 ريال
# B085JVBMTT
# 322.20 ريال
# B089G813SB
# 499.00 ريال
# B07B88KQZ8
# 82.95 ريال
# B08521415W
# 995.00 ريال
# B06XGBC43Z
# 18.50 ريال
# B07Y3KGRL8
# 3,104.00 ريال
# B084ZTCM1G
# 1,399.00 ريال
# B0856Y5G11
# 995.00 ريال
# B075THDT3M
# 129.00 ريال
# B01GGKYKQM
# 11.00 ريال
# B0851ZWC87
# 399.00 ريال
# B084FFWSLH
# 799.00 ريال
# B0871TDTLV
# 1,699.00 ريال

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM