I am trying to get each product's name and price from a local website.
The website is loaded dynamically, so the requests library alone cannot fetch the rendered content; I am using Selenium together with Beautiful Soup instead.
However, it double-counts each product (I get two links for the same product). Is there any solution for this?
Also, after getting the product links I need to get the product information (e.g., name and price), but again it double-counts products and doesn't get the name and price.
My code:
import pandas as pd
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# Collected product-page URLs (currently duplicated -- see note in the loop).
productlinks = []
baseurl = "https://www.technodom.kz/"

# Headless Chrome: scrape without opening a visible browser window.
options = Options()
options.headless = True
driver = webdriver.Chrome(r"C:\path\to\chromedriver.exe", options=options)

# Walk listing pages 1-4 of the filtered washing-machine category.
for x in range(1, 5):
    driver.get(
        f"https://www.technodom.kz/bytovaja-tehnika/uhod-za-odezhdoj/stiral-nye-mashiny/f/brands/lg/brands/samsung?page={x}"
    )
    # Wait for the page to fully render
    sleep(3)
    soup = BeautifulSoup(driver.page_source, "lxml")
    product_list = soup.find_all("li", class_="ProductCard")
    for item in product_list:
        # NOTE(review): each <li class="ProductCard"> apparently contains more
        # than one <a href> tag, so this inner loop appends the same product
        # link multiple times -- this is the source of the duplicates.
        for link in item.find_all("a", href=True):
            productlinks.append(baseurl + link["href"])

print(productlinks)

# Visit every collected link and pull name/price from the detail page.
wmlist = []
for link in productlinks:
    driver.get(link)
    soup = BeautifulSoup(driver.page_source, "lxml")
    print(link)
    # If the element is absent (e.g. page not yet rendered -- there is no wait
    # here), soup.find() returns None and .text raises AttributeError.
    name = soup.find('h1', class_='ProductHeader-Title').text.strip()
    price = soup.find('p', class_='ProductPrice ProductInformation-Price').text.strip()
    wm = {
        'Model':name,
        'Price': price
    }
    wmlist.append(wm)
    print('Saving:', wm['Model'])

# Dump the scraped rows to an Excel workbook.
df = pd.DataFrame(wmlist)
df.to_excel("TD pricesTEST.xlsx", sheet_name='TEW', index=False)
Those nested loops are to blame for doubling your output: every ProductCard <li> contains more than one <a> tag. Also, everything you need is in the single <a> tag with the class ProductCard-Content.
I've simplified your code a bit and here's how you can get product names, prices, and links and finally dump them to an Excel file:
from time import sleep
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

BASE_URL = "https://www.technodom.kz/"

options = Options()
options.headless = True
driver = webdriver.Chrome(options=options)

# One [name, price, link] row per product, across all listing pages.
final_output = []
pages = range(1, 5)

try:
    for page_number in pages:
        print(f"Scraping page: {page_number} / {len(pages)}")
        driver.get(
            f"{BASE_URL}bytovaja-tehnika/uhod-za-odezhdoj/"
            f"stiral-nye-mashiny/f/brands/lg/brands/samsung?page={page_number}"
        )
        # Crude wait for the JS-rendered product grid; a WebDriverWait on the
        # ProductCard-Content selector would be more robust than a fixed sleep.
        sleep(5)

        # Exactly one ProductCard-Content anchor per product card, so each
        # product is collected once -- no duplicates.
        cards = BeautifulSoup(driver.page_source, "lxml").find_all(
            "a", class_="ProductCard-Content"
        )
        for card in cards:
            # urljoin handles hrefs with or without a leading slash, avoiding
            # an accidental ".kz//path" double slash in the stored link.
            link = urljoin(BASE_URL, card["href"])
            name = card.find("h4").getText()
            price = card.find("data")["value"]
            final_output.append([name, price, link])
finally:
    # Always release the browser process, even if a page fails to load/parse.
    driver.quit()

df = pd.DataFrame(final_output, columns=["NAME", "PRICE", "LINK"])
df.to_excel("test.xlsx", sheet_name='TEW', index=False)
Output:
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.