简体   繁体   中英

Python selenium web scraped data to csv export

I am working on a custom web scraper for e-commerce sites. I want it to scrape the names and prices of the listings on a site and then export them to a CSV file. The problem is that it exports only one (name, price) pair and repeats that same pair on every line of the CSV. I couldn't find a good solution for this, although I suspect the fix is easy. I hope someone will read my code and help me, thank you!

###imports
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import pandas as pd


# Question script: searches pc.bazos.sk for "xeon", walks up to x result pages,
# prints every listing title and price, and tries to export them to bobo.csv.
# The export is the part that misbehaves (see the NOTE(review) comments below).

#driver path
# NOTE(review): the path is a normal (non-raw) string containing backslashes.
# "\P" and "\g" are not recognised escape sequences, so Python keeps them
# literally, but a raw string (r"...") would be the safer spelling.
driver = webdriver.Firefox(executable_path="D:\Programy\geckoDriver\geckodriver.exe")

#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)  # fixed pause after loading the page
nazov = driver.find_element_by_name("hledat")
nazov.send_keys("xeon")  # search term
cenamin = driver.find_element_by_name("cenaod")
cenamin.send_keys("")  # sends an empty string, i.e. types nothing into this field
cenamax = driver.find_element_by_name("cenado")
cenamax.send_keys("300")
driver.find_element_by_name("Submit").click()

##cookie acceptor
# Clicks the button at this absolute XPath immediately, without waiting for it.
driver.find_element_by_xpath("/html/body/div[1]/button").click()

##main
x = 3  # number of loop iterations; each one ends by clicking the "Ďalšia" link
for i in range(x):
    try:
        # Wait up to 7 seconds for the table cell that contains the listings.
        main = WebDriverWait(driver, 7).until(
            EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]"))
        )
        
        ##find listings in table
        inzeraty = main.find_elements_by_class_name("vypis")
        for vypis in inzeraty:
            nadpis = vypis.find_element_by_class_name("nadpis")    
            ##print listings to check correctness
            # NOTE(review): nadpist is overwritten on every pass, so after the
            # loop it holds only the title of the LAST listing on the page.
            nadpist = nadpis.text
            print(nadpist)
        
        ##find the price and print 
        for vypis in inzeraty:
            cena = vypis.find_element_by_class_name("cena")
            # NOTE(review): same as above - cenat ends up as the last price only.
            cenat = cena.text
            print(cenat)
        
        ##export to csv - not working
        time.sleep(1)
        print("Writing to csv")
        # NOTE(review): this is why every CSV row is identical. [nadpist]*20*x and
        # [cenat]*20*x repeat the single last title / last price 20*x times
        # instead of collecting one entry per listing. The factor 20 presumably
        # assumes 20 listings per page - nothing in this code establishes that.
        d = {"Nazov": [nadpist]*20*x,"Cena": [cenat]*20*x}
        df = pd.DataFrame(data=d)
        # Runs inside the page loop, so bobo.csv is rewritten on every iteration;
        # to_csv with default arguments also writes the DataFrame index column.
        df.to_csv("bobo.csv")
        time.sleep(1)
        print("Writing to csv done !")
        
        ##next page
        dalsia = driver.find_element_by_link_text("Ďalšia")
        dalsia.click()
    except:
        # NOTE(review): a bare except catches every error and quits the browser,
        # but the for loop is not left, so later iterations run against a
        # driver that has already been quit.
        driver.quit()

I want the CSV to look like this:

  1. name,price
  2. name2,price2

It would be great if the CSV had only two columns and as many rows as there are listings.

from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Answer script: searches pc.bazos.sk for "xeon", scrapes the title and price of
# every listing on up to x result pages, and writes one CSV row per listing.

#driver path
driver = webdriver.Chrome()

##main
x = 3   # maximum number of result pages to scrape
d = []  # one {"Nazov": title, "Cena": price} dict per listing

try:
    #init + search
    driver.get("https://pc.bazos.sk/pc/")
    # Wait for the search box instead of sleeping for a fixed time.
    WebDriverWait(driver, 7).until(
        EC.presence_of_element_located((By.NAME, "hledat"))
    ).send_keys("xeon")
    # Empty string: nothing is typed into the "cenaod" field.
    driver.find_element(By.NAME, "cenaod").send_keys("")
    driver.find_element(By.NAME, "cenado").send_keys("300")
    driver.find_element(By.NAME, "Submit").click()

    ##cookie acceptor
    # Poll for the button for up to 10 seconds rather than always sleeping 10.
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "/html/body/div[1]/button"))
    ).click()

    for i in range(x):
        try:
            main = WebDriverWait(driver, 7).until(
                EC.presence_of_element_located(
                    (By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]")))

            ##find listings in table
            for vypis in main.find_elements(By.CLASS_NAME, "vypis"):
                d.append({
                    "Nazov": vypis.find_element(By.CLASS_NAME, "nadpis").text,
                    "Cena": vypis.find_element(By.CLASS_NAME, "cena").text,
                })

            # No need to load another page after the last one we want.
            if i == x - 1:
                break

            ##next page
            # find_elements returns an empty list (no exception) when the
            # link is absent, which is how the last result page is detected.
            dalsia = driver.find_elements(By.LINK_TEXT, "Ďalšia")
            if not dalsia:
                break
            dalsia[0].click()
            # Wait until the old table cell is detached so the next iteration
            # reads the new page instead of stale elements from this one.
            WebDriverWait(driver, 7).until(EC.staleness_of(main))
        except Exception as exc:
            # Deliberately broad, best-effort scraping: report the problem,
            # stop paging, and still export the rows collected so far.
            print("Stopping on page", i + 1, "-", repr(exc))
            break
finally:
    # Always release the browser, on success as well as on failure.
    driver.quit()

print("Writing to csv")
df = pd.DataFrame(data=d)
# index=False keeps the file to exactly two columns: Nazov and Cena.
df.to_csv("bobo.csv", index=False)

This gives me 59 items with prices. Each listing is first stored as a dict, the dicts are collected in a list, and that list is then passed to pandas.

All you need to do is create two empty lists, nadpist_l and cenat_l, append the data to those lists, and finally save the lists as a DataFrame.

UPDATED as per the comment

Check if this works

###imports
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Answer script: searches pc.bazos.sk for "xeon", collects listing titles and
# prices from up to x result pages into two parallel lists, and saves them as
# a two-column CSV.

#driver path
driver = webdriver.Chrome()

##main
x = 3           # maximum number of result pages to scrape
nadpist_l = []  # listing titles, accumulated across all pages
cenat_l = []    # listing prices, kept index-aligned with nadpist_l

try:
    #init + search
    driver.get("https://pc.bazos.sk/pc/")
    # Wait for the search box instead of sleeping for a fixed time.
    WebDriverWait(driver, 7).until(
        EC.presence_of_element_located((By.NAME, "hledat"))
    ).send_keys("xeon")
    # Empty string: nothing is typed into the "cenaod" field.
    driver.find_element(By.NAME, "cenaod").send_keys("")
    driver.find_element(By.NAME, "cenado").send_keys("300")
    driver.find_element(By.NAME, "Submit").click()

    ##cookie acceptor
    # Poll for the button for up to 10 seconds rather than always sleeping 10.
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "/html/body/div[1]/button"))
    ).click()

    for i in range(x):
        try:
            main = WebDriverWait(driver, 7).until(
                EC.presence_of_element_located(
                    (By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]")))

            ##find listings in table
            for vypis in main.find_elements(By.CLASS_NAME, "vypis"):
                # Read both values before appending either, so a failure on
                # one listing can never leave the two lists different lengths.
                nadpist = vypis.find_element(By.CLASS_NAME, "nadpis").text
                cenat = vypis.find_element(By.CLASS_NAME, "cena").text
                nadpist_l.append(nadpist)
                cenat_l.append(cenat)
            print(len(cenat_l))  # running total of listings collected

            # No need to load another page after the last one we want.
            if i == x - 1:
                break

            ##next page
            # find_elements returns an empty list (no exception) when the
            # link is absent, which is how the last result page is detected.
            dalsia = driver.find_elements(By.LINK_TEXT, "Ďalšia")
            if not dalsia:
                break
            dalsia[0].click()
            # Wait until the old table cell is detached so the next iteration
            # reads the new page instead of stale elements from this one.
            WebDriverWait(driver, 7).until(EC.staleness_of(main))
        except Exception as exc:
            # Deliberately broad, best-effort scraping: report the problem,
            # stop paging, and still export the rows collected so far.
            print("Stopping on page", i + 1, "-", repr(exc))
            break
finally:
    # Always release the browser, on success as well as on failure.
    driver.quit()

##export to csv
print("Writing to csv")
# One row per listing: the lists themselves are the columns. The earlier
# "[nadpist_l] * 20 * x" form stored 60 copies of the whole list in each
# column and replaced the previous pages' data on every iteration.
d = {"Nazov": nadpist_l, "Cena": cenat_l}
df = pd.DataFrame(data=d)
# index=False keeps the file to exactly two columns: Nazov and Cena.
df.to_csv("bobo.csv", index=False)
print("Writing to csv done !")

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM