簡體   English   中英

Pandas Dataframe to Excel: no data from web scraping gets exported to excel

[英]Pandas Dataframe to Excel : no data from web scraping gets exported to excel

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
from openpyxl import Workbook
import time
import pandas as pd 
from pandas import ExcelWriter

# Purpose: for each value of the 'gugun' dropdown, scrape the price table
# and export it to its own sheet of one Excel file.
# initial value for table data 
total = []
result = []

#initial setting for excel file
# NOTE(review): wb only ever receives empty sheets (create_sheet below, no
# cell writes); wb.save(filename) at the bottom re-saves this empty workbook
# to the SAME path writer.save() already wrote, so the pandas output is
# clobbered — likely why the sheets come out blank. TODO confirm.
wb = Workbook()
ws = wb.create_sheet()
filename = '/Users/sungyeon/Desktop/projects/text.xlsx'
writer = pd.ExcelWriter(filename)

#setting of crawling
driver = webdriver.Chrome('./chromedriver')
target_url = 'https://price.joinsland.joins.com/theme/index_theme.asp?sisaegbn=T05'
driver.get(target_url)

# selection of first dropbox
select1 = Select(WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//select[@name='sido']"))))
select1.options[0].click()
# selection of second dropbox
select2 = Select(WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//select[@name='gugun']"))))

# for loop for values of second dropbox
for i in range(0,3):
    try:
        select2.options[i].click()
        title = select2.options[i].text
        wb.create_sheet(title)
        driver.implicitly_wait(5)
        driver.find_element_by_class_name('btn_search').click()


# in case of stale element reference error 
    except StaleElementReferenceException as e:
        # The page re-rendered the dropdown: re-locate it and retry once.
        select2 = Select(WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//select[@name='gugun']"))))
        select2.options[i].click()
        title = select2.options[i].text
        wb.create_sheet(title)
        driver.find_element_by_class_name('btn_search').click()
        driver.implicitly_wait(5)

# setting of table data from crawled webpage 
# NOTE(review): everything from here down is at top level, NOT indented under
# the dropdown loop above — so the table is scraped only once, after the loop
# has already clicked through all three options.
# NOTE(review): the next expression statement reads a property and discards it;
# it has no effect.
driver.current_url
table = driver.find_element_by_class_name('tbl_box')
tbody = table.find_element_by_tag_name('tbody')
rows=tbody.find_elements_by_tag_name('tr')

# making lists of data from crawled data    
for index, value in enumerate(rows):
    body = value.find_elements_by_tag_name('td')
    print('ok5')
    for i in range(len(body)):
        try: 
            data = body[i].text
            result.append(data)

        except StaleElementReferenceException as e:
            # Row went stale mid-read: re-fetch its cells and skip this cell.
            body = value.find_elements_by_tag_name('td')
            continue

    # adding data of a row to list of final dataset
    total.append(result)
    # clearing temp list
    result=[]
    time.sleep(2)   

# to create a new sheet named after the name of dropbox value 
# forming dataframe from list of dataset
# NOTE(review): these two lines are indented inside the row loop, so the
# sheet is rewritten once per row, always under the last `title` from the
# dropdown loop.
    df = pd.DataFrame.from_records(total)
# converting to excel file into the sheet named after the name of dropbox value
    df.to_excel(writer, sheet_name = title)
writer.save()
wb.save(filename=filename)

我曾嘗試使用 selenium 進行網絡抓取。數據收集部分工作正常,但問題是沒有數據被導出到 excel 文件。文件和工作表都創建得很好,但每張工作表上仍然沒有寫入任何數據。我猜可能與縮進有關,但無法弄清楚。任何提示將不勝感激!非常感謝!

發現幾個問題

  • 行循環缺少縮進
  • 變量 wb 並非必要,而且最後的 wb.save() 會用空工作簿覆蓋已寫入的文件
  • 可以更好地放置總計初始化

這是工作代碼

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
from openpyxl import Workbook
import time
import pandas as pd 
from pandas import ExcelWriter

# Scrape the price table once per 'gugun' dropdown value and write each
# result to its own sheet (named after the dropdown text) of one Excel file.
#initial setting for excel file
filename = '/Users/sungyeon/Desktop/projects/text.xlsx'
writer = pd.ExcelWriter(filename)

#setting of crawling
driver = webdriver.Chrome('./chromedriver')
target_url = 'https://price.joinsland.joins.com/theme/index_theme.asp?sisaegbn=T05'
driver.get(target_url)

# selection of first dropbox
select1 = Select(WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//select[@name='sido']"))))
select1.options[0].click()
# selection of second dropbox
select2 = Select(WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//select[@name='gugun']"))))

# for loop for values of second dropbox
for option_index in range(3):
    total = []  # rows collected for this dropdown value
    try:
        select2.options[option_index].click()
        title = select2.options[option_index].text
        driver.implicitly_wait(5)
        driver.find_element_by_class_name('btn_search').click()

    # in case of stale element reference error: the page re-rendered the
    # dropdown, so re-locate it and retry this option once
    except StaleElementReferenceException:
        select2 = Select(WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//select[@name='gugun']"))))
        select2.options[option_index].click()
        title = select2.options[option_index].text
        driver.find_element_by_class_name('btn_search').click()
        driver.implicitly_wait(5)

    # setting of table data from crawled webpage
    table = driver.find_element_by_class_name('tbl_box')
    tbody = table.find_element_by_tag_name('tbody')
    rows = tbody.find_elements_by_tag_name('tr')

    # making lists of data from crawled data
    for row in rows:
        cells = row.find_elements_by_tag_name('td')
        result = []  # one scraped table row
        # index-based loop (not `for cell in cells`) so a stale row can be
        # re-fetched and resumed at the same cell position
        for cell_index in range(len(cells)):
            try:
                result.append(cells[cell_index].text)
            except StaleElementReferenceException:
                # row went stale mid-read: re-fetch its cells, skip this cell
                cells = row.find_elements_by_tag_name('td')
                continue

        # adding data of a row to list of final dataset
        total.append(result)
        time.sleep(2)

    # forming dataframe from the rows of this dropdown value and writing it
    # to a sheet named after the dropdown value
    df = pd.DataFrame.from_records(total)
    df.to_excel(writer, sheet_name=title)
writer.save()

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM