
Append data wrong in csv file

from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import requests
from csv import writer


options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 20) 

URL = 'https://mergr.com/firms/search/employees?page=1&firm%5BactiveInvestor%5D=2&sortColumn=employee_weight&sortDirection=asc'
driver.get(URL)



email=driver.find_element(By.CSS_SELECTOR,"input#username")
email.send_keys("timgr8@outlook.com")

password=driver.find_element(By.CSS_SELECTOR,"input#password")
password.send_keys("Cosmos1990$$$$$$$")

driver.find_element(By.CSS_SELECTOR, "button.btn").click()

 


urls=[]        
product=[] 
soup = BeautifulSoup(driver.page_source,"lxml")
details=soup.select("tbody tr")
for detail in details:

        try:        
                t1 =detail.select_one("h5.profile-title a").text
        except:
                pass
  
        wev={
                'Name':t1,
                }
        
        product.append(wev)
           
page_links =driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
        href=link.get_attribute("href")
        urls.append(href)
        

for url in urls: 
        
        driver.get(url) 
        soup = BeautifulSoup(driver.page_source,"lxml")
        try:
                website=soup.select_one("p.adress-info a[target='_blank']").text
        except:
                website=''
        
        data={
                'website':website
        }
 
        product.append(data)
                
df=pd.DataFrame(product)
df.to_csv('firm.csv')

The website data ends up lower down in the CSV file, as shown in the screenshot. Is the way I append the data wrong? Why do the values shift down? Please point out where I went wrong.

[screenshot: current CSV output, with the website values shifted down into their own rows]

I would like the output in the format shown below. Please suggest a fix.

[screenshot: desired CSV output, with Name and website on the same row]

You can't append wev and data separately - the website and the name need to be in the same dictionary for pandas to know they belong to the same row.
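
To see why, here is a minimal, self-contained sketch (the names and sites below are made-up placeholders, not real scraped values) of what pandas does when each website is appended as its own dictionary:

import pandas as pd

# two "Name" dicts followed by two separate "website" dicts,
# mimicking product.append(wev) and then product.append(data)
product = [
        {'Name': 'Alice'},
        {'Name': 'Bob'},
        {'website': 'https://alice.example'},
        {'website': 'https://bob.example'},
]

print(pd.DataFrame(product))
#     Name                website
# 0  Alice                    NaN
# 1    Bob                    NaN
# 2    NaN  https://alice.example
# 3    NaN    https://bob.example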


You could add the websites to a separate list instead, e.g.

sites = []
# for url in urls:   
        # driver.get...
        # soup = ....
        # try:....except:....

        data={
                'website':website
        }
 
        sites.append(data) 

and then merge them with zip:

for pi, dictPair in enumerate(zip(product, sites)):
        product[pi].update(dictPair[1])

df = pd.DataFrame(product)
df.to_csv('firm.csv')
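
For illustration, here is what that zip-and-update step does on a couple of hand-made dictionaries (placeholder values, not real data):

product = [{'Name': 'Alice'}, {'Name': 'Bob'}]
sites = [{'website': 'https://alice.example'}, {'website': 'https://bob.example'}]

# zip pairs the dictionaries by position; update() copies the
# 'website' key into the matching 'Name' dictionary
for pi, dictPair in enumerate(zip(product, sites)):
        product[pi].update(dictPair[1])

# product == [{'Name': 'Alice', 'website': 'https://alice.example'},
#             {'Name': 'Bob', 'website': 'https://bob.example'}]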

However, I don't think that's the best way to ensure the right name and website are matched up: zip pairs items purely by position, so if any profile is missing a link (or an extra link appears), every pairing after it silently shifts.


Rather than zipping and merging, you should add everything for each row to the same dictionary from the start.

added_urls = []        
product = [] 
soup = BeautifulSoup(driver.page_source,"lxml")
details = soup.select("tbody tr")
for detail in details:

        try:        
                t1 = detail.select_one("h5.profile-title a").text
        except:
                # pass # then you'll just be using the previous row's t1
                # [also, if this happens in the first loop, it will raise an error]

                t1 = 'MISSING' # '' #
  
        wev = {
                'Name':t1,
                }

        href = detail.select_one("h5.profile-title + p a[href]") 
        if href and href.get("href", '').startswith('http'): 
                wev['page_link'] = href.get("href")
                added_urls.append(href.get("href"))
        
        product.append(wev)

### IF YOU WANT ROWS THAT CAN'T BE CONNECTED TO NAMES ###       
page_links = driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
        href = link.get_attribute("href")
        if href in added_urls: continue  # skip links that are already added

        # urls.append(href)
        added_urls.append(href)
        product.append({"page_link": href})
##########################################################
        

for pi, prod in enumerate(product): 
        if "page_link" not in prod or not prod["page_link"]: continue ## missing link
        url = prod["page_link"]
        
        driver.get(url) 
        soup = BeautifulSoup(driver.page_source,"lxml")
        try:
                website=soup.select_one("p.adress-info a[target='_blank']").text
        except:
                website=''
        
        del product[pi]["page_link"] ## REMOVE this line IF you want a page_link column in csv

        # data={'website':website}
        # product.append(data)
        product[pi]['website'] = website
                
df=pd.DataFrame(product)
df.to_csv('firm.csv')
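
One optional follow-up (not part of the fix itself): by default df.to_csv also writes the DataFrame's numeric index as the first column; if the desired output shouldn't have that extra column, pass index=False:

df.to_csv('firm.csv', index=False)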
