
Scraping data from table using selenium

I want to scrape all of the company information under "Symbol", "Name" and "Earnings Call Time" from the following page: https://finance.yahoo.com/calendar/earnings

This is what I have so far for getting the company names, but I am getting this error:

"NoSuchElementException: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id='cal-res-table']/div[1]/table/tbody/tr[1]/td[2]"} (Session info: chrome=86.0.4240.198)"

from selenium import webdriver
import datetime

tomorrow = (datetime.date.today() + datetime.timedelta(days=1)).isoformat() #get tomorrow in iso format as needed
url = "https://finance.yahoo.com/calendar/earnings?day="+tomorrow
print ("url: " + url)

driver = webdriver.Chrome("C:/Users/jrod94/Downloads/chromedriver_win32/chromedriver.exe")
driver.get(url)
element = driver.find_element_by_xpath("//*[@id='cal-res-table']")
Companies = [a.get_attribute("Company") for a in element]

driver.close()

How about using pandas?

import datetime
import pandas as pd

pd.set_option('display.max_columns', None)  # show every column when printing
tomorrow = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()  # get tomorrow in ISO format as needed
tables = pd.read_html("https://finance.yahoo.com/calendar/earnings?day=" + tomorrow, header=0)
table = tables[0]  # the earnings calendar is the first table on the page
print(table)

Output:

  Symbol                         Company  Earnings Call Time EPS Estimate  \
0    WBAI                     500.Com Ltd  After Market Close            -   
1    BRBR             Bellring Brands Inc                 TAS         0.19   
2     BKE                      Buckle Inc  Before Market Open         0.54   
3     BNR        Burning Rock Biotech Ltd                 TAS        -0.12   
4     IEC            IEC Electronics Corp                 TAS            -   
5    GEOS      Geospace Technologies Corp                 TAS            -   
6    DREM  Dream Homes & Development Corp   Time Not Supplied            -   
7    DXLG        Destination XL Group Inc  Before Market Open            -   
8      FL                 Foot Locker Inc  Before Market Open         0.61   
9     HHR            HeadHunter Group PLC                 TAS         0.14   
10    HHR            HeadHunter Group PLC  Before Market Open         0.14   
11    RMR                   RMR Group Inc  Before Market Open         0.39   
12    GSX                 GSX Techedu Inc  Before Market Open        -0.31   
13    GSX                 GSX Techedu Inc                 TAS        -0.31   
14   HIBB              Hibbett Sports Inc  Before Market Open         0.45   
15   HAYN        Haynes International Inc                 TAS         -0.7   
16   IIIV                i3 Verticals Inc                 TAS         0.18   
17   AIHS          Senmiao Technology Ltd  Before Market Open           
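
If you only need the three columns from the question (Symbol, Company and Earnings Call Time), you can slice the DataFrame by those header names. A minimal sketch, assuming the headers match the printed output above:

import datetime
import pandas as pd

tomorrow = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()
tables = pd.read_html("https://finance.yahoo.com/calendar/earnings?day=" + tomorrow, header=0)
table = tables[0]  # the earnings calendar is the first table on the page

# keep only the columns the question asks for; the header names are
# assumed to match the printed output above
print(table[["Symbol", "Company", "Earnings Call Time"]])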
         

Actually, your code does give an error, but on a later line than the one you reported. The problem may be that the page has not finished loading when you try to access the element. A small delay before the line where the error occurs may solve it.

from selenium import webdriver
import datetime
import time

tomorrow = (datetime.date.today() + datetime.timedelta(days=1)).isoformat() #get tomorrow in iso format as needed
url = "https://finance.yahoo.com/calendar/earnings?day="+tomorrow
print ("url: " + url)

driver = webdriver.Chrome("C:/Users/jrod94/Downloads/chromedriver_win32/chromedriver.exe")
driver.get(url)
time.sleep(1) # you can increase 1 if it still does not work
table = driver.find_element_by_xpath("//*[@id='cal-res-table']")
# a single WebElement is not iterable, so collect the company cells inside it instead
companies = [td.text for td in table.find_elements_by_css_selector("td[aria-label='Company']")]

driver.close()

Since your question is about selenium:

You should have a look at Selenium Waits.

Here you wait for the presence of all target elements in the HTML source; the code below shows it:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def main(url):
    driver = webdriver.Firefox()
    driver.get(url)
    try:
        cnames = [x.text for x in WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, "td[aria-label='Company']"))
        )]
        print(cnames)  # print inside the try so a timeout does not also raise a NameError
    finally:
        driver.quit()


main("https://finance.yahoo.com/calendar/earnings")

Output:

['111 Inc', '360 DigiTech Inc', 'American Software Inc', 'American Software Inc', 'Corporacion America Airports SA', 'Atkore International Group Inc', 'Atkore International Group Inc', 'Helmerich and Payne Inc', 'Amtech Systems Inc', 'Amtech Systems Inc', 'Delta Apparel Inc', 'Delta Apparel Inc', 'Bellring Brands Inc', 'Berry Global Group Inc', 'Beacon Roofing Supply Inc', 'Natural Grocers By Vitamin Cottage Inc', "BJ's Wholesale Club Holdings Inc", 'Entera Bio Ltd', 'SG Blocks Inc', 'SG Blocks Inc', 'BEST Inc', 'Brady Corp', 'BioHiTech Global Inc', 'BioHiTech Global Inc', 'Oaktree Strategic Income Corporation', 'Caleres Inc', 'Pennantpark Investment Corp', 'Geospace Technologies Corp', 'Canadian Solar Inc', 'Oaktree Specialty Lending Corp', 'Matthews International Corp', 'Clearsign Technologies Corp', "Children's Place Inc", 'Elys Game Technology Corp', 'Dada Nexus Ltd', 'ESCO Technologies Inc', 'Euroseas Ltd', 'Fangdd Network Group Ltd', 'Fangdd Network Group Ltd', 'Golden Ocean Group Ltd', 'Hoegh LNG Partners LP', 'Post Holdings Inc', 'Huize Holding Ltd', 'Haynes International Inc', "Macy's Inc", 'OneWater Marine Inc', 'OneWater Marine Inc', 'Woodward Inc', 'StealthGas Inc', 'Maximus Inc', 'Ross Stores Inc', 'Intuit Inc', 'Ooma Inc', 'Williams-Sonoma Inc', 'Precipio Inc', 'NetEase Inc', 'Workday Inc', 'i3 Verticals Inc', 'Knot Offshore Partners LP', 'Maxeon Solar Technologies Ltd', 'Opera Ltd', 'Puxin Ltd', 'Puxin Ltd']
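
If you also need the Symbol and Earnings Call Time columns, the same wait can be reused. A sketch under the assumption that those cells carry aria-label attributes matching their column headers (only 'Company' is confirmed above):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def scrape_columns(url):
    driver = webdriver.Firefox()
    driver.get(url)
    try:
        # wait until the table cells are present before reading anything
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, "td[aria-label='Company']")))
        columns = []
        # 'Symbol' and 'Earnings Call Time' aria-labels are assumed to
        # mirror the column headers; only 'Company' is confirmed above
        for label in ("Symbol", "Company", "Earnings Call Time"):
            cells = driver.find_elements_by_css_selector(
                "td[aria-label='%s']" % label)
            columns.append([c.text for c in cells])
        # transpose the per-column lists into (symbol, company, call time) rows
        return list(zip(*columns))
    finally:
        driver.quit()


print(scrape_columns("https://finance.yahoo.com/calendar/earnings"))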

Note: you do not need to use selenium here, as it will slow your task down.

I also see no reason to import a huge library such as pandas just to read one HTML table.

You can hit your target directly with the code below, where you will also get the exact call date:

import requests
import re
import json
import csv

keys = ['ticker', 'companyshortname', 'startdatetime']


def main(url):
    r = requests.get(url)
    # the earnings data is embedded in the page as JSON inside the App.main script
    goal = json.loads(re.search(r"App\.main.*?({.+})", r.text).group(1))
    # pull the requested keys out of each row of the screener results store
    target = [[item[k] for k in keys] for item in goal['context']
              ['dispatcher']['stores']['ScreenerResultsStore']['results']['rows']]
    with open("result.csv", 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(keys)
        writer.writerows(target)


main("https://finance.yahoo.com/calendar/earnings")

Output: the scraped rows are written to result.csv.
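
To target a single day, the ?day= query parameter from the question can be appended to the URL passed to main, assuming the day-filtered page embeds the same JSON store:

import datetime

# reuse main() from above with tomorrow's date, as in the question's URL
tomorrow = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()
main("https://finance.yahoo.com/calendar/earnings?day=" + tomorrow)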
