![](/img/trans.png)
[英]How to extract data from dynamic collapsing table with hidden elements using Selenium in Python
[英]How to extract data from a dynamic table with selenium python?
我正在尝试从一个网站中提取数据。我需要在搜索框中输入值并进行查询,页面随后会生成一个表格。表格生成后,我需要把其中的详细信息写入文本文件或插入数据库。以下是我目前的尝试。
网站: https://commtech.byu.edu/noauth/classSchedule/index.php 搜索文本:“C S 142”
示例代码
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
# Question's example script: search the BYU class schedule for "C S 142" and
# print every table cell. Indentation of the loops is restored; the logic is
# intentionally left as the asker wrote it (it is the code that "prints nothing").

# "detach" keeps the Chrome window open after the script ends.
c_options = Options()
c_options.add_experimental_option("detach", True)
# Path to the local chromedriver binary.
s = Service('C:/Users/sidat/OneDrive/Desktop/python/WebDriver/chromedriver.exe')
URL = "http://saasta.byu.edu/noauth/classSchedule/index.php"
driver = webdriver.Chrome(service=s, options=c_options)
driver.get(URL)

# Type the course code, press Enter, then additionally click the search button.
element = driver.find_element("id", "searchBar")
element.send_keys("C S 142", Keys.RETURN)
search_button = driver.find_element("id", "searchBtn")
search_button.click()

# Wait up to 10 s until #sectionTable is present in the DOM.
# NOTE(review): presence of the <table> element alone does not mean its
# rows/cells have been populated yet.
table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//*[@id='sectionTable']")))
# NOTE(review): an XPath starting with "//" is evaluated from the document
# root even when called on an element; ".//tr" would limit it to this table.
rows = table.find_elements("xpath", "//tr")
for row in rows:
    cells = row.find_elements(By.TAG_NAME, "td")
    for cell in cells:
        # NOTE(review): .text yields rendered (visible) text only — hidden
        # cells come back as empty strings.
        print(cell.text)
我正在使用 PyCharm 2022.3 编码和测试结果。 我的代码没有打印任何内容。 请帮我解决这个问题,将数据提取到文本文件和 SQL 数据库表中。
试试这个:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
# Answer script: search for "C S 142", then print the header and every body
# cell of #sectionTable, reading textContent so hidden cells are included.

# "detach" keeps the Chrome window open after the script ends.
c_options = Options()
c_options.add_experimental_option("detach", True)
# Path to the local chromedriver binary.
s = Service('C:/Users/sidat/OneDrive/Desktop/python/WebDriver/chromedriver.exe')
# Pass the configured service and options; previously both were built but
# never handed to the driver, so they had no effect.
driver = webdriver.Chrome(service=s, options=c_options)
URL = "http://saasta.byu.edu/noauth/classSchedule/index.php"
driver.get(URL)
driver.maximize_window()

# Type the course code, press Enter, then additionally click the search button.
element = driver.find_element(By.ID, "searchBar")
element.send_keys("C S 142", Keys.RETURN)
search_button = driver.find_element(By.ID, "searchBtn")
search_button.click()

# Wait for the header cells themselves (not just the table) to exist.
header = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//table[@id='sectionTable']/thead/tr/th")))
for th in header:
    # textContent is read instead of .text so non-rendered text is returned too.
    print(th.get_attribute('textContent'))

# Wait for the body rows, then re-locate each row's cells by 1-based XPath
# position rather than reusing the row elements found here.
rows = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//table[@id='sectionTable']/tbody/tr")))
for row_number in range(1, len(rows) + 1):
    cells = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, f"(//table[@id='sectionTable']/tbody/tr)[{row_number}]//td")))
    for cell in cells:
        print(cell.get_attribute('textContent'))
您等待表格出现,这一点是正确的,但此时表格尚未完全加载(td 元素还没有加载出来)。可以改为等待表格中的 td 元素:
# Block for up to 10 s until at least one <td> exists inside #sectionTable.
td_locator = (By.XPATH, "//*[@id='sectionTable']//td")
td_present = EC.presence_of_element_located(td_locator)
WebDriverWait(driver, 10).until(td_present)
这样,你至少会等到表格中出现 td 元素之后再继续。
以下代码打印您要求的表格的内容。
如果要点击元素或向其发送文本,需要先等待元素可点击;如果要读取元素的文本内容,则需要等待元素可见。
from selenium import webdriver
from selenium.webdriver import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Answer script: search for "C S 142", wait until #sectionTable is visible,
# then print its header cells and body cells as two padded lines.

options = Options()
options.add_argument("start-maximized")
# Raw string: the backslashes are literal path separators, not escapes
# ("\w" and "\c" are invalid escape sequences in a normal string literal).
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(options=options, service=webdriver_service)
wait = WebDriverWait(driver, 30)

url = "http://saasta.byu.edu/noauth/classSchedule/index.php"
driver.get(url)

# Wait until each control is clickable before interacting with it.
wait.until(EC.element_to_be_clickable((By.ID, "searchBar"))).send_keys("C S 142", Keys.RETURN)
wait.until(EC.element_to_be_clickable((By.ID, "searchBtn"))).click()

# Visibility (not mere presence) is awaited because .text is read below.
table = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='sectionTable']")))
# Leading "." keeps both XPath searches relative to the table element.
headers = table.find_elements("xpath", ".//thead//th")
cells = table.find_elements("xpath", ".//tbody//td")

# Left-justify every value to at least 10 characters and join in one pass.
headers_text = "".join(header.text.ljust(10) for header in headers)
cells_text = "".join(cell.text.ljust(10) for cell in cells)

print(headers_text)
print(cells_text)
输出是:
Section Type Mode InstructorCredits Term Days Start End Location Available Waitlist
002 DAY Classroom 3.00 TBA 0/0 0
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.