[英]How to parse data after specific text Python Selenium bs4
在我正在為其編寫解析器的站點之一上,我遇到了以下問題:我需要從表中獲取所有數據,但這些數據在 html 代碼中沒有標註(沒有可識別的類名或標籤),而且有時位置還會互換。html 示例如下
該表如下所示:表
一開始我是用XPATH來做這個的,但是解析的時候發現有些數據被交換了,比如引擎和注冊號,或者根本沒有交換。 所以 XPATH 不適合,因為在 csv 文件中,有里程的數據可以與發動機對齊
是否有可能在 selenium 或通過 bs4 先搜索一個詞,然后解析它之后的數據?
也就是在html代碼中找到Engine這個詞,然后取html文本下面的數據就是我需要的
我的代碼:
import csv
import time
import schedule
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium_stealth import stealth
def collect_data():
    """Scrape Ford Mustang (1980+) listings from nettiauto.com.

    Phase 1 collects every listing URL into ``ford_mustang_url.txt``;
    phase 2 visits each listing and writes name / price / year / mileage /
    registration number / URL rows to ``ford_mustang.csv``.

    All results are side effects (files and console output); returns None.
    """
    global driver  # kept for backward compatibility: the original exposed the driver globally
    driver = None  # bound before try so `finally` cannot raise NameError if Chrome() fails
    options = webdriver.ChromeOptions()
    # BUG FIX: set_preference() is a Firefox-only API -- ChromeOptions has no such
    # method, so the original raised AttributeError.  Chrome takes the UA as a flag.
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 '
                         'Safari/537.36')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    # Background mode
    # options.add_argument('headless')
    try:
        driver = webdriver.Chrome(options=options)
        stealth(driver,
                languages=["en-US", "en"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
                )
        driver.get(
            url='https://www.nettiauto.com/en/ford/mustang?yfrom=1980'
        )
        time.sleep(10)  # give Cloudflare / client-side JS time to render
        '''Collect all URLs'''
        soup = BeautifulSoup(driver.page_source, 'lxml')
        total_page = int(soup.find('span', class_='totPage').text)
        print('Ford Mustang')
        print(f'Total pages: {total_page}')
        car_url_list = []
        for page in range(1, total_page + 1):
            driver.get(
                url=f'https://www.nettiauto.com/en/ford/mustang?yfrom=1980&page={page}'
            )
            # NOTE: implicitly_wait() only sets a find_element timeout -- it does
            # not pause execution, so it did nothing for page_source scraping.
            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, 'lxml')
            for card in soup.find_all('a', class_='tricky_link'):
                href = card.get('href')
                if href:
                    car_url_list.append(href)
            # `page` is the page just fetched; the original counter over-counted
            # and ended with "Page N+1 of N URL collected".
            print(f'Page {page} of {total_page} URL collected')
        with open('ford_mustang_url.txt', 'w', encoding='utf8') as file:
            for line in car_url_list:
                file.write(f'{line}\n')
        '''Collect car's data'''
        with open('ford_mustang_url.txt', encoding='utf8') as f:
            # strip() drops the trailing newline readline() keeps -- the original
            # passed 'http...\n' straight to driver.get().
            urls = [line.strip() for line in f if line.strip()]
        print('Total cars: ' + str(len(urls)))
        rows = []
        for count, car_url_line in enumerate(urls, start=1):
            driver.get(car_url_line)
            time.sleep(2)  # see note above: implicitly_wait() is not a sleep
            soup = BeautifulSoup(driver.page_source, 'lxml')
            '''Car Data'''
            car_name = soup.find('title').text.replace('Nettiauto', '').replace('-', '').replace('Used vehicle', '').replace('Vaihtoauto', '').replace(' ', ' ').strip()
            car_price = soup.find('span', class_='GAPrice').find('span').text
            car_year = soup.find('div', class_='mid_border').get('data-year')
            car_mileage = soup.find('div', class_='mid_border').get('data-mileage')
            car_reg_number = soup.find('div', class_='rekkari-banner__body_input').text.strip()
            car_url = soup.find('link', hreflang='en').get('href')
            # car_engine
            '''If section'''
            if car_reg_number == 'ABC-123':  # site placeholder, not a real plate
                car_reg_number = None
            if car_mileage == '100000000':  # site placeholder for "unknown"
                car_mileage = None
            print(f'{count}. ' + car_name)
            print('Price: ' + f'{car_price}')
            print('Year: ' + f'{car_year}')
            print('Mileage: ' + f'{car_mileage}')
            print('Reg.Number: ' + f'{car_reg_number}')
            print('URL: ' + f'{car_url}\n')
            rows.append({
                'Name': car_name,
                'Price': car_price,
                'Year': car_year,
                'Mileage': car_mileage,
                'Reg.Number': car_reg_number,
                'URL': car_url,
            })
        csv_title = ['Name', 'Price', 'Year', 'Mileage', 'Reg.Number', 'URL']
        with open('ford_mustang.csv', 'w', encoding='utf8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=csv_title)
            writer.writeheader()
            writer.writerows(rows)
    except Exception as ex:
        print(ex)
    finally:
        if driver is not None:
            driver.quit()  # quit() closes all windows; the extra close() was redundant
def main():
    """Entry point: run one full scrape of the Mustang listings."""
    collect_data()


if __name__ == '__main__':
    main()
這是針對您的問題的解決方案,而不是基於 selenium (這不是這項工作的正確工具),它將生成一個包含您所追求的所有詳細信息的數據幀/csv:
import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

# Scrape every Mustang listing page, then read each listing's spec table
# with pandas and flatten it into one row per car.
scraper = cloudscraper.create_scraper()

urls_list = []
for page in tqdm(range(1, 8)):
    r = scraper.get(f'https://www.nettiauto.com/en/ford/mustang?yfrom=1980&page={page}')
    soup = BeautifulSoup(r.text, 'html.parser')
    for a in soup.select_one('div#listingData').select('a.tricky_link'):
        urls_list.append(a.get('href'))

frames = []
for url in tqdm(urls_list):
    r = scraper.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    dfs = pd.read_html(r.text)
    df_list = []
    title = soup.select_one('#heightForSlogan').select_one('h1').get_text(strip=True)
    subtitle = soup.select_one('#heightForSlogan').select_one('h2').get_text(strip=True)
    df_list.append(('make_model', title))
    df_list.append(('variant', subtitle))
    # The spec table is laid out as label/value pairs in columns (0,1) and (3,4).
    for i, row in dfs[0].iterrows():
        df_list.append((row[0], row[1]))
        df_list.append((row[3], row[4]))
    correct_df = pd.DataFrame(df_list).T
    new_header = correct_df.iloc[0]
    correct_df = correct_df[1:]
    correct_df.columns = new_header
    frames.append(correct_df)

# BUG FIX: DataFrame.append() was removed in pandas 2.0 (and was quadratic --
# it copied the accumulator on every call).  Concatenate once at the end.
big_df = pd.concat(frames, ignore_index=True)
big_df.to_csv('finnish_cars.csv')
一些注意事項:前 2 輛車的描述是芬蘭語,rest 是英文,所以結尾的 df/csv 會有點有趣,但數據會在那里。 此外,您可能會在終端中收到一些關於 pd append/use concat 的警告,但這些只是警告,程序將運行。
您可以使用 pip install cloudscraper 安裝 cloudscraper,並使用 pip install tqdm 安裝 tqdm。當然,如果您熱衷於使用 Selenium,您可以對從 selenium 獲得的 html 應用相同的方法。
我通過使用 if else 找到了 selenium 的一些解決方案:
# Instead of probing fixed row/column positions (which break when the table
# layout shifts), locate the cell whose text is exactly 'Engine' and read the
# cell immediately after it.  This is the "find the word, take what follows"
# lookup the question asked for, and it covers both layouts the old if/else
# tried to guess.
engine_cells = driver.find_elements(
    By.XPATH,
    "//*[@id='id_adInfo']//table//td[normalize-space()='Engine']/following-sibling::td[1]",
)
# First token only (e.g. displacement), matching the original split(" ", 2)[0];
# None when the listing has no Engine row at all.
car_engine = engine_cells[0].text.split(" ", 2)[0] if engine_cells else None
對於驅動方式(Drive type)這一項,上面的寫法不起作用,所以我改成了這樣……
# The original cascade hard-coded five candidate positions, and its innermost
# else re-probed tr[2]/td[4]-td[5] -- the very first position checked -- so any
# layout it hadn't anticipated silently yielded the wrong cell's text.
# A label-based XPath finds the 'Drive type' cell wherever the table puts it
# and takes the value cell that follows, replacing the whole if/else pyramid.
drive_cells = driver.find_elements(
    By.XPATH,
    "//*[@id='id_adInfo']//table//td[normalize-space()='Drive type']/following-sibling::td[1]",
)
# None when the listing has no Drive type row (find_elements returns [] instead
# of raising, unlike find_element).
drive_type = drive_cells[0].text if drive_cells else None
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.