[英]Having an issue while scraping the print preview page using selenium webdriver in python
刮取打印預覽頁面並將其導出到CSV文件時遇到問題。 當單擊“打印預覽” URL時,不僅僅提供文本數據,還提供了源代碼。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import os
from openpyxl.workbook import Workbook
from pandas import ExcelWriter
url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.get(url)
WebDriverWait(driver,
20).until(EC.element_to_be_clickable((By.XPATH,"//div[@class='search-
pro-details']//a[contains(.,'Search Project Details')]"))).click()
Registered_Project_radio= WebDriverWait(driver,
10).until(EC.element_to_be_clickable((By.ID,"Promoter")))
driver.execute_script("arguments[0].click();",Registered_Project_radio)
Application = driver.find_element_by_id("CertiNo")
Application.send_keys("P50500000005")
Search = WebDriverWait(driver,
10).until(EC.element_to_be_clickable((By.ID,"btnSearch")))
driver.execute_script("arguments[0].click();",Search)
View = [item.get_attribute('href') for item in
driver.find_elements_by_tag_name("a")]
print(View)
print(View)
driver.get(View)
request = urllib.request.Request(View)
html = urllib.request.urlopen(request).read()
soup = BeautifulSoup(html , 'html.parser')
print(soup.text)
View = [item.get_attribute('href') for item in
driver.find_elements_by_tag_name("a")]
是列出清單,例如。 ['https://maharerait.mahaonline.gov.in/PrintPreview/PrintPreview?q=MB0agrub1IW1Z63O5lldJdHpk6le6np5EB3HZ3jy8r7qPsLpYPdQwJzwE0S5LXG3fqQe%2fUe6HTGYmXstD%2bcYtATvmObra1R4xBa7L235mdTlmH0wHJPnps0ZXvbDMZxA0Hf9fxpOLM%2ba3Ad13hq9M1bp8Agvb%2bCLA3KOgpoYwr0%3d', None, None, None, None]
即containt ULR和None
元件。
將driver.execute_script("arguments[0].click();",Search)
語句后的代碼替換為以下代碼:
View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a") if
item.get_attribute('href') is not None]
for url in View:
request = urllib.request.Request(url)
html = urllib.request.urlopen(request).read()
soup = BeautifulSoup(html, 'html.parser')
divPInfo = soup.find("div", {"id": "DivPInfo"})
title = divPInfo.find("div", {'class': 'x_panel'}, recursive=False).find("div", {'class': 'x_title'}).find(
"h2").text.strip()
x_contentObject = divPInfo.find("div", {'class': 'x_panel'}, recursive=False).find_all("div", {'class': 'col-md-3'})
my_dict = {title: {x_contentObject[0].text.strip(): x_contentObject[1].text.strip()}}
print(my_dict)
O / P:
{'General Information': {'Information Type': 'Other Than Individual'}}
enter code here
driver.get(View)
div_2 = driver.find_element_by_xpath("//div[@id='fldFirm']").text
print(div_2)
table = pd.read_html(driver.page_source)
#print(table)
#df.to_csv("trial.csv", sep=',',index = False)
div_3 = driver.find_element_by_xpath("//div[@id='DivProject']").text
print(div_3)
file2 = open("MyFile2.txt","w")
file2.writelines(div_3)
從上面的代碼參考,我想從id = DivProject的div中刪除第二個標題(FSI Details),但是我無法獲得第二個標題。
enter code here
divPInfo2 = soup.find("div", {"id": "DivProject"})
Project_title1 = divPInfo2.find("div", {'class': 'x_panel'},
recursive=False).find("div", {'class': 'x_title'}).find(
"h2")[1].get_text(strip=True)
print(Project_title1)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.