
Having an issue while scraping the print preview page using selenium webdriver in python

I am having an issue while scraping the print preview page and exporting it to a CSV file. When I open the "Print Preview" URL, it gives me the page source code rather than just the text data.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
import time
import pandas as pd
from openpyxl.workbook import Workbook
from pandas import ExcelWriter

url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'

driver = webdriver.Chrome(executable_path=chrome_path)
driver.get(url)

# Open the "Search Project Details" page
WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
    (By.XPATH, "//div[@class='search-pro-details']//a[contains(.,'Search Project Details')]"))).click()

# Pick the "Registered Project" radio button and search by certificate number
Registered_Project_radio = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, "Promoter")))
driver.execute_script("arguments[0].click();", Registered_Project_radio)
Application = driver.find_element_by_id("CertiNo")
Application.send_keys("P50500000005")
Search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "btnSearch")))
driver.execute_script("arguments[0].click();", Search)

# Collect the href of every <a> on the results page
View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a")]
print(View)

# View is a list, so the requests below do not behave as expected
driver.get(View)
request = urllib.request.Request(View)
html = urllib.request.urlopen(request).read()
soup = BeautifulSoup(html, 'html.parser')
print(soup.text)
View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a")]

returns a list such as ['https://maharerait.mahaonline.gov.in/PrintPreview/PrintPreview?q=MB0agrub1IW1Z63O5lldJdHpk6le6np5EB3HZ3jy8r7qPsLpYPdQwJzwE0S5LXG3fqQe%2fUe6HTGYmXstD%2bcYtATvmObra1R4xBa7L235mdTlmH0wHJPnps0ZXvbDMZxA0Hf9fxpOLM%2ba3Ad13hq9M1bp8Agvb%2bCLA3KOgpoYwr0%3d', None, None, None, None], i.e. it contains the URL plus None elements.

Replace the code after the driver.execute_script("arguments[0].click();", Search) statement with the following code:

View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a")
        if item.get_attribute('href') is not None]
for url in View:
    request = urllib.request.Request(url)
    html = urllib.request.urlopen(request).read()
    soup = BeautifulSoup(html, 'html.parser')
    divPInfo = soup.find("div", {"id": "DivPInfo"})
    title = divPInfo.find("div", {'class': 'x_panel'}, recursive=False).find(
        "div", {'class': 'x_title'}).find("h2").text.strip()
    x_contentObject = divPInfo.find("div", {'class': 'x_panel'}, recursive=False).find_all(
        "div", {'class': 'col-md-3'})

    my_dict = {title: {x_contentObject[0].text.strip(): x_contentObject[1].text.strip()}}
    print(my_dict)

Output:

{'General Information': {'Information Type': 'Other Than Individual'}}
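The dictionary above pairs only the first two col-md-3 cells. Below is a minimal sketch of how that loop could be extended to collect every label/value pair from the panel and write them to a CSV file; the alternating label/value layout, the column names and the output file name are my own assumptions, not from the original code:

import csv

# Assumption: the col-md-3 divs alternate label, value, label, value, ...
rows = []
for label_div, value_div in zip(x_contentObject[0::2], x_contentObject[1::2]):
    rows.append((label_div.text.strip(), value_div.text.strip()))

# Write the collected pairs to a CSV file (the file name is arbitrary)
with open("general_information.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Field", "Value"])
    writer.writerows(rows)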

driver.get(View[0])   # View is a list, so open the first (Print Preview) link
div_2 = driver.find_element_by_xpath("//div[@id='fldFirm']").text
print(div_2)
table = pd.read_html(driver.page_source)
#print(table)
#df.to_csv("trial.csv", sep=',', index=False)
div_3 = driver.find_element_by_xpath("//div[@id='DivProject']").text
print(div_3)
file2 = open("MyFile2.txt", "w")
file2.writelines(div_3)
file2.close()
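Since the goal stated in the title is a CSV export, here is a minimal sketch of one way the list of DataFrames returned by pd.read_html could be written out; the "table_<n>.csv" file-name pattern is my own assumption, not from the original post:

# pd.read_html returns a list of DataFrames, one per <table> on the page
for i, df in enumerate(pd.read_html(driver.page_source)):
    df.to_csv("table_{}.csv".format(i), sep=',', index=False)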

Referring to the code above, I want to scrape the second title ("FSI Details") from the div with id="DivProject", but I am not able to get that second title.

divPInfo2 = soup.find("div", {"id": "DivProject"})
Project_title1 = divPInfo2.find("div", {'class': 'x_panel'}, recursive=False).find(
    "div", {'class': 'x_title'}).find("h2")[1].get_text(strip=True)
print(Project_title1)
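The snippet above fails because find("h2") returns a single Tag, so indexing it with [1] does not select a second heading. Here is a minimal sketch of one way to list every x_title heading inside DivProject and pick the second one, assuming the panel markup mirrors the DivPInfo structure used earlier:

# Collect the <h2> text of every x_title block inside DivProject
divPInfo2 = soup.find("div", {"id": "DivProject"})
titles = divPInfo2.find_all("div", {"class": "x_title"})
headings = [t.find("h2").get_text(strip=True) for t in titles if t.find("h2")]

print(headings)         # all section titles found inside DivProject
if len(headings) > 1:
    print(headings[1])  # expected to be the second title, e.g. "FSI Details"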
