
Having an issue while scraping the print preview page using selenium webdriver in python

I am having an issue while scraping the print preview page and exporting it to a CSV file. When I open the "Print Preview" URL, it gives me the page source code rather than just the text data.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
import time
import pandas as pd
from openpyxl.workbook import Workbook
from pandas import ExcelWriter

url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'

driver = webdriver.Chrome(executable_path=chrome_path)
driver.get(url)

# Open the "Search Project Details" page
WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
    (By.XPATH, "//div[@class='search-pro-details']//a[contains(.,'Search Project Details')]"))).click()

# Pick the "Registered Project" radio button and search by certificate number
Registered_Project_radio = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, "Promoter")))
driver.execute_script("arguments[0].click();", Registered_Project_radio)
Application = driver.find_element_by_id("CertiNo")
Application.send_keys("P50500000005")
Search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "btnSearch")))
driver.execute_script("arguments[0].click();", Search)

# Collect the href of every <a> on the results page
View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a")]
print(View)

# View is a list, so the requests below do not behave as expected
driver.get(View)
request = urllib.request.Request(View)
html = urllib.request.urlopen(request).read()
soup = BeautifulSoup(html, 'html.parser')
print(soup.text)
View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a")]

returns a list such as ['https://maharerait.mahaonline.gov.in/PrintPreview/PrintPreview?q=MB0agrub1IW1Z63O5lldJdHpk6le6np5EB3HZ3jy8r7qPsLpYPdQwJzwE0S5LXG3fqQe%2fUe6HTGYmXstD%2bcYtATvmObra1R4xBa7L235mdTlmH0wHJPnps0ZXvbDMZxA0Hf9fxpOLM%2ba3Ad13hq9M1bp8Agvb%2bCLA3KOgpoYwr0%3d', None, None, None, None], i.e. it contains the URL plus None elements.

Replace the code after the driver.execute_script("arguments[0].click();", Search) statement with the following code:

View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a")
        if item.get_attribute('href') is not None]
for url in View:
    request = urllib.request.Request(url)
    html = urllib.request.urlopen(request).read()
    soup = BeautifulSoup(html, 'html.parser')
    divPInfo = soup.find("div", {"id": "DivPInfo"})
    title = divPInfo.find("div", {'class': 'x_panel'}, recursive=False).find(
        "div", {'class': 'x_title'}).find("h2").text.strip()
    x_contentObject = divPInfo.find("div", {'class': 'x_panel'}, recursive=False).find_all(
        "div", {'class': 'col-md-3'})

    my_dict = {title: {x_contentObject[0].text.strip(): x_contentObject[1].text.strip()}}
    print(my_dict)

Output:

{'General Information': {'Information Type': 'Other Than Individual'}}
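The dictionary above pairs only the first two col-md-3 cells. Below is a minimal sketch of how that loop could be extended to collect every label/value pair from the panel and write them to a CSV file; the alternating label/value layout, the column names and the output file name are my own assumptions, not from the original code:

import csv

# Assumption: the col-md-3 divs alternate label, value, label, value, ...
rows = []
for label_div, value_div in zip(x_contentObject[0::2], x_contentObject[1::2]):
    rows.append((label_div.text.strip(), value_div.text.strip()))

# Write the collected pairs to a CSV file (the file name is arbitrary)
with open("general_information.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Field", "Value"])
    writer.writerows(rows)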

driver.get(View[0])   # View is a list, so open the first (Print Preview) link
div_2 = driver.find_element_by_xpath("//div[@id='fldFirm']").text
print(div_2)
table = pd.read_html(driver.page_source)
#print(table)
#df.to_csv("trial.csv", sep=',', index=False)
div_3 = driver.find_element_by_xpath("//div[@id='DivProject']").text
print(div_3)
file2 = open("MyFile2.txt", "w")
file2.writelines(div_3)
file2.close()
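Since the goal stated in the title is a CSV export, here is a minimal sketch of one way the list of DataFrames returned by pd.read_html could be written out; the "table_<n>.csv" file-name pattern is my own assumption, not from the original post:

# pd.read_html returns a list of DataFrames, one per <table> on the page
for i, df in enumerate(pd.read_html(driver.page_source)):
    df.to_csv("table_{}.csv".format(i), sep=',', index=False)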

Referring to the code above, I want to scrape the second title ("FSI Details") from the div with id="DivProject", but I am not able to get that second title.

divPInfo2 = soup.find("div", {"id": "DivProject"})
Project_title1 = divPInfo2.find("div", {'class': 'x_panel'}, recursive=False).find(
    "div", {'class': 'x_title'}).find("h2")[1].get_text(strip=True)
print(Project_title1)
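The snippet above fails because find("h2") returns a single Tag, so indexing it with [1] does not select a second heading. Here is a minimal sketch of one way to list every x_title heading inside DivProject and pick the second one, assuming the panel markup mirrors the DivPInfo structure used earlier:

# Collect the <h2> text of every x_title block inside DivProject
divPInfo2 = soup.find("div", {"id": "DivProject"})
titles = divPInfo2.find_all("div", {"class": "x_title"})
headings = [t.find("h2").get_text(strip=True) for t in titles if t.find("h2")]

print(headings)         # all section titles found inside DivProject
if len(headings) > 1:
    print(headings[1])  # expected to be the second title, e.g. "FSI Details"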
