![](/img/trans.png)
[英]Parsing text with bs4 works with selenium but does not work with requests in Python
[英]login to page with Selenium works - parsing with BS4 works - but not the combination of both
從 Wordpress 論壇獲取一些數據需要登錄和解析 - 兩部分。 兩者都可以作為獨立部分很好地工作。 我可以使用 selenium 登錄 - 完美 - 我可以使用 BS4 解析(抓取)數據。 但是當我將這兩個部分結合起來時,我遇到了 session 問題——我無法解決。
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
# --- browser setup ---
# Optional Chrome flags, currently left switched off:
#   --headless, --window-size=1980,1020, --disable-gpu
options = Options()
browser = webdriver.Chrome(executable_path=r'C:\chrome\chromedriver.exe', options=options)

# --- log in through the WordPress.org login form ---
browser.get("https://login.wordpress.org/?locale=en_US")
time.sleep(2)  # fixed pause before the form fields are looked up

browser.find_element_by_css_selector('#user_login').send_keys("the username ")
browser.find_element_by_css_selector('#user_pass').send_keys("the pass")
time.sleep(5)  # fixed pause before the form is submitted

login_buttons = browser.find_elements_by_css_selector('#wp-submit')
login_buttons[0].click()

# --- hand the current page source to BeautifulSoup and show its <title> ---
soup = BeautifulSoup(browser.page_source, 'lxml')
page_title = soup.find('title')
print(page_title.text)

# The page could be queried through Selenium itself instead, e.g.
#   browser.find_elements_by_css_selector('div > div._1vC4OE')[0].text
注意 - 這很完美。 您可以使用以下組合進行檢查:
login: pluginfan
pass: testpasswd123
請參閱下面的帶有 bs4 的解析器和刮板 - 效果非常好。
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup as BS
# One shared session for every request; the site needs a 'User-Agent' header.
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})

url = 'https://wordpress.org/support/plugin/advanced-gutenberg/page/{}/'

for page in (1, 2):
    print('\n--- PAGE:', page, '---\n')

    # Download one listing page and collect the <ul> rows inside li.bbp-body.
    listing = session.get(url.format(page))
    soup = BS(listing.text, 'html.parser')
    topic_rows = soup.find('li', class_="bbp-body").find_all('ul')

    for number, row in enumerate(topic_rows, 1):
        print('\n--- post:', number, '---\n')

        link = row.find('a')
        if not link:
            continue

        post_url = link['href']
        post_title = link.text
        # NOTE(review): the labels are swapped relative to the values they
        # print; left as-is because the sample output shown after this script
        # was produced with exactly these labels.
        print('text:', post_url)
        print('href:', post_title)
        print('---------')

        # Download the topic page and print the text of the first element
        # carrying the 'bbp-topic-content' class.
        topic_page = session.get(post_url)
        sub_soup = BS(topic_page.text, 'html.parser')
        content_tag = sub_soup.find(class_='bbp-topic-content')
        post_content = content_tag.get_text(strip=True, separator='\n')
        print(post_content)
但兩者的組合不起作用:我猜是因為我無法用 requests 創建新的 session,而大概必須使用 Selenium 創建的 session;把解析器和登錄部分放在一起運行時,我遇到了一些問題。
獨立(standalone)的解析器返回有效內容 - 很好!
--- post: 1 ---
text: https://wordpress.org/support/topic/advanced-button-with-icon/
href: Advanced Button with Icon?
---------
is it not possible to create a button with a font awesome icon to left / right?
--- post: 2 ---
text: https://wordpress.org/support/topic/expand-collapse-block/
href: Expand / Collapse block?
---------
At the very bottom I have an expandable requirements.
Do you have a better block? I would like to use one of yours if poss.
The page I need help with:
--- post: 3 ---
text: https://wordpress.org/support/topic/login-form-not-formatting-correctly/
href: Login Form Not Formatting Correctly
---------
Getting some weird formatting with the email & password fields running on outside the form.
Tried on two different sites.
Thanks
..... [,,,,,] ....
--- post: 22 ---
text: https://wordpress.org/support/topic/settings-import-export-2/
href: Settings Import & Export
---------
Traceback (most recent call last):
File "C:\Users\Kasper\Documents\_f_s_j\_mk_\_dev_\bs\____wp_forum_parser_without_login.py", line 43, in <module>
print(post_content)
File "C:\Program Files\Python37\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\U0001f642' in position 95: character maps to <undefined>
[Finished in 14.129s]
有任何想法嗎?
編輯:在這兩個版本中,我都添加了保存在 CSV 文件中。
如果您有 Selenium 和 requests,那麼有三種可能性:
- 使用 Selenium 登錄並獲取頁面。
- 使用 requests.Session 登錄並獲取頁面。
- 使用 Selenium 登錄,從 Selenium 獲取 session 信息並在 requests 中使用。
使用 Selenium 登錄和獲取頁面要簡單得多,但它的工作速度比 requests 慢。
它只需要使用 browser.get(url) 而不是 r = session.get(post_url),
以及使用 BeautifulSoup(browser.page_source, ...) 而不是 BeautifulSoup(r.text, ...)。
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import csv
import sys

from selenium.webdriver.common.by import By

# Console code pages such as cp1252 cannot encode every character that shows
# up in forum posts (emoji, for instance). Replace unencodable characters
# instead of letting print() abort the run with UnicodeEncodeError.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(errors="replace")

# --- setup ---
options = Options()
# Optional Chrome flags, currently left switched off:
#   --headless, --window-size=1980,1020, --disable-gpu
# NOTE(review): recent Selenium 4 releases expect a Service object instead of
# `executable_path` - confirm against the installed Selenium version.
browser = webdriver.Chrome(executable_path=r'C:\chrome\chromedriver.exe', options=options)
#browser = webdriver.Firefox()

data = []
url = 'https://wordpress.org/support/plugin/advanced-gutenberg/page/{}/'

try:
    # --- login ---
    browser.get("https://login.wordpress.org/?locale=en_US")
    time.sleep(2)  # fixed pause before the form fields are looked up

    # find_element(By..., ...) replaces the find_element_by_* helpers,
    # which newer Selenium releases no longer provide.
    browser.find_element(By.CSS_SELECTOR, '#user_login').send_keys("my_login")
    browser.find_element(By.CSS_SELECTOR, '#user_pass').send_keys("my_password")
    browser.find_element(By.CSS_SELECTOR, '#wp-submit').click()

    # Example: send page source to BeautifulSoup for parsing.
    soup = BeautifulSoup(browser.page_source, 'lxml')
    title_tag = soup.find('title')
    if title_tag is not None:
        print(title_tag.text)

    # --- pages ---
    for page in range(1, 3):
        print('\n--- PAGE:', page, '---\n')

        # read page with list of posts
        browser.get(url.format(page))
        soup = BeautifulSoup(browser.page_source, 'html.parser')  # or 'lxml'

        topic_list = soup.find('li', class_="bbp-body")
        if topic_list is None:
            # Layout changed or the page failed to load: skip, do not crash.
            print('no topic list found on page', page)
            continue

        for number, ul in enumerate(topic_list.find_all('ul'), 1):
            print('\n--- post:', number, '---\n')

            a = ul.find('a')
            post_url = a.get('href') if a is not None else None
            if not post_url:
                continue

            post_title = a.text
            print('href:', post_url)
            print('text:', post_title)
            print('---------')

            # read page with post content
            browser.get(post_url)
            sub_soup = BeautifulSoup(browser.page_source, 'html.parser')
            content_tag = sub_soup.find(class_='bbp-topic-content')
            if content_tag is None:
                post_content = ''
            else:
                post_content = content_tag.get_text(strip=True, separator='\n')
            print(post_content)

            # keep on list as dictionary
            data.append({
                'href': post_url,
                'text': post_title,
                'content': post_content,
            })
finally:
    # Always shut the browser and its driver process down, even on errors.
    browser.quit()

# --- save ---
# newline="" is what the csv module expects for its files (otherwise extra
# blank rows appear on Windows); an explicit UTF-8 encoding keeps emoji and
# other non-cp1252 characters from raising UnicodeEncodeError on write.
with open("wp-forum-conversations.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, ["text", "href", "content"])
    writer.writeheader()
    writer.writerows(data)  # all rows at once
編輯:
requests 工作得更快,但它需要更多地使用 Firefox / Chrome 中的 DevTools 來查看表單中的所有字段以及它發送到服務器的其他值。還需要查看登錄成功時被重定向到哪裡。順便說一句:在使用 DevTools 之前不要忘記關閉 JavaScript,因為 requests 不會運行 JavaScript,而頁面在啟用 JavaScript 時可能會以表單形式發送不同的值。(它確實發送了不同的字段)
它需要完整的 User-Agent 才能正常工作。
首先我加載登錄頁面,並從 <input> 復制所有值,以便連同 login 和 password 一起發送。
登錄後,我檢查它是否被重定向到不同的頁面 - 以確認已正確登錄。您還可以檢查頁面是否顯示您的姓名。
import requests
from bs4 import BeautifulSoup
import csv
import sys

# Console code pages such as cp1252 cannot encode every character that shows
# up in forum posts (emoji, for instance). Replace unencodable characters
# instead of letting print() abort the run with UnicodeEncodeError.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(errors="replace")

s = requests.Session()
s.headers.update({
    # it needs full user-agent
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0',
})

# --- get page with login form ---
r = s.get("https://login.wordpress.org/?locale=en_US")
soup = BeautifulSoup(r.text, 'html.parser')

# Copy every named <input> into the payload so hidden fields are sent back
# together with the credentials.
payload = {}
for field in soup.find_all('input'):
    name = field.get('name')
    if not name:
        # An <input> without a name cannot be addressed in the payload.
        continue
    value = field.get('value', '')  # inputs may have no value attribute
    payload[name] = value
    print(name, '=', value)

# --- login ---
payload['log'] = 'my_login'
payload['pwd'] = 'my_password'

r = s.post('https://login.wordpress.org/wp-login.php', data=payload)
print('redirected to:', r.url)

# --- check if logged in ---
# First check: the response should have been redirected away from the form.
# sys.exit(message) prints to stderr and exits with a non-zero status, so a
# failed login is visible to whatever launched the script.
if r.url.startswith('https://login.wordpress.org/wp-login.php'):
    sys.exit('Problem to login')

# Second check: the page should contain the display-name element.
url = 'https://wordpress.org/support/plugin/advanced-gutenberg/page/1/'
r = s.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
display_name = soup.find('span', {'class': 'display-name'})
if display_name is None:
    sys.exit('Problem to login')
print('name:', display_name.text)

# --- pages ---
data = []
url = 'https://wordpress.org/support/plugin/advanced-gutenberg/page/{}/'

for page in range(1, 3):
    print('\n--- PAGE:', page, '---\n')

    # read page with list of posts
    r = s.get(url.format(page))
    soup = BeautifulSoup(r.text, 'html.parser')  # or 'lxml'

    topic_list = soup.find('li', class_="bbp-body")
    if topic_list is None:
        # Layout changed or the request failed: skip instead of crashing.
        print('no topic list found on page', page)
        continue

    for number, ul in enumerate(topic_list.find_all('ul'), 1):
        print('\n--- post:', number, '---\n')

        a = ul.find('a')
        post_url = a.get('href') if a is not None else None
        if not post_url:
            continue

        post_title = a.text
        print('href:', post_url)
        print('text:', post_title)
        print('---------')

        # read page with post content
        r = s.get(post_url)
        sub_soup = BeautifulSoup(r.text, 'html.parser')
        content_tag = sub_soup.find(class_='bbp-topic-content')
        if content_tag is None:
            post_content = ''
        else:
            post_content = content_tag.get_text(strip=True, separator='\n')
        print(post_content)

        # keep on list as dictionary
        data.append({
            'href': post_url,
            'text': post_title,
            'content': post_content,
        })

# --- save ---
# newline="" is what the csv module expects for its files (otherwise extra
# blank rows appear on Windows); an explicit UTF-8 encoding keeps emoji and
# other non-cp1252 characters from raising UnicodeEncodeError on write.
with open("wp-forum-conversations.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, ["text", "href", "content"])
    writer.writeheader()
    writer.writerows(data)  # all rows at once
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.