[英]how to add to an href in python using beautiful soup
我有一個腳本可以抓取 web 站點並測試它找到的所有鏈接。 我的問題是,當遇到帶有雙斜杠的鏈接(如 //us.cnn.com)時,我的腳本會失敗。
這是我的腳本失敗的代碼:
# Excerpt: the branch taken when the href contains "//".
elif "//" in link.get('href'):
# NOTE(review): `link` is rebound here to a plain string built from str(link),
# not from the href value; presumably str(link) is the markup of the whole
# <a> tag rather than the URL -- confirm against the BeautifulSoup docs.
link = "http:" + str(link)
print("tested link is: " + link)
driver = webdriver.Chrome(
'/home/ironmantis7x/PycharmProjects/WebScraper/chromedriver')
#driver.get(link)
#driver.get(str(link))
# `link` is a str at this point and str has no .get method, so this call
# raises AttributeError.
driver.get(link.get('href'))
我想要做的是：當遇到以「//」（雙斜杠）開頭的鏈接時，在它前面加上 http:，再把完整的鏈接交給 selenium 打開（例如 http://us.cnn.com/us）。
我怎樣才能正確地做到這一點?
這是完整的腳本，以供參考。
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
import sys
import time
from datetime import date
from datetime import datetime
import datetime
# Chrome browser control options: run headless, using the Chrome binary at
# the path set below.
options = webdriver.ChromeOptions()
options.add_argument('headless')
# options.add_argument('--ignore-certificate-errors')
# options.add_argument("--test-type")
options.binary_location = "/usr/bin/google-chrome" # <--- needed actual path to chrome browser
# hard set path to chromedriver in project
# driver = webdriver.Chrome('/home/ironmantis7x/Documents/BSSLLC/projects/PycharmProjects/WebScraper/chromedriver')
# System time formatted as 'YYYY-MM-DD HH:MM:SS' for time/date stamping.
now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# Ask for the site to test. "http://" is prepended below, so the input is
# expected without a scheme. (raw_input is the Python 2 spelling of input.)
url = raw_input("Enter a website to extract the URL's from: ")
r = requests.get("http://" + url)
data = r.text
# soup = BeautifulSoup(data)
# Parse the fetched HTML with the standard-library 'html.parser' backend.
soup = BeautifulSoup(data, 'html.parser')
# Counters for the link test that follows; validChildURL is initialised here
# but not updated anywhere in the visible script.
validURL = 0
validChildURL = 0
invalidURL = 0
# Test every anchor on the page: work out the absolute URL for its href, open
# it in Chrome through Selenium, and log the result.
#
#   * absolute http(s) links        -> opened as-is, logged to valid_link_2.txt
#   * protocol-relative "//" links  -> "http:" is prepended, logged to valid_link.txt
#   * anything else                 -> logged to invalid_link.txt, not opened
CHROMEDRIVER_PATH = '/home/ironmantis7x/PycharmProjects/WebScraper/chromedriver'

for link in soup.find_all('a'):
    # Keep `link` as the tag and work on the href string separately; rebinding
    # `link` to a str is what made the later link.get('href') call fail.
    href = link.get('href')
    if href is None:
        # <a> tags without an href carry nothing to test.
        continue

    # Prefix tests instead of substring tests: '"http" in href' also matched
    # "https://..." (leaving that branch unreachable) and any link that merely
    # contained "http" somewhere in its path or query string.
    if href.startswith(('http://', 'https://')):
        target = href
        log_name = 'valid_link_2.txt'
    elif href.startswith('//'):
        # Protocol-relative link: give it an explicit scheme so the browser
        # receives a complete URL.
        target = 'http:' + href
        log_name = 'valid_link.txt'
        print("tested link is: " + target)
    else:
        print(href + " is an invalid link")
        with open('invalid_link.txt', 'a') as f:
            f.write(href + '\n')
        print(href)
        # No browser is started here: the link is not opened.
        invalidURL = invalidURL + 1
        continue

    driver = webdriver.Chrome(CHROMEDRIVER_PATH)
    try:
        # One page load is enough to read the title.
        driver.get(target)
        print(driver.title)
    finally:
        # Always close the browser, even when the page load raises.
        driver.quit()

    with open(log_name, 'a') as f:
        f.write('link: ' + target + '\n')
    print(target)
    validURL = validURL + 1
您可以只用 requests 和 BeautifulSoup 檢查所有鏈接，不需要 Selenium。
為了解析和重新組合 URL，我使用了 requests.utils.urlparse 和 requests.utils.urlunparse。
為了檢查 URL 是否有效,我檢查了頁面是否有標題。
import requests
from bs4 import BeautifulSoup
# Check every link on a page with requests + BeautifulSoup only (no browser).
# A link counts as valid when its page can be fetched and contains a <title>.
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block indefinitely

valid_urls = []
invalid_urls = []

response = requests.get("http://cnn.com/us", timeout=REQUEST_TIMEOUT)
print(f"base url: {response.url}")

# urlparse yields (scheme, netloc, path, params, query, fragment). The base
# page's scheme and host fill in whatever a link leaves out, which covers
# both "//host/path" and "/path" style hrefs.
parsed_base_url = requests.utils.urlparse(response.url)
base_scheme = parsed_base_url.scheme
base_netloc = parsed_base_url.netloc

page = BeautifulSoup(response.text, 'html5lib')
urls = [a.get("href") for a in page.select("a[href]")]

for url in urls:
    parsed_url = requests.utils.urlparse(url)
    scheme = parsed_url.scheme or base_scheme
    netloc = parsed_url.netloc or base_netloc
    # Params, query and fragment are left empty: only scheme, host and path
    # of each link are tested.
    new_url_components = (scheme, netloc, parsed_url.path, '', '', '')
    new_url = requests.utils.urlunparse(new_url_components)

    # select_one returns None when the page has no <title>, so None (not '')
    # is the "no title" marker.
    title = None
    try:
        link_response = requests.get(new_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # Unreachable host, unsupported scheme (mailto:, javascript:), timeout,
        # etc.: the link is reported as invalid below.
        pass
    else:
        title = BeautifulSoup(link_response.text, 'html5lib').select_one("title")

    if title is not None:
        print(f"VALID: {title.text.strip()}", new_url)
        valid_urls.append(new_url)
    else:
        print(f"INVALID: {new_url}")
        invalid_urls.append(new_url)

print(f"Valid links count: {len(valid_urls)}")
print(f"Invalid links count: {len(invalid_urls)}")
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.