[英]parsing google news using beautiful soup python
我有如下的python代碼。 它搜索一個Google新聞頁面,並為每條新聞打印超鏈接和標題。 我的問題是,Google新聞會把相似的新聞分組放進同一個存儲桶(bucket),而在下面的腳本中,每個存儲桶只打印了第一條新聞。 如何打印所有存儲桶中的所有新聞?
from bs4 import BeautifulSoup
import requests

# Spoof a desktop browser user-agent so Google serves the regular HTML page.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}

# NOTE: the original fetched the "eric bledsoe" results first and then
# immediately rebound `r` to the "lebron james" request, so the first
# response was never used; the dead request has been removed.
r = requests.get('https://www.google.com/search?q=%22lebron+james%22&tbm=nws&tbs=qdr:y', headers=headers)
soup = BeautifulSoup(r.text, "html.parser")

# <div class="_cnc"> wraps the lead story of each result "bucket".
letters = soup.find_all("div", class_="_cnc")
print(type(letters))
print(len(letters))
print("\n")
for i, letter in enumerate(letters):
    print(i)
    print(letter.a["href"])  # hyperlink of the bucket's lead story
print("\n")

# <a class="l _HId"> anchors carry the headline text of the lead stories.
letters2 = soup.find_all("a", class_="l _HId")
for i, anchor in enumerate(letters2):
    print(i)
    print(anchor.get_text())
print("\n----------content")
通過分類新聞,我的意思是在下圖中將前幾個新聞分組在一起。 新聞“勒布朗·詹姆斯將他的一個隊友比作丹恩”是另一組的一部分。
我不太確定您所說的「桶」是什麼意思。 如果您是要解析多個網站,那麼我可以告訴您:您第二次調用 requests.get() 時覆蓋了變量 r,所以第一個請求的結果從未被使用。
這是一個處理urls數組中所有URL的循環。
import bs4
import requests

# Spoof a desktop browser user-agent so Google serves the regular HTML page.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
urls = ["https://www.google.com/search?q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d",
        "https://www.google.com/search?q=%22lebron+james%22&tbm=nws&tbs=qdr:y"]

ahrefs = []  # one list of links per url
titles = []  # one list of headlines per url, parallel to ahrefs
for url in urls:
    req = requests.get(url, headers=headers)
    soup = bs4.BeautifulSoup(req.text, "html.parser")
    # You don't need the div container — go straight to the <a> tags.
    # "_HId" anchors are the lead headlines; "_sQb" anchors are the grouped
    # ("bucket") stories. Collect the anchors once and read both the href
    # and the text from the same pass instead of scanning the tree four times.
    anchors = soup.find_all("a", class_="_HId") + soup.find_all("a", class_="_sQb")
    ahrefs.append([a["href"] for a in anchors])
    titles.append([a.get_text() for a in anchors])
#print(ahrefs)
#print(titles)
我在 Google 上搜索 "lebron james" 時出現了 18 條結果(包括分組的「桶」新聞在內),並且 len(ahrefs[1]) == 18。
隨着一個全新的轉變,我決定更有效地解決這個問題,這樣,您只需要追加查詢來搜索新玩家。 我不確定最終結果是什么,但這將返回字典列表。
import bs4
import requests

# Spoof a desktop browser user-agent so Google serves the regular HTML page.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
# just add to this dict for each new player
# player name : Google News query url
queries = {"bledsoe":"https://www.google.com/search?q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d",
           "james":"https://www.google.com/search?q=%22lebron+james%22&tbm=nws&tbs=qdr:y"}

total = []  # list of {player: {"link", "title", "source", "time"}} dicts
for player in queries:  # keys
    # request the google query url of each player
    req = requests.get(queries[player], headers=headers)
    soup = bs4.BeautifulSoup(req.text, "html.parser")
    # Main (first-of-bucket) stories live in <div class="_cnc">; the grouped
    # "bucket" stories live in <div class="card-section">. Asking find_all
    # for exactly those classes (a list matches either) visits only the
    # relevant containers in document order, instead of scanning every <div>
    # and using a broad KeyError catch to skip tags with no class attribute.
    for each in soup.find_all("div", class_=["_cnc", "card-section"]):
        results = {player: {"link": None,
                            "title": None,
                            "source": None,
                            "time": None}}
        anchor = each.find("a")
        results[player]["link"] = anchor["href"]
        results[player]["title"] = anchor.get_text()
        if "_cnc" in each.get("class", []):  # main stories
            # Text looks like "<source> - <time>"; partition splits on the
            # first dash only, so a dash inside the source/title text can no
            # longer blow up the two-way unpacking with a ValueError.
            source, _, when = each.contents[1].get_text().partition("-")
            results[player]["source"] = source
            results[player]["time"] = when
        else:  # "card-section" bucket stories
            results[player]["source"] = each.contents[1].contents[0].get_text()
            results[player]["time"] = each.contents[1].get_text()
        total.append(results)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.