[英](Python) Scraping data from a website with 'style:hidden' tags?
[英]Scraping Data from a website: issues with tags identification
我正在尝试使用网站的日期、标题和内容构建数据框。 为了抓取这些信息,我的操作如下:
import requests
from bs4 import BeautifulSoup
import pandas as pd
def main(req, num):
    """Scrape one listing page of the 'inchieste' category.

    Parameters
    ----------
    req : requests module or requests.Session
        Any object exposing a ``.get(url)`` method returning a response
        with a ``.content`` attribute.
    num : int
        1-based page number to fetch.

    Returns
    -------
    list[tuple[str, str, str]]
        One ``(title, date, content)`` tuple per article found.
    """
    r = req.get("http://www.lavocedellevoci.it/category/inchieste/page/{}/".format(num))
    soup = BeautifulSoup(r.content, 'html.parser')
    results = []
    for tag in soup.select(".contents"):
        # Multi-valued class attributes must be joined with '.' in a CSS
        # selector ('.homepage_post_title.auto-height'); a space would make
        # the second token a descendant *type* selector and match nothing.
        title = tag.select_one(".homepage_post_title.auto-height td").text
        date = tag.select_one(".homepage_post-date.td-module-date a").text
        content = tag.find_next(class_="col-sm-8 nopadding").text.strip()
        # NOTE(review): the original returned inside the loop (at most one
        # article) and referenced a bare undefined name `text` (NameError).
        results.append((title, date, content))
    return results
标签似乎有问题,没有打印任何内容。 如果您能告诉我这是什么问题,我将不胜感激。
下面抓取每个调查,将日期转换为实际日期,然后访问每个文章页面以获取相关文本。 它使用Session
来提高 tcp 重用的效率。
在您的原始脚本中, .contents
的使用匹配单个父节点而不是子articles
。 然后你忽略了在 css 选择器中加入多值类,例如.homepage_post_title auto-height td
应该是.homepage_post_title.auto-height td
,其中各个类值必须用“.”连接,以免第二个 token 被当作 type(标签名)选择器。从多值类中挑选一个单一、外观稳定的类来使用会更快,而且通常更健壮,如下所示。
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import datetime
def get_date(date_string):
    """Parse an Italian date string such as ``'3 marzo 2020'`` into a date.

    Parameters
    ----------
    date_string : str
        Space-separated ``'DD monthname YYYY'`` with an Italian month name
        (case-insensitive).

    Returns
    -------
    datetime.date

    Raises
    ------
    KeyError
        If the month name is not a known Italian month.
    ValueError
        If the remaining parts do not form a valid ISO date.
    """
    # Self-contained month table so the function no longer depends on the
    # module-level `month_numbers` global (which remains for other callers).
    months = {
        'gennaio': '01', 'febbraio': '02', 'marzo': '03', 'aprile': '04',
        'maggio': '05', 'giugno': '06', 'luglio': '07', 'agosto': '08',
        'settembre': '09', 'ottobre': '10', 'novembre': '11', 'dicembre': '12',
    }
    parts = date_string.split(' ')
    # parts[0] = day, parts[1] = month name, parts[-1] = year.
    iso = '-'.join([parts[-1], months[parts[1].lower()], parts[0].zfill(2)])
    return datetime.datetime.strptime(iso, "%Y-%m-%d").date()
# Italian month names mapped to zero-padded month numbers '01'..'12'.
month_numbers = {
    name: f'{index:02d}'
    for index, name in enumerate(
        ('gennaio', 'febbraio', 'marzo', 'aprile', 'maggio', 'giugno',
         'luglio', 'agosto', 'settembre', 'ottobre', 'novembre', 'dicembre'),
        start=1,
    )
}
def main(page):
    """Scrape every article on the given listing page and print a
    DataFrame with Title, Date and Content columns."""
    rows = []
    # A Session re-uses the underlying TCP connection across requests.
    with requests.Session() as session:
        listing = bs(
            session.get(
                f'http://www.lavocedellevoci.it/category/inchieste/page/{page}'
            ).content,
            'lxml',
        )
        # soup.select('article:has(a:contains("Inchieste"))') if we ever
        # need to be more restrictive.
        for article in listing.select('article'):
            title = article.select_one('h1').text
            published = get_date(article.select_one('.homepage_post-date').text)
            url = article.select_one('.read-more')['href']
            # Follow the "read more" link to fetch the article body.
            detail = bs(session.get(url).content, 'lxml')
            paragraphs = [p.text for p in detail.select('article p:not([class])')]
            rows.append([title, published, '\n'.join(paragraphs)])
    frame = pd.DataFrame(rows, columns=['Title', 'Date', 'Content'])
    print(frame)


if __name__ == '__main__':
    main(1)
在此处阅读有关 CSS 选择器的更多信息: https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors
您可以引入一个while
循环来获取所有页面,当与Successivi
关联的.next
类不再存在时停止,或者在n
页面后停止:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import datetime
def get_date(date_string):
    """Convert an Italian ``'DD monthname YYYY'`` string to a datetime.date.

    Parameters
    ----------
    date_string : str
        E.g. ``'3 marzo 2020'``; the month name may be in any case.

    Returns
    -------
    datetime.date

    Raises
    ------
    KeyError
        If the month name is not recognised.
    ValueError
        If the assembled string is not a valid ISO date.
    """
    # Local month table: avoids the dependency on the module-level
    # `month_numbers` global (kept for backward compatibility elsewhere).
    months = {
        'gennaio': '01', 'febbraio': '02', 'marzo': '03', 'aprile': '04',
        'maggio': '05', 'giugno': '06', 'luglio': '07', 'agosto': '08',
        'settembre': '09', 'ottobre': '10', 'novembre': '11', 'dicembre': '12',
    }
    parts = date_string.split(' ')
    # parts[0] = day, parts[1] = month name, parts[-1] = year.
    iso = '-'.join([parts[-1], months[parts[1].lower()], parts[0].zfill(2)])
    return datetime.datetime.strptime(iso, "%Y-%m-%d").date()
# Lookup table: Italian month name -> zero-padded month number string.
month_numbers = dict(zip(
    ('gennaio', 'febbraio', 'marzo', 'aprile', 'maggio', 'giugno',
     'luglio', 'agosto', 'settembre', 'ottobre', 'novembre', 'dicembre'),
    ('01', '02', '03', '04', '05', '06',
     '07', '08', '09', '10', '11', '12'),
))
# Shared state for the __main__ driver: pagination flag and the DataFrame
# that accumulates results across pages.
next_page = True
final = pd.DataFrame()


def main(page):
    """Scrape one listing page, append its articles to the global `final`
    DataFrame, and update the global `next_page` pagination flag."""
    global final
    global next_page
    rows = []
    # Session keeps the TCP connection alive across the per-article fetches.
    with requests.Session() as session:
        listing = bs(
            session.get(
                f'http://www.lavocedellevoci.it/category/inchieste/page/{page}'
            ).content,
            'lxml',
        )
        # soup.select('article:has(a:contains("Inchieste"))') if we ever
        # need to be more restrictive.
        for article in listing.select('article'):
            title = article.select_one('h1').text
            published = get_date(article.select_one('.homepage_post-date').text)
            url = article.select_one('.read-more')['href']
            detail = bs(session.get(url).content, 'lxml')
            body = '\n'.join(p.text for p in detail.select('article p:not([class])'))
            rows.append([title, published, body])
    page_frame = pd.DataFrame(rows, columns=['Title', 'Date', 'Content'])
    if page_frame.empty:
        final = page_frame
    else:
        final = pd.concat([final, page_frame], sort=False)
    # A ".next" ("Successivi") link present means there are more pages.
    next_page = listing.select_one('.next') is not None
# Driver: fetch listing pages one at a time until main() reports that no
# ".next" pagination link remains, then sort all collected rows by date.
if __name__ == '__main__':
    page = 1
    # main() flips the global `next_page` flag to False on the last page;
    # the commented alternative caps the crawl at a fixed page count.
    while next_page: # page < 3:
        main(page)
        page+=1
    # reset_index keeps the old index as a column after the chronological sort.
    final = final.sort_values('Date').reset_index()
    print(final)
所以,我的解决方案:
import requests
from bs4 import BeautifulSoup
import pandas as pd
def main(num):
    """Return a DataFrame with the date, title and first paragraph of each
    article on one listing page.

    Parameters
    ----------
    num : int
        1-based page number of the 'inchieste' category.

    Returns
    -------
    pandas.DataFrame
        Columns: 'date', 'title', 'content'.
    """
    records = {
        'date': [],
        'title': [],
        'content': []
    }
    r = requests.get(f"http://www.lavocedellevoci.it/category/inchieste/page/{num}/")
    # Name the parser explicitly: BeautifulSoup(markup) with no parser
    # argument raises GuessedAtParserWarning and silently picks whichever
    # parser happens to be installed, so results can differ per machine.
    soup = BeautifulSoup(r.text, 'html.parser')
    for article in soup.select('article.border_top'):
        records['date'].append(article.select_one('span.homepage_post-date').text)
        records['title'].append(article.select_one('h1.homepage_post_title').text)
        records['content'].append(article.select_one('p').text)
    return pd.DataFrame(records)
尝试这个:
# Fetch page 3 of the category and, for each article under the .contents
# container, print its title, date and the teaser text next to "read more".
response = requests.get("http://www.lavocedellevoci.it/category/inchieste/page/3/")
page = BeautifulSoup(response.content, 'html.parser')
for article in page.select(".contents > div > article"):
    for selector in ("h1.homepage_post_title", "span.homepage_post-date"):
        print(article.select_one(selector).string)
    # The teaser is the first node of the anchor's parent element.
    print(article.select_one("a.read-more").parent.contents[0])
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.