
Scraping data from a website: issues with tag identification

I am trying to build a dataframe with the date, title and content of a website. To scrape this information I am doing the following:

import requests
from bs4 import BeautifulSoup
import pandas as pd


def main(req, num):
    r = req.get("http://www.lavocedellevoci.it/category/inchieste/page/{}/".format(num))
    soup = BeautifulSoup(r.content, 'html.parser')
    for tag in soup.select(".contents"):
        print(tag.select_one(".homepage_post_title auto-height td").text)
        print(tag.select_one(".homepage_post-date td-module-date a").text)
        print(tag.find_next(class_="col-sm-8 nopadding").text.strip())
    
    return tag.select_one(".homepage_post_title auto-height homepage_post-date td-module-date a").text,text, tag.find_next(class_="col-sm-8 nopadding").text.strip()

There seems to be some problem with the tags, since nothing gets printed. I would really appreciate it if you could tell me what the issue is.

The following scrapes each investigation, converting the date to an actual date, then visits each article page to get the associated text. It uses a Session for the efficiency of TCP connection re-use.

In your original script, the use of .contents matched a single parent node rather than the child articles. You also missed joining multi-value classes in your CSS selectors: for example, .homepage_post_title auto-height td should be .homepage_post_title.auto-height td, where the separate class values are joined by "." so they are not treated as type selectors. It is faster, and generally more robust, to pick a single, stable-looking class from the multi-value list and use that alone, as done below.
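As a quick illustration of that selector difference (a minimal sketch; the HTML fragment is made up purely for demonstration):

from bs4 import BeautifulSoup

# hypothetical fragment, only to show how select() reads the two selectors
html = '<h1 class="homepage_post_title auto-height"><a>Some title</a></h1>'
demo = BeautifulSoup(html, 'html.parser')

# space-separated: "auto-height" is read as a *type* selector
# (i.e. an <auto-height> element), so nothing matches
print(demo.select('.homepage_post_title auto-height a'))   # []

# dot-joined: both values are class selectors on the same element
print(demo.select('.homepage_post_title.auto-height a'))   # [<a>Some title</a>]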

import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import datetime

def get_date(date_string):
    date_parts = date_string.split(' ')
    article_date = '-'.join([date_parts[-1], month_numbers[date_parts[1].lower()], date_parts[0].zfill(2)])
    d = datetime.datetime.strptime(article_date, "%Y-%m-%d").date()
    return d

month_numbers = {
    'gennaio': '01',
    'febbraio': '02',
    'marzo': '03',
    'aprile': '04',
    'maggio': '05',
    'giugno': '06',
    'luglio': '07',
    'agosto': '08',
    'settembre': '09',
    'ottobre': '10',
    'novembre': '11',
    'dicembre': '12',
}

def main(page):
    
    results = []

    with requests.Session() as s:

        soup = bs(s.get(f'http://www.lavocedellevoci.it/category/inchieste/page/{page}').content, 'lxml')

        for article in soup.select('article'):  #soup.select('article:has(a:contains("Inchieste"))') if need to be more restrictive in future
            title = article.select_one('h1').text
            date = get_date(article.select_one('.homepage_post-date').text)
            link = article.select_one('.read-more')['href']
            soup2 =  bs(s.get(link).content, 'lxml')
            text = '\n'.join([i.text for i in soup2.select('article p:not([class])')])
            results.append([title, date, text])

        df = pd.DataFrame(results, columns = ['Title', 'Date', 'Content'])
        print(df)
        
if __name__ == '__main__':
    main(1)
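
For reference, the get_date helper above converts the site's Italian date strings, for example (made-up date string):

print(get_date('25 aprile 2020'))   # -> 2020-04-25 as a datetime.date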

Read more about CSS selectors here: https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors


You can introduce a while loop to get all the pages, stopping either when the .next class associated with the Successivi ("next") link is no longer present, or after n pages:

import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import datetime

def get_date(date_string):
    date_parts = date_string.split(' ')
    article_date = '-'.join([date_parts[-1], month_numbers[date_parts[1].lower()], date_parts[0].zfill(2)])
    d = datetime.datetime.strptime(article_date, "%Y-%m-%d").date()
    return d

month_numbers = {
    'gennaio': '01',
    'febbraio': '02',
    'marzo': '03',
    'aprile': '04',
    'maggio': '05',
    'giugno': '06',
    'luglio': '07',
    'agosto': '08',
    'settembre': '09',
    'ottobre': '10',
    'novembre': '11',
    'dicembre': '12',
}

next_page = True
final = pd.DataFrame()

def main(page):
    global final
    global next_page
    results = []

    with requests.Session() as s:

        soup = bs(s.get(f'http://www.lavocedellevoci.it/category/inchieste/page/{page}').content, 'lxml')
    
        for article in soup.select('article'):  #soup.select('article:has(a:contains("Inchieste"))') if need to be more restrictive in future
            title = article.select_one('h1').text
            date = get_date(article.select_one('.homepage_post-date').text)
            link = article.select_one('.read-more')['href']
            soup2 =  bs(s.get(link).content, 'lxml')
            text = '\n'.join([i.text for i in soup2.select('article p:not([class])')])
            results.append([title, date, text])

        df = pd.DataFrame(results, columns = ['Title', 'Date', 'Content'])
        
        # append to the running frame; assigning directly on the first page
        # avoids concatenating with the initial empty DataFrame
        if final.empty:
            final = df
        else:
            final = pd.concat([final, df], sort=False)
        
        next_page = soup.select_one('.next') is not None
        
if __name__ == '__main__':
    page = 1

    while next_page: # page < 3: 
        main(page)
        page+=1
        
    final = final.sort_values('Date').reset_index(drop=True)
    print(final)
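
The module-level globals keep the example short; if you prefer, the same pagination can be written without them by having the scraper return its rows together with the next-page flag. A sketch under the same assumptions, reusing the get_date helper and month_numbers mapping defined above:

import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

# assumes get_date / month_numbers exactly as defined in the block above

def scrape_page(s, page):
    """Scrape one listing page; return (rows, has_next_page)."""
    soup = bs(s.get(f'http://www.lavocedellevoci.it/category/inchieste/page/{page}').content, 'lxml')
    rows = []
    for article in soup.select('article'):
        title = article.select_one('h1').text
        date = get_date(article.select_one('.homepage_post-date').text)
        link = article.select_one('.read-more')['href']
        soup2 = bs(s.get(link).content, 'lxml')
        text = '\n'.join(i.text for i in soup2.select('article p:not([class])'))
        rows.append([title, date, text])
    # the "Successivi" link carries the .next class; absent on the last page
    return rows, soup.select_one('.next') is not None

if __name__ == '__main__':
    all_rows, page, more = [], 1, True
    with requests.Session() as s:
        while more:
            page_rows, more = scrape_page(s, page)
            all_rows.extend(page_rows)
            page += 1
    final = pd.DataFrame(all_rows, columns=['Title', 'Date', 'Content'])
    print(final.sort_values('Date').reset_index(drop=True))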

So, my solution:

import requests
from bs4 import BeautifulSoup
import pandas as pd

def main(num):
    dict_ = {
        'date': [],
        'title': [],
        'content': []
    }
    r = requests.get(f"http://www.lavocedellevoci.it/category/inchieste/page/{num}/")
    soup = BeautifulSoup(r.text, 'html.parser')
    for article in soup.select('article.border_top'):
        dict_['date'].append(article.select_one('span.homepage_post-date').text)
        dict_['title'].append(article.select_one('h1.homepage_post_title').text)
        dict_['content'].append(article.select_one('p').text)

    return pd.DataFrame(dict_)
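
Note that this version takes content from the teaser paragraph on the listing page rather than visiting each article, so it only captures the excerpt. Called per page it returns one DataFrame (a quick usage sketch):

df = main(1)          # first page of the "inchieste" category
print(df.head())      # columns: date, title, content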

Try this:

r = requests.get("http://www.lavocedellevoci.it/category/inchieste/page/3/")
soup = BeautifulSoup(r.content, 'html.parser')
for tag in soup.select(".contents > div > article"):
    print(tag.select_one("h1.homepage_post_title").string)
    print(tag.select_one("span.homepage_post-date").string)
    print(tag.select_one("a.read-more").parent.contents[0])

