
Scraping data from a website: issues with tag identification

I am trying to build a dataframe with the Date, Title and Content of a website. To scrape this information, I am doing the following:

import requests
from bs4 import BeautifulSoup
import pandas as pd


def main(req, num):
    r = req.get("http://www.lavocedellevoci.it/category/inchieste/page/{}/".format(num))
    soup = BeautifulSoup(r.content, 'html.parser')
    for tag in soup.select(".contents"):
        print(tag.select_one(".homepage_post_title auto-height td").text)
        print(tag.select_one(".homepage_post-date td-module-date a").text)
        print(tag.find_next(class_="col-sm-8 nopadding").text.strip())
    
    return tag.select_one(".homepage_post_title auto-height homepage_post-date td-module-date a").text,text, tag.find_next(class_="col-sm-8 nopadding").text.strip()

It seems there is a problem with the tags: nothing is printed. If you could tell me what is wrong, I would appreciate it.

The following grabs each investigation, converts the dates into actual date objects, and then visits each article page to get the associated text. It uses Session for the efficiency of TCP connection re-use.

In your original script, .contents matches a single parent node rather than the child articles. Later, you neglect to join the multi-value class in the CSS selector: .homepage_post_title auto-height td should be .homepage_post_title.auto-height td, where the separate class values are joined by "." so they are not treated as type selectors. It is faster, and generally more robust, to pick a single, stable-looking class from the multi-value set and use that, as shown below.
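
For example, here is a minimal illustration of the difference, using made-up markup:

from bs4 import BeautifulSoup

html = '<h1 class="homepage_post_title auto-height"><td>Title</td></h1>'
soup = BeautifulSoup(html, 'html.parser')

# ".homepage_post_title auto-height td" looks for a <td> inside an element
# whose tag name is "auto-height" (a type selector), so it matches nothing:
print(soup.select('.homepage_post_title auto-height td'))   # []

# Joining the class values with "." makes a compound selector on one element:
print(soup.select('.homepage_post_title.auto-height td'))   # [<td>Title</td>]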

import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import datetime

def get_date(date_string):
    # Convert an Italian date string such as '21 gennaio 2020' to a datetime.date
    date_parts = date_string.split(' ')
    article_date = '-'.join([date_parts[-1], month_numbers[date_parts[1].lower()], date_parts[0].zfill(2)])
    return datetime.datetime.strptime(article_date, "%Y-%m-%d").date()

month_numbers = {
    'gennaio': '01',
    'febbraio': '02',
    'marzo': '03',
    'aprile': '04',
    'maggio': '05',
    'giugno': '06',
    'luglio': '07',
    'agosto': '08',
    'settembre': '09',
    'ottobre': '10',
    'novembre': '11',
    'dicembre': '12',
}
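
For example, assuming the site renders dates like "21 gennaio 2020", the helper converts them as follows:

>>> get_date('21 gennaio 2020')
datetime.date(2020, 1, 21)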

def main(page):
    
    results = []

    with requests.Session() as s:

        soup =  bs(s.get(f'http://www.lavocedellevoci.it/category/inchieste/page/{page}').content, 'lxml')

        for article in soup.select('article'):  # soup.select('article:has(a:contains("Inchieste"))') if you need to be more restrictive in future
            title = article.select_one('h1').text
            date = get_date(article.select_one('.homepage_post-date').text)
            link = article.select_one('.read-more')['href']
            soup2 =  bs(s.get(link).content, 'lxml')
            text = '\n'.join([i.text for i in soup2.select('article p:not([class])')])
            results.append([title, date, text])

        df = pd.DataFrame(results, columns = ['Title', 'Date', 'Content'])
        print(df)
        
if __name__ == '__main__':
    main(1)

Read more about CSS selectors here: https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors


You could introduce a while loop to get all pages, which stops when the class .next (associated with the Successivi link) is no longer present, or stop after n pages:

import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import datetime

def get_date(date_string):
    # Convert an Italian date string such as '21 gennaio 2020' to a datetime.date
    date_parts = date_string.split(' ')
    article_date = '-'.join([date_parts[-1], month_numbers[date_parts[1].lower()], date_parts[0].zfill(2)])
    return datetime.datetime.strptime(article_date, "%Y-%m-%d").date()

month_numbers = {
    'gennaio': '01',
    'febbraio': '02',
    'marzo': '03',
    'aprile': '04',
    'maggio': '05',
    'giugno': '06',
    'luglio': '07',
    'agosto': '08',
    'settembre': '09',
    'ottobre': '10',
    'novembre': '11',
    'dicembre': '12',
}

next_page = True
final = pd.DataFrame()

def main(page):
    global final
    global next_page
    results = []

    with requests.Session() as s:

        soup =  bs(s.get(f'http://www.lavocedellevoci.it/category/inchieste/page/{page}').content, 'lxml')
    
        for article in soup.select('article'):  # soup.select('article:has(a:contains("Inchieste"))') if you need to be more restrictive in future
            title = article.select_one('h1').text
            date = get_date(article.select_one('.homepage_post-date').text)
            link = article.select_one('.read-more')['href']
            soup2 =  bs(s.get(link).content, 'lxml')
            text = '\n'.join([i.text for i in soup2.select('article p:not([class])')])
            results.append([title, date, text])

        df = pd.DataFrame(results, columns = ['Title', 'Date', 'Content'])
        
        # Append this page's rows to the accumulated frame
        if final.empty:
            final = df
        else:
            final = pd.concat([final, df], sort = False)
        
        next_page = soup.select_one('.next') is not None
        
if __name__ == '__main__':
    page = 1

    while next_page: # page < 3: 
        main(page)
        page+=1
        
    final = final.sort_values('Date').reset_index(drop=True)  # drop=True avoids keeping the old index as a column
    print(final)
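
As a design note, the two globals can be avoided by returning both the page's DataFrame and a has-next flag from a single function. Here is a minimal sketch of that refactor (scrape_page is a hypothetical name; it reuses the get_date helper and the same selectors as above):

import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

def scrape_page(session, page):
    soup = bs(session.get(f'http://www.lavocedellevoci.it/category/inchieste/page/{page}').content, 'lxml')
    results = []
    for article in soup.select('article'):
        title = article.select_one('h1').text
        date = get_date(article.select_one('.homepage_post-date').text)
        link = article.select_one('.read-more')['href']
        soup2 = bs(session.get(link).content, 'lxml')
        text = '\n'.join(i.text for i in soup2.select('article p:not([class])'))
        results.append([title, date, text])
    df = pd.DataFrame(results, columns=['Title', 'Date', 'Content'])
    return df, soup.select_one('.next') is not None  # has-next flag

with requests.Session() as s:
    frames, page, has_next = [], 1, True
    while has_next:
        df, has_next = scrape_page(s, page)
        frames.append(df)
        page += 1
    final = pd.concat(frames, sort=False).sort_values('Date').reset_index(drop=True)
    print(final)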

So, my solution:

import requests
from bs4 import BeautifulSoup
import pandas as pd

def main(num):
    dict_ = {
        'date': [],
        'title': [],
        'content': []
    }
    r = requests.get(f"http://www.lavocedellevoci.it/category/inchieste/page/{num}/")
    soup = BeautifulSoup(r.text, 'html.parser')  # specify a parser to avoid the GuessedAtParserWarning
    for article in soup.select('article.border_top'):
        dict_['date'].append(article.select_one('span.homepage_post-date').text)
        dict_['title'].append(article.select_one('h1.homepage_post_title').text)
        dict_['content'].append(article.select_one('p').text)

    return pd.DataFrame(dict_)
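
For example, calling it for the first page returns a DataFrame with one row per article:

df = main(1)
print(df.head())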

Try this:

import requests
from bs4 import BeautifulSoup

r = requests.get("http://www.lavocedellevoci.it/category/inchieste/page/3/")
soup = BeautifulSoup(r.content, 'html.parser')
for tag in soup.select(".contents > div > article"):
    print(tag.select_one("h1.homepage_post_title").string)   # title
    print(tag.select_one("span.homepage_post-date").string)  # date
    print(tag.select_one("a.read-more").parent.contents[0])  # text before the link
