
Scraping data from a website: issues with tag identification

I am trying to build a dataframe with the Date, Title and Content of a website. To scrape this information, I am doing the following:

import requests
from bs4 import BeautifulSoup
import pandas as pd


def main(req, num):
    r = req.get("http://www.lavocedellevoci.it/category/inchieste/page/{}/".format(num))
    soup = BeautifulSoup(r.content, 'html.parser')
    for tag in soup.select(".contents"):
        print(tag.select_one(".homepage_post_title auto-height td").text)
        print(tag.select_one(".homepage_post-date td-module-date a").text)
        print(tag.find_next(class_="col-sm-8 nopadding").text.strip())
    
    return tag.select_one(".homepage_post_title auto-height homepage_post-date td-module-date a").text,text, tag.find_next(class_="col-sm-8 nopadding").text.strip()

It seems there is a problem with the tags: nothing is printed. If you could tell me what is wrong, I would appreciate it.

The following grabs each investigation, converts the dates into actual date objects, and then visits each article page to get the associated text. It uses Session for the efficiency of TCP connection re-use.

In your original script, .contents matches a single parent node rather than the child articles. Later, you neglect to join the multi-value class in the CSS selector: .homepage_post_title auto-height td should be .homepage_post_title.auto-height td, where the separate class values are joined by "." so they are not treated as type selectors. It is faster, and generally more robust, to pick a single, stable-looking class from the multi-value set and use that, as shown below.
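
For example, here is a minimal illustration of the difference, using made-up markup:

from bs4 import BeautifulSoup

html = '<h1 class="homepage_post_title auto-height"><td>Title</td></h1>'
soup = BeautifulSoup(html, 'html.parser')

# ".homepage_post_title auto-height td" looks for a <td> inside an element
# whose tag name is "auto-height" (a type selector), so it matches nothing:
print(soup.select('.homepage_post_title auto-height td'))   # []

# Joining the class values with "." makes a compound selector on one element:
print(soup.select('.homepage_post_title.auto-height td'))   # [<td>Title</td>]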

import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import datetime

def get_date(date_string):
    # Convert an Italian date string such as '21 gennaio 2020' to a datetime.date
    date_parts = date_string.split(' ')
    article_date = '-'.join([date_parts[-1], month_numbers[date_parts[1].lower()], date_parts[0].zfill(2)])
    return datetime.datetime.strptime(article_date, "%Y-%m-%d").date()

month_numbers = {
    'gennaio': '01',
    'febbraio': '02',
    'marzo': '03',
    'aprile': '04',
    'maggio': '05',
    'giugno': '06',
    'luglio': '07',
    'agosto': '08',
    'settembre': '09',
    'ottobre': '10',
    'novembre': '11',
    'dicembre': '12',
}
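
For example, assuming the site renders dates like "21 gennaio 2020", the helper converts them as follows:

>>> get_date('21 gennaio 2020')
datetime.date(2020, 1, 21)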

def main(page):
    
    results = []

    with requests.Session() as s:

        soup =  bs(s.get(f'http://www.lavocedellevoci.it/category/inchieste/page/{page}').content, 'lxml')

        for article in soup.select('article'):  # soup.select('article:has(a:contains("Inchieste"))') if you need to be more restrictive in future
            title = article.select_one('h1').text
            date = get_date(article.select_one('.homepage_post-date').text)
            link = article.select_one('.read-more')['href']
            soup2 =  bs(s.get(link).content, 'lxml')
            text = '\n'.join([i.text for i in soup2.select('article p:not([class])')])
            results.append([title, date, text])

        df = pd.DataFrame(results, columns = ['Title', 'Date', 'Content'])
        print(df)
        
if __name__ == '__main__':
    main(1)

Read more about CSS selectors here: https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors


You could introduce a while loop to get all pages, which stops when the class .next (associated with the Successivi link) is no longer present, or stop after n pages:

import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import datetime

def get_date(date_string):
    # Convert an Italian date string such as '21 gennaio 2020' to a datetime.date
    date_parts = date_string.split(' ')
    article_date = '-'.join([date_parts[-1], month_numbers[date_parts[1].lower()], date_parts[0].zfill(2)])
    return datetime.datetime.strptime(article_date, "%Y-%m-%d").date()

month_numbers = {
    'gennaio': '01',
    'febbraio': '02',
    'marzo': '03',
    'aprile': '04',
    'maggio': '05',
    'giugno': '06',
    'luglio': '07',
    'agosto': '08',
    'settembre': '09',
    'ottobre': '10',
    'novembre': '11',
    'dicembre': '12',
}

next_page = True
final = pd.DataFrame()

def main(page):
    global final
    global next_page
    results = []

    with requests.Session() as s:

        soup =  bs(s.get(f'http://www.lavocedellevoci.it/category/inchieste/page/{page}').content, 'lxml')
    
        for article in soup.select('article'):  # soup.select('article:has(a:contains("Inchieste"))') if you need to be more restrictive in future
            title = article.select_one('h1').text
            date = get_date(article.select_one('.homepage_post-date').text)
            link = article.select_one('.read-more')['href']
            soup2 =  bs(s.get(link).content, 'lxml')
            text = '\n'.join([i.text for i in soup2.select('article p:not([class])')])
            results.append([title, date, text])

        df = pd.DataFrame(results, columns = ['Title', 'Date', 'Content'])
        
        # Append this page's rows to the accumulated frame
        if final.empty:
            final = df
        else:
            final = pd.concat([final, df], sort = False)
        
        next_page = soup.select_one('.next') is not None
        
if __name__ == '__main__':
    page = 1

    while next_page: # page < 3: 
        main(page)
        page+=1
        
    final = final.sort_values('Date').reset_index(drop=True)  # drop=True avoids keeping the old index as a column
    print(final)
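
As a design note, the two globals can be avoided by returning both the page's DataFrame and a has-next flag from a single function. Here is a minimal sketch of that refactor (scrape_page is a hypothetical name; it reuses the get_date helper and the same selectors as above):

import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

def scrape_page(session, page):
    soup = bs(session.get(f'http://www.lavocedellevoci.it/category/inchieste/page/{page}').content, 'lxml')
    results = []
    for article in soup.select('article'):
        title = article.select_one('h1').text
        date = get_date(article.select_one('.homepage_post-date').text)
        link = article.select_one('.read-more')['href']
        soup2 = bs(session.get(link).content, 'lxml')
        text = '\n'.join(i.text for i in soup2.select('article p:not([class])'))
        results.append([title, date, text])
    df = pd.DataFrame(results, columns=['Title', 'Date', 'Content'])
    return df, soup.select_one('.next') is not None  # has-next flag

with requests.Session() as s:
    frames, page, has_next = [], 1, True
    while has_next:
        df, has_next = scrape_page(s, page)
        frames.append(df)
        page += 1
    final = pd.concat(frames, sort=False).sort_values('Date').reset_index(drop=True)
    print(final)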

So, my solution:

import requests
from bs4 import BeautifulSoup
import pandas as pd

def main(num):
    dict_ = {
        'date': [],
        'title': [],
        'content': []
    }
    r = requests.get(f"http://www.lavocedellevoci.it/category/inchieste/page/{num}/")
    soup = BeautifulSoup(r.text, 'html.parser')  # specify a parser to avoid the GuessedAtParserWarning
    for article in soup.select('article.border_top'):
        dict_['date'].append(article.select_one('span.homepage_post-date').text)
        dict_['title'].append(article.select_one('h1.homepage_post_title').text)
        dict_['content'].append(article.select_one('p').text)

    return pd.DataFrame(dict_)
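
For example, calling it for the first page returns a DataFrame with one row per article:

df = main(1)
print(df.head())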

Try this:

import requests
from bs4 import BeautifulSoup

r = requests.get("http://www.lavocedellevoci.it/category/inchieste/page/3/")
soup = BeautifulSoup(r.content, 'html.parser')
for tag in soup.select(".contents > div > article"):
    print(tag.select_one("h1.homepage_post_title").string)   # title
    print(tag.select_one("span.homepage_post-date").string)  # date
    print(tag.select_one("a.read-more").parent.contents[0])  # text before the link
