I'm trying to scrape an Indonesian news website. The content I scrape is the list of recent popular news on the site. The output looks like this:
And this is my code :
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
# Fetch the Kompas front page. The timeout keeps the script from hanging
# forever on a stalled connection, and raise_for_status() stops early on an
# HTTP error instead of parsing an error page.
kompas = requests.get('https://www.kompas.com/', timeout=30)
kompas.raise_for_status()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
beautify = BeautifulSoup(kompas.content, 'html.parser')

# Filter by CSS class with class_=. Writing {'class', '...'} would build a
# set literal, not an attribute dict.
news = beautify.find_all('div', class_='most__list clearfix')

# One dict per matched entry; the keys become the CSV column headers.
arti = []
for each in news:
    nu = each.find('div', class_='most__count').text
    title = each.find('h4', class_='most__title').text
    # href of the first <a> inside the entry.
    lnk = each.a.get('href')
    rcount = each.find('div', class_='most__read').text
    print(nu)
    print(title)
    print(lnk)
    print(rcount)
    arti.append({
        'Top Number': nu,
        'Headline': title,
        'Link': lnk,
        'Most Read': rcount,
    })

# Write all collected rows to kompas.csv, without the DataFrame index column.
df = pd.DataFrame(arti)
df.to_csv('kompas.csv', index=False)
What I actually want is not just the headline, link, and read count as output; I want the article text too. But the article is not part of the main page's content, so I have to follow each link to see it. Any help would be appreciated.
This should help you:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
# Fetch the Kompas front page. The timeout keeps the script from hanging
# forever on a stalled connection, and raise_for_status() stops early on an
# HTTP error instead of parsing an error page.
kompas = requests.get('https://www.kompas.com/', timeout=30)
kompas.raise_for_status()
beautify = BeautifulSoup(kompas.content, 'html5lib')

# Filter by CSS class with class_=. Writing {'class', '...'} would build a
# set literal, not an attribute dict.
news = beautify.find_all('div', class_='most__list clearfix')

# One dict per matched entry; the keys become the CSV column headers.
arti = []
for each in news:
    nu = each.find('div', class_='most__count').text
    title = each.find('h4', class_='most__title').text
    # href of the first <a> inside the entry.
    lnk = each.a.get('href')
    rcount = each.find('div', class_='most__read').text

    # The article text is not on the front page, so request the linked page
    # and read it from the div with class "read__content".
    r = requests.get(lnk, timeout=30)
    soup = BeautifulSoup(r.text, 'html5lib')
    body = soup.find('div', class_='read__content')
    # A linked page without that div gets an empty 'Content' cell instead of
    # raising AttributeError and losing every row collected so far.
    content = body.text.strip() if body is not None else ''

    print(nu)
    print(title)
    print(lnk)
    print(rcount)
    arti.append({
        'Top Number': nu,
        'Headline': title,
        'Link': lnk,
        'Most Read': rcount,
        'Content': content,
    })

# Write all collected rows to kompas.csv, without the DataFrame index column.
df = pd.DataFrame(arti)
df.to_csv('kompas.csv', index=False)
Screenshot of the CSV file:
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.