There are several HTML tables from which I am trying to extract the data between the <td> tags.
The HTML structure of each table looks like this:
<td rowspan="2" class="nfvtTitleTop"><b>Delta</b></td></tr><tr><td class="nfvtTitleSTop">USD <span style="color:#808080"><i>(in Million)<i></span></td><td class="nfvtTitleSTop">%</td><td class="nfvtTitleSTop">USD <span style="color:#808080"><i>(in Million)<i></span></td><td class="nfvtTitleSTop">%</td></tr><tr><td class="nfvtTitleLeft">More Personal Computing</td><td class="nfvtR">42,276</td><td class="nfvtR"><i>38.4%</i></td><td class="nfvtR">45,698</td><td class="nfvtR"><i>36.4%</i></td><td class="nfvtR"> <span class='cPos'>+8.09%<span></td></tr><tr><td class="nfvtTitleLeft">Productivity and Business Processes</td><td class="nfvtR">35,865</td><td class="nfvtR"><i>32.6%</i></td><td class="nfvtR">41,160</td><td class="nfvtR"><i>32.8%</i></td><td class="nfvtR"> <span class='cPos'>+14.76%<span></td></tr><tr><td class="nfvtTitleLeft">Intelligent Cloud</td><td class="nfvtR">32,219</td><td class="nfvtR"><i>29.2%</i></td><td class="nfvtR">38,985</td><td class="nfvtR"><i>31.1%</i></td><td class="nfvtR"> <span class='cPos'>+21%<span></td></tr></table>
As you can see, the data is nested inside a larger table. Because of this, I am having trouble working out how to extract it. Below is what I have tried so far:
soup = BeautifulSoup(requests.get(html).content, 'html.parser')
data_all = {}
for table in soup.select("table.tabElemNoBor overfH fvtDiv"):
for tr in table.select('tr'):
row = [td.get_text(strip=True, separator=' ') for td in tr.select('td')]
data_all[tr].append(row)
print(data_all)
This just returns a blank set of {}
Here is the url: https://www.marketscreener.com/MICROSOFT-CORPORATION-4835/company/
I am trying to scrape the data tables on this page if possible. After trying out Aramakus suggestion, this is returning the headers of the tables. So perhaps it is not the tags that I require!
Here is an image of one of the tables.
I did an inspect element on the figures and they appear to be between tags. But when I did something like
for elem in soup.find_all("td"):
print(elem)
EDIT:
Thanks all for the help. I seem to be getting there. If I do
for elem in soup.find_all("td", {"class" : "nfvtR"}):
print(elem)
This seems to return the individual figures. But can I make it so that I return the whole table?
Any help?
This code extracts all the data and saves it to CSV files. I select only the nested tables to make it simpler.
Problem is that tables Sales per Business
, Sales per region
, Equities
have nested columns, so there are fewer headers than columns, which produces an incorrect CSV file. You have to add the missing headers before saving the files to create a correct CSV.
For Sales per Business
, Sales per region
headers are in two rows so I join them using zip()
(and using del
to remove second row)
import requests
from bs4 import BeautifulSoup
import csv
url = 'https://www.marketscreener.com/MICROSOFT-CORPORATION-4835/company/'


def extract_tables(soup):
    """Return the cell text of every nested ``table.nfvtTab`` in *soup*.

    The result is a list of tables; each table is a list of rows and each
    row is a list holding the stripped text of its ``<td>`` cells.
    """
    all_tables = []
    for table in soup.select('table table.nfvtTab'):
        table_rows = []
        for tr in table.select('tr'):
            cells = [td.get_text(strip=True, separator=' ')
                     for td in tr.select('td')]
            table_rows.append(cells)
        all_tables.append(table_rows)
    return all_tables


def merge_header_rows(table):
    """Collapse the two header rows of *table* into one, in place.

    The first row holds one cell per spanning column group and the second
    row holds the sub-column titles, so the raw rows have fewer cells than
    the data rows. Each spanning cell (positions 1 and 3) is repeated so it
    lines up with both of its sub-columns; the value is copied from the row
    itself rather than hard-coded, so the headers stay correct when the
    page shows other years.

    Returns the merged header row, which replaces ``table[0]``; the former
    second row is removed.
    """
    top, sub = table[0], table[1]
    top.insert(2, top[1])
    top.insert(4, top[3])
    # Pad the sub-header row for the label column and the trailing column,
    # neither of which has a sub-title.
    sub.insert(0, '')
    sub.insert(5, '')
    headers = [f'{a} {b}'.strip() for a, b in zip(top, sub)]
    table[0] = headers
    del table[1]
    return headers


def add_equities_headers(table):
    """Insert the missing percentage column titles of the Equities table.

    Modifies ``table[0]`` in place so that it has one title per data column.
    """
    header = table[0]
    header.insert(4, 'Free-Float %')
    header.insert(6, 'Company-owned shares %')


def save_tables(tables):
    """Write each table to ``table<N>.csv`` (N starts at 1)."""
    for number, table in enumerate(tables, 1):
        # newline='' is what the csv module asks for; without it extra
        # blank lines can appear on platforms that translate line endings.
        with open(f'table{number}.csv', 'w', newline='',
                  encoding='utf-8') as f:
            csv.writer(f).writerows(table)


def main():
    """Download the page, repair the table headers, print and save them."""
    # Pass headers={'user-agent': 'Mozilla/5.0'} here if the site rejects
    # the default client.
    r = requests.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')
    all_tables = extract_tables(soup)

    # Tables 0 and 1 are "Sales per Business" and "Sales per region":
    # both have their headers spread over two rows.
    for index in (0, 1):
        print('new:', merge_header_rows(all_tables[index]))

    # Table 3 is "Equities".
    add_equities_headers(all_tables[3])

    for number, table in enumerate(all_tables, 1):
        print('---', number, '---')
        for row in table:
            print(row)

    save_tables(all_tables)


if __name__ == '__main__':
    main()
Result:
new: ['', '2018 USD (in Million)', '2018 %', '2019 USD (in Million)', '2019 %', 'Delta']
new: ['', '2018 USD (in Million)', '2018 %', '2019 USD (in Million)', '2019 %', 'Delta']
--- 1 ---
['', '2018 USD (in Million)', '2018 %', '2019 USD (in Million)', '2019 %', 'Delta']
['More Personal Computing', '42,276', '38.4%', '45,698', '36.4%', '+8.09%']
['Productivity and Business Processes', '35,865', '32.6%', '41,160', '32.8%', '+14.76%']
['Intelligent Cloud', '32,219', '29.2%', '38,985', '31.1%', '+21%']
--- 2 ---
['', '2018 USD (in Million)', '2018 %', '2019 USD (in Million)', '2019 %', 'Delta']
['United States', '55,926', '50.8%', '64,199', '51.2%', '+14.79%']
['Other Countries', '54,434', '49.4%', '61,644', '49.1%', '+13.25%']
--- 3 ---
['Name', 'Age', 'Since', 'Title']
['Satya Nadella', '52', '2014', 'Chief Executive Officer & Non-Independent Director']
['Bradford Smith', '60', '2015', 'President & Chief Legal Officer']
['John Thompson', '69', '2014', 'Independent Chairman']
['Kirk Koenigsbauer', '51', '2020', 'COO & VP-Experiences & Devices Group']
['Amy E. Hood', '47', '2013', 'Chief Financial Officer & Executive Vice President']
['James Kevin Scott', '54', '-', 'Chief Technology Officer & Executive VP']
['John W. Stanton', '64', '2014', 'Independent Director']
['Teri L. List-Stoll', '57', '2014', 'Independent Director']
['Charles Scharf', '53', '2014', 'Independent Director']
['Sandra E. Peterson', '60', '2015', 'Independent Director']
--- 4 ---
['', 'Vote', 'Quantity', 'Free-Float', 'Free-Float %', 'Company-owned shares', 'Company-owned shares %', 'Total Float']
['Stock A', '1', '7,583,440,247', '7,475,252,172', '98.6%', '0', '0.0%', '98.6%']
--- 5 ---
['Name', 'Equities', '%']
['The Vanguard Group, Inc.', '603,109,511', '7.95%']
['Capital Research & Management Co.', '556,573,400', '7.34%']
['SSgA Funds Management, Inc.', '314,771,248', '4.15%']
['Fidelity Management & Research Co.', '221,883,722', '2.93%']
['BlackRock Fund Advisors', '183,455,207', '2.42%']
['T. Rowe Price Associates, Inc. (Investment Management)', '172,056,401', '2.27%']
['Capital Research & Management Co. (World Investors)', '139,116,236', '1.83%']
['Putnam LLC', '121,797,960', '1.61%']
['Geode Capital Management LLC', '115,684,966', '1.53%']
['Capital Research & Management Co. (International Investors)', '103,523,946', '1.37%']
Code which I used to test CSV files:
import pandas as pd
# Read every generated CSV back and print it as a quick visual check.
# Tables 1, 2 and 4 have an untitled first column of row labels, so that
# column is used as the index; tables 3 and 5 are read with default options.
read_options = {
    1: {'index_col': 0},
    2: {'index_col': 0},
    3: {},
    4: {'index_col': 0},
    5: {},
}
for number, options in read_options.items():
    df = pd.read_csv(f'table{number}.csv', **options)
    print(df)
I don't mean to change your mind about Beautiful Soup, it is a great tool... But in my personal opinion, Python has a much more elegant built-in solution in the html.parser module.
Here's how you'd solve this using htmlparser.
ipstr = """
<table width="100%" cellspacing="0" cellpadding="0" class="tabElemNoBor overfH fvtDiv"><tr><td>
<table class="tabTitleWhite" cellpadding="0" cellspacing="0">
<tr><td class="tabTitleLeftWhite"><nobr><b>Sales per Business</b></nobr></td></tr>
</table>
</td><!-- inner td --></tr>
<tr><td class="std_txt th_inner center" style="padding-top:4px"><table width="100%" border="0" cellpadding="0" cellspacing="0" class="nfvtTab">
<colgroup>
<col width="40%">
<col width="15%">
</colgroup>
<tr><td rowspan="2" style="border:0px"></td><td colspan="2" class="nfvtTitleTop"><b>2019</b></td>
"""
from html.parser import HTMLParser
class ExtractDataFromBTag(HTMLParser):
    """Print and collect the text found inside ``<b>`` elements.

    Attributes:
        found_b: True while the parser is between ``<b>`` and ``</b>``.
        b_texts: every text chunk seen inside a ``<b>`` element, in
            document order.
    """

    def __init__(self):
        super().__init__()
        self.found_b = False
        self.b_texts = []

    def handle_starttag(self, tag, attr):
        """Start capturing text when a ``<b>`` tag opens."""
        if tag == "b":
            self.found_b = True

    def handle_data(self, data):
        """Print and record *data* if it lies inside a ``<b>`` element."""
        if self.found_b:
            print(data)
            # Keep the text as well, so callers can use it after feed().
            self.b_texts.append(data)

    def handle_endtag(self, tag):
        """Stop capturing text when the ``</b>`` tag closes."""
        if tag == "b":
            self.found_b = False
eg = ExtractDataFromBTag()
eg.feed(ipstr)
Try
select('.tabElemNoBor b')
Example
from bs4 import BeautifulSoup
html = """
<table width="100%" cellspacing="0" cellpadding="0" class="tabElemNoBor overfH fvtDiv"><tr><td>
<table class="tabTitleWhite" cellpadding="0" cellspacing="0">
<tr><td class="tabTitleLeftWhite"><nobr><b>Sales per Business</b></nobr></td></tr>
</table>
</td><!-- inner td --></tr>
<tr><td class="std_txt th_inner center" style="padding-top:4px"><table width="100%" border="0" cellpadding="0" cellspacing="0" class="nfvtTab">
<colgroup>
<col width="40%">
<col width="15%">
</colgroup>
<tr><td rowspan="2" style="border:0px"></td><td colspan="2" class="nfvtTitleTop"><b>2019</b></td>
"""
# Parse the sample markup and print the text of every <b> element that is
# a descendant of an element with the class "tabElemNoBor".
soup = BeautifulSoup(html, 'html.parser')
bold_tags = soup.select('.tabElemNoBor b')
for tag in bold_tags:
    print(tag.text)
should print
Sales per Business 2019
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.