I am trying to convert multiple HTML tables into a pandas DataFrame. For this task I've defined a function that should return all of these HTML tables as a single pandas DataFrame.
However, the function returns an empty list []
when the idea is that it returns a pandas DataFrame.
Here's what I've tried so far:
import requests
from bs4 import BeautifulSoup
import lxml
import html5lib
import pandas as pd
import string
### Build one listing URL per uppercase ASCII letter (A-Z). ###
first_url = 'https://www.salario.com.br/tabela-salarial/?cargos='
second_url = '#listaSalarial'
allTheLetters = string.ascii_uppercase
# Each link is prefix + letter + fragment, in alphabetical order.
links = [first_url + letter + second_url for letter in allTheLetters]
### Fetch each link, dump its <table class="listas"> elements to an HTML file, and read that file with pandas. ###
# NOTE(review): indentation was lost in this listing, so the nesting of the
# statements below (loop body, `with` body, position of `return`) cannot be
# confirmed from the text alone.
def getUrlTables(links):
for link in links:
# Request the page, parse it with html.parser, and collect every table tagged class="listas".
page = requests.get(link)
soup = BeautifulSoup(page.content, 'html.parser')
tab_div = soup.find_all('table', {'class':'listas'})
# Write the string form of the find_all() result to a fixed file name.
# NOTE(review): the same path is opened in "w" mode every time this runs, so
# whatever was written for a previous link is replaced.
with open('listas_salariales.html', "w") as file:
file.write(str(tab_div))
# NOTE(review): `file.close` is only referenced, never called; the `with`
# statement is what actually closes the file.
file.close
# Read the file written above back through pandas.read_html.
# NOTE(review): presumably this `return` sits inside the loop, in which case
# only the first link is ever processed - confirm against the original code.
tables=pd.read_html('listas_salariales.html')
return tables
# Call the function with the links built above (the return value is not stored).
getUrlTables(links)
[]
Am I missing something in getUrlTables()?
Is there an easier way to accomplish this task?
The following code fetches the HTML from each of the links, parses it to extract the table data, and builds one large combined DataFrame. (I have not stored the intermediate DataFrames to disk, which might be needed if the tables become too large.)
import requests
from bs4 import BeautifulSoup
import lxml
import html5lib
import pandas as pd
import string
### One salary-table URL for every letter A-Z. ###
first_url = 'https://www.salario.com.br/tabela-salarial/?cargos='
second_url = '#listaSalarial'
allTheLetters = string.ascii_uppercase
# Same order as the alphabet: ...?cargos=A#listaSalarial through ...?cargos=Z#listaSalarial.
links = [f'{first_url}{letter}{second_url}' for letter in allTheLetters]
### Functions that download each page and turn its "listas" table into a DataFrame. ###
def _parse_listas_table(soup):
    """Return the first ``<table class="listas">`` in *soup* as a DataFrame.

    Column names come from the ``<th>`` cells of the first header row. Each
    body row is read by looking up the ``<td>`` whose ``data-label`` attribute
    equals the column name. Returns ``None`` when the table, its header row,
    or its body is missing.
    """
    table = soup.find('table', attrs={'class': 'listas'})
    if table is None:
        return None

    thead = table.find('thead')
    tbody = table.find('tbody')
    header_row = thead.find('tr') if thead is not None else None
    if header_row is None or tbody is None:
        return None

    colnames = [hdr.text for hdr in header_row.find_all('th')]
    data = {col: [] for col in colnames}
    for rw in tbody.find_all('tr'):
        cells = [rw.find('td', attrs={'data-label': col}) for col in colnames]
        # A row with none of the expected cells carries no data; skip it.
        if all(cell is None for cell in cells):
            continue
        for col, cell in zip(colnames, cells):
            # A single missing cell becomes None, so every column keeps the
            # same length and the rest of the page is still usable.
            data[col].append(cell.text if cell is not None else None)
    return pd.DataFrame.from_dict(data)


def getUrlTables(links, master_df, *, timeout=30):
    """Download every page in *links* and append its table rows to *master_df*.

    Args:
        links: Iterable of page URLs to fetch.
        master_df: DataFrame the scraped rows are appended to. It is not
            modified in place.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A new DataFrame holding the rows of *master_df* followed by the rows
        of every page that was parsed, with a fresh 0..n-1 index. If no page
        yields a table, *master_df* itself is returned unchanged.

    A link whose request fails, or whose page has no usable table, is
    reported with ``print`` and skipped; the remaining links are still
    processed.
    """
    frames = []
    for link in links:
        try:
            page = requests.get(link, timeout=timeout)
        except requests.RequestException as exc:
            print('Request failed for the link: {} ({})'.format(link, exc))
            continue

        soup = BeautifulSoup(page.content, 'lxml')  # using the lxml parser
        df = _parse_listas_table(soup)
        if df is None:
            print('No data from the link: {}'.format(link))
            continue
        frames.append(df)

    if not frames:
        return master_df
    # Concatenate once at the end instead of re-copying the growing frame
    # on every iteration.
    return pd.concat([master_df, *frames], ignore_index=True)
# Start from an empty frame, replace it with the frame getUrlTables() returns
# for all the links, then print the result.
master_df = pd.DataFrame()
master_df = getUrlTables(links, master_df)
print(master_df)
The output from the above code is as follows:
CBO Cargo ... Teto Salarial Salário Hora
0 612510 Abacaxicultor ... 2.116,16 6,86
1 263105 Abade ... 5.031,47 17,25
2 263105 Abadessa ... 5.031,47 17,25
3 622020 Abanador na Agricultura ... 2.075,81 6,27
4 862120 Abastecedor de Caldeira ... 3.793,98 11,65
... ... ... ... ... ...
9345 263110 Zenji (missionário) ... 3.888,52 12,65
9346 723235 Zincador ... 2.583,20 7,78
9347 203010 Zoologista ... 4.615,45 14,21
9348 203010 Zoólogo ... 4.615,45 14,21
9349 223310 Zootecnista ... 5.369,59 16,50
[9350 rows x 8 columns]
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.