简体   繁体   中英

Converting a HTML table to a CSV in Python

I am trying to convert a table in HTML to a csv in Python. The table I am trying to extract is this one:

<table class="tblperiode">
    <caption>Dades de per&iacute;ode</caption>
    <tr>
        <th class="sortable"><span class="tooltip" title="Per&iacute;ode (Temps Universal)">Per&iacute;ode</span><br/>TU</th>                   
            <th><span class="tooltip" title="Temperatura mitjana (&deg;C)">TM</span><br/>&deg;C</th> 
            <th><span class="tooltip" title="Temperatura m&agrave;xima (&deg;C)">TX</span><br/>&deg;C</th>
            <th><span class="tooltip" title="Temperatura m&iacute;nima (&deg;C)">TN</span><br/>&deg;C</th>
            <th><span class="tooltip" title="Humitat relativa mitjana (%)">HRM</span><br/>%</th>
            <th><span class="tooltip" title="Precipitaci&oacute; (mm)">PPT</span><br/>mm</th>
            <th><span class="tooltip" title="Velocitat mitjana del vent (km/h)">VVM (10 m)</span><br/>km/h</th>
            <th><span class="tooltip" title="Direcci&oacute; mitjana del vent (graus)">DVM (10 m)</span><br/>graus</th>
            <th><span class="tooltip" title="Ratxa m&agrave;xima del vent (km/h)">VVX (10 m)</span><br/>km/h</th>
            <th><span class="tooltip" title="Irradi&agrave;ncia solar global mitjana (W/m2)">RS</span><br/>W/m<sup>2</sup></th>
    </tr>
            <tr>
                <th>
                            00:00 - 00:30            
                </th>
                                <td>16.2</td>
                                <td>16.5</td>
                                <td>15.4</td>
                                <td>93</td>
                                <td>0.0</td>
                                <td>6.5</td>
                                <td>293</td>
                                <td>10.4</td>
                                <td>0</td>
            </tr>
            <tr>
                <th>
                            00:30 - 01:00
                </th>
                                <td>16.4</td>
                                <td>16.5</td>
                                <td>16.1</td>
                                <td>90</td>
                                <td>0.0</td>
                                <td>5.8</td>
                                <td>288</td>
                                <td>8.6</td>
                                <td>0</td>
            </tr>

And I want it to look something like this:

在此处输入图片说明

To achieve this, I parsed the HTML and managed to build a DataFrame that contains the data correctly, as follows:

import csv

import pandas as pd
from bs4 import BeautifulSoup

# Read the page inside a context manager so the file handle is closed
# as soon as the content has been read.
with open("table.html") as html_file:
    html = html_file.read()

# Name the parser explicitly so the result does not depend on which
# parser libraries happen to be installed.
soup = BeautifulSoup(html, "html.parser")
table = soup.select_one("table.tblperiode")

# Collect the text of every <td> cell, one list per <tr>.
# The header row contains only <th> cells, so it yields an empty list,
# and the leading <th> of each data row (the time interval) is not captured.
output_rows = []
for table_row in table.find_all('tr'):
    output_row = [column.text for column in table_row.find_all('td')]
    output_rows.append(output_row)

df = pd.DataFrame(output_rows)
print(df)

However, I would also like to have the column names and a column indicating the time interval. In the HTML example above only two intervals appear, 00:00 - 00:30 and 00:30 - 01:00, so my table should have two rows: one with the observations for 00:00 - 00:30 and another with the observations for 00:30 - 01:00.

How could I get this information from my HTML?

Here's a way of doing it. It's probably not the nicest way, but it works! You can read through the comments to figure out what the code is doing.

from bs4 import BeautifulSoup
import csv

# Read the HTML; the context manager closes the file once it has been read.
with open("table.html") as html_file:
    html = html_file.read()
soup = BeautifulSoup(html, 'html.parser')

# Get the table from the HTML.
table = soup.select_one("table.tblperiode")

# Find all rows.
rows = table.find_all('tr')

# The first row holds the column headers; keep the text of each <th>.
headers = rows[0]
header_text = [th.text for th in headers.find_all('th')]

# Every remaining row becomes a list of cell texts. <th> is collected
# together with <td> because each data row carries its time interval in a
# leading <th> cell.
row_text_array = []
for row in rows[1:]:
    # Drop the newlines and padding that surround the cell text in the markup.
    row_text = [
        row_element.text.replace('\n', '').strip()
        for row_element in row.find_all(['th', 'td'])
    ]
    row_text_array.append(row_text)

# Write the CSV. newline="" leaves line endings to the csv module; without
# it an extra "\r" is written on platforms that use "\r\n", which shows up
# as a blank line between rows.
with open("out.csv", "w", newline="") as f:
    wr = csv.writer(f)
    wr.writerow(header_text)
    wr.writerows(row_text_array)

With this script:

import csv
from bs4 import BeautifulSoup

# Read the page; the context manager closes the file once it has been read.
with open('table.html') as html_file:
    html = html_file.read()
soup = BeautifulSoup(html, features='lxml')
table = soup.select_one('table.tblperiode')

# Skip the first <tr> (the header row). Each remaining row becomes the
# interval from its leading <th>, with runs of whitespace collapsed to single
# spaces, followed by the text of its <td> cells.
rows = []
for table_row in table.find_all('tr')[1:]:
    periode = [' '.join(table_row.find_all('th')[0].text.split())]
    data = [x.text for x in table_row.find_all('td')]
    rows.append(periode + data)

# NOTE(review): this list names 11 columns, while the sample table has only
# 10 (it has no 'PM' column) -- confirm against the real page.
header = ['Periode', 'TM', 'TX', 'TN', 'HRM', 'PPT', 'VVM', 'DVM', 'VVX', 'PM', 'RS']
with open('result.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(header)
    w.writerows(rows)

I managed to generate the following CSV file as output:

Periode,TM,TX,TN,HRM,PPT,VVM,DVM,VVX,PM,RS
00:00 - 00:30,16.2,16.5,15.4,93,0.0,6.5,293,10.4,0
00:30 - 01:00,16.4,16.5,16.1,90,0.0,5.8,288,8.6,0
import csv
from bs4 import BeautifulSoup
import pandas as pd

# Read the page; the context manager closes the file once it has been read.
with open('test.html') as html_file:
    html = html_file.read()
soup = BeautifulSoup(html, features='lxml')
# Select the table to export by its CSS class.
# Example: <table class="queryResults" border="0" cellspacing="1">
table = soup.select_one('table.queryResults')

def get_all_tables(soup):
    """Return every ``<table>`` element found in *soup*."""
    all_tables = soup.find_all("table")
    return all_tables


# Print every table in the document preceded by its 1-based position, so the
# one to export can be picked out by eye.
tbls = get_all_tables(soup)
for i, tablen in enumerate(tbls, start=1):
    print(i, tablen, sep="\n")

def get_table_headers(table):
    """Return the stripped text of each ``<th>`` in the table's first ``<tr>``."""
    first_row = table.find("tr")
    return [cell.text.strip() for cell in first_row.find_all("th")]

# Extract the column names from the first row of the selected table.
head = get_table_headers(table)
# Uncomment to inspect the extracted headers.
#print(head)

def get_table_rows(table):
    """Return the cell texts of every row after the first one.

    The first ``<tr>`` is skipped because it is treated as the header row.
    Each remaining row becomes a list holding the stripped text of its
    ``<th>`` and ``<td>`` cells, in document order.

    Rows made up only of ``<td>`` cells or only of ``<th>`` cells come out
    the same as before. A row that mixes both (a ``<th>`` row label followed
    by ``<td>`` data) now keeps its label; dropping it left the row one cell
    shorter than the header list.
    """
    rows = []
    for tr in table.find_all("tr")[1:]:
        # One query for both tag names keeps the cells in document order.
        cells = [cell.text.strip() for cell in tr.find_all(["th", "td"])]
        rows.append(cells)
    return rows

# Extract the cell texts of every row after the header row.
table_rows = get_table_rows(table)
# Uncomment to inspect the extracted rows.
#print(table_rows)

def save_as_csv(table_name, headers, rows):
    """Write *rows* under the column names *headers* to ``<table_name>.csv``.

    The data goes through a pandas DataFrame and is written with
    ``to_csv``'s default settings, so the DataFrame index is written as an
    unnamed first column.
    """
    frame = pd.DataFrame(rows, columns=headers)
    destination = f"{table_name}.csv"
    frame.to_csv(destination)

# Write the extracted headers and rows to "Test_table.csv".
save_as_csv("Test_table", head, table_rows)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM