I am new to Python and have started a little project that needs some web scraping. I started using BS4, but I am getting a little stuck while trying to convert an HTML table whose cells span several columns into a list of lists (in Python 3).
I want to convert this HTML table into a list of lists so that I can print it in text mode with terminaltables. So, I was trying to add some empty list cells to fill the rest of the row wherever there is an HTML cell that spans 5 columns.
I think I am probably overcomplicating something that could be done a lot easier in (fluent) Python. Can anyone help?
My code at this time:
#!/usr/local/bin/python3
# encoding: utf-8
# just did a lot of experiments, so I will need to clean these imports! (some of them are related to the rest of the project anyway)
import sys
import os
import os.path
import csv
import re
from textwrap import fill as tw_fill
from random import randint
from datetime import datetime, timedelta
from copy import deepcopy
from platform import node
from colorclass import Color
from urllib3 import PoolManager
from bleach import clean
from bs4 import BeautifulSoup
from terminaltables import SingleTable
def _expandir_celula(celula):
    """Return the cell's text followed by one "" filler per extra spanned column.

    A ``<td colspan="5">`` therefore yields five list items, so every row of
    the resulting list of lists keeps the same number of columns.
    """
    # get_text() is used instead of .string because .string is None whenever
    # the cell holds more than one child node, and clean(None) would fail.
    texto = clean(celula.get_text(), tags=[], strip=True).strip()
    try:
        colunas = int(celula.get("colspan", 1))
    except (TypeError, ValueError):
        colunas = 1  # malformed colspan: treat it as an ordinary cell
    return [texto] + [""] * (max(colunas, 1) - 1)


def obter_estado_detalhado(tracking_code):
    """Print the detailed tracking status of a CTT shipment as a text table.

    The second <table> of the CTT result page is converted into a list of
    lists (header row first) and rendered with terminaltables.

    Ex: obter_estado_detalhado("EA746000000PT")

    Args:
        tracking_code: CTT object code, appended verbatim to the query URL.

    Note:
        The function currently terminates the interpreter after printing
        (testing aid kept from the original script).
    """
    ctt_url = (
        "http://www.cttexpresso.pt/feapl_2/app/open/cttexpresso/objectSearch/objectSearch.jspx?lang=def&objects="
        + tracking_code
        + "&showResults=true"
    )
    cabecalho = ["Hora", "Estado", "Motivo", "Local", "Recetor"]
    records = [cabecalho]

    http = PoolManager()
    r = http.urlopen('GET', ctt_url, preload_content=False)
    try:
        soup = BeautifulSoup(r, "html.parser")
    finally:
        # The body was streamed (preload_content=False), so the connection
        # has to be handed back to the pool explicitly.
        r.release_conn()

    # Without a second table (e.g. no results for this code) only the header
    # row is printed instead of failing with IndexError.
    tabelas = soup.find_all('table')
    linhas = tabelas[1].find_all('tr')[1:] if len(tabelas) > 1 else []

    for linha in linhas:
        registo = []
        # Every <td> is kept: in the sample markup each data row carries one
        # <td> per header column, and date rows hold a single spanning <td>.
        for celula in linha.find_all('td'):
            registo.extend(_expandir_celula(celula))
        if not registo:
            continue  # skip empty <tr></tr> separators
        # Pad short rows with empty cells up to the header width.
        registo.extend([""] * (len(cabecalho) - len(registo)))
        records.append(registo)

    tabela = SingleTable(records)
    print(tabela.table)
    print(records)
    sys.exit()  # This exit is only for testing purposes...


obter_estado_detalhado("EA746813946PT")
Sample HTML code (as in this link) :
<table class="full-width">
<thead>
<tr>
<th>
Nº de Objeto
</th>
<th>
Produto
</th>
<th>
Data
</th>
<th>
Hora
</th>
<th>
Estado
</th>
<th>
Info
</th>
</tr>
</thead>
<tbody><tr>
<td>
EA746813813PT
</td>
<td>19</td>
<td>2016/03/31</td>
<td>09:40</td>
<td>
Objeto entregue
</td>
<td class="truncate">
<a id="detailsLinkShow_0" onclick="toggleObjectDetails('0', true);" class="hide">[+]Info</a>
<a id="detailsLinkHide_0" class="" onclick="toggleObjectDetails('0', false);">[-]Info</a>
</td>
</tr>
<tr></tr>
<tr id="details_0" class="">
<td colspan="6">
<div class="full-width-table-scroller"><table class="full-width">
<thead>
<tr>
<th>Hora</th>
<th>Estado</th>
<th>Motivo</th>
<th>Recetor</th>
</tr>
</thead>
<tbody><tr>
</tr>
<tr class="group">
<td colspan="5">quinta-feira, 31 Março 2016</td>
</tr><tr><td>09:40</td>
<td>Entrega conseguida</td>
<th>Local</th><td>-</td>
<td>4470 - MAIA</td>
<td>DONIEL MARQUES</td>
</tr>
<tr>
<td>08:32</td>
<td>Em distribuição</td>
<td>-</td>
<td>4470 - MAIA</td>
<td>-</td>
</tr>
<tr>
<td>08:29</td>
<td>Receção no local de entrega</td>
<td>-</td>
<td>4470 - MAIA</td>
<td>-</td>
</tr>
<tr>
<td>08:29</td>
<td>Receção nacional</td>
<td>-</td>
<td>4470 - MAIA</td>
<td>-</td>
</tr>
<tr>
<td>00:17</td>
<td>Envio</td>
<td>-</td>
<td>C. O. PERAFITA</td>
<td>-</td>
</tr>
<tr>
</tr><tr class="group">
<td colspan="5">quarta-feira, 30 Março 2016</td>
</tr>
<tr><td>23:40</td>
<td>Expedição nacional</td>
<td>-</td>
<td>C.O. PERAFITA (OPE)</td>
<td>-</td>
</tr>
<tr>
<td>20:39</td>
<td>Receção no local de entrega</td>
<td>-</td>
<td>C. O. PERAFITA</td>
<td>-</td>
</tr>
<tr>
<td>20:39</td>
<td>Receção nacional</td>
<td>-</td>
<td>C. O. PERAFITA</td>
<td>-</td>
</tr>
<tr>
<td>20:39</td>
<td>Aceitação</td>
<td>-</td>
<td>C. O. PERAFITA</td>
<td>-</td>
</tr>
</tbody></table></div>
</td>
</tr>
</tbody></table>
This matches the main table output:
import requests
from bs4 import BeautifulSoup

html = requests.get("http://www.cttexpresso.pt/feapl_2/app/open/cttexpresso/objectSearch/objectSearch.jspx?lang=def&objects=EA746813946PT&showResults=true").content
# Name the parser explicitly: leaving it out makes bs4 pick whichever parser
# is installed, and for this page only the html5lib parser was reported to
# return every row (requires the html5lib package).
soup = BeautifulSoup(html, "html5lib")
# get the details row using its id
rows = soup.select("#details_0")[0]
# get the header names and strip whitespace
cols = [th.text.strip() for th in rows.select("th")]
# extract all td's from each table row; the list comprehension keeps the
# data grouped row-wise (one inner list per <tr>).
data = [[td.text.strip() for td in tr.select("td")] for tr in rows.select("tr")]
print(" ".join(cols))
for row in data:
    print(", ".join(row))
Output:
Hora Estado Motivo Local Recetor
terça-feira, 5 Abril 2016
07:58, Em distribuição, -, 4000 - PORTO, -
00:35, Envio, -, C. O. PERAFITA, -
00:20, Expedição nacional, -, C.O. PERAFITA (OPE), -
segunda-feira, 4 Abril 2016
21:45, Receção nacional, -, C. O. PERAFITA, -
21:45, Aceitação, -, C. O. PERAFITA, -
Website:
It was the parser. I thought I had tried them all, but the only one that worked was html5 (html5lib), using soup = BeautifulSoup(html,"html5")
outputted:
Hora Estado Motivo Local Recetor
terça-feira, 5 Abril 2016
11:02, Entrega conseguida, -, 4000 - PORTO, CANDIDA VIEGAS
07:58, Em distribuição, -, 4000 - PORTO, -
00:35, Envio, -, C. O. PERAFITA, -
00:20, Expedição nacional, -, C.O. PERAFITA (OPE), -
segunda-feira, 4 Abril 2016
21:45, Receção no local de entrega, -, C. O. PERAFITA, -
21:45, Receção nacional, -, C. O. PERAFITA, -
21:45, Aceitação, -, C. O. PERAFITA, -
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.