I'm extracting table from HTML. but I can't write it properly to csv file. Before that I use argparser but I had to change because I want to make a loop for a lot of files. My code is like this:
import pathlib
import sys
import csv
import io
import bs4
import os
def detect_engine():
try:
import lxml
except ImportError:
engine = 'html.parser'
else:
engine = 'lxml'
return engine
class Converter:
def __init__(self, **kwargs):
engine = kwargs.get('engine')
if engine is None:
self.engine = detect_engine()
else:
self.engine = engine
self.params = kwargs
def convert(self, html_doc):
soup = bs4.BeautifulSoup(html_doc, self.engine)
output = []
for table_num, table in enumerate(soup.find_all('table')):
csv_string = io.StringIO()
# print(csv_string)
csv_writer = csv.writer(csv_string)
print(csv_writer)
for tr in table.find_all('tr'):
row = [''.join(cell.stripped_strings) for cell in tr.find_all(['td', 'th'])]
csv_writer.writerow(row)
table_attrs = dict(num=table_num)
output.append((csv_string.getvalue(), table_attrs))
return output
converter = Converter()
input_source = "html_file32.html"
path = pathlib.Path(input_source)
html_doc = path.read_text()
output = converter.convert(html_doc)
print(output)
with open('filename.csv', 'w') as myfile:
wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
wr.writerow(output)
My output for now is like this:
"('รายการประมูลรถที่APPLEวันที่19กุมภาพันธ์2564,,,,,,,\r\nลำดับ,รายการรถยนต์,ทะเบียน,วันรับรถเข้าคลัง,สีรถ,ราคาขั้นต่ำ,ค่าโอน+VAT,,\r\nที่กำหนด(รวมVAT),,\r\n1,NISSAN JUKE 1.6 V,5กย3675,29/9/2016,WHITE SOLID,""320,000"",""2,000"",,\r\n2,TOYOTA REVO SMART CAB 2.4 J ROOF,1ฒศ4596,25/7/2016,SUPER\xa0 WHITE,""290,000"",""2,000"",,\r\n3,TOYOTA REVO SMART CAB 2.4 J ROOF(รถรุ่น ปี2016),1ฒฮ4547,14/3/2017,SUPER\xa0 WHITE,""290,000"",""2,000"",,\r\n4,TOYOTA REVO SMART CAB 2.4 J PLUS(รถรุ่น ปี2016),1ฒอ2820,3/2/2017,SILVER METALLIC,""310,000"",""2,000"",,\r\n5,TOYOTA REVO DOUBLE CAB 2.8 G 4WD DVD NAVIGATOR AUTO,6กท4269,19/8/2017,ATTITUDE BLACK MICA,""630,000"",""2,000"",,\r\n6,TOYOTA COMMUTER 3.0 D4D HIGH ROOF(รถรุ่น ปี2015),ฮษ1426,7/1/2016,WHITE,""400,000"",""3,000"",,\r\n7,\r\n', {'num': 0})"
I want something like this:
รายการประมูลรถที่APPLEวันที่19กุมภาพันธ์2564,,,,,,,
ลำดับ,รายการรถยนต์,ทะเบียน,วันรับรถเข้าคลัง,สีรถ,ราคาขั้นต่ำ,ค่าโอน+VAT,,
ที่กำหนด(รวมVAT),,
1,NISSAN JUKE 1.6 V,5กย3675,29/9/2016,WHITE SOLID,"320,000","2,000",,
2,TOYOTA REVO SMART CAB 2.4 J ROOF,1ฒศ4596,25/7/2016,SUPER WHITE,"290,000","2,000",,
3,TOYOTA REVO SMART CAB 2.4 J ROOF(รถรุ่น ปี2016),1ฒฮ4547,14/3/2017,SUPER WHITE,"290,000","2,000",,
4,TOYOTA REVO SMART CAB 2.4 J PLUS(รถรุ่น ปี2016),1ฒอ2820,3/2/2017,SILVER METALLIC,"310,000","2,000",,
5,TOYOTA REVO DOUBLE CAB 2.8 G 4WD DVD NAVIGATOR AUTO,6กท4269,19/8/2017,ATTITUDE BLACK MICA,"630,000","2,000",,
6,TOYOTA COMMUTER 3.0 D4D HIGH ROOF(รถรุ่น ปี2015),ฮษ1426,7/1/2016,WHITE,"400,000","3,000",,
7,
Try the following approach. It builds a list of rows with table_num
prefixed to each row and just returns that from convert()
. Secondly, it then uses the .writerows()
function to write all of the rows in a single call.
Note, encoding='utf-8'
is used to ensure all characters are written correctly. This would need to be viewed using something that can display that encoding (Excel doesn't by default). newline=''
is added to ensure newlines are handled correctly where values are also multiline.
import pathlib
import sys
import csv
import io
import bs4
import os
def detect_engine():
try:
import lxml
except ImportError:
engine = 'html.parser'
else:
engine = 'lxml'
return engine
class Converter:
def __init__(self, **kwargs):
engine = kwargs.get('engine')
if engine is None:
self.engine = detect_engine()
else:
self.engine = engine
self.params = kwargs
def convert(self, html_doc):
soup = bs4.BeautifulSoup(html_doc, self.engine)
output = []
for table_num, table in enumerate(soup.find_all('table'), start=1):
for tr in table.find_all('tr'):
row = [''.join(cell.stripped_strings) for cell in tr.find_all(['td', 'th'])]
output.append([table_num, *row])
return output
converter = Converter()
input_source = "html_file32.html"
path = pathlib.Path(input_source)
html_doc = path.read_text()
output = converter.convert(html_doc)
print(output)
with open('filename.csv', 'w', newline='', encoding='utf-8') as myfile:
wr = csv.writer(myfile)
wr.writerows(output)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.