简体   繁体   中英

Writing CSV file in Python from list with ascii

I'm extracting a table from an HTML file, but I can't write it properly to a CSV file. Previously I used argparse, but I had to change that because I want to loop over a lot of files. My code looks like this:

import pathlib
import sys
import csv
import io
import bs4
import os

def detect_engine():
    """Return the name of the HTML parser to use with BeautifulSoup.

    Returns ``'lxml'`` when the ``lxml`` package can be imported and
    ``'html.parser'`` otherwise.
    """
    try:
        import lxml  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return 'html.parser'
    return 'lxml'


class Converter:
    """Turn each ``<table>`` of an HTML document into a block of CSV text."""

    def __init__(self, **kwargs):
        """Remember the options and pick the parser.

        The ``engine`` keyword names the BeautifulSoup parser to use; when it
        is absent or ``None`` the parser is chosen by ``detect_engine()``.
        Every keyword argument is kept, unchanged, in ``self.params``.
        """
        requested = kwargs.get('engine')
        self.engine = detect_engine() if requested is None else requested
        self.params = kwargs

    def convert(self, html_doc):
        """Return a list with one ``(csv_text, attrs)`` tuple per table.

        ``csv_text`` is the table rendered by ``csv.writer`` (one line per
        ``<tr>``, one field per ``<td>``/``<th>``) and ``attrs`` is a dict
        holding the zero-based table index under the key ``'num'``.
        """
        document = bs4.BeautifulSoup(html_doc, self.engine)
        results = []
        for index, table in enumerate(document.find_all('table')):
            buffer = io.StringIO()
            writer = csv.writer(buffer)
            print(writer)
            # The text fragments of a cell are concatenated without a separator.
            writer.writerows(
                [''.join(cell.stripped_strings)
                 for cell in tr.find_all(['td', 'th'])]
                for tr in table.find_all('tr')
            )
            results.append((buffer.getvalue(), {'num': index}))
        return results

# Script part: convert a single HTML file and dump the result to a CSV file.
converter = Converter()
input_source = "html_file32.html"
path = pathlib.Path(input_source)
# No encoding is given, so the file is decoded with the platform default.
html_doc = path.read_text()
# convert() returns a list of (csv_text, {'num': table_index}) tuples.
output = converter.convert(html_doc)
print(output)
with open('filename.csv', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    # writerow() treats the whole list as ONE row: each (csv_text, attrs)
    # tuple becomes a single quoted field holding the tuple's str() form.
    wr.writerow(output)

My output for now is like this:

"('รายการประมูลรถที่APPLEวันที่19กุมภาพันธ์2564,,,,,,,\r\nลำดับ,รายการรถยนต์,ทะเบียน,วันรับรถเข้าคลัง,สีรถ,ราคาขั้นต่ำ,ค่าโอน+VAT,,\r\nที่กำหนด(รวมVAT),,\r\n1,NISSAN JUKE 1.6 V,5กย3675,29/9/2016,WHITE SOLID,""320,000"",""2,000"",,\r\n2,TOYOTA REVO SMART CAB 2.4 J ROOF,1ฒศ4596,25/7/2016,SUPER\xa0 WHITE,""290,000"",""2,000"",,\r\n3,TOYOTA REVO SMART CAB 2.4 J ROOF(รถรุ่น ปี2016),1ฒฮ4547,14/3/2017,SUPER\xa0 WHITE,""290,000"",""2,000"",,\r\n4,TOYOTA REVO SMART CAB 2.4 J PLUS(รถรุ่น ปี2016),1ฒอ2820,3/2/2017,SILVER METALLIC,""310,000"",""2,000"",,\r\n5,TOYOTA REVO DOUBLE CAB 2.8 G 4WD DVD NAVIGATOR AUTO,6กท4269,19/8/2017,ATTITUDE BLACK MICA,""630,000"",""2,000"",,\r\n6,TOYOTA COMMUTER 3.0 D4D HIGH ROOF(รถรุ่น ปี2015),ฮษ1426,7/1/2016,WHITE,""400,000"",""3,000"",,\r\n7,\r\n', {'num': 0})"

I want something like this:

รายการประมูลรถที่APPLEวันที่19กุมภาพันธ์2564,,,,,,,
ลำดับ,รายการรถยนต์,ทะเบียน,วันรับรถเข้าคลัง,สีรถ,ราคาขั้นต่ำ,ค่าโอน+VAT,,
ที่กำหนด(รวมVAT),,
1,NISSAN JUKE 1.6 V,5กย3675,29/9/2016,WHITE SOLID,"320,000","2,000",,
2,TOYOTA REVO SMART CAB 2.4 J ROOF,1ฒศ4596,25/7/2016,SUPER  WHITE,"290,000","2,000",,
3,TOYOTA REVO SMART CAB 2.4 J ROOF(รถรุ่น ปี2016),1ฒฮ4547,14/3/2017,SUPER  WHITE,"290,000","2,000",,
4,TOYOTA REVO SMART CAB 2.4 J PLUS(รถรุ่น ปี2016),1ฒอ2820,3/2/2017,SILVER METALLIC,"310,000","2,000",,
5,TOYOTA REVO DOUBLE CAB 2.8 G 4WD DVD NAVIGATOR AUTO,6กท4269,19/8/2017,ATTITUDE BLACK MICA,"630,000","2,000",,
6,TOYOTA COMMUTER 3.0 D4D HIGH ROOF(รถรุ่น ปี2015),ฮษ1426,7/1/2016,WHITE,"400,000","3,000",,
7,

Try the following approach. First, convert() builds a list of rows, with table_num prefixed to each row, and simply returns that list. Second, the .writerows() function is used to write all of the rows in a single call.

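Note that encoding='utf-8' is used to ensure all characters are written correctly. The resulting file needs to be viewed with something that can display that encoding (Excel doesn't by default; writing with encoding='utf-8-sig' instead adds a byte-order mark that lets Excel detect it). The newline='' argument is added so that the csv module controls the line endings itself, which keeps values that contain embedded newlines intact.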

import pathlib
import sys
import csv
import io
import bs4
import os

def detect_engine():
    """Choose the BeautifulSoup parser name based on what is installed.

    ``'lxml'`` is preferred; if importing it fails with ``ImportError`` the
    built-in ``'html.parser'`` is returned instead.
    """
    engine = 'lxml'
    try:
        import lxml  # noqa: F401 -- availability probe only
    except ImportError:
        engine = 'html.parser'
    return engine


class Converter:
    """Extract the rows of every ``<table>`` in an HTML document."""

    def __init__(self, **kwargs):
        """Keep the options and decide which parser BeautifulSoup will use.

        An ``engine`` keyword selects the parser explicitly; if it is missing
        or ``None``, ``detect_engine()`` chooses one. All keyword arguments
        are stored as-is in ``self.params``.
        """
        chosen = kwargs.get('engine')
        self.engine = detect_engine() if chosen is None else chosen
        self.params = kwargs

    def convert(self, html_doc):
        """Return a flat list of rows taken from all tables in *html_doc*.

        Each row is a list whose first item is the 1-based number of the
        table it came from, followed by the text of each ``<td>``/``<th>``
        cell in that ``<tr>``.
        """
        document = bs4.BeautifulSoup(html_doc, self.engine)
        # The text fragments of a cell are concatenated without a separator.
        return [
            [
                table_num,
                *(''.join(cell.stripped_strings)
                  for cell in tr.find_all(['td', 'th'])),
            ]
            for table_num, table in enumerate(document.find_all('table'), start=1)
            for tr in table.find_all('tr')
        ]

# Script part: convert one HTML file and save every table row to a CSV file.
converter = Converter()
input_source = "html_file32.html"
path = pathlib.Path(input_source)
# Read the raw bytes instead of text decoded with the platform's default
# encoding. BeautifulSoup accepts bytes and works out the document's encoding
# itself, so non-ASCII content is not mis-decoded on systems whose default
# encoding differs from the file's.
html_doc = path.read_bytes()
# convert() returns a list of rows, each prefixed with its table number.
output = converter.convert(html_doc)
print(output)

# newline='' lets the csv module control line endings; utf-8 keeps every
# character representable in the output file.
with open('filename.csv', 'w', newline='', encoding='utf-8') as myfile:
    wr = csv.writer(myfile)
    wr.writerows(output)

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM