[英]Writing CSV file in Python from list with ascii
我正在从 HTML 中提取表格,但无法将其正确写入 CSV 文件。此前我使用的是 argparse,但因为想对很多文件循环处理,所以不得不改掉这部分。我的代码如下:
import pathlib
import sys
import csv
import io
import bs4
import os
def detect_engine():
    """Return the name of the HTML parser to hand to BeautifulSoup.

    Returns:
        'lxml' when the ``lxml`` package can be imported, otherwise the
        'html.parser' fallback.
    """
    try:
        # Imported only to probe availability; the module itself is unused.
        import lxml  # noqa: F401
    except ImportError:
        engine = 'html.parser'
    else:
        engine = 'lxml'
    return engine
class Converter:
    """Turn every ``<table>`` of an HTML document into a CSV-formatted string."""

    def __init__(self, **kwargs):
        """Store the parser choice and the raw keyword arguments.

        Args:
            **kwargs: Optional settings. ``engine`` names the parser passed
                to BeautifulSoup; when it is missing or ``None`` the parser
                is chosen by ``detect_engine()``.
        """
        engine = kwargs.get('engine')
        if engine is None:
            self.engine = detect_engine()
        else:
            self.engine = engine
        # Keep every keyword argument as given (including 'engine').
        self.params = kwargs

    def convert(self, html_doc):
        """Convert each table found in *html_doc* to CSV text.

        Args:
            html_doc: HTML markup handed to ``bs4.BeautifulSoup``.

        Returns:
            A list with one ``(csv_text, attrs)`` tuple per table, where
            ``csv_text`` is the whole table rendered by ``csv.writer`` and
            ``attrs`` is ``{'num': <0-based table index>}``.
        """
        soup = bs4.BeautifulSoup(html_doc, self.engine)
        output = []
        for table_num, table in enumerate(soup.find_all('table')):
            # Each table is rendered into its own in-memory CSV buffer.
            csv_string = io.StringIO()
            csv_writer = csv.writer(csv_string)
            for tr in table.find_all('tr'):
                # The stripped text fragments of a cell are joined with no
                # separator between them.
                row = [''.join(cell.stripped_strings) for cell in tr.find_all(['td', 'th'])]
                csv_writer.writerow(row)
            table_attrs = dict(num=table_num)
            output.append((csv_string.getvalue(), table_attrs))
        return output
# Convert one HTML file and dump the converter's result to filename.csv.
converter = Converter()
input_source = "html_file32.html"
path = pathlib.Path(input_source)
html_doc = path.read_text()
output = converter.convert(html_doc)
print(output)
with open('filename.csv', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    # NOTE: `output` is a list of (csv_text, attrs) tuples. writerow() treats
    # that list as ONE row and stringifies each tuple into a single quoted
    # field, which produces the one-line result shown after this listing.
    wr.writerow(output)
我的 output 现在是这样的:
"('รายการประมูลรถที่APPLEวันที่19กุมภาพันธ์2564,,,,,,,\r\nลำดับ,รายการรถยนต์,ทะเบียน,วันรับรถเข้าคลัง,สีรถ,ราคาขั้นต่ำ,ค่าโอน+VAT,,\r\nที่กำหนด(รวมVAT),,\r\n1,NISSAN JUKE 1.6 V,5กย3675,29/9/2016,WHITE SOLID,""320,000"",""2,000"",,\r\n2,TOYOTA REVO SMART CAB 2.4 J ROOF,1ฒศ4596,25/7/2016,SUPER\xa0 WHITE,""290,000"",""2,000"",,\r\n3,TOYOTA REVO SMART CAB 2.4 J ROOF(รถรุ่น ปี2016),1ฒฮ4547,14/3/2017,SUPER\xa0 WHITE,""290,000"",""2,000"",,\r\n4,TOYOTA REVO SMART CAB 2.4 J PLUS(รถรุ่น ปี2016),1ฒอ2820,3/2/2017,SILVER METALLIC,""310,000"",""2,000"",,\r\n5,TOYOTA REVO DOUBLE CAB 2.8 G 4WD DVD NAVIGATOR AUTO,6กท4269,19/8/2017,ATTITUDE BLACK MICA,""630,000"",""2,000"",,\r\n6,TOYOTA COMMUTER 3.0 D4D HIGH ROOF(รถรุ่น ปี2015),ฮษ1426,7/1/2016,WHITE,""400,000"",""3,000"",,\r\n7,\r\n', {'num': 0})"
我想要这样的东西:
รายการประมูลรถที่APPLEวันที่19กุมภาพันธ์2564,,,,,,,
ลำดับ,รายการรถยนต์,ทะเบียน,วันรับรถเข้าคลัง,สีรถ,ราคาขั้นต่ำ,ค่าโอน+VAT,,
ที่กำหนด(รวมVAT),,
1,NISSAN JUKE 1.6 V,5กย3675,29/9/2016,WHITE SOLID,"320,000","2,000",,
2,TOYOTA REVO SMART CAB 2.4 J ROOF,1ฒศ4596,25/7/2016,SUPER WHITE,"290,000","2,000",,
3,TOYOTA REVO SMART CAB 2.4 J ROOF(รถรุ่น ปี2016),1ฒฮ4547,14/3/2017,SUPER WHITE,"290,000","2,000",,
4,TOYOTA REVO SMART CAB 2.4 J PLUS(รถรุ่น ปี2016),1ฒอ2820,3/2/2017,SILVER METALLIC,"310,000","2,000",,
5,TOYOTA REVO DOUBLE CAB 2.8 G 4WD DVD NAVIGATOR AUTO,6กท4269,19/8/2017,ATTITUDE BLACK MICA,"630,000","2,000",,
6,TOYOTA COMMUTER 3.0 D4D HIGH ROOF(รถรุ่น ปี2015),ฮษ1426,7/1/2016,WHITE,"400,000","3,000",,
7,
尝试以下方法。它会构建一个行列表,每一行都以 table_num 作为前缀,并由 convert() 返回。然后使用 .writerows() 函数在一次调用中写入所有行。
注意,这里使用 encoding='utf-8' 以确保所有字符都能被正确写入。查看该文件时需要使用能够显示该编码的程序(Excel 默认情况下不能)。另外还添加了 newline='',以确保当某个值本身包含多行时,换行符也能被正确处理。
import pathlib
import sys
import csv
import io
import bs4
import os
def detect_engine():
    """Return the name of the HTML parser to hand to BeautifulSoup.

    Returns:
        'lxml' when the ``lxml`` package can be imported, otherwise the
        'html.parser' fallback.
    """
    try:
        # Imported only to probe availability; the module itself is unused.
        import lxml  # noqa: F401
    except ImportError:
        engine = 'html.parser'
    else:
        engine = 'lxml'
    return engine
class Converter:
    """Flatten every ``<table>`` of an HTML document into a list of rows."""

    def __init__(self, **kwargs):
        """Store the parser choice and the raw keyword arguments.

        Args:
            **kwargs: Optional settings. ``engine`` names the parser passed
                to BeautifulSoup; when it is missing or ``None`` the parser
                is chosen by ``detect_engine()``.
        """
        engine = kwargs.get('engine')
        if engine is None:
            self.engine = detect_engine()
        else:
            self.engine = engine
        # Keep every keyword argument as given (including 'engine').
        self.params = kwargs

    def convert(self, html_doc):
        """Collect the rows of all tables found in *html_doc*.

        Args:
            html_doc: HTML markup handed to ``bs4.BeautifulSoup``.

        Returns:
            A flat list of rows. Each row is a list whose first item is the
            1-based number of the table it came from, followed by the text
            of each ``<td>``/``<th>`` cell in that table row.
        """
        soup = bs4.BeautifulSoup(html_doc, self.engine)
        output = []
        for table_num, table in enumerate(soup.find_all('table'), start=1):
            for tr in table.find_all('tr'):
                # The stripped text fragments of a cell are joined with no
                # separator between them.
                row = [''.join(cell.stripped_strings) for cell in tr.find_all(['td', 'th'])]
                # Prefix the table number so rows from different tables stay
                # distinguishable in the single output list.
                output.append([table_num, *row])
        return output
# Convert one HTML file and write all extracted rows to filename.csv.
converter = Converter()
input_source = "html_file32.html"
path = pathlib.Path(input_source)
# NOTE(review): read_text() decodes with the platform default encoding;
# pass encoding=... explicitly if the HTML file uses a different one.
html_doc = path.read_text()
output = converter.convert(html_doc)
print(output)
# newline='' lets the csv module control line endings itself; utf-8 ensures
# non-ASCII cell text can be written.
with open('filename.csv', 'w', newline='', encoding='utf-8') as myfile:
    wr = csv.writer(myfile)
    # `output` is a list of rows, so writerows() emits one CSV line per row.
    wr.writerows(output)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.