[英]Python Parsing XML to CSV with missing Elements
第一次尝试使用 Python 将 XML 解析为 CSV。 当我有多个客户并且他们没有相同的子元素时,我需要一些帮助。 当客户没有子元素时,我希望将 csv 文件列填充为“空”。 我希望“空”作为占位符,以便确实存在的值填充在正确的列中。
下面是当前实际输出的示例。请注意第二行:本应位于 zipcode、street 和 number 列中的数据,被挤到了前面那些没有找到值的列中。 ! https://imgur.com/olZ9OEZ !
这是我正在尝试做的一个示例,因为您将看到“空”只是一个占位符:! https://imgur.com/w5389Kd !
这是我的python代码:
import xml.etree.ElementTree as ET
import csv
# Flatten every <Customer> element of the XML file into one CSV row.
#
# NOTE: a customer that lacks one of the elements below contributes no cell
# for it, so the values that follow shift left into the wrong columns. That
# misalignment is the problem this script's author is asking about; it is
# deliberately left as-is here.
tree = ET.parse(r'C:\Documents\cat.xml')
root = tree.getroot()

# Header row, filled once from the tags found on the first <Customer>.
header_row = []

# newline='' lets the csv module control line endings (otherwise blank rows
# appear between records on Windows). The with-block closes the file even if
# an exception is raised while rows are being written.
with open(r'C:\Users\Kris\Documents\customerdata.csv', 'w',
          newline='') as CustomerData:
    csvwriter = csv.writer(CustomerData)

    # 0 until the header row has been written.
    count = 0

    for node in tree.iter('Customer'):
        data = []

        if count == 0:
            # Collect the column names (element tags) from the first customer.
            for customerid in node.iter('Id_Customer'):
                customer = customerid.tag
                header_row.append(customer)
            for segmentid in node.iter('Segment'):
                segment = segmentid.tag
                header_row.append(segment)
            for event in node.iter('Event'):
                for natureid in event.iter('Nature'):
                    nature = natureid.tag
                    header_row.append(nature)
            for event2 in node.iter('Event'):
                for Extrainfoid in event2.iter('Extrainfo'):
                    extrainfo = Extrainfoid.tag
                    header_row.append(extrainfo)
            for address in node.iter('Address'):
                for zipcode in address.iter('zipcode'):
                    zipcd = zipcode.tag
                    header_row.append(zipcd)
            for address in node.iter('Address'):
                for streetname in address.iter('street'):
                    street = streetname.tag
                    header_row.append(street)
            for address in node.iter('Address'):
                for number in address.iter('number'):
                    num = number.tag
                    # Previously computed but never added, which left the
                    # 'number' column without a header.
                    header_row.append(num)
            csvwriter.writerow(header_row)
            count = count + 1

        # Collect the text of each element, in the same order as the header.
        for customerid in node.iter('Id_Customer'):
            customertxt = customerid.text
            data.append(customertxt)
        for segmentid in node.iter('Segment'):
            segmenttxt = segmentid.text
            data.append(segmenttxt)
        for event in node.iter('Event'):
            for natureid in event.iter('Nature'):
                naturetxt = natureid.text
                data.append(naturetxt)
        for event2 in node.iter('Event'):
            for Extrainfoid in event2.iter('Extrainfo'):
                extrainfotxt = Extrainfoid.text
                data.append(extrainfotxt)
        for address in node.iter('Address'):
            for zipcode in address.iter('zipcode'):
                zipcdtxt = zipcode.text
                data.append(zipcdtxt)
        for address in node.iter('Address'):
            for streetname in address.iter('street'):
                streettxt = streetname.text
                # The street value belongs in this customer's data row; it
                # was previously appended to header_row and so never written.
                data.append(streettxt)
        for address in node.iter('Address'):
            for number in address.iter('number'):
                numtxt = number.text
                data.append(numtxt)

        csvwriter.writerow(data)
下面是一个 XML 示例,它与我的文件类似,不同客户包含的元素并不相同。 这不是我实际使用的 xml,只是用来说明一个客户可能拥有另一个客户没有的多个元素。 请注意,在我实际处理 xml 文件时,标题和其他内容都能在 csv 文件中正确显示;我只需要在某个元素对特定客户没有值时,写入一个“空”占位符。
<CAT>
<Header>...</Header>
<Add>...</Add>
<Customer>
<Id_Customer>xyz1</Id_Customer>
<Segment>abc1</Segment>
<Event>
<Nature>info1</Nature>
<Extrainfo>info2</Extrainfo>
</Event>
</Customer>
<Customer>
<Id_Customer>zzwy</Id_Customer>
<Segment>c2</Segment>
<Adress>
<zipcode>77098</zipcode>
<street>belaire drive</street>
<number>5</number>
</Adress>
</Customer>
...
您可以创建一个列表,列出所有需要的映射(列名 → 元素路径)。 对每个映射尝试查找对应元素;如果元素不存在,find() 会返回 None,访问其 .text 时会抛出 AttributeError,此时捕获该异常并为该列存入一个空值:
import xml.etree.ElementTree as ET
import csv
# Each entry maps a CSV column name to the ElementTree path of the element
# that supplies its value, relative to a <Customer> node.
fields = [
    ('Id_Customer', 'Id_Customer'),
    ('Segment', 'Segment'),
    ('Nature', 'Event/Nature'),
    ('Extrainfo', 'Event/Extrainfo'),
    ('zipcode', 'Adress/zipcode'),
    ('street', 'Adress/street'),
    ('number', 'Adress/number'),
]


def build_row(node, fields):
    """Return a dict of column name -> text for one <Customer> element.

    Args:
        node: the <Customer> element to read.
        fields: iterable of (column name, element path) pairs.

    A column whose element is absent from *node* is stored as '' so that
    every row carries a value for every column.
    """
    row = {}
    for field_name, match in fields:
        try:
            # find() returns None when nothing matches, so .text raises
            # AttributeError for a missing element.
            row[field_name] = node.find(match).text
        except AttributeError:
            row[field_name] = ''
    return row


def write_customers(root, csv_file, fields):
    """Write a header plus one CSV row per <Customer> found under *root*.

    Args:
        root: element whose subtree is searched for <Customer> elements.
        csv_file: text-mode file object; open it with newline=''.
        fields: iterable of (column name, element path) pairs.
    """
    csv_customerdata = csv.DictWriter(
        csv_file, fieldnames=[field for field, match in fields])
    csv_customerdata.writeheader()
    for node in root.iter('Customer'):
        csv_customerdata.writerow(build_row(node, fields))


def main():
    """Convert cat.xml into customerdata.csv in the current directory."""
    tree = ET.parse('cat.xml')
    root = tree.getroot()
    # On Python 3 the csv module writes str, so the file must be opened in
    # text mode ('wb' raises TypeError); newline='' lets csv control the
    # line endings.
    with open(r'customerdata.csv', 'w', newline='') as f_customerdata:
        write_customers(root, f_customerdata, fields)


if __name__ == '__main__':
    main()
为您提供一个输出 CSV 文件,其中包含:
Id_Customer,Segment,Nature,Extrainfo,zipcode,street,number
xyz1,abc1,info1,info2,,,
zzwy,c2,,,77098,belaire drive,5
这种方法使用 DictWriter() 而不是标准的 csv writer, 这样可以更方便地按列名为每个值赋值。
要处理每个客户有多个地址条目的情况,您首先需要为相关字段预先创建足够多的额外列(下面的代码通过 extra_columns 指定额外列数)。 然后在访问元素时,使用 findall() 获取所有匹配的元素:
import xml.etree.ElementTree as ET
import csv
# Number of additional numbered columns (e.g. zipcode2, zipcode3) created for
# every field that may occur more than once per customer.
extra_columns = 2

# Each entry is (CSV column name, ElementTree path relative to <Customer>,
# cols). A field with cols > 1 is treated as repeatable and receives `cols`
# extra numbered columns; any other field occupies a single column.
fields = [
    ('Id_Customer', 'Id_Customer', 1),
    ('Segment', 'Segment', 1),
    ('Nature', 'Event/Nature', 1),
    ('Extrainfo', 'Event/Extrainfo', 1),
    ('zipcode', 'Adress/zipcode', extra_columns),
    ('street', 'Adress/street', extra_columns),
    ('number', 'Adress/number', extra_columns),
]


def build_fieldnames(fields):
    """Return the CSV header derived from *fields*.

    A repeatable field `name` expands to name, name2, name3, ... with one
    numbered column per extra column requested by that field.
    """
    fieldnames = []
    for field, match, cols in fields:
        fieldnames.append(field)
        if cols > 1:
            # Use the field's own column count rather than the global one so
            # that fields may request different numbers of extra columns.
            fieldnames.extend(
                "{}{}".format(field, x + 2) for x in range(cols))
    return fieldnames


def build_row(node, fields):
    """Return a dict of column name -> text for one <Customer> element.

    For a repeatable field, the first match is stored under the plain
    column name and later matches under name2, name3, ... Columns with no
    match are simply omitted; DictWriter fills them with its default
    restval of ''.
    """
    row = {}
    for field_name, match, cols in fields:
        if cols > 1:
            for index, el in enumerate(node.findall(match)):
                if index:
                    row["{}{}".format(field_name, index + 1)] = el.text
                else:
                    row[field_name] = el.text
        else:
            try:
                # find() returns None when nothing matches, so .text raises
                # AttributeError for a missing element.
                row[field_name] = node.find(match).text
            except AttributeError:
                row[field_name] = ''
    return row


def write_customers(root, csv_file, fields):
    """Write a header plus one CSV row per <Customer> found under *root*.

    Args:
        root: element whose subtree is searched for <Customer> elements.
        csv_file: text-mode file object; open it with newline=''.
        fields: iterable of (column name, element path, cols) triples.

    Raises:
        ValueError: from DictWriter, if a customer has more matches for a
            repeatable field than there are columns for it.
    """
    csv_customerdata = csv.DictWriter(
        csv_file, fieldnames=build_fieldnames(fields))
    csv_customerdata.writeheader()
    for node in root.iter('Customer'):
        csv_customerdata.writerow(build_row(node, fields))


def main():
    """Convert cat.xml into customerdata.csv in the current directory."""
    tree = ET.parse('cat.xml')
    root = tree.getroot()
    # On Python 3 the csv module writes str, so the file must be opened in
    # text mode ('wb' raises TypeError); newline='' lets csv control the
    # line endings.
    with open(r'customerdata.csv', 'w', newline='') as f_customerdata:
        write_customers(root, f_customerdata, fields)


if __name__ == '__main__':
    main()
所以你的标题现在看起来像:
Id_Customer,Segment,Nature,Extrainfo,zipcode,zipcode2,zipcode3,street,street2,street3,number,number2,number3
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.