[英]enumerate column headers in CSV that belong to the same tag (key) in python
我正在使用以下幾組生成器將XML解析為CSV:
import xml.etree.cElementTree as ElementTree
from xml.etree.ElementTree import XMLParser
import csv
def flatten_list(aList, prefix=''):
for i, element in enumerate(aList, 1):
eprefix = "{}{}".format(prefix, i)
if element:
# treat like dict
if len(element) == 1 or element[0].tag != element[1].tag:
yield from flatten_dict(element, eprefix)
# treat like list
elif element[0].tag == element[1].tag:
yield from flatten_list(element, eprefix)
elif element.text:
text = element.text.strip()
if text:
yield eprefix[:].rstrip('.'), element.text
def flatten_dict(parent_element, prefix=''):
prefix = prefix + parent_element.tag
if parent_element.items():
for k, v in parent_element.items():
yield prefix + k, v
for element in parent_element:
eprefix = element.tag
if element:
# treat like dict - we assume that if the first two tags
# in a series are different, then they are all different.
if len(element) == 1 or element[0].tag != element[1].tag:
yield from flatten_dict(element, prefix=prefix)
# treat like list - we assume that if the first two tags
# in a series are the same, then the rest are the same.
else:
# here, we put the list in dictionary; the key is the
# tag name the list elements all share in common, and
# the value is the list itself
yield from flatten_list(element, prefix=eprefix)
# if the tag has attributes, add those to the dict
if element.items():
for k, v in element.items():
yield eprefix+k
# this assumes that if you've got an attribute in a tag,
# you won't be having any text. This may or may not be a
# good idea -- time will tell. It works for the way we are
# currently doing XML configuration files...
elif element.items():
for k, v in element.items():
yield eprefix+k
# finally, if there are no child tags and no attributes, extract
# the text
else:
yield eprefix, element.text
def makerows(pairs):
headers = []
columns = {}
for k, v in pairs:
if k in columns:
columns[k].extend((v,))
else:
headers.append(k)
columns[k] = [k, v]
m = max(len(c) for c in columns.values())
for c in columns.values():
c.extend(' ' for i in range(len(c), m))
L = [columns[k] for k in headers]
rows = list(zip(*L))
return rows
def main():
with open('2-Response_duplicate.xml', 'r', encoding='utf-8') as f:
xml_string = f.read()
xml_string= xml_string.replace('�', '') #optional to remove ampersands.
root = ElementTree.XML(xml_string)
# for key, value in flatten_dict(root):
# key = key.rstrip('.').rsplit('.', 1)[-1]
# print(key,value)
writer = csv.writer(open("try5.csv", 'wt'))
writer.writerows(makerows(flatten_dict(root)))
if __name__ == "__main__":
main()
在Excel中打開時,CSV的一列如下所示:
對象向導
2adeb916-cc43-4d73-8c90-579dd4aa050a
2e77c588-56e5-4f3f-b990-548b89c09acb
c8743bdd-04a6-4635-aedd-684a153f02f0
1cdc3d86-f9f4-4a22-81e1-2ecc20f5e558
2c19d69b-26d3-4df0-8df4-8e293201656f
6d235c85-6a3e-4cb3-9a28-9c37355c02db
c34e05de-0b0c-44ee-8572-c8efaea4a5ee
9b0fe8f5-8ec4-4f13-b797-961036f92f19
1d43d35f-61ef-4df2-bbd9-30bf014f7e10
9cb132e8-bc69-4e4f-8f29-c1f503b50018
24fd77da-030c-4cb7-94f7-040b165191ce
0a949d4f-4f4c-467e-b0a0-40c16fc95a79
801d3091-c28e-44d2-b9bd-3bad99b32547
7f355633-426d-464b-bab9-6a294e95c5d5
這是由於存在14個名稱為ObjectGuid的標簽。 例如,這些標記之一如下所示:
<ObjectGuid>2adeb916-cc43-4d73-8c90-579dd4aa050a</ObjectGuid>
我的問題:有沒有一種有效的方法來枚舉標頭(鍵),使得每個鍵都像其對應的值(XML數據結構中的文本)一樣被枚舉:
它將在Excel中顯示如下:
ObjectGuid_1 ObjectGuid_2 ObejectGuid3等.....
如果您需要我提供其他信息(例如示例XML),請告訴我。 謝謝您的幫助。
為了標識的目的,在數據集本身中添加元素,屬性或注釋性描述符是錯誤的……僅當您擁有該數據並且100%知道這樣做不會有任何結果時,才應該對數據進行規范化對其他使用者的負面影響(依賴屬性順序來操縱DOM的使用者)。 但是,如果通過對這個屬性進行new屬性的0(n)檢查來獲取哈希表查找的效率,那么使用dict或嵌套dict(我不太可能獲得t)有什么意義呢? 該哈希處理的重點是隨機查找。
如果僅是結構化的(鍵,值)對,那么在這里就有意義。為什么不只使用其他連續數據結構,而要像字典一樣對待呢。說一個命名元組…
第二種解決方案是,如果要添加其他狀態,則將生成器放在類中。
班級順序:
def__init__(self, lines, order):
self.lines = lines
self.order - python(max)
def __iter__(self):
for l, line in enumerate(self.lines, 1);
self.order.append( l, line))
yield line
when open (some file.csv) as f:
lines = oder( f);
應對數據進行無害轉換? 例如,如果要創建轉換字典(請參見下文)
好吧,直到其中一個值是空白為止……
類型= [('x',float'),('y',float')
with open(‘some.csv’) as f:
for row in cvs.DictReader(f):
row.update((key, conversion (row [ key]))
for key, conversion in field_types)
[x: ,y: 2. 2] — > that is until there is an empty data point.. Kaboom.
因此,我的建議不是更改或添加數據,而是更改處理此類數據的算法。如果問題是順序,為什么不簡單地將說一個元組作為類似於字典的命名元組,則警告是可變性但是在數據統一性上是有道理的...
*我不理解嵌套的字典……那是因為y標頭值是嗎?
值和順序鍵->鍵->(鍵:值)? 或者你可以跳過
first row :p..
So just skip the first row..
for line in {header: list, header: list }
line.next() # problem solved.. or print(line , end = ‘’)
*** Notables
-To iterator over multiple sequences in parallel
h = [a,b,c]
x = [1,2,3]
for i in zip(a,b):
print(i)
(a, 1)
(b, 2)
-Chaining
a = [1,2 , 3]
b= [a, b , c ]enter code here
for x in chain(a, b):
//remove white space
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.