element attributes missing when parsing XML with iterparse/lxml/python 2

Question

Here's my use case: I have a potentially large XML file, and I want to output the frequency of all the unique structural variations of a given element type. Element attributes should be included as part of the uniqueness test. The output should sort the variations by frequency.

Here's a trivial input example, with 4 entries for automobile:

<automobile>
    <mileage>20192</mileage>
    <year>2005</year>
    <user_defined name="color">red</user_defined>
</automobile>
<automobile>
    <mileage>1098</mileage>
    <year>2018</year>
    <user_defined name="color">blue</user_defined>
</automobile>
<automobile>
    <mileage>17964</mileage>
    <year>2012</year>
    <user_defined name="title_status">salvage</user_defined>
</automobile>
<automobile>
    <mileage>198026</mileage>
    <year>1990</year>
</automobile>

The output I expect would look like this:

<automobile automobile_frequency="2">
    <mileage />
    <year />
    <user_defined name="color" />
</automobile>
<automobile automobile_frequency="1">
    <mileage />
    <year />
    <user_defined name="title_status" />
</automobile>
<automobile automobile_frequency="1">
    <mileage />
    <year />
</automobile>

I've implemented the code using iterparse, but when it's processing the elements, the attributes do not exist in the element. The code logic appears to be correct, but attributes simply don't exist; they are not written in the output, and they are not present for the uniqueness test. Per the above input example, this is what I get on output:

<root>
  <automobile automobile_frequency="3">
    <mileage/>
    <year/>
    <user_defined/>
  </automobile>
  <automobile automobile_frequency="1">
    <mileage/>
    <year/>
  </automobile>
</root>

The usage is:

xplore.py input.xml node_to_explore

In the above example, I used:

xplore.py trivial.xml automobile

Here's the source:

from lxml import etree
import sys
import re
from datetime import datetime


# signature string -> skeleton element, one entry per distinct structure seen
structure_map = dict()
# signature string -> number of times that structure has been seen
frequency_map = dict()
# scratch parent element that receives each newly discovered skeleton
tmp_root = etree.Element("tmp_root")


def process_element(el):
    """Count one occurrence of the structure of ``el`` if it is a target.

    Elements whose tag differs from the module-level ``target`` are ignored.
    For a target element, its skeleton (see ``get_structure``) is serialized
    and used as the key into ``structure_map`` / ``frequency_map``.

    :param el: element delivered by the parser.
    :returns: None; the module-level maps and ``tmp_root`` are updated.
    """
    if el.tag != target:
        return
    # get the structure of the element
    structure = get_structure(el)
    # the serialized skeleton is the signature used as dictionary key
    structure_key = etree.tostring(structure, pretty_print=True)
    # Test membership on the dict itself: in Python 2 ``.keys()`` builds a
    # list, which turns every lookup into a linear scan.
    if structure_key in structure_map:
        # known signature: increment frequency map
        frequency_map[structure_key] += 1
        return
    # new signature: remember the skeleton, attach it to the scratch tree
    # and start its count at one
    structure_map[structure_key] = structure
    tmp_root.append(structure)
    frequency_map[structure_key] = 1


def get_structure(el):
    """Return a new element mirroring the structure of ``el``.

    The result has the same tag and the same attributes (names and values)
    as ``el`` and, recursively, one skeleton child per child element.
    Text content is not copied. The caller serializes the returned element
    to obtain the signature string.

    :param el: element to mirror.
    :returns: a freshly created element, not attached to any tree.
    """
    # create new element for the return value
    ret = etree.Element(el.tag)
    # copy attributes
    for name, value in el.attrib.items():
        ret.set(name, value)
    # Recurse into child *elements* only: comments and processing
    # instructions have a non-string ``tag`` and cannot be passed to
    # ``etree.Element``.
    for child in el.iterchildren(tag=etree.Element):
        ret.append(get_structure(child))
    return ret


# Script body. Usage: xplore.py <input.xml> <element_type>
# Parses the XML file incrementally, counts the distinct structures of the
# requested element type and writes them, most frequent first, to a .txt
# file named after the input file.
if len(sys.argv) < 3:
    print "Must specify an XML file for processing, as well as an element type!"
    exit(0)

# Get XML file
xml = sys.argv[1]
# Get output file name: everything before the last "." of the input name,
# plus ".txt" (rindex raises ValueError when the name contains no ".")
output_file = xml[0:xml.rindex(".")]+".txt"
# get target element type to evaluate
target = sys.argv[2]
# mark start
startTime = datetime.now()
# Parse XML

print '==========================='
print 'Parsing XML'
print '==========================='
# only 'end' events are requested, one per closed element
context = etree.iterparse(xml, events=('end',))
for event, element in context:
    process_element(element)
    # NOTE(review): clear() is called for every element, not only for target
    # elements. Child elements presumably get their 'end' event (and are
    # cleared here) before their parent does, which would explain the
    # attributes missing in get_structure -- confirm against the lxml docs.
    element.clear()
# create tree sorted by frequency (highest count first)
ranked = sorted(frequency_map.items(), key=lambda x: x[1], reverse=True)
root = etree.Element("root")
for item in ranked:
    # item is a (signature, count) pair
    structure = structure_map[item[0]]
    # annotate the skeleton with its count, e.g. automobile_frequency="2"
    structure.set(target+"_frequency", str(item[1]))
    root.append(structure)
# pretty print root
# NOTE(review): the file object is never closed explicitly.
out = open(output_file, 'w')
out.write(etree.tostring(root, pretty_print=True))
# output run time
time = datetime.now() - startTime
# keeps one digit, ":", digits, "." and four fractional digits of the elapsed
# time string; presumably the "M:SS.ffff" tail of "H:MM:SS.ffffff"
reg3 = re.compile("\\d+:\\d(\\d:\\d+\\.\\d{4})")
# NOTE(review): re.search returns None if the string has no fractional part,
# in which case the next line fails.
time = re.search(reg3, unicode(time))
time = "Runtime: %ss" % (time.group(1).encode("utf-8"))
print(time)

In the debugger, I can clearly see that the attributes are missing from elements in the calls to get_structure. Can anyone tell me why this is the case?

Answer 1

The data (the question's sample wrapped in a single <root> element, so that the file is well-formed XML):

<root>
    <automobile>
        <mileage>20192</mileage>
        <year>2005</year>
        <user_defined name="color">red</user_defined>
    </automobile>
    <automobile>
        <mileage>1098</mileage>
        <year>2018</year>
        <user_defined name="color">blue</user_defined>
    </automobile>
    <automobile>
        <mileage>17964</mileage>
        <year>2012</year>
        <user_defined name="title_status">salvage</user_defined>
    </automobile>
    <automobile>
        <mileage>198026</mileage>
        <year>1990</year>
    </automobile>
</root>

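The code. In the question, element.clear() runs on every 'end' event, so each child element is cleared (attributes included) before its parent automobile element reaches its own 'end' event. The version below collects the elements and clears them only after the target element has been processed: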

from lxml import etree
import sys
import re
from datetime import datetime


# signature string -> skeleton element, one entry per distinct structure seen
structure_map = dict()
# signature string -> number of times that structure has been seen
frequency_map = dict()
# scratch parent element that receives each newly discovered skeleton
tmp_root = etree.Element("tmp_root")


def process_element(el):
    """Count one occurrence of the structure of ``el``.

    The skeleton of ``el`` (see ``get_structure``) is serialized and used as
    the key into ``structure_map`` / ``frequency_map``. The caller is
    expected to pass only elements of the type being explored.

    :param el: element whose structure is to be counted.
    :returns: None; the module-level maps and ``tmp_root`` are updated.
    """
    # get the structure of the element
    structure = get_structure(el)
    # the serialized skeleton is the signature used as dictionary key
    structure_key = etree.tostring(structure, pretty_print=True)
    # Test membership on the dict itself: in Python 2 ``.keys()`` builds a
    # list, which turns every lookup into a linear scan.
    if structure_key in structure_map:
        # known signature: increment frequency map
        frequency_map[structure_key] += 1
        return
    # new signature: remember the skeleton, attach it to the scratch tree
    # and start its count at one
    structure_map[structure_key] = structure
    tmp_root.append(structure)
    frequency_map[structure_key] = 1


def get_structure(el):
    """Return a new element mirroring the structure of ``el``.

    The result has the same tag and the same attributes (names and values)
    as ``el`` and, recursively, one skeleton child per child element.
    Text content is not copied. The caller serializes the returned element
    to obtain the signature string.

    :param el: element to mirror.
    :returns: a freshly created element, not attached to any tree.
    """
    # create new element for the return value
    ret = etree.Element(el.tag)
    # copy attributes
    for name, value in el.attrib.items():
        ret.set(name, value)
    # Recurse into child *elements* only: comments and processing
    # instructions have a non-string ``tag`` and cannot be passed to
    # ``etree.Element``.
    for child in el.iterchildren(tag=etree.Element):
        ret.append(get_structure(child))
    return ret


# Script body. Usage: xplore.py <input.xml> <element_type>
# Parses the XML file incrementally, counts the distinct structures of the
# requested element type and writes them, most frequent first, to a .txt
# file named after the input file.
if len(sys.argv) < 3:
    print("Must specify an XML file for processing, as well as an element type!")
    # a usage error is a failure: exit non-zero so callers can detect it
    sys.exit(1)

# Get XML file
xml = sys.argv[1]
# Get output file name: the input name with its last extension replaced by
# ".txt". rpartition, unlike rindex, does not raise when there is no ".".
stem, dot, _extension = xml.rpartition(".")
output_file = (stem if dot else xml) + ".txt"
# get target element type to evaluate
target = sys.argv[2]
# mark start
startTime = datetime.now()
# Parse XML

print('===========================')
print('Parsing XML')
print('===========================')
context = etree.iterparse(xml, events=('end',))
# Elements seen since the last target element. They are cleared only after
# the enclosing target has been processed, so that child elements still carry
# their attributes when get_structure inspects them.
element_to_clear = []
for event, element in context:
    element_to_clear.append(element)
    if element.tag == target:
        process_element(element)
        # the target has been recorded; release everything collected so far
        for ele in element_to_clear:
            ele.clear()
        element_to_clear = []
# create tree sorted by frequency (highest count first)
ranked = sorted(frequency_map.items(), key=lambda x: x[1], reverse=True)
root = etree.Element("root")
for structure_key, count in ranked:
    structure = structure_map[structure_key]
    # annotate the skeleton with its count, e.g. automobile_frequency="2"
    structure.set(target+"_frequency", str(count))
    root.append(structure)
# pretty print root; the with-block guarantees the file is flushed and closed
with open(output_file, 'w') as out:
    out.write(etree.tostring(root, pretty_print=True))
# output run time
elapsed = str(datetime.now() - startTime)
# Keep the "M:SS.ffff" tail of "H:MM:SS.ffffff". The elapsed-time string has
# no fractional part when the microseconds are exactly zero; fall back to
# the full string then instead of failing on a missing match.
match = re.search(r"\d+:\d(\d:\d+\.\d{4})", elapsed)
runtime = match.group(1) if match else elapsed
print("Runtime: %ss" % runtime)

The command: xplore.py trivial.xml automobile

element attributes missing when parsing XML with iterparse/lxml/python 2

Question

1 answer

solution1
0 2022-02-11 11:13:33

element attributes missing when parsing XML with iterparse/lxml/python 2

Question

1 answer

solution1 0 2022-02-11 11:13:33

solution1
0 2022-02-11 11:13:33