Processing of XML files excruciatingly slow with LXML Python

Question

I'm processing XML documents like the following.

<tok lemma="i" xpos="CC">e</tok> 
<tok lemma="que" xpos="CS">que</tok> 
<tok lemma="aquey" xpos="PD0MP0">aqueys</tok> 
<tok lemma="marit" xpos="NCMP000">marits</tok> 
<tok lemma="estar" xpos="VMIP3P0">stiguen</tok>  
[...]
<tok lemma="habitar" xpos="VMIP3P0">habiten</tok> 
<tok lemma="en" xpos="SPS00">en</tok>
<tok lemma="aquex" xpos="PD0FS0">aqueix</tok> 
<tok lemma="terra" xpos="NCMS000">món</tok>
[...]
<tok lemma="viure" xpos="VMIP3P0">viuen</tok> 
<tok lemma="en" xpos="SPS00">en</tok>
<tok lemma="aquex" xpos="PD0FP0">aqueixes</tok> 
<tok lemma="casa" xpos="NCFP000">cases</tok>

I need to change the attributes of certain elements whenever certain conditions are met. With the help of @LMC (see: https://stackoverflow.com/questions/73545510/python-and-lxml-extremely-slow-more-efficient-code/73545789 ), I optimized the initial code I had to process the XML files. Here's an exact copy of the code I'm using now.

# coding: utf-8
import os
import lxml.etree as et


# Directory tree that os.walk() scans for input XML files (placeholder path).
ROOT = '/Path-to-input-xml-files'
# Suffix a file name must end with to be processed. NOTE: the parentheses do
# not create a tuple; this is the plain string '.xml'.
ext = ('.xml')


# Each rule is (new xpos, new lemma, lowercase spellings). Every spelling is
# accepted in exactly three casings: lowercase, Capitalized and UPPERCASE.
# Rules are listed in priority order; the first rule that lists a spelling wins.
_DEMONSTRATIVE_RULES = (
    ('DD0MS0', 'aquest',
     ('aquest', 'aquast', 'aqast', 'aqax', 'aqest', 'aqet', 'aquet')),
    ('DD0MS0', 'aquell',
     ('aquel', 'aquell', 'aqal', 'aqual', 'aqueyl', 'aqueil')),
    ('DD0MP0', 'aquest',
     ('aquests', 'aquets', 'aquetz')),
    ('DD0MP0', 'aquell',
     ('aquells', 'aqueys', 'aqueyls')),
    ('DD0FP0', 'aquest',
     ('aquestas', 'aquestes', 'aquetes', 'aquastes', 'aquastas', 'aqastas',
      'aquexas')),
    ('DD0FP0', 'aquell',
     ('aqualas', 'aquelas', 'aqueles', 'aquellas', 'aquelles', 'aquales',
      'aqueylas', 'aqueyles')),
    # 'aquetes' is intentionally not repeated here: it is already claimed by
    # the feminine-plural 'aquest' rule above, which takes precedence.
    ('DD0FS0', 'aquest',
     ('aquesta', 'aquasta', 'aquaste', 'aqasta', 'aqaste', 'aquaxa', 'aqexa',
      'aquexa')),
    ('DD0FS0', 'aquell',
     ('aquala', 'aquale', 'aquela', 'aqueyla', 'aqueila')),
)


def _build_demonstrative_lookup(rules):
    """Map every accepted spelling to its (xpos, lemma) replacement pair."""
    lookup = {}
    for xpos, lemma, stems in rules:
        for stem in stems:
            for form in (stem, stem.capitalize(), stem.upper()):
                # setdefault keeps the earliest rule when a spelling repeats.
                lookup.setdefault(form, (xpos, lemma))
    return lookup


# Built once at import time so each token costs a single dict lookup.
_DEMONSTRATIVE_LOOKUP = _build_demonstrative_lookup(_DEMONSTRATIVE_RULES)


def xml_change(root_element):
    """Retag demonstratives that directly precede a noun token.

    Selects every ``tok`` element whose first following ``tok`` sibling has an
    ``xpos`` attribute starting with "N". When the text of such an element is
    one of the known demonstrative spellings, its current ``lemma`` and
    ``xpos`` are printed and both attributes are overwritten in place.

    Args:
        root_element: element exposing an lxml-style ``xpath()`` method; the
            query is absolute (``//tok``), so it covers the whole document.

    Returns:
        None. The tree is modified in place.
    """
    # Query the element passed in, not a module-level variable that merely
    # happens to carry the same tree.
    candidates = root_element.xpath(
        '//tok[following-sibling::tok[1][starts-with(@xpos, "N")]]'
    )
    for el in candidates:
        # el.text may be None (empty element); .get() then finds no match.
        replacement = _DEMONSTRATIVE_LOOKUP.get(el.text)
        if replacement is None:
            continue

        print('Current value is:', el.get('lemma'), el.get('xpos'))
        new_xpos, new_lemma = replacement
        el.set('xpos', new_xpos)
        el.set('lemma', new_lemma)
# iterate all dirs
# Walk ROOT recursively. The directory is bound to `dirpath` so that the name
# `root` refers only to the XML root element of the file being processed.
for dirpath, dirs, files in os.walk(ROOT):

    # iterate all files
    for file in files:
        if not file.endswith(ext):
            continue

        # Join the directory currently being walked (not ROOT) with the file
        # name, so files inside subdirectories resolve to their real path.
        file_path = os.path.join(dirpath, file)

        # load root element from file
        root = et.parse(file_path).getroot()

        # update the matching <tok> elements in place
        xml_change(root)

        # init tree object from root
        tree = et.ElementTree(root)

        # Replace only the trailing extension; str.replace() would also
        # rewrite any other '.xml' occurring earlier in the path.
        out_path = os.path.splitext(file_path)[0] + '-clean.xml'

        # save cleaned xml tree object to file. Important to specify encoding
        tree.write(
            out_path,
            encoding='utf-8',
            doctype='<!DOCTYPE document SYSTEM "estcorpus.dtd">',
            xml_declaration=True,
        )

@LMC's advice was indeed useful, and in a test run involving a few XML documents I noticed that the optimization resulted in a slight increase in speed. I think, however, that there is something fundamentally wrong with what I'm doing, because it's already been 38 hours and the process still has not finished. Granted, there are a lot of conditions that have to be checked, and processing these kinds of text documents is supposed to be slow. But 38 hours and counting on a pretty powerful computer (Mac Studio with M1 Max chip)? I have never experienced something like this.

I provide some more information that could be useful to people who have some experience working on similar projects. The total amount of XML documents I'm processing is 395 with a total size of 585 MB. The largest document is 34MB and the smallest is 3KB but most documents are between 100KB and 4MB.

Now, here's the odd thing. The speed of the process does not seem to be related to the length of the processed documents. It is as if the processing is done in bursts. All of a sudden I get a bunch of print statements (from print('Current value is:', el.get('lemma'), el.get('xpos'))) indicating that matches are found and a bunch of output documents of different sizes are generated.

However, after that a lot of hours can go by without any new print statements or output documents being generated. Here are a couple of screenshots of the directory where the output files are created so that you can see the time gaps between the creation of new files.

I cannot see much of a correlation between the size of the files and the time it takes to process them. At any rate, even if the file is large, it seems to me that 17 hours to process a single file is a bit too much. What do you think? Am I wrong and this is what should be expected with these kinds of jobs, or is there something I'm doing wrong? Is there anything I could do to make this faster?

Answer 1

There's something pathological going on here, there's no way it should take this long. Things I would try to isolate the cause:

(a) see if there is any network traffic generated.

(b) take a look at memory consumption to see if there's excessive paging or garbage collection

(c) reduce the processing you're doing on each document to something trivial to see if the problem is with parsing/saving the documents, or with the processing you are doing on each document.

Answer 2

There might be a problem with variable naming, since the root variable has two meanings in the code (the directory yielded by os.walk and the parsed XML root element), which could cause a memory problem.
Given the example below

>>> t = os.walk('/home/lmc/tmp/a')
>>> for root, dirs, files in t:
...     print(root)
...     root= uuid.uuid4()
...     print(root)
... 
/home/lmc/tmp/a
ab5839a8-43b5-4d9d-bbb3-4836c612abaf
/home/lmc/tmp/a/b
7a8ba22e-7a02-45d6-82ce-538e11b70e7d
/home/lmc/tmp/a/b/c
de7c0e08-edc4-43e6-9bc1-9b1d7dd7e9db
/home/lmc/tmp/a/b/c/f
2536e2dc-11d1-4b41-86fd-128c3eeaddbc
/home/lmc/tmp/a/b/c/f/g
7d7e61b0-31d4-4af4-9097-540fc2bbac1c
/home/lmc/tmp/a/b/d
1a671eb2-7efe-4dc4-891b-94d1710ef638
/home/lmc/tmp/a/b/d/e
420d5228-44f1-493d-9dae-e2005c4e0f61

So instead of a directory name root might be holding an xml element on each instance of that list.
Removing whitespace from the parsed tree could also reduce the number of nodes in the tree

# `root` is the directory currently being walked; the parsed XML root element
# gets its own name (`root_ele`) so the two are never confused.
for root, dirs, files in os.walk(ROOT):

    # iterate all files
    for file in files:
        if file.endswith(ext):
            # join the directory being walked (not ROOT) and file name, so
            # files in subdirectories resolve to their real path
            file_path = os.path.join(root, file)

            # load root element from file; remove_blank_text asks the parser
            # to discard whitespace-only text between tags.
            # The module is imported as `et`, so `etree.XMLParser` would
            # raise NameError.
            parser = et.XMLParser(remove_blank_text=True)
            root_ele = et.parse(file_path, parser).getroot()

            # update the matching <tok> elements in place
            xml_change(root_ele)

Finally, as suggested, changing the xpath search strategy also makes a difference

for el in root.xpath('//tok[starts-with(@xpos, "N")]/preceding-sibling::tok[1]'):

Processing of XML files excruciatingly slow with LXML Python

Question

2 answers

solution1
0 2022-09-01 14:37:23

solution2
0 2022-09-03 18:05:09

Processing of XML files excruciatingly slow with LXML Python

Question

2 answers

solution1 0 2022-09-01 14:37:23

solution2 0 2022-09-03 18:05:09

solution1
0 2022-09-01 14:37:23

solution2
0 2022-09-03 18:05:09