Combining large xml files efficiently with python

I have about 200 xml files ranging from 5MB to 50MB, with 80% being under 10MB. These files contain multiple elements with both overlapping and unique data. My goal is to combine all these files by performing a logical union over all the elements.

The code seems to work, but it gets exponentially slower the more files it has to process. For example, it takes about 20 seconds to process the first 5 files, about a minute for the next five, about 5 minutes for the five after that, and so on, while also using significantly more memory than the sum total of all the files. As I type this, the overall run is in its 4th hour.

This is obviously a 'to be expected' effect, considering that each lookup has to search an ever larger merged tree. Still, I wonder if there are ways to at least diminish this effect.

I have tried implementing a simple form of caching, but I didn't notice any significant improvement.

I also tried multiprocessing, which does help, but it adds extra complexity and pushes the problem to the hardware level, which does not feel very optimal.
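For context, the multiprocessing attempt was roughly along these lines (a simplified sketch built around the process_elements function shown below; the worker count, the chunking, and the final merge of the temp<N>.xml partial results are illustrative rather than the exact code I ran):

import os
from multiprocessing import Pool

def merge_chunk(args):
    files, identifier = args
    # each worker builds the union of its chunk and writes it to temp<identifier>.xml
    process_elements(files, identifier)

if __name__ == '__main__':
    all_files = sorted(os.listdir('data'))
    n_workers = 4
    chunks = [all_files[i::n_workers] for i in range(n_workers)]
    with Pool(n_workers) as pool:
        pool.map(merge_chunk, [(chunk, n) for n, chunk in enumerate(chunks)])
    # the temp<N>.xml partial results still need one final single-process union pass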

Is there something I can do to improve the performance in any way?

Note: I had to obfuscate parts of the code and data for confidentiality reasons. Please don't hesitate to let me know if that breaks the example.

code:

import os
import time

import lxml.etree
from lxml.etree import Element

# Edit2: added timing prints

def process_elements(files: list[str],
                     indentifier: int) -> lxml.etree._Element | None:

    base_el = Element('BASE')
    i = 0
    cache = {}  # Edit1. Missed this line

    
    start = time.time()
    time_spent_reading = 0
    lookup_time = [0, 0]
    append_new_el_time = [0, ]
    cache_search_time = [0, 0]
    recursive_calls_counter = [0, ]


    for file in files:
        i += 1
        print(f"Process: {indentifier}, File {i} of {len(files)}: {file}")
        print("Reading file...")
        start_read = time.time()

        tree = lxml.etree.parse(f'data/{file}').getroot()
        print(f"Reading file took {time.time() - start_read} seconds")
        print("Since start: ", time.time() - start)

        packages = tree.find('BASE')
        

        print("Starting walk...")
        sart_walked = time.time()
        for package in packages:
            walk(package, base_el, cache, lookup_time,
                 append_new_el_time, cache_search_time,
                 recursive_calls_counter)
            
        print(f"Walk took {time.time() - sart_walked} seconds")
        print("Since start: ", time.time() - start)

    if indentifier == -1:
        return base_el
    else:
        print("Timing results:")
        print("Time spent reading: ", time_spent_reading)
        print("Time spent on lookup: ", lookup_time[0])
        print("Time spent on append: ", append_new_el_time[0])
        print("Time spent on cache search: ", cache_search_time[0])

        base_el.getroottree().write(
            f'temp{indentifier}.xml', encoding='utf-8')
        return None
    

def walk(element: lxml.etree._Element,
         reference: lxml.etree._Element,
         cache: dict,
         lookup_time,
         append_new_el_time,
         cache_search_time,
         recursive_calls_counter) -> None:

    recursive_calls_counter[0] += 1

    children = element.iterchildren()
    elid = f"{element.tag}"
    element_name = element.get('some-id-i-need')
    if element_name is not None:
        elid += f'[@some-id-i-need="{element_name}"]'

    cache_id = str(id(reference)) + "_" + elid

    cache_search_time_start = time.time()
    relevant_data = cache.get(cache_id)
    cache_search_time[0] += time.time() - cache_search_time_start

    # if element is found either in cache or in the new merged object
    # continue to its children
    # otherwise, element does not exist in merged object. 
    # Add it to the merged object and to cache

    if relevant_data is None:
        # I believe this lookup may be what takes the most time
        # hence my attempt to cache this
        lookup_time_start = time.time()
        relevant_data = reference.find(elid)   
        lookup_time[0] += time.time() - lookup_time_start
        lookup_time[1] += 1
    else:
        # cache hit
        cache_search_time[1] += 1

    if relevant_data is None:
        append_new_el_time_start = time.time()
        reference.append(element)
        append_new_el_time[0] += time.time() - append_new_el_time_start
        return

    else:
        cache.setdefault(cache_id, relevant_data)
        # if element has no children, loop will not run
        for child in children:
            walk(child, relevant_data, cache, lookup_time,
                 append_new_el_time,
                 cache_search_time,
                 recursive_calls_counter)


# to run: process_elements(os.listdir("data"), -1)

example data:

file1

<BASE>
    <elem id="1">
        <data-tag id="1">
            <object id="23124">
                <POS Tag="V" />
                <grammar type="STEM" />
                <Aspect type="IMPV" />
                <Number type="S" />
            </object>
            <object id="128161">
                <POS Tag="V" />
                <grammar type="STEM" />
                <Aspect type="IMPF" />
            </object>
        </data-tag>

    </elem>
</BASE>

file2

<BASE>
    <elem id="1">
        <data-tag id="1">
            <object id="23124">

                <concept type="t1" />
            </object>
            <object id="128161">

                <concept type="t2" />
            </object>
        </data-tag>
        <data-tag id="2">
            <object id="128162">
                <POS Tag="P" />
                <grammar type="PREFIX" />
                <Tag Tag="bi+" />
                <concept type="t3" />
            </object>
        </data-tag>
    </elem>
</BASE>

result:

<BASE>
    <elem id="1">
        <data-tag id="1">
            <object id="23124">
                <POS Tag="V" />
                <grammar type="STEM" />
                <Aspect type="IMPV" />
                <Number type="S" />
                <concept type="t1" />
            </object>
            <object id="128161">
                <POS Tag="V" />
                <grammar type="STEM" />
                <Aspect type="IMPF" />
                <concept type="t2" />
            </object>
        </data-tag>
        <data-tag id="2">
            <object id="128162">
                <POS Tag="P" />
                <grammar type="PREFIX" />
                <Tag Tag="bi+" />
                <concept type="t3" />
            </object>
        </data-tag>
    </elem>
</BASE>

Edit2: Timing results after processing 10 files (about 60MB, 1m 24.8s):


Starting process...
Process: 102, File 1 of 10:
Reading file...
Reading file took 0.1326887607574463 seconds
Since start:  0.1326887607574463
preprocesing...
merging...
Starting walk...
Walk took 0.8433401584625244 seconds
Since start:  1.0600318908691406
Process: 102, File 2 of 10:
Reading file...
Reading file took 0.04700827598571777 seconds
Since start:  1.1070401668548584
preprocesing...
merging...
Starting walk...
Walk took 1.733034610748291 seconds
Since start:  2.8680694103240967
Process: 102, File 3 of 10:
Reading file...
Reading file took 0.041702985763549805 seconds
Since start:  2.9097723960876465
preprocesing...
merging...
...
Time spent on lookup:  79.53011083602905
Time spent on append:  1.1502337455749512
Time spent on cache search:  0.11017322540283203
Cache size:  30176

# Edit3: extra data
Number of cache hits:  112503
Cache size:  30177
Number of recursive calls:  168063


As an observation, I do expect significant overlap between the files. Maybe the small cache search time indicates that something is wrong with how I implemented the caching?

Edit3: It does seem that I get a lot of cache hits. But the strange part is that if I comment out the cache lookup, it makes almost no difference in performance. In fact, it ran marginally faster without it (although I am not sure whether a few seconds is a significant difference or just random chance in this case):

relevant_data = None  # cache.get(cache_id)

log with cache commented out:

Time spent on lookup:  71.13456320762634
Number of lookups:  168063
Time spent on append:  3.9656710624694824
Time spent on cache search:  0.020023584365844727
Number of cache hits:  0
Cache size:  30177
Number of recursive calls:  168063

Caching all identifiers while proceeding seems to work well and doesn't significantly slow down as more data is added.

The following code does this:

def xml_union(files, loader):
    existing = {}
    path = []

    def populatewalk(elem):
        pid = elem.get('id')
        ident = (elem.tag, pid)
        path.append(ident)
        if pid is not None:
            existing[tuple(path)] = elem
        for child in elem:
            populatewalk(child)
        popped = path.pop()
        assert popped is ident

    def walk(existing_parent, elem):
        pid = elem.get('id')
        
        if pid is None:
            existing_parent.append(elem)
            # make sure children are populated
            return populatewalk(elem)

        ident = (elem.tag, pid)
        path.append(ident)
        tpath = tuple(path)
        existing_elem = existing.get(tpath)
        if existing_elem is None:
            existing_parent.append(elem)
            existing[tpath] = elem
            for child in elem:
                populatewalk(child)
        else:
            existing_elem.attrib.update(elem.items())
            for child in elem:
                walk(existing_elem, child)

        popped = path.pop()
        assert popped is ident        

    first, *remain = files
    root = loader(first)
    for elem in root:
        populatewalk(elem)

    for text in remain:
        ri = loader(text)
        if root.tag != ri.tag:
            raise ValueError(f"root tag {root.tag!r} does not equal {ri.tag!r}")
        for elem in ri:
            walk(root, elem)
    
    return root

The above code assumes that you always want to use an id attribute to identify elements, but that should be easy to change. It is also slightly more general in that it keeps track of the element hierarchy when doing the union, while your code only seems to care that an element with a given ID can be found. Not sure if that matters!
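For example (a sketch, not part of the tested code above), the identity lookup could be factored out into a key function supplied by the caller, so the some-id-i-need attribute from your question could be used instead of id:

from lxml import etree

# Hypothetical key function: inside xml_union, the two elem.get('id') calls
# would become ident_key(elem), with ident_key accepted as an extra parameter.
def ident_key(elem: etree._Element) -> str | None:
    return elem.get('some-id-i-need') or elem.get('id')

print(ident_key(etree.fromstring('<object some-id-i-need="23124"/>')))  # -> 23124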

This can be tested with the following line, with f1 and f2 set to strings containing the example files you posted above.

print(etree.tostring(xml_union([f1, f2], etree.fromstring)).decode())
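For instance, a self-contained check using shortened versions of your two example files (trimmed here only to keep the snippet small):

from lxml import etree

f1 = '''<BASE>
  <elem id="1">
    <data-tag id="1">
      <object id="23124"><POS Tag="V" /></object>
    </data-tag>
  </elem>
</BASE>'''

f2 = '''<BASE>
  <elem id="1">
    <data-tag id="1">
      <object id="23124"><concept type="t1" /></object>
    </data-tag>
  </elem>
</BASE>'''

print(etree.tostring(xml_union([f1, f2], etree.fromstring)).decode())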

Writing this didn't take too long, but convincing myself it is somewhat correct and performant took longer. I ended up writing a test harness that generates 10 files of ~12MiB each, runs the above code on them, writes the result to a ~87MiB file, and then makes sure that file is exactly the union of what was generated. The part that uses xml_union looks like:

from time import time
from lxml import etree

def fileloader(path):
    print(f"loading {path}")
    return etree.parse(path).getroot()

t0 = time()
new_root = xml_union(
    [f'large-{i:02}.xml' for i in range(10)],
    fileloader,
)
t1 = time()
with open(f'merged.xml', 'wb') as fd:
    print("writing merged")
    etree.ElementTree(new_root).write(fd, pretty_print=True)
t2 = time()

print(f"union={t1-t0:.2f} write={t2-t1:.2f}")

My 1.6GHz laptop takes ~23 seconds to merge these files, with no slowdown noticed on the later files. Writing the resulting object takes ~2 seconds.

The test harness is much more fiddly, and looks like:

from itertools import product
from random import choices

from lxml import etree

# `terminals` is not shown in the snippet above; it is assumed here to be the
# set of leaf tag names that appear in the question's example data:
terminals = ['POS', 'grammar', 'Aspect', 'Number', 'Tag', 'concept']

def randomizer():
    num = 1
    def next(n, rb):
        nonlocal num
        for _ in range(n):
            yield num, choices(rb, k=len(terminals))
            num += 1
    return next

rootids = list(range(10))
roots = [etree.Element('BASE') for _ in rootids]
obj_elems = {}
dt_elems = {}
el_elems = {}

def get_obj(root_id, el_id, dt_id, obj_id):
    obj = obj_elems.get((root_id, obj_id))
    if obj is not None:
        return obj

    obj = obj_elems[(root_id, obj_id)] = etree.Element('object', id=str(obj_id))
    dt = dt_elems.get((root_id, dt_id))
    if dt is not None:
        dt.append(obj)
        return obj

    dt = dt_elems[(root_id, dt_id)] = etree.Element('data-tag', id=str(dt_id))
    dt.append(obj)
    el = el_elems.get((root_id, el_id))
    if el is not None:
        el.append(dt)
        return obj

    el = el_elems[(root_id, el_id)] = etree.Element('elem', id=str(el_id))
    el.append(dt)
    roots[root_id].append(el)
    return obj

elmaker = randomizer()
dtmaker = randomizer()
objmaker = randomizer()

for el_id, el_roots in elmaker(1000, rootids):
    for dt_id, dt_roots in dtmaker(100, el_roots):
        for obj_id, obj_roots in objmaker(len(terminals), dt_roots):
            for key, root_id in zip(terminals, obj_roots):
                get_obj(root_id, el_id, dt_id, obj_id).append(
                    etree.Element(key, an='val')
                )

for root_id, root in zip(rootids, roots):
    with open(f'large-{root_id:02}.xml', 'wb') as fd:
        et = etree.ElementTree(root)
        et.write(fd, pretty_print=True)

nelem = 1000
ndt = 100
nterm = len(terminals)

expected_elems = set(str(i+1) for i in range(nelem))
expected_dts = set(str(i+1) for i in range(nelem*ndt))
expected_objs = set(str(i+1) for i in range(nelem*ndt*nterm))
expected_terms = set(product(expected_objs, terminals))

elem_seen = set()
dt_seen = set()
obj_seen = set()
terms_seen = set()

def check(el, tag, seen):
    assert el.tag == tag
    aid = el.attrib['id']
    assert aid not in seen
    seen.add(aid)
    return aid

for elem in etree.parse('merged.xml').getroot():
    check(elem, 'elem', elem_seen)
    for dt in elem:
        check(dt, 'data-tag', dt_seen)
        for obj in dt:
            obj_id = check(obj, 'object', obj_seen)
            for term in obj:
                assert term.tag in terminals
                term_id = (obj_id, term.tag)
                assert term_id not in terms_seen
                terms_seen.add(term_id)

assert elem_seen == expected_elems
assert dt_seen == expected_dts
assert obj_seen == expected_objs
assert terms_seen == expected_terms

Hopefully that test harness is useful to somebody else!
