
Python: Unpack a list of objects to Dictionary

I have a list of objects that needs to be unpacked into a dictionary efficiently. There are more than 2,000,000 objects in the list, and the operation takes more than 1.5 hours to complete. I would like to know if this can be done more efficiently. The objects in the list are based on this class.

class ResObj:
    def __init__(self, index, result):
        self.loc = index   ### This is the location where the values should go in the final result dictionary
        self.res = result  ### This is a dictionary that has the values for this location.

# example attribute values for one object:
#   obj.loc = 2
#   obj.res = {'value1': 5.4, 'value2': 2.3,
#              'valuen': {'sub_value1': 4.5, 'sub_value2': 3.4, 'sub_value3': 7.6}}

Currently I use this method to perform this operation.

import numpy

def make_final_result(list_of_results):
    no_sub_result_variables = ['value1', 'value2']
    sub_result_variables = ['valuen']
    sub_value_variables = ['sub_value1', 'sub_value2', 'sub_value3']

    final_result = {}
    num_of_results = len(list_of_results)
    for var in no_sub_result_variables:
        final_result[var] = numpy.zeros(num_of_results)
    for var in sub_result_variables:
        final_result[var] = {sub_var: numpy.zeros(num_of_results) for sub_var in sub_value_variables}

    for obj in list_of_results:
        i = obj.loc
        result = obj.res
        for var in no_sub_result_variables:
            final_result[var][i] = result[var]
        for var in sub_result_variables:
            for name in sub_value_variables:
                try:
                    final_result[var][name][i] = result[var][name]
                except KeyError as e:
                    ##TODO Add a debug check
                    pass

    return final_result

I have tried using multiprocessing.Manager().dict() and Manager().Array() to parallelize this, but I could only get 2 processes to work (even though I manually set the number of processes to the number of CPUs, 24). Can you please suggest a faster method to improve the performance? Thank you.

Having nested numpy arrays doesn't seem like the best way to structure your data. You can use numpy's structured arrays to create a more intuitive data structure.

import numpy as np

# example values
values = [
    {
        "v1": 0,
        "v2": 1,
        "vs": {
            "x": 2,
            "y": 3,
            "z": 4,
        }
    },
    {
        "v1": 5,
        "v2": 6,
        "vs": {
            "x": 7,
            "y": 8,
            "z": 9,
        }
    }
]

def value_to_record(value):
    """Take a dictionary and convert it to an array-like format"""
    return (
        value["v1"],
        value["v2"],
        (
            value["vs"]["x"],
            value["vs"]["y"],
            value["vs"]["z"]
        )
    )

# define what a record looks like -- f8 is an 8-byte float
dtype = [
    ("v1", "f8"),
    ("v2", "f8"),
    ("vs", [
        ("x", "f8"),
        ("y", "f8"),
        ("z", "f8")
    ])
]           

# create actual array
arr = np.fromiter(map(value_to_record, values), dtype=dtype, count=len(values))

# access individual record
print(arr[0]) # prints (0.0, 1.0, (2.0, 3.0, 4.0))
# access specific value
assert arr[0]['vs']['x'] == 2
# access all values of a specific field
print(arr['v2']) # prints [ 1.  6.]
assert arr['v2'].sum() == 7

Generating the data this way created a 2,000,000-record array in about 2 seconds on my machine.
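
As a rough way to reproduce that measurement (a minimal timing harness, not part of the original answer: it reuses value_to_record and dtype from the snippet above, and the make_value helper is made up to generate synthetic input):

import time
import numpy as np

def make_value(i):
    # build a synthetic value dict shaped like the examples above
    return {"v1": float(i), "v2": float(i + 1),
            "vs": {"x": float(i + 2), "y": float(i + 3), "z": float(i + 4)}}

n = 2000000
values = [make_value(i) for i in range(n)]

start = time.time()
arr = np.fromiter(map(value_to_record, values), dtype=dtype, count=n)
print("built %d records in %.2f s" % (len(arr), time.time() - start))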

To make this work for your ResObj objects, sort them by the loc attribute and then pass each object's res attribute to the value_to_record function, as sketched below.
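
A minimal sketch of that adaptation, assuming the ResObj class from the question (res_to_record and results_to_array are hypothetical helper names, and the loc values are assumed to cover 0..N-1 without gaps):

import numpy as np

# dtype mirroring the res dictionaries from the question
dtype = [
    ("value1", "f8"),
    ("value2", "f8"),
    ("valuen", [
        ("sub_value1", "f8"),
        ("sub_value2", "f8"),
        ("sub_value3", "f8"),
    ]),
]

def res_to_record(res):
    # flatten one ResObj.res dictionary into a record tuple
    sub = res["valuen"]
    return (res["value1"], res["value2"],
            (sub["sub_value1"], sub["sub_value2"], sub["sub_value3"]))

def results_to_array(list_of_results):
    # sort by loc so that record i of the array holds the values for location i
    ordered = sorted(list_of_results, key=lambda obj: obj.loc)
    return np.fromiter((res_to_record(obj.res) for obj in ordered),
                       dtype=dtype, count=len(ordered))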

You can distribute the work among processes by key name.
Here I create a pool of workers and pass each of them a var and an optional subvar name.
The huge dataset is shared with the workers via a cheap fork.
Unpacker.unpack picks the specified vars from each ResObj and returns them as an np.array.
The main loop in make_final_result combines the arrays into final_result.
Python 2:

from collections import defaultdict
from multiprocessing import Pool
import numpy as np

class ResObj(object):
    def __init__(self, index=None, result=None):
        self.loc = index   ### This is the location where the values should go in the final result dictionary
        self.res = result  ### This is a dictionary that has the values for this location.

        # hard-coded example data so that ResObj() can serve as a test stub below
        self.loc = 2
        self.res = {'value1':5.4, 'value2':2.3, 'valuen':{'sub_value1':4.5, 'sub_value2':3.4, 'sub_value3':7.6}}

class Unpacker(object):
    @classmethod
    def cls_init(cls, list_of_results):
        cls.list_of_results = list_of_results

    @classmethod
    def unpack(cls, var, name):

        list_of_results = cls.list_of_results
        result = np.zeros(len(list_of_results))
        if name is None:
            for i, it in enumerate(list_of_results):
                result[i] = it.res[var]
        else:
            for i, it in enumerate(list_of_results):
                result[i] = it.res[var][name]
        return var, name, result

#Pool.map doesn't accept instancemethods so the use of a wrapper
def Unpacker_unpack((var, name),):
    return Unpacker.unpack(var, name)


def make_final_result(list_of_results):
    no_sub_result_variables = ['value1', 'value2']
    sub_result_variables = ['valuen']
    sub_value_variables = ['sub_value1', 'sub_value2', 'sub_value3']

    pool = Pool(initializer=Unpacker.cls_init, initargs=(list_of_results, ))
    final_result = defaultdict(dict)

    def key_generator():
        for var in no_sub_result_variables:
            yield var, None
        for var in sub_result_variables:
            for name in sub_value_variables:
                yield var, name

    for var, name, result in pool.imap(Unpacker_unpack, key_generator()):
        if name is None:
            final_result[var] = result
        else:
            final_result[var][name] = result
    return final_result

if __name__ == '__main__':
    print make_final_result([ResObj() for x in xrange(10)])

Ensure that you are not on Windows: it lacks fork, so multiprocessing would have to pickle and pipe the entire dataset to each of the 24 worker processes.
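
If you move this code to Python 3, note that tuple parameter unpacking in function signatures was removed (PEP 3113), so the Unpacker_unpack wrapper would need to unpack its argument explicitly, roughly like this:

# Python 3 version of the wrapper: unpack the (var, name) tuple in the body
def Unpacker_unpack(args):
    var, name = args
    return Unpacker.unpack(var, name)

The print statement and xrange in the test block would likewise become print(...) and range.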
Hope this helps.

Remove some indentation so that your loops are not nested:

for obj in list_of_results:
    i = obj.loc
    result = obj.res
    for var in no_sub_result_variables:
        final_result[var][i] = result[var]
    for var in sub_result_variables:
        for name in sub_value_variables:
            try:
                final_result[var][name][i] = result[var][name]
            except KeyError as e:
                ##TODO Add a debug check
                pass
