简体   繁体   中英

Merge join two generators in Python

I want to merge two Kyoto Cabinet B-tree databases by key (using the Kyoto Cabinet Python API). The resulting list should contain each unique key (and its value) from either of the two input databases.

The following code works, but I think it's ugly.
left_generator/right_generator are two cursor objects. It's especially odd that get() returns None if the generator is exhausted.

def merge_join_kv(left_generator, right_generator):
    """Merge-join two key-ordered cursors into one stream of (key, value).

    Each distinct key is yielded exactly once, in ascending key order. When
    the same key is present in both inputs, the record of the left cursor
    wins and both cursors are advanced.

    The cursors are expected to provide:

    * ``get()``       -- ``None`` once the cursor is exhausted, anything
                         else while it still points at a record;
    * ``get_key()`` / ``get_value()`` -- key and value of the current record;
    * ``next()``      -- advance to the following record (it may raise
                         ``StopIteration`` when stepping past the end).

    @param left_generator:  cursor over the first, key-ordered input
    @param right_generator: cursor over the second, key-ordered input
    @return: generator of ``(key, value)`` tuples
    """
    def _advance(cursor):
        # Stepping off the last record may raise StopIteration; exhaustion is
        # detected through get() on the next pass, so it is safe to ignore.
        try:
            cursor.next()
        except StopIteration:
            pass

    while True:
        has_left = left_generator.get() is not None
        has_right = right_generator.get() is not None
        if not (has_left or has_right):
            return

        if has_left and has_right:
            left_key = left_generator.get_key()
            right_key = right_generator.get_key()
            # Plain comparisons instead of cmp(), which Python 3 removed.
            use_left = not (right_key < left_key)
            step_left = use_left
            step_right = not (left_key < right_key)
        else:
            # Only one side has records left: drain it. Never read from the
            # exhausted cursor, whose key and value are no longer valid.
            use_left = step_left = has_left
            step_right = has_right

        if use_left:
            yield left_generator.get_key(), left_generator.get_value()
        else:
            yield right_generator.get_key(), right_generator.get_value()

        if step_left:
            _advance(left_generator)
        if step_right:
            _advance(right_generator)

Generally: is there a function/library which merge-joins generators with cmp()?

I think this is what you need; orderedMerge is based on Gnibbler's code but adds a custom key function and a unique argument:

import kyotocabinet
import collections
import heapq

class IterableCursor(kyotocabinet.Cursor, collections.Iterator):
    """A kyotocabinet cursor usable as a Python (2.x) iterator of records.

    Iterating yields whatever ``Cursor.get(True)`` returns for each record
    and stops once it returns ``None``.
    """

    def __init__(self, *args, **kwargs):
        # Forward all arguments to the real cursor; the Iterator base class
        # takes none.
        kyotocabinet.Cursor.__init__(self, *args, **kwargs)
        collections.Iterator.__init__(self)

    def next(self):
        "Return (key,value) pair"
        # NOTE(review): get(True) is presumed to return the current record
        # and step the cursor forward -- confirm against the Cursor.get docs.
        res = self.get(True)
        if res is None:
            # get() signals exhaustion with None; translate that into the
            # iterator protocol.
            raise StopIteration
        return res

def orderedMerge(*iterables, **kwargs):
    """Take a list of ordered iterables; return as a single ordered generator.

    Every input must already be sorted ascending by ``key``. Items with equal
    keys are emitted in the order of the iterables they came from.

    @param key:     function, for each item return key value
                    (Hint: to sort descending, return negated key value)

    @param unique:  boolean, return only first occurrence for each key value?

    @return: generator yielding the items of all iterables in key order
    """
    key = kwargs.get('key', (lambda x: x))
    unique = kwargs.get('unique', False)

    # Each heap entry is a mutable list [keyval, itnum, data, iterator].
    # itnum is distinct per iterable, so comparing two entries never reaches
    # the (possibly unorderable) data or iterator slots.
    heap = []
    for itnum, it in enumerate(map(iter, iterables)):
        try:
            # builtin next() works for Python 2.6+ and Python 3 iterators
            data = next(it)
        except StopIteration:
            continue                        # empty iterable: nothing to merge
        heap.append([key(data), itnum, data, it])
    heapq.heapify(heap)

    # Private marker for "nothing yielded yet", so that a genuine key value
    # of None is not mistaken for a duplicate.
    nothing_yet = object()
    oldkeyval = nothing_yet

    # process iterators in ascending key order
    while heap:
        entry = heap[0]                     # entry with the smallest key
        keyval, _, data, it = entry

        # if unique, skip duplicate keys
        duplicate = (unique and oldkeyval is not nothing_yet
                     and keyval == oldkeyval)
        if not duplicate:
            yield data
            oldkeyval = keyval

        # load replacement value from same iterator
        try:
            data = next(it)
        except StopIteration:
            heapq.heappop(heap)             # remove empty iterator
        else:
            entry[0] = key(data)
            entry[2] = data
            heapq.heapreplace(heap, entry)  # restore heap condition

Then your function can be written as:

from operator import itemgetter

def merge_join_kv(leftGen, rightGen):
    """Merge two cursors into one generator of records with unique keys.

    Both cursors are wrapped in IterableCursor and handed to orderedMerge,
    which compares records on their first element and keeps only the first
    record seen for each key.
    """
    # assuming that kyotocabinet.Cursor has a copy initializer
    wrapped = [IterableCursor(cursor) for cursor in (leftGen, rightGen)]
    first_field = itemgetter(0)
    return orderedMerge(*wrapped, key=first_field, unique=True)

Python 2.6 has a merge function in heapq, but it does not support a user-defined cmp/key function:

def merge(*iterables):
    '''Merge multiple sorted inputs into a single sorted output.

    Similar to sorted(itertools.chain(*iterables)) but returns a generator,
    does not pull the data into memory all at once, and assumes that each of
    the input streams is already sorted (smallest to largest).

    >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
    [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]

    '''
    # Heap entries are mutable lists [value, itnum, iterator]; itnum is
    # distinct per input, so ties on value never compare the iterators.
    heap = []
    for itnum, it in enumerate(map(iter, iterables)):
        try:
            # builtin next() works for Python 2.6+ and Python 3 iterators
            heap.append([next(it), itnum, it])
        except StopIteration:
            pass                            # empty input contributes nothing
    heapq.heapify(heap)

    while heap:
        entry = heap[0]                     # entry holding the smallest value
        value, _, it = entry
        yield value
        try:
            entry[0] = next(it)
        except StopIteration:
            heapq.heappop(heap)             # remove empty iterator
        else:
            heapq.heapreplace(heap, entry)  # restore heap condition

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM