I want to merge two Kyoto Cabinet B-tree databases by key (using the Kyoto Cabinet Python API). The resulting list should contain each unique key (and its value) found in either of the two input databases.
The following code works, but I think it's ugly.
left_generator/right_generator are two cursor objects. It's especially odd that get() returns None if the generator is exhausted.
def merge_join_kv(left_generator, right_generator):
    """Merge two key-ordered cursors into one stream of unique (key, value) pairs.

    Both arguments are cursor-like objects exposing ``get()``, ``get_key()``,
    ``get_value()`` and ``next()``. ``get()`` returning ``None`` is treated as
    "no current record", i.e. the cursor is exhausted.

    Yields ``(key, value)`` tuples in ascending key order. When the same key
    is present under both cursors, the left cursor's value is yielded and
    both cursors are advanced, so every key appears exactly once.
    """

    def _current(cursor):
        # Snapshot the record under the cursor, or None once it is exhausted.
        if cursor.get() is None:
            return None
        return cursor.get_key(), cursor.get_value()

    def _advance(cursor):
        # A StopIteration from next() means the same thing as get() returning
        # None.  It must not escape: inside a generator it would be turned
        # into RuntimeError (PEP 479) instead of ending the iteration.
        try:
            cursor.next()
        except StopIteration:
            return None
        return _current(cursor)

    left = _current(left_generator)
    right = _current(right_generator)

    # Classic merge step while both sides still have a record.
    while left is not None and right is not None:
        if left[0] < right[0]:
            yield left
            left = _advance(left_generator)
        elif right[0] < left[0]:
            yield right
            right = _advance(right_generator)
        else:
            # Same key on both sides: emit it once, preferring the left value.
            yield left
            left = _advance(left_generator)
            right = _advance(right_generator)

    # At most one of the two drain loops below does any work.
    while left is not None:
        yield left
        left = _advance(left_generator)
    while right is not None:
        yield right
        right = _advance(right_generator)
More generally: is there a function or library that merge-joins generators using cmp()?
I think this is what you need; orderedMerge is based on Gnibbler's code but adds a custom key function and a unique argument:
import kyotocabinet
import collections
import heapq
class IterableCursor(kyotocabinet.Cursor, collections.Iterator):
    """A kyotocabinet cursor that can be iterated to get (key, value) pairs.

    Each step reads a record with ``self.get(True)`` and stops the iteration
    as soon as that call returns ``None``.
    """
    # NOTE(review): ``collections.Iterator`` is the Python 2 location of the
    # ABC; on Python 3 it lives in ``collections.abc`` -- confirm the target
    # interpreter before changing the base class.

    def __init__(self, *args, **kwargs):
        # All arguments are forwarded untouched to the kyotocabinet cursor.
        kyotocabinet.Cursor.__init__(self, *args, **kwargs)
        collections.Iterator.__init__(self)

    def __iter__(self):
        """Return self, so the cursor can be used directly in a for loop."""
        return self

    def next(self):
        """Return (key,value) pair"""
        # Presumably the True argument also steps the cursor to the following
        # record (see the Kyoto Cabinet Cursor.get documentation) -- verify.
        res = self.get(True)
        if res is None:
            raise StopIteration
        return res

    # Python 3 spelling of the iterator protocol method.
    __next__ = next
def orderedMerge(*iterables, **kwargs):
    """Take a list of ordered iterables; return as a single ordered generator.

    Every input must already be sorted ascending by ``key``. Items whose keys
    compare equal are produced in the order their iterables were passed in.

    @param key:    function, for each item return key value
                   (Hint: to sort descending, return negated key value)
    @param unique: boolean, return only first occurrence for each key value?
    """
    key = kwargs.get('key', (lambda x: x))
    unique = kwargs.get('unique', False)

    # Heap entries are [keyval, itnum, data, iterator].  itnum is distinct
    # for every input, so comparing two entries never gets as far as the
    # data or the iterator object.
    heap = []
    for itnum, it in enumerate(map(iter, iterables)):
        try:
            data = next(it)
            heap.append([key(data), itnum, data, it])
        except StopIteration:
            pass  # empty input contributes nothing
    heapq.heapify(heap)

    # A private sentinel rather than None, so that a genuine key of None is
    # not mistaken for "nothing yielded yet" when unique is set.
    nothing_yet = object()
    oldkeyval = nothing_yet

    while heap:
        entry = heap[0]  # entry with the smallest key
        keyval, _, data, it = entry

        is_duplicate = (
            unique and oldkeyval is not nothing_yet and keyval == oldkeyval
        )
        if not is_duplicate:
            yield data
            oldkeyval = keyval

        # Refill the entry from the iterator it came from.
        try:
            data = next(it)
            entry[0] = key(data)
            entry[2] = data
        except StopIteration:
            heapq.heappop(heap)  # that iterator is exhausted; drop it
        else:
            heapq.heapreplace(heap, entry)  # restore heap condition
Then your function can be written as:
from operator import itemgetter
def merge_join_kv(leftGen, rightGen):
    """Merge two cursors into one key-ordered stream without repeated keys.

    Both cursors are wrapped in IterableCursor and handed to orderedMerge,
    which orders items by their first element and keeps only the first item
    for each key.
    """
    # NOTE(review): this only works if kyotocabinet.Cursor can be constructed
    # from another cursor (a "copy initializer") -- unverified assumption.
    wrapped = [IterableCursor(leftGen), IterableCursor(rightGen)]
    first_element = itemgetter(0)
    return orderedMerge(*wrapped, key=first_element, unique=True)
Python 2.6 has a merge function in heapq, but it does not support a user-defined cmp/key function:
def merge(*iterables):
    '''Merge multiple sorted inputs into a single sorted output.

    Similar to sorted(itertools.chain(*iterables)) but returns a generator,
    does not pull the data into memory all at once, and assumes that each of
    the input streams is already sorted (smallest to largest).

    >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
    [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]

    '''
    # Heap entries are [value, itnum, iterator].  itnum is distinct for every
    # input, so equal values are ordered by input position and the iterator
    # objects themselves are never compared.
    heap = []
    for itnum, it in enumerate(map(iter, iterables)):
        try:
            heap.append([next(it), itnum, it])
        except StopIteration:
            pass  # empty input contributes nothing
    heapq.heapify(heap)

    while heap:
        entry = heap[0]  # entry holding the smallest value
        yield entry[0]
        try:
            entry[0] = next(entry[2])
        except StopIteration:
            heapq.heappop(heap)  # remove empty iterator
        else:
            heapq.heapreplace(heap, entry)  # restore heap condition
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.