[英]merge join two generators in python
我想按鍵合並兩個 Kyoto Cabinet B 樹數據庫(使用 Kyoto Cabinet 的 Python API)。結果列表應包含出現在任一輸入數據庫中的每個唯一鍵(及其值)。
以下代碼有效,但我認為它很丑。
left_generator / right_generator 是兩個游標(Cursor)對象。尤其奇怪的是:當游標用盡時,get() 返回的是 None。
def merge_join_kv(left_generator, right_generator):
    """Merge two key-ordered cursors into one stream of unique (key, value) pairs.

    Both arguments must provide ``get(step)``, which returns the current
    ``(key, value)`` record (advancing the cursor when ``step`` is true) or
    ``None`` once the cursor is exhausted.  Records are yielded in ascending
    key order.  When the same key is present on both sides, only the record
    from ``left_generator`` is yielded and both cursors move past that key.

    Note: the merge reads one record ahead, so each cursor ends up positioned
    one record past the last one taken from it.
    """
    left = left_generator.get(True)
    right = right_generator.get(True)

    # Exhaustion is tested explicitly with ``is None`` rather than by
    # comparing keys against None, which mis-orders the streams on Python 2
    # and raises TypeError on Python 3.
    while left is not None or right is not None:
        if right is None:
            take_left, take_right = True, False
        elif left is None:
            take_left, take_right = False, True
        else:
            take_left = left[0] <= right[0]
            take_right = right[0] <= left[0]

        # On equal keys both flags are set: the left record wins and the
        # right duplicate is skipped.
        key, value = left if take_left else right
        yield key, value

        if take_left:
            left = left_generator.get(True)
        if take_right:
            right = right_generator.get(True)
通常:是否有一個函數/庫,將帶有cmp()的生成器合並在一起?
我認為這就是您所需要的:orderedMerge 基於 gnibbler 的代碼,但增加了自定義鍵函數(key)和去重(unique)兩個參數。
import kyotocabinet
import collections
import heapq
class IterableCursor(kyotocabinet.Cursor, collections.Iterator):
    """A Kyoto Cabinet cursor that can be consumed through the iterator protocol.

    Each iteration step returns the record under the cursor and moves the
    cursor forward; iteration ends when the cursor has no more records.
    """

    def __init__(self, *args, **kwargs):
        # Both bases are initialised explicitly; all arguments are forwarded
        # unchanged to the underlying cursor.
        kyotocabinet.Cursor.__init__(self, *args, **kwargs)
        collections.Iterator.__init__(self)

    def next(self):
        """Return the current (key, value) pair and step the cursor.

        Raises:
            StopIteration: when ``get(True)`` returns None, i.e. the cursor
                has no further record to offer.
        """
        res = self.get(True)
        if res is None:
            raise StopIteration
        return res

    # Python 3 looks up ``__next__`` instead of ``next``; expose both names.
    __next__ = next
def orderedMerge(*iterables, **kwargs):
    """Take a list of ordered iterables; return as a single ordered generator.

    Every input must already be sorted ascending by ``key``.  Items with equal
    keys coming from different inputs are emitted in the order the inputs were
    passed.

    @param key:    function, for each item return key value
                   (Hint: to sort descending, return negated key value)
    @param unique: boolean, return only first occurrence for each key value?
                   An item is skipped when its key equals the key of the
                   previously yielded item.
    @return: generator over the merged items (the items themselves, not keys)
    """
    key = kwargs.get('key', (lambda x: x))
    unique = kwargs.get('unique', False)

    # One heap entry per non-empty input: [key, input index, item, iterator].
    # The input index is distinct for every entry, so list comparison is
    # always decided by key or index and never reaches the item or iterator.
    heap = []
    for itnum, it in enumerate(map(iter, iterables)):
        try:
            data = next(it)
        except StopIteration:
            continue  # empty input contributes nothing
        heap.append([key(data), itnum, data, it])
    heapq.heapify(heap)

    # A private sentinel marks "nothing yielded yet", so that a genuine key
    # of None is not mistaken for a duplicate on the first item.
    nothing_yet = object()
    oldkeyval = nothing_yet

    while heap:
        entry = heap[0]  # entry with the smallest key
        keyval, data = entry[0], entry[2]

        is_duplicate = (
            unique and oldkeyval is not nothing_yet and keyval == oldkeyval
        )
        if not is_duplicate:
            yield data
            oldkeyval = keyval

        # Refill the entry from the same input, or drop the input when it
        # has run dry.
        try:
            data = next(entry[3])
        except StopIteration:
            heapq.heappop(heap)
            continue
        entry[0] = key(data)
        entry[2] = data
        heapq.heapreplace(heap, entry)  # restore heap condition
這樣,你的函數就可以寫成:
from operator import itemgetter
def merge_join_kv(leftGen, rightGen):
    """Merge two cursors by key, keeping one record per distinct key.

    Each argument is wrapped in an IterableCursor and the pair is handed to
    orderedMerge, keyed on element 0 of every record with ``unique=True``.
    Returns whatever orderedMerge returns for those inputs.
    """
    # NOTE(review): relies on kyotocabinet.Cursor accepting another cursor
    # as its constructor argument (a copy initializer) -- confirm.
    wrapped = (IterableCursor(leftGen), IterableCursor(rightGen))
    record_key = itemgetter(0)
    return orderedMerge(*wrapped, key=record_key, unique=True)
Python 2.6 的 heapq 模塊中已有 merge 函數,但它不支持用戶自定義的 cmp / key 函數:
def merge(*iterables):
    '''Merge multiple sorted inputs into a single sorted output.

    Similar to sorted(itertools.chain(*iterables)) but returns a generator,
    does not pull the data into memory all at once, and assumes that each of
    the input streams is already sorted (smallest to largest).

    Equal values from different inputs are emitted in the order the inputs
    were passed.

    >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
    [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]
    '''
    # One heap entry per non-empty input: [value, input index, iterator].
    # The input index is distinct per entry, so comparisons never reach the
    # iterator object.
    heap = []
    for itnum, it in enumerate(map(iter, iterables)):
        try:
            heap.append([next(it), itnum, it])
        except StopIteration:
            pass  # empty input contributes nothing
    heapq.heapify(heap)

    while heap:
        entry = heap[0]  # entry holding the smallest pending value
        yield entry[0]
        try:
            entry[0] = next(entry[2])
        except StopIteration:
            heapq.heappop(heap)  # remove exhausted input
        else:
            heapq.heapreplace(heap, entry)  # restore heap condition
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.