[英]Python multiprocessing merge dictionaries of dictionaries from multiple processes
我試圖在多個進程之間使用共享內存來更新包含字典的字典,而字典中包含的字典...我嘗試從多處理模塊中使用Manager,但是在向其中添加字典時遇到了困難。 請參閱下面的代碼和注釋。 本質上,該代碼應該在另一個稱為“輸出”的詞典中創建輸入的副本。 一旦完成此工作,將存在僅從輸入中復制某些“刀片”的邏輯,但是必須保持節點/群集/刀片的層次結構。
from multiprocessing import Process, Lock, Manager
# Define dictionary that will be used in this example
input = {
"Node_1": {
"IP": "127.0.0.1",
"clusters": {
"cluster_1": {
"blades": {
"blade_0_1": "127.0.1.1",
"blade_0_2": "127.0.1.2"
}
},
"cluster_2": {
"blades": {
"blade_0_3": "127.0.1.3",
"blade_0_4": "127.0.1.4"
}
}
}
},
"Node_2": {
"IP": "127.0.0.2",
"clusters": {
"cluster_1": {
"blades": {
"blade_0_1": "127.0.1.1",
"blade_0_2": "127.0.1.2"
}
},
"cluster_2": {
"blades": {
"blade_0_3": "127.0.1.3",
"blade_0_4": "127.0.1.4"
}
}
}
}
}
def iterate_over_clusters_in_node(input, node, lock, output):
""" Iterate over the clusters in the node, then over the blades in the cluster. Add each blade to the output dictionary."""
for cluster in input[node]['clusters']:
for blade in input[node]['clusters'][cluster]['blades']:
with lock:
print "node: " + node + ", node_IP: " + input[node]['IP'] + ", cluster: " + cluster + ", Blade: " + blade + ", cluster_IP: " + input[node]['clusters'][cluster]['blades'][blade]
with lock:
add_blade_to_output(input, node, cluster, blade, output)
def add_blade_to_output(input, node, cluster, blade, output):
''' Adds a blade to the managed output dictionary'''
if node not in output:
output[node] = {}
output[node]['IP'] = input[node]['IP']
output[node]['clusters'] = {}
# At this point, I would expect output[node]['IP'] and output[node]['clusters'] to exist
# But the following print raises KeyError: 'IP'
print output[node]['IP']
if cluster not in output[node]['clusters']:
# Raises KeyError: 'clusters'
output[node]['clusters'][cluster] = {}
output[node]['clusters'][cluster]['blades'] = {}
output[node]['clusters'][cluster]['blades'][blade] = input[node]['clusters'][cluster]['blades'][blade]
if __name__ == "__main__":
# Create lock to ensure correct handling of output from multiple processes
lock = Lock()
# Create dictionary to hold any failed blades so that appropriate measures can be taken
# Must use a Manager so that the dictionary can be shared among processes
manager = Manager()
output = manager.dict()
# Create array to track our processes
procs = []
# Iterate over all nodes in input
for node in input:
p = Process(target = iterate_over_clusters_in_node, args = (input, node, lock, output))
p.start()
procs.append(p)
# Join processes and wait for them to stop
for p in procs:
p.join()
print "The final output is:"
print output
# Expectation: should print the same dictionary as the input
# Actual: prints "{'Node_2': {}, 'Node_1': {}}"
是否需要將manager.dict()添加到output [node]而不是內置的字典類型? 還是我要解決所有這些錯誤?
謝謝!
編輯:我不反對將其切換到“線程”實現而不是“多重處理”。 我是並行運行事物的新手,因此,如果線程更適合此類內存共享,請告訴我。
編輯:這是工作代碼:
from multiprocessing import Process, Lock, Manager
# Define dictionary that will be used in this example
input = {
"Node_1": {
"IP": "127.0.0.1",
"clusters": {
"cluster_1": {
"blades": {
"blade_0_1": "127.0.1.1",
"blade_0_2": "127.0.1.2"
}
},
"cluster_2": {
"blades": {
"blade_0_3": "127.0.1.3",
"blade_0_4": "127.0.1.4"
}
}
}
},
"Node_2": {
"IP": "127.0.0.2",
"clusters": {
"cluster_1": {
"blades": {
"blade_0_1": "127.0.1.1",
"blade_0_2": "127.0.1.2"
}
},
"cluster_2": {
"blades": {
"blade_0_3": "127.0.1.3",
"blade_0_4": "127.0.1.4"
}
}
}
}
}
# Create dictionary to hold any failed blades so that appropriate measures can be taken
# Must use a Manager so that the dictionary can be shared among processes
manager = Manager()
output = manager.dict()
def iterate_over_clusters_in_node(input, node, lock):
""" Iterate over the clusters in the node, then over the blades in the cluster. Add each blade to the output dictionary."""
for cluster in input[node]['clusters']:
for blade in input[node]['clusters'][cluster]['blades']:
with lock:
add_blade_to_output(input, node, cluster, blade)
def add_blade_to_output(input, node, cluster, blade):
''' Adds a blade to the managed output dictionary'''
if node not in output:
new_node = {}
new_node[node] = {'IP': input[node]['IP'], 'clusters': {}}
output.update(new_node)
new_node = {}
new_node.update(output)
if cluster not in output[node]['clusters']:
new_node[node]['clusters'][cluster] = {}
new_node[node]['clusters'][cluster]['blades'] = {blade: input[node]['clusters'][cluster]['blades'][blade]}
else:
new_node[node]['clusters'][cluster]['blades'][blade] = input[node]['clusters'][cluster]['blades'][blade]
output.update(new_node)
if __name__ == "__main__":
# Create lock to ensure correct handling of output from multiple processes
lock = Lock()
# Create array to track our processes
procs = []
# Iterate over all nodes in input
for node in input:
p = Process(target = iterate_over_clusters_in_node, args = (input, node, lock))
p.start()
procs.append(p)
# Join processes and wait for them to stop
for p in procs:
p.join()
print "The final output is:"
print output
根據python文檔,
對dict和list代理中的可變值或可變項的修改不會通過管理器傳播,因為代理無法知道何時修改其值或可變項。 要修改此類項目,可以將修改后的對象重新分配給容器代理。
有了這些信息,我們可以相應地更新管理器:
#output[node] = {}
#output[node]['IP'] = input[node]['IP']
#output[node]['clusters'] = {} These changes are not propagated through the manager
new_node = {}
new_node[node] = {'IP': input[node]['IP'], 'clusters': {}}
output.update(new_node)
#if cluster not in output[node]['clusters']:
# Raises KeyError: 'clusters'
#output[node]['clusters'][cluster] = {}
#output[node]['clusters'][cluster]['blades'] = {}
#output[node]['clusters'][cluster]['blades'][blade] = input[node]['clusters'][cluster]['blades'][blade]
node_copy = output.copy()
if cluster not in node_copy[node]['clusters']:
node_copy[node]['clusters'].setdefault(cluster, {'blades': {}})
node_copy[node]['clusters'][cluster]['blades'][blade] = input[node]['clusters'][cluster]['blades'][blade]
output.update(node_copy)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.