[英]Multiprocessing Pool hangs if child process killed
我啟動了一個工作進程池並提交了一堆任務。 系統的記憶體(memory)不足,oomkiller 殺死了其中一個工作進程。 父進程只是掛在那裡等待任務完成,一直沒有返回。
這是一個重現問題的可運行示例。 我沒有等待 oomkiller 殺死一個工作進程,而是找到所有工作進程的進程 ID,並讓第一個任務殺死其中一個進程。 (對 ps 的調用不適用於所有操作系統。)
import os
import signal
from multiprocessing import Pool
from random import choice
from subprocess import run, PIPE
from time import sleep
def run_task(task):
    """Simulate one work item; item 0 additionally kills another process.

    Args:
        task: A ``(target_process_id, n)`` pair. ``n`` is the item number
            and ``target_process_id`` is the process that item 0 sends
            SIGKILL to.

    Returns:
        The pair ``(n, delay)``, where ``delay`` is the ``n + 1`` seconds
        that were slept.
    """
    target_process_id, n = task
    print(f'Processing item {n} in process {os.getpid()}.')
    delay = n + 1
    sleep(delay)
    if n == 0:
        # Stand-in for the OOM killer: forcibly kill the chosen process.
        print(f'Item {n} killing process {target_process_id}.')
        os.kill(target_process_id, signal.SIGKILL)
    else:
        print(f'Item {n} finished.')
    return n, delay
def main():
    """Reproduce the hang of ``multiprocessing.Pool`` when a worker is killed.

    Starts a pool, looks up the pool's child process IDs with ``ps``, picks
    one of them as the kill target and submits ten tasks. According to the
    accompanying write-up, the result of the task that was running in the
    killed worker never arrives, so the result loop waits forever.

    Raises:
        subprocess.CalledProcessError: If the ``ps`` call fails (its options
            are not supported on every operating system).
    """
    print('Starting.')
    # Deliberately not a ``with`` block: the demo keeps the explicit
    # close()/join() sequence of the original script.
    pool = Pool()
    # check=True turns an unsupported/failed ``ps`` call into a clear error
    # instead of an empty PID list and a confusing IndexError from choice().
    ps_output = run(
        ['ps', '-opid', '--no-headers', '--ppid', str(os.getpid())],
        stdout=PIPE,
        encoding='utf8',
        check=True,
    )
    child_process_ids = [int(line) for line in ps_output.stdout.splitlines()]
    # NOTE(review): the first and last PIDs are skipped -- presumably the
    # first worker (likely to run item 0 itself) and the ``ps`` process;
    # confirm against the actual ``ps`` output.
    target_process_id = choice(child_process_ids[1:-1])
    tasks = ((target_process_id, i) for i in range(10))
    for n, delay in pool.imap_unordered(run_task, tasks):
        print(f'Received {delay} from item {n}.')
    print('Closing.')
    pool.close()
    pool.join()
    print('Done.')
if __name__ == '__main__':
    # Run the demo only when this file is executed as a script.
    main()
當我在具有八個 CPU 的系統上運行它時,我看到了這個 output:
Starting.
Processing item 0 in process 303.
Processing item 1 in process 304.
Processing item 2 in process 305.
Processing item 3 in process 306.
Processing item 4 in process 307.
Processing item 5 in process 308.
Processing item 6 in process 309.
Processing item 7 in process 310.
Item 0 killing process 308.
Processing item 8 in process 303.
Received 1 from item 0.
Processing item 9 in process 315.
Item 1 finished.
Received 2 from item 1.
Item 2 finished.
Received 3 from item 2.
Item 3 finished.
Received 4 from item 3.
Item 4 finished.
Received 5 from item 4.
Item 6 finished.
Received 7 from item 6.
Item 7 finished.
Received 8 from item 7.
Item 8 finished.
Received 9 from item 8.
Item 9 finished.
Received 10 from item 9.
您可以看到項目 5 永遠不會返回,並且池只會永遠等待。
當子進程被殺死時,如何讓父進程注意到?
這個問題在 Python 錯誤 9205 中有描述,但他們決定在 concurrent.futures 模塊中而不是在 multiprocessing 模塊中修復它。 為了利用該修復程序,請切換到較新的進程池。
import os
import signal
from concurrent.futures.process import ProcessPoolExecutor
from random import choice
from subprocess import run, PIPE
from time import sleep
def run_task(task):
    """Process one item by sleeping; item 0 also kills the target process.

    Args:
        task: A ``(target_process_id, n)`` pair. ``n`` is the item number
            and ``target_process_id`` is the process that item 0 sends
            SIGKILL to.

    Returns:
        The pair ``(n, delay)``, where ``delay`` is the ``n + 1`` seconds
        that were slept.
    """
    target_process_id, n = task
    print(f'Processing item {n} in process {os.getpid()}.')
    delay = n + 1
    sleep(delay)
    if n == 0:
        # Stand-in for the OOM killer: forcibly kill the chosen process.
        print(f'Item {n} killing process {target_process_id}.')
        os.kill(target_process_id, signal.SIGKILL)
    else:
        print(f'Item {n} finished.')
    return n, delay
def main():
    """Run the kill-a-worker demo on a ``ProcessPoolExecutor``.

    Starts an executor, looks up its child process IDs with ``ps``, picks
    one of them as the kill target and maps ten tasks over the pool.
    According to the accompanying write-up, iterating the results raises
    ``BrokenProcessPool`` once the worker has been killed, instead of
    hanging.

    Raises:
        subprocess.CalledProcessError: If the ``ps`` call fails (its options
            are not supported on every operating system).
    """
    print('Starting.')
    pool = ProcessPoolExecutor()
    # The returned future is intentionally discarded; the call only exists
    # to make the executor start child processes before ``ps`` runs.
    # NOTE(review): newer Python versions may spawn workers on demand, in
    # which case only one child may exist here -- confirm on the target
    # interpreter.
    pool.submit(lambda: None)  # Force the pool to launch some child processes.
    # check=True turns an unsupported/failed ``ps`` call into a clear error
    # instead of an empty PID list and a confusing IndexError from choice().
    ps_output = run(
        ['ps', '-opid', '--no-headers', '--ppid', str(os.getpid())],
        stdout=PIPE,
        encoding='utf8',
        check=True,
    )
    child_process_ids = [int(line) for line in ps_output.stdout.splitlines()]
    # NOTE(review): the first and last PIDs are skipped -- presumably the
    # first worker (likely to run item 0 itself) and the ``ps`` process;
    # confirm against the actual ``ps`` output.
    target_process_id = choice(child_process_ids[1:-1])
    tasks = ((target_process_id, i) for i in range(10))
    for n, delay in pool.map(run_task, tasks):
        print(f'Received {delay} from item {n}.')
    print('Closing.')
    pool.shutdown()
    print('Done.')
if __name__ == '__main__':
    # Run the demo only when this file is executed as a script.
    main()
現在,當您運行它時,您會收到一條清晰的錯誤消息。
Starting.
Processing item 0 in process 549.
Processing item 1 in process 550.
Processing item 2 in process 552.
Processing item 3 in process 551.
Processing item 4 in process 553.
Processing item 5 in process 554.
Processing item 6 in process 555.
Processing item 7 in process 556.
Item 0 killing process 556.
Processing item 8 in process 549.
Received 1 from item 0.
Traceback (most recent call last):
File "/home/don/.config/JetBrains/PyCharm2020.1/scratches/scratch2.py", line 42, in <module>
main()
File "/home/don/.config/JetBrains/PyCharm2020.1/scratches/scratch2.py", line 33, in main
for n, delay in pool.map(run_task, tasks):
File "/usr/lib/python3.7/concurrent/futures/process.py", line 483, in _chain_from_iterable_of_lists
for element in iterable:
File "/usr/lib/python3.7/concurrent/futures/_base.py", line 598, in result_iterator
yield fs.pop().result()
File "/usr/lib/python3.7/concurrent/futures/_base.py", line 428, in result
return self.__get_result()
File "/usr/lib/python3.7/concurrent/futures/_base.py", line 384, in __get_result
raise self._exception
concurrent.futures.process.BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.
我遇到了同樣的問題,而在處理這個問題上,concurrent.futures 也好不到哪裡去。 我最終使用了 Ray 模塊,以下是我的示例代碼,它會以逐步減少的工作進程數量重試被終止的任務。 這樣,最耗 memory 的任務在最壞的情況下有機會在單個工作進程上完成。 請小心運行它,因為 OOM 殺手也可能殺死其他進程:
import ray
import logging
from multiprocessing import cpu_count
import numpy as np
import psutil
# the default max_retries is 3, but in this case there is no point to retry with the same amount of workers
@ray.remote(max_retries=0)
def f(x):
    """Allocate a large float array to create memory pressure, then return x.

    Args:
        x: Job identifier; it is only logged and returned.

    Returns:
        The same ``x`` that was passed in.
    """
    logging.warning("worker started %s", x)
    # Clamp the divisor so hosts with three or fewer CPUs do not divide by
    # zero or by a negative number.
    divisor = max(1, cpu_count() - 3)
    # Element count chosen so each array of 8-byte floats takes roughly
    # total_memory / divisor bytes.
    allocate = int(psutil.virtual_memory().total / divisor / 8)
    logging.warning("worker allocate %s element float array for %s", allocate, x)
    crash = np.ones([allocate])
    # make sure the interpreter won't optimize out the above allocation
    # (%s rather than %x: the job id is logged in decimal like everywhere else)
    logging.warning("worker print %s for %s", crash[0], x)
    logging.warning("worker finished %s", x)
    return x
def main():
    """Run all jobs on Ray, retrying crashed ones with fewer workers.

    Each round starts Ray with ``processes`` CPUs, submits every outstanding
    job and collects results. Jobs whose worker crashed are resubmitted in
    the next round with one worker fewer, down to a single worker.

    Raises:
        Exception: Any exception other than ``WorkerCrashedError`` is logged
            and re-raised.
    """
    # At least one worker, so a single-CPU host does not start Ray with
    # zero CPUs.
    processes = max(1, cpu_count() - 1)
    alljobs = range(processes + 1)
    completedjobs = []
    try:
        while alljobs:
            logging.warning("Number of jobs: %s", len(alljobs))
            logging.warning("Number of workers: %s", processes)
            ray.init(num_cpus=processes)
            result_ids = [f.remote(i) for i in alljobs]
            while True:
                try:
                    while result_ids:
                        # Take one finished task at a time; the rest stay
                        # in result_ids for the next iteration.
                        done_id, result_ids = ray.wait(result_ids, num_returns=1)
                        x = ray.get(done_id[0])
                        logging.warning("results from %s", x)
                        completedjobs.append(x)
                except ray.exceptions.WorkerCrashedError:
                    # The crashed task was already removed from result_ids
                    # by ray.wait; keep draining the remaining ones.
                    logging.warning("Continue after WorkerCrashedError")
                    continue
                break
            # rerun the killed jobs on fewer workers to relieve memory pressure
            alljobs = list(set(alljobs) - set(completedjobs))
            ray.shutdown()
            if processes > 1:
                processes -= 1
            else:
                # Already down to one worker: stop retrying.
                break
    except Exception as ex:
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        logging.exception(message)
        raise
if __name__ == "__main__":
    # Run the demo only when this file is executed as a script.
    main()
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.