
"Too many open files" error with multiprocessing

I have code that uses multiprocessing over about 10,000 files on a 12-core vCPU machine on Ubuntu.

import os
import multiprocessing

import nltk

def process_file(name):
    inp = open(name)
    out = open(name.split('.')[0] + 'wikiout.txt', 'a')
    for row in inp:
        row = row.strip()
        sent_text = nltk.sent_tokenize(row)

        for sent in sent_text:
            pass  # process sentence

    inp.close()
    out.close()

if __name__ == '__main__':
    processes = []
    for i in 'ABCDEF':
        for j in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
            for k in range(100):
                filename = os.path.join(os.path.dirname(__file__), (i + j + '/' + 'wiki_' + str(k) + '.txt'))

                p = multiprocessing.Process(target=process_file, args=(filename,))
                processes.append(p)
                p.start()

    for process in processes:
        process.join()

For some reason I get this error:

  File "wikirules.py", line 37, in <module>
    p.start()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 121, in start
    self._popen = self._Popen(self)
  File "/usr/lib/python3.8/multiprocessing/context.py", line 224, in _Popen
    return _default_context.get_context().Process._Popen(process_obj)
  File "/usr/lib/python3.8/multiprocessing/context.py", line 277, in _Popen
    return Popen(process_obj)
  File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 19, in __init__
    self._launch(process_obj)
  File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 69, in _launch
    child_r, parent_w = os.pipe()
OSError: [Errno 24] Too many open files
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
  File "wikirules.py", line 13, in process_file
  File "/usr/local/lib/python3.8/dist-packages/nltk/tokenize/__init__.py", line 106, in sent_tokenize
  File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 752, in load
  File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 877, in _open
  File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 327, in open
OSError: [Errno 24] Too many open files: '/root/nltk_data/tokenizers/punkt/PY3/english.pickle'

Any clue why this might be happening? I'm still new to multiprocessing. Shouldn't this open no more than 12 files at once?

Your code is trying to run

len('ABCDEF') * len('ABCD...Z') * len(range(100)) = 6 * 26 * 100 = 15,600

operating system processes simultaneously. Each child process costs the parent file descriptors for its communication pipes (the os.pipe() call in your traceback), on top of the input and output files every worker opens, so the per-process open-file limit is exhausted almost immediately.
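You can check the limit your script is running against; a minimal sketch using the standard resource module (the printed values are typical Ubuntu defaults, your system may differ):

import resource

# RLIMIT_NOFILE is the per-process cap on open file descriptors;
# hitting it is what makes os.pipe() fail with OSError 24 (EMFILE).
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print(soft, hard)  # e.g. 1024 / 1048576 -- far fewer than 15,600 processes need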

Actually, the multiprocessing module contains relatively low-level primitives for working with processes. For basic tasks the standard library suggests a safer and more convenient option: the concurrent.futures module, which contains pool implementations for both threads and processes and can be very useful, especially for "embarrassingly parallel" workloads.

Here is an example of how the code from your question could be transformed using concurrent.futures and some other Python features like generators, context managers and the pathlib module.

import concurrent.futures as futures
import itertools
import pathlib

import nltk

BASE_PATH = pathlib.Path(__file__).parent.absolute()

def filename_generator():
    """produce filenames sequence"""
    for i, j, k in itertools.product("ABCDEF", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", range(100)):
        yield BASE_PATH / f"{i}{j}/wiki_{k}.txt"

def worker(filename: pathlib.Path):
    """do all the job"""
    out_filename = filename.with_suffix('.wikiout.txt')
    with open(filename) as inp, open(out_filename, "a") as out:
        for row in inp:
            text = row.strip()
            sent_text = nltk.sent_tokenize(text)
            for sent in sent_text:
                """process sentence"""

def main():
    with futures.ProcessPoolExecutor() as pool:
        # mapping future->filename, useful in case of error
        task_to_filename = {pool.submit(worker, f): f for f in filename_generator()}
        for f in futures.as_completed(task_to_filename):
            try:
                f.result()
            except Exception as e:
                filename = task_to_filename[f]
                print(f"{filename} processing failed: {e}")

if __name__ == "__main__":
    main()
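If you do not need the per-file error reporting, the submit/as_completed bookkeeping can be replaced with the executor's map method. A minimal sketch, reusing worker and filename_generator from above; note that consuming the iterator re-raises the first worker exception and stops the loop:

def main():
    with futures.ProcessPoolExecutor() as pool:
        # The pool still caps concurrency at os.cpu_count() workers;
        # map() just hides the future bookkeeping, so only a handful
        # of files are open at any moment.
        for _ in pool.map(worker, filename_generator()):
            pass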
