I use Python's multiprocessing.sharedctypes.RawArray to share large numpy arrays between multiple processes. I've noticed that when the array is large (> 1 or 2 GB) it becomes very slow to initialize and also much slower to read/write (and read/write time is not predictable: sometimes fairly fast, sometimes very, very slow).
I've made a small sample script that uses just one process, initializes a shared array, writes to it several times, and measures the time taken by these operations.
import argparse
import ctypes
import multiprocessing as mp
import multiprocessing.sharedctypes as mpsc
import numpy as np
import time


def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-c', '--block-count', type=int, default=1,
                        help='Number of blocks to write')
    parser.add_argument('-w', '--block-width', type=int, default=20000,
                        help='Block width')
    parser.add_argument('-d', '--block-depth', type=int, default=15000,
                        help='Block depth')
    args = parser.parse_args()
    blocks = args.block_count
    blockwidth = args.block_width
    depth = args.block_depth

    start = time.perf_counter()
    shared_array = mpsc.RawArray(ctypes.c_uint16, blocks*blockwidth*depth)
    finish = time.perf_counter()
    print('Init shared array of size {:.2f} Gb: {:.2f} s'.format(
        blocks*blockwidth*depth*ctypes.sizeof(ctypes.c_uint16)/1024/1024/1024, (finish-start)))

    numpy_array = np.ctypeslib.as_array(shared_array).reshape(blocks*blockwidth, depth)

    start = time.perf_counter()
    for i in range(blocks):
        begin = time.perf_counter()
        numpy_array[i*blockwidth:(i+1)*blockwidth, :] = np.ones((blockwidth, depth), dtype=np.uint16)
        end = time.perf_counter()
        print('Write = %.2f s' % (end-begin))
    finish = time.perf_counter()
    print('Total time = %.2f s' % (finish-start))


if __name__ == '__main__':
    main()
When I run this code I get the following on my PC:
$ python shared-minimal.py -c 1
Init shared array of size 0.56 Gb: 0.36 s
Write = 0.13 s
Total time = 0.13 s
$ python shared-minimal.py -c 2
Init shared array of size 1.12 Gb: 0.72 s
Write = 0.12 s
Write = 0.13 s
Total time = 0.25 s
$ python shared-minimal.py -c 4
Init shared array of size 2.24 Gb: 5.40 s
Write = 1.17 s
Write = 1.17 s
Write = 1.17 s
Write = 1.57 s
Total time = 5.08 s
In the last case, when the array size exceeds 2 GB, initialization time no longer scales linearly with array size, and assigning same-size slices to the array is more than 5 times slower.
I wonder why that happens. I'm running the script on Ubuntu 16.04 using Python 3.5. Using iotop, I also noticed disk write activity of the same size as the shared array while initializing and writing to it, but I'm not sure whether a real file is created or it's an in-memory-only operation (I suppose it should be). In general my system also becomes less responsive when the shared array is large. There is no swapping, as checked with top, ipcs -mu, and vmstat.
After more research I've found that Python actually creates folders in /tmp whose names start with pymp-, and though no files are visible within them using file viewers, it looks exactly like /tmp is used by Python for shared memory. Performance seems to degrade when file caches are flushed.
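You can confirm this from Python itself; a small sketch (note that get_temp_dir is an internal, undocumented helper of multiprocessing, so this is for inspection only):

import tempfile
import multiprocessing.util as mp_util

# General temp location used by Python; respects the TMPDIR env var.
print(tempfile.gettempdir())
# The pymp-XXXX directory multiprocessing uses to back its shared arenas.
print(mp_util.get_temp_dir())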
The working solution in the end was to mount /tmp as tmpfs:

sudo mount -t tmpfs tmpfs /tmp

And, when using recent Docker, by passing the --tmpfs /tmp argument to the docker run command.
After doing this, read/write operations are done in RAM, and performance is fast and stable.
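To make that mount persist across reboots, an /etc/fstab entry along these lines should work (a sketch, not from my original setup; tmpfs defaults to a size limit of half of RAM, tunable with a size= option):

tmpfs  /tmp  tmpfs  defaults  0  0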
I still wonder why /tmp is used for shared memory rather than /dev/shm, which is already mounted as tmpfs and is supposed to be used for shared memory.
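One possible alternative to remounting /tmp (my own assumption, untested in the original setup): multiprocessing picks its temporary directory through the tempfile module, which honors the TMPDIR environment variable, so pointing that at /dev/shm should move the pymp- folders there without any remount:

TMPDIR=/dev/shm python shared-minimal.py -c 4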
Since Python 3.8 you can use multiprocessing.shared_memory, which seems to be much more efficient and plays nicely with numpy arrays. In some rough testing, creating an array with shape (5000, 5000) took circa 3 s with multiprocessing.Array and only circa 0.015 s with shared_memory.SharedMemory.
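A minimal sketch of that comparison (my own reconstruction, since the original test code isn't shown; exact timings will vary by machine):

import ctypes
import time
import multiprocessing as mp
from multiprocessing import shared_memory

n = 5000 * 5000  # number of bytes for a (5000, 5000) uint8 array

start = time.perf_counter()
a = mp.Array(ctypes.c_uint8, n)  # lock-protected ctypes array, zero-initialized
print('multiprocessing.Array: %.3f s' % (time.perf_counter() - start))

start = time.perf_counter()
shm = shared_memory.SharedMemory(create=True, size=n)
print('shared_memory.SharedMemory: %.5f s' % (time.perf_counter() - start))

shm.close()
shm.unlink()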
Below is a use case for SharedMemory where multiple processes each create an array for every item that comes in on the input queue, and a separate process reads the arrays back in the same order the items came in.
import os
import multiprocessing as mp
import numpy as np
import time
from multiprocessing import shared_memory


class FunctionTimer:
    def __init__(self, name):
        self.name = name

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, type, value, traceback):
        self.end = time.time()
        self.exec_time = self.end - self.start
        print(f"{self.name} time: {self.exec_time}")


class MpArrayProcessor:
    def __init__(self, in_queue, out_queue):
        self.in_queue = in_queue
        self.out_queue = out_queue
        self.stop_event = mp.Event()
        self.processes = []
        self.cur_id = 0
        self.order_dict = {}
        self.writable_dict = {}
        self.array_locks = {}
        self.array_data_dict = {}

    @staticmethod
    def wrap_func(func, arr_shape, in_queue, out_queue, stop_event, writable, shmem_name):
        pid = os.getpid()
        while True:
            if stop_event.is_set():
                print("Stopping")
                break
            x = in_queue.get(block=True)
            if x is None:
                break
            else:
                res = func(arr_shape, x)
                with FunctionTimer("Wait and write"):
                    # Wait until the consumer has read the previous result,
                    # then attach to this worker's shared block and overwrite it.
                    writable.wait()
                    shmem = shared_memory.SharedMemory(name=shmem_name, create=False)
                    c = np.ndarray(arr_shape, dtype=np.uint8, buffer=shmem.buf)
                    c[:] = res
                    writable.clear()
                out_queue.put((pid, shmem_name, x))

    def start(self, func, arr_shape, n_proc):
        # TODO implement proper closing of SharedMemory
        for i in range(n_proc):  # renamed from `p`, which shadowed the Process below
            writable = mp.Event()
            writable.set()
            shmem_name = f"ps_{i}"
            data = shared_memory.SharedMemory(create=True, size=arr_shape[0] * arr_shape[1], name=shmem_name)
            p = mp.Process(target=self.wrap_func,
                           args=(func, arr_shape, self.in_queue, self.out_queue,
                                 self.stop_event, writable, shmem_name))
            p.start()
            self.writable_dict[p.pid] = writable
            self.array_data_dict[p.pid] = data
            self.processes.append(p)

    def get(self):
        while True:
            if self.cur_id in self.order_dict:
                # The next result in order was already received earlier.
                pid, shmem_name, order = self.order_dict[self.cur_id]
                print(f"PID: {pid}, idx: {order}, dict_len: {len(self.order_dict)}")
                shmem = shared_memory.SharedMemory(name=shmem_name, create=False)
                result = np.copy(np.frombuffer(shmem.buf, dtype=np.uint8))
                self.writable_dict[pid].set()
                del self.order_dict[self.cur_id]
                self.cur_id += 1
                return result
            print(self.order_dict)
            pid, shmem_name, order = self.out_queue.get(block=True)
            if order == self.cur_id:
                print(f"PID: {pid}, idx: {order}, dict_len: {len(self.order_dict)}")
                shmem = shared_memory.SharedMemory(name=shmem_name, create=False)
                print(np.frombuffer(shmem.buf, dtype=np.uint8))
                result = np.copy(np.frombuffer(shmem.buf, dtype=np.uint8))
                self.writable_dict[pid].set()
                self.cur_id += 1
                return result
            else:
                # Out-of-order result: remember it until its turn comes.
                self.order_dict[order] = (pid, shmem_name, order)

    def close(self):
        self.stop_event.set()
        print("Event set")
        for p in self.processes:
            self.array_data_dict[p.pid].close()
            self.array_data_dict[p.pid].unlink()
            p.join()
            print("Joined")
            p.close()
            print("Closed")


def create_data(shape, x):
    time.sleep(0.08)
    # np.random.randint(0, 255, shape, dtype=np.uint8)
    return np.ones(shape, dtype=np.uint8) * x


def fill_queue(queue, n_elements, n_processes):
    for i in range(n_elements):
        queue.put(i)
    # One sentinel per worker so every process exits its loop.
    for i in range(n_processes):
        queue.put(None)
    print("filling finished")


if __name__ == "__main__":
    print(f"Running: {__file__}")
    print(f"Script dir: {os.path.dirname(os.path.abspath(__file__))}")
    print(f"Working dir: {os.path.abspath(os.getcwd())}")

    n = 100
    n_proc = 4
    input_queue = mp.Queue()
    output_queue = mp.Queue(maxsize=50)
    # shape = (3, 3)
    # shape = (1280, 720)
    shape = (5000, 5000)

    in_proc = mp.Process(target=fill_queue, args=(input_queue, n, n_proc))
    in_proc.start()

    with FunctionTimer("MP processing"):
        arr_processor = MpArrayProcessor(input_queue, output_queue)
        arr_processor.start(create_data, shape, 4)

        results = []
        for i in range(n):
            print(f"Getting: {i}")
            r = arr_processor.get()[:shape[0] * shape[1]].reshape(shape)
            results.append(r)

        arr_processor.close()

    in_proc.join()
    in_proc.close()
    print(results)

    with FunctionTimer("Normal"):
        for i in range(n):
            a = create_data(shape, i)
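For reference, the core attach-by-name pattern the class above relies on boils down to this (a stripped-down sketch, independent of the example; the name 'demo_block' is arbitrary):

import numpy as np
from multiprocessing import shared_memory

# Producer side: create a named block and write to it through a numpy view.
shm = shared_memory.SharedMemory(create=True, size=16, name='demo_block')
a = np.ndarray((4, 4), dtype=np.uint8, buffer=shm.buf)
a[:] = 7

# Consumer side (typically another process): attach by name and read.
shm2 = shared_memory.SharedMemory(name='demo_block', create=False)
b = np.ndarray((4, 4), dtype=np.uint8, buffer=shm2.buf)
print(b.sum())  # 112

shm2.close()
shm.close()
shm.unlink()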
You can try np.frombuffer, which was much faster in my test. Just replace the following line:

numpy_array = np.ctypeslib.as_array(shared_array).reshape(blocks*blockwidth, depth)

with:

numpy_array = np.frombuffer(shared_array, dtype=np.uint16).reshape(blocks*blockwidth, depth)
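If you want to see the difference on your own machine, a quick timing sketch (mine, reusing the question's default sizes):

import ctypes
import time
import multiprocessing.sharedctypes as mpsc
import numpy as np

shared_array = mpsc.RawArray(ctypes.c_uint16, 20000 * 15000)

start = time.perf_counter()
a = np.ctypeslib.as_array(shared_array)
print('np.ctypeslib.as_array: %.4f s' % (time.perf_counter() - start))

start = time.perf_counter()
b = np.frombuffer(shared_array, dtype=np.uint16)
print('np.frombuffer: %.4f s' % (time.perf_counter() - start))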