I am working on comparing the calculation speed of Dask and Numpy for different data sizes. I understand that Dask can perform computations on data in parallel, and that it splits the data into chunks so that the data size can be larger than RAM. When using the Dask code below, I get a memory error (shown at the bottom) with a square array of size 42000.
# Time a large out-of-core matrix product with Dask arrays.
# Fix 1: the array API lives in `dask.array`; plain `import dask` does not
#        expose `random`, so the original import line could not have run.
import dask.array as da
import time

size = 42000
# Fix 2: chunk sizes must be integers; `size / 8` is float division in
#        Python 3, so use floor division instead.
y = da.random.random(size=(size, size), chunks=(size // 8, size // 8))
start = time.time()
y = y.dot(y * 2)  # arbitrary dot product calculation
# NOTE: .compute() materializes the full (size, size) float64 result
# (~14 GB) in RAM at once — this is where the MemoryError originates.
y.compute()
end = time.time()
print(str(end - start) + " seconds")
However, I do not get any error when running similar code with Numpy.
# NumPy baseline: time the same dot-product computation eagerly in memory.
import numpy as np
import time

size = 42000
matrix = np.random.random(size=(size, size))
start = time.time()
# arbitrary dot product calculation
matrix = matrix.dot(matrix * 2)
elapsed = time.time() - start
print(str(elapsed) + " seconds")
I, therefore, do not understand why Dask throws a memory error when Numpy doesn't, especially because Dask should be able to partition the data. Is there any explanation/solution to this?
Edit: I have only had this problem with dot product. I have tested with mean without any problems.
MemoryError Traceback (most recent call last)
<ipython-input-3-a3af599b673a> in <module>()
3 start = time.time()
4 y = y.dot(y*2)
----> 5 y.compute()
6 end = time.time()
7 print(str(end-start) + " seconds")
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(self, **kwargs)
152 dask.base.compute
153 """
--> 154 (result,) = compute(self, traverse=False, **kwargs)
155 return result
156
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs)
405 keys = [x.__dask_keys__() for x in collections]
406 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 407 results = get(dsk, keys, **kwargs)
408 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
409
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\threaded.py in get(dsk, result, cache, num_workers, **kwargs)
73 results = get_async(pool.apply_async, len(pool._pool), dsk, result,
74 cache=cache, get_id=_thread_get_id,
---> 75 pack_exception=pack_exception, **kwargs)
76
77 # Cleanup pools associated to dead threads
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
519 _execute_task(task, data) # Re-execute locally
520 else:
--> 521 raise_exception(exc, tb)
522 res, worker_id = loads(res_info)
523 state['cache'][key] = res
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\compatibility.py in reraise(exc, tb)
65 if exc.__traceback__ is not tb:
66 raise exc.with_traceback(tb)
---> 67 raise exc
68
69 else:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
288 try:
289 task, data = loads(task_info)
--> 290 result = _execute_task(task, data)
291 id = get_id()
292 result = dumps((result, id))
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\local.py in _execute_task(arg, cache, dsk)
269 func, args = arg[0], arg[1:]
270 args2 = [_execute_task(a, cache) for a in args]
--> 271 return func(*args2)
272 elif not ishashable(arg):
273 return arg
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\compatibility.py in apply(func, args, kwargs)
46 def apply(func, args, kwargs=None):
47 if kwargs:
---> 48 return func(*args, **kwargs)
49 else:
50 return func(*args)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\core\fromnumeric.py in sum(a, axis, dtype, out, keepdims)
1880 return sum(axis=axis, dtype=dtype, out=out, **kwargs)
1881 return _methods._sum(a, axis=axis, dtype=dtype,
-> 1882 out=out, **kwargs)
1883
1884
~\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\core\_methods.py in _sum(a, axis, dtype, out, keepdims)
30
31 def _sum(a, axis=None, dtype=None, out=None, keepdims=False):
---> 32 return umr_sum(a, axis, dtype, out, keepdims)
33
34 def _prod(a, axis=None, dtype=None, out=None, keepdims=False):
MemoryError:
During the final stage when Dask stitches things together it will probably need around 2x memory for the output.
Generally, you probably shouldn't use Dask if your computation fits in memory. NumPy with a modern BLAS implementation (OpenBLAS, MKL, ...) will probably perform better.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.