Why does a Dask array throw a MemoryError on a dot product calculation when NumPy doesn't?

I am comparing the computation speed of Dask and NumPy for different data sizes. I understand that Dask can perform computations on data in parallel, and that it splits the data into chunks so that the data can be larger than RAM. When running the Dask code below, I get a memory error (shown at the bottom) with a 42000-by-42000 square array.

# Time a large matrix product with Dask on a chunked random square array.
import time

import dask.array as da  # the array API (da.random, .dot) lives in dask.array, not in top-level dask

size = 42000
# Split the array into an 8 x 8 grid of square chunks. Chunk sizes must be
# integers, so use floor division rather than true division.
y = da.random.random(size=(size, size), chunks=(size // 8, size // 8))
start = time.time()
y = y.dot(y * 2)  # arbitrary dot product calculation (lazy: only builds the task graph)
y.compute()       # actually execute the graph so the real work is what gets timed
end = time.time()
print(str(end - start) + " seconds")

However, I do not get any error when running similar code with Numpy.

# Time the same matrix product with plain NumPy on an in-memory array.
import time

import numpy as np

size = 42000
x = np.random.random(size=(size, size))

t_begin = time.time()
x = x.dot(x * 2)  # arbitrary dot product calculation
t_finish = time.time()

elapsed = t_finish - t_begin
print(str(elapsed) + " seconds")

I therefore do not understand why Dask throws a memory error when NumPy doesn't, especially since Dask should be able to partition the data. Is there an explanation for, or a solution to, this?

Edit: I have only had this problem with dot product. I have tested with mean without any problems.

MemoryError                               Traceback (most recent call last)
<ipython-input-3-a3af599b673a> in <module>()
      3 start = time.time()
      4 y = y.dot(y*2)
----> 5 y.compute()
      6 end = time.time()
      7 print(str(end-start) + " seconds")

~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(self, **kwargs)
    152         dask.base.compute
    153         """
--> 154         (result,) = compute(self, traverse=False, **kwargs)
    155         return result

~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs)
    405     keys = [x.__dask_keys__() for x in collections]
    406     postcomputes = [x.__dask_postcompute__() for x in collections]
--> 407     results = get(dsk, keys, **kwargs)
    408     return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])

~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\threaded.py in get(dsk, result, cache, num_workers, **kwargs)
     73     results = get_async(pool.apply_async, len(pool._pool), dsk, result,
     74                         cache=cache, get_id=_thread_get_id,
---> 75                         pack_exception=pack_exception, **kwargs)
     77     # Cleanup pools associated to dead threads

~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
    519                         _execute_task(task, data)  # Re-execute locally
    520                     else:
--> 521                         raise_exception(exc, tb)
    522                 res, worker_id = loads(res_info)
    523                 state['cache'][key] = res

~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\compatibility.py in reraise(exc, tb)
     65         if exc.__traceback__ is not tb:
     66             raise exc.with_traceback(tb)
---> 67         raise exc
     69 else:

~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
    288     try:
    289         task, data = loads(task_info)
--> 290         result = _execute_task(task, data)
    291         id = get_id()
    292         result = dumps((result, id))

~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\local.py in _execute_task(arg, cache, dsk)
    269         func, args = arg[0], arg[1:]
    270         args2 = [_execute_task(a, cache) for a in args]
--> 271         return func(*args2)
    272     elif not ishashable(arg):
    273         return arg

~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\compatibility.py in apply(func, args, kwargs)
     46     def apply(func, args, kwargs=None):
     47         if kwargs:
---> 48             return func(*args, **kwargs)
     49         else:
     50             return func(*args)

~\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\core\fromnumeric.py in sum(a, axis, dtype, out, keepdims)
   1880             return sum(axis=axis, dtype=dtype, out=out, **kwargs)
   1881     return _methods._sum(a, axis=axis, dtype=dtype,
-> 1882                          out=out, **kwargs)

~\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\core\_methods.py in _sum(a, axis, dtype, out, keepdims)
     31 def _sum(a, axis=None, dtype=None, out=None, keepdims=False):
---> 32     return umr_sum(a, axis, dtype, out, keepdims)
     34 def _prod(a, axis=None, dtype=None, out=None, keepdims=False):


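During the final stage, when Dask stitches the chunks together into a single result, it will probably need around 2x the memory of the output, because the individual output chunks and the assembled array exist in memory at the same time. (A 42000-by-42000 array of 8-byte floats is about 14 GB on its own.)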

Generally, you probably shouldn't use Dask if your computation fits in memory. NumPy with a modern BLAS implementation (OpenBLAS, MKL, ...) will probably perform better.

