Cython 0.2: prange slowing down code unexpectedly

Question

Consider optimized cython code in two cases:

    for j in xrange(8):
        for x in xrange(1, 600):
            tmp[j] = 0.0
            for y in xrange(1, 800):
                tmp[j] += mag[j, x - 1, y - 1]
                hgi_out[j, x, y] = tmp[j] + hgi_out[j, x - 1, y]

and:

    for j in prange(8):  # < prange used for parallelization with openmp
        for x in xrange(1, 600):
            tmp[j] = 0.0
            for y in xrange(1, 800):
                tmp[j] += mag[j, x - 1, y - 1]
                hgi_out[j, x, y] = tmp[j] + hgi_out[j, x - 1, y]

In both cases, the code is within a native function with nogil declared and numpy arrays with memory views and optimized layout. The benchamrked runtimes for the first case is 14.97 msecs while for the second its 26.64 aproximately doubling!!

I have other functions where the use of prange greatly improves performance on my multi-core machine except for the above cases, I don't understand what is happening.

Any ideas about why prange is slowing down the code?

FWIW, here's the complete original code:

# cython: boundscheck=False
# cython: wraparound=False
# cython: nonecheck=False
# cython: overflowcheck.fold=True
# cython: embedsignature=False
# cython: cdivision=True
# cython: cdivision_warnings=False
# cython: always_allow_keywords=False
# cython: profile=False
# cython: linetrace=False
# cython: infer_types=False
# cython: language_level=2
# cython: c_string_type=unicode
# cython: c_string_encoding=utf-8
# cython: type_version_tag=True
# cython: unraisable_tracebacks=True
from __future__ import division
import numpy as np
cimport numpy as np
cimport cython
from cython.parallel import prange

DTYPE = np.int
ctypedef np.int_t DTYPE_t
UITYPE = np.uint
ctypedef np.uint_t UITYPE_t
U8TYPE = np.uint8
ctypedef np.uint8_t U8TYPE_t 
F32TYPE = np.float32
ctypedef np.float32_t F32TYPE_t
F64TYPE = np.float64
ctypedef np.float64_t F64TYPE_t
ctypedef Py_ssize_t DSIZE_t

cdef void native_hog_integral_b(F64TYPE_t [:, :, ::1] mag,
                                F64TYPE_t [:, :, ::1] hgi_out) nogil:
    cdef DSIZE_t m, n, x, y, j, dims = mag.shape[0]
    cdef F64TYPE_t [32] tmp
    cdef F64TYPE_t val = 0
    m, n = mag.shape[1] + 1, mag.shape[2] + 1
    for j in prange(dims):
        for x in xrange(1, m):
            tmp[j] = 0.0
            for y in xrange(1, n):
                tmp[j] += mag[j, x - 1, y - 1]
                hgi_out[j, x, y] = tmp[j] + hgi_out[j, x - 1, y]

def hog_integral_b(mag, hgi_out=None, orientations=8):
    if hgi_out is None:
        hgi_out = np.zeros((orientations + 1, mag.shape[0] + 1, mag.shape[1] + 1), dtype=F64TYPE)
    native_hog_integral_b(mag, hgi_out)
    return hgi_out

To test the above code, try:

mg2 = np.random.rand(9, 600, 800).astype(F64TYPE)
hg2 = np.zeros((9, mg2.shape[1] + 1, mg2.shape[2] + 1), dtype=F64TYPE)
print timeit(lambda:hog_integral_b(mg2, hgi_out=hg2), number=10)

UPDATE :

Alright, I looked closely at my setup.py compiler options:

from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
import numpy as np

ext_modules = [Extension("hog_cy", ["hog_cy.pyx"],
                         #extra_compile_args = ["-O3", "-fopenmp", "-fno-strict-aliasing"],
                         extra_compile_args = ["-O3", "-fopenmp"],
                         extra_link_args=["-fopenmp"]
                        )]

setup (
    name = 'performance test app',
    cmdclass = {'build_ext': build_ext},
    include_dirs = [np.get_include()],
    ext_modules = ext_modules,
)

the option -fno-strict-aliasing seems to be making the problem, once switched off, I get no speedup but no loss as well.

Answer 1

You are producing a GIL battle because prange is not inside a nogil block. There is no concurrency in your code, only multiple threads competing for GIL ownership:

cimport cython
from cython.parallel cimport prange, parallel

cdef void native_hog_integral_b(F64TYPE_t [:, :, ::1] mag, 
                                F64TYPE_t [:, :, ::1] hgi_out):

    cdef DSIZE_t m, n, j, dims = mag.shape[0]
    cdef F64TYPE_t val = 0
    m, n = mag.shape[1] + 1, mag.shape[2] + 1
    with nogil, parallel():
        cdef DSIZE_t x, y
        cdef F64TYPE_t tmp
        for j in prange(dims):
            for x in range(1, m):
                tmp = 0.0
                for y in range(1, n):
                    tmp += mag[j, x - 1, y - 1]
                    hgi_out[j, x, y] = tmp + hgi_out[j, x - 1, y]

Answer 2

cimport cython
from cython.parallel cimport prange, parallel

cdef void native_hog_integral_b(F64TYPE_t [:, :, ::1] mag, 
                                F64TYPE_t [:, :, ::1] hgi_out):

    cdef DSIZE_t m, n, j, dims = mag.shape[0]
    cdef F64TYPE_t val = 0
    m, n = mag.shape[1] + 1, mag.shape[2] + 1
    with nogil, parallel():
        cdef DSIZE_t x, y
        cdef F64TYPE_t tmp
        for j in prange(dims):
            for x in range(1, m):
                tmp = 0.0
                for y in range(1, n):
                    tmp += mag[j, x - 1, y - 1]
                    hgi_out[j, x, y] = tmp + hgi_out[j, x - 1, y]

Cython 0.2: prange slowing down code unexpectedly

Question

2 answers

solution1
2 2014-01-27 07:57:53

solution2
0 2014-01-28 00:32:43

Cython 0.2: prange slowing down code unexpectedly

Question

2 answers

solution1 2 2014-01-27 07:57:53

solution2 0 2014-01-28 00:32:43

solution1
2 2014-01-27 07:57:53

solution2
0 2014-01-28 00:32:43