Cython没有性能随着prange / parallel而增加

Question

I'm using Cython version 0.27.3 to compile the following source for a simple primality testing module that contains both python and cython implementations of the same algorithm. 我正在使用Cython版本0.27.3来编译以下源代码，以便创建一个包含同一算法的python和cython实现的简单素性测试模块。 When I set the threads parameter to different values, I see no performance increase, despite the GIL being released. 当我将threads参数设置为不同的值时，尽管GIL被释放，但我看不到性能提升。 Is there something that's preventing this from running in parallel? 有什么东西阻止它并行运行吗？

The function in question is the cdef void _getprimes which accepts a memoryview slice as a parameter and should set all non-prime values to 0 in that slice. 有问题的函数是cdef void _getprimes ，它接受一个内存视图切片作为参数，并应在该切片中将所有非素数值设置为0。

primes.pyx primes.pyx

#cython: boundscheck=False, wraparound=False, nonecheck=False
cimport cython
from cpython cimport array
from cython.parallel cimport parallel, prange
from libc.math cimport sqrt, ceil
from libc.stdlib cimport malloc, free
from libc.stdio cimport printf
import math

# =====================
# Python implementation
# =====================

def pyisprime(n):
    """Python implementation"""
    if n < 2 or n & 1 == 0:
        if n == 2:
            return True
        return False
    for i in range(2, int(math.sqrt(n)) + 1):
        if n % i == 0:
            return False
    return True

def pygetprimes(nums):
    return [num for num in nums if pyisprime(num)]


# =====================
# Cython implementation
# =====================
cdef int _isprime(unsigned long long n) nogil:
    """Cython implementation of a simple primality check"""
    cdef unsigned long long upper 
    cdef unsigned long long i = 3
    cdef int prime = 1
    if n < 2 or n & 1 == 0:
        if n == 2:
            return 1
        return 0
    upper = <unsigned long long>ceil(sqrt(<double>n))
    while i <= upper:
        if n % i == 0:
            prime = 0
            break
        i += 1
    return prime

def isprime(unsigned long long n):
    """Wrapper for _isprime"""
    cdef int result
    with nogil:
        result = _isprime(n)
    return result

cdef void _getprimes(unsigned long long[:] nums, int threads) nogil:
    cdef unsigned long num
    cdef int i = 0
    with parallel(num_threads=threads):
        for i in prange(nums.shape[0], schedule="dynamic"):
            if _isprime(nums[i]) == 0:
                nums[i] = 0

def getprimes(nums, int threads = 1):
    """Wrapper for _getprimes"""
    cdef unsigned long long num
    cdef unsigned long long[:] primes = array.array("Q", nums)

    with nogil:
        _getprimes(primes, threads)

    return [num for num in primes if num != 0]

setup.py setup.py

#!/usr/bin/env python3
from distutils.core import setup
from Cython.Build import cythonize

setup(
    name="primes",
    ext_modules=cythonize('primes.pyx'),
)

test.py test.py

#!/usr/bin/env python3
import functools
import random
import time
import primes

def timed(func):
    def wrapped(*args, **kwargs):
        start = time.time()
        val = func(*args, **kwargs)
        end = time.time()
        print(func.__name__, end - start)
        return val
    return functools.wraps(func)(wrapped)


def main():
    nums = [random.randint(0, 0xffffff) for _ in range(500000)]

    pyfoo = timed(primes.pygetprimes)
    cyfoo = timed(primes.getprimes)

    x = pyfoo(nums)
    y = cyfoo(nums, 1)
    z = cyfoo(nums, 4)
    assert x == y == z

if __name__ == "__main__":
    main()

When I run cyfoo , I expected that increasing the number of threads from 1 to 4 would show some type of speed increase, but this is not the case: 当我运行cyfoo ，我预计将线程数从1增加到4将显示某种类型的速度增加，但事实并非如此：

[aarcher@Arch]: ~/Programming/Cython/build/lib.linux-x86_64-3.6>$ ./test.py 
pygetprimes 5.11554741859436
getprimes 1.1129701137542725
getprimes 1.1306445598602295

Answer 1

It seems you need to enable compiler flags for OpenMP for the parallel statements to actually do anything. 您似乎需要为OpenMP启用编译器标志，以便并行语句实际执行任何操作。

See cython docs here http://cython.readthedocs.io/en/latest/src/userguide/parallelism.html#compiling 请参阅此处的cython文档http://cython.readthedocs.io/en/latest/src/userguide/parallelism.html#compiling

# setup.py
# ... omitted ...

ext_modules = [
    Extension(
        "hello",
        ["hello.pyx"],
        extra_compile_args=['-fopenmp'],
        extra_link_args=['-fopenmp'],
    )
]

Cython没有性能随着prange / parallel而增加

问题描述

1 个解决方案

解决方案1
7 已采纳 2017-11-14 19:05:20

Cython没有性能随着prange / parallel而增加

问题描述

1 个解决方案

解决方案1 7 已采纳 2017-11-14 19:05:20

解决方案1
7 已采纳 2017-11-14 19:05:20