cuFFT 出錯后無法恢復

Question

在前一次執行失敗之後，我找不到讓 cuFFT 重新正常執行的方法。

以下是一個最小的範例。主要思路如下：我們建立一個簡單的 cuFFT 處理器，由它管理自己的資源（設備記憶體和 cuFFT 計劃）。我們先確認這個處理器能正常執行 FFT；接著要求它建立過多的計劃，藉此故意觸發 cuFFT 錯誤；然後釋放所有資源，並嘗試重新執行一次原本可以成功的處理。然而，處理器在失敗之後就再也無法正常運作。

首先，這是一個相當長的序言：

#include <iostream>
using std::cout;
using std::cerr;
using std::endl;

#include <vector>
using std::vector;

#include "cuda_runtime.h"
#include "cufft.h"

// cuFFT API errors
static char* _cufftGetErrorEnum( cufftResult_t error )
{
    switch ( error )
    {
        case CUFFT_SUCCESS:
        return "CUFFT_SUCCESS";

        case CUFFT_INVALID_PLAN:
        return "cuFFT was passed an invalid plan handle";

        case CUFFT_ALLOC_FAILED:
        return "cuFFT failed to allocate GPU or CPU memory";

        // No longer used
        case CUFFT_INVALID_TYPE:
        return "CUFFT_INVALID_TYPE";

        case CUFFT_INVALID_VALUE:
        return "User specified an invalid pointer or parameter";

        case CUFFT_INTERNAL_ERROR:
        return "Driver or internal cuFFT library error";

        case CUFFT_EXEC_FAILED:
        return "Failed to execute an FFT on the GPU";

        case CUFFT_SETUP_FAILED:
        return "The cuFFT library failed to initialize";

        case CUFFT_INVALID_SIZE:
        return "User specified an invalid transform size";

        // No longer used
        case CUFFT_UNALIGNED_DATA:
        return "CUFFT_UNALIGNED_DATA";

        case CUFFT_INCOMPLETE_PARAMETER_LIST:
        return "Missing parameters in call";

        case CUFFT_INVALID_DEVICE:
        return "Execution of a plan was on different GPU than plan creation";

        case CUFFT_PARSE_ERROR:
        return "Internal plan database error";

        case CUFFT_NO_WORKSPACE:
        return "No workspace has been provided prior to plan execution";

        case CUFFT_NOT_IMPLEMENTED:
        return "CUFFT_NOT_IMPLEMENTED";

        case CUFFT_LICENSE_ERROR:
        return "CUFFT_LICENSE_ERROR";
    }

    return "<unknown>";
}

// check cuda runtime calls
//
// err: status code returned by the CUDA runtime call being checked.
// Returns true when err is cudaSuccess. Otherwise writes the message for
// err to stderr and returns false.
bool cudaCheck( cudaError_t err )
{
    if ( err == cudaSuccess )
        return true;

    // Kept from the original: wait for outstanding device work, then read
    // the runtime's last-error value. The value is deliberately discarded;
    // per the CUDA runtime documentation this call also resets a
    // non-sticky error back to cudaSuccess.
    cudaDeviceSynchronize();
    (void)cudaGetLastError();

    // Report the code that was actually passed in. The last-error value can
    // differ from it (or be cudaSuccess), which made the printed message
    // unreliable.
    cerr << cudaGetErrorString( err ) << endl;

    return false;
}

// check cuFFT calls
//
// err: status code returned by a cuFFT call.
// Returns true for CUFFT_SUCCESS. For any other status the corresponding
// message is written to stderr and false is returned.
bool cufftCheck( cufftResult_t err )
{
    const bool ok = ( err == CUFFT_SUCCESS );

    if ( !ok )
        cerr << _cufftGetErrorEnum( err ) << endl;

    return ok;
}

接下來，我們定義一個簡單的 cuFFT 處理器，它可以管理自己的資源（設備記憶體和 cuFFT 計劃）：

// Minimal cuFFT processor that owns a device buffer and a list of cuFFT
// plans, and releases both when it is destroyed.
class CCuFFT_Processor
{
    vector<cufftHandle> _plans;      // plans created by Alloc()
    cufftComplex       *_data;       // device buffer, NULL when not allocated
    size_t              _data_bytes; // size in bytes recorded for _data

    // Release resources
    bool ReleaseAll();
    bool ReleaseMemory();
    bool ReleasePlans();

public:

    CCuFFT_Processor() :
    _data( NULL ),
    _data_bytes( 0 )
    {
        // A freshly constructed vector is already empty; only reserve
        // room for the plan handles.
        _plans.reserve( 32 );
    }

    // The object owns raw handles that the destructor releases, so a copy
    // would release the same buffer and plans twice. Copying is disabled.
    CCuFFT_Processor( const CCuFFT_Processor & ) = delete;
    CCuFFT_Processor & operator=( const CCuFFT_Processor & ) = delete;

    ~CCuFFT_Processor()
    {
        ReleaseAll();
    }

    // Execute a forward, in-place C2C transform on the device buffer.
    // Returns false on failure.
    bool Run();

    // Prepare the device buffer and the plans for batch_len transforms of
    // data_len points each. Returns false on failure.
    bool Alloc( size_t data_len, size_t batch_len );
};

下面是我們將如何釋放資源：

// Free the device buffer, if one is held, and reset the bookkeeping fields.
// Returns true when there was nothing to free or cudaFree succeeded, and
// false when cudaFree reported an error. The fields are reset either way.
bool CCuFFT_Processor::ReleaseMemory()
{
    if ( _data == NULL )
        return true;

    const bool freed = cudaCheck( cudaFree( _data ) );

    _data       = NULL;
    _data_bytes = 0;

    return freed;
}

// Destroy every plan held in _plans and empty the list.
// Returns true only if every cufftDestroy call succeeded.
bool CCuFFT_Processor::ReleasePlans()
{
    bool chk = true;

    for ( auto & p : _plans )
    {
        // Destroy first, then fold the result in. Writing
        // `chk = chk && cufftCheck( cufftDestroy( p ) )` short-circuits
        // after the first failure, so the remaining plans would never be
        // destroyed even though their handles are dropped below.
        const bool destroyed = cufftCheck( cufftDestroy( p ) );
        chk = chk && destroyed;
    }

    _plans.clear();

    return chk;
}

// Release everything this object holds and reset the device.
//
// Steps, in order: synchronize the device, free the device buffer, destroy
// the plans, reset the device. Every step runs even when an earlier one
// fails. Chaining them with `chk = chk && ...` would skip the remaining
// cleanup after the first failure, which is exactly when cleanup matters.
//
// Returns true only if all four steps succeeded.
//
// NOTE(review): cudaDeviceReset() acts on the whole device for this
// process, not only on this object's resources - confirm that is intended
// if other CUDA users share the process.
bool CCuFFT_Processor::ReleaseAll()
{
    const bool synced = cudaCheck( cudaDeviceSynchronize() );
    const bool memory = ReleaseMemory();
    const bool plans  = ReleasePlans();
    const bool reset  = cudaCheck( cudaDeviceReset() );

    return synced && memory && plans && reset;
}

下面是主要功能的實現：

// Prepare the device buffer and the cuFFT plans.
//
// data_len:  number of complex points per transform.
// batch_len: largest number of transforms to process in one call.
//
// The buffer is sized for data_len * batch_len complex values. An existing
// buffer is reused when its recorded size is large enough. All existing
// plans are destroyed, and one 1-D C2C plan is created for each batch count
// 1, 2, 4, ... that does not exceed batch_len.
//
// Returns true on success. On any failure ReleaseAll() is called and false
// is returned.
bool CCuFFT_Processor::Alloc( size_t data_len, size_t batch_len )
{
    bool         chk   = true;
    const size_t bytes = sizeof( cufftComplex ) * data_len * batch_len;

    // CUDA resources

    // The current buffer is too small: drop it so a larger one is allocated.
    if ( _data_bytes < bytes )
        chk = chk && ReleaseMemory();

    if ( _data == NULL )
    {
        chk = chk && cudaCheck(
            cudaMalloc( reinterpret_cast<void **>( &_data ), bytes ) );

        if ( chk )
        {
            _data_bytes = bytes;
        }
        else
        {
            // The allocation failed or was skipped. Do not record a size
            // for a buffer that does not exist, otherwise _data == NULL
            // would be paired with a non-zero _data_bytes.
            _data       = NULL;
            _data_bytes = 0;
        }
    }

    // cuFFT resources

    chk = chk && ReleasePlans();

    for ( size_t b = 1; chk && ( b <= batch_len ); b *= 2 )
    {
        cufftHandle new_plan;

        chk = cufftCheck(
            cufftPlan1d( &new_plan,
                         static_cast<int>( data_len ),
                         CUFFT_C2C,
                         static_cast<int>( b ) ) );

        // Only keep handles that were actually created.
        if ( chk )
            _plans.push_back( new_plan );
    }

    if ( !chk )
        ReleaseAll();

    return chk;
}

bool CCuFFT_Processor::Run()
{
    bool chk = true;

    chk = cufftCheck(
        cufftExecC2C( *_plans.rbegin(), _data, _data, CUFFT_FORWARD ) );

    if ( !chk )
        ReleaseAll();

    chk = chk && cudaCheck( cudaDeviceSynchronize() );

    return chk;
}

最后，程序

// Drive the processor through three stages with the same batch count:
// a normal run, a run with a 4x longer transform that is expected to fail,
// and a normal run again to check recovery. Results are not inspected and
// the program always exits with EXIT_SUCCESS.
int main()
{
    const size_t batch       = 1 << 5;
    const size_t base_length = 1 << 21;

    // Stage lengths, in order: normal, oversized (error), normal (recovery).
    const size_t stage_lengths[] =
        { base_length, base_length * 4, base_length };

    CCuFFT_Processor proc;

    for ( const size_t length : stage_lengths )
    {
        if ( proc.Alloc( length, batch ) )
            proc.Run();
    }

    return EXIT_SUCCESS;
}

如果我使用較小的length = 1 << 18 ，則不會發生錯誤。 但是，對於較大的length = 1 << 21會出現兩個錯誤：

cuFFT failed to allocate GPU or CPU memory
Failed to execute an FFT on the GPU

第一個錯誤是預料之中的，我們是故意這樣做的。 但是第二個不是。 盡管設備已重置並且新資源已成功分配，但 cuFFT 無法執行 FFT。

我使用 GTX 970。我嘗試了以下所有組合：cuda 6.5、cuda 7.5、32 位平台、64 位平台等，但均未成功。

Answer 1

這顯然是一個僅限於舊版本 cuFFT 的內存不足錯誤恢復行為的問題，並在 CUDA 8 發布周期中得到糾正。 如果（6 年后）您仍在使用 CUDA 8 之前的 cuFFT 版本，請更新到更現代的版本，此問題將得到解決。

[根據評論收集的答案並添加為社區 wiki 條目，以便將問題從 CUDA 和 cuFFT 標簽的未回答列表中刪除]

cuFFT 出錯后無法恢復

問題描述

1 個解決方案

解決方案1
0

cuFFT 出錯后無法恢復

問題描述

1 個解決方案

解決方案1 0

解決方案1
0