Why does cudaGraphicsGLRegisterBuffer segfault when I change my cuda kernel?

Question

I'm on Ubuntu 14.04 with CUDA toolkit 6.5 and NVIDIA driver version 340.29. My application registers a pixel buffer object (PBO) from OpenGL, writes an image to the buffer every loop iteration, copies the PBO to a texture using glTexSubImage2D, and draws the texture. This all works properly until I change my image-generation kernel; then gdb reports a segmentation fault in cudaGraphicsGLRegisterBuffer. My guess is that this is a bug, because the CUDA kernel is completely unrelated to cudaGraphicsGLRegisterBuffer, which is called before any processing.

Makefile

# Build configuration for the CUDA/OpenGL interop demo.
CUDA=nvcc
CPP=g++

OUT=out

INC=-I/usr/local/cuda-6.5/include

LINK=-lcudart -lglfw -lGLEW -lGL

FLAGS=-std=gnu++11
CUFLAGS=-std=c++11

# These targets are commands, not files. Without .PHONY a stray file named
# "all", "clean" or "add" would stop the corresponding recipe from running.
.PHONY: all clean add

# Compile and link both translation units with nvcc in one step.
all: main.cu GLdisplay.cu
	$(CUDA) main.cu GLdisplay.cu -o $(OUT) $(CUFLAGS) $(INC) $(LINK)

# -f: do not fail when the binary has not been built yet.
clean:
	rm -f ./$(OUT)

add:
	git add -A
	git status

main.cu

#define  GLEW_STATIC

// C++ headers
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include <stdint.h>

// openGL headers
#include <GL/glew.h>
#include <GLFW/glfw3.h>

// CUDA headers
#include <cuda_runtime.h>
#include <cuda_gl_interop.h>

#include "GLdisplay.h"

#define WINDOW_WIDTH  640
#define WINDOW_HEIGHT 480

#define TEX_WIDTH     1920
#define TEX_HEIGHT    1080

using std::cout;
using std::cerr;
using std::endl;
using std::string;
using std::ifstream;

// Window handle returned by glfwCreateWindow in main().
GLFWwindow* window;
// OpenGL object names created in main(): vertex array, vertex buffer and the
// pixel unpack buffer (PBO) that is shared with CUDA.
GLuint vao, vbo, pbo;
// Vertex shader, fragment shader and the program they are attached to.
GLuint vtx, frg, shaders;
// Texture that receives the PBO contents through glTexSubImage2D.
GLuint tex;

// Pointer and byte size of the mapped PBO, filled in every frame by
// cudaGraphicsResourceGetMappedPointer.
uint8_t* cudaPBOptr;
size_t cudaPBOsize;

// CUDA handle for the PBO, obtained from cudaGraphicsGLRegisterBuffer.
cudaGraphicsResource_t cuGfxPBO;

// Reads the text file `filename` and returns its contents with every line
// terminated by exactly one "\n" (a final line without a trailing newline
// gets one appended).
//
// Returns an empty string, after printing a message to stderr, when the file
// cannot be opened. An empty file also yields an empty string.
std::string loadTxtFileAsString( std::string filename )
{
    std::ifstream file( filename.c_str( ), std::ios::in );

    if( !file.is_open( ) )
    {
        std::cerr << "could not open " << filename << std::endl;
        return std::string( );
    }

    std::string source;
    std::string line;

    // Test the result of getline itself: checking good() before the read
    // would append one extra empty line after the last successful read.
    while( std::getline( file, line ) )
    {
        source += line;
        source += '\n';
    }

    // The stream closes itself when `file` goes out of scope.
    return source;
}

void shaderCompileCheck( void )
{
    GLint status;

    // vertex
    glGetShaderiv( vtx, GL_COMPILE_STATUS, &status );

    if( GL_TRUE != status )
    {
        char buffer[ 512 ];
        glGetShaderInfoLog( vtx, 512, NULL, buffer );
        cerr << "vtx err | " << buffer << endl;
    }

    // fragment
    glGetShaderiv( frg, GL_COMPILE_STATUS, &status );

    if( GL_TRUE != status )
    {
        char buffer[ 512 ];
        glGetShaderInfoLog( frg, 512, NULL, buffer );
        cerr << "frg err | " << buffer << endl;
    }
}

// added exit on !cudaSuccess
// cudaErr( call ) forwards the call's result together with the current file
// and line to cudaError().
#define cudaErr(err) cudaError( err, __FILE__, __LINE__ )

// Does nothing for cudaSuccess. For any other code prints
// "[file:line] <error name>" to stderr (the enum name from
// cudaGetErrorName, not the description) and, when `abort` is true (the
// default), terminates the process with the error code as exit status.
inline void cudaError( cudaError_t err, const char* file, uint32_t line, bool abort=true )
{
    if( cudaSuccess == err ) return;

    cerr << "[" << file << ":" << line << "] " << cudaGetErrorName( err ) << endl;

    if( abort )
    {
        exit( err );
    }
}

// added GL error checking
// glErr( ) reads one OpenGL error flag with glGetError() and hands it to
// glError() together with the current file and line.
#define glErr( ) glError( glGetError( ), __FILE__, __LINE__ )

// Does nothing for GL_NO_ERROR. For any other code prints
// "[file:line] <GL error name> (0x<code>)" to stderr and, when `abort` is
// true (default: false), terminates the process with the code as exit status.
//
// The name is looked up here rather than with glewGetErrorString: that
// function decodes GLEW status codes (the values glewInit returns), not the
// enums produced by glGetError.
inline void glError( GLenum err, const char* file, uint32_t line, bool abort=false )
{
    if( GL_NO_ERROR == err ) return;

    const char* name;
    switch( err )
    {
        case GL_INVALID_ENUM:                  name = "GL_INVALID_ENUM";                  break;
        case GL_INVALID_VALUE:                 name = "GL_INVALID_VALUE";                 break;
        case GL_INVALID_OPERATION:             name = "GL_INVALID_OPERATION";             break;
        case GL_INVALID_FRAMEBUFFER_OPERATION: name = "GL_INVALID_FRAMEBUFFER_OPERATION"; break;
        case GL_OUT_OF_MEMORY:                 name = "GL_OUT_OF_MEMORY";                 break;
        default:                               name = "unknown GL error";                 break;
    }

    cerr << "[" << file << ":" << line << "] ";
    cerr << name << " (0x" << std::hex << err << std::dec << ")" << endl;
    if( abort ) exit( err );
}

// main function
int main( int argc, char* argv[] )
{
    string vString = loadTxtFileAsString( "vert.glsl" );
    string fString = loadTxtFileAsString( "frag.glsl" );

    const GLchar* vtxSource = vString.c_str( );
    const GLchar* frgSource = fString.c_str( );

/////////////////////////////////////////////////

    glfwInit( );

    glfwWindowHint( GLFW_CONTEXT_VERSION_MAJOR, 3 );
    glfwWindowHint( GLFW_CONTEXT_VERSION_MINOR, 2 );
    glfwWindowHint( GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE );
    glfwWindowHint( GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE );

    glfwWindowHint( GLFW_RESIZABLE, GL_FALSE );

    window = glfwCreateWindow( WINDOW_WIDTH, WINDOW_HEIGHT, "CUDA-GL", NULL, NULL );

    glfwMakeContextCurrent( window );

    glewExperimental = GL_TRUE;
    glewInit( );
    glErr(); // added GL error checking

    cudaErr( cudaSetDevice( 0 ) );
    glErr(); // added GL error checking

/////////////////////////////////////////////////

    GLfloat vertices[] = {
    //   X      Y     U     V
        -1.0f,  1.0f, 0.0f, 1.0f, // t l
         1.0f,  1.0f, 1.0f, 1.0f, // t r
        -1.0f, -1.0f, 0.0f, 0.0f, // b l

        -1.0f, -1.0f, 0.0f, 0.0f, // b l
         1.0f, -1.0f, 1.0f, 0.0f, // b r
         1.0f,  1.0f, 1.0f, 1.0f  // t r
    };

    GLbyte pboInit[ TEX_WIDTH * TEX_HEIGHT * 4 ];
    memset( pboInit, 127, sizeof( pboInit ) );

/////////////////////////////////////////////////

    glGenVertexArrays( 1, &vao );
    glBindVertexArray( vao );
    glErr(); // added GL error checking

    glGenBuffers( 1, &pbo );
    glBindBuffer( GL_PIXEL_UNPACK_BUFFER, pbo );
    glBufferData( GL_PIXEL_UNPACK_BUFFER, TEX_WIDTH * TEX_HEIGHT * 4, pboInit, GL_STREAM_DRAW );
    glBindBuffer( GL_PIXEL_UNPACK_BUFFER, 0 );
    glErr(); // added GL error checking

    glActiveTexture( GL_TEXTURE0 );
    glGenTextures( 1, &tex );
    glBindTexture( GL_TEXTURE_2D, tex );
    glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA, TEX_WIDTH, TEX_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL );
    glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR );
    glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR );
    glErr(); // added GL error checking

    glGenBuffers( 1, &vbo );
    glBindBuffer( GL_ARRAY_BUFFER, vbo );
    glBufferData( GL_ARRAY_BUFFER, sizeof( vertices ), vertices, GL_STATIC_DRAW );
    glErr(); // added GL error checking

/////////////////////////////////////////////////

    cudaErr( cudaGraphicsGLRegisterBuffer( &cuGfxPBO, pbo, cudaGraphicsMapFlagsNone ) );
    glErr(); // added GL error checking

/////////////////////////////////////////////////

    vtx = glCreateShader( GL_VERTEX_SHADER );
    glShaderSource( vtx, 1, &vtxSource, NULL );
    glCompileShader( vtx );

    frg = glCreateShader( GL_FRAGMENT_SHADER );
    glShaderSource( frg, 1, &frgSource, NULL );
    glCompileShader( frg );

    shaderCompileCheck( );

    shaders = glCreateProgram( );
    glAttachShader( shaders, vtx );
    glAttachShader( shaders, frg );

    glBindFragDataLocation( shaders, 0, "outColor" );

    glLinkProgram( shaders );
    glUseProgram( shaders );

/////////////////////////////////////////////////

    GLuint posAtt = glGetAttribLocation( shaders, "position" );
    glEnableVertexAttribArray( posAtt );
    glVertexAttribPointer( posAtt, 2, GL_FLOAT, GL_FALSE, 4 * sizeof( GLfloat ), NULL );

    GLuint texAtt = glGetAttribLocation( shaders, "texcoord" );
    glEnableVertexAttribArray( texAtt );
    glVertexAttribPointer( texAtt, 2, GL_FLOAT, GL_FALSE, 4 * sizeof( GLfloat ), ( void* )( 2 * sizeof( GLfloat ) ) );

/////////////////////////////////////////////////

    glClearColor( 0.0f, 0.0f, 0.0f, 1.0f );

    while( !glfwWindowShouldClose( window ) )
    {
        cudaErr( cudaGraphicsMapResources( 1, &cuGfxPBO, 0 ) );
        cudaErr( cudaGraphicsResourceGetMappedPointer( ( void** )&cudaPBOptr, &cudaPBOsize, cuGfxPBO ) );
        glErr(); // added GL error checking

        dim3 gridDim( ( int32_t )ceil( ( float )TEX_WIDTH / 16 ), ( int32_t )ceil( ( float )TEX_HEIGHT / 16 ) );
        dim3 blockDim( 16, 16 );

        makeImage<<< gridDim, blockDim >>>( cudaPBOptr, TEX_WIDTH, TEX_HEIGHT );
        cudaDeviceSynchronize( ); cudaErr( cudaGetLastError( ) );
        glErr(); // added GL error checking

        cudaErr( cudaGraphicsUnmapResources( 1, &cuGfxPBO, 0 ) );
        glErr(); // added GL error checking

/////////////////////////////////////////////////

        glBindBuffer( GL_PIXEL_UNPACK_BUFFER, pbo );
        glErr(); // added GL error checking

        glTexSubImage2D( GL_TEXTURE_2D, 0, 0, 0, TEX_WIDTH, TEX_HEIGHT, GL_RGBA, GL_UNSIGNED_BYTE, NULL );
        glErr(); // added GL error checking

        glBindBuffer( GL_PIXEL_UNPACK_BUFFER, 0 );
        glErr(); // added GL error checking

/////////////////////////////////////////////////

        glClear( GL_COLOR_BUFFER_BIT );

        glDrawArrays( GL_TRIANGLES, 0, 6 );
        glErr(); // added GL error checking

        glfwSwapBuffers( window );

        glfwPollEvents( );
    }

/////////////////////////////////////////////////

    glDeleteTextures( 1, &tex );

    glDeleteProgram( shaders );
    glDeleteShader( frg );
    glDeleteShader( vtx );

    glDeleteBuffers( 1, &pbo );
    glDeleteBuffers( 1, &vbo );

    glDeleteVertexArrays( 1, &vao );

    return 0;
}

GLdisplay.cu

#include <cuda_runtime.h>
#include <stdint.h>

#include "GLdisplay.h"

// Fills an RGBA image (4 bytes per pixel, row-major) with a colour gradient,
// one thread per pixel.
//
//   output        : buffer of at least width * height * 4 bytes
//   width, height : image size in pixels
//
// Launch layout: a 2D grid of 2D blocks that covers at least width x height
// threads; threads that fall outside the image return without writing.
__global__ void makeImage( uint8_t* output, uint32_t width, uint32_t height )
{
    uint32_t xIdx = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t yIdx = threadIdx.y + blockIdx.y * blockDim.y;

    if( xIdx >= width || yIdx >= height ) return;

    // Widen before multiplying so large images cannot wrap a 32-bit index.
    size_t pixel = ( size_t )yIdx * width + xIdx;
    size_t oIdx  = pixel * 4;

    // Position of this pixel within the image, in [0, 1).
    float t = ( float )pixel / ( ( float )width * ( float )height );

// segfault occurs if any of the lines below are commented or set to write 0
    uint8_t red = ( uint8_t )( 255.0f * t );
    uint8_t gre = ( uint8_t )( 255.0f * ( 1.0f - t ) );
    // The 0.5 offset belongs inside the absolute value: 510 * |t - 0.5| stays
    // within 0..255, whereas 510 * |t| - 0.5 reached ~509 and its conversion
    // to uint8_t was out of range.
    uint8_t blu = ( uint8_t )( 510.0f * fabsf( t - 0.5f ) );

    output[ oIdx + 0 ] = red;
    output[ oIdx + 1 ] = gre;
    output[ oIdx + 2 ] = blu;
    output[ oIdx + 3 ] = 255;

 }

GLdisplay.h

#ifndef GL_DISPLAY_H
#define GL_DISPLAY_H

// The declaration below uses uint8_t / uint32_t, so include their header
// here instead of relying on what the including file pulled in first.
#include <stdint.h>

// Kernel that writes a 4-bytes-per-pixel colour gradient into `output`.
//   output        : buffer of at least width * height * 4 bytes
//   width, height : image size in pixels
// Expects a 2D launch covering at least width x height threads.
__global__ void makeImage( uint8_t* output, uint32_t width, uint32_t height );

#endif

vert.glsl

#version 150

// Pass-through vertex shader: forwards the texture coordinate and places the
// vertex at its 2D input position.

// Per-vertex inputs.
in vec2 position;
in vec2 texcoord;

// Texture coordinate handed on to the next stage.
out vec2 Texcoord;

void main( )
{
    Texcoord = texcoord;
    // Extend the 2D position to a vec4 with z = 0 and w = 1.
    gl_Position = vec4( position, 0.0, 1.0 );
}

frag.glsl

#version 150

// Fragment shader: colours each fragment with the texel sampled at its
// texture coordinate.

// Texture coordinate input.
in vec2 Texcoord;

// Resulting fragment colour.
out vec4 outColor;

// Texture to sample from.
uniform sampler2D tex;

void main( )
{
    outColor = texture( tex, Texcoord );
}

Answer 1

Your code lacks error condition checking. First and foremost I suggest you check the error code status after each and every OpenGL and CUDA call. I've got a very strong hunch what the problem is and adding the error checking will tell if this is the case.

So: I see that you never properly unbind the PBO from OpenGL before trying to map it to a CUDA pointer. Depending on the path your program takes this may result in the PBO actually not getting mapped, hence giving you an invalid pointer that if accessed, from the CUDA kernel or the host, will result in a segfault.

To pin down the problem, check the error status of every OpenGL and CUDA operation. CUDA will tell you if it can't map an OpenGL resource, for whatever reason (like the resource still being bound to an OpenGL functional unit).

Answer 2

Updating to nvidia display driver 346.47 solved my problem completely. For the unsure, run nvidia-smi in a terminal window, and if you don't see NVIDIA-SMI 346.xx (xx being an arbitrary number), update with the newest driver available from nvidia. At the time of writing this, CUDA toolkit 6.5 ships with an outdated graphics driver.

Why does cudaGraphicsGLRegisterBuffer segfault when I change my cuda kernel?

Question

2 answers

solution1
0 2015-03-08 20:14:06

solution2
0 ACCEPTED 2015-03-09 04:02:08

Why does cudaGraphicsGLRegisterBuffer segfault when I change my cuda kernel?

Question

2 answers

solution1 0 2015-03-08 20:14:06

solution2 0 ACCEPTED 2015-03-09 04:02:08

solution1
0 2015-03-08 20:14:06

solution2
0 ACCEPTED 2015-03-09 04:02:08