简体   繁体   English

如何在dll中使用CUDA?

[英]How to use CUDA in a dll?

my purpose is to use CUDA through a stub library, which encapsulates CUDA specific functions, methods and data. 我的目的是通过存根库使用CUDA,该存根库封装了CUDA特定的功能,方法和数据。 CUDA stuff will reside inside a dll file and it will be used dynamically via LoadLibrary and GetProcAddress Windows API functions. CUDA内容将驻留在dll文件中,并将通过LoadLibrary和GetProcAddress Windows API函数动态使用。 I am using Visual Studio 2010 C++ compiler to create CUDA dll, but the rest is done by another compiler. 我正在使用Visual Studio 2010 C ++编译器创建CUDA dll,但其余工作由另一个编译器完成。 That means, that I cannot use heap memory (ie. malloc, new or anything, which lives outside stack memory. Even global variables cause memory corruption) inside CUDA dll, so it is literally a stub library. 这意味着,我不能在CUDA dll中使用堆内存(即,位于堆内存之外的malloc,new或其他任何东西,甚至全局变量也会导致内存损坏),因此它实际上是一个存根库。

However, first I want to code a test program: CUDAhost.exe and CUDAdevice.dll, which both are compiled by Visual Studio 2010, two projects in a single solution. 但是,首先,我想编写一个测试程序:CUDAhost.exe和CUDAdevice.dll,它们都由Visual Studio 2010编译,这两个项目在一个解决方案中。 This program draws an OpenGL texture to the screen, but first the image data is copied from readGLTexture to viewGLTexture by CUDA. 该程序在屏幕上绘制OpenGL纹理,但是首先,图像数据由CUDA从readGLTexture复制到viewGLTexture。 Notice that to avoid using heap memory, I use void pointer references void* &cReadCudaResource and void* &cViewCudaResource. 请注意,为避免使用堆内存,我使用了void指针引用void *&cReadCudaResource和void *&cViewCudaResource。 My problem is I cannot get the program working, the window is black. 我的问题是我无法使程序正常工作,窗口是黑色的。 I cannot find the error. 我找不到错误。 I'm not sure is this possible at all or should I choose completely different solution. 我不确定这是否完全可能,还是我应该选择完全不同的解决方案。 I hope you can help me. 我希望你能帮助我。 Any suggestions are appreciated. 任何建议表示赞赏。 Below is the source code: 下面是源代码:

CUDAhost.cpp: CUDAhost.cpp:

#include "stdafx.h"

const unsigned int window_width  = 512;
const unsigned int window_height = 512;

GLuint viewGLTexture;
GLuint readGLTexture;
void* cViewCudaResource;
void* cReadCudaResource;
HINSTANCE dll;
typedef void (*SETCUDA)(unsigned int& readGLTexture, void*   &cReadCudaResource, unsigned int& viewGLTexture, void* &cViewCudaResource);
SETCUDA setCuda;
typedef void (*DRAWPICTURE)(void* &cReadCudaResource, void* &cViewCudaResource);
DRAWPICTURE drawPicture;

bool loadTexture(const wchar_t* name, GLuint& number) {

  FILE* file;
  BITMAPFILEHEADER bitmapFileHeader;
  BITMAPINFOHEADER bitmapInfoHeader;
  unsigned char *bitmap;
  unsigned char temp;
  wchar_t path[45]={0};
  int width;
  int height;

//prepare file path
  wcsncat_s(path, L"Textures\\", 45);
  wcsncat_s(path, name, 45);
  wcsncat_s(path, L".bmp", 45);

//open BMP file
  file=_wfopen(path, L"rb");
  if (file==NULL) {
    return false;
  }

//read bmp file header and sequre it is bmp file
  fread(&bitmapFileHeader, sizeof(BITMAPFILEHEADER), 1, file);
  if (bitmapFileHeader.bfType != 0x4D42) {
    fclose(file);
    return false;
  }

//read bmp info header and move to the beginning of image data
  fread(&bitmapInfoHeader, sizeof(BITMAPINFOHEADER), 1, file);
  fseek(file, bitmapFileHeader.bfOffBits, SEEK_SET);

//allocate memory space
  bitmap=(unsigned char*)malloc(bitmapInfoHeader.biSizeImage);
  if (!bitmap) {
free(bitmap);
bitmap=NULL;
    fclose(file);
    return false;
  }

//read image
  fread(bitmap, 1, bitmapInfoHeader.biSizeImage, file);
  if (file==NULL) {
free(bitmap);
bitmap=NULL;
    fclose(file);
    return false;
  }

//rearrange bgr to rgb
  for (int i=0; i<bitmapInfoHeader.biSizeImage; i+=3) {
    temp=bitmap[i];
    bitmap[i]=bitmap[i+2];
    bitmap[i+2]=temp;
  }

//query image width and height
  width=bitmapInfoHeader.biWidth;
  height=abs(bitmapInfoHeader.biHeight);

//close bmp file
  fclose(file);
  glGetError();

//create OpenGL texture
  glGenTextures(1, &number);
  glBindTexture(GL_TEXTURE_2D, number);
  glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGB, GL_UNSIGNED_BYTE, bitmap);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);

//free temporary buffer
  free(bitmap);
  bitmap=NULL;

//if success, return true
  if (0==glGetError()) {
    return true;
  } else {
    return false;
  }
}

void initGLandCUDA(int argc, char* argv[]) {

  glutInit(&argc, argv);
  glutInitDisplayMode(GLUT_RGBA);
  glutInitWindowSize(window_width, window_height);
  glutCreateWindow("CUDA GL Interop");

  glewInit();

  glEnable(GL_TEXTURE_2D);
  bool success=loadTexture(L"Tex", readGLTexture);

  glGenTextures(1, &viewGLTexture);
  glBindTexture(GL_TEXTURE_2D, viewGLTexture);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
  glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, window_width, window_height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
  glBindTexture(GL_TEXTURE_2D, 0);

  dll=LoadLibraryW(L"CUDAdevice.dll");
  if (dll) {
    setCuda=(SETCUDA)GetProcAddress(dll, "setCuda");
    setCuda(readGLTexture, cReadCudaResource, viewGLTexture, cViewCudaResource);
  }
}    

void renderFrame() {
  if (dll) {
    drawPicture=(DRAWPICTURE)GetProcAddress(dll, "drawPicture");
    drawPicture(cReadCudaResource, cViewCudaResource);
  }
  glBindTexture(GL_TEXTURE_2D, viewGLTexture);
  {
    glBegin(GL_QUADS);
    {
      glTexCoord2f(0.0f, 0.0f); glVertex2f(-1.0f, -1.0f);
      glTexCoord2f(1.0f, 0.0f); glVertex2f(+1.0f, -1.0f);
      glTexCoord2f(1.0f, 1.0f); glVertex2f(+1.0f, +1.0f);
      glTexCoord2f(0.0f, 1.0f); glVertex2f(-1.0f, +1.0f);
    }
    glEnd();
  }
  glBindTexture(GL_TEXTURE_2D, 0);
  glFinish();
}

int _tmain(int argc, _TCHAR* argv[])
{
  initGLandCUDA(argc, reinterpret_cast<char**>(argv));
  glutDisplayFunc(renderFrame);
  glutMainLoop();
  return 0;
}

dllmain.cpp: dllmain.cpp:

BOOL APIENTRY DllMain( HMODULE hModule,
                   DWORD  ul_reason_for_call,
                   LPVOID lpReserved
                 )
{
  switch (ul_reason_for_call)
  {
    case DLL_PROCESS_ATTACH:
    case DLL_THREAD_ATTACH:
    case DLL_THREAD_DETACH:
    case DLL_PROCESS_DETACH:
    break;
  }
  return TRUE;
}

//this function is used to setup CUDA
void setCuda(unsigned int& readGLTexture, void* &cReadCudaResource, unsigned int& viewGLTexture, void* &cViewCudaResource) {

  struct cudaGraphicsResource* viewCudaResource;
  struct cudaGraphicsResource* readCudaResource;
  cudaError cError;

  cudaGLSetGLDevice(0);
  cError=cudaGraphicsGLRegisterImage(&viewCudaResource, viewGLTexture, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly);
  cError=cudaGraphicsGLRegisterImage(&readCudaResource, readGLTexture, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsSurfaceLoadStore);
  cReadCudaResource=reinterpret_cast<void*>(readCudaResource);
  cViewCudaResource=reinterpret_cast<void*>(viewCudaResource);
}    

//this function is used to draw texture image via CUDA
void drawPicture(void* &cReadCudaResource, void* &cViewCudaResource) {

  cudaError cError;

  struct cudaGraphicsResource* viewCudaResource=reinterpret_cast<cudaGraphicsResource*>(cReadCudaResource);
  struct cudaGraphicsResource* readCudaResource=reinterpret_cast<cudaGraphicsResource*>(cViewCudaResource);
  cudaArray *readCudaArray;
  cudaArray *viewCudaArray;

  cError=cudaGraphicsMapResources(1, &readCudaResource);
  cError=cudaGraphicsMapResources(1, &viewCudaResource);
  cError=cudaGraphicsSubResourceGetMappedArray(&readCudaArray, readCudaResource, 0, 0);
  cError=cudaGraphicsSubResourceGetMappedArray(&viewCudaArray, viewCudaResource, 0, 0);
  callCUDAKernel(readCudaArray, viewCudaArray);
  cudaGraphicsUnmapResources(1, &viewCudaResource);
  cudaStreamSynchronize(0);
}

kernels.cu: kernels.cu:

    #include "stdafx.h"

texture<uchar4, cudaTextureType2D, cudaReadModeElementType> readCudaTextureObject;
surface<void, cudaSurfaceType2D> viewCudaSurfaceObject;


__global__ void renderingKernel() {

  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

  uchar4 dd=tex2D(readCudaTextureObject, x, y);

  surf2Dwrite(dd, viewCudaSurfaceObject, x*sizeof(dd), y, cudaBoundaryModeZero);
}


void callCUDAKernel(cudaArray *readCudaArray, cudaArray *viewCudaArray) {

  cudaError cError;

  cError=cudaBindTextureToArray(readCudaTextureObject, readCudaArray);
  cError=cudaBindSurfaceToArray(viewCudaSurfaceObject, viewCudaArray);
  dim3 block(256, 1, 1);
  dim3 grid(2, 512, 1);
  renderingKernel<<<grid, block>>>();
  cudaPeekAtLastError();
  cudaDeviceSynchronize();
}

CUDAdevice's stdafx.h: CUDAdevice的stdafx.h:

    #pragma once

#include "targetver.h"

#define WIN32_LEAN_AND_MEAN             // Exclude rarely-used stuff from Windows headers
// Windows Header Files:
#include <windows.h>



// TODO: reference additional headers your program requires here
#include <cuda_runtime_api.h>
#include <cuda_gl_interop.h>

#include "kernels.h"

#if defined (__cplusplus)
extern "C"
{
#endif

__declspec(dllexport) void setCuda(unsigned int& readGLTexture, void* &cReadCudaResource, unsigned int& viewGLTexture, void* &cViewCudaResource);
__declspec(dllexport) void drawPicture(void* &cReadCudaResource, void* &cViewCudaResource);

#if defined (__cplusplus)
}
#endif

kernels.h: kernels.h:

    #ifndef __kernels_H
#define __kernels_H

void callCUDAKernel(cudaArray *readCudaArray, cudaArray *viewCudaArray);

#endif

PS. PS。 I have also set paths to CUDA libraries, headers, sources and binaries, added cudart.lib to additional dependencies, set CUDA 4.2 targets and compute_20,sm_21. 我还设置了CUDA库,标头,源和二进制文件的路径,将cudart.lib添加到其他依赖项,设置了CUDA 4.2目标和compute_20,sm_21。 Test program uses GLEW and GLUT libraries. 测试程序使用GLEW和GLUT库。

Better update solution here if someone reads this... 如果有人阅读此内容,可以在这里更好地更新解决方案...

The above code cannot work. 上面的代码不起作用。 The whole idea was to avoid using heap memory in a DLL module. 整个想法是避免在DLL模块中使用堆内存。 But, CUDA needs textures and surfaces to be global: 但是,CUDA需要纹理和表面是全局的:

texture<uchar4, cudaTextureType2D, cudaReadModeElementType>readCudaTextureObject;
surface<void, cudaSurfaceType2D> viewCudaSurfaceObject;

However, to avoid using heap memory requires these to be local, so this is impossible. 但是,要避免使用堆内存,这些内存必须位于本地,因此这是不可能的。

Furthermore, according to my understanding, there shouldn't be any problem to use a heap memory inside a CUDA DLL module. 此外,根据我的理解,在CUDA DLL模块中使用堆内存应该没有任何问题。 I've already made working EXE/DLL test programs, which use heap memory inside DLL module. 我已经制作了可以运行的EXE / DLL测试程序,该程序使用DL​​L模块中的堆内存。 I haven't found the bug in my previous attempts, but it may be due to the GLEW library. 我以前的尝试中没有发现该错误,但这可能是由于GLEW库造成的。 I'm not using GLEW with my working programs. 我的工作程序中没有使用GLEW。

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM