CUDA function doesn't change data

Question

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

// Helpers for the LCS table. Every macro argument is parenthesized so that
// expression arguments (e.g. `a & b`, `c ? x : y`) keep their intended grouping.
//
// SZ_INT           - size of one int in bytes.
// CELL_SZ          - multiplier applied to table sizes and offsets (1 here).
// CELL_VALUE(a, x) - packs a length `a` and a one-bit flag `x` into one int:
//                    the length occupies bits 1 and up, the flag occupies bit 0.
// FROM(a)          - extracts the flag (bit 0) of a packed cell.
// LENGTH(a)        - extracts the length (bits 1 and up) of a packed cell.
// INDEX(i, j, m)   - flat offset of cell (i, j) in a table whose rows are m + 1
//                    ints apart.
#define SZ_INT sizeof(int)
#define CELL_SZ 1
#define CELL_VALUE(a,x) (((a) << 1) | (x))
#define FROM(a) ((a) & 1)
#define LENGTH(a) ((a) >> 1)
#define INDEX(i,j,m) ((i) * ((m) + 1) + (j))
//FROM: 1 if L[i][j] took value from L[i - 1][j], 0 if L[i][j] took value from L[i][j - 1]

// Reports a failed CUDA runtime call on stdout together with the file and line
// of the check. Execution continues after the report (the macro does not abort).
//
// `err` is evaluated exactly once and stored in a local, so passing a call such
// as cudaMemcpy(...) never re-executes it on the failure path. The do/while(0)
// wrapper makes the macro behave as a single statement, so it is safe inside
// un-braced if/else bodies. Always follow the invocation with a semicolon.
#define CUDA_CHECK_ERROR(err)                                                       \
do {                                                                                \
        cudaError_t cuda_check_status_ = (err);                                     \
        if (cuda_check_status_ != cudaSuccess) {                                    \
                printf("Cuda error: %s\n", cudaGetErrorString(cuda_check_status_)); \
                printf("Error in file: %s, line: %i\n", __FILE__, __LINE__);        \
        }                                                                           \
} while (0)

// Fills every LCS-table cell (i, j) that lies on anti-diagonal `diag`
// (i + j == diag) with 0 <= i < n and 0 <= j < m.
//
// Launch layout: 1-D grid of 1-D blocks. Columns are distributed over threads
// with a stride of the total thread count, so any positive launch configuration
// covers all m columns (previously columns beyond the thread count were
// silently skipped).
//
// Parameters:
//   L    - table of packed cells (see CELL_VALUE), addressed through INDEX.
//          Cells (i - 1, *) and (*, j - 1) are read without any check for
//          i == 0 or j == 0, so the caller must provide readable cells at those
//          offsets (row -1 and column -1 relative to L).
//   A, n - first sequence and its length.
//   B, m - second sequence and its length.
//   diag - index of the anti-diagonal to compute.
//
// Each cell reads only cells with a smaller i + j, so cells on one diagonal do
// not depend on each other.
__global__ void Find_L_entry (int *L, int *A, int n, int *B, int m, int diag) {
        const int stride = blockDim.x * gridDim.x;
        for (int j = threadIdx.x + blockIdx.x * blockDim.x; j < m; j += stride) {
                const int i = diag - j;
                if (i < 0)
                        break;          // i only decreases as j grows
                if (i >= n)
                        continue;       // below the table; a larger j may still be inside

                if (A[i] == B[j]) {
                        // Match: extend the diagonal predecessor by one.
                        const int diag_len = LENGTH(L[INDEX(i - 1, j - 1, m)]);
                        L[INDEX(i, j, m)] = CELL_VALUE(diag_len + 1, 0);
                } else {
                        // No match: keep the longer of "up" and "left"; ties go left (flag 0).
                        const int up_len   = LENGTH(L[INDEX(i - 1, j, m)]);
                        const int left_len = LENGTH(L[INDEX(i, j - 1, m)]);
                        L[INDEX(i, j, m)] = (up_len > left_len) ?
                                        CELL_VALUE(up_len, 1) :
                                        CELL_VALUE(left_len, 0);
                }
        }
}

// Prints the LCS length and one longest common subsequence of A and B by
// walking the finished table backwards from cell (n - 1, m - 1).
//
// Parameters:
//   L    - host copy of the table of packed cells, addressed through INDEX.
//   A, n - first sequence and its length.
//   B, m - second sequence and its length.
//
// Output format: "Length: <len>\nSequence: " followed by the elements separated
// by single spaces and a terminating newline (the newline is now also printed
// when the subsequence is empty).
__host__ void output_sequence(int *L, int *A, int n, int *B, int m) {
        // With an empty input there is no cell (n - 1, m - 1) to read.
        const bool no_cells = (n <= 0 || m <= 0);
        const int len = no_cells ? 0 : LENGTH(L[INDEX(n - 1, m - 1, m)]);

        // Allocate at least one element so a zero length never yields malloc(0).
        int *lcs = (int*) malloc((len > 0 ? len : 1) * SZ_INT);
        if (lcs == NULL) {
                fprintf(stderr, "output_sequence: out of memory\n");
                return;
        }

        int top = 0;
        int i = n - 1, j = m - 1;
        // `top < len` bounds the writes into lcs even if the table contents are
        // inconsistent with the stored length.
        while (i >= 0 && j >= 0 && top < len) {
                if (A[i] == B[j]) {
                        lcs[top++] = A[i];
                        i--; j--;
                } else if (FROM(L[INDEX(i, j, m)]) == 1) {
                        i--;
                } else {
                        j--;
                }
        }

        printf("Length: %d\nSequence: ", len);
        // Elements were collected back to front, so print them in reverse.
        for (int k = top - 1; k >= 0; k--) {
                printf("%d%c", lcs[k], k ? ' ' : '\n');
        }
        if (top == 0)
                printf("\n");
        free(lcs);
}

// Prompts for and reads one integer sequence from standard input.
//
// Parameters:
//   A   - receives a malloc'ed array holding the elements; the caller frees it.
//   n   - receives the number of elements.
//   num - sequence number shown in the prompts.
//
// On malformed input, a negative count, or allocation failure the function
// prints a message to stderr and terminates the program with EXIT_FAILURE,
// instead of continuing with an uninitialized count or partially filled array.
__host__ void read_sequence(int *&A, int &n, int num) {
        A = NULL;
        n = 0;

        printf("Enter number of elements in sequence %d\n", num);
        int count = 0;
        if (scanf("%d", &count) != 1 || count < 0) {
                fprintf(stderr, "Invalid number of elements for sequence %d\n", num);
                exit(EXIT_FAILURE);
        }

        // Allocate at least one element so a zero count never yields malloc(0).
        A = (int*) malloc((count > 0 ? count : 1) * sizeof(int));
        if (A == NULL) {
                fprintf(stderr, "Out of memory while reading sequence %d\n", num);
                exit(EXIT_FAILURE);
        }
        n = count;

        printf("Enter %d elements of sequence %d\n", n, num);
        for (int i = 0; i < n; i++) {
                if (scanf("%d", A + i) != 1) {
                        fprintf(stderr, "Invalid element %d in sequence %d\n", i, num);
                        free(A);
                        A = NULL;
                        n = 0;
                        exit(EXIT_FAILURE);
                }
        }
}

// Entry point: lcs <number_of_blocks> <threads_in_block>
//
// Reads two integer sequences from stdin, builds the LCS table on the GPU one
// anti-diagonal per kernel launch, then prints the LCS length and sequence.
//
// Table layout: the host/device buffers hold (n + 1) x (m + 1) ints. The first
// row and first column are zero padding; L / d_L point at the cell one row and
// one column in, so Find_L_entry can read row -1 and column -1 relative to them.
//
// Define LCS_DEBUG_DUMP at compile time to copy the table back, print it, and
// pause after every diagonal (debugging aid only; it is off by default).
//
// Returns 0 on success, 1 on a usage error, allocation failure or CUDA error.
int main ( int argc, char **argv ) {
        if (argc < 3) {
                fprintf(stderr, "Usage: %s <number_of_blocks> <threads_in_block>\n",
                        argc > 0 ? argv[0] : "lcs");
                return 1;
        }
        int number_of_blocks = atoi(argv[1]), threads_in_block = atoi(argv[2]);
        if (number_of_blocks <= 0 || threads_in_block <= 0) {
                fprintf(stderr, "number_of_blocks and threads_in_block must be positive\n");
                return 1;
        }

        int n = 0, m = 0;
        int *A = NULL, *B = NULL;
        read_sequence(A, n, 1);
        read_sequence(B, m, 2);

        // An empty sequence has an empty LCS; skip the GPU entirely because the
        // padded-table offsets and copy sizes below assume n, m >= 1.
        if (n <= 0 || m <= 0) {
                printf("Length: 0\nSequence: \n");
                free(A); free(B);
                return 0;
        }

        // Make sure the launch provides at least one thread per column, so the
        // table is complete even if the kernel maps exactly one column per thread.
        const int blocks_needed = (m + threads_in_block - 1) / threads_in_block;
        if (number_of_blocks < blocks_needed)
                number_of_blocks = blocks_needed;

        // Zero-initialized padded table on the host.
        const size_t cell_count = (size_t)(n + 1) * (size_t)(m + 1) * CELL_SZ;
        const size_t table_bytes = cell_count * SZ_INT;
        int *big_L = (int*) calloc(cell_count, SZ_INT);
        if (big_L == NULL) {
                fprintf(stderr, "Out of memory for the LCS table\n");
                free(A); free(B);
                return 1;
        }
        int *L = &big_L[(m + 2) * CELL_SZ];

        int *d_A = NULL, *d_B = NULL, *dev_L = NULL;

        // Each step runs only if all previous ones succeeded; the first failure
        // is kept in `status` and reported once below.
        cudaError_t status = cudaMalloc((void**)&d_A, n * SZ_INT);
        if (status == cudaSuccess)
                status = cudaMalloc((void**)&d_B, m * SZ_INT);
        if (status == cudaSuccess)
                status = cudaMalloc((void**)&dev_L, table_bytes);
        if (status == cudaSuccess)
                status = cudaMemcpy(d_A, A, n * SZ_INT, cudaMemcpyHostToDevice);
        if (status == cudaSuccess)
                status = cudaMemcpy(d_B, B, m * SZ_INT, cudaMemcpyHostToDevice);
        // Copy the WHOLE padded table, so the padding row and column the kernel
        // reads are zero on the device as well.
        if (status == cudaSuccess)
                status = cudaMemcpy(dev_L, big_L, table_bytes, cudaMemcpyHostToDevice);

        if (status == cudaSuccess) {
                int *d_L = dev_L + (m + 2) * CELL_SZ;
                const int diag_count = n + m - 1;

                // All launches go to the same (default) stream, so each diagonal
                // starts only after the previous one has finished.
                for (int diag = 0; status == cudaSuccess && diag < diag_count; diag++) {
                        Find_L_entry<<<number_of_blocks, threads_in_block>>>(d_L, d_A, n, d_B, m, diag);
                        status = cudaGetLastError();
#ifdef LCS_DEBUG_DUMP
                        if (status == cudaSuccess)
                                status = cudaMemcpy(big_L, dev_L, table_bytes, cudaMemcpyDeviceToHost);
                        if (status == cudaSuccess) {
                                for (int i = 0; i < n; i++)
                                        for (int j = 0; j < m; j++)
                                                printf("%d%c", L[INDEX(i,j,m)], j == m - 1 ? '\n' : ' ');
                                system("pause");
                        }
#endif
                }

                // Blocking copy: waits for the kernels and surfaces their errors.
                if (status == cudaSuccess)
                        status = cudaMemcpy(big_L, dev_L, table_bytes, cudaMemcpyDeviceToHost);
        }

        CUDA_CHECK_ERROR(status);
        if (status == cudaSuccess)
                output_sequence(L, A, n, B, m);

        // Free the pointers returned by cudaMalloc (dev_L, not the offset d_L).
        cudaFree(dev_L);
        cudaFree(d_A);
        cudaFree(d_B);

        free(A); free(B); free(big_L);

        return status == cudaSuccess ? 0 : 1;
}

The code doesn't run properly. After calling the function Find_L_entry, the array d_L doesn't change.
I'm compiling via cmd.
nvcc -g -G -arch=sm_21 -o lcs.exe lcs.cu
When I run it, I get a runtime error: "Cuda error: invalid device function", reported at line 94.

Answer 1

The runtime error you are receiving occurs because the runtime API can neither find nor create code that can run on your GPU.

The underlying reason is that you are compiling your code for an architecture (compute capability 2.1) that is incompatible with your GPU. You have stated that you have a GT310M, which is listed as a compute capability 1.2 device. The CUDA toolchain supports backward compatibility of code (i.e. code built for an older architecture will run on a newer device), but not the other way around.

You should build your code something like this:

nvcc -g -G -arch=sm_12 -o lcs.exe lcs.cu

CUDA function doesn't change data

Question

1 answer

solution1
2 ACCEPTED 2013-08-09 12:06:10

CUDA function doesn't change data

Question

1 answer

solution1 2 ACCEPTED 2013-08-09 12:06:10

solution1
2 ACCEPTED 2013-08-09 12:06:10