繁体   English   中英

Strassen矩阵乘法的内存管理

[英]Memory management for Strassen's matrix multiplication

作为任务的一部分,我试图找出Strassen矩阵乘法和朴素乘法算法的交叉点。 但同样,当矩阵变为256x256时,我无法继续。 有人可以建议我适当的内存管理技术,以便能够处理更大的输入。

代码在C中如下:

#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<time.h>

void strassenMul(double* X, double* Y, double* Z, int m);
void matMul(double* A, double* B, double* C, int n);
void matAdd(double* A, double* B, double* C, int m);
void matSub(double* A, double* B, double* C, int m);

int idx = 0;

int main()
{
    int N;
    int count = 0;
    int i, j;
    clock_t start, end;
    double elapsed;

    int total = 15;
    double tnaive[total];
    double tstrassen[total];
    printf("-------------------------------------------------------------------------\n\n");
    for (count = 0; count < total; count++)
    {
        N = pow(2, count);
        printf("Matrix size = %2d\t",N);
        double X[N][N], Y[N][N], Z[N][N], W[N][N];
        srand(time(NULL));
        for (i = 0; i < N; i++)
        {
            for (j = 0; j < N; j++)
            {
                X[i][j] = rand()/(RAND_MAX + 1.);
                Y[i][j] = rand()/(RAND_MAX + 1.);
            }
        }
        start = clock();
        matMul((double *)X, (double *)Y, (double *)W, N);
        end = clock();
        elapsed = ((double) (end - start))*100/ CLOCKS_PER_SEC;
        tnaive[count] = elapsed;
        printf("naive = %5.4f\t\t",tnaive[count]);

        start = clock();
        strassenMul((double *)X, (double *)Y, (double *)Z, N);
        end = clock();
        elapsed = ((double) (end - start))*100/ CLOCKS_PER_SEC;
        tstrassen[count] = elapsed;
        printf("strassen = %5.4f\n",tstrassen[count]);
    }
    printf("-------------------------------------------------------------------\n\n\n");

    while (tnaive[idx+1] <= tstrassen[idx+1] && idx < 14) idx++;

    printf("Optimum input size to switch from normal multiplication to Strassen's is above %d\n\n", idx);

    printf("Please enter the size of array as a power of 2\n");
    scanf("%d",&N);
    double A[N][N], B[N][N], C[N][N];
    srand(time(NULL));
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            A[i][j] = rand()/(RAND_MAX + 1.);
            B[i][j] = rand()/(RAND_MAX + 1.);
        }
    }

printf("------------------- Input Matrices A and B ---------------------------\n\n");
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
            printf("%5.4f  ",A[i][j]);
        printf("\n");
    }
    printf("\n");

    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
            printf("%5.4f  ",B[i][j]);
        printf("\n");
    }
    printf("\n------- Output matrix by Strassen's method after optimization -----------\n\n");

    strassenMul((double *)A, (double *)B, (double *)C, N);

    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
            printf("%5.4f  ",C[i][j]);
        printf("\n");
    }
    return(0);
}

void strassenMul(double *X, double *Y, double *Z, int m)
{
    if (m <= idx)
    {
        matMul((double *)X, (double *)Y, (double *)Z, m);
        return;
    }
    if (m == 1)
    {
        *Z = *X * *Y;
        return;
    }
    int row = 0, col = 0;
    int n = m/2;
    int i = 0, j = 0;
    double x11[n][n], x12[n][n], x21[n][n], x22[n][n];
    double y11[n][n], y12[n][n], y21[n][n], y22[n][n];
    double P1[n][n], P2[n][n], P3[n][n], P4[n][n], P5[n][n], P6[n][n], P7[n][n];
    double C11[n][n], C12[n][n], C21[n][n], C22[n][n];
    double S1[n][n], S2[n][n], S3[n][n], S4[n][n], S5[n][n], S6[n][n], S7[n][n];
    double S8[n][n], S9[n][n], S10[n][n], S11[n][n], S12[n][n], S13[n][n], S14[n][n];

    for (row = 0, i = 0; row < n; row++, i++)
    {
        for (col = 0, j = 0; col < n; col++, j++)
        {
            x11[i][j] = *((X+row*m)+col);
            y11[i][j] = *((Y+row*m)+col);
        }
        for (col = n, j = 0; col < m; col++, j++)
        {
            x12[i][j] = *((X+row*m)+col);
            y12[i][j] = *((Y+row*m)+col);
        }
    }

    for (row = n, i = 0; row < m; row++, i++)
    {
        for (col = 0, j = 0; col < n; col++, j++)
        {
            x21[i][j] = *((X+row*m)+col);
            y21[i][j] = *((Y+row*m)+col);
        }
        for (col = n, j = 0; col < m; col++, j++)
        {
            x22[i][j] = *((X+row*m)+col);
            y22[i][j] = *((Y+row*m)+col);
        }
    }

    // Calculating P1
    matAdd((double *)x11, (double *)x22, (double *)S1, n);
    matAdd((double *)y11, (double *)y22, (double *)S2, n);
    strassenMul((double *)S1, (double *)S2, (double *)P1, n);

    // Calculating P2
    matAdd((double *)x21, (double *)x22, (double *)S3, n);
    strassenMul((double *)S3, (double *)y11, (double *)P2, n);

    // Calculating P3
    matSub((double *)y12, (double *)y22, (double *)S4, n);
    strassenMul((double *)x11, (double *)S4, (double *)P3, n);

    // Calculating P4
    matSub((double *)y21, (double *)y11, (double *)S5, n);
    strassenMul((double *)x22, (double *)S5, (double *)P4, n);

    // Calculating P5
    matAdd((double *)x11, (double *)x12, (double *)S6, n);
    strassenMul((double *)S6, (double *)y22, (double *)P5, n);

    // Calculating P6
    matSub((double *)x21, (double *)x11, (double *)S7, n);
    matAdd((double *)y11, (double *)y12, (double *)S8, n);
    strassenMul((double *)S7, (double *)S8, (double *)P6, n);

    // Calculating P7
    matSub((double *)x12, (double *)x22, (double *)S9, n);
    matAdd((double *)y21, (double *)y22, (double *)S10, n);
    strassenMul((double *)S9, (double *)S10, (double *)P7, n);

    // Calculating C11
    matAdd((double *)P1, (double *)P4, (double *)S11, n);
    matSub((double *)S11, (double *)P5, (double *)S12, n);
    matAdd((double *)S12, (double *)P7, (double *)C11, n);

    // Calculating C12
    matAdd((double *)P3, (double *)P5, (double *)C12, n);

    // Calculating C21
    matAdd((double *)P2, (double *)P4, (double *)C21, n);

    // Calculating C22
    matAdd((double *)P1, (double *)P3, (double *)S13, n);
    matSub((double *)S13, (double *)P2, (double *)S14, n);
    matAdd((double *)S14, (double *)P6, (double *)C22, n);

    for (row = 0, i = 0; row < n; row++, i++)
    {
        for (col = 0, j = 0; col < n; col++, j++)
            *((Z+row*m)+col) = C11[i][j];
        for (col = n, j = 0; col < m; col++, j++)
            *((Z+row*m)+col) = C12[i][j];
    }
    for (row = n, i = 0; row < m; row++, i++)
    {
        for (col = 0, j = 0; col < n; col++, j++)
            *((Z+row*m)+col) = C21[i][j];
        for (col = n, j = 0; col < m; col++, j++)
            *((Z+row*m)+col) = C22[i][j];
    }
}

void matMul(double *A, double *B, double *C, int n)
{
    int i = 0, j = 0, k = 0, row = 0, col = 0;
    double sum;
    for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
        {
            sum = 0.0;
            for (k = 0; k < n; k++)
            {
                sum += *((A+i*n)+k) * *((B+k*n)+j);
            }
            *((C+i*n)+j) = sum;
        }
    }
}

void matAdd(double *A, double *B, double *C, int m)
{
    int row = 0, col = 0;
    for (row = 0; row < m; row++)
        for (col = 0; col < m; col++)
            *((C+row*m)+col) = *((A+row*m)+col) + *((B+row*m)+col);
}

void matSub(double *A, double *B, double *C, int m)
{
    int row = 0, col = 0;
    for (row = 0; row < m; row++)
        for (col = 0; col < m; col++)
            *((C+row*m)+col) = *((A+row*m)+col) - *((B+row*m)+col);
}

稍后添加如果我尝试使用malloc语句进行内存分配,则代码如下。 但问题是它在朴素矩阵乘法后停止,甚至没有进入Strassen方法N = 1。 它显示关闭程序的提示。

for (count = 0; count < total; count++)
{
    N = pow(2, count);
    printf("Matrix size = %2d\t",N);
    //double X[N][N], Y[N][N], Z[N][N], W[N][N];
    double **X, **Y, **Z, **W;
    X = malloc(N * sizeof(double*));
    if (X == NULL){
        perror("Failed malloc() in X");
        return 1;
    }
    Y = malloc(N * sizeof(double*));
            if (Y == NULL){
                perror("Failed malloc() in Y");
                return 1;
    }
    Z = malloc(N * sizeof(double*));
            if (Z == NULL){
                perror("Failed malloc() in Z");
                return 1;
    }
    W = malloc(N * sizeof(double*));
            if (W == NULL){
                perror("Failed malloc() in W");
                return 1;
    }
    for (j = 0; j < N; j++)
    {
        X[j] = malloc(N * sizeof(double*));
        if (X[j] == NULL){
            perror("Failed malloc() in X[j]");
            return 1;
        }
        Y[j] = malloc(N * sizeof(double*));
                    if (Y[j] == NULL){
                        perror("Failed malloc() in Y[j]");
                        return 1;
        }
        Z[j] = malloc(N * sizeof(double*));
                    if (Z[j] == NULL){
                        perror("Failed malloc() in Z[j]");
                        return 1;
        }
        W[j] = malloc(N * sizeof(double*));
                    if (W[j] == NULL){
                        perror("Failed malloc() in W[j]");
                        return 1;
        }
    }
    srand(time(NULL));
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            X[i][j] = rand()/(RAND_MAX + 1.);
            Y[i][j] = rand()/(RAND_MAX + 1.);
        }
    }
    start = clock();
    matMul((double *)X, (double *)Y, (double *)W, N);
    end = clock();
    elapsed = ((double) (end - start))*100/ CLOCKS_PER_SEC;
    tnaive[count] = elapsed;
    printf("naive = %5.4f\t\t",tnaive[count]);

    start = clock();
    strassenMul((double *)X, (double *)Y, (double *)Z, N);
    end = clock();
    elapsed = ((double) (end - start))*100/ CLOCKS_PER_SEC;
    tstrassen[count] = elapsed;
    for (j = 0; j < N; j++)
    {
        free(X[j]);
        free(Y[j]);
        free(Z[j]);
        free(W[j]);
    }

    free(X); free(Y); free(Z); free(W);

    printf("strassen = %5.4f\n",tstrassen[count]);
}

我重写了答案。 我以前逐行分配内存的答案将无效,因为OP在传递给函数时已将2-D数组转换为1-D数组。 这是我对代码的重写,并进行了一些简化,例如将所有矩阵数组保持为1维。

我不确定Strassen的方法到底是什么,尽管递归将矩阵维度减半。 所以我想知道在访问传递的数组时是否打算使用row*2col*2

我希望这些技术对你有用 - 即使它有效! 所有矩阵数组现在都在堆上。

#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<time.h>

#define total   4       //15

void strassenMul(double* X, double* Y, double* Z, int m);
void matMul(double* A, double* B, double* C, int n);
void matAdd(double* A, double* B, double* C, int m);
void matSub(double* A, double* B, double* C, int m);

enum array { x11, x12, x21, x22, y11, y12, y21, y22,
    P1, P2, P3, P4, P5, P6, P7, C11, C12, C21, C22,
    S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, arrs };

int idx = 0;

int main()
{
    int N;
    int count = 0;
    int i, j;
    clock_t start, end;
    double elapsed;

    double tnaive[total];
    double tstrassen[total];
    double *X, *Y, *Z, *W, *A, *B, *C;

    printf("-------------------------------------------------------------------------\n\n");
    for (count = 0; count < total; count++)
    {
        N = (int)pow(2, count);
        printf("Matrix size = %2d\t",N);
        X = malloc(N*N*sizeof(double));
        Y = malloc(N*N*sizeof(double));
        Z = malloc(N*N*sizeof(double));
        W = malloc(N*N*sizeof(double));
        if (X==NULL || Y==NULL || Z==NULL || W==NULL) {
            printf("Out of memory (1)\n");
            return 1;
        }
        srand((unsigned)time(NULL));
        for (i=0; i<N*N; i++)
        {
            X[i] = rand()/(RAND_MAX + 1.);
            Y[i] = rand()/(RAND_MAX + 1.);
        }
        start = clock();
        matMul(X, Y, W, N);
        end = clock();
        elapsed = ((double) (end - start))*100/ CLOCKS_PER_SEC;
        tnaive[count] = elapsed;
        printf("naive = %5.4f\t\t",tnaive[count]);

        start = clock();
        strassenMul(X, Y, Z, N);
        free(W); 
        free(Z); 
        free(Y); 
        free(X); 
        end = clock();
        elapsed = ((double) (end - start))*100/ CLOCKS_PER_SEC;
        tstrassen[count] = elapsed;
        printf("strassen = %5.4f\n",tstrassen[count]);
    }
    printf("-------------------------------------------------------------------\n\n\n");

    while (tnaive[idx+1] <= tstrassen[idx+1] && idx < 14) idx++;

    printf("Optimum input size to switch from normal multiplication to Strassen's is above %d\n\n", idx);

    printf("Please enter the size of array as a power of 2\n");
    scanf("%d",&N);
    A = malloc(N*N*sizeof(double));
    B = malloc(N*N*sizeof(double));
    C = malloc(N*N*sizeof(double));
    if (A==NULL || B==NULL || C==NULL) {
        printf("Out of memory (2)\n");
        return 1;
    }
    srand((unsigned)time(NULL));
    for (i=0; i<N*N; i++)
    {
        A[i] = rand()/(RAND_MAX + 1.);
        B[i] = rand()/(RAND_MAX + 1.);
    }

    printf("------------------- Input Matrices A and B ---------------------------\n\n");
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
            printf("%5.4f  ",A[i*N+j]);
        printf("\n");
    }
    printf("\n");

    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
            printf("%5.4f  ",B[i*N+j]);
        printf("\n");
    }
    printf("\n------- Output matrix by Strassen's method after optimization -----------\n\n");

    strassenMul(A, B, C, N);

    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
            printf("%5.4f  ",C[i*N+j]);
        printf("\n");
    }
    free(C); 
    free(B); 
    free(A); 
    return(0);
}

void strassenMul(double *X, double *Y, double *Z, int m)
{
    int row = 0, col = 0;
    int n = m/2;
    int i = 0, j = 0;
    double *arr[arrs];                      // each matrix mem ptr

    if (m <= idx)
    {
        matMul(X, Y, Z, m);
        return;
    }
    if (m == 1)
    {
        *Z = *X * *Y;
        return;
    }

    for (i=0; i<arrs; i++) {                // memory for arrays
        arr[i] = malloc(n*n*sizeof(double));
        if (arr[i] == NULL) {
            printf("Out of memory (1)\n");
            exit (1);                       // brutal
        }
    }

    for (row = 0, i = 0; row < n; row++, i++)
    {
        for (col = 0, j = 0; col < n; col++, j++)
        {
            arr[x11][i*n+j] = X[row*m+col];
            arr[y11][i*n+j] = Y[row*m+col];
        }
        for (col = n, j = 0; col < m; col++, j++)
        {
            arr[x12][i*n+j] = X[row*m+col];
            arr[y12][i*n+j] = Y[row*m+col];
        }
    }

    for (row = n, i = 0; row < m; row++, i++)
    {
        for (col = 0, j = 0; col < n; col++, j++)
        {
            arr[x21][i*n+j] = X[row*m+col];
            arr[y21][i*n+j] = Y[row*m+col];
        }
        for (col = n, j = 0; col < m; col++, j++)
        {
            arr[x22][i*n+j] = X[row*m+col];
            arr[y22][i*n+j] = Y[row*m+col];
        }
    }

    // Calculating P1
    matAdd(arr[x11], arr[x22], arr[S1], n);
    matAdd(arr[y11], arr[y22], arr[S2], n);
    strassenMul(arr[S1], arr[S2], arr[P1], n);

    // Calculating P2
    matAdd(arr[x21], arr[x22], arr[S3], n);
    strassenMul(arr[S3], arr[y11], arr[P2], n);

    // Calculating P3
    matSub(arr[y12], arr[y22], arr[S4], n);
    strassenMul(arr[x11], arr[S4], arr[P3], n);

    // Calculating P4
    matSub(arr[y21], arr[y11], arr[S5], n);
    strassenMul(arr[x22], arr[S5], arr[P4], n);

    // Calculating P5
    matAdd(arr[x11], arr[x12], arr[S6], n);
    strassenMul(arr[S6], arr[y22], arr[P5], n);

    // Calculating P6
    matSub(arr[x21], arr[x11], arr[S7], n);
    matAdd(arr[y11], arr[y12], arr[S8], n);
    strassenMul(arr[S7], arr[S8], arr[P6], n);

    // Calculating P7
    matSub(arr[x12], arr[x22], arr[S9], n);
    matAdd(arr[y21], arr[y22], arr[S10], n);
    strassenMul(arr[S9], arr[S10], arr[P7], n);

    // Calculating C11
    matAdd(arr[P1], arr[P4], arr[S11], n);
    matSub(arr[S11], arr[P5], arr[S12], n);
    matAdd(arr[S12], arr[P7], arr[C11], n);

    // Calculating C12
    matAdd(arr[P3], arr[P5], arr[C12], n);

    // Calculating C21
    matAdd(arr[P2], arr[P4], arr[C21], n);

    // Calculating C22
    matAdd(arr[P1], arr[P3], arr[S13], n);
    matSub(arr[S13], arr[P2], arr[S14], n);
    matAdd(arr[S14], arr[P6], arr[C22], n);

    for (row = 0, i = 0; row < n; row++, i++)
    {
        for (col = 0, j = 0; col < n; col++, j++)
            Z[row*m+col] = arr[C11][i*n+j];
        for (col = n, j = 0; col < m; col++, j++)
            Z[row*m+col] = arr[C12][i*n+j];
    }
    for (row = n, i = 0; row < m; row++, i++)
    {
        for (col = 0, j = 0; col < n; col++, j++)
            Z[row*m+col] = arr[C21][i*n+j];
        for (col = n, j = 0; col < m; col++, j++)
            Z[row*m+col] = arr[C22][i*n+j];
    }

    for (i=0; i<arrs; i++)
        free (arr[i]);
}

void matMul(double *A, double *B, double *C, int n)
{
    int i = 0, j = 0, k = 0, row = 0, col = 0;
    double sum;
    for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
        {
            sum = 0.0;
            for (k = 0; k < n; k++)
            {
                sum += A[i*n+k] * B[k*n+j];
            }
            C[i*n+j] = sum;
        }
    }
}

void matAdd(double *A, double *B, double *C, int m)
{
    int row = 0, col = 0;
    for (row = 0; row < m; row++)
        for (col = 0; col < m; col++)
            C[row*m+col] = A[row*m+col] + B[row*m+col];
}

void matSub(double *A, double *B, double *C, int m)
{
    int row = 0, col = 0;
    for (row = 0; row < m; row++)
        for (col = 0; col < m; col++)
            C[row*m+col] = A[row*m+col] - B[row*m+col];
}

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM