繁体   English   中英

比较 C 中不同大小的矩阵乘法执行时间

[英]Compare Matrix Multiplication Execution Time in Different Sizes in C

我需要计算和比较 C 编程语言中 3 种不同大小(100 * 100、1000 * 1000 和 10000 * 10000)的 2 个矩阵相乘的执行时间。 我写了以下简单的代码来做 1000 * 1000 并且我得到了执行时间

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Times the multiplication of two 1000 x 1000 matrices of doubles and prints
// the processor time, as measured by clock(), spent in the multiplication.
int main()
{
    int r1 = 1000, c1 = 1000, r2 = 1000, c2 = 1000, i, j, k;

    // Dynamic allocation: each pointer refers to one whole heap-allocated
    // 2-D array whose dimensions are taken from r1/c1/r2/c2 at this point.
    // NOTE(review): the malloc results are used without a NULL check.
    double(*a)[r1][c1] = malloc(sizeof *a);
    double(*b)[r2][c2] = malloc(sizeof *b);
    double(*result)[r1][c2] = malloc(sizeof *result);

    // Storing elements of first matrix.
    for (i = 0; i < r1; ++i)
    {
        for (j = 0; j < c1; ++j)
        {
            // NOTE(review): rand() and RAND_MAX are both int, so this is an
            // integer division: the stored value is 0 unless rand() returns
            // exactly RAND_MAX (in which case it is 1).
            (*a)[i][j] = rand() / RAND_MAX;
        }
    }

    // Storing elements of second matrix.
    for (i = 0; i < r2; ++i)
    {
        for (j = 0; j < c2; ++j)
        {
            // NOTE(review): same integer division as for the first matrix.
            (*b)[i][j] = rand() / RAND_MAX;
        }
    }

    // Initializing all elements of result matrix to 0
    // (required because the multiplication below accumulates with +=).
    for (i = 0; i < r1; ++i)
    {
        for (j = 0; j < c2; ++j)
        {
            (*result)[i][j] = 0;
        }
    }

    // Only the triple loop below is timed; allocation and initialization
    // above are outside the measured interval.
    clock_t begin1 = clock();
    // Multiplying matrices a and b and
    // storing result in result matrix
    for (i = 0; i < r1; ++i)
        for (j = 0; j < c2; ++j)
            for (k = 0; k < c1; ++k)
            {
                (*result)[i][j] += (*a)[i][k] * (*b)[k][j];
            }

    clock_t end1 = clock();
    // Convert clock ticks to seconds.
    double time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
    printf("\n function took %f seconds to execute \n", time_taken);
    // NOTE(review): a, b and result are not freed before returning.
    return 0;
}

现在我想对另外两种尺寸重复这部分,并在我的程序结束时通过一次运行得到这样的结果:

the execution time for 100 * 100 is 1 second
the execution time for 1000 * 1000 is 2 seconds
the execution time for 10000 * 10000 is 3 seconds

最好的解决方案是什么? 当我在 1000 * 1000 之后对 10000 * 10000 重复这部分代码时,程序出现了分段错误(segmentation fault)。

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Runs the timed matrix multiplication twice in one process: first for
// 1000 x 1000 matrices, then (after freeing them) for 10000 x 10000.
// NOTE(review): the second run reuses the freed pointers; see the note
// placed after the first round of free() calls.
int main()
{
    int r1 = 1000, c1 = 1000, r2 = 1000, c2 = 1000, i, j, k;

    // Dynamic allocation: each pointer refers to one whole heap-allocated
    // 2-D array whose dimensions are taken from r1/c1/r2/c2 at this point.
    // NOTE(review): the malloc results are used without a NULL check.
    double(*a)[r1][c1] = malloc(sizeof *a);
    double(*b)[r2][c2] = malloc(sizeof *b);
    double(*result)[r1][c2] = malloc(sizeof *result);

    // Storing elements of first matrix.
    for (i = 0; i < r1; ++i)
    {
        for (j = 0; j < c1; ++j)
        {
            // NOTE(review): int / int, so the value is 0 unless rand()
            // returns exactly RAND_MAX.
            (*a)[i][j] = rand() / RAND_MAX;
        }
    }

    // Storing elements of second matrix.
    for (i = 0; i < r2; ++i)
    {
        for (j = 0; j < c2; ++j)
        {
            (*b)[i][j] = rand() / RAND_MAX;
        }
    }

    // Initializing all elements of result matrix to 0
    // (required because the multiplication below accumulates with +=).
    for (i = 0; i < r1; ++i)
    {
        for (j = 0; j < c2; ++j)
        {
            (*result)[i][j] = 0;
        }
    }

    // Only the triple loop is inside the measured interval.
    clock_t begin1 = clock();
    // Multiplying matrices a and b and
    // storing result in result matrix
    for (i = 0; i < r1; ++i)
        for (j = 0; j < c2; ++j)
            for (k = 0; k < c1; ++k)
            {
                (*result)[i][j] += (*a)[i][k] * (*b)[k][j];
            }

    clock_t end1 = clock();
    // Convert clock ticks to seconds.
    double time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
    printf("\n \nfunction took %f seconds to execute \n", 
           time_taken);
    free(a);
    free(b);
    free(result);

    // NOTE(review): assigning new values to r1/c1/r2/c2 does not resize
    // anything. The array types of *a, *b and *result were fixed when the
    // pointers were declared, and all three buffers were released by the
    // free() calls just above. Every access through a, b or result from here
    // on is a use-after-free (undefined behavior), and the free() calls at
    // the end of the function release the same pointers a second time.
    r1 = 10000, c1 = 10000, r2 = 10000, c2 = 10000;
printf("\n run second one for %d \n",r1);
    // Storing elements of first matrix.
    for (i = 0; i < r1; ++i)
    {
        for (j = 0; j < c1; ++j)
        {
            (*a)[i][j] = rand() / RAND_MAX;
        }
    }

    // Storing elements of second matrix.
    for (i = 0; i < r2; ++i)
    {
        for (j = 0; j < c2; ++j)
        {
            (*b)[i][j] = rand() / RAND_MAX;
        }
    }

    // Initializing all elements of result matrix to 0
    for (i = 0; i < r1; ++i)
    {
        for (j = 0; j < c2; ++j)
        {
            (*result)[i][j] = 0;
        }
    }

    // Second timed interval; begin1/end1/time_taken are reused.
    begin1 = clock();
    // Multiplying matrices a and b and
    // storing result in result matrix
    for (i = 0; i < r1; ++i)
        for (j = 0; j < c2; ++j)
            for (k = 0; k < c1; ++k)
            {
                (*result)[i][j] += (*a)[i][k] * (*b)[k][j];
            }

    end1 = clock();
    time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
    printf("\n second function took %f seconds to execute \n", 
           time_taken);
    // NOTE(review): second free() of pointers already freed above.
    free(a);
    free(b);
    free(result);

    return 0;
}

您的程序的简化版本:

...
// Elided sketch of the failing program: only the allocation, the free()
// calls and the first access after them are shown ("..." marks omitted code).
int main()
{

    int r1 = 1000, c1 = 1000, r2 = 1000, c2 = 1000, i, j, k;

    // Dynamic allocation.
    // The array dimensions of *a, *b and *result are taken from the values
    // r1/c1/r2/c2 hold at these declarations.

    double(*a)[r1][c1] = malloc(sizeof *a);
    double(*b)[r2][c2] = malloc(sizeof *b);
    double(*result)[r1][c2] = malloc(sizeof *result);

    ...

    free(a);
    free(b);
    free(result);

    // The sizes are changed, but a still points to the freed 1000 x 1000
    // buffer and no new memory is allocated.
    r1 = 10000, c1 = 10000, r2 = 10000, c2 = 10000;
    for (i = 0; i < r1; ++i)
        for (j = 0; j < c1; ++j)
            (*a)[i][j] = rand() /RAND_MAX; // KABOOM ! write through a freed pointer
...
}

关于 VLA(变长数组)的一点简短但重要的说明:“variable-length array”中的“variable”指的是数组的大小存储在一个变量中,而不是说大小是可变的。 这个变量是隐藏的,只能通过 sizeof 运算符读取。

数组的大小取决于它的类型,而不是它的值。 因此,无论对象是动态的还是自动的,VLA 类型(和对象)的尺寸都不能改变。

该行:

double(*a)[r1][c1] = malloc(sizeof *a);

它解释为:

typedef double __hidden_type[r1][c1];
__hidden_type* a = malloc(sizeof *a);

... changes of r1 or c1 do not affect sizeof(__hidden_type)

定义类型时,大小将绑定到类型。 之后,类型是不可变的。

因此更改r1不会改变*a的大小。 您需要创建一个新a (或更确切地说是它的类型)并为这个新*a分配内存。

我建议将整个测试移至一个以 r1、c1、r2、c2 作为参数的函数中。 这样数组就成为该函数的局部对象。

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/*
 * Benchmarks the naive multiplication of an r1 x c1 matrix by an r2 x c2
 * matrix of doubles filled with pseudo-random values in [0, 1], and prints
 * the processor time (as measured by clock()) spent in the multiplication.
 *
 * Parameters:
 *   r1, c1 - rows and columns of the first matrix
 *   r2, c2 - rows and columns of the second matrix (r2 must equal c1)
 *
 * If a dimension is not positive, the inner dimensions differ, or memory
 * cannot be allocated, a message is written to stderr and the function
 * returns without running the benchmark.
 */
void bench(int r1, int c1, int r2, int c2) {
    int i, j, k;

    // The product is only defined when the inner dimensions agree; with
    // c1 > r2 the multiplication loop would read past the end of b.
    if (r1 <= 0 || c1 <= 0 || r2 <= 0 || c2 <= 0 || c1 != r2) {
        fprintf(stderr, "bench: cannot multiply %d x %d by %d x %d\n",
                r1, c1, r2, c2);
        return;
    }

    // Dynamic allocation: each pointer refers to one whole 2-D array whose
    // type (and therefore size) is fixed here from the parameters.
    double(*a)[r1][c1] = malloc(sizeof *a);
    double(*b)[r2][c2] = malloc(sizeof *b);
    double(*result)[r1][c2] = malloc(sizeof *result);
    if (a == NULL || b == NULL || result == NULL) {
        fprintf(stderr, "bench: out of memory for %d x %d by %d x %d\n",
                r1, c1, r2, c2);
        free(a);        // free(NULL) is a no-op
        free(b);
        free(result);
        return;
    }

    // Storing elements of first matrix.
    for (i = 0; i < r1; ++i)
    {
        for (j = 0; j < c1; ++j)
        {
            // Divide in double: rand() / RAND_MAX is an integer division
            // and would store 0 for almost every element.
            (*a)[i][j] = rand() / (double)RAND_MAX;
        }
    }

    // Storing elements of second matrix.
    for (i = 0; i < r2; ++i)
    {
        for (j = 0; j < c2; ++j)
        {
            (*b)[i][j] = rand() / (double)RAND_MAX;
        }
    }

    // Initializing all elements of result matrix to 0
    // (the multiplication accumulates into it with +=).
    for (i = 0; i < r1; ++i)
    {
        for (j = 0; j < c2; ++j)
        {
            (*result)[i][j] = 0;
        }
    }

    // Only the triple loop is timed.
    clock_t begin1 = clock();
    // Multiplying matrices a and b and storing result in result matrix.
    // Textbook i-j-k order: the innermost loop walks b down a column.
    for (i = 0; i < r1; ++i)
        for (j = 0; j < c2; ++j)
            for (k = 0; k < c1; ++k)
            {
                (*result)[i][j] += (*a)[i][k] * (*b)[k][j];
            }

    clock_t end1 = clock();
    double time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
    printf("\n \nfunction took %f seconds to execute \n", time_taken);

    free(a);
    free(b);
    free(result);
}

// Program entry point: benchmarks square matrices of order 1000 and 2000.
int main(void)
{
    static const int orders[] = {1000, 2000};
    const int count = (int)(sizeof orders / sizeof orders[0]);

    for (int idx = 0; idx < count; ++idx) {
        const int n = orders[idx];
        bench(n, n, n, n);
    }
    return 0;
}

我已将大小从 10000 减少到 2000,以便在合理的时间内获得结果。 在我的机器上,我得到:

function took 1.966788 seconds to execute 
function took 37.370633 seconds to execute 

请注意,该函数对缓存非常不友好。

    for (i = 0; i < r1; ++i)
        for (j = 0; j < c2; ++j)
            for (k = 0; k < c1; ++k)
                (*result)[i][j] += (*a)[i][k] * (*b)[k][j];

在 k 的每次迭代中,访问 (*b)[k][j] 时都会出现缓存未命中。 尝试交换 j 和 k 循环:

    for (i = 0; i < r1; ++i)
      for (k = 0; k < c1; ++k)
        for (j = 0; j < c2; ++j)
                (*result)[i][j] += (*a)[i][k] * (*b)[k][j];

现在当 j 递增时,(*result)[i][j] 和 (*b)[k][j] 很可能已在缓存中。 在我的机器上,这个微不足道的改动带来了 10 倍的加速:

function took 0.319594 seconds to execute 
function took 3.829459 seconds to execute 

您的代码中有多个问题:

  • 您释放矩阵并执行新的基准测试,通过无效指针存储数据......这具有未定义的行为,在您的情况下是分段错误。
  • 分配代码特定于初始矩阵大小,您不能在main()函数中为不同大小重新分配矩阵。 您应该将代码移动到以矩阵大小为参数的单独函数中,并多次调用此函数。
  • 初始化值rand() / RAND_MAX几乎总是为零,因为整数算术用于此除法。 您应该使用(*a)[i][j] = rand() / (double)RAND_MAX;

这是修改后的版本(类似于tstanisl的):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * Times the multiplication of an r1 x c1 matrix by an r2 x c2 matrix of
 * doubles filled with pseudo-random values in [0, 1], using the
 * cache-friendly i-k-j loop order, and prints the processor time (as
 * measured by clock()) spent in the multiplication.
 *
 * Parameters:
 *   r1, c1 - rows and columns of the first matrix
 *   r2, c2 - rows and columns of the second matrix (r2 must equal c1)
 *
 * If a dimension is not positive, the inner dimensions differ, or memory
 * cannot be allocated, a message is written to stderr and the function
 * returns without running the benchmark.
 */
void test(int r1, int c1, int r2, int c2) {
    int i, j, k;

    // The product is only defined when the inner dimensions agree; with
    // c1 > r2 the multiplication loop would read past the end of b.
    if (r1 <= 0 || c1 <= 0 || r2 <= 0 || c2 <= 0 || c1 != r2) {
        fprintf(stderr, "test: cannot multiply M(%d,%d) x M(%d,%d)\n",
                r1, c1, r2, c2);
        return;
    }

    // Dynamic allocation: each pointer refers to one whole 2-D array whose
    // type (and therefore size) is fixed here from the parameters.
    double(*a)[r1][c1] = malloc(sizeof *a);
    double(*b)[r2][c2] = malloc(sizeof *b);
    double(*result)[r1][c2] = malloc(sizeof *result);
    if (a == NULL || b == NULL || result == NULL) {
        fprintf(stderr, "test: out of memory for M(%d,%d) x M(%d,%d)\n",
                r1, c1, r2, c2);
        free(a);        // free(NULL) is a no-op
        free(b);
        free(result);
        return;
    }

    // Storing elements of first matrix.
    for (i = 0; i < r1; ++i) {
        for (j = 0; j < c1; ++j) {
            (*a)[i][j] = rand() / (double)RAND_MAX;
        }
    }
    // Storing elements of second matrix.
    for (i = 0; i < r2; ++i) {
        for (j = 0; j < c2; ++j) {
            (*b)[i][j] = rand() / (double)RAND_MAX;
        }
    }
    // Initializing all elements of result matrix to 0
    // (the multiplication accumulates into it with +=).
    for (i = 0; i < r1; ++i) {
        for (j = 0; j < c2; ++j) {
            (*result)[i][j] = 0;
        }
    }

    // Only the triple loop is timed.
    clock_t begin1 = clock();
    // Multiplying matrices a and b and storing result in result matrix,
    // using cache friendly index order: the innermost loop walks one row of
    // b and one row of result sequentially.
    for (i = 0; i < r1; ++i) {
        for (k = 0; k < c1; ++k) {
            for (j = 0; j < c2; ++j) {
                (*result)[i][j] += (*a)[i][k] * (*b)[k][j];
            }
        }
    }
    clock_t end1 = clock();
    double time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
    printf("M(%d,%d) x M(%d,%d) took %f seconds to execute\n",
           r1, c1, r2, c2, time_taken);

    free(a);
    free(b);
    free(result);
}

// Program entry point: times square matrix products of increasing order.
int main(void) {
    static const int orders[] = {100, 1000, 2000, 3000, 4000};
    const int count = (int)(sizeof orders / sizeof orders[0]);
    int idx = 0;

    while (idx < count) {
        const int n = orders[idx];
        test(n, n, n, n);
        ++idx;
    }
    return 0;
}

输出:

M(100,100) x M(100,100) took 0.000347 seconds to execute
M(1000,1000) x M(1000,1000) took 0.616177 seconds to execute
M(2000,2000) x M(2000,2000) took 5.017987 seconds to execute
M(3000,3000) x M(3000,3000) took 17.703356 seconds to execute
M(4000,4000) x M(4000,4000) took 43.825951 seconds to execute

这种简单实现的时间复杂度为 O(N³),与上述时序一致。 给定足够的 RAM(2.4 GB),将两个 10000 行、10000 列的矩阵相乘将需要 10 多分钟。

在 3 秒内实现 2 个 10k 乘以 10k double矩阵的乘法需要专门的硬件和量身定制的软件,远远超出了这个答案中的简单方法。

现在我想对其他两种尺寸重复这部分,并在我的程序结束时通过一次运行得到这样的结果: 100 * 100 的执行时间是 1 秒 1000 * 1000 的执行时间是 2 秒 执行10000 * 10000 的时间是 3 秒

我简直不敢相信您能在 3 秒内将两个 10,000 x 10,000 的矩阵相乘。 您是在哪台电脑上做的这个实验? 使用该算法并且只用一个核心,这是不可能的。 很可能您在编译时开启了优化(使用默认标志 -O2),而整个算法已被编译器从输出中删除:因为您在计算后没有使用结果矩阵,所以没有必要在循环上浪费时间。 请先在不开启优化的情况下编译,或者在计算后打印结果矩阵的一个元素,这样编译器就无法去掉这段计算。 但请不要说您的算法能在 3 秒内将两个 10,000 行、10,000 列的矩阵相乘。

分配一个 10,000 x 10,000 的 double 矩阵需要大量内存。 这是 100,000,000 个 double 元素,单次 malloc 就超过 800Mb。 您的笔记本电脑很可能能够处理一次这样的分配……但不要做很多次这样的分配,因为您可能会超出 malloc(3)(或您的系统)所能提供的内存。

当您需要至少两个这样的分配时,甚至更多,正如您所说的要重复计算。

您是否尝试将您的问题扩展到 100,000 x 100,000 个矩阵?

当您重复这些测试时,不能保证 malloc() 维护的堆中没有碎片,因此当您为每个矩阵请求 1 GB 的连续内存时,malloc(3) 可能会在最后一次分配时耗尽内存。 我建议您在不同的程序中(分别)进行这三个测试,并且在运行涉及 200,000,000 个数字的矩阵乘法时不要启动任何其他程序(例如浏览器或您喜欢的桌面环境)。

无论如何,如果你不能很好地控制你的执行环境,你的程序可能会开始交换,只是破坏你试图做的所有效率测量。

另一件事是您可能会遇到进程限制(如果您的管理员为您的进程建立了最大核心内存限制)并且您没有检查malloc(3)的结果是否正确分配。

笔记:

我无法在我的机器(具有 6Gb RAM 的 Pentium Core duo)上将两个 10,000 x 10,000 的矩阵相乘,因为程序开始交换并且变得异常缓慢。 顺便说一句,请注意不要在开启优化的情况下编译您的程序(即使是 -O1 也会使矩阵乘法从代码中被完全消除:因为乘积没有被使用,编译器的优化会把整个算法从输出代码中删除)。 我会让它继续运行,如果得到结果,我会更新我的答案。

(编辑)

$ matrix_mult
function took 9364.303443 seconds to execute
$ 

在我的系统中执行需要 2h 36m 4.303443s。 这更接近问题的实际复杂度(而您给出的估计值几乎是线性的)。 如果您想真正计算出乘积,您需要在不优化的情况下编译它,或者至少打印结果矩阵的一个元素。

下面是我正在测试的代码,如果你想测试它的话(我对其进行了一些修改以使其更具可读性,使用了更好的时间戳,并省去了结果矩阵的初始化,因为它可以在计算过程中完成,请阅读代码中的注释):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N 10000

// Times the naive multiplication of two N x N matrices of doubles filled
// with pseudo-random values in [0, 1] and prints the CPU time consumed by
// the process during the multiplication (CLOCK_PROCESS_CPUTIME_ID).
// Exits with EXIT_FAILURE if memory or the clock is unavailable.
int main()
{
    // Dynamic allocation: each pointer addresses N rows of N doubles, so
    // the matrices can be indexed as A[row][col] without helper macros.
    double (*A)[N] = malloc(N * sizeof *A);
    double (*B)[N] = malloc(N * sizeof *B);
    double (*R)[N] = malloc(N * sizeof *R);
    if (A == NULL || B == NULL || R == NULL) {
        fprintf(stderr, "cannot allocate three %d x %d matrices\n", N, N);
        free(A);        // free(NULL) is a no-op
        free(B);
        free(R);
        exit(EXIT_FAILURE);
    }

    printf("Initializing both matrices\n");
    // Storing elements of first (and second) matrices in a single pass.
    for (int row = 0; row < N; ++row) {
        for (int col = 0; col < N; ++col) {
            A[row][col] = rand() / (double)RAND_MAX; // matrix A
            B[row][col] = rand() / (double)RAND_MAX; // matrix B
        }
    }

    // The result matrix needs no zero-initialization: every cell is
    // assigned exactly once from a local accumulator below.

    printf("Starting multiplication\n");
    struct timespec start_ts; // start timestamp (secs & nsecs).
    int res = clock_gettime(
            CLOCK_PROCESS_CPUTIME_ID,
            &start_ts); // cpu time only.
    if (res < 0) {
        perror("clock_gettime");
        exit(EXIT_FAILURE);
    }
    // Multiplying matrices A and B and
    // storing result in result matrix R
    for (int row = 0; row < N; ++row) {
        for (int col = 0; col < N; ++col) {
            // Accumulate in a local so the address of the destination cell
            // is not recomputed at every inner loop iteration.
            double aux = 0.0;
            for (int k = 0; k < N; ++k) {
                aux += A[row][k] * B[k][col];
            }
            R[row][col] = aux;
        }
    }
    struct timespec end_ts;
    res = clock_gettime(
            CLOCK_PROCESS_CPUTIME_ID,
            &end_ts);
    if (res < 0) {
        perror("clock_gettime");
        exit(EXIT_FAILURE);
    }

    // end - start, borrowing one second when the nanosecond field of the
    // end timestamp is smaller than that of the start timestamp.
    bool carry = start_ts.tv_nsec > end_ts.tv_nsec;
    struct timespec diff_time;
    diff_time.tv_sec  = end_ts.tv_sec  - start_ts.tv_sec;
    diff_time.tv_nsec = end_ts.tv_nsec - start_ts.tv_nsec;
    if (carry) {
        diff_time.tv_sec--;
        diff_time.tv_nsec += 1000000000;
    }
    // tv_sec is a time_t, which need not be long: cast to match %ld.
    // Nanoseconds are reduced to microseconds for the 6-digit fraction.
    printf("\n function took %ld.%06ld seconds to execute \n",
            (long)diff_time.tv_sec, diff_time.tv_nsec / 1000);

    // Read one element through a volatile object so the product is
    // observable; otherwise nothing uses R and an optimizing compiler may
    // discard the multiplication (best effort, output is unchanged).
    volatile double sink = R[0][0];
    (void)sink;

    free(A);
    free(B);
    free(R);
    return 0;
}

第二次编辑

我已经使用以下程序在您的程序的修改版本(但使用相同的算法)上测试了乘法时间:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define MAX 10000

/* Matrix orders to benchmark: the Fibonacci numbers from 89 up to 6765,
 * followed by 10000. */
int dim[] = {
    89,
    144,
    233,
    377,
    610,
    987,
    1597,
    2584,
    4181,
    6765,
    10000,
};
/* Number of entries in dim. */
size_t dim_cnt = sizeof(dim) / sizeof(dim[0]);

/* File-scope MAX x MAX matrices of doubles shared by every benchmark run:
 * A and B hold the operands and R receives the product. */
double A[MAX][MAX];
double B[MAX][MAX];
double R[MAX][MAX];

int main()
{

    for (int rep = 0; rep < dim_cnt; rep++) {

        size_t N = dim[rep];

        printf("It %d: Initializing both %zd x %zd matrices\n",
                rep, N, N);
        // Storing elements of first (and second) matrices.
        for (int row = 0; row < N; ++row) {
            for (int col = 0; col < N; ++col) {
                A[row][col] = rand() /(double)RAND_MAX; // matrix A
                B[row][col] = rand() /(double)RAND_MAX; // matrix B
            }
        }

        // Storing elements of second matrix. (done above)

        // Initializing all elements of result matrix to 0
        // (not needed, see below)

        printf("It %d: Starting multiplication\n", rep);
        struct timespec start_ts; // start timestamp (secs & nsecs).
        int res = clock_gettime(
                CLOCK_PROCESS_CPUTIME_ID,
                &start_ts); // cpu time only.
        if (res < 0) {
            perror("clock_gettime");
            exit(EXIT_FAILURE);
        }
        // Multiplying matrices a and b and
        // storing result in result matrix
        for (int row = 0; row < N; ++row) {
            for (int col = 0; col < N; ++col) {
                double aux = 0.0;
                for (int k = 0; k < N; ++k) {
                    aux += A[row][k] * B[k][col];
                }
                // why to involve calculating the address of the affected
                // cell at every inner loop iteration???
                R[row][col] = aux;
            }
        }
        struct timespec end_ts;
        res = clock_gettime(
                CLOCK_PROCESS_CPUTIME_ID,
                &end_ts);
        if (res < 0) {
            perror("clock_gettime");
            exit(EXIT_FAILURE);
        }

        bool carry = start_ts.tv_nsec > end_ts.tv_nsec;
        struct timespec diff_time;
        diff_time.tv_sec  = end_ts.tv_sec  - start_ts.tv_sec;
        diff_time.tv_nsec = end_ts.tv_nsec - start_ts.tv_nsec;
        if (carry) {
            diff_time.tv_sec--;
            diff_time.tv_nsec += 1000000000;
        }
        printf("It %d: R[0][0] = %g\n", rep, R[0][0]);
        printf("%7zd %ld.%06ld\n",
                N,
                diff_time.tv_sec,
                diff_time.tv_nsec / 1000);
    }
    return 0;
}

它使用按最大矩阵尺寸静态分配的内存,并使用其中的一个子集来表示较小的矩阵。 这里的执行时间远比您在问题中给出的值合理;我打印了结果矩阵的一个元素,以迫使优化器保留矩阵计算。 您得到的执行结果应该与此处显示的值成比例。

$ time matrix_mult
It 0: Initializing both 89 x 89 matrices
It 0: Starting multiplication
It 0: R[0][0] = 23.6756
     89 0.005026
It 1: Initializing both 144 x 144 matrices
It 1: Starting multiplication
It 1: R[0][0] = 40.2614
    144 0.019682
It 2: Initializing both 233 x 233 matrices
It 2: Starting multiplication
It 2: R[0][0] = 59.5599
    233 0.095213
It 3: Initializing both 377 x 377 matrices
It 3: Starting multiplication
It 3: R[0][0] = 93.4422
    377 0.392914
It 4: Initializing both 610 x 610 matrices
It 4: Starting multiplication
It 4: R[0][0] = 153.068
    610 1.671904
It 5: Initializing both 987 x 987 matrices
It 5: Starting multiplication
It 5: R[0][0] = 252.981
    987 8.816252
It 6: Initializing both 1597 x 1597 matrices
It 6: Starting multiplication
It 6: R[0][0] = 403.61
   1597 37.807920
It 7: Initializing both 2584 x 2584 matrices
It 7: Starting multiplication
It 7: R[0][0] = 629.521
   2584 157.371367
It 8: Initializing both 4181 x 4181 matrices
It 8: Starting multiplication
It 8: R[0][0] = 1036.47
   4181 667.084346
It 9: Initializing both 6765 x 6765 matrices
It 9: Starting multiplication
It 9: R[0][0] = 1653.59
   6765 2831.117818
It 10: Initializing both 10000 x 10000 matrices
It 10: Starting multiplication
It 10: R[0][0] = 2521.68
  10000 9211.738007

real    216m46,129s
user    215m16,041s
sys     0m4,899s
$ _

在我的系统中,程序显示以下大小

$ size matrix_mult
  text   data          bss          dec          hex   filename
  2882    528   2400000016   2400003426   0x8f0d2562   matrix_mult
$ _

具有 2.4Gb 的大bss段,对应于每个大约 800Mb 的三个变量。

最后一点:使用 VLA 并让您的程序动态分配将在整个程序生命周期中使用的东西不会帮助您使其更快或更慢。 是算法使程序更快或更慢。 但是我担心你还没有在 3 秒内计算出 10,000 x 10,000 的矩阵乘积。

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM