簡體   English   中英

使用SSE對任意大小的輸入矩陣和向量進行向量矩陣和矩陣矩陣乘法

[英]Vector-matrix & matrix-matrix multiplication using SSE for any size of input matrix and vector

我正在嘗試使用SSE Intrinsic進行矢量矩陣乘法和矩陣矩陣乘法,但是如果我嘗試除4的倍數以外的任何操作,都會收到一個錯誤消息“ Segmentation Fault”,但我無法弄清楚原因,它不為其他工作。 請提出更改建議,使其適用於任何大小的輸入。

以下是我的實現:

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <xmmintrin.h>
#include <time.h>
#include <omp.h>  

/*****************************************************
the following function generates a "size"-element vector
and a "size x size" matrix
 ****************************************************/
void matrix_vector_gen(int size, float *matrix, float *vector){
    int i;
    for (i = 0; i < size*size; i++){
        vector[i] = i*1.2f + 1;//((float)rand())/65535.0f;
        printf("%f \n ", vector[i]);
    }
    for (i = 0; i < size*size; i++){
        matrix[i] = i*1.3f + 1;//((float)rand())/5307.0f;
        printf("%f \n ", matrix[i]);
    }
}

/****************************************************
the following function calculate the below equation
   vector_out = vector_in x matrix_in
 ***************************************************/
void matrix_mult_sq(int size, float *vector_in,
               float *matrix_in, float *vector_out){
    int i, j, k;
    for (i = 0; i < size; i++)
    {
        for (j = 0; j < size; j++)
        {
            vector_out[size*i + j] = 0.0;
            for (k = 0; k < size; k++)
                vector_out[size*i + j] += vector_in[size*i + k] * matrix_in[size*k + j];
        }
    }
}

void matrix_mult_sse(int size, float *vector_in,
    float *matrix_in, float *vector_out){
    __m128 a_line, b_line, r_line;
    int i, j, k, l;
    for (k = 0; k < size; k++)
    {

        for (i = 0; i < size; i += 4){
            j = 0;
            b_line = _mm_load_ps(&matrix_in[i]); // b_line = vec4(matrix[i][0])
            a_line = _mm_set1_ps(vector_in[j + k*size]);      // a_line = vec4(vector_in[0])
            r_line = _mm_mul_ps(a_line, b_line); // r_line = a_line * b_line
            for (j = 1; j < size; j++) {
                b_line = _mm_load_ps(&matrix_in[j*size + i]); // a_line = vec4(column(a, j))
                a_line = _mm_set1_ps(vector_in[j + k*size]);  // b_line = vec4(b[i][j])
                // r_line += a_line * b_line
                r_line = _mm_add_ps(_mm_mul_ps(a_line, b_line), r_line);
            }
            _mm_store_ps(&vector_out[i + k*size], r_line);     // r[i] = r_line
        }
    }
    for (l=0; l < size*size; l++)
    {
        printf("%f \n", vector_out[l]);
    }
}

int main(int argc, char *argv[]){
  if(argc < 2){
    printf("Usage: %s matrix/vector_size\n", argv[0]);
    return 0;
  }

  int size = atoi(argv[1]);
  if(size%4 != 0){
    printf("This version implements for ""size = 4*n"" only\n");
    return 0;
  }

  float *vector = (float *)memalign(sizeof(float)*4, sizeof(float)*size);//(float *)malloc(sizeof(float)*size);
  if(vector==NULL){
    printf("can't allocate the required memory for vector\n");
    return 0;
  }

  float *matrix = (float *)memalign(sizeof(float)*4, sizeof(float)*size*size);
  if(matrix==NULL){
    printf("can't allocate the required memory for matrix\n");
    free(vector);
    return 0;
  }

  float *result_sq = (float *)memalign(sizeof(float)*4, sizeof(float)*size);
  if(result_sq==NULL){
    printf("can't allocate the required memory for result_sq\n");
    free(vector);
    free(matrix);
    return 0;
  }

  float *result_pl = (float *)memalign(sizeof(float)*4, sizeof(float)*size);
  if(result_pl==NULL){
    printf("can't allocate the required memory for result_pl\n");
    free(vector);
    free(matrix);
    free(result_sq);
    return 0;
  }

  matrix_vector_gen(size, matrix, vector);

  double time_sq;
  double time_sse;

  time_sq = omp_get_wtime();
  matrix_mult_sq(size, vector, matrix, result_sq);
  time_sq = omp_get_wtime() - time_sq;

  time_sse = omp_get_wtime();
  matrix_mult_sse(size, vector, matrix, result_pl);
  time_sse = omp_get_wtime() - time_sse;

  printf("SEQUENTIAL EXECUTION: %f (sec)\n",time_sq);
  printf("PARALLEL EXECUTION: %f (sec)\n", time_sse);

  //check
  /*int i;
  for(i=0; i<size; i++)
    if((int)result_sq[i] != (int)result_pl[i]){
      printf("wrong at position %d\n", i);
      free(vector);
      free(matrix);
      free(result_sq);
      free(result_pl);
      return 0;
    }*/

  free(vector);
  free(matrix);
  free(result_sq);
  free(result_pl);
  return 1;
}

看來您是用mm_load_ps和mm_store_ps專門加載和存儲的,它們在一條指令中加載和存儲4個浮點數。

由於您的容器(矩陣和向量)的大小不一定是4個浮點數(16個字節)的倍數,因此這是不正確的。

memalign確保指針對齊(此處為16個字節),但不會在末尾保留填充,以使分配的塊大小為16個字節的倍數。

例如,當存儲5維向量時,該向量在內存中僅分配了20個字節,但是您寫入了32個字節(兩次mm_store_ps操作)

此外,這似乎是不正確的:

_mm_store_ps(&vector_out [i + k * size],r_line);

如果我正確的話,您想在這里存儲一個浮點數。 沒有四個包裝好的花車。

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM