使用SSE对任意大小的输入矩阵和向量进行向量矩阵和矩阵矩阵乘法

Question

I am trying to do Vector-matrix multiplication as well as matrix-matrix multiplication using SSE Intrinsic but I get an error saying "Segmentation Fault", if I try to do for anything except multiples of 4. not able to figure out why, it don't work for anything else.? 我正在尝试使用SSE Intrinsic进行矢量矩阵乘法和矩阵矩阵乘法，但是如果我尝试除4的倍数以外的任何操作，都会收到一个错误消息“ Segmentation Fault”，但我无法弄清楚原因，它不为其他工作。 Please suggest changes so that it works for anysize of input.? 请提出更改建议，使其适用于任何大小的输入。

Following is my implementation: 以下是我的实现：

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <xmmintrin.h>
#include <time.h>
#include <omp.h>  

/*****************************************************
the following function generates a "size"-element vector
and a "size x size" matrix
 ****************************************************/
void matrix_vector_gen(int size, float *matrix, float *vector){
    int i;
    for (i = 0; i < size*size; i++){
        vector[i] = i*1.2f + 1;//((float)rand())/65535.0f;
        printf("%f \n ", vector[i]);
    }
    for (i = 0; i < size*size; i++){
        matrix[i] = i*1.3f + 1;//((float)rand())/5307.0f;
        printf("%f \n ", matrix[i]);
    }
}

/****************************************************
the following function calculate the below equation
   vector_out = vector_in x matrix_in
 ***************************************************/
void matrix_mult_sq(int size, float *vector_in,
               float *matrix_in, float *vector_out){
    int i, j, k;
    for (i = 0; i < size; i++)
    {
        for (j = 0; j < size; j++)
        {
            vector_out[size*i + j] = 0.0;
            for (k = 0; k < size; k++)
                vector_out[size*i + j] += vector_in[size*i + k] * matrix_in[size*k + j];
        }
    }
}

void matrix_mult_sse(int size, float *vector_in,
    float *matrix_in, float *vector_out){
    __m128 a_line, b_line, r_line;
    int i, j, k, l;
    for (k = 0; k < size; k++)
    {

        for (i = 0; i < size; i += 4){
            j = 0;
            b_line = _mm_load_ps(&matrix_in[i]); // b_line = vec4(matrix[i][0])
            a_line = _mm_set1_ps(vector_in[j + k*size]);      // a_line = vec4(vector_in[0])
            r_line = _mm_mul_ps(a_line, b_line); // r_line = a_line * b_line
            for (j = 1; j < size; j++) {
                b_line = _mm_load_ps(&matrix_in[j*size + i]); // a_line = vec4(column(a, j))
                a_line = _mm_set1_ps(vector_in[j + k*size]);  // b_line = vec4(b[i][j])
                // r_line += a_line * b_line
                r_line = _mm_add_ps(_mm_mul_ps(a_line, b_line), r_line);
            }
            _mm_store_ps(&vector_out[i + k*size], r_line);     // r[i] = r_line
        }
    }
    for (l=0; l < size*size; l++)
    {
        printf("%f \n", vector_out[l]);
    }
}

int main(int argc, char *argv[]){
  if(argc < 2){
    printf("Usage: %s matrix/vector_size\n", argv[0]);
    return 0;
  }

  int size = atoi(argv[1]);
  if(size%4 != 0){
    printf("This version implements for ""size = 4*n"" only\n");
    return 0;
  }

  float *vector = (float *)memalign(sizeof(float)*4, sizeof(float)*size);//(float *)malloc(sizeof(float)*size);
  if(vector==NULL){
    printf("can't allocate the required memory for vector\n");
    return 0;
  }

  float *matrix = (float *)memalign(sizeof(float)*4, sizeof(float)*size*size);
  if(matrix==NULL){
    printf("can't allocate the required memory for matrix\n");
    free(vector);
    return 0;
  }

  float *result_sq = (float *)memalign(sizeof(float)*4, sizeof(float)*size);
  if(result_sq==NULL){
    printf("can't allocate the required memory for result_sq\n");
    free(vector);
    free(matrix);
    return 0;
  }

  float *result_pl = (float *)memalign(sizeof(float)*4, sizeof(float)*size);
  if(result_pl==NULL){
    printf("can't allocate the required memory for result_pl\n");
    free(vector);
    free(matrix);
    free(result_sq);
    return 0;
  }

  matrix_vector_gen(size, matrix, vector);

  double time_sq;
  double time_sse;

  time_sq = omp_get_wtime();
  matrix_mult_sq(size, vector, matrix, result_sq);
  time_sq = omp_get_wtime() - time_sq;

  time_sse = omp_get_wtime();
  matrix_mult_sse(size, vector, matrix, result_pl);
  time_sse = omp_get_wtime() - time_sse;

  printf("SEQUENTIAL EXECUTION: %f (sec)\n",time_sq);
  printf("PARALLEL EXECUTION: %f (sec)\n", time_sse);

  //check
  /*int i;
  for(i=0; i<size; i++)
    if((int)result_sq[i] != (int)result_pl[i]){
      printf("wrong at position %d\n", i);
      free(vector);
      free(matrix);
      free(result_sq);
      free(result_pl);
      return 0;
    }*/

  free(vector);
  free(matrix);
  free(result_sq);
  free(result_pl);
  return 1;
}

Answer 1

It seems you are loading and storing exclusively with mm_load_ps and mm_store_ps, which load and store 4 floats in a single instruction. 看来您是用mm_load_ps和mm_store_ps专门加载和存储的，它们在一条指令中加载和存储4个浮点数。

Since your containers (matrixes and vectors) do not have necessarily a size which is a multiple of 4 floats (16 bytes) this is incorrect. 由于您的容器（矩阵和向量）的大小不一定是4个浮点数（16个字节）的倍数，因此这是不正确的。

memalign ensures that the pointer is aligned (here on 16 bytes) but does not reserve padding at the end so that the allocated block size is a multiple of 16 bytes. memalign确保指针对齐（此处为16个字节），但不会在末尾保留填充，以使分配的块大小为16个字节的倍数。

For instance, when storing a 5-dimension vector, the vector has only 20 bytes allocated in memory, but you write 32 bytes (two mm_store_ps operations) 例如，当存储5维向量时，该向量在内存中仅分配了20个字节，但是您写入了32个字节（两次mm_store_ps操作）

Additionally, it seems this is incorrect: 此外，这似乎是不正确的：

_mm_store_ps(&vector_out[i + k*size], r_line); _mm_store_ps（＆vector_out [i + k * size]，r_line）;

You want to store a single float here, if I'm right. 如果我正确的话，您想在这里存储一个浮点数。 Not four packed floats. 没有四个包装好的花车。

使用SSE对任意大小的输入矩阵和向量进行向量矩阵和矩阵矩阵乘法

问题描述

1 个解决方案

解决方案1
3 已采纳 2014-05-08 22:06:07

使用SSE对任意大小的输入矩阵和向量进行向量矩阵和矩阵矩阵乘法

问题描述

1 个解决方案

解决方案1 3 已采纳 2014-05-08 22:06:07

解决方案1
3 已采纳 2014-05-08 22:06:07