[英]Vector-matrix & matrix-matrix multiplication using SSE for any size of input matrix and vector
我正在嘗試使用SSE Intrinsic進行矢量矩陣乘法和矩陣矩陣乘法,但是如果我嘗試除4的倍數以外的任何操作,都會收到一個錯誤消息“ Segmentation Fault”,但我無法弄清楚原因,它不為其他工作。 請提出更改建議,使其適用於任何大小的輸入。
以下是我的實現:
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <xmmintrin.h>
#include <time.h>
#include <omp.h>
/*****************************************************
the following function generates a "size"-element vector
and a "size x size" matrix
****************************************************/
void matrix_vector_gen(int size, float *matrix, float *vector){
int i;
for (i = 0; i < size*size; i++){
vector[i] = i*1.2f + 1;//((float)rand())/65535.0f;
printf("%f \n ", vector[i]);
}
for (i = 0; i < size*size; i++){
matrix[i] = i*1.3f + 1;//((float)rand())/5307.0f;
printf("%f \n ", matrix[i]);
}
}
/****************************************************
the following function calculate the below equation
vector_out = vector_in x matrix_in
***************************************************/
void matrix_mult_sq(int size, float *vector_in,
float *matrix_in, float *vector_out){
int i, j, k;
for (i = 0; i < size; i++)
{
for (j = 0; j < size; j++)
{
vector_out[size*i + j] = 0.0;
for (k = 0; k < size; k++)
vector_out[size*i + j] += vector_in[size*i + k] * matrix_in[size*k + j];
}
}
}
void matrix_mult_sse(int size, float *vector_in,
float *matrix_in, float *vector_out){
__m128 a_line, b_line, r_line;
int i, j, k, l;
for (k = 0; k < size; k++)
{
for (i = 0; i < size; i += 4){
j = 0;
b_line = _mm_load_ps(&matrix_in[i]); // b_line = vec4(matrix[i][0])
a_line = _mm_set1_ps(vector_in[j + k*size]); // a_line = vec4(vector_in[0])
r_line = _mm_mul_ps(a_line, b_line); // r_line = a_line * b_line
for (j = 1; j < size; j++) {
b_line = _mm_load_ps(&matrix_in[j*size + i]); // a_line = vec4(column(a, j))
a_line = _mm_set1_ps(vector_in[j + k*size]); // b_line = vec4(b[i][j])
// r_line += a_line * b_line
r_line = _mm_add_ps(_mm_mul_ps(a_line, b_line), r_line);
}
_mm_store_ps(&vector_out[i + k*size], r_line); // r[i] = r_line
}
}
for (l=0; l < size*size; l++)
{
printf("%f \n", vector_out[l]);
}
}
int main(int argc, char *argv[]){
if(argc < 2){
printf("Usage: %s matrix/vector_size\n", argv[0]);
return 0;
}
int size = atoi(argv[1]);
if(size%4 != 0){
printf("This version implements for ""size = 4*n"" only\n");
return 0;
}
float *vector = (float *)memalign(sizeof(float)*4, sizeof(float)*size);//(float *)malloc(sizeof(float)*size);
if(vector==NULL){
printf("can't allocate the required memory for vector\n");
return 0;
}
float *matrix = (float *)memalign(sizeof(float)*4, sizeof(float)*size*size);
if(matrix==NULL){
printf("can't allocate the required memory for matrix\n");
free(vector);
return 0;
}
float *result_sq = (float *)memalign(sizeof(float)*4, sizeof(float)*size);
if(result_sq==NULL){
printf("can't allocate the required memory for result_sq\n");
free(vector);
free(matrix);
return 0;
}
float *result_pl = (float *)memalign(sizeof(float)*4, sizeof(float)*size);
if(result_pl==NULL){
printf("can't allocate the required memory for result_pl\n");
free(vector);
free(matrix);
free(result_sq);
return 0;
}
matrix_vector_gen(size, matrix, vector);
double time_sq;
double time_sse;
time_sq = omp_get_wtime();
matrix_mult_sq(size, vector, matrix, result_sq);
time_sq = omp_get_wtime() - time_sq;
time_sse = omp_get_wtime();
matrix_mult_sse(size, vector, matrix, result_pl);
time_sse = omp_get_wtime() - time_sse;
printf("SEQUENTIAL EXECUTION: %f (sec)\n",time_sq);
printf("PARALLEL EXECUTION: %f (sec)\n", time_sse);
//check
/*int i;
for(i=0; i<size; i++)
if((int)result_sq[i] != (int)result_pl[i]){
printf("wrong at position %d\n", i);
free(vector);
free(matrix);
free(result_sq);
free(result_pl);
return 0;
}*/
free(vector);
free(matrix);
free(result_sq);
free(result_pl);
return 1;
}
看來您是用mm_load_ps和mm_store_ps專門加載和存儲的,它們在一條指令中加載和存儲4個浮點數。
由於您的容器(矩陣和向量)的大小不一定是4個浮點數(16個字節)的倍數,因此這是不正確的。
memalign確保指針對齊(此處為16個字節),但不會在末尾保留填充,以使分配的塊大小為16個字節的倍數。
例如,當存儲5維向量時,該向量在內存中僅分配了20個字節,但是您寫入了32個字節(兩次mm_store_ps操作)
此外,這似乎是不正確的:
_mm_store_ps(&vector_out [i + k * size],r_line);
如果我正確的話,您想在這里存儲一個浮點數。 沒有四個包裝好的花車。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.