简体   繁体   中英

SSE memory access

I need to perform Gaussian elimination using SSE, and I am not sure how to access each 32-bit element of the 128-bit registers (each of which stores 4 elements). This is the original code (without SSE):

unsigned int i, j, k;

/* Copy the contents of the A matrix into the U matrix. Both are indexed as
   num_elements x num_elements arrays stored row by row. */
for (i = 0; i < num_elements; i++) {
    for (j = 0; j < num_elements; j++) {
        U[num_elements * i + j] = A[num_elements * i + j];
    }
}

/* Perform Gaussian elimination in place on the U matrix. */
for (k = 0; k < num_elements; k++) {
    /* The pivot U[k][k] is not modified by the division loop, so test it once
       per row instead of once per column. Testing it before the loop also
       catches a zero pivot in the last row, where the division loop has no
       iterations and the check would otherwise never run. */
    if (U[num_elements * k + k] == 0) {
        printf("Numerical instability detected. The principal diagonal element is zero. \n");
        return 0;
    }

    /* Division step: reduce the current row by dividing the entries to the
       right of the pivot by the pivot. */
    for (j = k + 1; j < num_elements; j++) {
        U[num_elements * k + j] = (float)(U[num_elements * k + j] / U[num_elements * k + k]);
    }

    U[num_elements * k + k] = 1;    /* Set the principal diagonal entry in U to be 1. */

    /* Elimination step: subtract a multiple of row k from every row below it. */
    for (i = k + 1; i < num_elements; i++) {
        for (j = k + 1; j < num_elements; j++) {
            U[num_elements * i + j] -= U[num_elements * i + k] * U[num_elements * k + j];
        }

        U[num_elements * i + k] = 0;    /* The entry below the pivot is now eliminated. */
    }
}

Okay, I'm getting a segmentation fault (core dumped) with this code. I'm new to SSE. Can someone help? Thanks.

 /*
  * SSE attempt at the scalar routine: copy A into U, then try to reduce U in
  * place, working on four floats per __m128 register. This is the code the
  * question reports as crashing; it is kept as posted.
  *
  * NOTE(review): defects visible in this fragment:
  *  - Every inner loop advances j by 4 and moves 4 floats with
  *    _mm_loadu_ps / _mm_storeu_ps without checking that 4 elements remain in
  *    the row, so the last iteration of a row can touch up to 3 floats past
  *    the row end (and, on the last row, past the end of the array). This is
  *    presumably the cause of the reported segmentation fault - confirm.
  *  - The reduction loops start at j = 4 * k + 1, whereas the scalar version
  *    starts at j = k + 1.
  *  - _mm_load_ss fills only the lowest lane and zeroes the other three, so
  *    _mm_div_ps by a_i divides the upper three lanes by zero. Likewise
  *    _mm_mul_ss multiplies only the lowest lane and passes the upper three
  *    lanes of its first operand through unchanged.
  *  - The results in b_i and c_i are never stored back into U, and
  *    _mm_set_ss(1) / _mm_set_ss(0) only change the registers a_i and d_i,
  *    not U[k][k] or U[i][k].
  *  - The printf string literal in the zero-pivot branch has no closing
  *    quote, and the branch does not return.
  */
 int i,j,k;
 __m128 a_i,b_i,c_i,d_i;

/* Copy A into U four floats at a time with unaligned loads and stores. */
for (i = 0; i < num_rows; i++)
{
for (j = 0; j < num_rows; j += 4)
{
    int index = num_rows * i + j;
   __m128 v = _mm_loadu_ps(&A[index]); // load 4 x floats
   _mm_storeu_ps(&U[index], v);         // store 4 x floats

}
}
/* Elimination pass over the rows of U; k is the current pivot row. */
for (k = 0; k < num_rows; k++){  

 /* Load the pivot U[k][k] into the lowest lane of a_i (upper lanes become 0). */
 a_i= _mm_load_ss(&U[num_rows*k+k]);         


    /* Intended division step for row k; see the review notes at the top. */
    for (j = (4*k + 1); j < num_rows; j+=4){
               b_i= _mm_loadu_ps(&U[num_rows*k+j]);// Reduce the currentrow. 

    if (U[num_rows*k+k] == 0){
    printf("Numerical instability detected.);

        }

        /* Division step. */
        b_i =    _mm_div_ps(b_i, a_i);
  }

    a_i = _mm_set_ss(1);           

    for (i = (k+1); i < num_rows; i++){
  d_i= _mm_load_ss(&U[num_rows*i+k]);
        for (j = (4*k+1); j < num_rows; j+=4){
           c_i= _mm_loadu_ps(&U[num_rows*i+j]); /* Elimnation step. */
        b_i= _mm_loadu_ps(&U[num_rows*k+j]);    
            c_i = _mm_sub_ps(c_i, _mm_mul_ss(b_i,d_i));
        }
       d_i= _mm_set_ss(0); 
    } 
  }

In order to get you started, your first loop should be more like this:

/* Copy A into U four floats at a time. _mm_loadu_ps takes a const float *
   and returns __m128; _mm_storeu_ps takes a float * and a __m128, so the
   float addresses are passed directly and no __m128i casts are involved.
   Each iteration moves 4 floats, so num_elements must be a multiple of 4. */
for (i = 0; i < num_elements; i++)
{
    for (j = 0; j < num_elements; j += 4)
    {
        int index = num_elements * i + j;
        __m128 v = _mm_loadu_ps(&A[index]); // load 4 x floats (no alignment required)
        _mm_storeu_ps(&U[index], v);        // store 4 x floats (no alignment required)
    }
}

This assumes that num_elements is a multiple of 4. It uses unaligned loads and stores, so neither A nor U needs to be 16-byte aligned.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM