矩阵反转使用线程较慢

Question

I wrote a function that computes the inverse of a matrix and then a multithreaded version of it, because I need to invert matrices larger than 2000 x 2000. Inverting a 1000x1000 matrix takes 2.5 seconds unthreaded (on an i5-4460, 4 cores, 2.9 GHz) but 7.25 seconds multithreaded. 我编写了一个求矩阵逆的函数，然后又写了一个多线程版本，因为我需要对大于 2000 x 2000 的矩阵求逆。对一个 1000x1000 的矩阵求逆，不使用线程的版本耗时 2.5 秒（在 i5-4460、4 核、2.9GHz 上），而多线程版本耗时 7.25 秒。

I placed the multithreading in the part where most of the time is spent. 我将多线程放在最耗时的部分。 What is wrong? 哪里出了问题？ Is it because vectors are used instead of 2-dimensional arrays? 是因为使用了 vector 而不是二维数组吗？

This is the minimal code to test both versions: 这是测试两个版本的最小代码：

#include <stdlib.h>
#include <time.h>

#include <chrono>
#include <cmath>
#include <cstdio>
#include <functional>
#include <iostream>
#include <thread>
#include <vector>
// Thread-count setting read by inverse_th().
constexpr int NUCLEOS = 8;

// Pick the clock type used by timer(): system_clock on Linux, high_resolution_clock elsewhere.
#ifndef __linux__
using t_clock = std::chrono::high_resolution_clock;
#pragma warning(disable:4996)
#else
#include <unistd.h>    // usleep()
using t_clock = std::chrono::system_clock;    // try to use high_resolution_clock on a new Linux x64 computer
#endif
using namespace std;


// Timestamps shared with timer(): start_time is the beginning of the interval being measured,
// stop_time its end. Both start out equal (default-constructed).
std::chrono::time_point<t_clock> start_time;
std::chrono::time_point<t_clock> stop_time = start_time;
char null_char = '\0';
/// Lap timer built on the global start_time / stop_time pair.
///
/// Records "now" in stop_time, optionally prints the time elapsed since start_time,
/// then restarts the interval by storing a fresh "now" in start_time.
///
/// @param title      Label printed in front of the measurement; when null nothing is printed
///                   and the call only restarts the interval. Taken as `const char*` so string
///                   literals can be passed (a plain `char*` cannot bind to a literal in C++11).
/// @param data_size  Number of items processed during the interval; printed as items per
///                   microsecond ("MOPs").
void timer(const char *title = nullptr, int data_size = 1)
{
    stop_time = t_clock::now();
    const double us = static_cast<double>(
        std::chrono::duration_cast<std::chrono::microseconds>(stop_time - start_time).count());
    if (title)
        std::printf("%s time = %7lgms = %7lg MOPs\n", title, us * 1e-3, static_cast<double>(data_size) / us);
    start_time = t_clock::now();
}



// Forward declaration of the elimination step: for every row i in [pos0, pos1) except row `ord`,
// subtracts x[i][ord] times row `ord` from row i, in both x and y, over the first `dim` columns.
// Declared here because inverse() and inverse_th() call it before its definition further down.
void colum_zero(vector< vector<double> > &x, vector< vector<double> > &y, int pos0, int pos1,int dim, int ord);

//returns inverse of x, x is not modified, not threaded
vector< vector<double> > inverse(vector< vector<double> > x)
{
    if (x.size() != x[0].size())
    {
        cout << "ERROR on inverse() not square array" << endl; getchar(); return{};//returns a null
    }

    size_t dim = x.size();
    int i, j, ord;
    vector< vector<double> > y(dim,vector<double>(dim,0));//initializes output = 0
    //init_2Dvector(y, dim, dim);
    //1. Unity array y: 
    for (i = 0; i < dim; i++)
    {
        y[i][i] = 1.0;
    }

    double diagon, coef;
    double *ptrx, *ptry, *ptrx2, *ptry2;
    for (ord = 0; ord<dim; ord++)
    {
        //2 Hacemos diagonal de x =1
        int i2;
        if (fabs(x[ord][ord])<1e-15) //If that element is 0, a line that contains a non zero is added
        {
            for (i2 = ord + 1; i2<dim; i2++)
            {
                if (fabs(x[i2][ord])>1e-15) break;
            }
            if (i2 >= dim)
                return{};//error, returns null
            for (i = 0; i<dim; i++)//added a line without 0
            {
                x[ord][i] += x[i2][i];
                y[ord][i] += y[i2][i];
            }
        }
        diagon = 1.0/x[ord][ord];
        ptry = &y[ord][0];
        ptrx = &x[ord][0];
        for (i = 0; i < dim; i++)
        {
            *ptry++ *= diagon;
            *ptrx++ *= diagon;
        }
        //uses the same function but not threaded:
        colum_zero(x,y,0,dim,dim,ord);
    }//end ord
    return y;
}

//threaded version
vector< vector<double> > inverse_th(vector< vector<double> > x)
{
    if (x.size() != x[0].size())
    {
        cout << "ERROR on inverse() not square array" << endl; getchar(); return{};//returns a null
    }

    int dim = (int) x.size();
    int i, ord;
    vector< vector<double> > y(dim, vector<double>(dim, 0));//initializes output = 0
                                                            //init_2Dvector(y, dim, dim);
                                                            //1. Unity array y: 
    for (i = 0; i < dim; i++)
    {
        y[i][i] = 1.0;
    }

    std::thread tarea[NUCLEOS];
    double diagon;
    double *ptrx, *ptry;// , *ptrx2, *ptry2;
    for (ord = 0; ord<dim; ord++)
    {
        //2 Hacemos diagonal de x =1
        int i2;
        if (fabs(x[ord][ord])<1e-15) //If a diagonal element=0 it is added a column that is not 0 the diagonal element
        {
            for (i2 = ord + 1; i2<dim; i2++)
            {
                if (fabs(x[i2][ord])>1e-15) break;
            }
            if (i2 >= dim)
                return{};//error, returns null
            for (i = 0; i<dim; i++)//It is looked for a line without zero to be added to make the number a non zero one to avoid later divide by 0
            {
                x[ord][i] += x[i2][i];
                y[ord][i] += y[i2][i];
            }
        }
        diagon = 1.0 / x[ord][ord];

        ptry = &y[ord][0];
        ptrx = &x[ord][0];
        for (i = 0; i < dim; i++)
        {
            *ptry++ *= diagon;
            *ptrx++ *= diagon;
        }

        int pos0 = 0, N1 = dim;//initial array position
        if ((N1<1) || (N1>5000))
        {
            cout << "It is detected out than 1-5000 simulations points=" << N1 << " ABORT or press enter to continue" << endl; getchar();
        }
        //cout << "Initiation of " << NUCLEOS << " threads" << endl;
        for (int thread = 0; thread<NUCLEOS; thread++)
        {
            int pos1 = (int)((thread + 1)*N1 / NUCLEOS);//next position
            tarea[thread] = std::thread(colum_zero, std::ref(x), std::ref(y), pos0, pos1, dim, ord);//ojo, coil current=1!!!!!!!!!!!!!!!!!!
            pos0 = pos1;//next thread will work at next point
        }
        for (int thread = 0; thread<NUCLEOS; thread++)
        {
            tarea[thread].join();
            //cout << "Thread num: " << thread << " end\n";
        }
    }//end ord
    return y;
}

/// Elimination step of Gauss-Jordan: zeroes column `ord` in rows [pos0, pos1).
///
/// For every row i in [pos0, pos1) other than `ord`, subtracts x[i][ord] times row `ord` from
/// row i, in x and in y alike. Rows whose entry in column `ord` is already below 1e-15 in
/// magnitude are skipped. Row `ord` itself is only read, and only rows inside [pos0, pos1) are
/// written, so calls with the same `ord` and non-overlapping row ranges work on separate rows.
///
/// @param x     Matrix being reduced (modified in place).
/// @param y     Companion matrix receiving the same row operations (modified in place).
/// @param pos0  First row to process (inclusive).
/// @param pos1  One past the last row to process.
/// @param dim   Number of columns to update in each row.
/// @param ord   Index of the pivot row / column.
void colum_zero(std::vector< std::vector<double> > &x, std::vector< std::vector<double> > &y, int pos0, int pos1, int dim, int ord)
{
    for (int i = pos0; i < pos1; i++)
    {
        if (i == ord) continue;//the pivot row keeps its values
        const double coef = x[i][ord];//element to make 0
        if (std::fabs(coef) < 1e-15) continue;//already zero, nothing to subtract

        double *row_x = &x[i][0];
        double *row_y = &y[i][0];
        const double *pivot_x = &x[ord][0];
        const double *pivot_y = &y[ord][0];
        // Indexed compound assignment: the earlier form `*p++ = *p - ...` read and incremented the
        // same pointer in one expression, which has no defined evaluation order before C++17.
        for (int j = 0; j < dim; j++)
        {
            row_y[j] -= coef * pivot_y[j];//companion matrix
            row_x[j] -= coef * pivot_x[j];//matrix being reduced
        }
    }
}


/// Benchmarks inverse() against inverse_th() on one random dim x dim matrix.
///
/// The matrix is filled with rand()-based values in [-10000, 10000], inverted once by each
/// function, and the elapsed time of each call is printed in seconds together with one element
/// of each result (so the work cannot be optimised away and the two results can be compared).
///
/// Timing uses std::chrono::steady_clock, i.e. elapsed wall time. clock() reports processor time,
/// which in a multithreaded process may advance faster than wall time and would therefore
/// penalise the threaded version.
///
/// @param dim  Number of rows and columns of the test matrix.
void test_6_inverse(int dim)
{
    std::vector< std::vector<double> > vec1(dim, std::vector<double>(dim));
    for (std::vector<double> &row : vec1)
        for (double &value : row)
        {
            value = (-1.0 + 2.0*rand() / RAND_MAX) * 10000;
        }

    typedef std::chrono::steady_clock wall_clock;
    std::vector< std::vector<double> > vec2, vec3;

    wall_clock::time_point ini = wall_clock::now();
    vec2 = inverse(vec1);
    wall_clock::time_point end = wall_clock::now();
    std::cout << "=== Time inverse unthreaded=" << std::chrono::duration<double>(end - ini).count() << std::endl;

    ini = end;
    vec3 = inverse_th(vec1);
    end = wall_clock::now();
    std::cout << "=== Time inverse   threaded=" << std::chrono::duration<double>(end - ini).count() << std::endl;

    // Both functions return an empty matrix on failure, and element [2][2] only exists from 3x3 up.
    if (vec2.size() > 2 && vec3.size() > 2)
        std::cout << vec2[2][2] << " " << vec3[2][2] << std::endl;//forces the inverses to be computed
    else
        std::cout << "inverse not available (no inverse found or matrix smaller than 3x3)" << std::endl;
    std::cout << std::endl;
}


/// Entry point: runs the 1000x1000 inverse benchmark, then waits for a key press so the console
/// output stays visible.
///
/// @return 0 on completion (the previous `return 1` reported failure to the shell even though
///         the run had succeeded).
int main()
{
    test_6_inverse(1000);
    std::cout << std::endl << "=== END ===" << std::endl; std::getchar();
    return 0;
}

Answer 1

After looking more closely at the code of the colum_zero() function, I saw that one thread overwrites data that is used by other threads, so the threads are not INDEPENDENT of each other. 在深入研究colum_zero（）函数的代码后，我看到一个线程重写了要由其他线程使用的数据，因此这些线程彼此之间不是独立的。 Fortunately the compiler detects this and avoids it. 幸运的是，编译器检测到了这一点并避免了它。

Conclusions: 结论：

It is not recommended to multithread the Gauss-Jordan method on its own 不建议单独对Gauss-Jordan方法进行多线程处理
If somebody finds that the multithreaded version is slower even though the work is split correctly across the threads, it may be because one thread's results are used by another 如果有人发现多线程版本更慢，而工作已经正确地分配给各个线程，则可能是由于一个线程的结果被另一线程使用了
The main function inverse() works and can be used by other programmers, so this question should not be deleted 主要函数inverse（）可以工作，并且可以被其他程序员使用，因此不应删除此问题

Unanswered question: What matrix inversion method can be split across many independent threads so that it can run on a GPU? 未回答的问题：有什么矩阵求逆方法可以拆分到许多独立线程中，以便在GPU上运行？

矩阵反转使用线程较慢

问题描述

1 个解决方案

解决方案1
0 2018-07-12 07:40:35

矩阵反转使用线程较慢

问题描述

1 个解决方案

解决方案1 0 2018-07-12 07:40:35

解决方案1
0 2018-07-12 07:40:35