在 C 中实现了简单的线性回归，但使用 p_thread 的速度要慢得多

Question

I implemented simple linear regression in C, one with single thread and the other with multi-thread which runs much slower.我在 C 中实现了简单的线性回归，一个是单线程，另一个是多线程，运行速度要慢得多。

The single thread version is attached below.下面附上单线程版本。 It is very simple.这很简单。 First it reads the xs and ys from two files within the same folder.首先，它从同一文件夹中的两个文件中读取 xs 和 ys。 Then, the calculation of the coefficient and intercept of the line is just addition and multiplication of the data from xs and ys.那么直线的系数和截距的计算就是xs和ys的数据的加法和乘法。

// gcc simple_regression.cpp -lstdc++ -o simple_regression

#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h> 

const long long int N = 8000000*10;

typedef struct {
    double coef;
    double intercept;
} lr_res;

lr_res lr(double *xs, double *ys) {
    double sumxy = 0.0, sumx = 0.0, sumy = 0.0, sumx2 = 0.0;
    int i;
    for(i=0; i<N; ++i) {
        sumxy += xs[i]*ys[i];
        sumx += xs[i];
        sumy += ys[i];
        sumx2 += xs[i]*xs[i];
        //printf("after adding x[i]=%f, sumx=%f\n", xs[i], sumx);
    }
    //printf("avg(x): %f, avg(y): %f\n", sumx/N, sumy/N);
    lr_res res;
    res.coef = (N*sumxy - sumx*sumy)/(N*sumx2 - sumx*sumx);
    res.intercept = (sumy - res.coef*sumx)/N;
    return res;
}

int main(int argc, char**argv) {

    FILE *x_file, *y_file;
    char *num_cptr = NULL;
    size_t len = 0;
    ssize_t read;

    char *ptr;
    float data;

    //double xs[N], ys[N]; 
    double *xs = new double[N];
    double *ys = new double[N];
    int i;
    //printf("hello!\n");
    i = 0;    
    x_file = fopen("xs.data", "r");    
    while ((read = getline(&num_cptr, &len, x_file)) != -1) {
        xs[i++] = strtod(num_cptr, &ptr);
        //printf("%f\n", xs[i++]);
        if (i == N) break;
    }

    i = 0;
    y_file = fopen("ys.data", "r");
    while ((read = getline(&num_cptr, &len, y_file)) != -1) {
        ys[i++] = strtod(num_cptr, &ptr);
        //printf("%f\n", ys[i++]);
        if (i == N) break;
    }

    printf("start fitting...\n");

    clock_t t; 
    t = clock(); 

    lr_res res = lr(xs, ys);

    t = clock() - t; 
    double time_taken = ((double)t)/CLOCKS_PER_SEC; // in seconds 


    printf("coef: %f, intercept: %f\n", res.coef, res.intercept);

    printf("fitting took %f seconds\n", time_taken); 

}

The multi-threaded version is attached below.下面附上多线程版本。 When calculating the coefficient and intercept, it devides the summation work into different threads, with the hope that the speed can be raised up by roughly 8 times compared the single-threaded version.在计算系数和截距时，将求和工作分到不同的线程中，希望能比单线程版本提速8倍左右。

// gcc simple_regression_mt.cpp -lstdc++ -lpthread  -o simple_regression_mt
// https://www.statisticshowto.com/what-is-a-regression-equation/

#include <sys/types.h>
#include <sys/wait.h>
#include <sys/sysinfo.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h> 
#include <pthread.h>

#include<string.h> 

const long long int N = 8000000*10;
//const long long int N = 800;
const int nprocs = 8;
int chunksize = N/nprocs;

//double xs[N], ys[N]; 
double *xs = new double[N];
double *ys = new double[N];
double sumxy[nprocs], sumx[nprocs], sumy[nprocs], sumx2[nprocs];


void *lr_calc(void *arg) {
    char* coreid_str = (char *) arg;
    int coreid = strtol(coreid_str, (char **)NULL, 10) - 1;
    int offset = coreid*chunksize;
    //printf("core id:%d starting up...\n", coreid);

    for(int i=offset; i<offset+chunksize; ++i) {
        sumxy[coreid] += xs[i]*ys[i];
        sumx[coreid] += xs[i];
        sumy[coreid] += ys[i];
        sumx2[coreid] += xs[i]*xs[i];
        //printf("after adding x[i]=%f, sumxy=%f\n", xs[i], sumxy[coreid]);
    }
    //printf("avg(x): %f, avg(y): %f\n", sumx[coreid]/chunksize, sumy[coreid]/chunksize);

    //printf("core id:%d finishing...\n", coreid);

    return NULL;  
}

char* itoa(int val, int base){  
    static char buf[32] = {0};  
    int i = 30; 
    for(; val && i ; --i, val /= base)  
        buf[i] = "0123456789abcdef"[val % base];    
    return &buf[i+1];   
}


int main(int argc, char**argv) {

    printf("This system has %d processors configured and "
        "%d processors available.\n",
        get_nprocs_conf(), get_nprocs());



    FILE *x_file, *y_file;
    char *num_cptr = NULL;
    size_t len = 0;
    ssize_t read;

    char *ptr;
    float data;


    int i;

    // read x, y into dynamic arrays xs, ys
    i = 0;    
    x_file = fopen("xs.data", "r");    
    while ((read = getline(&num_cptr, &len, x_file)) != -1) {
        xs[i++] = strtod(num_cptr, &ptr);
        //printf("%f\n", xs[i++]);
        if (i == N) break;
    }

    i = 0;
    y_file = fopen("ys.data", "r");
    while ((read = getline(&num_cptr, &len, y_file)) != -1) {
        ys[i++] = strtod(num_cptr, &ptr);
        //printf("%f\n", ys[i++]);
        if (i == N) break;
    }

    // multithreaded version of linear regression
    printf("start fitting...\n");
    struct timeval tp;
    gettimeofday(&tp, NULL);
    int ms = tp.tv_sec * 1000 + tp.tv_usec / 1000;

    pthread_t tid[nprocs];
    for (int i=0; i<nprocs; ++i) {
        char* scoreid = itoa(i+1, 10);
        char* scoreid_dup = strdup(scoreid);  

        pthread_create(&tid[i], NULL, lr_calc, (void*)(scoreid_dup));
    }

    void* v;
    for (int i=0; i<nprocs; ++i) {
        pthread_join(tid[i], &v);
    }

    double ssumxy = 0.0, ssumx = 0.0, ssumy = 0.0, ssumx2 = 0.0;
    for (int i=0; i<nprocs; ++i) {
        ssumxy += sumxy[i];
        ssumx += sumx[i];
        ssumy += sumy[i];
        ssumx2 += sumx2[i];
        //printf("sumxy[i]: %f\n", sumxy[i]);
        //printf("ssumxy: %f\n", ssumxy);
    }
    //printf("avg(x): %f, avg(y): %f\n", ssumx/N, ssumy/N);

    double coef = (N*ssumxy - ssumx*ssumy)/(N*ssumx2 - ssumx*ssumx);
    //printf("N*ssumxy is: %f\n", ssumx);
    double intercept = (ssumy - coef*ssumx)/N;

    struct timeval tp1;
    gettimeofday(&tp1, NULL);
    int ms2 = tp1.tv_sec * 1000 + tp1.tv_usec / 1000;

    int time_taken = (ms2 - ms); // in million seconds 


    printf("coef: %f, intercept: %f\n", coef, intercept);    
    printf("fitting took %d milli seconds\n", time_taken); 

}

However, the result suggests otherwise as attached below, the multi-threaded version being much slower instead.然而，结果表明并非如此，如下所示，多线程版本反而慢得多。 Note that the timing is only for calculating/fitting the linear regression and doesn't count in the part of reading data from files.请注意，时间仅用于计算/拟合线性回归，不计入从文件中读取数据的部分。 The time difference also doesn't seem to be the overhead of creating thread.时差似乎也不是创建线程的开销。 (creating thread wouldn't take something like 0.5 seconds or so.) （创建线程不会花费 0.5 秒左右的时间。）

Can anybody give me some advices on this?有人可以给我一些建议吗？ Thank you so much!太感谢了！

peterlqchen@system76-pc:~/Documents/python_scripts$ ./simple_regression
start fitting...
coef: 2.081660, intercept: 0.075544
fitting took 0.440704 seconds
peterlqchen@system76-pc:~/Documents/python_scripts$ ./simple_regression_mt 
This system has 8 processors configured and 8 processors available.
start fitting...
coef: 2.081660, intercept: 0.075544
fitting took 2370 milli seconds

Answer 1

You have an issue of false-sharing.你有一个虚假分享的问题。 Right here:就在这儿：

for(int i=offset; i<offset+chunksize; ++i) {
    sumxy[coreid] += xs[i]*ys[i];
    sumx[coreid] += xs[i];
    sumy[coreid] += ys[i];
    sumx2[coreid] += xs[i]*xs[i];
    //printf("after adding x[i]=%f, sumxy=%f\n", xs[i], sumxy[coreid]);
}

Memory doesn't get transferred in bits or bytes but in cache lines - currently they are 64 bytes. Memory 不会以位或字节的形式传输，而是以缓存行的形式传输——目前它们是 64 字节。 If two cores modify data on the same cacheline a complex routine will run to fix the issue.如果两个核心修改同一缓存行上的数据，则会运行一个复杂的例程来解决问题。 Even if they modified completely different pieces of the cacheline.即使他们修改了完全不同的缓存行。 Since all your cores operate on more-or-less the same cachelines and modify them - expect a huge slowdown by using multiple threads.由于您的所有内核都在或多或少相同的缓存行上运行并对其进行修改 - 使用多线程会导致速度大幅下降。

To fix it, just create local variables sum**_local for the thread computation and only in the end put them in shared cache.要修复它，只需为线程计算创建局部变量sum**_local ，最后才将它们放入共享缓存中。

double sumxy_local = 0., sumx_local = 0., sumy_local = 0., sumx2_local = 0.;
for(int i=offset; i<offset+chunksize; ++i) {
    sumxy_local += xs[i]*ys[i];
    sumx_local  += xs[i];
    sumy_local  += ys[i];
    sumx2_local  += xs[i]*xs[i];
    //printf("after adding x[i]=%f, sumxy=%f\n", xs[i], sumxy[coreid]);
}
sumxy[coreid] = sumxy_local;
sumx[coreid] = sumx_local;
sumy[coreid] = sumy_local;
sumx2[coreid] = sumx2_local;

This should resolve the problem.这应该可以解决问题。

Disclaimer: there could be other issues regarding multi-threading that could cause slowdown but this one seems most critical.免责声明：可能还有其他有关多线程的问题可能会导致速度下降，但这一个问题似乎最为关键。

Answer 2

My guess is that the speed benefit of running things in parallel is more than offset by the high cost of the threading machinery.我的猜测是并行运行的速度优势被线程机器的高成本所抵消。 Running instructions flat out will be faster than system calls and library overhead for thread creation and coordination.完全运行指令将比用于线程创建和协调的系统调用和库开销更快。

在 C 中实现了简单的线性回归，但使用 p_thread 的速度要慢得多

问题描述

2 个解决方案

解决方案1
1 2020-05-17 08:40:45

解决方案2
0 2020-05-17 07:23:02

在 C 中实现了简单的线性回归，但使用 p_thread 的速度要慢得多

问题描述

2 个解决方案

解决方案1 1 2020-05-17 08:40:45

解决方案2 0 2020-05-17 07:23:02

解决方案1
1 2020-05-17 08:40:45

解决方案2
0 2020-05-17 07:23:02