为什么pthread会降低代码速度？

Question

我是pthread的新手，我编写了这段代码进行测试。 我不明白为什么只用1个pthread运行代码比用多个pthreads运行代码更快。 该代码是解决TSP的遗传算法的设置部分。 我有3个线性数组（city_x，city_y，city_id）保存数据：

一个数组保存x坐标
一个数组保存y坐标
一个数组保存每个城市的ID

这些数组是线性化（展平）的，用来表示种群中的所有个体。 每个个体都有NUM_CITIES个x、y和id数据。 因此，如果我们有：

种群中有3个个体
每个个体有10个城市（NUM_CITIES = 10）
每个数组的数据总数为3 * 10 = 30

该代码从输入（命令行参数）读取种群的个体数量，在city_set数组中设置一些坐标，并用整个种群所有个体的x、y坐标和id填充全局数组。

#include <pthread.h>

#include <limits> // std::numeric_limits<double>
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include <utility>
//#include <math.h>
#include <algorithm>    // std::lower_bound, std::find
#include <random>
#include <cmath> 
#include <cstring>
#include <iomanip>      // std::setprecision
#include <vector>       // std::vector

// Problem-size and genetic-algorithm tuning constants.
#define NUM_CITIES 10  // This is a tour for the LIN105. It has length 14379.
// #define SIZE_POP 100000000
#define SIZE_MATING 3
#define MUTATION_RATE 0.03
#define STALL_LIMIT 10

// Shared variables
long size_pop = 0;   // number of population elements, read from argv[1] in main
long tot_elem = 0;   // NUM_CITIES * size_pop: length of each city_* array
const int num_threads = 24;
int tid[num_threads];    // tid[i] == i; its address is handed to thread i
// Per-thread [start, stop) offsets into the city_* arrays. They are long, like
// tot_elem, so that offsets beyond INT_MAX are not truncated for large
// populations.
long start[num_threads];
long stop[num_threads];

// Cities (the reference set every tour is a permutation of)
int city_set_x[NUM_CITIES];
int city_set_y[NUM_CITIES];
int city_set_id[NUM_CITIES];

// Population elements, flattened: element e occupies indices
// [e * NUM_CITIES, (e + 1) * NUM_CITIES) of each array.
int *city_x;
int *city_y;
int *city_id;

void *setup(void *p) {

    int id = *(int *)p;
    // std::cout << "id: " << id << "\n";

    int s = start[id];

    int perm[NUM_CITIES];
    for(int i = 0; i < NUM_CITIES; ++i) {
        perm[i] = i;
        // std::cout << perm[i] << ",";
    }

    for(long i = start[id]; i < stop[id]; i += NUM_CITIES) {
        std::random_shuffle ( perm, perm + NUM_CITIES );

        for(int j = 0; j < NUM_CITIES; ++j) {
            city_id[i + j] =  perm[j];
            city_x[i + j] =  city_set_x[perm[j]];
            city_y[i + j] =  city_set_y[perm[j]];
            // std::cout << "(" << city_x[i + j] << "," << city_y[i + j] << ") ";
        }
        // std::cout << "\n";
    }

}


/**
 * Elapsed time a - b expressed in milliseconds.
 *
 * @param a  later timestamp
 * @param b  earlier timestamp
 * @return   (a - b) in milliseconds, including the fractional part
 */
static inline const double diffmsec(const struct timeval & a, 
                                    const struct timeval & b) {
    long whole_seconds = a.tv_sec - b.tv_sec;
    long micro_part    = a.tv_usec - b.tv_usec;

    // Borrow one second when the microsecond difference is negative.
    if(micro_part < 0) {
        micro_part += 1000000;
        whole_seconds -= 1;
    }

    const double from_seconds = (double)(whole_seconds * 1000);
    const double from_micros  = (double)micro_part / 1000.0;
    return from_seconds + from_micros;
}

int main(int argc, char *argv[]) {

    size_pop = atol(argv[1]);

    std::cout << size_pop << "\n";

    tot_elem = NUM_CITIES * size_pop;
    std::cout << "tot_elem: " << tot_elem << "\n";

    struct timeval program_start, program_end, setup_start, setup_end;

    std::vector<double> v_set;

    city_x = (int *)malloc(tot_elem * sizeof(int));
    // memset(city_x, -1, tot_elem * sizeof(int));
    city_y = (int *)malloc(tot_elem * sizeof(int));
    // memset(city_y, -1, tot_elem * sizeof(int));
    city_id = (int *)malloc(tot_elem * sizeof(int));
    for(int i = 0; i < tot_elem; ++i) {
        city_x[i] = -1;
        city_y[i] = -1;
        city_id[i] = -1;
    }

    srand(time(NULL));

    int x[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    int y[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};


    // stampa
    std::cout << "[CITTA.X]\n";
    for(int i = 0; i < NUM_CITIES; ++i) {

        city_set_x[i] = x[i];
        // city_set[i].x = i + 1;
        std::cout << city_set_x[i] << " ";
    }
    std::cout << "\n";

    std::cout << "[CITTA.Y]\n";
    for(int i = 0; i < NUM_CITIES; ++i) {

        city_set_y[i] = y[i];
        // city_set[i].y = i + 1;
        std::cout << city_set_y[i] << " ";
    }
    std::cout << "\n";

    std::cout << "[CITTA.ID]\n";
    for(int i = 0; i < NUM_CITIES; ++i) {

        city_set_id[i] = i;
        std::cout << city_set_id[i] << " ";
    }
    std::cout << "\n";

    // std::cin.get() != '\n';

    pthread_t threads[num_threads];

    for(int i = 0; i < num_threads; ++i) {
        tid[i] = i;
        start[i] = i * NUM_CITIES * floor(size_pop/num_threads);
        // std::cout << "start: " << start << "\n";
        if(i != num_threads - 1) {
            stop[i] = start[i] + (floor(size_pop/num_threads) * NUM_CITIES);
            // std::cout << "stop: " << stop << "\n";
        }
        else {
            stop[i] = tot_elem;
            // std::cout << "stop: " << stop << "\n";
        }
        // std::cout << "\n";
    }

    for(int c = 0; c < 10; c++) {

        gettimeofday(&setup_start, NULL);

        for(int i = 0; i < num_threads; ++i) {
            if( pthread_create( &threads[i], NULL, &setup, (void *) &tid[i]) )
            {
              printf("Thread creation failed\n");
            }
        }

        for(int i = 0; i < num_threads; ++i) {
            pthread_join( threads[i], NULL);
        }

        gettimeofday(&setup_end, NULL);
        v_set.push_back(diffmsec(setup_end, setup_start) / 1000);
    }

    // // stampa
    // std::cout << "[SETUP]\n";
    // for(int i = 0; i < size_pop; ++i){
    //  long idx = i * NUM_CITIES;
    //  std::cout << "pop[" << i << "]: ";
    //  for(int j = 0; j < NUM_CITIES; ++j){
    //      std::cout << "(" << city_x[idx + j] << "," << city_y[idx + j] << ") ";
    //  }
    //  std::cout << "\n";
    // }

    double sum = 0;
    double mean;


    sum = 0;
    for (int i = 0; i < v_set.size(); ++i) {
        sum += v_set[i];
    }
    mean = sum / v_set.size();
    std::cout << "[SET]: " << mean << " s\n";

    free(city_x);
    free(city_y);
    free(city_id);

}

我用1000000个元素运行代码，将线程数设置为1，结果为0.332 s。 同样用1000000个元素运行，但将线程数设置为4，结果为1.361 s。 如果我把线程数增加到24，结果为0.60 s，但仍然是顺序执行的两倍！ 当线程数超过24时，结果保持不变或再次增加。

编辑

使用： grep -c processor /proc/cpuinfo

我得到56。

使用： cat /proc/cpuinfo

处理器：0

vendor_id：GenuineIntel

cpu家族：6

型号：79

型号名称：Intel（R）Xeon（R）CPU E5-2680 v4 @ 2.40GHz

步进：1

微码：0xb00001e

cpu兆赫：1967.906

缓存大小：35840 KB

物理编号：0

siblings（同一物理CPU上的逻辑处理器数）：28

核心编号：0

cpu核心：14

apicid：0

initial apicid：0

fpu：是的

fpu_exception：是

cpuid等级：20

wp：是的

标志：fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtss lm constant_tsc arch_perftops tcptstqttstptstqtst qpstqtst VMX SMX EST TM2 SSSE3 FMA CX16 xtpr PDCM PCID DCA sse4_1 sse4_2 x2apic movbe POPCNT tsc_deadline_timer AES XSAVE AVX F16C rdrand lahf_lm ABM 3dnowprefetch ARAT EPB PLN PTS dtherm intel_pt tpr_shadow vnmi FlexPriority可EPT VPID fsgsbase tsc_adjust BMI1 HLE AVX2 SMEP bmi2 ERMS invpcid RTM CQM rdseed ADX SMAP xsaveopt cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local

bogomips：4799.62

clflush大小：64

cache_alignment：64

地址大小：物理46位，虚拟48位

对于56个处理器中的每一个。

Answer 1

std::random_shuffle使用一个共享资源（全局随机数生成器的状态），所有线程都要访问它，因此您的程序存在严重的争用，线程大部分时间都在互相等待。 请为每个线程使用单独的随机数生成器（例如 std::mt19937 配合 std::shuffle，参见 cppreference）。

此外，您可能希望增加NUM_CITIES，因此每个线程使用单独的缓存行。

Answer 2

运行多线程代码时，系统需要在各个线程之间进行上下文切换，这意味着您付出了额外的计算开销，却没有从中获得实际收益。 另外，您还有一个用于计算各线程参数的循环，创建的线程越多，该循环的计算量就越大，不过它引入的延迟可能是最小的，因为它并不需要大量计算。

另请注意，线程可能正在单个物理核心上运行，请在程序运行时检查资源的使用方式。 如果程序仅在单个内核上运行，则实际上您没有使用在多个内核中引入的硬件加速。

最后，由于这是C ++，所以我建议使用本机std :: thread。

总之，我认为这种延迟主要是由线程之间的上下文切换以及线程可能在单个内核上运行造成的。 请尝试让该程序在多个物理内核上运行，并检查所花费的时间。

为什么pthread会降低代码速度？

问题描述

2 个解决方案

解决方案1
3 已采纳 2018-06-21 10:35:12

解决方案2
1 2018-06-21 10:10:25

为什么pthread会降低代码速度？

问题描述

2 个解决方案

解决方案1 3 已采纳 2018-06-21 10:35:12

解决方案2 1 2018-06-21 10:10:25

解决方案1
3 已采纳 2018-06-21 10:35:12

解决方案2
1 2018-06-21 10:10:25