為什么我的並行代碼比順序代碼慢？

Question

我在C中實現了一個使用OpenMP進行合並排序的並行代碼。 它的運行時間是3.9秒，比相同代碼的順序版本（3.6秒）還要慢。 我已嘗試盡量優化代碼，但仍然無法獲得加速。 你能幫忙解決這個問題嗎？ 謝謝。

 /*
  * Recursive divide step of the merge sort from the question.
  *
  * Splits arr[low..high] (inclusive bounds) at its midpoint, recurses on both
  * halves, then calls sort(arr, arr1, low, mid, high). sort() is not shown in
  * this snippet; presumably it merges the two halves using arr1 as scratch
  * space -- confirm against its definition.
  *
  * thread_count selects the strategy: with exactly 1 the two halves are
  * processed sequentially; otherwise a parallel region is opened and each
  * half is handed to a section with thread_count/2.
  *
  * NOTE(review): this is the code the answers below diagnose; the review
  * notes inside mark the spots they (and the OpenMP rules) call out.
  */
 void partition(int arr[],int arr1[],int low,int high,int thread_count)
 {
/* tid is declared but never used in this function. */
int tid,mid;

/* NOTE(review): "if" is an OpenMP clause, not a directive of its own, so this
 * line does not look like valid OpenMP -- confirm how the compiler treats it. */
#pragma omp if
if(low<high)
{
    if(thread_count==1)
    {
            /* Sequential path: plain recursion on both halves. */
            mid=(low+high)/2;
            partition(arr,arr1,low,mid,thread_count);
            partition(arr,arr1,mid+1,high,thread_count);
                sort(arr,arr1,low,mid,high);
    }
    else
    {
        #pragma omp parallel num_threads(thread_count) 
        {
                /* mid is declared outside the parallel region, so it is shared:
                 * every thread of the team stores the same value into it here. */
                mid=(low+high)/2;
                /* NOTE(review): "parallel sections" is the combined
                 * parallel + sections construct, so this starts a second,
                 * nested parallel region inside the one above. Each thread of
                 * the outer team reaches this line, which means the two
                 * recursive calls below are issued once per outer thread on
                 * the same data. Answer 2 replaces it with "omp sections". */
                #pragma omp parallel sections  
                {
                    #pragma omp section
                    {
                        /* Left half, with half of the thread budget. */
                        partition(arr,arr1,low,mid,thread_count/2);
                        }
                    #pragma omp section
                    {   
                        /* Right half, with half of the thread budget. */
                        partition(arr,arr1,mid+1,high,thread_count/2);
                    }
                }
        }
        /* Runs after the parallel region has ended, i.e. on the thread that
         * called partition(). */
        sort(arr,arr1,low,mid,high);

    }
}
 }

Answer 1

正如已經正確指出的那樣，您的代碼中存在一些阻止其正確執行的錯誤，因此我首先建議您查看這些錯誤。

無論如何，只考慮OpenMP性能如何與線程一起擴展，也許基於任務指令的實現更適合，因為它克服了前面答案已經指出的限制：

由於sections指令只有兩個部分，我認為你不會從並行子句中產生比兩個更多的線程獲得任何好處

您可以在下面找到這種實現的一個示例：

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <sys/time.h>

/*
 * Store the current wall-clock time, as reported by gettimeofday(), in *t.
 * The value is expressed in seconds, with the microsecond field contributing
 * the fractional part.
 */
void getTime(double *t) {

  struct timeval now;

  gettimeofday(&now, 0);

  /* Convert both fields to double first, then combine them. */
  const double whole_seconds = (double) now.tv_sec;
  const double fraction      = (double) now.tv_usec * 1e-6;

  *t = whole_seconds + fraction;
}

/*
 * qsort()-style comparison callback for int elements.
 *
 * pa, pb: pointers to the two int values being compared.
 * Returns -1 if *pa < *pb, 0 if they are equal, and 1 if *pa > *pb.
 */
int compare( const void * pa, const void * pb ) {

  const int a = *((const int*) pa);
  const int b = *((const int*) pb);

  /* Do not return a-b: the subtraction overflows (undefined behaviour) when
     the operands are far apart, e.g. INT_MIN and 1, and can then report the
     wrong order. Each comparison below yields 0 or 1, so the difference is
     always -1, 0 or 1. */
  return (a > b) - (a < b);
}

/*
 * Merge the adjacent sorted runs array[low..mid] and array[mid+1..high]
 * (all bounds inclusive) into one sorted run.
 *
 * The merged sequence is first built in workspace[low..high] and then copied
 * back over array[low..high]; both arrays must therefore be valid over that
 * index range. When two elements compare equal, the one from the left run is
 * emitted first.
 */
void merge(int * array, int * workspace, int low, int mid, int high) {

  int left  = low;      /* next unread element of the left run  */
  int right = mid + 1;  /* next unread element of the right run */
  int out   = low;      /* next slot to fill in workspace       */

  /* Both runs still have elements: emit the smaller head. */
  while( left <= mid && right <= high ) {
    if( array[right] < array[left] ) {
      workspace[out++] = array[right++];
    } else {
      workspace[out++] = array[left++];
    }
  }

  /* At most one of the runs still has elements; drain it. */
  while( left <= mid ) {
    workspace[out++] = array[left++];
  }
  while( right <= high ) {
    workspace[out++] = array[right++];
  }

  /* Copy the merged run back into place. */
  for( int pos = low; pos <= high; ++pos ) {
    array[pos] = workspace[pos];
  }
}

/*
 * Recursive worker of the merge sort: sorts array[low..high] (inclusive
 * bounds) in ascending order, using workspace[low..high] as scratch space
 * for the merge step.
 *
 * Ranges with high - low above the threshold are split in two; when compiled
 * with OpenMP each half becomes a task. Smaller ranges are handed to qsort()
 * directly so that recursion and tasking overhead stay small. Ranges with
 * fewer than two elements are left untouched.
 */
void mergesort_impl(int array[],int workspace[],int low,int high) {

  /* Largest value of high - low that is still sorted with qsort(). */
  const int threshold = 1000000;

  if( high - low > threshold  ) {
    /* low + (high-low)/2 gives the same midpoint as (low+high)/2 but cannot
       overflow int when both indices are large. */
    const int mid = low + (high - low)/2;
    /* Recursively sort on halves */
#ifdef _OPENMP
#pragma omp task
#endif
    mergesort_impl(array,workspace,low,mid);
#ifdef _OPENMP
#pragma omp task
#endif
    mergesort_impl(array,workspace,mid+1,high);
    /* Both halves must be completely sorted before they can be merged. */
#ifdef _OPENMP
#pragma omp taskwait
#endif
    /* Merge the two sorted halves. This runs directly in the current task:
       spawning a task only to wait for it immediately would add overhead
       without adding any parallelism. */
    merge(array,workspace,low,mid,high);
  } else if (high - low > 0) {
    /* Coarsen the base case */
    qsort(&array[low],(size_t)(high-low+1),sizeof(int),compare);
  }

}

/*
 * Entry point of the sort: calls mergesort_impl() on array[low..high] with
 * workspace as the scratch buffer.
 *
 * When compiled with OpenMP the call is made from inside a parallel region by
 * a single thread, so the tasks created by mergesort_impl() have a team of
 * threads available to run them. Without OpenMP the pragmas are compiled out
 * and this is a plain function call.
 */
void mergesort(int array[],int workspace[],int low,int high) {
  #ifdef _OPENMP
  #pragma omp parallel
  #endif
  {
    /* Only one thread of the team starts the recursion. nowait drops the
       barrier at the end of the single construct; the parallel region itself
       still ends with an implicit barrier. */
#ifdef _OPENMP
#pragma omp single nowait
#endif
    mergesort_impl(array,workspace,low,high);
  }
}

/* Modulus applied to every rand() result in main(), so the generated values
   are smaller than this number. */
const size_t largest = 100000000;
/* Number of int elements allocated, filled and sorted by main(). */
const size_t length  = 10000000;

/*
 * Benchmark driver: fills an array of `length` pseudo-random ints (each
 * reduced modulo `largest`), sorts it with mergesort(), prints the elapsed
 * wall-clock time and then reports every adjacent pair that is out of order.
 *
 * Returns 0 after the run, or EXIT_FAILURE if the buffers cannot be allocated.
 */
int main(int argc, char *argv[]) {

  int * array = NULL;
  int * workspace = NULL;

  double start,end;

  /* The command line is not used. */
  (void) argc;
  (void) argv;

  printf("Largest random number generated: %d \n",RAND_MAX);
  /* largest and length are size_t, which must be printed with %zu; passing
     them to %d is a format/argument type mismatch. */
  printf("Largest random number after truncation: %zu \n",largest);
  printf("Array size: %zu \n",length);
  /* Allocate and initialize random vector */
  array     = malloc(length * sizeof *array);
  workspace = malloc(length * sizeof *workspace);
  if( array == NULL || workspace == NULL ) {
    fprintf(stderr,"Error: cannot allocate %zu elements\n",length);
    free(array);
    free(workspace);
    return EXIT_FAILURE;
  }
  for( size_t ii = 0; ii < length; ii++)
    array[ii] = (int) ((size_t) rand() % largest);
  /* Sort */
  getTime(&start);
  mergesort(array,workspace,0,(int) (length-1));
  getTime(&end);
  printf("Elapsed time sorting: %g sec.\n", end-start);
  /* Check result */
  for( size_t ii = 1; ii < length; ii++) {
    if( array[ii] < array[ii-1] ) printf("Error:\n%zu %d\n%zu %d\n",ii-1,array[ii-1],ii,array[ii]);
  }
  free(array);
  free(workspace);
  return 0;
}

請注意，如果您尋求性能，您還必須保證遞歸的基本情況足夠粗，以避免由於遞歸函數調用而產生大量開銷。 除此之外，我建議您對代碼進行分析，以便您可以很好地了解哪些部分值得優化。

Answer 2

這個問題花了我一些時間才弄清楚；說來有點尷尬，因為一旦看出來，答案其實非常簡單。

按照問題中給出的代碼，程序無法正常運行，而是在某些運行中隨機地重復某些數字並丟失其他數字。 這似乎完全是一個由並行引起的錯誤，因為在 thread_count == 1 的情況下運行程序時不會出現這種錯誤。

編譯指示 “parallel sections” 是 parallel 與 sections 兩個指令的組合，在這種情況下意味着它在前一個並行區域的內部又開啟了第二個並行區域。 並行區域嵌套在其他並行區域內本身沒有問題，但我認為大多數實現在遇到嵌套並行區域時不會給你額外的線程。

修復是要替換

 #pragma omp parallel sections

為

 #pragma omp sections

在此修復之后，程序開始給出正確的答案，並且使用兩個核心系統並且對於一百萬個數字，我得到以下結果的計時。

一個線程：

time taken: 0.378794

兩個線程：

time taken: 0.203178

由於sections指令只有兩個部分，我認為你不會從並行子句中產生比兩個更多的線程獲得任何好處，所以應將 num_threads(thread_count) 改為 num_threads(2)

但是由於至少我嘗試的兩個實現不能為嵌套的並行區域生成新線程，所以程序不能擴展到兩個以上的線程。

為什么我的並行代碼比順序代碼慢？

問題描述

2 個解決方案

解決方案1
3 2012-09-16 17:28:55

解決方案2
2 已采納 2012-09-16 16:16:14

為什么我的並行代碼比順序代碼慢？

問題描述

2 個解決方案

解決方案1 3 2012-09-16 17:28:55

解決方案2 2 已采納 2012-09-16 16:16:14

解決方案1
3 2012-09-16 17:28:55

解決方案2
2 已采納 2012-09-16 16:16:14