POSIX线程在C中没有产生加速

我正在学习使用Pthreads的并行处理。 我有一个四核心处理器。 不幸的是,以下代码的并行化部分运行速度比非并行化代码慢大约5倍。 我在这里做错了什么? 先谢谢您的帮助。

#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#define NTHREADS 4
#define SIZE NTHREADS*10000000

struct params {
  int * arr;
  int sum;
};

/* The worker function for the pthreads */
void * myFun (void * x){
  int i;
  struct params * b = (struct params *) x;
  for (i = 0; i < (int)(SIZE/NTHREADS); ++i){
    b->sum += b->arr[i];
  }
  return NULL;
}

/* unparallelized summing function*/
int arrSum(int * arr, int size){
  int sum = 0;
  for (int i = 0; i != size; ++i){
    sum += arr[i];
  }
  return sum;
}

int main(int argc, char * argv[]){
  clock_t begin, end;
  double runTime;
  int rc, i;
  int sum1, sum2 = 0;
  pthread_t threads[NTHREADS];

  /* create array to sum over */
  int * myArr = NULL;
  myArr = (int *) calloc(SIZE, sizeof(int));
  if (myArr == NULL){
    printf("problem allocating memoryn");
    return 1; 
  }
  for (int i = 0; i < SIZE; ++i){
    myArr[i] = 1;
  }

  /* create array of params structs to feed to threads */
  struct params p;
  p.sum = 0;
  struct params inputs[NTHREADS];
  for(i = 0; i != NTHREADS; ++i){
    p.arr = myArr + i*(int)(SIZE/NTHREADS);
    inputs[i] = p;
  }

  /* spawn the threads */
  begin = clock();
  for(i = 0; i != NTHREADS; ++i){
    rc = pthread_create(&threads[i], NULL, myFun, (void *) &inputs[i]);
  }

  /* wait for threads to finish */
  for(i = 0; i != NTHREADS; ++i){
    rc = pthread_join(threads[i], NULL);
  }
  end = clock();
  runTime = (double)(end - begin)/CLOCKS_PER_SEC;
  printf("Parallelized code run time: %fn", runTime);

  /* run the unparallelized code */
  begin = clock();
  sum2 = arrSum(myArr, SIZE);
  end = clock();
  runTime = (double)(end - begin)/CLOCKS_PER_SEC;
  printf("Unparallelized code run time: %fn", runTime);

  /* consolidate and print results from threads */
  for(i = 0; i != NTHREADS; ++i){
    sum1 += inputs[i].sum;
  }
  printf("sum1, sum2: %d, %d n", sum1, sum2);

  free(myArr);

  /* be disappointed when my parallelized code showed no speedup */
  return 1;
}

你错过了并行编程的一个重要方面。

工作线程需要在每个进程中创建一次,而不是为每个任务创建。

创建和销毁线程需要时间。

解决方案是使用线程池并将任务发送到池。

我的建议是使用OpenMP ,它相当简化了这项任务,并与许多编译器一起工作。

例:

int sum = 0
#pragma omp for shared(sum)
 for(int i=0; i<SIZE; ++i)
 {
   #pragma omp atomic
   sum += myArr[i]
 }

为了使这个工作更快,做一些循环展开 - 例如,在一个for循环范围中计算8个数的总和。


主要的问题是你使用的clock()不能返回挂墙时间,但是累计的CPU时间。 这是使用SO的OpenMP标记最常见的错误(如果频率列表对SO有用,它应该显示这一点)。

最简单的方法是使用OpenMP中的函数: omp_get_wtime() 。 这适用于带有GCC,ICC和MSVC的Linux和Windows(我认为Clang现在支持OpenMP 3.1)。

当我在代码中使用这个代码时,我使用了我的四核/八超线程i7 IVB系统:

Parallelized code run time: 0.048492
Unparallelized code run time: 0.115124
sum1, sum2: 400000000, 400000000

其他一些意见。 您的日程安排很容易出错。 您将每个线程的数组设置为

p.arr = myArr + i*(int)(SIZE/NTHREADS);

然后让每个线程都运行(SIZE/NTHREADS) 。 这可能会给出错误的结果来舍入SIZENTHREADS某些值的错误。

你应该让每个线程都运行完毕

int start = ithread*SIZE/NTHREADS;
int finish = (ithreads+1)*SIZE/NTHREADS;

然后让每个线程都指向数组的开始处

int sum = 0;
for (i = start; i < finish; ++i){
    sum += b->arr[i];
}

这实质上就是OpenMP的schedule(static) 。 实际上,您可以通过使用OpenMP获得与pthreads相同的效果

int sum = 0;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < size; ++i){
    sum += arr[i];
}

这是我使用的代码

//gcc -O3 -std=gnu99 t.c -lpthread -fopenmp
#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <omp.h>

#define NTHREADS 4
#define SIZE NTHREADS*100000000

struct params {
  int * arr;
  int sum;
};

/* The worker function for the pthreads */
void * myFun (void * x){
  int i;
  struct params * b = (struct params *) x;
  int sum = 0;
  for (i = 0; i < (int)(SIZE/NTHREADS); ++i){
    sum += b->arr[i];
  }
  b->sum = sum;
  return NULL;
}

/* unparallelized summing function*/
int arrSum(int * arr, int size){
  int sum = 0;
  for (int i = 0; i < size; ++i){
    sum += arr[i];
  }
  return sum;
}

int main(int argc, char * argv[]) {
  double runTime;
  int rc, i;
  int sum1, sum2 = 0;
  pthread_t threads[NTHREADS];

  /* create array to sum over */
  int * myArr = NULL;
  myArr = (int *) calloc(SIZE, sizeof(int));
  if (myArr == NULL){
    printf("problem allocating memoryn");
    return 1; 
  }
  for (int i = 0; i < SIZE; ++i){
    myArr[i] = 1;
  }

  /* create array of params structs to feed to threads */
  struct params p;
  p.sum = 0;
  struct params inputs[NTHREADS];
  for(i = 0; i < NTHREADS; ++i){
    p.arr = myArr + i*(int)(SIZE/NTHREADS);
    inputs[i] = p;
  }

  /* spawn the threads */
  runTime = -omp_get_wtime();  
  for(i = 0; i != NTHREADS; ++i){
    rc = pthread_create(&threads[i], NULL, myFun, (void *) &inputs[i]);
  }

  /* wait for threads to finish */
  for(i = 0; i != NTHREADS; ++i){
    rc = pthread_join(threads[i], NULL);
  }

  runTime += omp_get_wtime();  
  printf("Parallelized code run time: %fn", runTime);

  /* run the unparallelized code */
  runTime = -omp_get_wtime();
  sum2 = arrSum(myArr, SIZE);
  runTime += omp_get_wtime();
  printf("Unparallelized code run time: %fn", runTime);

  /* consolidate and print results from threads */
  for(i = 0; i != NTHREADS; ++i){
    sum1 += inputs[i].sum;
  }
  printf("sum1, sum2: %d, %d n", sum1, sum2);

  free(myArr);

  /* be disappointed when my parallelized code showed no speedup */
  return 1;
}
链接地址: http://www.djcxy.com/p/86449.html

上一篇: POSIX Threads not producing speed up in C

下一篇: Flexbox: Layout with rowspan