POSIX线程在C中没有产生加速
我正在学习使用Pthreads的并行处理。 我有一个四核心处理器。 不幸的是,以下代码的并行化部分运行速度比非并行化代码慢大约5倍。 我在这里做错了什么? 先谢谢您的帮助。
#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
/* Number of worker threads and total element count. SIZE is kept a
 * multiple of NTHREADS so the per-thread slices divide evenly; the
 * expansion is parenthesized so SIZE behaves as a single value inside
 * larger expressions (e.g. SIZE/NTHREADS, i*SIZE). */
#define NTHREADS 4
#define SIZE (NTHREADS*10000000)
/* Per-thread work descriptor handed to the pthread worker. */
struct params {
int * arr; /* start of this thread's slice of the shared array */
int sum; /* result: the slice's sum, written by the worker */
};
/* pthread worker: sums the SIZE/NTHREADS ints starting at b->arr and
 * stores the result in b->sum.
 *
 * Fix: accumulate into a local variable and write b->sum exactly once.
 * The original updated b->sum through the pointer on every iteration;
 * the params structs of the four threads sit in one array, so those
 * writes cause false sharing of cache lines between cores (and force a
 * memory store per element) — the main reason the threaded version ran
 * slower than the serial one. */
void * myFun (void * x){
int i;
struct params * b = (struct params *) x;
int sum = 0; /* thread-local accumulator */
for (i = 0; i < (int)(SIZE/NTHREADS); ++i){
sum += b->arr[i];
}
b->sum = sum; /* single write-back to shared memory */
return NULL;
}
/* Serial reference implementation: returns the sum of the first
 * `size` elements of `arr`. */
int arrSum(int * arr, int size){
int total = 0;
int idx = 0;
while (idx < size) {
total += arr[idx];
++idx;
}
return total;
}
int main(int argc, char * argv[]){
clock_t begin, end;
double runTime;
int rc, i;
int sum1, sum2 = 0;
pthread_t threads[NTHREADS];
/* create array to sum over */
int * myArr = NULL;
myArr = (int *) calloc(SIZE, sizeof(int));
if (myArr == NULL){
printf("problem allocating memoryn");
return 1;
}
for (int i = 0; i < SIZE; ++i){
myArr[i] = 1;
}
/* create array of params structs to feed to threads */
struct params p;
p.sum = 0;
struct params inputs[NTHREADS];
for(i = 0; i != NTHREADS; ++i){
p.arr = myArr + i*(int)(SIZE/NTHREADS);
inputs[i] = p;
}
/* spawn the threads */
begin = clock();
for(i = 0; i != NTHREADS; ++i){
rc = pthread_create(&threads[i], NULL, myFun, (void *) &inputs[i]);
}
/* wait for threads to finish */
for(i = 0; i != NTHREADS; ++i){
rc = pthread_join(threads[i], NULL);
}
end = clock();
runTime = (double)(end - begin)/CLOCKS_PER_SEC;
printf("Parallelized code run time: %fn", runTime);
/* run the unparallelized code */
begin = clock();
sum2 = arrSum(myArr, SIZE);
end = clock();
runTime = (double)(end - begin)/CLOCKS_PER_SEC;
printf("Unparallelized code run time: %fn", runTime);
/* consolidate and print results from threads */
for(i = 0; i != NTHREADS; ++i){
sum1 += inputs[i].sum;
}
printf("sum1, sum2: %d, %d n", sum1, sum2);
free(myArr);
/* be disappointed when my parallelized code showed no speedup */
return 1;
}
你错过了并行编程的一个重要方面。
工作线程需要在每个进程中创建一次,而不是为每个任务创建。
创建和销毁线程需要时间。
解决方案是使用线程池并将任务发送到池。
我的建议是使用 OpenMP，它大大简化了这类任务，而且被许多编译器支持。
例:
int sum = 0
#pragma omp for shared(sum)
for(int i=0; i<SIZE; ++i)
{
#pragma omp atomic
sum += myArr[i]
}
为了使这个工作更快,做一些循环展开 - 例如,在一个for
循环范围中计算8个数的总和。
主要的问题是：你使用的 clock()
返回的不是挂钟（wall-clock）时间，而是所有线程累计的 CPU 时间，因此并行区域看起来会比实际慢大约 NTHREADS 倍。这是 SO 上 OpenMP 标签下最常见的错误之一（如果 SO 有高频错误排行榜，这个错误肯定名列前茅）。
最简单的方法是使用OpenMP中的函数: omp_get_wtime()
。 这适用于带有GCC,ICC和MSVC的Linux和Windows(我认为Clang现在支持OpenMP 3.1)。
在我的四核 / 八线程（超线程）i7 IVB 系统上，对你的代码应用上述修改后，我得到：
Parallelized code run time: 0.048492
Unparallelized code run time: 0.115124
sum1, sum2: 400000000, 400000000
其他一些意见。你的任务划分（调度）方式容易出错。你把每个线程的数组起点设置为
p.arr = myArr + i*(int)(SIZE/NTHREADS);
然后让每个线程处理 (SIZE/NTHREADS)
个元素。由于整数除法会向下取整，当 SIZE 不是 NTHREADS 的整数倍时，这种划分会漏掉末尾的元素，得到错误的结果。
更稳妥的做法是让每个线程按如下方式计算自己负责的区间：
/* Half-open slice [start, finish) for thread number ithread (the
 * original had a typo: "ithreads" in the second line). Multiplying
 * before dividing makes the last thread's finish land exactly on SIZE,
 * so no elements are lost to integer-division rounding. */
int start = ithread*SIZE/NTHREADS;
int finish = (ithread+1)*SIZE/NTHREADS;
并让每个线程的指针都指向整个数组的起始位置，然后在自己的区间上求和：
int sum = 0;
for (i = start; i < finish; ++i){
sum += b->arr[i];
}
这实质上就是OpenMP的schedule(static)
。 实际上,您可以通过使用OpenMP获得与pthreads
相同的效果
int sum = 0;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < size; ++i){
sum += arr[i];
}
这是我使用的代码
//gcc -O3 -std=gnu99 t.c -lpthread -fopenmp
#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <omp.h>
/* Thread count and total element count (a multiple of NTHREADS so the
 * slices divide evenly; 400,000,000 still fits in a 32-bit int). The
 * expansion is parenthesized so SIZE behaves as a single value inside
 * larger expressions. */
#define NTHREADS 4
#define SIZE (NTHREADS*100000000)
/* Per-thread work descriptor handed to the pthread worker. */
struct params {
int * arr; /* start of this thread's slice of the shared array */
int sum; /* result: the slice's sum, written by the worker */
};
/* pthread worker: sums this thread's SIZE/NTHREADS-element slice into
 * a local accumulator, then publishes the total with one write to
 * job->sum (keeping per-iteration stores off the shared struct). */
void * myFun (void * x){
    struct params * job = (struct params *) x;
    int total = 0;
    int idx;
    for (idx = 0; idx < (int)(SIZE/NTHREADS); ++idx){
        total += job->arr[idx];
    }
    job->sum = total;
    return NULL;
}
/* Serial baseline: returns the sum of the first `size` ints in `arr`,
 * walking the array by pointer. */
int arrSum(int * arr, int size){
    int total = 0;
    const int * stop = arr + size;
    for (const int * p = arr; p != stop; ++p){
        total += *p;
    }
    return total;
}
/*
 * Benchmark driver: times the threaded sum against the serial one
 * using omp_get_wtime(), which returns wall-clock time (clock() would
 * report CPU time summed across all threads and hide the speedup).
 *
 * Fixes relative to the original:
 *  - sum1 was read without ever being initialized (undefined behavior;
 *    "int sum1, sum2 = 0;" only initializes sum2).
 *  - printf strings had lost their "\n" escapes.
 *  - calloc's return is not cast; pthread_create is checked; success
 *    returns 0, not 1.
 */
int main(int argc, char * argv[]) {
    double runTime;
    int rc, i;
    int sum1 = 0, sum2 = 0;   /* both accumulators must start at zero */
    pthread_t threads[NTHREADS];

    /* create array to sum over */
    int * myArr = calloc(SIZE, sizeof *myArr);
    if (myArr == NULL){
        printf("problem allocating memory\n");
        return 1;
    }
    for (i = 0; i < SIZE; ++i){
        myArr[i] = 1;
    }

    /* one params struct per thread, each pointing at its own slice */
    struct params inputs[NTHREADS];
    for (i = 0; i < NTHREADS; ++i){
        inputs[i].arr = myArr + i * (SIZE / NTHREADS);
        inputs[i].sum = 0;
    }

    /* spawn the threads; negate-then-add yields the elapsed wall time */
    runTime = -omp_get_wtime();
    for (i = 0; i < NTHREADS; ++i){
        rc = pthread_create(&threads[i], NULL, myFun, &inputs[i]);
        if (rc != 0){
            printf("pthread_create failed: %d\n", rc);
            free(myArr);
            return 1;
        }
    }
    /* wait for threads to finish */
    for (i = 0; i < NTHREADS; ++i){
        pthread_join(threads[i], NULL);
    }
    runTime += omp_get_wtime();
    printf("Parallelized code run time: %f\n", runTime);

    /* run the unparallelized code */
    runTime = -omp_get_wtime();
    sum2 = arrSum(myArr, SIZE);
    runTime += omp_get_wtime();
    printf("Unparallelized code run time: %f\n", runTime);

    /* consolidate and print results from threads */
    for (i = 0; i < NTHREADS; ++i){
        sum1 += inputs[i].sum;
    }
    printf("sum1, sum2: %d, %d \n", sum1, sum2);
    free(myArr);
    return 0;   /* success */
}
链接地址: http://www.djcxy.com/p/86449.html