POSIX Threads not producing speed up in C
I am learning parallel processing using Pthreads. I have a quad-core processor. Unfortunately, the parallelized portion of the following code runs roughly 5X slower than the non-parallelized code. What am I doing wrong here? Thanks in advance for the help.
#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>

#define NTHREADS 4
#define SIZE NTHREADS*10000000

struct params {
    int *arr;
    int sum;
};

/* The worker function for the pthreads */
void *myFun(void *x) {
    int i;
    struct params *b = (struct params *) x;
    for (i = 0; i < (int)(SIZE/NTHREADS); ++i) {
        b->sum += b->arr[i];
    }
    return NULL;
}

/* unparallelized summing function */
int arrSum(int *arr, int size) {
    int sum = 0;
    for (int i = 0; i != size; ++i) {
        sum += arr[i];
    }
    return sum;
}

int main(int argc, char *argv[]) {
    clock_t begin, end;
    double runTime;
    int rc, i;
    int sum1 = 0, sum2 = 0;
    pthread_t threads[NTHREADS];

    /* create array to sum over */
    int *myArr = NULL;
    myArr = (int *) calloc(SIZE, sizeof(int));
    if (myArr == NULL) {
        printf("problem allocating memory\n");
        return 1;
    }
    for (int i = 0; i < SIZE; ++i) {
        myArr[i] = 1;
    }

    /* create array of params structs to feed to threads */
    struct params p;
    p.sum = 0;
    struct params inputs[NTHREADS];
    for (i = 0; i != NTHREADS; ++i) {
        p.arr = myArr + i*(int)(SIZE/NTHREADS);
        inputs[i] = p;
    }

    /* spawn the threads */
    begin = clock();
    for (i = 0; i != NTHREADS; ++i) {
        rc = pthread_create(&threads[i], NULL, myFun, (void *) &inputs[i]);
    }

    /* wait for threads to finish */
    for (i = 0; i != NTHREADS; ++i) {
        rc = pthread_join(threads[i], NULL);
    }
    end = clock();
    runTime = (double)(end - begin)/CLOCKS_PER_SEC;
    printf("Parallelized code run time: %f\n", runTime);

    /* run the unparallelized code */
    begin = clock();
    sum2 = arrSum(myArr, SIZE);
    end = clock();
    runTime = (double)(end - begin)/CLOCKS_PER_SEC;
    printf("Unparallelized code run time: %f\n", runTime);

    /* consolidate and print results from threads */
    for (i = 0; i != NTHREADS; ++i) {
        sum1 += inputs[i].sum;
    }
    printf("sum1, sum2: %d, %d\n", sum1, sum2);

    free(myArr);

    /* be disappointed when my parallelized code showed no speedup */
    return 1;
}
You're missing one important aspect of parallel programming.
The worker threads need to be created once per process, not for every task.
Creating and destroying threads takes time.
The solution is to use a thread pool and send tasks to the pool.
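For illustration only, here is a minimal sketch of the idea. Every name in it (pool, task, pool_submit, ...) is made up for this answer, not a real library API, and shutdown and error handling are omitted:

#include <pthread.h>

#define POOL_THREADS 4
#define QUEUE_CAP    64

typedef struct {
    void (*fn)(void *);   /* the work to run */
    void *arg;
} task;

typedef struct {
    task queue[QUEUE_CAP];            /* fixed-size circular task queue */
    int head, tail, count;
    pthread_mutex_t lock;
    pthread_cond_t  not_empty;
    pthread_t workers[POOL_THREADS];
} pool;

/* Each worker blocks until a task arrives, then runs it; the threads
   are created once and reused for every task. */
static void *worker(void *p) {
    pool *pl = p;
    for (;;) {
        pthread_mutex_lock(&pl->lock);
        while (pl->count == 0)
            pthread_cond_wait(&pl->not_empty, &pl->lock);
        task t = pl->queue[pl->head];
        pl->head = (pl->head + 1) % QUEUE_CAP;
        pl->count--;
        pthread_mutex_unlock(&pl->lock);
        t.fn(t.arg);                  /* run the task outside the lock */
    }
    return NULL;
}

void pool_init(pool *pl) {
    pl->head = pl->tail = pl->count = 0;
    pthread_mutex_init(&pl->lock, NULL);
    pthread_cond_init(&pl->not_empty, NULL);
    for (int i = 0; i < POOL_THREADS; ++i)
        pthread_create(&pl->workers[i], NULL, worker, pl);
}

void pool_submit(pool *pl, void (*fn)(void *), void *arg) {
    pthread_mutex_lock(&pl->lock);
    pl->queue[pl->tail] = (task){ fn, arg };   /* assumes the queue never fills */
    pl->tail = (pl->tail + 1) % QUEUE_CAP;
    pl->count++;
    pthread_cond_signal(&pl->not_empty);
    pthread_mutex_unlock(&pl->lock);
}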
My suggestion is to use OpenMP, which simplifies this task considerably and works with many compilers.
Example:
int sum = 0;
#pragma omp parallel for shared(sum)
for (int i = 0; i < SIZE; ++i)
{
    #pragma omp atomic
    sum += myArr[i];
}
To make this work faster, do some loop unrolling, e.g. accumulate the sum of 8 numbers per loop iteration, as in the sketch below.
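A minimal sketch of that unrolling, assuming SIZE is a multiple of 8 (with the definitions above it is): each iteration sums eight elements into a local variable, so the atomic update is paid once per eight additions instead of once per element:

int sum = 0;
#pragma omp parallel for shared(sum)
for (int i = 0; i < SIZE; i += 8)
{
    int partial = 0;               /* thread-local partial sum */
    for (int j = 0; j < 8; ++j)
        partial += myArr[i + j];
    #pragma omp atomic
    sum += partial;                /* one atomic update per 8 elements */
}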
The main problem is that you're using clock(), which does not return the wall time but the cumulative CPU time summed across all threads. With four threads doing the same total work, the parallel version accumulates roughly four times the CPU time of the serial version, so it looks slower even when it finishes sooner. This is probably the most common mistake under the OpenMP tag on SO.

The simplest way to get the wall time is to use a function from OpenMP: omp_get_wtime(). This works on Linux and Windows with GCC, ICC, and MSVC (and I assume Clang, which now supports OpenMP 3.1).
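The usage pattern, which the full listing at the end of this answer also uses, is simply:

double runTime = -omp_get_wtime();   /* start: negative of the current time */
/* ... code to be timed ... */
runTime += omp_get_wtime();          /* runTime now holds elapsed seconds */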
When I use this with your code I get on my four core/eight hyper-thread i7 IVB system:
Parallelized code run time: 0.048492
Unparallelized code run time: 0.115124
sum1, sum2: 400000000, 400000000
Some other comments. Your scheduling is error prone. You set the array for each thread to

p.arr = myArr + i*(int)(SIZE/NTHREADS);

and then have each thread run over SIZE/NTHREADS elements. This can give wrong results due to rounding errors for some values of SIZE and NTHREADS.
You should instead have each thread run over

int start  = ithread*SIZE/NTHREADS;
int finish = (ithread+1)*SIZE/NTHREADS;

and then have each thread point to the beginning of the array and do

int sum = 0;
for (i = start; i < finish; ++i) {
    sum += b->arr[i];
}
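Put together, the worker might look like the following sketch; note that the id field is not in the original struct and is added here only for illustration:

struct params {
    int *arr;   /* every thread gets the same pointer: the whole array */
    int id;     /* thread index 0..NTHREADS-1 (added for this sketch) */
    int sum;
};

void *myFun(void *x) {
    struct params *b = (struct params *) x;
    /* Multiply before dividing so the slice boundaries cover every
       element exactly once, whatever SIZE % NTHREADS is; the long long
       cast guards the intermediate product against overflow. */
    int start  = (int)((long long)b->id * SIZE / NTHREADS);
    int finish = (int)((long long)(b->id + 1) * SIZE / NTHREADS);
    int sum = 0;                   /* accumulate locally, write back once */
    for (int i = start; i < finish; ++i)
        sum += b->arr[i];
    b->sum = sum;
    return NULL;
}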
This is essentially what OpenMP's schedule(static) does. In fact, you can get the same effect as the pthreads version using OpenMP with
int sum = 0;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < size; ++i) {
    sum += arr[i];
}

The reduction(+:sum) clause gives each thread a private copy of sum and combines the copies when the loop ends, so there is no contention on shared memory inside the loop.
Here is the code I used:

//gcc -O3 -std=gnu99 t.c -lpthread -fopenmp
#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <omp.h>

#define NTHREADS 4
#define SIZE NTHREADS*100000000

struct params {
    int *arr;
    int sum;
};

/* The worker function for the pthreads */
void *myFun(void *x) {
    int i;
    struct params *b = (struct params *) x;
    int sum = 0;
    for (i = 0; i < (int)(SIZE/NTHREADS); ++i) {
        sum += b->arr[i];
    }
    b->sum = sum;
    return NULL;
}

/* unparallelized summing function */
int arrSum(int *arr, int size) {
    int sum = 0;
    for (int i = 0; i < size; ++i) {
        sum += arr[i];
    }
    return sum;
}

int main(int argc, char *argv[]) {
    double runTime;
    int rc, i;
    int sum1 = 0, sum2 = 0;
    pthread_t threads[NTHREADS];

    /* create array to sum over */
    int *myArr = NULL;
    myArr = (int *) calloc(SIZE, sizeof(int));
    if (myArr == NULL) {
        printf("problem allocating memory\n");
        return 1;
    }
    for (int i = 0; i < SIZE; ++i) {
        myArr[i] = 1;
    }

    /* create array of params structs to feed to threads */
    struct params p;
    p.sum = 0;
    struct params inputs[NTHREADS];
    for (i = 0; i < NTHREADS; ++i) {
        p.arr = myArr + i*(int)(SIZE/NTHREADS);
        inputs[i] = p;
    }

    /* spawn the threads */
    runTime = -omp_get_wtime();
    for (i = 0; i != NTHREADS; ++i) {
        rc = pthread_create(&threads[i], NULL, myFun, (void *) &inputs[i]);
    }

    /* wait for threads to finish */
    for (i = 0; i != NTHREADS; ++i) {
        rc = pthread_join(threads[i], NULL);
    }
    runTime += omp_get_wtime();
    printf("Parallelized code run time: %f\n", runTime);

    /* run the unparallelized code */
    runTime = -omp_get_wtime();
    sum2 = arrSum(myArr, SIZE);
    runTime += omp_get_wtime();
    printf("Unparallelized code run time: %f\n", runTime);

    /* consolidate and print results from threads */
    for (i = 0; i != NTHREADS; ++i) {
        sum1 += inputs[i].sum;
    }
    printf("sum1, sum2: %d, %d\n", sum1, sum2);

    free(myArr);

    /* be disappointed when my parallelized code showed no speedup */
    return 1;
}