How to safely parallelize a for loop with OpenMP
I am porting an originally serial piece of code from the KSVD package to OpenMP. The original code, which works like im2col in MATLAB and extracts patches from an image, is shown below:
/* n stands for the size of an image, sz stands for the patch size to extract */
/* Serial im2col: copies every sz[0]-by-sz[1]-by-sz[2] patch of the image x
 * into consecutive slots of the output buffer b. The index expression below
 * shows dim 0 is the fastest-varying (column-major, MATLAB-style) dimension,
 * so each patch row of sz[0] doubles is contiguous and copied with one memcpy. */
int blocknum = 0;
for (k=0; k<=n[2]-sz[2]; k+=1) {
for (j=0; j<=n[1]-sz[1]; j+=1) {
for (i=0; i<=n[0]-sz[0]; i+=1) {
/* copy single block */
for (m=0; m<sz[2]; m++) {
for (l=0; l<sz[1]; l++) {
memcpy(b + blocknum*sz[0]*sz[1]*sz[2] + m*sz[0]*sz[1] + l*sz[0], x+(k+m)*n[0]*n[1]+(j+l)*n[0]+i, sz[0]*sizeof(double));
}
}
blocknum ++; /* sequential counter: output slot depends on iteration order */
}
}
}
Now I would like to parallelize it by replacing the incremented counter blocknum with a computed index blockid:
/* n stands for the size of an image, sz stands for the patch size to extract */
/* Parallel attempt: blockid is computed from (i, j, k) instead of being
 * incremented, so the output slot no longer depends on iteration order.
 * NOTE(review): blockid2, blockid and the loop variables i, m, l are declared
 * OUTSIDE the parallel region and are therefore implicitly shared — threads
 * overwrite each other's values, corrupting the memcpy target address.
 * blockid3 is only read inside the region, so sharing it is harmless. */
int blockid3, blockid2, blockid;
for (k=0; k<=n[2]-sz[2]; k+=1) {
blockid3 = k * (n[1]-sz[1]+1) * (n[0]-sz[0]+1);
#pragma omp parallel for
for (j=0; j<=n[1]-sz[1]; j+=1) {
blockid2 = j * (n[0]-sz[0]+1); /* written by every thread: data race */
for (i=0; i<=n[0]-sz[0]; i+=1) { /* i shared across threads: data race */
blockid = i + blockid2 + blockid3; /* shared: data race */
/* copy single block */
for (m=0; m<sz[2]; m++) { /* m shared: data race */
for (l=0; l<sz[1]; l++) { /* l shared: data race */
memcpy(b + blockid*sz[0]*sz[1]*sz[2] + m*sz[0]*sz[1] + l*sz[0], x+(k+m)*n[0]*n[1]+(j+l)*n[0]+i, sz[0]*sizeof(double));
}
}
}
}
}
Running this leads to a fatal segmentation fault, and I do not know why (according to the stack trace, it seems related to thread safety). I thought the parallel threads would never write to the same address. Am I supposed to set some attributes on the variables, i.e. static, shared, or private? Here is the stack trace:
Stack Trace (from fault):
[ 0] 0x00007f9bcaa695de /usr/local/MATLAB/R2011b/bin/glnxa64/libmwfl.so+00210398 _ZN2fl4diag15stacktrace_base7capt
ureERKNS0_14thread_contextEm+000158
[ 1] 0x00007f9bcaa6b62d /usr/local/MATLAB/R2011b/bin/glnxa64/libmwfl.so+00218669
[ 2] 0x00007f9bcaa6b8f5 /usr/local/MATLAB/R2011b/bin/glnxa64/libmwfl.so+00219381 _ZN2fl4diag13terminate_logEPKcRKN
S0_14thread_contextEb+000165
[ 3] 0x00007f9bc9a714f5 /usr/local/MATLAB/R2011b/bin/glnxa64/libmwmcr.so+00447733 _ZN2fl4diag13terminate_logEPKcPK8
ucontextb+000085
[ 4] 0x00007f9bc9a6e5b4 /usr/local/MATLAB/R2011b/bin/glnxa64/libmwmcr.so+00435636
[ 5] 0x00007f9bc9a6f333 /usr/local/MATLAB/R2011b/bin/glnxa64/libmwmcr.so+00439091
[ 6] 0x00007f9bc9a6f4c7 /usr/local/MATLAB/R2011b/bin/glnxa64/libmwmcr.so+00439495
[ 7] 0x00007f9bc9a7085f /usr/local/MATLAB/R2011b/bin/glnxa64/libmwmcr.so+00444511
[ 8] 0x00007f9bc9a70a15 /usr/local/MATLAB/R2011b/bin/glnxa64/libmwmcr.so+00444949
[ 9] 0x00007f9bc89f0cb0 /lib/x86_64-linux-gnu/libpthread.so.0+00064688
[ 10] 0x00007f9bc876cb8e /lib/x86_64-linux-gnu/libc.so.6+01346446
[ 11] 0x00007f9b88238bb8 /home/peiyun/schmax3.0/test_im2col/mex_im2colstep.mexa64+00003000
[ 12] 0x00007f9bcb004eea /usr/lib/gcc/x86_64-linux-gnu/4.6.3//libgomp.so+00032490
[ 13] 0x00007f9bc89e8e9a /lib/x86_64-linux-gnu/libpthread.so.0+00032410
[ 14] 0x00007f9bc87164bd /lib/x86_64-linux-gnu/libc.so.6+00992445 clone+000109
By the way, if they are writing to different addresses, are there any race conditions regarding memcpy inside the omp for-loop?
There are multiple data races in your code, namely:
/* n stands for the size of an image, sz stands for the patch size to extract */
int blockid3, blockid2, blockid;
for (k=0; k<=n[2]-sz[2]; k+=1) {
blockid3 = k * (n[1]-sz[1]+1) * (n[0]-sz[0]+1);
#pragma omp parallel for
for (j=0; j<=n[1]-sz[1]; j+=1) {
blockid2 = j * (n[0]-sz[0]+1); // <--- race: blockid2 is implicitly shared, every thread writes it
for (i=0; i<=n[0]-sz[0]; i+=1) { // <--- race: i is implicitly shared, threads clobber each other's index
blockid = i + blockid2 + blockid3; // <--- race: blockid is implicitly shared
/* copy single block */
for (m=0; m<sz[2]; m++) { // <--- race: m is implicitly shared
for (l=0; l<sz[1]; l++) { // <--- and l too; the corrupted indices send memcpy out of bounds
memcpy(b + blockid*sz[0]*sz[1]*sz[2] + m*sz[0]*sz[1] + l*sz[0], x+(k+m)*n[0]*n[1]+(j+l)*n[0]+i, sz[0]*sizeof(double));
}
}
}
}
}
By the data-sharing rules of OpenMP, blockid2, i, blockid, m, and l are all implicitly shared, which is not what you want. (The parallel loop variable j is predetermined private, and blockid3 is only read inside the region, so those two are fine.) You should either make the racy variables private, or — better — declare them inside the parallel region and thus make them implicitly private:
#pragma omp parallel for private(i,m,l,blockid,blockid2)
...
or
/* Fixed version: every variable that is WRITTEN inside the parallel region
 * is declared inside it, which makes it implicitly private to each thread.
 * blockid3 stays outside, but it is assigned only in the serial k loop and
 * merely read inside the region, so sharing it is safe; j is the parallel
 * loop variable and is private by the OpenMP rules. */
int blockid3;
for (k=0; k<=n[2]-sz[2]; k+=1) {
blockid3 = k * (n[1]-sz[1]+1) * (n[0]-sz[0]+1);
#pragma omp parallel for
for (j=0; j<=n[1]-sz[1]; j+=1) {
int blockid2 = j * (n[0]-sz[0]+1); /* declared in the region -> per-thread copy */
for (int i=0; i<=n[0]-sz[0]; i+=1) { /* C99 loop-scope declaration -> private */
int blockid = i + blockid2 + blockid3;
/* copy single block */
for (int m=0; m<sz[2]; m++) {
for (int l=0; l<sz[1]; l++) {
memcpy(b + blockid*sz[0]*sz[1]*sz[2] + m*sz[0]*sz[1] + l*sz[0], x+(k+m)*n[0]*n[1]+(j+l)*n[0]+i, sz[0]*sizeof(double));
}
}
}
}
}
The latter requires a C99-compliant compiler (because of the way loop variables are declared). Your GCC 4.6.3 requires the -std=c99
option to enable C99 compliance. If no such compiler is available (are there still non-C99 compilers in general use?), you should add the private(i,l,m)
clause. You might also want to move the parallelisation to the outermost loop instead in order to minimise the OpenMP overhead.
Next post: How to safely parallelize a for loop