CUDA streams and concurrent kernel execution
I would like to use streams in order to parallelize the execution of kernels that work on separate device data arrays. Data were allocated on the device and filled from previous kernels.
I have written the following program that shows I can't reach my goal so far. Indeed, the kernels on two non-default streams execute sequentially in their respective streams.
The same behaviour is observed on 2 Intel machines with latest Debian linux version. One has a Tesla C2075 with CUDA 4.2 and the other has a Geforce 460GT with CUDA 5.0. The Visual Profiler shows sequential execution in both the 4.2 and also 5.0 CUDA version.
Here is the code:
#include <iostream>
#include <stdio.h>
#include <ctime>
#include <curand.h>
using namespace std;
// compile and run this way:
// nvcc cuStreamsBasics.cu -arch=sm_20 -o testCuStream -lcuda -lcufft -lcurand
// testCuStream 1024 512 512
/* -------------------------------------------------------------------------- */
// "useful" macros
/* -------------------------------------------------------------------------- */
#define MSG_ASSERT( CONDITION, MSG )
if (! (CONDITION))
{
std::cerr << std::endl << "Dynamic assertion `" #CONDITION "` failed in " << __FILE__
<< " line " << __LINE__ << ": <" << MSG << ">" << std::endl;
exit( 1 );
}
#define ASSERT( CONDITION )
MSG_ASSERT( CONDITION, " " )
// allocate data on the GPU memory, unpinned
#define CUDALLOC_GPU( _TAB, _DIM, _DATATYPE )
MSG_ASSERT(
cudaMalloc( (void**) &_TAB, _DIM * sizeof( _DATATYPE) )
== cudaSuccess , "failed CUDALLOC" );
/* -------------------------------------------------------------------------- */
// the CUDA kernels
/* -------------------------------------------------------------------------- */
// finds index in 1D array from sequential blocks
#define CUDAINDEX_1D
blockIdx.y * ( gridDim.x * blockDim.x ) +
blockIdx.x * blockDim.x +
threadIdx.x;
__global__ void
kernel_diva(float* data, float value, int array_size)
{
int i = CUDAINDEX_1D
if (i < array_size)
data[i] /= value;
}
__global__ void
kernel_jokea(float* data, float value, int array_size)
{
int i = CUDAINDEX_1D
if (i < array_size)
data[i] *= value + sin( double(i)) * 1/ cos( double(i) );
}
/* -------------------------------------------------------------------------- */
// usage
/* -------------------------------------------------------------------------- */
static void
usage(int argc, char **argv)
{
if ((argc -1) != 3)
{
printf("Usage: %s <dimx> <dimy> <dimz> n", argv[0]);
printf("do stuffn");
exit(1);
}
}
/* -------------------------------------------------------------------------- */
// main program, finally!
/* -------------------------------------------------------------------------- */
int
main(int argc, char** argv)
{
usage(argc, argv);
size_t x_dim = atoi( argv[1] );
size_t y_dim = atoi( argv[2] );
size_t z_dim = atoi( argv[3] );
cudaStream_t stream1, stream2;
ASSERT( cudaStreamCreate( &stream1 ) == cudaSuccess );
ASSERT( cudaStreamCreate( &stream2 ) == cudaSuccess );
size_t size = x_dim * y_dim * z_dim;
float *data1, *data2;
CUDALLOC_GPU( data1, size, float);
CUDALLOC_GPU( data2, size, float);
curandGenerator_t gen;
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
/* Set seed */
curandSetPseudoRandomGeneratorSeed(gen, 1234ULL);
/* Generate n floats on device */
curandGenerateUniform(gen, data1, size);
curandGenerateUniform(gen, data2, size);
dim3 dimBlock( z_dim, 1, 1);
dim3 dimGrid( x_dim, y_dim, 1);
clock_t start;
double diff;
cudaDeviceSynchronize();
start = clock();
kernel_diva <<< dimGrid, dimBlock>>>( data1, 5.55f, size);
kernel_jokea<<< dimGrid, dimBlock>>>( data1, 5.55f, size);
kernel_diva <<< dimGrid, dimBlock>>>( data2, 5.55f, size);
kernel_jokea<<< dimGrid, dimBlock>>>( data2, 5.55f, size);
cudaDeviceSynchronize();
diff = ( std::clock() - start ) / (double)CLOCKS_PER_SEC;
cout << endl << "sequential: " << diff;
cudaDeviceSynchronize();
start = clock();
kernel_diva <<< dimGrid, dimBlock, 0, stream1 >>>( data1, 5.55f, size);
kernel_diva <<< dimGrid, dimBlock, 0, stream2 >>>( data2, 5.55f, size);
kernel_jokea<<< dimGrid, dimBlock, 0, stream1 >>>( data1, 5.55f, size);
kernel_jokea<<< dimGrid, dimBlock, 0, stream2 >>>( data2, 5.55f, size);
cudaDeviceSynchronize();
diff = ( std::clock() - start ) / (double)CLOCKS_PER_SEC;
cout << endl << "parallel: " << diff;
cudaStreamDestroy( stream1 );
cudaStreamDestroy( stream2 );
return 0;
}
Typically, the dimension of the arrays is 512^3
single float
. I usually just cut the array in blocks of (512,1,1)
threads that I put on a grid of size (1<<15, (rest), 1)
.
Thank you in advance for any hint or comment.
Best regards.
I'm trying to provide an interpretation to why you do not see execution overlap of your two kernels. To this end, I have constructed the code reported below, which uses your two kernels and monitors which Streaming Multiprocessor (SM) each block runs on. I'm using CUDA 6.5 (Release Candidate) and I'm running on a GT540M card, which has only 2
SMs, so it provides a simple playground to work with. The blockSize
choice is delegated to the new CUDA 6.5 cudaOccupancyMaxPotentialBlockSize
facility.
THE CODE
#include <stdio.h>
#include <time.h>
//#define DEBUG_MODE
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**************************************************/
/* STREAMING MULTIPROCESSOR IDENTIFICATION NUMBER */
/**************************************************/
__device__ unsigned int get_smid(void) {
unsigned int ret;
asm("mov.u32 %0, %smid;" : "=r"(ret) );
return ret;
}
/************/
/* KERNEL 1 */
/************/
__global__ void kernel_1(float * __restrict__ data, const float value, int *sm, int N)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < N) {
data[i] = data[i] / value;
if (threadIdx.x==0) sm[blockIdx.x]=get_smid();
}
}
//__global__ void kernel_1(float* data, float value, int N)
//{
// int start = blockIdx.x * blockDim.x + threadIdx.x;
// for (int i = start; i < N; i += blockDim.x * gridDim.x)
// {
// data[i] = data[i] / value;
// }
//}
/************/
/* KERNEL 2 */
/************/
__global__ void kernel_2(float * __restrict__ data, const float value, int *sm, int N)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < N) {
data[i] = data[i] * (value + sin(double(i)) * 1./cos(double(i)));
if (threadIdx.x==0) sm[blockIdx.x]=get_smid();
}
}
//__global__ void kernel_2(float* data, float value, int N)
//{
// int start = blockIdx.x * blockDim.x + threadIdx.x;
// for (int i = start; i < N; i += blockDim.x * gridDim.x)
// {
// data[i] = data[i] * (value + sin(double(i)) * 1./cos(double(i)));
// }
//}
/********/
/* MAIN */
/********/
int main()
{
const int N = 10000;
const float value = 5.55f;
const int rep_num = 20;
// --- CPU memory allocations
float *h_data1 = (float*) malloc(N*sizeof(float));
float *h_data2 = (float*) malloc(N*sizeof(float));
float *h_data1_ref = (float*) malloc(N*sizeof(float));
float *h_data2_ref = (float*) malloc(N*sizeof(float));
// --- CPU data initializations
srand(time(NULL));
for (int i=0; i<N; i++) {
h_data1[i] = rand() / RAND_MAX;
h_data2[i] = rand() / RAND_MAX;
}
// --- GPU memory allocations
float *d_data1, *d_data2;
gpuErrchk(cudaMalloc((void**)&d_data1, N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_data2, N*sizeof(float)));
// --- CPU -> GPU memory transfers
gpuErrchk(cudaMemcpy(d_data1, h_data1, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_data2, h_data2, N*sizeof(float), cudaMemcpyHostToDevice));
// --- CPU data initializations
srand(time(NULL));
for (int i=0; i<N; i++) {
h_data1_ref[i] = h_data1[i] / value;
h_data2_ref[i] = h_data2[i] * (value + sin(double(i)) * 1./cos(double(i)));
}
// --- Stream creations
cudaStream_t stream1, stream2;
gpuErrchk(cudaStreamCreate(&stream1));
gpuErrchk(cudaStreamCreate(&stream2));
// --- Launch parameters configuration
int blockSize1, blockSize2, minGridSize1, minGridSize2, gridSize1, gridSize2;
cudaOccupancyMaxPotentialBlockSize(&minGridSize1, &blockSize1, kernel_1, 0, N);
cudaOccupancyMaxPotentialBlockSize(&minGridSize2, &blockSize2, kernel_2, 0, N);
gridSize1 = (N + blockSize1 - 1) / blockSize1;
gridSize2 = (N + blockSize2 - 1) / blockSize2;
// --- Allocating space for SM IDs
int *h_sm_11 = (int*) malloc(gridSize1*sizeof(int));
int *h_sm_12 = (int*) malloc(gridSize1*sizeof(int));
int *h_sm_21 = (int*) malloc(gridSize2*sizeof(int));
int *h_sm_22 = (int*) malloc(gridSize2*sizeof(int));
int *d_sm_11, *d_sm_12, *d_sm_21, *d_sm_22;
gpuErrchk(cudaMalloc((void**)&d_sm_11, gridSize1*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_sm_12, gridSize1*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_sm_21, gridSize2*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_sm_22, gridSize2*sizeof(int)));
// --- Timing individual kernels
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
for (int i=0; i<rep_num; i++) kernel_1<<<gridSize1, blockSize1>>>(d_data1, value, d_sm_11, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel 1 - elapsed time: %3.3f ms n", time/rep_num);
cudaEventRecord(start, 0);
for (int i=0; i<rep_num; i++) kernel_2<<<gridSize2, blockSize2>>>(d_data1, value, d_sm_21, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel 2 - elapsed time: %3.3f ms n", time/rep_num);
// --- No stream case
cudaEventRecord(start, 0);
kernel_1<<<gridSize1, blockSize1>>>(d_data1, value, d_sm_11, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_data1, d_data1, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Results check
for (int i=0; i<N; i++) {
if (h_data1[i] != h_data1_ref[i]) {
printf("Kernel1 - Error at i = %i; Host = %f; Device = %fn", i, h_data1_ref[i], h_data1[i]);
return;
}
}
#endif
kernel_2<<<gridSize2, blockSize2>>>(d_data1, value, d_sm_21, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
kernel_1<<<gridSize1, blockSize1>>>(d_data2, value, d_sm_12, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(d_data2, h_data2, N*sizeof(float), cudaMemcpyHostToDevice));
#endif
kernel_2<<<gridSize2, blockSize2>>>(d_data2, value, d_sm_22, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_data2, d_data2, N*sizeof(float), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) {
if (h_data2[i] != h_data2_ref[i]) {
printf("Kernel2 - Error at i = %i; Host = %f; Device = %fn", i, h_data2_ref[i], h_data2[i]);
return;
}
}
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("No stream - elapsed time: %3.3f ms n", time);
// --- Stream case
cudaEventRecord(start, 0);
kernel_1<<<gridSize1, blockSize1, 0, stream1 >>>(d_data1, value, d_sm_11, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
kernel_1<<<gridSize1, blockSize1, 0, stream2 >>>(d_data2, value, d_sm_12, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
kernel_2<<<gridSize2, blockSize2, 0, stream1 >>>(d_data1, value, d_sm_21, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
kernel_2<<<gridSize2, blockSize2, 0, stream2 >>>(d_data2, value, d_sm_22, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Stream - elapsed time: %3.3f ms n", time);
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
printf("Test passed!n");
gpuErrchk(cudaMemcpy(h_sm_11, d_sm_11, gridSize1*sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_sm_12, d_sm_12, gridSize1*sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_sm_21, d_sm_21, gridSize2*sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_sm_22, d_sm_22, gridSize2*sizeof(int), cudaMemcpyDeviceToHost));
printf("Kernel 1: gridSize = %i; blockSize = %in", gridSize1, blockSize1);
printf("Kernel 2: gridSize = %i; blockSize = %in", gridSize2, blockSize2);
for (int i=0; i<gridSize1; i++) {
printf("Kernel 1 - Data 1: blockNumber = %i; SMID = %dn", i, h_sm_11[i]);
printf("Kernel 1 - Data 2: blockNumber = %i; SMID = %dn", i, h_sm_12[i]);
}
for (int i=0; i<gridSize2; i++) {
printf("Kernel 2 - Data 1: blockNumber = %i; SMID = %dn", i, h_sm_21[i]);
printf("Kernel 2 - Data 2: blockNumber = %i; SMID = %dn", i, h_sm_22[i]);
}
cudaDeviceReset();
return 0;
}
KERNEL TIMINGS FOR N = 100
and N = 10000
N = 100
kernel_1 0.003ms
kernel_2 0.005ms
N = 10000
kernel_1 0.011ms
kernel_2 0.053ms
So, kernel 1 is more computationally expensive than kernel 2.
RESULTS FOR N = 100
Kernel 1: gridSize = 1; blockSize = 100
Kernel 2: gridSize = 1; blockSize = 100
Kernel 1 - Data 1: blockNumber = 0; SMID = 0
Kernel 1 - Data 2: blockNumber = 0; SMID = 1
Kernel 2 - Data 1: blockNumber = 0; SMID = 0
Kernel 2 - Data 2: blockNumber = 0; SMID = 1
In this case, each kernel is launched with only one block and this is the timeline.
As you can see, the overlap occurs. By looking at the above outcomes, the scheduler delivers the single blocks of the two calls to kernel 1 in parallel to the two available SMs and then does the same for kernel 2. This seems to be the main reason why overlap occurs.
RESULTS FOR N = 10000
Kernel 1: gridSize = 14; blockSize = 768
Kernel 2: gridSize = 10; blockSize = 1024
Kernel 1 - Data 1: blockNumber = 0; SMID = 0
Kernel 1 - Data 2: blockNumber = 0; SMID = 1
Kernel 1 - Data 1: blockNumber = 1; SMID = 1
Kernel 1 - Data 2: blockNumber = 1; SMID = 0
Kernel 1 - Data 1: blockNumber = 2; SMID = 0
Kernel 1 - Data 2: blockNumber = 2; SMID = 1
Kernel 1 - Data 1: blockNumber = 3; SMID = 1
Kernel 1 - Data 2: blockNumber = 3; SMID = 0
Kernel 1 - Data 1: blockNumber = 4; SMID = 0
Kernel 1 - Data 2: blockNumber = 4; SMID = 1
Kernel 1 - Data 1: blockNumber = 5; SMID = 1
Kernel 1 - Data 2: blockNumber = 5; SMID = 0
Kernel 1 - Data 1: blockNumber = 6; SMID = 0
Kernel 1 - Data 2: blockNumber = 6; SMID = 0
Kernel 1 - Data 1: blockNumber = 7; SMID = 1
Kernel 1 - Data 2: blockNumber = 7; SMID = 1
Kernel 1 - Data 1: blockNumber = 8; SMID = 0
Kernel 1 - Data 2: blockNumber = 8; SMID = 1
Kernel 1 - Data 1: blockNumber = 9; SMID = 1
Kernel 1 - Data 2: blockNumber = 9; SMID = 0
Kernel 1 - Data 1: blockNumber = 10; SMID = 0
Kernel 1 - Data 2: blockNumber = 10; SMID = 0
Kernel 1 - Data 1: blockNumber = 11; SMID = 1
Kernel 1 - Data 2: blockNumber = 11; SMID = 1
Kernel 1 - Data 1: blockNumber = 12; SMID = 0
Kernel 1 - Data 2: blockNumber = 12; SMID = 1
Kernel 1 - Data 1: blockNumber = 13; SMID = 1
Kernel 1 - Data 2: blockNumber = 13; SMID = 0
Kernel 2 - Data 1: blockNumber = 0; SMID = 0
Kernel 2 - Data 2: blockNumber = 0; SMID = 0
Kernel 2 - Data 1: blockNumber = 1; SMID = 1
Kernel 2 - Data 2: blockNumber = 1; SMID = 1
Kernel 2 - Data 1: blockNumber = 2; SMID = 1
Kernel 2 - Data 2: blockNumber = 2; SMID = 0
Kernel 2 - Data 1: blockNumber = 3; SMID = 0
Kernel 2 - Data 2: blockNumber = 3; SMID = 1
Kernel 2 - Data 1: blockNumber = 4; SMID = 1
Kernel 2 - Data 2: blockNumber = 4; SMID = 0
Kernel 2 - Data 1: blockNumber = 5; SMID = 0
Kernel 2 - Data 2: blockNumber = 5; SMID = 1
Kernel 2 - Data 1: blockNumber = 6; SMID = 1
Kernel 2 - Data 2: blockNumber = 6; SMID = 0
Kernel 2 - Data 1: blockNumber = 7; SMID = 0
Kernel 2 - Data 2: blockNumber = 7; SMID = 1
Kernel 2 - Data 1: blockNumber = 8; SMID = 1
Kernel 2 - Data 2: blockNumber = 8; SMID = 0
Kernel 2 - Data 1: blockNumber = 9; SMID = 0
Kernel 2 - Data 2: blockNumber = 9; SMID = 1
This is the timeline:
In this case, no overlap occurs. According to the above outcomes, this does not mean that the two SMs are not simultaneously exploited, but (I think) that, due to the larger number of blocks to be launched, assigning two blocks of different kernels or the two blocks of the same kernel does not make much difference in terms of performance and thus the scheduler chooses the second option.
I have tested that, considering more work done per thread, the behavior keeps the same.
链接地址: http://www.djcxy.com/p/80108.html上一篇: 同步两个CUDA流
下一篇: CUDA流和并发内核执行