cuda kernels not executing concurrently

I'm trying to explore concurrent kernel execution on my NVIDIA Quadro 4000, which has compute capability 2.0.

I use 2 different streams, each of which does the same work:

  • Copy two different chunks of pinned memory H2D
  • Run the kernel
  • Copy two different chunks D2H back to pinned memory

    The kernels of both streams are identical and each takes 190 ms to execute.

    In the Visual Profiler (version 5.0) I expected both kernels to start executing simultaneously; however, they overlap by only 20 ms. Here is the code sample:

    
    //initiate the streams
            cudaStream_t stream0,stream1;
            CHK_ERR(cudaStreamCreate(&stream0));
            CHK_ERR(cudaStreamCreate(&stream1));
            //allocate the memory on the GPU for stream0
            CHK_ERR(cudaMalloc((void **)&def_img0, width*height*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&ref_img0, width*height*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&outY_img0,width_size_for_out*height_size_for_out*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&outX_img0,width_size_for_out*height_size_for_out*sizeof(char)));
            //allocate the memory on the GPU for stream1
            CHK_ERR(cudaMalloc((void **)&def_img1, width*height*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&ref_img1, width*height*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&outY_img1,width_size_for_out*height_size_for_out*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&outX_img1,width_size_for_out*height_size_for_out*sizeof(char)));
    
            //allocate page-locked memory for stream0
            CHK_ERR(cudaHostAlloc((void**)&host01, width*height*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host02, width*height*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host03, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host04, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
    
            //allocate page-locked memory for stream1
            CHK_ERR(cudaHostAlloc((void**)&host11, width*height*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host12, width*height*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host13, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host14, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
    
    
            memcpy(host01,in1,width*height*sizeof(char));
            memcpy(host02,in2,width*height*sizeof(char));
    
            memcpy(host11,in1,width*height*sizeof(char));
            memcpy(host12,in2,width*height*sizeof(char));
    
    
    
            cudaEvent_t start, stop;
            float time;
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
    
            dim3 dimBlock(CUDA_BLOCK_DIM, CUDA_BLOCK_DIM);
            dim3 Grid((width-SEARCH_RADIUS*2-1)/(dimBlock.x*4)+1, (height-SEARCH_RADIUS*2-1)/(dimBlock.y*4)+1);
    
            cudaEventRecord(start,0);
            // --------------------
            // Copy images to device
            // --------------------
            //enqueue copies of def stream0 and stream1
            CHK_ERR(cudaMemcpyAsync(def_img0, host01,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
            CHK_ERR(cudaMemcpyAsync(def_img1, host11,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));
            //enqueue copies of ref stream0 and stream1
            CHK_ERR(cudaMemcpyAsync(ref_img0, host02,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
            CHK_ERR(cudaMemcpyAsync(ref_img1, host12,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));
    
            CHK_ERR(cudaStreamSynchronize(stream0));
            CHK_ERR(cudaStreamSynchronize(stream1));
    
            //CALLING KERNEL
            //enqueue kernel in stream0 and stream1
            TIME_KERNEL((exhaustiveSearchKernel<CUDA_BLOCK_DIM*4,CUDA_BLOCK_DIM*4,SEARCH_RADIUS><<< Grid, dimBlock,0,stream0>>>(def_img0+SEARCH_RADIUS*width+SEARCH_RADIUS,ref_img0,outX_img0,outY_img0,width,width_size_for_out)),"exhaustiveSearchKernel stream0");
            TIME_KERNEL((exhaustiveSearchKernel<CUDA_BLOCK_DIM*4,CUDA_BLOCK_DIM*4,SEARCH_RADIUS><<< Grid, dimBlock,0,stream1>>>(def_img1+SEARCH_RADIUS*width+SEARCH_RADIUS,ref_img1,outX_img1,outY_img1,width,width_size_for_out)),"exhaustiveSearchKernel stream1");
    
    
            //Copy result back
            CHK_ERR(cudaMemcpyAsync(host03, outX_img0, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream0));
            CHK_ERR(cudaMemcpyAsync(host13, outX_img1, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream1));
    
            CHK_ERR(cudaMemcpyAsync(host04, outY_img0, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream0));
            CHK_ERR(cudaMemcpyAsync(host14, outY_img1, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream1));
    
    
            CHK_ERR(cudaStreamSynchronize(stream0));
            CHK_ERR(cudaStreamSynchronize(stream1));
    
            cudaEventRecord( stop, 0 );
            cudaEventSynchronize( stop );
            cudaEventElapsedTime( &time, start, stop );
            printf("Elapsed time=%f ms\n", time);
    
            memcpy(outX,host03,width_size_for_out*height_size_for_out*sizeof(char));
            memcpy(outY,host04,width_size_for_out*height_size_for_out*sizeof(char));
    
    
            cudaEventDestroy( start ); 
            cudaEventDestroy( stop );
            CHK_ERR(cudaStreamDestroy(stream0));
            CHK_ERR(cudaStreamDestroy(stream1));
    
            CHK_ERR(cudaDeviceReset());
    
    
        } 
    

    Compute Capability 2.x-3.0

    Compute capability 2.x-3.0 devices have a single hardware work queue. The CUDA driver pushes commands into the work queue. The GPU host reads the commands and dispatches the work to the copy engines or the CUDA Work Distributor (CWD). The CUDA driver inserts synchronization commands into the hardware work queue to guarantee that work on the same stream is not able to run concurrently. When the host hits a synchronization command it will stall until the dependent work is completed.
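
    For reference, here is a minimal sketch (placeholder names throughout: dummyKernel, the d*/h* buffers, and the helper function are not part of the code above) of how issue order interacts with that single queue. Grouping commands breadth-first keeps one stream's dependent copy from stalling the queue in front of another stream's commands.

        // Hypothetical helper, not the poster's code: compare depth-first and
        // breadth-first issue order on a single-hardware-queue (CC 2.x) device.
        __global__ void dummyKernel(char *data, int n)
        {
            int i = blockIdx.x * blockDim.x + threadIdx.x;
            if (i < n) data[i] += 1;
        }

        void issueBreadthFirst(char *d0, char *d1, char *h0, char *h1, size_t n,
                               cudaStream_t s0, cudaStream_t s1)
        {
            dim3 block(256);
            dim3 grid((unsigned int)((n + block.x - 1) / block.x));

            // Depth-first order would be: H2D(s0), kernel(s0), D2H(s0), then the
            // same three commands for s1. The synchronization the driver inserts
            // before D2H(s0) stalls the single queue, so s1's H2D copy cannot
            // even start until s0's kernel has finished.
            //
            // Breadth-first order groups commands by type, so no stream's work
            // is queued between another stream's kernel and its dependent copy:
            cudaMemcpyAsync(d0, h0, n, cudaMemcpyHostToDevice, s0);
            cudaMemcpyAsync(d1, h1, n, cudaMemcpyHostToDevice, s1);
            dummyKernel<<<grid, block, 0, s0>>>(d0, (int)n);
            dummyKernel<<<grid, block, 0, s1>>>(d1, (int)n);
            cudaMemcpyAsync(h0, d0, n, cudaMemcpyDeviceToHost, s0);
            cudaMemcpyAsync(h1, d1, n, cudaMemcpyDeviceToHost, s1);
        }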

    Concurrent kernel execution improves GPU utilization when a grid is too small to fill the entire GPU or when grids have a tail effect (a subset of thread blocks executes much longer than the other thread blocks).

    Case 1: Back to back kernels on one stream

    If an application launches two kernels back to back on the same stream, the synchronization command inserted by the CUDA driver will not dispatch the 2nd kernel to CWD until the first kernel has completed.
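
    A minimal sketch of this case (the work() kernel, the buffer, and the helper function are hypothetical, not taken from the code above):

        // Both launches are enqueued in the same stream, so the driver-inserted
        // synchronization keeps them strictly ordered.
        __global__ void work(char *data, int n)
        {
            int i = blockIdx.x * blockDim.x + threadIdx.x;
            if (i < n) data[i] += 1;
        }

        void backToBackSameStream(char *d_buf, int n, cudaStream_t s)
        {
            dim3 block(256), grid((n + block.x - 1) / block.x);
            work<<<grid, block, 0, s>>>(d_buf, n);  // dispatched to CWD and run to completion
            work<<<grid, block, 0, s>>>(d_buf, n);  // held in the queue until the first launch completes
        }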

    Case 2: Back to back kernel launches on two streams

    If an application launches two kernels on different streams, the host will read the commands and dispatch them to CWD. CWD will rasterize the first grid (the order is architecture dependent) and dispatch its thread blocks to the SMs. Only when all of the thread blocks from the first grid have been dispatched will CWD dispatch thread blocks from the second grid.
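
    And the same launches issued on two streams (same hypothetical work() kernel and buffers as the sketch above). There is no driver-inserted dependency between them, but on these devices CWD still dispatches every thread block of the first grid before any block of the second, so two grids that each fill the GPU only overlap during the first grid's tail, which is consistent with the ~20 ms overlap seen in the profiler above.

        void backToBackTwoStreams(char *d_buf0, char *d_buf1, int n,
                                  cudaStream_t s0, cudaStream_t s1)
        {
            dim3 block(256), grid((n + block.x - 1) / block.x);
            work<<<grid, block, 0, s0>>>(d_buf0, n);  // grid 0: all thread blocks dispatched first
            work<<<grid, block, 0, s1>>>(d_buf1, n);  // grid 1: dispatched only after grid 0 is fully dispatched
        }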

    Compute Capability 3.5

    Compute capability 3.5 introduced several new features to improve GPU utilization. These include:

  • HyperQ supports multiple independent hardware work queues.
  • Dynamic Parallelism allows device code to launch new work.
  • CWD capacity was increased to 32 grids.
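
    A small sketch for checking what a given device reports (standard runtime API calls; the comment about CUDA_DEVICE_MAX_CONNECTIONS is an assumption about the environment variable that controls how many hardware queues the runtime uses on HyperQ-capable parts):

        #include <cstdio>
        #include <cuda_runtime.h>

        // Print the properties relevant to this discussion. A CC 2.0 device like
        // the Quadro 4000 should report concurrentKernels = 1, yet it still has
        // only one hardware work queue. On CC 3.5+ parts with HyperQ, the
        // CUDA_DEVICE_MAX_CONNECTIONS environment variable is believed to control
        // how many of the hardware queues the runtime uses.
        int main()
        {
            cudaDeviceProp prop;
            cudaGetDeviceProperties(&prop, 0);
            printf("%s: compute capability %d.%d\n", prop.name, prop.major, prop.minor);
            printf("concurrentKernels = %d\n", prop.concurrentKernels);
            printf("asyncEngineCount  = %d\n", prop.asyncEngineCount);
            return 0;
        }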

    Resources

  • CUDA C/C++ Streams and Concurrency [webinar]
  • NVIDIA Fermi Whitepaper
  • NVIDIA Kepler GK110 Architecture Whitepaper