cuda内核不能同时执行

我试图探索我的Nvidia Quadro 4000的具有2.0功能的并发内核执行属性。

我使用2个不同的流,它们的运行方式如下所示:

  • 复制H2D两个不同的固定内存块
  • 运行内核
  • 将两个不同的块从设备复制(D2H)到固定内存
  • 两个流的内核完全相同,每个执行时间为190毫秒。

    在Visual Profiler(版本5.0)中,我预计两个内核都会同时开始执行,但它们仅重叠20毫秒。 这里是代码示例:

    
    //initiate the streams
    // NOTE(review): this is an excerpt -- the enclosing function header and the
    // declarations of def_img0/ref_img0/host01/in1/outX/width/CHK_ERR/TIME_KERNEL/
    // exhaustiveSearchKernel etc. are outside this snippet.
            cudaStream_t stream0,stream1;
            CHK_ERR(cudaStreamCreate(&stream0));
            CHK_ERR(cudaStreamCreate(&stream1));
            //allocate the memory on the GPU for stream0
            CHK_ERR(cudaMalloc((void **)&def_img0, width*height*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&ref_img0, width*height*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&outY_img0,width_size_for_out*height_size_for_out*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&outX_img0,width_size_for_out*height_size_for_out*sizeof(char)));
            //allocate the memory on the GPU for stream1
            CHK_ERR(cudaMalloc((void **)&def_img1, width*height*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&ref_img1, width*height*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&outY_img1,width_size_for_out*height_size_for_out*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&outX_img1,width_size_for_out*height_size_for_out*sizeof(char)));

            //allocate page-locked (pinned) host memory for stream0 -- required for
            //cudaMemcpyAsync to actually run asynchronously
            CHK_ERR(cudaHostAlloc((void**)&host01, width*height*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host02, width*height*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host03, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host04, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));

            //allocate page-locked (pinned) host memory for stream1
            CHK_ERR(cudaHostAlloc((void**)&host11, width*height*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host12, width*height*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host13, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host14, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));


            //stage identical input images into both streams' pinned buffers
            memcpy(host01,in1,width*height*sizeof(char));
            memcpy(host02,in2,width*height*sizeof(char));

            memcpy(host11,in1,width*height*sizeof(char));
            memcpy(host12,in2,width*height*sizeof(char));



            //CUDA events for wall-clock timing of the whole pipeline
            cudaEvent_t start, stop;
            float time;
            cudaEventCreate(&start);
            cudaEventCreate(&stop);

            dim3 dimBlock(CUDA_BLOCK_DIM, CUDA_BLOCK_DIM);
            dim3 Grid((width-SEARCH_RADIUS*2-1)/(dimBlock.x*4)+1, (height-SEARCH_RADIUS*2-1)/(dimBlock.y*4)+1);

            cudaEventRecord(start,0);
            // --------------------
            // Copy images to device
            // --------------------
            //enqueue copies of def stream0 and stream1
            CHK_ERR(cudaMemcpyAsync(def_img0, host01,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
            CHK_ERR(cudaMemcpyAsync(def_img1, host11,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));
            //enqueue copies of ref stream0 and stream1
            CHK_ERR(cudaMemcpyAsync(ref_img0, host02,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
            CHK_ERR(cudaMemcpyAsync(ref_img1, host12,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));

            // NOTE(review): these two host-blocking syncs drain BOTH streams before
            // any kernel is enqueued, forfeiting copy/compute overlap. Stream-ordering
            // already guarantees the copies finish before kernels in the same stream.
            CHK_ERR(cudaStreamSynchronize(stream0));
            CHK_ERR(cudaStreamSynchronize(stream1));

            //CALLING KERNEL
            //enqueue kernel in stream0 and stream1
            // NOTE(review): if the TIME_KERNEL macro synchronizes or records/waits on
            // events to time each launch, it would serialize the two kernels and
            // explain the mere ~20 ms overlap observed in the profiler -- verify its
            // definition.
            TIME_KERNEL((exhaustiveSearchKernel<CUDA_BLOCK_DIM*4,CUDA_BLOCK_DIM*4,SEARCH_RADIUS><<< Grid, dimBlock,0,stream0>>>(def_img0+SEARCH_RADIUS*width+SEARCH_RADIUS,ref_img0,outX_img0,outY_img0,width,width_size_for_out)),"exhaustiveSearchKernel stream0");
            TIME_KERNEL((exhaustiveSearchKernel<CUDA_BLOCK_DIM*4,CUDA_BLOCK_DIM*4,SEARCH_RADIUS><<< Grid, dimBlock,0,stream1>>>(def_img1+SEARCH_RADIUS*width+SEARCH_RADIUS,ref_img1,outX_img1,outY_img1,width,width_size_for_out)),"exhaustiveSearchKernel stream1");


            //Copy result back (D2H), asynchronously per stream
            CHK_ERR(cudaMemcpyAsync(host03, outX_img0, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream0));
            CHK_ERR(cudaMemcpyAsync(host13, outX_img1, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream1));

            CHK_ERR(cudaMemcpyAsync(host04, outY_img0, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream0));
            CHK_ERR(cudaMemcpyAsync(host14, outY_img1, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream1));


            //wait for all work in both streams to finish before reading host buffers
            CHK_ERR(cudaStreamSynchronize(stream0));
            CHK_ERR(cudaStreamSynchronize(stream1));

            cudaEventRecord( stop, 0 );
            cudaEventSynchronize( stop );
            cudaEventElapsedTime( &time, start, stop );
            // NOTE(review): "msn" is almost certainly a scrape-mangled "ms\n" -- the
            // backslash of the newline escape was lost when the page was extracted.
            printf("Elapsed time=%f msn",time);

            //only stream0's results (host03/host04) are copied to the output buffers;
            //stream1's results (host13/host14) are computed but discarded
            memcpy(outX,host03,width_size_for_out*height_size_for_out*sizeof(char));
            memcpy(outY,host04,width_size_for_out*height_size_for_out*sizeof(char));


            //teardown: events, streams, then full device reset
            //NOTE(review): pinned host buffers and cudaMalloc allocations are not
            //explicitly freed here; cudaDeviceReset reclaims device memory, but
            //cudaFreeHost/cudaFree calls are absent from this excerpt.
            cudaEventDestroy( start ); 
            cudaEventDestroy( stop );
            CHK_ERR(cudaStreamDestroy(stream0));
            CHK_ERR(cudaStreamDestroy(stream1));

            CHK_ERR(cudaDeviceReset());


        } 
    

    计算能力2.x-3.0

    计算能力2.x-3.0设备具有单个硬件工作队列。 CUDA驱动程序将命令推入工作队列。 GPU主机读取命令并将工作分发给复制引擎或CUDA工作分配器(CWD)。 CUDA驱动程序将同步命令插入到硬件工作队列中,以确保同一个流上的工作不能同时运行。 当主机点击一个同步命令时,它将停止,直到相关工作完成。

    当网格太小而无法填满整个GPU或网格具有尾部效应(线程块的子集执行比其他线程块长得多)时,并发内核执行可提高GPU利用率。

    案例1:在一个流中背靠背内核

    如果应用程序在同一个流上背靠背启动两个内核,则CUDA驱动程序插入的同步命令将不会将第二个内核调度到CWD,直到第一个内核完成。

    情况2:在两个流上背靠背启动内核

    如果应用程序在不同的流上启动两个内核,主机将读取命令并将命令分派给CWD。 CWD将栅格化第一个网格(顺序取决于体系结构)并将线程块分派给SM。 只有当所有来自第一个网格的线程块都被分派后,CWD才会从第二个网格分派线程块。

    计算能力3.5

    计算能力3.5引入了几项新功能来提高GPU利用率。 这些包括: - HyperQ支持多个独立的硬件工作队列。 - 动态并行性允许设备代码发起新的工作。 - CWD容量增加到32个网格。

    资源

  • CUDA C/C++ 流和并发 [网络研讨会]
  • NVIDIA Fermi白皮书
  • NVIDIA开普勒GK110架构白皮书
  • 链接地址: http://www.djcxy.com/p/80103.html

    上一篇: cuda kernels not executing concurrently

    下一篇: CUDA concurrent kernels serialize when using events