cuda内核不能同时执行

我试图探索我的Nvidia Quadro 4000的具有2.0功能的并发内核执行属性。

我使用2个不同的流,它们的运行方式如下所示:

  • 复制H2D两个不同的固定内存块
  • 运行内核
  • 将两个不同的块从设备复制(D2H)到固定内存
  • 两个流的内核完全相同,每个执行时间为190毫秒。

    在Visual Profiler(版本5.0)中,我预计两个内核都会同时开始执行,但它们仅重叠20毫秒。 这里是代码示例:

    
    //initiate the streams
    // NOTE(review): this is an excerpt -- the enclosing function header and the
    // declarations of def_img0/ref_img0/host01/in1/outX/width/CHK_ERR/TIME_KERNEL/
    // exhaustiveSearchKernel etc. are outside this snippet.
            cudaStream_t stream0,stream1;
            CHK_ERR(cudaStreamCreate(&stream0));
            CHK_ERR(cudaStreamCreate(&stream1));
            //allocate the memory on the GPU for stream0
            CHK_ERR(cudaMalloc((void **)&def_img0, width*height*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&ref_img0, width*height*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&outY_img0,width_size_for_out*height_size_for_out*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&outX_img0,width_size_for_out*height_size_for_out*sizeof(char)));
            //allocate the memory on the GPU for stream1
            CHK_ERR(cudaMalloc((void **)&def_img1, width*height*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&ref_img1, width*height*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&outY_img1,width_size_for_out*height_size_for_out*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&outX_img1,width_size_for_out*height_size_for_out*sizeof(char)));

            //allocate page-locked (pinned) host memory for stream0 -- required for
            //cudaMemcpyAsync to actually run asynchronously
            CHK_ERR(cudaHostAlloc((void**)&host01, width*height*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host02, width*height*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host03, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host04, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));

            //allocate page-locked (pinned) host memory for stream1
            CHK_ERR(cudaHostAlloc((void**)&host11, width*height*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host12, width*height*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host13, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host14, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));


            //stage identical input images into both streams' pinned buffers
            memcpy(host01,in1,width*height*sizeof(char));
            memcpy(host02,in2,width*height*sizeof(char));

            memcpy(host11,in1,width*height*sizeof(char));
            memcpy(host12,in2,width*height*sizeof(char));



            //CUDA events for wall-clock timing of the whole pipeline
            cudaEvent_t start, stop;
            float time;
            cudaEventCreate(&start);
            cudaEventCreate(&stop);

            dim3 dimBlock(CUDA_BLOCK_DIM, CUDA_BLOCK_DIM);
            dim3 Grid((width-SEARCH_RADIUS*2-1)/(dimBlock.x*4)+1, (height-SEARCH_RADIUS*2-1)/(dimBlock.y*4)+1);

            cudaEventRecord(start,0);
            // --------------------
            // Copy images to device
            // --------------------
            //enqueue copies of def stream0 and stream1
            CHK_ERR(cudaMemcpyAsync(def_img0, host01,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
            CHK_ERR(cudaMemcpyAsync(def_img1, host11,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));
            //enqueue copies of ref stream0 and stream1
            CHK_ERR(cudaMemcpyAsync(ref_img0, host02,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
            CHK_ERR(cudaMemcpyAsync(ref_img1, host12,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));

            // NOTE(review): these two host-blocking syncs drain BOTH streams before
            // any kernel is enqueued, forfeiting copy/compute overlap. Stream-ordering
            // already guarantees the copies finish before kernels in the same stream.
            CHK_ERR(cudaStreamSynchronize(stream0));
            CHK_ERR(cudaStreamSynchronize(stream1));

            //CALLING KERNEL
            //enqueue kernel in stream0 and stream1
            // NOTE(review): if the TIME_KERNEL macro synchronizes or records/waits on
            // events to time each launch, it would serialize the two kernels and
            // explain the mere ~20 ms overlap observed in the profiler -- verify its
            // definition.
            TIME_KERNEL((exhaustiveSearchKernel<CUDA_BLOCK_DIM*4,CUDA_BLOCK_DIM*4,SEARCH_RADIUS><<< Grid, dimBlock,0,stream0>>>(def_img0+SEARCH_RADIUS*width+SEARCH_RADIUS,ref_img0,outX_img0,outY_img0,width,width_size_for_out)),"exhaustiveSearchKernel stream0");
            TIME_KERNEL((exhaustiveSearchKernel<CUDA_BLOCK_DIM*4,CUDA_BLOCK_DIM*4,SEARCH_RADIUS><<< Grid, dimBlock,0,stream1>>>(def_img1+SEARCH_RADIUS*width+SEARCH_RADIUS,ref_img1,outX_img1,outY_img1,width,width_size_for_out)),"exhaustiveSearchKernel stream1");


            //Copy result back (D2H), asynchronously per stream
            CHK_ERR(cudaMemcpyAsync(host03, outX_img0, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream0));
            CHK_ERR(cudaMemcpyAsync(host13, outX_img1, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream1));

            CHK_ERR(cudaMemcpyAsync(host04, outY_img0, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream0));
            CHK_ERR(cudaMemcpyAsync(host14, outY_img1, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream1));


            //wait for all work in both streams to finish before reading host buffers
            CHK_ERR(cudaStreamSynchronize(stream0));
            CHK_ERR(cudaStreamSynchronize(stream1));

            cudaEventRecord( stop, 0 );
            cudaEventSynchronize( stop );
            cudaEventElapsedTime( &time, start, stop );
            // NOTE(review): "msn" is almost certainly a scrape-mangled "ms\n" -- the
            // backslash of the newline escape was lost when the page was extracted.
            printf("Elapsed time=%f msn",time);

            //only stream0's results (host03/host04) are copied to the output buffers;
            //stream1's results (host13/host14) are computed but discarded
            memcpy(outX,host03,width_size_for_out*height_size_for_out*sizeof(char));
            memcpy(outY,host04,width_size_for_out*height_size_for_out*sizeof(char));


            //teardown: events, streams, then full device reset
            //NOTE(review): pinned host buffers and cudaMalloc allocations are not
            //explicitly freed here; cudaDeviceReset reclaims device memory, but
            //cudaFreeHost/cudaFree calls are absent from this excerpt.
            cudaEventDestroy( start ); 
            cudaEventDestroy( stop );
            CHK_ERR(cudaStreamDestroy(stream0));
            CHK_ERR(cudaStreamDestroy(stream1));

            CHK_ERR(cudaDeviceReset());


        } 
    

    计算能力2.x-3.0

    计算能力2.x-3.0设备具有单个硬件工作队列。 CUDA驱动程序将命令推入工作队列。 GPU主机读取命令并将工作分发给复制引擎或CUDA工作分配器(CWD)。 CUDA驱动程序将同步命令插入到硬件工作队列中,以确保同一个流上的工作不能同时运行。 当主机点击一个同步命令时,它将停止,直到相关工作完成。

    当网格太小而无法填满整个GPU或网格具有尾部效应(线程块的子集执行比其他线程块长得多)时,并发内核执行可提高GPU利用率。

    案例1:在一个流中背靠背内核

    如果应用程序在同一个流上背靠背启动两个内核,则CUDA驱动程序插入的同步命令将不会将第二个内核调度到CWD,直到第一个内核完成。

    情况2:在两个流上背靠背启动内核

    如果应用程序在不同的流上启动两个内核,主机将读取命令并将命令分派给CWD。 CWD将栅格化第一个网格(顺序取决于体系结构)并将线程块分派给SM。 只有当所有来自第一个网格的线程块都被分派后,CWD才会从第二个网格分派线程块。

    计算能力3.5

    计算能力3.5引入了几项新功能来提高GPU利用率。 这些包括: - HyperQ支持多个独立的硬件工作队列。 - 动态并行性允许设备代码发起新的工作。 - CWD容量增加到32个网格。

    资源

  • CUDA C/C++ 流和并发 [网络研讨会]
  • NVIDIA Fermi白皮书
  • NVIDIA开普勒GK110架构白皮书
  • 链接地址: http://www.djcxy.com/p/80103.html

    上一篇: cuda kernels not executing concurrently

    下一篇: CUDA concurrent kernels serialize when using events