Example of increasing the work per thread in CUDA

Intro : First and as an introduction, i am quite "proud" to ask my first question on StackOverflow. I hope I'll be able to help other people as much as they help me.

Algorithm :

I'm writing a program with CUDA and the problem is the following:

  • Two matrices A (n * 128) and B (m * 128)

  • I take the first row of A, and I compute the distance between that vector and all the rows of B, one by one.

  • I write the result of each distance on a row of a matrix C, so the element C(i,j) of C contains the distance between row i of A and row j of B.

  • and I proceed with the next row of A.

  • I've implemented it this way: I've got a grid made by ( n * m ) blocks, and 128 threads per block. ( 1 * 128 ).

    QUESTION : The program runs successfully with the expected results but the time execution is only around 5 to 10 times faster than the one-threaded CPU version of it. So I would like to know how to increase the work per thread before reduction in order to increase performance .

    Kernel code (original : Not optimized)

     __global__ void EuclideanDistances( float *A, float *B , float *C , int n , int m)
    {
        // SIZE is equal to 128
    __shared__ float accumResult[SIZE];
    float sA;
    float sB;
    
        // MAPPING
    int bx = blockIdx.x;  // n
    int by = blockIdx.y;  // m
    int ty = threadIdx.y; // 128
    int tx = threadIdx.x; // 1
    
    
    sA = A [bx * SIZE + ty];
    sB = B [by * SIZE + ty];
    __syncthreads();
    
    
    accumResult[ty] = (sA - sB) * (sA - sB);
    __syncthreads();
    
    
    // Parallel tree-reduction
    for (int stride = SIZE/2 ; stride > 0 ; stride >>= 1)
        if (ty < stride)
        {
            accumResult[ty] += accumResult [stride + ty];
              __syncthreads();
        }
    
        // Writing results to output matrix
    if ((threadIdx.y == 0))
        C [bx * m + by] = accumResult[ty];
           __syncthreads();
    }
    

    UPDATE

    Now, I'm using another mapping : Instead of taking a grid of n by m blocks and a block of 128 threads, I'm increasing the number of threads within a block in order to decrease the number of blocks.

    New mapping:

    Block of 128 by 8 threads (total of 1024 threads, which is the max size)

    Grid of n/8 by m/8 blocks

    Unfortunately, it's giving wrong results ).

    Optimized kernel code (to be updated)

    __global__ void EuclideanDistances( float *A, float *B , float *C, int n , int m)
    {
        __shared__ float accumResult[SIZE][8];
    __shared__ float sA[SIZE][8];
    __shared__ float sB[SIZE][8];
    
    int bx = blockIdx.x;  // n / 8
    int by = blockIdx.y;  // m / 8
    int tx = threadIdx.x; // 8
    int ty = threadIdx.y; // 128
    int i = bx * tx * SIZE + ty;
    int j = by * tx * SIZE + ty;
    
    sA[ty][tx] = A [i];
    sB[ty][tx] = B[j];
    __syncthreads();
    
    
    accumResult[ty][tx] = (sA[ty][tx] - sB[ty][tx]) * (sA[ty][tx] - sB[ty][tx]);
    __syncthreads();
    
    // Reduction
    for (int stride = SIZE/2 ; stride > 0 ; stride>>=1)
        if (ty < stride)
        {
            accumResult[ty][tx] += accumResult [stride + ty][tx];
            __syncthreads();
        }
    
        C[bx *  m + by] = accumResult[0][tx];
    }
    

    HOST CODE (allocations + kernel calls)

        int main()
    {
         int m = 20000; //MatrixA size : m * SIZE
         int n = 4000;  //MatrixB size : n * SIZE
    
         srand((unsigned)time(0));
    
         // Host Allocations
         float *matrixA = (float *) malloc (n * SIZE * sizeof(float));
         for(int i=0; i < n * SIZE; i++)
             matrixA[i] = (float) (rand()%100)+1;
    
         float *matrixB = (float *) malloc (m * SIZE * sizeof(float));
         for(int i=0; i < m * SIZE; i++)
             matrixB[i] = (float) (rand()%100)+1;
    
         float *results_kernel1 = (float *) malloc (n * m * sizeof(float));
         float *results_kernel2 = (float *) malloc (n * m * sizeof(float));
    
    
         //Device Allocation
         float *d_matrixA;
         float *d_matrixB;
         cudaMalloc((void **)&d_matrixA, n * SIZE * sizeof(float));
         cudaMalloc((void **)&d_matrixB, m * SIZE * sizeof(float));
         cudaMemcpy(d_matrixA , matrixA , n * SIZE * sizeof(float) , cudaMemcpyHostToDevice);
         cudaMemcpy(d_matrixB , matrixB , m * SIZE * sizeof(float) , cudaMemcpyHostToDevice);
    
         float *d_results_kernel1;
         float *d_results_kernel2;
         cudaMalloc((void **)&d_results_kernel1 , n * m * sizeof(float));
         cudaMalloc((void **)&d_results_kernel2 , n * m * sizeof(float));
    
         dim3 threads1 (1 , 128);
         dim3 blocks1  (n , m);
         EuclideanDistances1 <<<blocks1 , threads1>>> (d_matrixA , d_matrixB , d_results_kernel1 , n , m);
         cudaDeviceSynchronize();
         cudaMemcpy(results_kernel1 , d_results_kernel1 , n * m *sizeof(float) , cudaMemcpyDeviceToHost);
         cudaFree(d_results_kernel1);
    
         dim3 threads2 (8 , 128);   // 1024 threads per block (maximum)
         dim3 blocks2  (ceil((float)n/8) , ceil((float)m/8));
         EuclideanDistances2 <<<blocks2 , threads2>>> (d_matrixA , d_matrixB , d_results_kernel2 , n , m);
         cudaDeviceSynchronize();
         cudaMemcpy(results_kernel2 , d_results_kernel2 , n * m *sizeof(float) , cudaMemcpyDeviceToHost);
         cudaFree(d_results_kernel2);
    
         // Visualising and comparing results
         for (int i = 0 ; i < 50 ; i++)
             std::cout << "kernel1 : " << results_kernel1[i] << "  |  kernel2 : " << results_kernel2[i] << std::endl;
    
         free(matrixA);
         free(matrixB);
         free(results_kernel1);
         free(results_kernel2);
    
         return 0;
    }
    

    PS : I have CUDA 6.0 with a NVIDIA GTX 650 (compute capability 3.0)


    It seems your question has 2 components:

  • why isn't my second kernel working?
  • how do I make my code run faster?
  • Why isn't my second kernel working?

    You had several issues:

  • indexing problems in initial calculation of i , j as well as the index for storing the C value.
  • violation of usage of _syncthreads() inside a conditional block
  • item 1 was the key element to get the code working.

    How do I make my code run faster?

    This is more involved. First of all, your attempt at "increasing work per thread" didn't do anything of the kind, it was merely an increase in the number of threads per block (from 128 to 8*128). Each thread was doing approximately the same amount of work. Furthermore, in the process of going to a 2D threadblock for this attempt, I believe a couple of bad things happened:

  • various coalescing and shared-memory-bank-conflict load and store patterns were broken.
  • effective occupancy went down, due the amount of shared memory required per block.
  • The net effect of the second kernel was to approximately double the execution time. So that is not what we want.

    However, increasing work per thread may be a good idea, along with using shared memory, as well as trying to preserve good (global, shared) memory access patterns, as well as allowing for increased occupancy.

    What follows is a work-in-progress along those lines. The following code has your second kernel fixed, along with timing infrastructure, as well as full data verification, as well as 2 new kernels. The first new kernel (#3) is what I would call a "naive" kernel. It simply allocates one thread per output point, and each thread loops through the necessary vectors, computing its individual result. No usage of shared memory, or even much attention to coalescing or any other optimization. However with a tweak to threadblock configuration (16,16) -> (8,32) threads, which I observed from @talonmies answer (now deleted), this kernel performs significantly (3x) faster than your "fast" kernel. After further thought about the (8,32) observation, I concluded that the next attempt at optimization should focus on:

  • elimination of the usage of a parallel reduction to compute the vector distance (ie allow adjacent threads to use a straight for-loop to loop through the vectors)
  • maximization of benefit from the cache
  • efficient usage of shared memory
  • insist on perfect global coalescing/perfect usage of shared memory for all reads and writes
  • Item 4 prompted the question in the comments "may I transpose the matrices?" With this permission, it's possible to re-organize the data to facilitate item 4 above. Item 2 above is addressed in my "fast" kernel (#4) by loading the B vector into shared memory, while allowing the cache to mostly focus on caching the A vectors, hopefully reducing cache-thrashing (A is the smaller of the 2 vector arrays, at about 2MB - fermi L2 is 768K, Kepler L2 is 1.5MB). By delivering A in transposed form, and effectively "transposing" B on-chip from shared memory, it's possible to use a straight for-loop to compute the vector distance, while allowing adjacent threads to have perfectly coalesced reads and writes, as well as "efficient" use of shared memory (ie non-bank-conflicted loads, and broadcast reads).

    For my particular timing, (Quadro5000 cc2.0 GPU, CUDA 6, RHEL 5.5) I see that your "fast" kernel requires about 2 seconds, my "naive" kernel requires about 0.7 seconds, and my "fast" kernel requires about 0.2 seconds, albeit with transposed (A,C) data.

    EDIT: I've made one additional optimization, that is to have each block compute multiple ( CHKSIZE ) B vectors at one time. You can set CHKSIZE to 1 to see the previous result (~0.2sec). I found CHKSIZE of 4 gave good improvement. This is an attack at attempting to exploit the data re-use of A. With this additional optimization at CHKSIZE of 4, the kernel time for kernel 4 drops to about 0.1 second.

    Following is the code and a sample run:

    $ cat t460.cu 
    #include <stdio.h>
    #include <stdlib.h>
    #include <iostream>
    
    // both M and N must be evenly divisible by SIZE, M must be evenly divisible by CHKSIZE
    #define SIZE 128
    #define N 4000
    #define M 20000
    #define CHKSIZE 4
    
     __global__ void EuclideanDistances1( float *A, float *B , float *C , int n , int m)
    {
        // SIZE is equal to 128
    __shared__ float accumResult[SIZE];
    float sA;
    float sB;
    
        // MAPPING
    int bx = blockIdx.x;  // n
    int by = blockIdx.y;  // m
    int ty = threadIdx.y; // 128
    //int tx = threadIdx.x; // 1
    
    sA = A [bx * SIZE + ty];
    sB = B [by * SIZE + ty];
    __syncthreads();
    
    accumResult[ty] = (sA - sB) * (sA - sB);
    __syncthreads();
    
    // Parallel tree-reduction
    for (int stride = SIZE/2 ; stride > 0 ; stride >>= 1){
        if (ty < stride)
        {
            accumResult[ty] += accumResult [stride + ty];
        }
              __syncthreads();
      }
    
        // Writing results to output matrix
    if ((ty == 0))
        C [bx * m + by] = accumResult[ty];
           __syncthreads();
    }
    
    __global__ void EuclideanDistances2( float *A, float *B , float *C, int n , int m)
    {
    __shared__ float accumResult[SIZE][8];
    __shared__ float sA[SIZE][8];
    __shared__ float sB[SIZE][8];
    
    int bx = blockIdx.x;  // n / 8
    int by = blockIdx.y;  // m
    int tx = threadIdx.x; // 8
    int ty = threadIdx.y; // 128
    int i = ((bx*8) + tx) * SIZE + ty;
    int j = by * SIZE + ty;
    
    sA[ty][tx] = A[i];
    sB[ty][tx] = B[j];
    __syncthreads();
    
    accumResult[ty][tx] = (sA[ty][tx] - sB[ty][tx]) * (sA[ty][tx] - sB[ty][tx]);
    __syncthreads();
    
    // Reduction
    for (int stride = SIZE/2 ; stride > 0 ; stride>>=1){
        if (ty < stride)
        {
            accumResult[ty][tx] += accumResult [stride + ty][tx];
        }
        __syncthreads();
      }
    
    if (ty == 0)
        C[((bx*8)+tx) *  m + by] = accumResult[0][tx];
    }
    //naive kernel
    __global__ void EuclideanDistances3( float *A, float *B , float *C, int n , int m){
      int idx = threadIdx.x+blockDim.x*blockIdx.x;
      int idy = threadIdx.y+blockDim.y*blockIdx.y;
      float result = 0.0f;
    
      if ((idx < n) && (idy < m)){
        for (int i = 0; i < SIZE; i++){
          float temp = A[(idx*SIZE)+i] - B[(idy*SIZE)+i];
          result += temp * temp;}
        C[(idx*m) + idy] = result;
      }
    }
    //optimized kernel
    __global__ void EuclideanDistances4( const float *A, const float *B , float *C, const int n , const int m){
      // n, A,  4000 this kernel assumes A is column-major A(SIZE, n)
      // m, B, 20000 this kernel assumes B is row-major    B(m, SIZE)
      // this kernel assumes C is column-major             C(m,n)
      // this kernel assumes number of threads per threadblock == SIZE
      // CHKSIZE is the number of B vectors that will be compute per block
      __shared__ float my_sB[CHKSIZE*SIZE];  // enough shared storage for CHKSIZE vectors of B
      int bx  = blockIdx.x; // one block per CHKSIZE rows of B (the larger input matrix)
      while ((bx*CHKSIZE) < m){ // not used, this while loop could be used to extend a block to multiple chunks
        int tx  = threadIdx.x;
        for (int i = 0; i < CHKSIZE; i++)  // load vectors of B into shared memory
          my_sB[(i*SIZE)+tx] = B[(((bx*CHKSIZE)+i)*SIZE)+tx];
        __syncthreads();
        while (tx < n){  //loop across all vectors in A
          float result[CHKSIZE];
          for (int i = 0; i < CHKSIZE; i++)
            result[i] = 0.0f;
          for (int i = 0; i < SIZE; i++){
            float Atemp = A[(n*i)+tx];
            for (int j = 0; j < CHKSIZE; j++){ // compute all CHKSIZE B vectors with read of A
              float temp = Atemp - my_sB[i + (j*SIZE)];
              result[j] += temp * temp;}}
          for (int i = 0; i < CHKSIZE; i++) // store CHKSIZE results
            C[((i+(bx*CHKSIZE))*n)+ tx] = result[i];
          tx += blockDim.x;  } // continue looping across vectors in A
        __syncthreads(); // necessary to prevent warps from racing ahead, if block looping is used
        bx += gridDim.x;}
    }
    
    float comp_euclid_sq(const float *rA, const float *rB, const int size){
    
      float result = 0.0f;
      float temp;
      for (int i = 0; i < size; i++){
        temp = (rA[i] - rB[i]);
        result += temp * temp;}
      return result;
    }
    
    int main()
    {
         float et1=0.0f, et2=0.0f, et3=0.0f, et4=0.0f;
         cudaEvent_t start1, start2, start3,start4, stop1, stop2, stop3, stop4;
         cudaEventCreate(&start1);
         cudaEventCreate(&start2);
         cudaEventCreate(&start3);
         cudaEventCreate(&start4);
         cudaEventCreate(&stop1);
         cudaEventCreate(&stop2);
         cudaEventCreate(&stop3);
         cudaEventCreate(&stop4);
    
         int n = N;  //MatrixA size : n * SIZE
         int m = M; //MatrixB size : m * SIZE
    
         srand((unsigned)time(0));
    
         // Host Allocations
         float *matrixA = (float *) malloc (n * SIZE * sizeof(float));
         for(int i=0; i < n * SIZE; i++)
             matrixA[i] = (float) (rand()%100)+1;
    
         float *matrixB = (float *) malloc (m * SIZE * sizeof(float));
         for(int i=0; i < m * SIZE; i++)
             matrixB[i] = (float) (rand()%100)+1;
    
         float *results_kernel = (float *) malloc (n * m * sizeof(float));
         float *cpu_results_kernel = (float *) malloc (n * m * sizeof(float));
         for (int i = 0; i< n*m; i++)
           cpu_results_kernel[i] = comp_euclid_sq(matrixA + ((i/m)*SIZE), matrixB + (i%m)*SIZE, SIZE);
    
         //Device Allocation
         float *d_matrixA;
         float *d_matrixB;
         cudaMalloc((void **)&d_matrixA, n * SIZE * sizeof(float));
         cudaMalloc((void **)&d_matrixB, m * SIZE * sizeof(float));
         cudaMemcpy(d_matrixA , matrixA , n * SIZE * sizeof(float) , cudaMemcpyHostToDevice);
         cudaMemcpy(d_matrixB , matrixB , m * SIZE * sizeof(float) , cudaMemcpyHostToDevice);
    
         float *d_results_kernel;
         cudaMalloc((void **)&d_results_kernel , n * m * sizeof(float));
    
    
         dim3 threads1 (1 , SIZE);
         dim3 blocks1  (n , m);
         cudaEventRecord(start1);
         EuclideanDistances1 <<<blocks1 , threads1>>> (d_matrixA , d_matrixB , d_results_kernel , n , m);
         cudaEventRecord(stop1);
         cudaMemcpy(results_kernel , d_results_kernel , n * m *sizeof(float) , cudaMemcpyDeviceToHost);
         for (int i = 0; i< n*m; i++) {
           if (results_kernel[i] != cpu_results_kernel[i])  {printf("cpu/kernel1 mismatch at %d, cpu: %f, kernel1: %fn", i, cpu_results_kernel[i], results_kernel[i]); return 1;}}
         cudaMemset(d_results_kernel, 0, n*m*sizeof(float));
         cudaEventSynchronize(stop1);
         cudaEventElapsedTime(&et1, start1, stop1);
    
         dim3 threads2 (8 , SIZE);   // 1024 threads per block (maximum)
         dim3 blocks2  (n/8 , m); // assumes n evenly divisible by 8
         cudaEventRecord(start2);
         EuclideanDistances2 <<<blocks2 , threads2>>> (d_matrixA , d_matrixB , d_results_kernel , n , m);
         cudaEventRecord(stop2);
         cudaMemcpy(results_kernel , d_results_kernel , n * m *sizeof(float) , cudaMemcpyDeviceToHost);
         for (int i = 0; i< n*m; i++) {
           if (results_kernel[i] != cpu_results_kernel[i])  {printf("cpu/kernel2 mismatch at %d, cpu: %f, kernel1: %fn", i, cpu_results_kernel[i], results_kernel[i]); return 1;}}
         cudaMemset(d_results_kernel, 0, n*m*sizeof(float));
         cudaEventSynchronize(stop2);
         cudaEventElapsedTime(&et2, start2, stop2);
    
         cudaFuncSetCacheConfig(EuclideanDistances3, cudaFuncCachePreferL1);
         dim3 threads3 (8, 32);   // 1024 threads per block (maximum)
         dim3 blocks3  (n/threads3.x , m/threads3.y); // assumes evenly divisible
         cudaEventRecord(start3);
         EuclideanDistances3 <<<blocks3 , threads3>>> (d_matrixA , d_matrixB , d_results_kernel , n , m);
         cudaEventRecord(stop3);
         cudaMemcpy(results_kernel , d_results_kernel , n * m *sizeof(float) , cudaMemcpyDeviceToHost);
         for (int i = 0; i< n*m; i++) {
           if (results_kernel[i] != cpu_results_kernel[i])  {printf("cpu/kernel3 mismatch at %d, cpu: %f, kernel3: %fn", i, cpu_results_kernel[i], results_kernel[i]); return 1;}}
         cudaMemset(d_results_kernel, 0, n*m*sizeof(float));
         cudaEventSynchronize(stop3);
         cudaEventElapsedTime(&et3, start3, stop3);
    
         // transpose matrix A
         float *matrixA_T = (float *) malloc (n * SIZE * sizeof(float));
           for (int i = 0; i < n; i++)
             for (int j = 0; j < SIZE; j++)
               matrixA_T[(j*n)+i] = matrixA[(i*SIZE)+j];
         cudaMemcpy(d_matrixA , matrixA_T , n * SIZE * sizeof(float) , cudaMemcpyHostToDevice);
    
         cudaFuncSetCacheConfig(EuclideanDistances4, cudaFuncCachePreferL1);
         dim3 threads4(SIZE); // one thread per vector element
         dim3 blocks4(m/CHKSIZE);
         cudaEventRecord(start4);
         EuclideanDistances4 <<<blocks4 , threads4>>> (d_matrixA , d_matrixB , d_results_kernel , n , m);
         cudaEventRecord(stop4);
         cudaMemcpy(results_kernel , d_results_kernel , n * m *sizeof(float) , cudaMemcpyDeviceToHost);
         // test for correct transposed result C(m,n)
         for (int i = 0; i< n; i++)
           for (int j = 0; j < m; j++)
             if (results_kernel[(j*n)+i] != cpu_results_kernel[(i*m)+j])  {printf("cpu/kernel4 mismatch at %d,%d, cpu: %f, kernel4: %fn", i,j, cpu_results_kernel[(i*m)+j], results_kernel[(j*n)+i]); return 1;}
         cudaEventSynchronize(stop4);
         cudaEventElapsedTime(&et4, start4, stop4);
         cudaFree(d_results_kernel);
    
         printf("Success!n");
         printf("kernel1 : %.fms, kernel2 : %.fms, kernel3 : %.fms, kernel4 : %.fmsn", et1, et2, et3, et4);
    
         free(matrixA);
         free(matrixB);
         free(results_kernel);
    
         return 0;
    }
    
    $ nvcc -O3 -arch=sm_20 -o t460 t460.cu
    $ ./t460
    Success!
    kernel1 : 2213ms, kernel2 : 4660ms, kernel3 : 691ms, kernel4 : 99ms
    $
    

    Hopefully that will get you going with more ideas of things to work on. You may get different timings of course on your cc3.0 device.

    Are further optimizations possible? Probably. The first target I would look at would be to figure out how to take advantage of the data-reuse opportunities on vector A. (data re-use of vector B is already handled in the kernel 4 by loading it into shared memory. There may be ways to use some shared memory to store portions of A to make the code run even faster.)

    I guess I should also mention that following the lead of the code you provided, this code is computing the square of the euclidean distance. A trivial modification to the kernels can make it compute the actual euclidean distance instead ( C[...] = sqrtf(...); ) The validation I have included, however, assumes the results are "in-range" for perfect storage of an integer quantity in a float . Your test case satisfies this requirement, but otherwise the validation code would need to be modified (if sqrtf were used).

    链接地址: http://www.djcxy.com/p/20938.html

    上一篇: 使用JQM解析RSS Feed的特定子类别

    下一篇: 增加CUDA中每个线程的工作量的示例