Cuda not giving correct answer when array size is larger than 1,000,000

I have wrote a simple sum reduction code which seems to work just fine until i increase array size to 1 million what can be the problem.

#define BLOCK_SIZE 128
#define ARRAY_SIZE 10000

cudaError_t addWithCuda(const long *input, long *output, int totalBlocks, size_t size);

__global__ void sumKernel(const long *input, long *output)
{
    int tid = threadIdx.x;
    int bid = blockDim.x * blockIdx.x;

    __shared__ long data[BLOCK_SIZE];

    if(bid+tid < ARRAY_SIZE)
           data[tid] = input[bid+tid];
    else
           data[tid] = 0;

     __syncthreads();

    for(int i = BLOCK_SIZE/2; i >= 1; i >>= 1)
    {
        if(tid < i)
        data[tid] += data[tid + i];
        __syncthreads(); 
    }

    if(tid == 0)
        output[blockIdx.x] = data[0];
}

int main()
{    
    int totalBlocks = ARRAY_SIZE/BLOCK_SIZE;

    if(ARRAY_SIZE % BLOCK_SIZE != 0)
        totalBlocks++;

    long *input = (long*) malloc(ARRAY_SIZE * sizeof(long) );
    long *output = (long*) malloc(totalBlocks * sizeof(long) );

    for(int i=0; i<ARRAY_SIZE; i++)
    {
        input[i] = i+1 ;
    }
// Add vectors in parallel.
        cudaError_t cudaStatus = addWithCuda(input, output, totalBlocks, ARRAY_SIZE);
        if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "addWithCuda failed!");
             return 1;
        }

    long ans = 0;
    for(int i =0 ; i < totalBlocks ;i++)
    {
        ans = ans + output[i];
    }

    printf("Final Ans : %ld",ans);

// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
        if (cudaStatus != cudaSuccess) {
              fprintf(stderr, "cudaDeviceReset failed!");
              return 1;
         }

     getchar();

      return 0;
}

     // Helper function for using CUDA to add vectors in parallel.
     cudaError_t addWithCuda(const long *input, long *output, int totalBlocks, size_t size)
     {
          long *dev_input = 0;
          long *dev_output = 0;

          cudaError_t cudaStatus;

// Choose which GPU to run on, change this on a multi-GPU system.
           cudaStatus = cudaSetDevice(0);
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
             goto Error;
     }

// Allocate GPU buffers for two vectors (one input, one output)    .

     cudaStatus = cudaMalloc((void**)&dev_input, size * sizeof(long));
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
         }

cudaStatus = cudaMalloc((void**)&dev_output, totalBlocks * sizeof(long));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_input, input, size * sizeof(long), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

cudaStatus = cudaMemcpy(dev_output, output, (totalBlocks) * sizeof(long), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

// Launch a kernel on the GPU with one thread for each element.
sumKernel<<<totalBlocks, BLOCK_SIZE>>>(dev_input, dev_output);

// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!n", cudaStatus);
    goto Error;
}

// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(output, dev_output, totalBlocks * sizeof(long), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

Error:
cudaFree(dev_input);
cudaFree(dev_output);

return cudaStatus;
}

and just for the reference if it has to do somthing with my GPU device, my GPU is GTXX 650ti. and here is the info about GPU:

Maximum number of threads per multiprocessor: 2048

Maximum number of threads per block: 1024

Maximum sizes of each dimension of a block: 1024 x 1024 x 64

Maximum sizes of each dimension of a grid: 2147483647 x 65535 x 65535

Maximum memory pitch: 2147483647 bytes

Texture alignment: 512 bytes


Actually the answer =could not fit in long as well so after using long double for datatypes this issue was resolved. Thanks all!


One problem in your code is that your last cudaMemcpy is not set up correctly:

cudaMemcpy(output, dev_output, totalBlocks * sizeof(int), cudaMemcpyDeviceToHost);

All of your data is long data so you should be copying using sizeof(long) not sizeof(int)

Another problem in your code is using the wrong printf format identifier for a long datatype:

printf("n %d n",output[i]);

use something like this instead:

printf("n %ld n",output[i]);

You may also have a problem with a large block count if you are not compiling for sm_30 architecture. In that case, proper cuda error checking would identify the problem.


You don't check for errors after sumKernel<<<totalBlocks, BLOCK_SIZE>>>(dev_input, dev_output); . Normally, if you would check for the last occured error it should give the error invalid configuration argument . Try adding the following after the sumKernel line.

cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
    printf(stderr, "sumKernel failed: %sn", cudaGetErrorString(cudaStatus));
    goto Error;
}

See this question for more information about the error.

链接地址: http://www.djcxy.com/p/38470.html

上一篇: 关于CUDA的架构(SM,SP)

下一篇: 当数组大小大于1,000,000时,Cuda没有给出正确答案