当数组大小超过 1,000,000 时,CUDA 给不出正确结果
我写了一段简单的求和归约(sum reduction)代码,它看起来一直工作正常,直到我把数组大小增加到 100 万时结果才出错。问题可能出在哪里?
// Threads per block; also the length of the per-block shared-memory buffer in sumKernel.
#define BLOCK_SIZE 128
// Number of input elements; sumKernel uses it as the bound for its global-memory loads.
#define ARRAY_SIZE 10000
// Host helper (defined below): copies `input` to the device, launches sumKernel with
// `totalBlocks` blocks, and copies one partial sum per block back into `output`.
cudaError_t addWithCuda(const long *input, long *output, int totalBlocks, size_t size);
// Per-block sum reduction.
//
// Each block loads up to BLOCK_SIZE consecutive elements of `input` into shared
// memory, folds them pairwise, and thread 0 writes the block's total to
// `output[blockIdx.x]`.
//
// Launch layout: 1D grid, 1D blocks. The shared buffer holds BLOCK_SIZE entries
// and the fold starts at BLOCK_SIZE/2, so the kernel expects
// blockDim.x == BLOCK_SIZE (a power of two).
// Shared memory: BLOCK_SIZE * sizeof(long) bytes, statically allocated.
// Reads are bounded by the compile-time ARRAY_SIZE.
__global__ void sumKernel(const long *input, long *output)
{
    __shared__ long partial[BLOCK_SIZE];

    const int lane = threadIdx.x;
    const int blockStart = blockDim.x * blockIdx.x;
    const int globalIdx = blockStart + lane;

    // One element per thread; threads past the end of the array contribute 0.
    partial[lane] = (globalIdx < ARRAY_SIZE) ? input[globalIdx] : 0;
    __syncthreads();

    // Halve the active range each pass: the lower half accumulates the upper half.
    int stride = BLOCK_SIZE / 2;
    while (stride >= 1)
    {
        if (lane < stride)
        {
            partial[lane] += partial[lane + stride];
        }
        // Barrier is outside the branch so every thread in the block reaches it.
        __syncthreads();
        stride >>= 1;
    }

    // partial[0] now holds the block total.
    if (lane == 0)
    {
        output[blockIdx.x] = partial[0];
    }
}
// Program entry point.
//
// Fills an array with 1..ARRAY_SIZE, asks addWithCuda for one partial sum per
// block, adds the partial sums on the host and prints the total.
// Returns 0 on success, 1 on any allocation or CUDA failure.
int main()
{
    // Ceil-divide so that a trailing, partially filled block is still launched.
    int totalBlocks = (ARRAY_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;

    long *input = (long*) malloc(ARRAY_SIZE * sizeof(long));
    long *output = (long*) malloc(totalBlocks * sizeof(long));
    if (!input || !output) {
        fprintf(stderr, "malloc failed!");
        free(input);   // free(NULL) is a no-op
        free(output);
        return 1;
    }

    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        input[i] = i + 1;
    }

    // Compute the per-block partial sums on the GPU.
    cudaError_t cudaStatus = addWithCuda(input, output, totalBlocks, ARRAY_SIZE);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        free(input);
        free(output);
        return 1;
    }

    // Accumulate in long long: `long` is only 32 bits on some platforms
    // (e.g. 64-bit Windows), and the total of 1..N exceeds 2^31 - 1 once N
    // reaches 65536, even though each per-block partial sum still fits.
    long long ans = 0;
    for (int i = 0; i < totalBlocks; i++)
    {
        ans += output[i];
    }
    printf("Final Ans : %lld", ans);

    free(input);
    free(output);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    // Keep the console window open until a key is pressed.
    getchar();
    return 0;
}
// Host helper that runs sumKernel on the GPU.
//
// Parameters:
//   input       - host array of `size` elements to be summed.
//   output      - host array of `totalBlocks` elements; receives one partial
//                 sum per block.
//   totalBlocks - number of blocks to launch (must be > 0).
//   size        - number of elements in `input`.
//
// Returns cudaSuccess, or the first CUDA error encountered. Device buffers are
// released on every path.
//
// NOTE(review): sumKernel bounds its loads with the compile-time ARRAY_SIZE,
// not with `size`, so callers are expected to pass size >= ARRAY_SIZE.
cudaError_t addWithCuda(const long *input, long *output, int totalBlocks, size_t size)
{
    long *dev_input = 0;
    long *dev_output = 0;
    cudaError_t cudaStatus;

    // A grid of zero blocks is an invalid launch configuration; reject it (and
    // null host pointers) up front with a clear error.
    if (input == 0 || output == 0 || totalBlocks <= 0) {
        fprintf(stderr, "addWithCuda: invalid arguments!\n");
        return cudaErrorInvalidValue;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers: the input array and one output slot per block.
    cudaStatus = cudaMalloc((void**)&dev_input, size * sizeof(long));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_output, totalBlocks * sizeof(long));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input from host memory to the GPU. `output` is not uploaded:
    // thread 0 of every block overwrites its own slot, so the previous
    // (uninitialized) host contents are never read by the kernel.
    cudaStatus = cudaMemcpy(dev_input, input, size * sizeof(long), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch one thread per element, BLOCK_SIZE threads per block.
    sumKernel<<<totalBlocks, BLOCK_SIZE>>>(dev_input, dev_output);

    // A kernel launch returns nothing; launch-configuration errors (for example
    // a grid that is too large for the target architecture) are only visible
    // through cudaGetLastError().
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "sumKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during execution.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching sumKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the per-block partial sums from the GPU back to host memory.
    cudaStatus = cudaMemcpy(output, dev_output, totalBlocks * sizeof(long), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(dev_input);
    cudaFree(dev_output);
    return cudaStatus;
}
如果需要参考我的 GPU 设备信息:我的 GPU 是 GTX 650 Ti。以下是该 GPU 的相关信息:
每个多处理器的最大线程数:2048
每块的最大线程数:1024
块的每个尺寸的最大尺寸:1024 x 1024 x 64
网格的每个尺寸的最大尺寸:2147483647 x 65535 x 65535
最大存储器间距:2147483647字节
纹理对齐:512字节
实际上,最终答案超出了 long 类型所能表示的范围(发生了溢出),因此在改用 long double 作为数据类型后,此问题已得到解决。 谢谢大家!
你的代码中的一个问题是最后一次 cudaMemcpy 的大小参数不正确:
cudaMemcpy(output, dev_output, totalBlocks * sizeof(int), cudaMemcpyDeviceToHost);
所有数据都是 long 类型,因此应该使用 sizeof(long),而不是 sizeof(int)。
代码中的另一个问题是对 long 类型的数据使用了错误的 printf 格式说明符:
printf("\n %d \n",output[i]);
应该改成这样:
printf("\n %ld \n",output[i]);
如果您没有针对 sm_30(或更高)架构进行编译,当块数很大(网格 x 维超过 65535)时,还可能遇到网格尺寸超限的问题。 在这种情况下,适当的 CUDA 错误检查可以识别出该问题。
你在 sumKernel<<<totalBlocks, BLOCK_SIZE>>>(dev_input, dev_output); 之后没有检查错误。
通常情况下,如果你检查最近一次发生的错误,它应该会报告 invalid configuration argument。
请尝试在启动 sumKernel 的那一行之后添加以下内容。
// Launch-configuration errors are not returned by the <<<...>>> call itself;
// fetch them explicitly right after the launch.
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
    // fprintf (not printf) is required to write to stderr.
    fprintf(stderr, "sumKernel failed: %s\n", cudaGetErrorString(cudaStatus));
    goto Error;
}
请参阅此问题以获取有关该错误的更多信息。
链接地址: http://www.djcxy.com/p/38469.html上一篇: Cuda not giving correct answer when array size is larger than 1,000,000