Getting stack overflows with a CUDA kernel

I have a huge problem with the code I am writing. I am not an expert, and I asked many people before coming here, and I corrected a lot of things too. So I guess I am ready to show you the code and ask my questions. I will put the entire code here so you can understand exactly what my problem is. What I want to do is this: since ARRAY_SIZE is too big for THREAD_SIZE, I put the data of the big array into a smaller array created specifically with size THREAD_SIZE. Then I send it to the kernel and do whatever I have to do. But I am having a problem with the part

isub_matrix[x*THREAD_SIZE+y]=big_matrix[x*ARRAY_SIZE+y];

where the code stops due to a stack overflow. First, I made big_matrix a double pointer, but people in the #cuda channel on the freenode IRC network told me that was too big for the CPU memory to handle and that I should create a linear pointer instead. I did that, but I still got the same stack overflow problem. (Updated after some changes that didn't fully work yet: the stack overflow stopped, but there is a linking and manifest update failure.) Below is a minimal sketch of the stack-versus-heap point as I understand it, followed by my updated code.
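To illustrate the stack-versus-heap point (this sketch is just my understanding of what the IRC folks meant, not part of my project): a 2048x2048 float matrix is about 16 MB, far more than a typical 1 MB default thread stack, so declaring it as a local array overflows the stack, while heap allocation does not.

#include <cstdio>

#define ARRAY_SIZE 2048

int main()
{
    // BAD: ~16 MB of automatic (stack) storage; typical default stacks
    // are around 1 MB, so this overflows before anything else runs.
    // float big_matrix[ARRAY_SIZE*ARRAY_SIZE];

    // OK: heap allocation, limited only by available memory.
    float *big_matrix = new float[ARRAY_SIZE*ARRAY_SIZE];
    big_matrix[0] = 1.0f;
    printf("%f\n", big_matrix[0]);
    delete[] big_matrix;
    return 0;
}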

#define ARRAY_SIZE 2048
#define THREAD_SIZE 32
#define PI 3.14


int main(int argc, char** argv) 
{
        int array_plus=0,x,y;
        float time;
        //unsigned int memsize=sizeof(float)*THREAD_SIZE*THREAD_SIZE;
        //bool array_rest;
        cudaEvent_t start,stop;
        float *d_isub_matrix;

    float *big_matrix = new float[ARRAY_SIZE*ARRAY_SIZE];
    float *big_matrix2 = new float[ARRAY_SIZE*ARRAY_SIZE];
    float *isub_matrix = new float[THREAD_SIZE*THREAD_SIZE];
    float *osub_matrix = new float[THREAD_SIZE*THREAD_SIZE];

        //if the array's size is not compatible with the thread's size, it won't work.

        //array_rest=(ARRAY_SIZE*ARRAY_SIZE)/(THREAD_SIZE*THREAD_SIZE);
        //isub_matrix=(float*) malloc(memsize);
        //osub_matrix=(float*) malloc(memsize);

        if(((ARRAY_SIZE*ARRAY_SIZE)%(THREAD_SIZE*THREAD_SIZE)==0))
        {

            //allocating space in CPU memory and GPU memory for the big matrix and its sub matrixes
            //it has to be like this (lots of loops)



            //populating the big array
            for(x=0;x<ARRAY_SIZE;x++)
            {
                for(y=0;y<ARRAY_SIZE;y++)
                    big_matrix[x*ARRAY_SIZE+y]=rand()%10000;
            }

            //kind of loop for the big array

            //Start counting the time of processing (everything)
            cudaEventCreate(&start);
            cudaEventCreate(&stop);

            cudaEventRecord(start,0);

            while(array_plus<ARRAY_SIZE)
            {

                //putting the big array's values into the sub-matrix

                for(x=0;x<THREAD_SIZE;x++)
                {
                    for(y=0;y<THREAD_SIZE;y++)
                        isub_matrix[x*THREAD_SIZE+y]=big_matrix[(x+array_plus)*ARRAY_SIZE+y];
                }

                cudaMalloc((void**)&d_isub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                cudaMalloc((void**)&osub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                cudaMemcpy(d_isub_matrix,isub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyHostToDevice);

                //call the cuda kernel

                twiddle_factor<<<1,256>>>(isub_matrix,osub_matrix);//<----

                cudaMemcpy(osub_matrix,isub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);

                array_plus=array_plus+THREAD_SIZE;
                for(x=0;x<THREAD_SIZE;x++)
                {
                    for(y=0;y<THREAD_SIZE;y++)
                        big_matrix2[x*THREAD_SIZE+array_plus+y]=osub_matrix[x*THREAD_SIZE+y];
                }

                array_rest=array_plus+(ARRAY_SIZE);

                cudaFree(isub_matrix);
                cudaFree(osub_matrix);
                system("PAUSE");
            }

            //Stop the time

            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&time,start,stop);

            //Free memory in GPU




            printf("The processing time took... %fms to finish",time);
                    system("PAUSE");

        }
        printf("The processing time took...NAO ENTROU!");
        system("PAUSE");
        return 0;
}

//things to do: TRANSPOSITION!!!!

Another question is about the parallel part. The compiler (Visual Studio) complains that I am using too many pow() and exp() calls at once. How should I solve this problem? (A possible workaround is sketched right after the snippet.)

if((xIndex<THREAD_SIZE)&&(yIndex<THREAD_SIZE))
    {
        block[xIndex][yIndex]=exp(sum_sin[xIndex][yIndex])+exp(sum_cos[xIndex][yIndex]);
    }
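One workaround, though this is an assumption on my part rather than a confirmed fix for that exact compiler message: exp() and pow() can resolve to the double-precision versions, and double math costs extra registers and time on the GPU. The single-precision variants expf(), powf() and sqrtf() are much cheaper. A hypothetical rewrite of the same lines:

//single-precision math instead of double-precision exp()/pow()
if((xIndex<THREAD_SIZE)&&(yIndex<THREAD_SIZE))
{
    block[xIndex][yIndex]=expf(sum_sin[xIndex][yIndex])+expf(sum_cos[xIndex][yIndex]);
}

//likewise, pow(THREAD_SIZE,0.5) can be written as sqrtf((float)THREAD_SIZE)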

The original kernel code is down below. I commented part of it out because I wanted to know if my code was at least producing some value on the GPU. But it wasn't even launching the kernel... so sad. (A quick way to check whether a launch actually failed is sketched just below, before the kernel.)
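This is standard CUDA error checking rather than something from my project; d_isub_matrix and d_osub_matrix stand for whatever device pointers are passed:

twiddle_factor<<<1,256>>>(d_isub_matrix,d_osub_matrix);

//the launch is asynchronous: cudaGetLastError() reports launch/configuration
//errors, cudaDeviceSynchronize() surfaces errors raised while the kernel runs
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess)
    printf("launch failed: %s\n", cudaGetErrorString(err));

err = cudaDeviceSynchronize();
if(err != cudaSuccess)
    printf("kernel failed: %s\n", cudaGetErrorString(err));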

__global__ void twiddle_factor(float *isub_matrix, float *osub_matrix)
{
    __shared__ float block[THREAD_SIZE][THREAD_SIZE];
    // int x,y,z;
    unsigned int xIndex = threadIdx.x;
    unsigned int yIndex = threadIdx.y;
    /*
    int sum_sines=0.0;
    int sum_cosines=0.0;
    float sum_sin[THREAD_SIZE],sum_cos[THREAD_SIZE];
    float angle=(2*PI)/THREAD_SIZE;

    //put into shared memory the FFT calculation (F(u))

    for(x=0;x<THREAD_SIZE;x++)
    {
        for(y=0;y<THREAD_SIZE;y++)
        {
            for(z=0;z<THREAD_SIZE;z++)
            {
                sum_sines=sum_sin+sin(isub_matrix[y*THREAD_SIZE+z]*(angle*z));
                sum_cosines=sum_cos+cos(isub_matrix[y*THREAD_SIZE+z]*(angle*z));

            }
            sum_sin[x][y]=sum_sines/THREAD_SIZE;
            sum_cos[x][y]=sum_cosines/THREAD_SIZE;

        }
    }
    */

    if((xIndex<THREAD_SIZE)&&(yIndex<THREAD_SIZE))
        block[xIndex][yIndex]=pow(THREAD_SIZE,0.5);

        //block[xIndex][yIndex]=pow(exp(sum_sin[xIndex*THREAD_SIZE+yIndex])+exp(sum_cos[xIndex*THREAD_SIZE+yIndex]),0.5);

        __syncthreads();

    //transposition X x Y
    //transfer back the results into another sub-matrix that is allocated in CPU

    if((xIndex<THREAD_SIZE)&&(yIndex<THREAD_SIZE))
            osub_matrix[yIndex*THREAD_SIZE+xIndex]=block[xIndex][yIndex];



    __syncthreads();
}

Thanks for reading it all!

Below is the entire code:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#define ARRAY_SIZE 2048
#define THREAD_SIZE 32
#define PI 3.14



__global__ void twiddle_factor(float *isub_matrix, float *osub_matrix)
{
    __shared__ float block[THREAD_SIZE][THREAD_SIZE];
    int x,y,z;
    unsigned int xIndex = threadIdx.x;
    unsigned int yIndex = threadIdx.y;

    float sum_sines=0.0;
    //float expo_sums;
    float sum_cosines=0.0;
    float sum_sin[THREAD_SIZE][THREAD_SIZE],sum_cos[THREAD_SIZE][THREAD_SIZE];
    float angle=(2*PI)/THREAD_SIZE;

    //put into shared memory the FFT calculation (F(u))

    for(x=0;x<THREAD_SIZE;x++)
    {
        for(y=0;y<THREAD_SIZE;y++)
        {
            for(z=0;z<THREAD_SIZE;z++)
            {
                sum_sines=sum_sines+sin(isub_matrix[y*THREAD_SIZE+z]*(angle*z));
                sum_cosines=sum_cosines+cos(isub_matrix[y*THREAD_SIZE+z]*(angle*z));

            }
            sum_sin[x][y]=sum_sines/THREAD_SIZE;
            sum_cos[x][y]=sum_cosines/THREAD_SIZE;

        }
    }


    if((xIndex<THREAD_SIZE)&&(yIndex<THREAD_SIZE))
    {
        block[xIndex][yIndex]=exp(sum_sin[xIndex][yIndex])+exp(sum_cos[xIndex][yIndex]);
    }




        __syncthreads();

    //transposition X x Y
    //transfer back the results into another sub-matrix that is allocated in CPU

    if((xIndex<THREAD_SIZE)&&(yIndex<THREAD_SIZE))
            osub_matrix[yIndex*THREAD_SIZE+xIndex]=block[xIndex][yIndex];



    __syncthreads();
}


int main(int argc, char** argv) 
{
        int array_plus=0,x,y;
        float time;
        //unsigned int memsize=sizeof(float)*THREAD_SIZE*THREAD_SIZE;
        //bool array_rest;
        cudaEvent_t start,stop;
        float *d_isub_matrix,*d_osub_matrix;

        float *big_matrix = new float[ARRAY_SIZE*ARRAY_SIZE];
        float *big_matrix2 = new float[ARRAY_SIZE*ARRAY_SIZE];
        float *isub_matrix = new float[THREAD_SIZE*THREAD_SIZE];
        float *osub_matrix = new float[THREAD_SIZE*THREAD_SIZE];

        //if the array's size is not compatible with the thread's size, it won't work.

        //array_rest=(ARRAY_SIZE*ARRAY_SIZE)/(THREAD_SIZE*THREAD_SIZE);
        //isub_matrix=(float*) malloc(memsize);
        //osub_matrix=(float*) malloc(memsize);

        if(((ARRAY_SIZE*ARRAY_SIZE)%(THREAD_SIZE*THREAD_SIZE)==0)&&(ARRAY_SIZE>=THREAD_SIZE))
        {

            //allocating space in CPU memory and GPU memory for the big matrix and its sub matrixes
            //it has to be like this (lots of loops)



            //populating the big array
            for(x=0;x<ARRAY_SIZE;x++)
            {
                for(y=0;y<ARRAY_SIZE;y++)
                    big_matrix[x*ARRAY_SIZE+y]=rand()%10000;
            }

            //kind of loop for the big array

            //Start counting the time of processing (everything)
            cudaEventCreate(&start);
            cudaEventCreate(&stop);

            cudaEventRecord(start,0);

            while(array_plus<ARRAY_SIZE)
            {

                //putting the big array's values into the sub-matrix

                for(x=0;x<THREAD_SIZE;x++)
                {
                    for(y=0;y<THREAD_SIZE;y++)
                        isub_matrix[x*THREAD_SIZE+y]=big_matrix[x*ARRAY_SIZE+y];
                }

                cudaMalloc((void**)&d_isub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                cudaMalloc((void**)&d_osub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                cudaMemcpy(d_isub_matrix,isub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyHostToDevice);

                //call the cuda kernel

                twiddle_factor<<<1,256>>>(d_isub_matrix,d_osub_matrix);//<----

                cudaMemcpy(osub_matrix,d_osub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);

                array_plus=array_plus+THREAD_SIZE;
                for(x=0;x<THREAD_SIZE;x++)
                {
                    for(y=0;y<THREAD_SIZE;y++)
                        big_matrix2[x*THREAD_SIZE+array_plus+y]=osub_matrix[x*THREAD_SIZE+y];
                }


                cudaFree(isub_matrix);
                cudaFree(osub_matrix);
                cudaFree(d_osub_matrix);
                cudaFree(d_isub_matrix);
            }

            //Stop the time

            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&time,start,stop);

            //Free memory in GPU

            printf("The processing time took... %fms to finish",time);
            system("PAUSE");

        }
        printf("The processing time took... DID NOT ENTER!");
        system("PAUSE");
        return 0;
}
I see loads of problems in this code.

  • You are not allocating memory for isub_matrix before copying the data from big_matrix to isub_matrix

        for(x=0;x<THREAD_SIZE;x++)
        {
            for(y=0;y<THREAD_SIZE;y++)
                isub_matrix[x*THREAD_SIZE+y]=big_matrix[x*ARRAY_SIZE+y];
        }
    
  • You are not doing any cudaMemcpy from host to device for isub_matrix. After allocating memory on the device for isub_matrix, you need to copy the data.

  • I see that inside the while loop you are copying the same data on every iteration:

            //putting the big array's values into the sub-matrix
    
            for(x=0;x<THREAD_SIZE;x++)
            {
                for(y=0;y<THREAD_SIZE;y++)
                    isub_matrix[x*THREAD_SIZE+y]=big_matrix[x*ARRAY_SIZE+y];
            }
    
  • The for loop should depend on array_plus. I would suggest you do this:

            for(x=0;x<THREAD_SIZE;x++)
            {
                for(y=0;y<THREAD_SIZE;y++)
                    isub_matrix[x*THREAD_SIZE+y]=big_matrix[(x+array_plus)*ARRAY_SIZE+y];
            }
    
  • Moreover, I don't see the point of array_rest. What is it used for?

  • Based on the updated version, the error I see is that you are using osub_matrix as both a host and a device pointer. I would suggest you create another float pointer and use it for the device:

        float *d_osub_matrix;
        cudaMalloc((void**)&d_osub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));

    and call.

    twiddle_factor<<<1,256>>>(d_isub_matrix,d_osub_matrix);
    

    Then do

    cudaMemcpy(osub_matrix,d_osub_matrix, ((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);
    
  • By the way, it is not

        twiddle_factor<<<1,256>>>(isub_matrix,osub_matrix);

    It should be

        twiddle_factor<<<1,256>>>(d_isub_matrix,d_osub_matrix);

    Final and completed code:

    int main(int argc, char** argv)
    {
            int array_plus=0,x,y;
            int array_plus_x, array_plus_y;
            float time;
            //unsigned int memsize=sizeof(float)*THREAD_SIZE*THREAD_SIZE;
            //bool array_rest;
            cudaEvent_t start,stop;
            float *d_isub_matrix,*d_osub_matrix;
    
            float *big_matrix = new float[ARRAY_SIZE*ARRAY_SIZE];
            float *big_matrix2 = new float[ARRAY_SIZE*ARRAY_SIZE];
            float *isub_matrix = new float[THREAD_SIZE*THREAD_SIZE];
            float *osub_matrix = new float[THREAD_SIZE*THREAD_SIZE];
    
            //if the array's size is not compatible with the thread's size, it won't work.
    
            //array_rest=(ARRAY_SIZE*ARRAY_SIZE)/(THREAD_SIZE*THREAD_SIZE);
            //isub_matrix=(float*) malloc(memsize);
            //osub_matrix=(float*) malloc(memsize);
    
            if(((ARRAY_SIZE*ARRAY_SIZE)%(THREAD_SIZE*THREAD_SIZE)==0)&&(ARRAY_SIZE>=THREAD_SIZE))
            {
    
                //allocating space in CPU memory and GPU memory for the big matrix and its sub matrixes
                //it has to be like this (lots of loops)
    
    
    
                //populating the big array
                for(x=0;x<ARRAY_SIZE;x++)
                {
                    for(y=0;y<ARRAY_SIZE;y++)
                        big_matrix[x*ARRAY_SIZE+y]=rand()%10000;
                }
    
                //kind of loop for the big array
    
                //Start counting the time of processing (everything)
                cudaEventCreate(&start);
                cudaEventCreate(&stop);
    
                cudaEventRecord(start,0);
                for(array_plus_x = 0; array_plus_x < ARRAY_SIZE; array_plus_x += THREAD_SIZE)
                for(array_plus_y = 0; array_plus_y < ARRAY_SIZE; array_plus_y += THREAD_SIZE)
                {
    
    
                    //putting the big array's values into the sub-matrix
    
                    for(x=0;x<THREAD_SIZE;x++)
                    {
                        for(y=0;y<THREAD_SIZE;y++)
                            isub_matrix[x*THREAD_SIZE+y]=big_matrix[(x+array_plus_x)*ARRAY_SIZE+(y+array_plus_y)];
                    }
    
                    cudaMalloc((void**)&d_isub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                    cudaMalloc((void**)&d_osub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
                    cudaMemcpy(d_isub_matrix,isub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyHostToDevice);
    
                    //call the cuda kernel
    
                    dim3 block(32,32);
                    twiddle_factor<<<1,block>>>(d_isub_matrix,d_osub_matrix);//<----
    
                    cudaMemcpy(osub_matrix,d_osub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);
    
                    for(x=0;x<THREAD_SIZE;x++)
                    {
                        for(y=0;y<THREAD_SIZE;y++)
                            big_matrix2[(x+array_plus_x)*ARRAY_SIZE+(y+array_plus_y)]=osub_matrix[x*THREAD_SIZE+y];
                    }
    
                    cudaFree(d_osub_matrix);
                    cudaFree(d_isub_matrix);
                }
    
                //Stop the time
    
                cudaEventRecord(stop,0);
                cudaEventSynchronize(stop);
                cudaEventElapsedTime(&time,start,stop);
    
                //Free memory in GPU

                printf("The processing time took... %fms to finish",time);
                system("PAUSE");
            }
            return 0;
    }
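    A note on the launch configuration, offered as my reading of the kernel rather than something stated above: the kernel indexes the tile with both threadIdx.x and threadIdx.y, so it needs a two-dimensional block covering the whole 32x32 tile. With the original <<<1,256>>> launch, threadIdx.y is always 0 and only part of the tile is ever written.

        dim3 block(THREAD_SIZE,THREAD_SIZE);  //32 x 32 = 1024 threads; needs compute capability 2.0 or later
        twiddle_factor<<<1,block>>>(d_isub_matrix,d_osub_matrix);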

    I think the problem is in this line:

     cudaMemcpy(osub_matrix,isub_matrix,((THREAD_SIZE*THREAD_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);
    

    This is because you allocated both osub_matrix and isub_matrix on the device, so this call copies between two device pointers even though cudaMemcpyDeviceToHost expects a host destination.
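    To make the pairing concrete, here is a minimal sketch (using the d_-prefixed names suggested in the other answer) of which pointer belongs on which side:

        float *isub_matrix = new float[THREAD_SIZE*THREAD_SIZE];   //host input
        float *osub_matrix = new float[THREAD_SIZE*THREAD_SIZE];   //host output
        float *d_isub_matrix,*d_osub_matrix;                       //device buffers

        cudaMalloc((void**)&d_isub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));
        cudaMalloc((void**)&d_osub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float));

        //always pair one host pointer with one device pointer in cudaMemcpy
        cudaMemcpy(d_isub_matrix,isub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float),cudaMemcpyHostToDevice);
        twiddle_factor<<<1,dim3(THREAD_SIZE,THREAD_SIZE)>>>(d_isub_matrix,d_osub_matrix);
        cudaMemcpy(osub_matrix,d_osub_matrix,THREAD_SIZE*THREAD_SIZE*sizeof(float),cudaMemcpyDeviceToHost);

        cudaFree(d_isub_matrix);   //cudaFree only device pointers...
        cudaFree(d_osub_matrix);
        delete[] isub_matrix;      //...and delete[] only host pointers
        delete[] osub_matrix;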
