/*
 * otbGPUGPUPointSetToDensityImageFilter.cu
 *
 *  Created on: 10-Mar-2010
 *      Author: christop
 */

#include <cstdio>
#include <cuda.h>
#include <driver_types.h>
#include <vector_types.h>
//#include "cuPrintf.cu"

//Thread block dimensions: threads per block along x and along y
#define TILE_SIZE_X 8
#define TILE_SIZE_Y 16

//Maximum number of points staged in constant memory for a single kernel launch
#define CHUNK_SIZE 4096

//Number of pixels processed per thread (the kernel handles exactly four columns per thread)
#define PIX_PER_THREAD 4

//#define SQRT_2PI 2.5066282746310002
//FIXME switch to parameter!!!
//Divisor applied to every accumulated sum before it is added to the image.
//The value numerically equals 500 * SQRT_2PI.
//NOTE(review): presumably tuned for one specific radius -- confirm before reusing with other radii.
#define NORMALIZATION 1253.3141373155001f

//Constant-memory buffer holding one chunk of points, interleaved as x0, y0, x1, y1, ...
__constant__ float pt_c[CHUNK_SIZE*2];

/**
 * Adds the Gaussian-weighted density of one chunk of points to the image.
 *
 * Launch layout: 2D grid of 2D blocks. Each thread handles PIX_PER_THREAD (4)
 * pixels of row y, at columns x1, x1 + blockDim.x, x1 + 2*blockDim.x and
 * x1 + 3*blockDim.x, so one block covers blockDim.x*PIX_PER_THREAD columns
 * by blockDim.y rows. No shared memory is used.
 *
 * Preconditions:
 *  - pt_c holds at least numPoint points, interleaved as x, y pairs.
 *  - Every row index produced by the grid must lie inside the buffer: the
 *    kernel has no image-height parameter, so rows cannot be bounds-checked
 *    here. Columns at or beyond imageWidth are skipped.
 *  - radiussq must be non-zero (it is used as a divisor).
 *
 * @param pix        device image, row-major with a row length of imageWidth floats;
 *                   results are accumulated (+=) so several launches can be summed
 * @param numPoint   number of points to read from pt_c
 * @param originX    physical x coordinate of pixel column 0
 * @param originY    physical y coordinate of pixel row 0
 * @param spacingX   physical distance between adjacent columns
 * @param spacingY   physical distance between adjacent rows
 * @param imageWidth row length of pix, in pixels
 * @param radiussq   squared radius; each point contributes exp(-d^2 / (2*radiussq))
 */
__global__ void pointDensityKernel(float* pix, int numPoint, int originX, int originY, float spacingX, float spacingY, int imageWidth, int radiussq)
{
   const int y  = blockIdx.y*blockDim.y + threadIdx.y;
   const int x1 = blockIdx.x*blockDim.x*PIX_PER_THREAD + threadIdx.x;

   // x2..x4 are larger than x1, so if x1 is outside the row all four pixels are.
   if (x1 >= imageWidth)
     return;

   const int x2 = x1 + blockDim.x;
   const int x3 = x1 + 2*blockDim.x;
   const int x4 = x1 + 3*blockDim.x;

   // Physical pixel coordinates do not depend on the point being visited,
   // so they are computed once instead of once per point.
   const float pixX1 = x1 * spacingX + originX;
   const float pixX2 = x2 * spacingX + originX;
   const float pixX3 = x3 * spacingX + originX;
   const float pixX4 = x4 * spacingX + originX;
   const float pixY  = y * spacingY + originY;

   float accum1 = 0.0f;
   float accum2 = 0.0f;
   float accum3 = 0.0f;
   float accum4 = 0.0f;

   for (int k = 0; k < numPoint; k++)
   {
     // All threads read the same constant-memory address in a given iteration.
     const float ptX = pt_c[2 * k];
     const float ptY = pt_c[2 * k + 1];

     // The four pixels share a row, so the y contribution is common to all.
     const float disty_sq = (ptY - pixY) * (ptY - pixY);

     float distsq;
     distsq = (ptX - pixX1) * (ptX - pixX1) + disty_sq;
     accum1 += __expf(-distsq/radiussq/2);

     distsq = (ptX - pixX2) * (ptX - pixX2) + disty_sq;
     accum2 += __expf(-distsq/radiussq/2);

     distsq = (ptX - pixX3) * (ptX - pixX3) + disty_sq;
     accum3 += __expf(-distsq/radiussq/2);

     distsq = (ptX - pixX4) * (ptX - pixX4) + disty_sq;
     accum4 += __expf(-distsq/radiussq/2);
   }

   // 64-bit row offset: y*imageWidth exceeds the int range for large images.
   const size_t rowStart = (size_t)y * (size_t)imageWidth;

   // x1 is known to be in range here; the other columns are checked one by one.
   pix[rowStart + x1] += accum1/NORMALIZATION;
   if (x2 < imageWidth)
     pix[rowStart + x2] += accum2/NORMALIZATION;
   if (x3 < imageWidth)
     pix[rowStart + x3] += accum3/NORMALIZATION;
   if (x4 < imageWidth)
     pix[rowStart + x4] += accum4/NORMALIZATION;
}

/** Prints a diagnostic and terminates the process when a CUDA runtime call failed. */
static void pointDensityCheck(cudaError_t err, const char* what)
{
  if (cudaSuccess != err)
    {
    fprintf(stderr, "Cuda error: %s: %s.\n", what, cudaGetErrorString( err) );
    exit(EXIT_FAILURE);
    }
}

/**
 * Computes the point density image on the GPU and stores it in pix.
 *
 * The points are processed in chunks of at most CHUNK_SIZE, each chunk being
 * copied to constant memory and accumulated into a zero-initialised device
 * image by one kernel launch. The device image is padded up to a whole number
 * of thread-block tiles: the kernel has no image-height parameter to bound
 * rows against, and padding lets images whose size is not a multiple of the
 * tile size be processed completely. Only the imageWidth x imageHeight region
 * is copied back.
 *
 * Any CUDA failure prints a message to stderr and terminates the process
 * with EXIT_FAILURE.
 *
 * @param pix         host output buffer of imageWidth*imageHeight floats (row-major);
 *                    its previous content is overwritten
 * @param pt          host array of numPoint points, interleaved as x, y pairs
 * @param numPoint    number of points in pt; values <= 0 yield an all-zero image
 * @param originX     physical x coordinate of pixel column 0
 * @param originY     physical y coordinate of pixel row 0
 * @param spacingX    physical distance between adjacent columns
 * @param spacingY    physical distance between adjacent rows
 * @param imageWidth  image width in pixels; nothing is done if <= 0
 * @param imageHeight image height in pixels; nothing is done if <= 0
 * @param radius      radius of the density estimate; must be non-zero because the
 *                    kernel divides by radius*radius
 */
extern "C" void pointDensityProcessing(float* pix, float* pt, int numPoint, int originX, int originY, float spacingX, float spacingY, int imageWidth, int imageHeight, int radius)
{
  // An empty image has no pixel to compute (and would give a zero-sized grid).
  if (imageWidth <= 0 || imageHeight <= 0)
    return;

  // Initialize
  if (cuInit(0) != CUDA_SUCCESS)
    {
    fprintf(stderr, "Cuda error: %s: %s.\n", "pointDensityProcessing()", "cuInit failed");
    exit(EXIT_FAILURE);
    }

  // Area covered by one thread block, in pixels.
  const size_t tileWidth  = TILE_SIZE_X*PIX_PER_THREAD;
  const size_t tileHeight = TILE_SIZE_Y;

  // Round the grid up so the right and bottom borders are covered as well.
  const size_t gridWidth  = ((size_t)imageWidth  + tileWidth  - 1) / tileWidth;
  const size_t gridHeight = ((size_t)imageHeight + tileHeight - 1) / tileHeight;

  // The device image spans whole tiles, so every thread of every block
  // writes inside the allocation.
  const size_t paddedWidth  = gridWidth*tileWidth;
  const size_t paddedHeight = gridHeight*tileHeight;
  const size_t paddedBytes  = paddedWidth*paddedHeight*sizeof(float);
  const int    kernelWidth  = (int)paddedWidth;

  float* pixD = NULL;
  printf("Allocating image on GPU: %zu\n", paddedBytes);
  pointDensityCheck(cudaMalloc((void**) &pixD, paddedBytes), "cudaMalloc(image)");
  pointDensityCheck(cudaMemset(pixD, 0, paddedBytes), "cudaMemset(image)");

  dim3 dimBlock( TILE_SIZE_X, TILE_SIZE_Y);
  dim3 dimGrid((unsigned int)gridWidth, (unsigned int)gridHeight);

  int radiussq = radius*radius;

  const int numChunks = (numPoint > 0) ? (numPoint/CHUNK_SIZE + ((numPoint % CHUNK_SIZE != 0) ? 1 : 0)) : 0;
  for (int chunk = 0; chunk < numChunks; chunk++)
  {
    // size_t offset: chunk*CHUNK_SIZE*2 exceeds the int range for very large point sets.
    const size_t firstFloat = (size_t)chunk*CHUNK_SIZE*2;
    const int remaining = numPoint - chunk*CHUNK_SIZE;
    const int count = (remaining < CHUNK_SIZE) ? remaining : CHUNK_SIZE;

    if (count == CHUNK_SIZE)
      printf("Process chunk %i\n", chunk);
    else
      printf("Process last chunk\n");

    // A blocking copy from pageable host memory is ordered after the previous
    // kernel launch, so no explicit synchronisation is needed between chunks.
    pointDensityCheck(cudaMemcpyToSymbol(pt_c, &pt[firstFloat], (size_t)count*2*sizeof(float), 0, cudaMemcpyHostToDevice), "cudaMemcpyToSymbol(points)");

    //Call GPU kernel
    printf("Launch kernel\n");
    printf("Parameters: %p, %d, %d, %d, %f, %f, %d, %d \n", (void*)pixD, count, originX, originY, spacingX, spacingY, kernelWidth, radiussq);
    pointDensityKernel<<<dimGrid,dimBlock>>>(pixD, count, originX, originY, spacingX, spacingY, kernelWidth, radiussq);
    // Launch-configuration errors are only visible through cudaGetLastError().
    pointDensityCheck(cudaGetLastError(), "pointDensityKernel launch");
  }

  // Errors raised while the kernels execute are reported by the synchronisation.
  pointDensityCheck(cudaDeviceSynchronize(), "pointDensityKernel execution");

  //Transfer result to CPU memory, dropping the padding columns and rows
  pointDensityCheck(cudaMemcpy2D(pix, (size_t)imageWidth*sizeof(float),
                                 pixD, paddedWidth*sizeof(float),
                                 (size_t)imageWidth*sizeof(float), (size_t)imageHeight,
                                 cudaMemcpyDeviceToHost), "cudaMemcpy2D(result)");
  pointDensityCheck(cudaFree(pixD), "cudaFree(image)");
}



