/*
 * otbOCLPointSetToDensityImageFilter.cl
 *
 *  Created on: 14-Apr-2010
 *      Author: christop
 */


// Work-group dimensions. Both kernels in this file declare
// reqd_work_group_size(TILE_SIZE_X, TILE_SIZE_Y, 1), so the host must enqueue
// them with exactly this local size.
#define TILE_SIZE_X 8
#define TILE_SIZE_Y 16

// Chunk size for the point set.
// NOTE(review): not referenced by any kernel in this file; presumably it
// mirrors the number of points the host uploads per launch - confirm against
// the host code.
#define CHUNK_SIZE 4096

// Number of pixels processed per work-item along x. It scales the x offset of
// each work-group (group_id * TILE_SIZE_X * PIX_PER_THREAD). The kernel bodies
// hard-code four columns per work-item, so changing this value alone is not
// sufficient.
#define PIX_PER_THREAD 4

// Reference value of sqrt(2*pi), kept for documentation only:
//#define SQRT_2PI 2.5066282746310002
// FIXME: switch to a kernel parameter!
// Normalisation divisor applied to the accumulated density. Numerically this
// is 2.5066282746310002 * 500, i.e. sqrt(2*pi) times a hard-coded value of 500.
#define SQRT_2PIRADIUS 1253.3141373155001

// Legacy declaration kept for reference; pointDensityKernel receives the
// point buffer through its __constant pt_c argument instead.
//__constant__ float pt_c[CHUNK_SIZE*2];

/*
 * Accumulates a Gaussian-weighted point density into an image buffer.
 *
 * Each work-item handles four pixels of one image row, spaced TILE_SIZE_X
 * columns apart (x1, x1 + TILE_SIZE_X, x1 + 2*TILE_SIZE_X, x1 + 3*TILE_SIZE_X).
 * For every point k it adds native_exp(-d2 / radiussq / 2) to the pixel's
 * accumulator, where d2 is the squared distance between the point and the
 * pixel position (index * spacing + origin). Each accumulated sum is then
 * divided by SQRT_2PIRADIUS and ADDED to the existing value in pix; nothing
 * is overwritten.
 * NOTE(review): the += presumably lets the host launch this kernel once per
 * chunk of points after clearing the image - confirm against the host code.
 *
 * Parameters:
 *   pix        - image buffer, addressed as pix[x + y * imageWidth]
 *   pt_c       - interleaved point coordinates: x at [2*k], y at [2*k + 1]
 *   numPoint   - number of points read from pt_c
 *   originX/Y  - offset added to (pixel index * spacing)
 *   spacingX/Y - factor applied to the pixel index
 *   imageWidth - row stride of pix, in elements
 *   radiussq   - divisor used in the exponent (by its name, a squared radius)
 *
 * No bounds checks are performed: the buffer behind pix must contain every
 * index touched by the launched NDRange, including all four columns handled
 * by each work-item.
 */
__kernel __attribute__((reqd_work_group_size(TILE_SIZE_X, TILE_SIZE_Y, 1)))
void pointDensityKernel(
    __global float* pix,
    __constant float* pt_c,
    int numPoint,
    int originX,
    int originY,
    float spacingX,
    float spacingY,
    int imageWidth,
    int radiussq)
{
   int x1 = get_group_id(0)*TILE_SIZE_X*PIX_PER_THREAD + get_local_id(0);
   int y = get_group_id(1)*TILE_SIZE_Y + get_local_id(1);

   int x2 = x1 + TILE_SIZE_X;
   int x3 = x1 + 2*TILE_SIZE_X;
   int x4 = x1 + 3*TILE_SIZE_X;

   // Pixel positions do not depend on the point being processed, so they are
   // computed once here instead of on every loop iteration.
   const float pixX1 = x1 * spacingX + originX;
   const float pixX2 = x2 * spacingX + originX;
   const float pixX3 = x3 * spacingX + originX;
   const float pixX4 = x4 * spacingX + originX;
   const float pixY = y * spacingY + originY;

   float accum1 = 0.0f;
   float accum2 = 0.0f;
   float accum3 = 0.0f;
   float accum4 = 0.0f;

   for (int k = 0; k < numPoint; k++)
   {
     const float ptX = pt_c[2 * k];
     const float ptY = pt_c[2 * k + 1];

     // All four pixels share the same row, hence the same squared y-distance.
     const float dy = ptY - pixY;
     const float distYsq = dy * dy;

     const float dx1 = ptX - pixX1;
     const float dx2 = ptX - pixX2;
     const float dx3 = ptX - pixX3;
     const float dx4 = ptX - pixX4;

     accum1 += native_exp(-(dx1 * dx1 + distYsq)/radiussq/2);
     accum2 += native_exp(-(dx2 * dx2 + distYsq)/radiussq/2);
     accum3 += native_exp(-(dx3 * dx3 + distYsq)/radiussq/2);
     accum4 += native_exp(-(dx4 * dx4 + distYsq)/radiussq/2);
   }

   // SQRT_2PIRADIUS is written as an unsuffixed (double) literal. Narrow it to
   // float once so the normalisation is done in single precision regardless of
   // the device's double-precision support.
   const float norm = (float)SQRT_2PIRADIUS;

   pix[x1+y*imageWidth] += accum1/norm;
   pix[x2+y*imageWidth] += accum2/norm;
   pix[x3+y*imageWidth] += accum3/norm;
   pix[x4+y*imageWidth] += accum4/norm;
}


/*
 * Sets to zero the pixels that one work-item is responsible for:
 * PIX_PER_THREAD entries of a single image row, spaced TILE_SIZE_X columns
 * apart, addressed as pix[x + y * imageWidth].
 *
 * Parameters:
 *   pix        - image buffer to clear
 *   imageWidth - row stride of pix, in elements
 *
 * No bounds checks are performed; the buffer must contain every index touched
 * by the launched NDRange.
 */
__kernel __attribute__((reqd_work_group_size(TILE_SIZE_X, TILE_SIZE_Y, 1)))
void zeroImageKernel( __global float* pix, int imageWidth)
{
  const int row = get_group_id(1)*TILE_SIZE_Y + get_local_id(1);
  const int firstCol = get_group_id(0)*TILE_SIZE_X*PIX_PER_THREAD + get_local_id(0);
  const int rowOffset = row*imageWidth;

  // Walk the columns owned by this work-item, TILE_SIZE_X apart.
  for (int i = 0; i < PIX_PER_THREAD; ++i)
  {
    const int col = firstCol + i*TILE_SIZE_X;
    pix[col + rowOffset] = 0.0f;
  }
}



