/*
 * otbGPUNDVIFilter.cu
 *
 *  Created on: Apr 26, 2010
 *      Author: christop
 */

#include <climits>
#include <cstdio>
#include <cstdlib>
#include <cuda.h>
#include <driver_types.h>
#include <vector_types.h>

// Thread-block dimensions used by samProcessing when launching samKernel:
// TILE_SIZE_X threads along x and TILE_SIZE_Y threads along y.
#define TILE_SIZE_X 8
#define TILE_SIZE_Y 16

// Capacity (in floats) of the ref_c constant-memory array below.
#define MAX_BANDS 256
// Reference vector, one value per band, held in constant memory.
// samProcessing fills it from its `ref` argument with cudaMemcpyToSymbol and
// samKernel reads elements [0, numBands) from it.
__constant__ float ref_c[MAX_BANDS];

/**
 * Report a CUDA runtime status on stderr when it signals a failure.
 *
 * \param status  value returned by a CUDA runtime call
 * \param what    short description of the operation, used in the message
 * \return true when \a status is an error, false when it is cudaSuccess
 */
static bool samCudaFailed(cudaError_t status, const char* what)
{
  if (status == cudaSuccess)
    return false;

  fprintf(stderr, "samProcessing: %s failed: %s\n", what, cudaGetErrorString(status));
  return true;
}

/**
 * Compute, for one pixel per thread, the angle between the pixel vector and
 * the reference vector stored in ref_c:
 *   sam = acos( <ref, pix> / sqrt(|ref|^2 * |pix|^2) )
 *
 * Launch layout: 2D grid of 2D blocks, thread (x, y) handles pixel (x, y).
 * Threads falling outside the image return immediately, so the grid may be
 * rounded up to a whole number of blocks.
 *
 * \param pix          device buffer, band-interleaved by pixel: band i of
 *                     pixel (x, y) is at numBands*(x + y*imageWidth) + i
 * \param sam          device output buffer, one float per pixel at
 *                     x + y*imageWidth
 * \param numBands     number of bands per pixel; must not exceed MAX_BANDS
 * \param imageWidth   number of pixels per row
 * \param imageHeight  number of rows; the default disables the row check so
 *                     that existing four-argument launches keep working
 *
 * A pixel (or reference) with zero norm yields NaN, since the angle is
 * undefined in that case.
 */
__global__ void samKernel(float* pix, float* sam, int numBands,  int imageWidth, int imageHeight = INT_MAX)
{
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;

  // The grid is rounded up to whole blocks: skip threads outside the image.
  if (x >= imageWidth || y >= imageHeight)
    return;

  // 64-bit flat index: numBands*width*height can exceed the int range.
  const size_t pixelIndex = (size_t) y * (size_t) imageWidth + (size_t) x;
  const float* spectrum = pix + pixelIndex * (size_t) numBands;

  float refnorm=0;
  float pixnorm=0;
  float dotprod=0;
  for (int i = 0; i<numBands; i++)
    {
      const float pixValue = spectrum[i];
      refnorm += ref_c[i]*ref_c[i];
      pixnorm += pixValue*pixValue;
      dotprod += ref_c[i]*pixValue;
    }

  float cosAngle = dotprod/sqrtf(refnorm*pixnorm);
  // Rounding can push the cosine slightly outside [-1, 1], which would make
  // acosf return NaN for (anti-)parallel vectors. Plain comparisons are used
  // so that a NaN coming from a zero norm is left untouched.
  if (cosAngle > 1.0f)
    cosAngle = 1.0f;
  else if (cosAngle < -1.0f)
    cosAngle = -1.0f;

  sam[pixelIndex] = acosf(cosAngle);
}

/**
 * Host entry point: compute the per-pixel angle to a reference vector on the
 * GPU.
 *
 * \param pix          host input, imageWidth*imageHeight*numBands floats,
 *                     band-interleaved by pixel (see samKernel)
 * \param sam          host output, imageWidth*imageHeight floats
 * \param numBands     number of bands per pixel, in [1, MAX_BANDS]
 * \param ref          host reference vector, numBands floats
 * \param imageWidth   number of pixels per row
 * \param imageHeight  number of rows
 *
 * Error handling:
 *  - if the CUDA driver cannot be initialised the process is terminated
 *    with EXIT_FAILURE;
 *  - an out-of-range numBands or any CUDA runtime failure is reported on
 *    stderr, the device buffers are released, and the function returns; the
 *    contents of \a sam are then undefined;
 *  - an empty image (non-positive width or height) returns without doing
 *    anything.
 */
extern "C" void samProcessing(float* pix, float* sam, int numBands, const float* ref, int imageWidth, int imageHeight)
{
  // Initialize
  if (cuInit(0) != CUDA_SUCCESS)
    {
      fprintf(stderr, "samProcessing: cuInit failed\n");
      exit (EXIT_FAILURE);
    }

  // ref_c only holds MAX_BANDS values; more bands would overrun it.
  if (numBands <= 0 || numBands > MAX_BANDS)
    {
      fprintf(stderr, "samProcessing: numBands=%d is outside [1, %d]\n", numBands, MAX_BANDS);
      return;
    }

  if (imageWidth <= 0 || imageHeight <= 0)
    return;

  // Sizes are computed in size_t to avoid int overflow on large images.
  const size_t numPixels = (size_t) imageWidth * (size_t) imageHeight;
  const size_t pixBytes  = numPixels * (size_t) numBands * sizeof(float);
  const size_t samBytes  = numPixels * sizeof(float);
  const size_t refBytes  = (size_t) numBands * sizeof(float);

  float* pixD = NULL;
  float* samD = NULL;

  // Short-circuit evaluation stops at the first failing step.
  bool ok = !samCudaFailed(cudaMalloc((void**) &pixD, pixBytes), "cudaMalloc(pix)")
         && !samCudaFailed(cudaMalloc((void**) &samD, samBytes), "cudaMalloc(sam)")
         && !samCudaFailed(cudaMemcpy(pixD, pix, pixBytes, cudaMemcpyHostToDevice), "upload of pix")
         && !samCudaFailed(cudaMemcpyToSymbol(ref_c, ref, refBytes, 0, cudaMemcpyHostToDevice), "upload of ref");

  if (ok)
    {
      dim3 dimBlock( TILE_SIZE_X, TILE_SIZE_Y);
      // Round up so that images whose size is not a multiple of the tile
      // size are fully covered; samKernel discards the surplus threads.
      dim3 dimGrid((imageWidth  + TILE_SIZE_X - 1)/TILE_SIZE_X,
                   (imageHeight + TILE_SIZE_Y - 1)/TILE_SIZE_Y);

      samKernel<<<dimGrid,dimBlock>>>(pixD, samD, numBands, imageWidth, imageHeight);

      ok = !samCudaFailed(cudaGetLastError(), "samKernel launch")
        && !samCudaFailed(cudaDeviceSynchronize(), "samKernel execution")
        && !samCudaFailed(cudaMemcpy(sam, samD, samBytes, cudaMemcpyDeviceToHost), "download of sam");
    }

  // cudaFree accepts a null pointer, so this is safe on every path.
  cudaFree(samD);
  cudaFree(pixD);

  if (!ok)
    fprintf(stderr, "samProcessing: aborted, contents of sam are undefined\n");
}
