/*
 * otbGPUNDVIFilter.cu
 *
 *  Created on: Apr 26, 2010
 *      Author: christop
 */

#include <cstdio>
#include <cstdlib>

#include <cuda.h>
#include <driver_types.h>
#include <vector_types.h>

// Thread-block dimensions used for the kernel launch:
// each block is TILE_SIZE_X threads wide (x) and TILE_SIZE_Y threads high (y).
#define TILE_SIZE_X 8
#define TILE_SIZE_Y 16

/*
 * Computes one NDVI value per pixel: (nir - red) / (nir + red).
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel; thread (x, y)
 * handles the pixel at column x, row y. No shared memory is used.
 *
 * pix         device buffer, pixel-interleaved: numBands consecutive floats
 *             per pixel, rows of imageWidth pixels
 * ndvi        device buffer receiving one float per pixel, rows of imageWidth
 * numBands    number of floats stored per pixel in pix
 * indexRed    offset of the red band inside a pixel
 * indexNIR    offset of the near-infrared band inside a pixel
 * imageWidth  number of pixels per row; threads with x >= imageWidth do nothing
 * imageHeight number of rows; threads with y >= imageHeight do nothing.
 *             A negative value (the default) disables the row check, for
 *             launches whose grid is known to cover the rows exactly.
 *
 * There is no special handling of nir + red == 0: the result is whatever the
 * floating-point division produces.
 */
__global__ void ndviKernel(float* pix, float* ndvi, int numBands, int indexRed, int indexNIR, int imageWidth, int imageHeight = -1)
{
  const int x = blockIdx.x * blockDim.x + threadIdx.x;
  const int y = blockIdx.y * blockDim.y + threadIdx.y;

  // The grid is rounded up to whole blocks, so threads past the image edge
  // must not touch memory.
  if (x >= imageWidth || (imageHeight >= 0 && y >= imageHeight))
    return;

  // Index in size_t: numBands * width * height can exceed the range of int.
  const size_t pixel = (size_t) y * (size_t) imageWidth + (size_t) x;
  const float* bands = pix + pixel * (size_t) numBands;

  const float nir = bands[indexNIR];
  const float red = bands[indexRed];
  ndvi[pixel] = (nir - red) / (nir + red);
}

/* Reports a failed CUDA runtime call on stderr; returns true when status is cudaSuccess. */
static bool ndviCudaOk(cudaError_t status, const char* what)
{
  if (status == cudaSuccess)
    return true;

  fprintf(stderr, "ndviProcessing: %s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

/*
 * Host entry point: computes the NDVI image of a pixel-interleaved raster on
 * the GPU.
 *
 * pix         host buffer of numBands * imageWidth * imageHeight floats
 * ndvi        host buffer of imageWidth * imageHeight floats, filled on success
 * numBands    number of bands per pixel
 * indexRed    band offset of the red channel, in [0, numBands)
 * indexNIR    band offset of the near-infrared channel, in [0, numBands)
 * imageWidth  image width in pixels
 * imageHeight image height in pixels
 *
 * Image dimensions do not need to be multiples of the block size.
 *
 * Error handling: if the CUDA driver cannot be initialized the process is
 * terminated (now with a failure exit status). Any other failure is reported
 * on stderr, device memory is released and the function returns, leaving
 * ndvi either untouched or not fully written.
 */
extern "C" void ndviProcessing(float* pix, float* ndvi, int numBands, int indexRed, int indexNIR, int imageWidth, int imageHeight)
{
  // Initialize
  if (cuInit(0) != CUDA_SUCCESS)
  {
    fprintf(stderr, "ndviProcessing: cuInit failed\n");
    exit(EXIT_FAILURE);
  }

  // Nothing to compute for an empty image.
  if (numBands <= 0 || imageWidth <= 0 || imageHeight <= 0)
    return;

  // An out-of-range band offset would make the kernel read a neighbouring
  // pixel or run past the end of the input buffer.
  if (indexRed < 0 || indexRed >= numBands || indexNIR < 0 || indexNIR >= numBands)
  {
    fprintf(stderr, "ndviProcessing: band index out of range (red=%d, nir=%d, bands=%d)\n",
            indexRed, indexNIR, numBands);
    return;
  }

  // Sizes are computed in size_t so that large rasters do not overflow int.
  const size_t pixelCount  = (size_t) imageWidth * (size_t) imageHeight;
  const size_t outputBytes = pixelCount * sizeof(float);
  const size_t inputBytes  = outputBytes * (size_t) numBands;

  float* pixD  = NULL;
  float* ndviD = NULL;

  bool ok = ndviCudaOk(cudaMalloc((void**) &pixD, inputBytes), "cudaMalloc (input)")
         && ndviCudaOk(cudaMalloc((void**) &ndviD, outputBytes), "cudaMalloc (output)")
         && ndviCudaOk(cudaMemcpy(pixD, pix, inputBytes, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)");

  if (ok)
  {
    // Round the grid up so partial tiles at the right/bottom edges are
    // processed; the kernel discards threads outside the image.
    dim3 dimBlock(TILE_SIZE_X, TILE_SIZE_Y);
    dim3 dimGrid((imageWidth - 1) / TILE_SIZE_X + 1, (imageHeight - 1) / TILE_SIZE_Y + 1);

    ndviKernel<<<dimGrid, dimBlock>>>(pixD, ndviD, numBands, indexRed, indexNIR, imageWidth, imageHeight);

    // Launch-configuration errors show up in cudaGetLastError(); faults
    // during execution show up at the synchronization point.
    ok = ndviCudaOk(cudaGetLastError(), "ndviKernel launch")
      && ndviCudaOk(cudaDeviceSynchronize(), "ndviKernel execution")
      && ndviCudaOk(cudaMemcpy(ndvi, ndviD, outputBytes, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");
  }

  // Always release device memory; cudaFree(NULL) is a no-op.
  cudaFree(ndviD);
  cudaFree(pixD);
}
