/*
 * otbGPUGPUPointSetToDensityImageFilter.cu
 *
 *  Created on: 21-Apr-2010
 *      Author: christop
 */

#include <cassert>
#include <cstdio>
#include <cstdlib>

#include <cuda.h>
#include <driver_types.h>
#include <vector_types.h>

//Block thread size
#define TILE_SIZE_X 16
#define TILE_SIZE_Y 8

/**
 * Computes normalized cross-correlation scores between a patch of `fixed`
 * and shifted patches of `moving`, for one tile of the search window.
 *
 * Launch layout (as used by fineCorrelationProcessing):
 *  - one thread block per patch position; (blockIdx.x, blockIdx.y) is the
 *    column/row offset of the patch inside both images;
 *  - one thread per search offset inside the tile; the offset handled by a
 *    thread is (nx * blockDim.x + threadIdx.x, ny * blockDim.y + threadIdx.y).
 *    Threads whose offset falls outside the search window do nothing.
 *
 * For every (patch position, search offset) pair the kernel accumulates
 *   sum(f * m) / sqrt(sum(f * f) * sum(m * m))
 * over a patchWindowSize x patchWindowSize window and ADDS it to
 *   corr[searchX + searchWindowSize * searchY
 *        + (blockIdx.x + gridDim.x * blockIdx.y) * searchWindowSize^2].
 * Because the result is accumulated with +=, `corr` must be zeroed before the
 * first launch. Nothing is written when the cross product sum is not above
 * 1e-5, which also keeps the division away from a zero norm.
 *
 * \param fixed            device pointer to the fixed image samples
 * \param moving           device pointer to the moving image samples
 * \param corr             device pointer to the output scores (see layout above)
 * \param nx, ny           index of the search-window tile processed by this launch
 * \param fixWidth         row pitch, in elements, used to index `fixed`
 * \param movWidth         row pitch, in elements, used to index `moving`
 * \param fixStride        constant element offset added to every `fixed` index
 * \param movStride        constant element offset added to every `moving` index
 * \param patchWindowSize  side length of the correlated patch, in samples
 * \param searchWindowSize side length of the search window, in offsets
 */
__global__ void fineCorrelationKernel(float * fixed, float * moving, float * corr,
                                      int nx, int ny, // tile index inside the search window
                                      int fixWidth, int movWidth,
                                      int fixStride, int movStride,
                                      int patchWindowSize, int searchWindowSize)
{
  // Search offset handled by this thread for tile (nx, ny).
  const unsigned int searchX = nx * blockDim.x + threadIdx.x;
  const unsigned int searchY = ny * blockDim.y + threadIdx.y;

  if ((searchX < searchWindowSize) && (searchY < searchWindowSize))
    {
    float crossProductSum = 0.0f;
    float fSquareSum = 0.0f;
    float mSquareSum = 0.0f;

    // The patch is exactly patchWindowSize samples wide and high. An inclusive
    // upper bound would read one extra row and column, and for the last block
    // of a row that extra column lies past the end of the `fixed` row.
    for (int j = 0; j < patchWindowSize; j++)
      {
      // Index of the first sample of patch row j in each image.
      const unsigned int fixedRow = fixStride + fixWidth * (j + blockIdx.y) + blockIdx.x;
      const unsigned int movingRow = movStride + searchX + movWidth * (j + searchY + blockIdx.y) + blockIdx.x;

      for (int i = 0; i < patchWindowSize; i++)
        {
        const float vfixed = fixed[fixedRow + i];
        const float vmoving = moving[movingRow + i];

        crossProductSum += vfixed * vmoving;
        fSquareSum += vfixed * vfixed;
        mSquareSum += vmoving * vmoving;
        }
      }

    if (crossProductSum > 0.00001f)
      {
      const float norm = sqrtf(fSquareSum * mSquareSum);

      // Index in size_t so large grids do not wrap around in 32-bit arithmetic.
      const size_t windowArea = (size_t) searchWindowSize * searchWindowSize;
      const size_t blockLinear = blockIdx.x + (size_t) gridDim.x * blockIdx.y;
      const size_t offsetInWindow = searchX + (size_t) searchWindowSize * searchY;

      corr[blockLinear * windowArea + offsetInWindow] += crossProductSum / norm;
      }

    }
}

// Reports a failed CUDA runtime call made by fineCorrelationProcessing on
// stderr; returns true when `status` is cudaSuccess.
static bool fineCorrelationCallOk(cudaError_t status, const char * what)
{
  if (status == cudaSuccess)
    {
    return true;
    }
  fprintf(stderr, "fineCorrelationProcessing: %s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

/**
 * Runs the fine correlation kernel over the whole search window and copies
 * the scores back to the host.
 *
 * The grid is (fixWidth - 2 * patchRadius) x (nBlocks / that width) blocks of
 * TILE_SIZE_X x TILE_SIZE_Y threads. The search window of side
 * 2 * searchRadius + 1 is covered by launching the kernel once per tile.
 * The device output buffer is zeroed first because the kernel accumulates
 * into it. Patch positions beyond the last full grid row (when nBlocks is not
 * a multiple of the width) therefore stay at zero.
 *
 * On invalid arguments or on any CUDA failure a message is written to stderr
 * and the function returns without touching `corr` any further.
 *
 * \param fixedD, movingD  device buffers holding the fixed and moving images
 * \param corrD            device buffer of nBlocks * (2*searchRadius+1)^2 floats
 * \param fixWidth         row pitch of the fixed image, in elements
 * \param movWidth         row pitch of the moving image, in elements
 * \param fixStride        constant element offset applied to the fixed image
 * \param movStride        constant element offset applied to the moving image
 * \param corr             host buffer receiving the content of corrD
 * \param nBlocks          number of patch positions stored in corrD
 * \param patchRadius      the patch side is 2 * patchRadius + 1
 * \param searchRadius     the search window side is 2 * searchRadius + 1
 */
extern "C" void fineCorrelationProcessing(float * fixedD, float * movingD, float * corrD,
                                          int fixWidth, int movWidth,
                                          int fixStride, int movStride,
                                          float* corr, int nBlocks, int patchRadius, int searchRadius)
{
  const int patchWindowSize = 2 * patchRadius + 1;
  const int searchWindowSize = 2 * searchRadius + 1;
  const int width = fixWidth - 2 * patchRadius;
  assert(nBlocks >= width);

  // The assert disappears in release builds, and a non-positive width would
  // divide by zero or build a nonsensical grid below.
  if (patchRadius < 0 || searchRadius < 0 || width <= 0 || nBlocks < width)
    {
    fprintf(stderr,
            "fineCorrelationProcessing: invalid geometry (fixWidth=%d, patchRadius=%d, searchRadius=%d, nBlocks=%d)\n",
            fixWidth, patchRadius, searchRadius, nBlocks);
    return;
    }

  const dim3 dimBlock(TILE_SIZE_X, TILE_SIZE_Y);
  const dim3 dimGrid(width, nBlocks / width);

  // Widen before multiplying so the byte count cannot overflow int.
  const size_t corrBytes = (size_t) nBlocks * searchWindowSize * searchWindowSize * sizeof(float);

  // The kernel accumulates with +=, so start from a zeroed buffer.
  if (!fineCorrelationCallOk(cudaMemset(corrD, 0, corrBytes), "cudaMemset(corrD)"))
    {
    return;
    }

  // Number of tiles needed to cover the search window in each direction.
  const int timeX = (searchWindowSize + TILE_SIZE_X - 1) / TILE_SIZE_X;
  const int timeY = (searchWindowSize + TILE_SIZE_Y - 1) / TILE_SIZE_Y;

  for (int ny = 0; ny < timeY; ny++)
    {
    for (int nx = 0; nx < timeX; nx++)
      {
      fineCorrelationKernel<<<dimGrid, dimBlock>>>(fixedD, movingD, corrD, nx, ny, fixWidth, movWidth,
                                                   fixStride, movStride, patchWindowSize, searchWindowSize);
      // Launch-configuration errors are only visible through cudaGetLastError.
      if (!fineCorrelationCallOk(cudaGetLastError(), "kernel launch"))
        {
        return;
        }
      }
    }

  // Errors raised while the kernels execute surface at the next synchronizing call.
  if (!fineCorrelationCallOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
    {
    return;
    }

  fineCorrelationCallOk(cudaMemcpy(corr, corrD, corrBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(corr)");
}

// Terminates the process with a diagnostic on stderr when a CUDA runtime call
// made by loadIntoGPU did not return cudaSuccess.
static void loadIntoGPUCheck(cudaError_t status, const char * what)
{
  if (status != cudaSuccess)
    {
    fprintf(stderr, "loadIntoGPU: %s failed: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
    }
}

/**
 * Initializes CUDA, allocates the three device buffers and uploads both images.
 *
 * On return *fixedD and *movingD hold copies of `fixed` and `moving`, and
 * *corrD is an uninitialized buffer of nBlocks * (2*searchRadius+1)^2 floats.
 * The buffers are released with freeGPU().
 *
 * Any failure (driver initialization, allocation or copy) is fatal: a message
 * is printed on stderr and the process exits with a non-zero status.
 *
 * \param fixed, moving              host images to upload
 * \param fixedD, movingD, corrD     receive the allocated device pointers
 * \param fixedWidth, fixedHeight    dimensions of `fixed`, in samples
 * \param movingWidth, movingHeight  dimensions of `moving`, in samples
 * \param nBlocks                    number of patch positions stored in *corrD
 * \param searchRadius               the search window side is 2 * searchRadius + 1
 */
extern "C" void loadIntoGPU(float* fixed, float* moving, float** fixedD, float** movingD, float** corrD,
                            int fixedWidth, int fixedHeight, int movingWidth, int movingHeight, int nBlocks,
                            int searchRadius)
{
  const int searchWindowSize = 2 * searchRadius + 1;

  // Widen before multiplying so large images do not overflow int.
  const size_t fixedBytes = (size_t) fixedWidth * fixedHeight * sizeof(float);
  const size_t movingBytes = (size_t) movingWidth * movingHeight * sizeof(float);
  const size_t corrBytes = (size_t) nBlocks * searchWindowSize * searchWindowSize * sizeof(float);

  // Initialize
  if (cuInit(0) != CUDA_SUCCESS)
    {
    fprintf(stderr, "loadIntoGPU: cuInit failed\n");
    // A failed initialization must not be reported as a successful exit.
    exit(EXIT_FAILURE);
    }

  loadIntoGPUCheck(cudaMalloc((void**) fixedD, fixedBytes), "cudaMalloc(fixedD)");
  loadIntoGPUCheck(cudaMemcpy(*fixedD, fixed, fixedBytes, cudaMemcpyHostToDevice), "cudaMemcpy(fixedD)");

  loadIntoGPUCheck(cudaMalloc((void**) movingD, movingBytes), "cudaMalloc(movingD)");
  loadIntoGPUCheck(cudaMemcpy(*movingD, moving, movingBytes, cudaMemcpyHostToDevice), "cudaMemcpy(movingD)");

  loadIntoGPUCheck(cudaMalloc((void**) corrD, corrBytes), "cudaMalloc(corrD)");

}

/**
 * Releases the three device buffers allocated by loadIntoGPU().
 *
 * Every buffer is freed even if an earlier cudaFree fails; failures are
 * reported on stderr instead of being silently dropped.
 *
 * \param fixedD   device buffer of the fixed image
 * \param movingD  device buffer of the moving image
 * \param corrD    device buffer of the correlation scores
 */
extern "C" void freeGPU(float* fixedD, float* movingD, float* corrD)
{
  float * const buffers[] = { fixedD, movingD, corrD };
  const char * const names[] = { "fixedD", "movingD", "corrD" };

  for (int k = 0; k < 3; ++k)
    {
    const cudaError_t status = cudaFree(buffers[k]);
    if (status != cudaSuccess)
      {
      fprintf(stderr, "freeGPU: cudaFree(%s) failed: %s\n", names[k], cudaGetErrorString(status));
      }
    }
}
