/*
 * otbOCLPointSetToDensityImageFilter.txx
 *
 *  Created on: 14-Apr-2010
 *      Author: christop
 */
#ifndef __otbOCLPointSetToDensityImageFilter_txx
#define __otbOCLPointSetToDensityImageFilter_txx

#include "otbOCLPointSetToDensityImageFilter.h"

#include <cstdio>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

//Only valid for nvidia stuff???
#include <oclUtils.h>

//FIXME Copied from .cl, better if all was in one place
//Block thread size
//(used by LaunchKernel as the 2D local work size passed to clEnqueueNDRangeKernel)
#define TILE_SIZE_X 8
#define TILE_SIZE_Y 16

//Chunk size for the point set
//(LaunchKernel uploads the point set to the device CHUNK_SIZE points at a time,
// two floats per point, and launches the density kernel once per chunk)
#define CHUNK_SIZE 4096

//number of pixel processed per thread
//(LaunchKernel divides the image width by this value to get the global work size in x)
#define PIX_PER_THREAD 4

namespace otb
{

/*-------------------------------------------------------
 * BeforeThreadedGenerateData
 --------------------------------------------------------*/
/**
 * Set up the OpenCL state used by the filter: select a platform, enumerate
 * its devices, create a context and command queue on the first device, then
 * load and build the kernel source and create the two kernel objects
 * (pointDensityKernel and zeroImageKernel).
 *
 * OpenCL failures are reported through oclCheckError. A missing or empty
 * kernel source file, or a platform without any device, raises
 * std::runtime_error before any OpenCL object has been created.
 */
template <class TInputPointSet, class TOutputImage>
void
OCLPointSetToDensityImageFilter<TInputPointSet, TOutputImage>
::BeforeThreadedGenerateData()
{
  /**
   * Load program sources
   *
   * Done first so that an unreadable file is detected before any OpenCL
   * resource is allocated (nothing to release on that error path).
   */
  // NOTE(review): the kernel source path is hard-coded to a developer's home
  // directory; it should come from configuration or an install location.
  const char *filename = "/home/christop/OTB-HPC/Code/src/otbOCLPointSetToDensityImageFilter.cl";
  std::ifstream file;
  file.open(filename);
  if (!file.is_open())
  {
    throw std::runtime_error(std::string("Unable to open OpenCL source file: ") + filename);
  }

  // get length of file:
  file.seekg (0, std::ios::end);
  const std::streamoff fileLength = file.tellg();
  file.seekg (0, std::ios::beg);
  if (fileLength <= 0)
  {
    throw std::runtime_error(std::string("OpenCL source file is empty or unreadable: ") + filename);
  }

  // allocate memory for sources (released automatically on every exit path)
  std::vector<char> srcOCLpointset(static_cast<size_t>(fileLength));

  // read data as a block; gcount() is the number of characters really read
  file.read (&srcOCLpointset[0], static_cast<std::streamsize>(fileLength));
  size_t kernelLength = static_cast<size_t>(file.gcount());
  file.close();
  if (kernelLength == 0)
  {
    throw std::runtime_error(std::string("Nothing could be read from OpenCL source file: ") + filename);
  }

  /**
   * Initialize GPU
   */
  cl_int ciErrNum;
  char cBuffer[1024];
  std::string msgString;

  std::cout << "Initializing GPU..." << std::endl;

  cl_platform_id clSelectedPlatformID = NULL;
  ciErrNum = oclGetPlatformID (&clSelectedPlatformID);
  oclCheckError(ciErrNum, CL_SUCCESS);
  ciErrNum = clGetPlatformInfo (clSelectedPlatformID, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
  oclCheckError(ciErrNum, CL_SUCCESS);
  msgString += "PlatformInfo: ";
  msgString += cBuffer;
  msgString += "\n";

  cl_uint ciDeviceCount = 0;
  ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_ALL, 0, NULL, &ciDeviceCount);
  oclCheckError(ciErrNum, CL_SUCCESS);
  std::cout <<  "Number of devices: ";
  std::cout <<  ciDeviceCount;
  std::cout <<  "\n";
  if (ciDeviceCount == 0)
  {
    throw std::runtime_error("No OpenCL device available on the selected platform");
  }

  std::vector<cl_device_id> devices(ciDeviceCount);
  ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_ALL, ciDeviceCount, &devices[0], NULL);
  oclCheckError(ciErrNum, CL_SUCCESS);

  // Report the name of every device actually present (the device list may
  // hold a single entry, so no index is assumed to exist beyond the count).
  for (cl_uint deviceIndex = 0; deviceIndex < ciDeviceCount; ++deviceIndex)
  {
    ciErrNum = clGetDeviceInfo(devices[deviceIndex], CL_DEVICE_NAME, sizeof(cBuffer), cBuffer, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);
    std::cout <<  "clGetDeviceInfo " << deviceIndex << ": ";
    std::cout << cBuffer;
    std::cout <<  "\n";
  }

  GPUContext = clCreateContext(0, 1, &devices[0], NULL, NULL, &ciErrNum);//Take only the first device
  oclCheckError(ciErrNum, CL_SUCCESS);
  GPUCommandQueue = clCreateCommandQueue(GPUContext, devices[0], 0, &ciErrNum);
  oclCheckError(ciErrNum, CL_SUCCESS);

  std::cout << msgString << std::endl;

  std::cout << "Read openCL program: " << kernelLength << std::endl;

  /**
   * Build program and create kernels
   */
  std::cout << "Building program..." << std::endl;
  const char *sourcePtr = &srcOCLpointset[0];
  pointSetProgram = clCreateProgramWithSource(GPUContext, 1, &sourcePtr, &kernelLength, &ciErrNum);
  oclCheckError(ciErrNum, CL_SUCCESS);
  ciErrNum = clBuildProgram(pointSetProgram, 0, NULL, NULL, NULL, NULL);

  // Print the build log before checking the build status so that compiler
  // diagnostics are visible when the build fails. Fetching the log is
  // best-effort: a failure here must not hide the real build status.
  size_t ret_val_size = 0;
  const cl_int logStatus = clGetProgramBuildInfo(pointSetProgram, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
  std::vector<char> build_log(ret_val_size + 1, '\0');
  if (logStatus == CL_SUCCESS && ret_val_size > 0)
  {
    clGetProgramBuildInfo(pointSetProgram, devices[0], CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
    build_log[ret_val_size] = '\0';
  }
  std::cout << "BUILD LOG: "<< std::endl;
  std::cout << &build_log[0] << std::endl;

  oclCheckError(ciErrNum, CL_SUCCESS);

  pointDensityKernel = clCreateKernel(pointSetProgram, "pointDensityKernel", &ciErrNum);
  oclCheckError(ciErrNum, CL_SUCCESS);

  zeroImageKernel = clCreateKernel(pointSetProgram, "zeroImageKernel", &ciErrNum);
  oclCheckError(ciErrNum, CL_SUCCESS);
}

/*-------------------------------------------------------
 * AfterThreadedGenerateData
 --------------------------------------------------------*/
/**
 * Release the OpenCL objects held by the filter, in this order: both
 * kernels, the program, the command queue and finally the context.
 * Every release status is passed to oclCheckError.
 */
template <class TInputPointSet, class TOutputImage>
void
OCLPointSetToDensityImageFilter<TInputPointSet, TOutputImage>
::AfterThreadedGenerateData()
{
  // Kernels
  cl_int status = clReleaseKernel(zeroImageKernel);
  oclCheckError(status, CL_SUCCESS);

  status = clReleaseKernel(pointDensityKernel);
  oclCheckError(status, CL_SUCCESS);

  // Program the kernels were created from
  status = clReleaseProgram(pointSetProgram);
  oclCheckError(status, CL_SUCCESS);

  // Command queue, then the context
  status = clReleaseCommandQueue(GPUCommandQueue);
  oclCheckError(status, CL_SUCCESS);

  status = clReleaseContext(GPUContext);
  oclCheckError(status, CL_SUCCESS);
}

/*-------------------------------------------------------
 * ThreadedGenerateData
 --------------------------------------------------------*/
/**
 * Compute the density image for one thread region.
 *
 * The thread region is padded with PadBlockRegion. When the padded region
 * differs from the output requested region, the processing is done in a
 * separately allocated intermediate image and the pixels of
 * outputRegionForThread are copied back to the output afterwards; otherwise
 * the output image itself is used as the processing buffer.
 *
 * NOTE(review): m_ExtendedRegion and m_OutputIntermediatePtr are members
 * written here and read by DoProcessing(); if this method really runs
 * concurrently they look shared between threads -- confirm.
 * NOTE(review): the output is re-allocated from inside this method -- confirm
 * this is intended when several regions are processed.
 */
template <class TInputPointSet, class TOutputImage>
void
OCLPointSetToDensityImageFilter<TInputPointSet, TOutputImage>
::ThreadedGenerateData(
    const   OutputImageRegionType&     outputRegionForThread,
    int   threadId)
{
  // Delay proportional to the thread id (presumably a debugging aid to
  // stagger the threads -- TODO confirm), then trace the region start.
  sleep(threadId);
  std::cerr << threadId << " -> " << outputRegionForThread.GetIndex() << std::endl;

  typename ImageType::Pointer outputPtr = dynamic_cast<ImageType *> (this->itk::ProcessObject::GetOutput(0));

  // Padding is needed whenever the padded region is not the requested region
  m_ExtendedRegion = PadBlockRegion(outputRegionForThread);
  const bool padding = (m_ExtendedRegion != this->GetOutput()->GetRequestedRegion());

  if (!padding)
  {
    // Process directly inside the output image
    outputPtr->SetBufferedRegion(outputRegionForThread);
    outputPtr->Allocate();
    m_OutputIntermediatePtr = outputPtr;
  }
  else
  {
    // Process inside a temporary image covering the padded region
    m_OutputIntermediatePtr = ImageType::New();
    m_OutputIntermediatePtr->CopyInformation(outputPtr);
    m_OutputIntermediatePtr->SetBufferedRegion(m_ExtendedRegion);
    m_OutputIntermediatePtr->Allocate();
  }

  //Superclass::GenerateData();
  //Do processing here
  this->DoProcessing();

  //handle the output
  if (!padding)
  {
    outputPtr = m_OutputIntermediatePtr;
    return;
  }

  // Copy the un-padded part of the intermediate image back to the output
  outputPtr->SetBufferedRegion(outputRegionForThread);
  outputPtr->Allocate();
  ImageConstIterator srcIt(m_OutputIntermediatePtr, outputRegionForThread);
  ImageIterator dstIt(outputPtr, outputRegionForThread);
  srcIt.GoToBegin();
  dstIt.GoToBegin();
  while (!srcIt.IsAtEnd())
  {
    dstIt.Set(srcIt.Get());
    ++srcIt;
    ++dstIt;
  }
}

/**
 * Return a copy of \a region whose size along dimensions 0 and 1 is rounded
 * up to the next multiple of m_BlkSize[0] and m_BlkSize[1] respectively.
 * The region index is left untouched. The size is printed before and after
 * the padding.
 */
template<class TInputPointSet, class TOutputImage>
typename OCLPointSetToDensityImageFilter<TInputPointSet, TOutputImage>::RegionType
OCLPointSetToDensityImageFilter<TInputPointSet, TOutputImage>
::PadBlockRegion(RegionType region)
{
  typename RegionType::SizeType paddedSize = region.GetSize();
  std::cout << "Size before padding: " << paddedSize << std::endl;

  for (unsigned int dim = 0; dim < 2; ++dim)
  {
    // Already a multiple of the block size: nothing to add
    if ((paddedSize[dim] % m_BlkSize[dim]) == 0)
    {
      continue;
    }
    paddedSize[dim] = paddedSize[dim] + m_BlkSize[dim] - (paddedSize[dim] % m_BlkSize[dim]);
  }

  std::cout << "Size after padding: " << paddedSize << std::endl;
  region.SetSize(paddedSize);
  return region;
}

/**
 * Flatten the filter state into plain C values and hand them to LaunchKernel.
 *
 * Reads the buffer pointer, buffered region (index and size) and spacing of
 * m_OutputIntermediatePtr, the radius, and copies the first two coordinates
 * of every input point into a contiguous array laid out as
 * x0, y0, x1, y1, ...
 *
 * The result is written by LaunchKernel into the buffer of
 * m_OutputIntermediatePtr.
 */
template<class TInputPointSet, class TOutputImage>
void OCLPointSetToDensityImageFilter<TInputPointSet, TOutputImage>
::DoProcessing()
{
  //Translate to simple CPU structure
  float * pix = m_OutputIntermediatePtr->GetBufferPointer();
  int originX = m_OutputIntermediatePtr->GetBufferedRegion().GetIndex()[0];
  int originY = m_OutputIntermediatePtr->GetBufferedRegion().GetIndex()[1];
  float spacingX = m_OutputIntermediatePtr->GetSpacing()[0];
  float spacingY = m_OutputIntermediatePtr->GetSpacing()[1];
  int imageWidth = m_OutputIntermediatePtr->GetBufferedRegion().GetSize()[0];
  int imageHeight = m_OutputIntermediatePtr->GetBufferedRegion().GetSize()[1];
  int radius = this->GetRadius();
  std::cout << "Radius: " << radius << std::endl;
  //    TransformIndexToPhysicalPoint
  int numPoint = this->GetInput()->GetNumberOfPoints();//FIXME check int vs long
  if (numPoint < 0)
  {
    // Guard against a wrapped-around count (see FIXME above)
    numPoint = 0;
  }

  // Interleaved x/y coordinates; the vector frees itself on every exit path
  std::vector<float> pt(2 * static_cast<size_t>(numPoint));

  typedef typename PointSetType::PointsContainer::ConstIterator iteratorType;
  iteratorType it = this->GetInput()->GetPoints()->Begin();
  int i = 0;
  // Bounded by numPoint as well so the copy can never run past the array
  while (i < numPoint && it != this->GetInput()->GetPoints()->End())
  {
    pt[2 * i] = it.Value()[0];
    pt[2 * i + 1] = it.Value()[1];
    ++it;
    ++i;
  }

  // Debug trace of the first points; only print the ones that exist
  std::cout << "Few point: ";
  if (numPoint >= 1)
  {
    std::cout << pt[0] << ", " << pt[1];
  }
  if (numPoint >= 2)
  {
    std::cout << " " << pt[2] << ", " << pt[3];
  }
  std::cout << std::endl;

  //Call OCL translation method
  std::cout << "Calling OCL translation method" << std::endl;
  float * ptData = pt.empty() ? NULL : &pt[0];
  this->LaunchKernel(pix, ptData, numPoint, originX, originY, spacingX, spacingY, imageWidth, imageHeight, radius);
}

/**
 * Run the density computation on the OpenCL device.
 *
 * \param pix         host buffer of imageWidth*imageHeight floats receiving the result
 * \param pt          host array of interleaved point coordinates (x0, y0, x1, y1, ...)
 * \param numPoint    number of points stored in \a pt
 * \param originX     forwarded unchanged to pointDensityKernel (argument 3)
 * \param originY     forwarded unchanged to pointDensityKernel (argument 4)
 * \param spacingX    forwarded unchanged to pointDensityKernel (argument 5)
 * \param spacingY    forwarded unchanged to pointDensityKernel (argument 6)
 * \param imageWidth  image width in pixels
 * \param imageHeight image height in pixels
 * \param radius      radius; its square is passed to pointDensityKernel (argument 8)
 *
 * Steps: allocate a device image, clear it with zeroImageKernel, upload the
 * points in chunks of at most CHUNK_SIZE and launch pointDensityKernel once
 * per chunk, then read the device image back into \a pix.
 *
 * The whole sequence runs while m_Mutex is held. Every OpenCL status is
 * passed to oclCheckError.
 *
 * NOTE(review): the global work size is (imageWidth / PIX_PER_THREAD,
 * imageHeight) with a local size of (TILE_SIZE_X, TILE_SIZE_Y); this presumably
 * relies on the caller padding the image accordingly -- confirm against
 * m_BlkSize.
 */
template<class TInputPointSet, class TOutputImage>
void OCLPointSetToDensityImageFilter<TInputPointSet, TOutputImage>
::LaunchKernel(float* pix, float* pt, int numPoint, int originX, int originY, float spacingX, float spacingY, int imageWidth, int imageHeight, int radius)
{
  // Nothing to compute for an empty image (a zero-byte buffer cannot be created)
  if (imageWidth <= 0 || imageHeight <= 0)
  {
    return;
  }

  //the GPU access should be in memory lock (we don't want several CPU threads to acces the GPU)
  m_Mutex.Lock();
  cl_int ciErrNum;

  //Transfer data to GPU memory
  // Size computed in size_t so that large images do not overflow int arithmetic
  const size_t imageBytes = static_cast<size_t>(imageWidth) * static_cast<size_t>(imageHeight) * sizeof(cl_float);
  printf("Allocating image on GPU: %lu\n", static_cast<unsigned long>(imageBytes));
  cl_mem pixD = clCreateBuffer(GPUContext, CL_MEM_READ_WRITE, imageBytes, NULL, &ciErrNum);
  oclCheckError(ciErrNum, CL_SUCCESS);

  size_t localWorkSize[2], globalWorkSize[2];
  localWorkSize[0] = TILE_SIZE_X;
  localWorkSize[1] = TILE_SIZE_Y;
  globalWorkSize[0] = imageWidth/PIX_PER_THREAD;
  globalWorkSize[1] = imageHeight;
  std::cout << "localWorkSize: " << localWorkSize[0] << ", " << localWorkSize[1] << std::endl;
  std::cout << "globalWorkSize: " << globalWorkSize[0] << ", " <<  globalWorkSize[1] << std::endl;

  ciErrNum = clFinish(GPUCommandQueue);
  oclCheckError(ciErrNum, CL_SUCCESS);

  //Set image to 0
  ciErrNum  = clSetKernelArg(zeroImageKernel, 0, sizeof(cl_mem),    (void*)&pixD);
  ciErrNum |= clSetKernelArg(zeroImageKernel, 1, sizeof(int),       (void*)&imageWidth);
  oclCheckError(ciErrNum, CL_SUCCESS);
  ciErrNum = clEnqueueNDRangeKernel(GPUCommandQueue, zeroImageKernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
  oclCheckError(ciErrNum, CL_SUCCESS);

  int radiussq = radius*radius;

  // The point set is processed in fullChunks chunks of CHUNK_SIZE points,
  // followed by one shorter chunk when numPoint is not a multiple of CHUNK_SIZE.
  const int fullChunks  = (numPoint > 0) ? numPoint / CHUNK_SIZE : 0;
  const int tailPoints  = (numPoint > 0) ? numPoint % CHUNK_SIZE : 0;
  const int totalChunks = fullChunks + ((tailPoints != 0) ? 1 : 0);

  for (int chunk = 0; chunk < totalChunks; ++chunk)
  {
    const bool isTail = (chunk == fullChunks);
    int numPoints = isTail ? tailPoints : CHUNK_SIZE;
    if (isTail)
    {
      printf("Process last chunk\n");
    }
    else
    {
      printf("Process chunk %i\n", chunk);
    }

    // Offset computed in size_t: two floats per point
    float* chunkStart = pt + static_cast<size_t>(chunk) * CHUNK_SIZE * 2;
    cl_mem pt_c = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, static_cast<size_t>(numPoints) * 2 * sizeof(cl_float), chunkStart, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    //Call GPU kernel
    printf("Launch kernel\n");
    printf("Parameters: %p, %p, %d, %d, %d, %f, %f, %d, %d \n", static_cast<void*>(pixD), static_cast<void*>(pt_c), numPoints, originX, originY, spacingX, spacingY, imageWidth, radiussq);

    ciErrNum  = clSetKernelArg(pointDensityKernel, 0, sizeof(cl_mem),    (void*)&pixD);
    ciErrNum |= clSetKernelArg(pointDensityKernel, 1, sizeof(cl_mem),    (void*)&pt_c);
    ciErrNum |= clSetKernelArg(pointDensityKernel, 2, sizeof(int),       (void*)&numPoints);
    ciErrNum |= clSetKernelArg(pointDensityKernel, 3, sizeof(int),       (void*)&originX);
    ciErrNum |= clSetKernelArg(pointDensityKernel, 4, sizeof(int),       (void*)&originY);
    ciErrNum |= clSetKernelArg(pointDensityKernel, 5, sizeof(float),     (void*)&spacingX);
    ciErrNum |= clSetKernelArg(pointDensityKernel, 6, sizeof(float),     (void*)&spacingY);
    ciErrNum |= clSetKernelArg(pointDensityKernel, 7, sizeof(int),       (void*)&imageWidth);
    ciErrNum |= clSetKernelArg(pointDensityKernel, 8, sizeof(int),       (void*)&radiussq);
    oclCheckError(ciErrNum, CL_SUCCESS);

    ciErrNum = clEnqueueNDRangeKernel(GPUCommandQueue, pointDensityKernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Drop our reference to the chunk buffer now that the launch is enqueued
    ciErrNum = clReleaseMemObject(pt_c);
    oclCheckError(ciErrNum, CL_SUCCESS);
  }

  // Wait for all enqueued kernels before reading the image back
  ciErrNum = clFinish(GPUCommandQueue);
  oclCheckError(ciErrNum, CL_SUCCESS);

  //Transfer result to CPU memory
  printf("Get memory back\n");
  ciErrNum = clEnqueueReadBuffer(GPUCommandQueue, pixD, CL_TRUE, 0, imageBytes, pix, 0, NULL, NULL);
  oclCheckError(ciErrNum, CL_SUCCESS);

  ciErrNum = clReleaseMemObject(pixD);
  oclCheckError(ciErrNum, CL_SUCCESS);

  m_Mutex.Unlock();

}

}

#endif
