/**
 *   ___ _   _ ___   _     _       ___ ___ ___ ___
 *  / __| | | |   \ /_\   | |  ___| _ ) __/ __/ __|
 * | (__| |_| | |) / _ \  | |_|___| _ \ _| (_ \__ \
 *  \___|\___/|___/_/ \_\ |____|  |___/_| \___|___/
 *
 * File lj_cuda.cu: Implementation of class gpu_lj
 *
 **/

#include "CudaLBFGS/lbfgs.h"
#include "CudaLBFGS/potential.h"

namespace gpu_lj_d 	// All device functions for gpu_lj in here
{
	// Number of coordinates (3*natoms); set from the host with cudaMemcpyToSymbol
	__constant__ size_t m_ndim;

	// Inner core kernel for LJ.
	//
	// Adds the contribution of the atom at r2 to the accumulator g belonging
	// to the atom at r1.
	//   r1, r2 : positions in x, y, z (w is not used)
	//   g      : running accumulator, gradient in x, y, z and energy in w
	// Returns the updated accumulator; g is returned unchanged when the
	// squared distance is below 1.0e-6.
	__device__ double4 lj_pair(double4 r1, double4 r2, double4 g)
	{
		// Distance vector pointing from r1 to r2
		double3 r;
		r.x = r2.x - r1.x;
		r.y = r2.y - r1.y;
		r.z = r2.z - r1.z;

		// Distance squared
		const double rsq = r.x * r.x + r.y * r.y + r.z * r.z;

		// Ignore pair if distance is too small, this is usually the self-interaction of the atom
		if (rsq < 1.0e-6)
			return g;

		// Calculate 1/r**2, 1/r**6, 1/r**12
		const double ir2 = 1.0 / rsq;
		const double ir6 = ir2*ir2*ir2;
		const double ir12 = ir6*ir6;

		// Energy: every pair is visited once from each of its two atoms, so
		// each visit adds half of 4*(1/r**12 - 1/r**6)
		g.w += 2.0 * (ir12 - ir6);

		// Gradient with respect to the coordinates of r1
		const double tmp = 4.0 * (12.0 * ir12 - 6.0 * ir6) * ir2;
		g.x += tmp*r.x;
		g.y += tmp*r.y;
		g.z += tmp*r.z;

		return g;
	}

	// Perform the calculation for one block.
	//
	// Accumulates into g the interaction of the atom at r with the first
	// m_ndim/3 positions stored in dynamic shared memory, and returns the
	// result. The caller must have filled those entries and synchronised
	// the block beforehand.
	__device__ double4 tile_calculation(double4 r, double4 g)
	{
		extern __shared__ double4 shared_r[];
		const size_t natoms = m_ndim/3;
		for (size_t i = 0; i < natoms; i++) {
			g = gpu_lj_d::lj_pair(r, shared_r[i], g);
		}
		return g;
	}

	// Combined energy and gradient kernel.
	//
	//   d_x    : input coordinates, m_ndim doubles laid out as x0,y0,z0,x1,...
	//   d_y    : output, total energy (a single double)
	//   d_grad : output gradient, m_ndim doubles in the same layout as d_x
	//
	// Launch requirements:
	//   - a single block, because atoms are indexed by threadIdx.x only
	//   - blockDim.x >= m_ndim/3 (one thread per atom)
	//   - blockDim.x a power of two, otherwise the halving reduction below
	//     skips some partial sums
	//   - dynamic shared memory of at least blockDim.x * sizeof(double4)
	__global__ void kernel_combined_f_gradf(const double *d_x, double *d_y, double *d_grad)
	{
		extern __shared__ double4 shared_r[];
		double4 myPosition = {0.0, 0.0, 0.0, 0.0};	// Each thread deals with one atom
		double4 g = {0.0, 0.0, 0.0, 0.0};

		// Current test version of LJ uses only one block for efficient shared memory use
		const unsigned int tid = threadIdx.x;

		// Thread numbers greater than the number of atoms do no pair work,
		// but they still have to take part in every barrier and in the reduction
		const bool ownsAtom = tid < (m_ndim/3);

		// Make sure w is initialised to 0 on ALL shared memory so reduction is safe
		shared_r[tid].w = 0.0;

		if (ownsAtom) {
			// Read the coordinates from global memory into memory local to each thread
			myPosition.x = d_x[3*tid];
			myPosition.y = d_x[3*tid+1];
			myPosition.z = d_x[3*tid+2];

			// Also copy coordinates from global memory into shared memory
			shared_r[tid].x = myPosition.x;
			shared_r[tid].y = myPosition.y;
			shared_r[tid].z = myPosition.z;
		}

		// Wait until copy to shared memory completed. All barriers are kept
		// outside the conditionals so that every thread of the block reaches them.
		__syncthreads();

		// Carry out energy and gradient calculations
		if (ownsAtom) {
			g = gpu_lj_d::tile_calculation(myPosition, g);
		}

		// Wait until the calculations are done before the w slots are overwritten
		__syncthreads();

		if (ownsAtom) {
			// Copy energy in w to shared memory for reduction and copy gradient to global memory
			shared_r[tid].w = g.w;
			d_grad[3*tid] = g.x;
			d_grad[3*tid+1] = g.y;
			d_grad[3*tid+2] = g.z;
		}

		__syncthreads();

		// Shared memory reduction to add up all energy contributions to get a total energy
		for (unsigned int stride = blockDim.x/2; stride != 0; stride /= 2) {
			if (tid < stride) {
				shared_r[tid].w += shared_r[tid + stride].w;
			}
			__syncthreads();
		}

		if (tid == 0) {
			*d_y = shared_r[0].w;	// Write total energy back to global memory
		}
	}
}

	gpu_lj::gpu_lj(size_t ndim)
: cost_function(ndim)
	// gpu_lj constructor: uploads the problem size to the device and sets up
	// the launch configuration (blockDim, gridDim, shared_size) used by f_gradf.
	// ndim is forwarded to cost_function; the code below reads it back as
	// m_numDimensions and treats it as 3*natoms.
	// Prints a message and exits if the system cannot be handled by a single
	// thread block or if the required shared memory is not available.
{
	// Copy 3*natoms to GPU
	CudaSafeCall( cudaMemcpyToSymbol(gpu_lj_d::m_ndim, &m_numDimensions, sizeof(size_t)) );

	// Set launch parameters for main potential kernel
	blockDim.x = 1024;	// No. of threads (1024 is max. per block)

	// The kernel indexes atoms by threadIdx.x only and keeps all positions in
	// one shared memory buffer of blockDim.x entries, so the whole system has
	// to fit into a single block. Larger systems would read past the end of
	// that buffer, so refuse them here.
	const size_t natoms = m_numDimensions / 3;
	if (natoms > blockDim.x){
		std::cerr << "The number of atoms exceeds the number of threads in a single block. " << std::endl;
		exit(EXIT_FAILURE);
	}
	gridDim.x = 1;	// Number of blocks (one)

	shared_size = sizeof(double4) * (blockDim.x);	// Size of shared memory

	//If shared size bigger than device shared memory, warn and exit
	int device;
	CudaSafeCall( cudaGetDevice(&device) );

	cudaDeviceProp props;
	CudaSafeCall( cudaGetDeviceProperties(&props, device) );

	if (shared_size > props.sharedMemPerBlock){
		std::cerr << "The amount of shared memory requested exceeds availability. " << std::endl;
		exit(EXIT_FAILURE);
	}
}

// Evaluates the Lennard-Jones energy and gradient on the device.
//   d_x    : device pointer to the input coordinates
//   d_f    : device pointer receiving the total energy
//   d_grad : device pointer receiving the gradient
//   coldfusiont, coldfusionlim : not used by this implementation
// Blocks the host until the kernel has finished.
void gpu_lj::f_gradf(const double *d_x, double *d_f, double *d_grad, _Bool *coldfusiont, double *coldfusionlim)
{
	gpu_lj_d::kernel_combined_f_gradf<<<gridDim, blockDim, shared_size>>>(d_x, d_f, d_grad);
	CudaCheckError();	// Report launch errors

	// Faults raised while the kernel runs are only returned by the next
	// synchronising call, so its status has to be checked as well
	CudaSafeCall( cudaDeviceSynchronize() );
}
