/**
 *   ___ _   _ ___   _     _       ___ ___ ___ ___
 *  / __| | | |   \ /_\   | |  ___| _ ) __/ __/ __|
 * | (__| |_| | |) / _ \  | |_|___| _ \ _| (_ \__ \
 *  \___|\___/|___/_/ \_\ |____|  |___/_| \___|___/
 *
 * File rigidbodies_cuda.cu: Rigid bodies coordinate and gradient transformations for GPU implementation.
 **/

#include "lbfgs.h"

namespace gpu_lbfgs
{
	// Launch parameters kept in device memory. The host wrappers in this file set
	// them with cudaMemcpyToSymbol before starting the kernels that read them.
	__device__ int singlesthreads;	// loop bound used by singleatoms and gradsingleatoms
	__device__ int thisbody;	// index of the rigid body handled by the per-body reduction kernels
	__device__ int numaddblocks;	// number of per-block partial sums read by addblockresult and addblockresult2

	// Kernel prototypes (definitions are in the second gpu_lbfgs namespace block of this file).

	// Coordinate transformation: rigid-body coordinates -> atomistic coordinates.
	__global__ void transform(
		int *d_cmax,
		double *d_xrigid,
		double *d_grmi);
	__global__ void transform2(
		double *d_x,
		int *d_cmax,
		double *d_xrigid,
		int *m_d_nRigidSites,
		int *m_d_RigidGroups,
		double *m_d_SitesRigid,
		int *d_maxsite,
		double *d_grmi);
	__global__ void singleatoms(
		double *d_x,
		double *d_xrigid,
		int *d_cmax,
		int *m_d_RigidSingles);

	// Gradient transformation: atomistic gradient -> rigid-body gradient.
	__global__ void gradtransform(
		int *d_cmax,
		double *d_xrigid,
		double *d_grmi1,
		double *d_grmi2,
		double *d_grmi3);
	__global__ void gradtransform2(
		double *d_gk,
		int *d_cmax,
		double *d_xrigid,
		int *m_d_nRigidSites,
		int *m_d_RigidGroups,
		double *m_d_SitesRigid,
		int *d_maxsite,
		double *d_gkrigid,
		double *d_grmi1,
		double *d_grmi2,
		double *d_grmi3,
		double *d_blockresult);
	__global__ void addblockresult(
		double *d_blockresult,
		double *d_gkrigid);
	__global__ void intermediate(
		double *d_gk,
		int *d_maxsite,
		int *d_cmax,
		int *m_d_nRigidSites,
		double *d_grmi1,
		double *d_grmi2,
		double *d_grmi3,
		double *m_d_SitesRigid,
		int *m_d_RigidGroups,
		double *d_temparray);
	__global__ void finalreduction(
		double *d_gk,
		int *d_cmax,
		double *d_xrigid,
		int *m_d_nRigidSites,
		int *m_d_RigidGroups,
		double *m_d_SitesRigid,
		int *d_maxsite,
		double *d_gkrigid,
		double *d_grmi1,
		double *d_grmi2,
		double *d_grmi3,
		double *d_blockresult,
		double *d_temparray);
	__global__ void addblockresult2(
		double *d_blockresult,
		double *d_gkrigid,
		int *d_cmax);
	__global__ void gradsingleatoms(
		double *d_gk,
		double *d_gkrigid,
		int *d_cmax,
		int *m_d_RigidSingles);
}

/**
 * Converts rigid-body coordinates (d_xrigid) into atomistic coordinates (d_x) on the GPU.
 *
 * Up to three kernels are run, each followed by a checked device synchronisation:
 *   1. transform   - one 3x3 rotation matrix per rigid body, written to a temporary buffer;
 *   2. transform2  - coordinates of every rigid-body site, written to d_x;
 *   3. singleatoms - coordinates of the atoms listed in m_d_RigidSingles, copied from
 *                    d_xrigid (offset 6*(*d_cmax)) to d_x.
 *
 * hostcmax     Number of rigid bodies; used to size the launches (expected to equal *d_cmax).
 * d_x          Device array receiving the atomistic coordinates.
 * d_xrigid     Device array holding the rigid-body coordinates.
 * degFreedoms  Length of the rigid-body coordinate vector; entries beyond 6*hostcmax are
 *              treated as triples belonging to single atoms.
 * hostmaxsite  Row length of the per-body site tables (expected to equal *d_maxsite).
 * d_cmax       Device pointer to the rigid-body count read by the kernels.
 * d_maxsite    Device pointer to the site-table row length read by the kernels.
 *
 * Stages that would have no work are skipped, because a launch with a zero-block grid
 * is an invalid configuration.
 */
void lbfgs::transform_rigidtoc(int hostcmax, double *d_x, double *d_xrigid, int degFreedoms, int hostmaxsite, int *d_cmax, int *d_maxsite)
{
	using namespace gpu_lbfgs;

	dim3 block;
	block.x = 1024;
	dim3 grid;

	if (hostcmax > 0 && hostmaxsite > 0){
		// Scratch buffer for the rotation matrices, nine doubles per body.
		// NOTE(review): sized with m_nRigidBody while the kernels index it for *d_cmax bodies - confirm the two agree.
		double *d_grmi;
		CudaSafeCall( cudaMalloc(&d_grmi, 9 * m_nRigidBody * sizeof(double)) );

		// One thread per rigid body.
		grid.x = (hostcmax + block.x - 1)/block.x;

		gpu_lbfgs::transform<<<grid, block>>>(d_cmax, d_xrigid, d_grmi);
		CudaCheckError();
		// The return value is checked so that a fault inside the kernel is reported here
		// and not at some later, unrelated runtime call.
		CudaSafeCall( cudaDeviceSynchronize() );

		// One thread per (body, site slot) pair.
		const int nothreads = hostmaxsite*hostcmax;
		grid.x = (nothreads + block.x - 1)/block.x;

		gpu_lbfgs::transform2<<<grid, block>>>(d_x, d_cmax, d_xrigid, m_d_nRigidSites, m_d_RigidGroups, m_d_SitesRigid, d_maxsite, d_grmi);
		CudaCheckError();
		CudaSafeCall( cudaDeviceSynchronize() );

		CudaSafeCall( cudaFree(d_grmi) );
	}

	if (degFreedoms > 6*hostcmax){
		int hostsinglesthreads = (degFreedoms - 6*hostcmax)/3;

		// Fewer than three leftover coordinates give zero single atoms, hence nothing to launch.
		if (hostsinglesthreads > 0){
			CudaSafeCall( cudaMemcpyToSymbol(gpu_lbfgs::singlesthreads, &hostsinglesthreads, sizeof(int)) );

			// One thread per single atom.
			grid.x = (hostsinglesthreads + block.x - 1)/block.x;

			gpu_lbfgs::singleatoms<<<grid, block>>>(d_x, d_xrigid, d_cmax, m_d_RigidSingles);
			CudaCheckError();
			CudaSafeCall( cudaDeviceSynchronize() );
		}
	}
}

/**
 * Converts the atomistic gradient (d_gk) into the rigid-body gradient (d_gkrigid) on the GPU.
 *
 * Stages, each followed by a checked device synchronisation:
 *   1. gradtransform   - three 3x3 rotation-matrix derivative matrices per rigid body;
 *   2. per body: gradtransform2 + addblockresult - sum of the site gradients, stored at
 *      d_gkrigid[3*body .. 3*body + 2];
 *   3. intermediate    - per-site products of the gradient with the rotated site vectors,
 *      stored in a temporary buffer;
 *   4. per body: finalreduction + addblockresult2 - sum of those products, stored at
 *      d_gkrigid[3*(*d_cmax) + 3*body .. + 2];
 *   5. gradsingleatoms - gradient of the atoms listed in m_d_RigidSingles, copied to
 *      d_gkrigid at offset 6*(*d_cmax).
 *
 * d_gk         Device array holding the atomistic gradient.
 * d_xrigid     Device array holding the rigid-body coordinates.
 * d_gkrigid    Device array receiving the rigid-body gradient.
 * hostcmax     Number of rigid bodies; used to size the launches (expected to equal *d_cmax).
 * d_cmax       Device pointer to the rigid-body count read by the kernels.
 * hostmaxsite  Row length of the per-body site tables (expected to equal *d_maxsite).
 * d_maxsite    Device pointer to the site-table row length read by the kernels.
 * degFreedoms  Length of the rigid-body vector; entries beyond 6*hostcmax are treated as
 *              triples belonging to single atoms.
 *
 * Stages that would have no work are skipped, because a launch with a zero-block grid
 * is an invalid configuration.
 */
void lbfgs::transform_grad(double *d_gk, double *d_xrigid, double *d_gkrigid, int hostcmax, int *d_cmax, int hostmaxsite, int *d_maxsite, int degFreedoms)
{
	using namespace gpu_lbfgs;

	dim3 block;
	dim3 grid;

	if (hostcmax > 0 && hostmaxsite > 0){
		// Rotation-matrix derivatives, nine doubles per body and per rotation-vector component.
		// NOTE(review): sized with m_nRigidBody while the kernels index them for *d_cmax bodies - confirm the two agree.
		double *d_grmi1;
		double *d_grmi2;
		double *d_grmi3;
		CudaSafeCall( cudaMalloc(&d_grmi1, 9 * m_nRigidBody * sizeof(double)) );
		CudaSafeCall( cudaMalloc(&d_grmi2, 9 * m_nRigidBody * sizeof(double)) );
		CudaSafeCall( cudaMalloc(&d_grmi3, 9 * m_nRigidBody * sizeof(double)) );

		// One thread per rigid body.
		block.x = 512;
		grid.x = (hostcmax + block.x - 1)/block.x;

		gpu_lbfgs::gradtransform<<<grid, block>>>(d_cmax, d_xrigid, d_grmi1, d_grmi2, d_grmi3);
		CudaCheckError();
		// The return value is checked so that a fault inside the kernel is reported here
		// and not at some later, unrelated runtime call.
		CudaSafeCall( cudaDeviceSynchronize() );

		// Configuration shared by both two-step reductions. The second step uses one thread
		// per block of the first step and a halving loop, so reductionBlocks has to be a
		// power of two no larger than the maximum block size.
		const int reductionBlocks = 1024;
		block.x = 1024;
		grid.x = reductionBlocks;
		dim3 addBlock;
		addBlock.x = reductionBlocks;
		const size_t partialSmem = sizeof(double3) * block.x;
		const size_t addSmem = sizeof(double3) * addBlock.x;

		// The buffer of per-block partial sums is allocated once and reused by both passes.
		double *d_blockresult;
		CudaSafeCall( cudaMalloc(&d_blockresult, 3 * reductionBlocks * sizeof(double)) );
		CudaSafeCall( cudaMemcpyToSymbol(gpu_lbfgs::numaddblocks, &reductionBlocks, sizeof(int)) );

		// Pass 1: sum of the site gradients of each body.
		for (int hostthisbody = 0; hostthisbody < hostcmax; ++hostthisbody){
			CudaSafeCall( cudaMemcpyToSymbol(gpu_lbfgs::thisbody, &hostthisbody, sizeof(int)) );

			gpu_lbfgs::gradtransform2<<<grid, block, partialSmem>>>(d_gk, d_cmax, d_xrigid, m_d_nRigidSites, m_d_RigidGroups, m_d_SitesRigid, d_maxsite, d_gkrigid, d_grmi1, d_grmi2, d_grmi3, d_blockresult);
			CudaCheckError();
			CudaSafeCall( cudaDeviceSynchronize() );

			gpu_lbfgs::addblockresult<<<1, addBlock, addSmem>>>(d_blockresult, d_gkrigid);
			CudaCheckError();
			CudaSafeCall( cudaDeviceSynchronize() );
		}

		// Per-site products consumed by pass 2, three doubles per (body, site slot) pair.
		const int nothreads = hostmaxsite * hostcmax;
		double *d_temparray;
		CudaSafeCall( cudaMalloc(&d_temparray, 3 * nothreads * sizeof(double)) );

		dim3 mapGrid;
		mapGrid.x = (nothreads + block.x - 1)/block.x;

		gpu_lbfgs::intermediate<<<mapGrid, block>>>(d_gk, d_maxsite, d_cmax, m_d_nRigidSites, d_grmi1, d_grmi2, d_grmi3, m_d_SitesRigid, m_d_RigidGroups, d_temparray);
		CudaCheckError();
		CudaSafeCall( cudaDeviceSynchronize() );

		// Pass 2: sum of the per-site products of each body.
		for (int hostthisbody = 0; hostthisbody < hostcmax; ++hostthisbody){
			CudaSafeCall( cudaMemcpyToSymbol(gpu_lbfgs::thisbody, &hostthisbody, sizeof(int)) );

			gpu_lbfgs::finalreduction<<<grid, block, partialSmem>>>(d_gk, d_cmax, d_xrigid, m_d_nRigidSites, m_d_RigidGroups, m_d_SitesRigid, d_maxsite, d_gkrigid, d_grmi1, d_grmi2, d_grmi3, d_blockresult, d_temparray);
			CudaCheckError();
			CudaSafeCall( cudaDeviceSynchronize() );

			gpu_lbfgs::addblockresult2<<<1, addBlock, addSmem>>>(d_blockresult, d_gkrigid, d_cmax);
			CudaCheckError();
			CudaSafeCall( cudaDeviceSynchronize() );
		}

		CudaSafeCall( cudaFree(d_grmi1) );
		CudaSafeCall( cudaFree(d_grmi2) );
		CudaSafeCall( cudaFree(d_grmi3) );
		CudaSafeCall( cudaFree(d_temparray) );
		CudaSafeCall( cudaFree(d_blockresult) );
	}

	if (degFreedoms > 6*hostcmax){
		int hostsinglesthreads = (degFreedoms - 6*hostcmax)/3;

		// Fewer than three leftover entries give zero single atoms, hence nothing to launch.
		if (hostsinglesthreads > 0){
			CudaSafeCall( cudaMemcpyToSymbol(gpu_lbfgs::singlesthreads, &hostsinglesthreads, sizeof(int)) );

			// One thread per single atom.
			block.x = 1024;
			grid.x = (hostsinglesthreads + block.x - 1)/block.x;

			gpu_lbfgs::gradsingleatoms<<<grid, block>>>(d_gk, d_gkrigid, d_cmax, m_d_RigidSingles);
			CudaCheckError();
			CudaSafeCall( cudaDeviceSynchronize() );
		}
	}
}

namespace gpu_lbfgs
{
	/**
	 * Builds the 3x3 rotation matrix belonging to the rotation vector p and, when gtest is
	 * true, the derivatives of that matrix with respect to p.x, p.y and p.z.
	 *
	 * Every matrix is a 9-element array in which element [r + 3*c] holds row r+1, column c+1
	 * (see the RM(i,j) / E(i,j) comments below).
	 *
	 * p      Rotation vector; its length is the rotation angle theta.
	 * rmi    Output: rotation matrix (always written).
	 * drmi1  Output: derivative of the rotation matrix with respect to p.x.
	 * drmi2  Output: derivative of the rotation matrix with respect to p.y.
	 * drmi3  Output: derivative of the rotation matrix with respect to p.z.
	 *        The three derivative arrays are written only when gtest is true and are left
	 *        untouched otherwise.
	 * gtest  Selects whether the derivatives are computed.
	 */
	__device__ void rmdrvt(double3 p, double rmi[9], double drmi1[9], double drmi2[9], double drmi3[9], bool gtest)
	{
		double theta2 = p.x*p.x + p.y*p.y + p.z*p.z;
		// Small-angle branch: expressions that are polynomial in p are used, so nothing is divided by theta.
		if (theta2 < 1.0e-12){
			rmi[0] = 1.0; // RM(1,1)
			rmi[1] = p.z; // RM(2,1)
			rmi[2] = -p.y; // RM(3,1)
			rmi[3] = -p.z; // RM(1,2)
			rmi[4] = 1.0; // RM(2,2)
			rmi[5] = p.x; // RM(3,2)
			rmi[6] = p.y; // RM(1,3)
			rmi[7] = -p.x; // RM(2,3)
			rmi[8] = 1.0; // RM(3,3)

			if (gtest){
				// Derivative with respect to p.x
				drmi1[0] = 0.0;
				drmi1[1] = 0.5*p.y;
				drmi1[2] = 0.5*p.z;
				drmi1[3] = 0.5*p.y;
				drmi1[4] = -p.x;
				drmi1[5] = 1.0;
				drmi1[6] = 0.5*p.z;
				drmi1[7] = -1.0;
				drmi1[8] = -p.x;

				// Derivative with respect to p.y
				drmi2[0] = -1.0*p.y;
				drmi2[1] = 0.5*p.x;
				drmi2[2] = -1.0;
				drmi2[3] = 0.5*p.x;
				drmi2[4] = 0.0;
				drmi2[5] = 0.5*p.z;
				drmi2[6] = 1.0;
				drmi2[7] = 0.5*p.z;
				drmi2[8] = -1.0*p.y;

				// Derivative with respect to p.z
				drmi3[0] = -p.z;
				drmi3[1] = 1.0;
				drmi3[2] = 0.5*p.x;
				drmi3[3] = -1.0;
				drmi3[4] = -p.z;
				drmi3[5] = 0.5*p.y;
				drmi3[6] = 0.5*p.x;
				drmi3[7] = 0.5*p.y;
				drmi3[8] = 0.0;
			}
		}

		else{
			double theta = sqrt(theta2);
			double ct = cos(theta);
			double st = sin(theta);
			double theta3 = 1.0/(theta2*theta);

			// From here on theta holds the reciprocal of the rotation angle
			theta = 1.0 / theta;

			// Unit vector along the rotation axis
			double3 pn;
			pn.x = theta*p.x;
			pn.y = theta*p.y;
			pn.z = theta*p.z; 

			// Skew-symmetric matrix built from the unit axis
			double e[9];
			e[0] = 0.0; // E(1,1)
			e[1] = pn.z; // E(2,1)
			e[2] = -pn.y; // E(3,1)
			e[3] = -pn.z; // E(1,2)
			e[4] = 0.0; // E(2,2)
			e[5] = pn.x; // E(3,2)
			e[6] = pn.y; // E(1,3)
			e[7] = -pn.x; // E(2,3)
			e[8] = 0.0; // E(3,3)

			// esq = E*E
			double esq[9];
			esq[0] = e[0]*e[0] + e[3]*e[1] + e[6]*e[2];
			esq[1] = e[1]*e[0] + e[4]*e[1] + e[7]*e[2];
			esq[2] = e[2]*e[0] + e[5]*e[1] + e[8]*e[2];
			esq[3] = e[0]*e[3] + e[3]*e[4] + e[6]*e[5];
			esq[4] = e[1]*e[3] + e[4]*e[4] + e[7]*e[5];
			esq[5] = e[2]*e[3] + e[5]*e[4] + e[8]*e[5];
			esq[6] = e[0]*e[6] + e[3]*e[7] + e[6]*e[8];
			esq[7] = e[1]*e[6] + e[4]*e[7] + e[7]*e[8];
			esq[8] = e[2]*e[6] + e[5]*e[7] + e[8]*e[8];

			// Rotation matrix: identity + (1 - cos(theta))*E*E + sin(theta)*E
			rmi[0] = 1.0 + (1.0 - ct)*esq[0] + st*e[0];
			rmi[1] = 0.0 + (1.0 - ct)*esq[1] + st*e[1];
			rmi[2] = 0.0 + (1.0 - ct)*esq[2] + st*e[2];
			rmi[3] = 0.0 + (1.0 - ct)*esq[3] + st*e[3];
			rmi[4] = 1.0 + (1.0 - ct)*esq[4] + st*e[4];
			rmi[5] = 0.0 + (1.0 - ct)*esq[5] + st*e[5];
			rmi[6] = 0.0 + (1.0 - ct)*esq[6] + st*e[6];
			rmi[7] = 0.0 + (1.0 - ct)*esq[7] + st*e[7];
			rmi[8] = 1.0 + (1.0 - ct)*esq[8] + st*e[8];

			if (gtest){
				// de1 = derivative of E with respect to p.x
				double de1[9];
				de1[0] = 0.0;
				de1[1] = -p.z*p.x*theta3;
				de1[2] = p.y*p.x*theta3;
				de1[3] = p.z*p.x*theta3;
				de1[4] = 0.0;
				de1[5] = theta - p.x*p.x*theta3;
				de1[6] = -p.y*p.x*theta3;
				de1[7] = -(theta - p.x*p.x*theta3);
				de1[8] = 0.0;

				// de2 = derivative of E with respect to p.y
				double de2[9];
				de2[0] = 0.0;
				de2[1] = -p.z*p.y*theta3;
				de2[2] = -(theta - p.y*p.y*theta3);
				de2[3] = p.z*p.y*theta3;
				de2[4] = 0.0;
				de2[5] = -p.x*p.y*theta3;
				de2[6] = theta - p.y*p.y*theta3;
				de2[7] = p.x*p.y*theta3;
				de2[8] = 0.0;

				// de3 = derivative of E with respect to p.z
				double de3[9];
				de3[0] = 0.0;
				de3[1] = theta - p.z*p.z*theta3;
				de3[2] = p.y*p.z*theta3;
				de3[3] = -(theta - p.z*p.z*theta3);
				de3[4] = 0.0;
				de3[5] = -p.x*p.z*theta3;
				de3[6] = -p.y*p.z*theta3;
				de3[7] = p.x*p.z*theta3;
				de3[8] = 0.0;

				// Each derivative is sin*pn_k*E*E + (1 - cos)*(dE_k*E + E*dE_k) + cos*pn_k*E + sin*dE_k,
				// written out element by element.
				drmi1[0] = st*pn.x*esq[0] + (1.0 - ct)*((de1[0]*e[0]+de1[3]*e[1]+de1[6]*e[2]) 
						+ (e[0]*de1[0]+e[3]*de1[1]+e[6]*de1[2])) + ct*pn.x*e[0] + st*de1[0];
				drmi1[1] = st*pn.x*esq[1] + (1.0 - ct)*((de1[1]*e[0]+de1[4]*e[1]+de1[7]*e[2])
						+ (e[1]*de1[0]+e[4]*de1[1]+e[7]*de1[2])) + ct*pn.x*e[1] + st*de1[1];
				drmi1[2] = st*pn.x*esq[2] + (1.0 - ct)*((de1[2]*e[0]+de1[5]*e[1]+de1[8]*e[2])
						+ (e[2]*de1[0]+e[5]*de1[1]+e[8]*de1[2])) + ct*pn.x*e[2] + st*de1[2];
				drmi1[3] = st*pn.x*esq[3] + (1.0 - ct)*((de1[0]*e[3]+de1[3]*e[4]+de1[6]*e[5])
						+ (e[0]*de1[3]+e[3]*de1[4]+e[6]*de1[5])) + ct*pn.x*e[3] + st*de1[3];
				drmi1[4] = st*pn.x*esq[4] + (1.0 - ct)*((de1[1]*e[3]+de1[4]*e[4]+de1[7]*e[5])
						+ (e[1]*de1[3]+e[4]*de1[4]+e[7]*de1[5])) + ct*pn.x*e[4] + st*de1[4];
				drmi1[5] = st*pn.x*esq[5] + (1.0 - ct)*((de1[2]*e[3]+de1[5]*e[4]+de1[8]*e[5])
						+ (e[2]*de1[3]+e[5]*de1[4]+e[8]*de1[5])) + ct*pn.x*e[5] + st*de1[5];
				drmi1[6] = st*pn.x*esq[6] + (1.0 - ct)*((de1[0]*e[6]+de1[3]*e[7]+de1[6]*e[8])
						+ (e[0]*de1[6]+e[3]*de1[7]+e[6]*de1[8])) + ct*pn.x*e[6] + st*de1[6];
				drmi1[7] = st*pn.x*esq[7] + (1.0 - ct)*((de1[1]*e[6]+de1[4]*e[7]+de1[7]*e[8])
						+ (e[1]*de1[6]+e[4]*de1[7]+e[7]*de1[8])) + ct*pn.x*e[7] + st*de1[7];
				drmi1[8] = st*pn.x*esq[8] + (1.0 - ct)*((de1[2]*e[6]+de1[5]*e[7]+de1[8]*e[8])
						+ (e[2]*de1[6]+e[5]*de1[7]+e[8]*de1[8])) + ct*pn.x*e[8] + st*de1[8];

				drmi2[0] = st*pn.y*esq[0] + (1.0 - ct)*((de2[0]*e[0]+de2[3]*e[1]+de2[6]*e[2])
						+ (e[0]*de2[0]+e[3]*de2[1]+e[6]*de2[2])) + ct*pn.y*e[0] + st*de2[0];
				drmi2[1] = st*pn.y*esq[1] + (1.0 - ct)*((de2[1]*e[0]+de2[4]*e[1]+de2[7]*e[2])
						+ (e[1]*de2[0]+e[4]*de2[1]+e[7]*de2[2])) + ct*pn.y*e[1] + st*de2[1];
				drmi2[2] = st*pn.y*esq[2] + (1.0 - ct)*((de2[2]*e[0]+de2[5]*e[1]+de2[8]*e[2])
						+ (e[2]*de2[0]+e[5]*de2[1]+e[8]*de2[2])) + ct*pn.y*e[2] + st*de2[2];
				drmi2[3] = st*pn.y*esq[3] + (1.0 - ct)*((de2[0]*e[3]+de2[3]*e[4]+de2[6]*e[5])
						+ (e[0]*de2[3]+e[3]*de2[4]+e[6]*de2[5])) + ct*pn.y*e[3] + st*de2[3];
				drmi2[4] = st*pn.y*esq[4] + (1.0 - ct)*((de2[1]*e[3]+de2[4]*e[4]+de2[7]*e[5])
						+ (e[1]*de2[3]+e[4]*de2[4]+e[7]*de2[5])) + ct*pn.y*e[4] + st*de2[4];
				drmi2[5] = st*pn.y*esq[5] + (1.0 - ct)*((de2[2]*e[3]+de2[5]*e[4]+de2[8]*e[5])
						+ (e[2]*de2[3]+e[5]*de2[4]+e[8]*de2[5])) + ct*pn.y*e[5] + st*de2[5];
				drmi2[6] = st*pn.y*esq[6] + (1.0 - ct)*((de2[0]*e[6]+de2[3]*e[7]+de2[6]*e[8])
						+ (e[0]*de2[6]+e[3]*de2[7]+e[6]*de2[8])) + ct*pn.y*e[6] + st*de2[6];
				drmi2[7] = st*pn.y*esq[7] + (1.0 - ct)*((de2[1]*e[6]+de2[4]*e[7]+de2[7]*e[8])
						+ (e[1]*de2[6]+e[4]*de2[7]+e[7]*de2[8])) + ct*pn.y*e[7] + st*de2[7];
				drmi2[8] = st*pn.y*esq[8] + (1.0 - ct)*((de2[2]*e[6]+de2[5]*e[7]+de2[8]*e[8])
						+ (e[2]*de2[6]+e[5]*de2[7]+e[8]*de2[8])) + ct*pn.y*e[8] + st*de2[8];

				drmi3[0] = st*pn.z*esq[0] + (1.0 - ct)*((de3[0]*e[0]+de3[3]*e[1]+de3[6]*e[2])
						+ (e[0]*de3[0]+e[3]*de3[1]+e[6]*de3[2])) + ct*pn.z*e[0] + st*de3[0];
				drmi3[1] = st*pn.z*esq[1] + (1.0 - ct)*((de3[1]*e[0]+de3[4]*e[1]+de3[7]*e[2])
						+ (e[1]*de3[0]+e[4]*de3[1]+e[7]*de3[2])) + ct*pn.z*e[1] + st*de3[1];
				drmi3[2] = st*pn.z*esq[2] + (1.0 - ct)*((de3[2]*e[0]+de3[5]*e[1]+de3[8]*e[2])
						+ (e[2]*de3[0]+e[5]*de3[1]+e[8]*de3[2])) + ct*pn.z*e[2] + st*de3[2];
				drmi3[3] = st*pn.z*esq[3] + (1.0 - ct)*((de3[0]*e[3]+de3[3]*e[4]+de3[6]*e[5])
						+ (e[0]*de3[3]+e[3]*de3[4]+e[6]*de3[5])) + ct*pn.z*e[3] + st*de3[3];
				drmi3[4] = st*pn.z*esq[4] + (1.0 - ct)*((de3[1]*e[3]+de3[4]*e[4]+de3[7]*e[5])
						+ (e[1]*de3[3]+e[4]*de3[4]+e[7]*de3[5])) + ct*pn.z*e[4] + st*de3[4];
				drmi3[5] = st*pn.z*esq[5] + (1.0 - ct)*((de3[2]*e[3]+de3[5]*e[4]+de3[8]*e[5])
						+ (e[2]*de3[3]+e[5]*de3[4]+e[8]*de3[5])) + ct*pn.z*e[5] + st*de3[5];
				drmi3[6] = st*pn.z*esq[6] + (1.0 - ct)*((de3[0]*e[6]+de3[3]*e[7]+de3[6]*e[8])
						+ (e[0]*de3[6]+e[3]*de3[7]+e[6]*de3[8])) + ct*pn.z*e[6] + st*de3[6];
				drmi3[7] = st*pn.z*esq[7] + (1.0 - ct)*((de3[1]*e[6]+de3[4]*e[7]+de3[7]*e[8])
						+ (e[1]*de3[6]+e[4]*de3[7]+e[7]*de3[8])) + ct*pn.z*e[7] + st*de3[7];
				drmi3[8] = st*pn.z*esq[8] + (1.0 - ct)*((de3[2]*e[6]+de3[5]*e[7]+de3[8]*e[8])
						+ (e[2]*de3[6]+e[5]*de3[7]+e[8]*de3[8])) + ct*pn.z*e[8] + st*de3[8];

			}
		}
	}

	// Computes the rotation matrix of every rigid body.
	// Each loop iteration handles one body: its rotation vector is read from d_xrigid at
	// offset 3*(*d_cmax) + 3*body, rmdrvt is called with derivatives switched off, and the
	// nine matrix entries are stored at d_grmi[9*body .. 9*body + 8].
	__global__ void transform(int *d_cmax, double *d_xrigid, double *d_grmi)
	{
		const int stride = blockDim.x * gridDim.x;

		for (int body = blockIdx.x * blockDim.x + threadIdx.x; body < (*d_cmax); body += stride){
			const double *rotvec = d_xrigid + 3*(*d_cmax) + 3*body;

			double3 p;	// rotation vector
			p.x = rotvec[0];
			p.y = rotvec[1];
			p.z = rotvec[2];

			double rmi[9];	// rotation matrix
			// Required by the rmdrvt signature; not filled because gtest is false
			double drmi1[9];
			double drmi2[9];
			double drmi3[9];

			gpu_lbfgs::rmdrvt(p, rmi, drmi1, drmi2, drmi3, false);

			double *out = d_grmi + 9*body;
			for (int k = 0; k < 9; ++k){
				out[k] = rmi[k];
			}
		}
	}

	// Computes the atomistic coordinates of every rigid-body site:
	//     x_atom = (first three d_xrigid entries of the body) + (rotation matrix of the body) * (site vector).
	//
	// Work items are the (*d_cmax)*(*d_maxsite) slots of m_d_RigidGroups; slot s belongs to
	// body s / maxsite and site s % maxsite. Slots whose site index is not below
	// m_d_nRigidSites[body] are skipped.
	//
	// d_x               Output coordinates; atom numbers in m_d_RigidGroups are 1-based
	//                   (atom a occupies d_x[3a-3 .. 3a-1]).
	// d_xrigid          Rigid-body coordinates; entries 3*body .. 3*body+2 are added to the rotated site.
	// m_d_nRigidSites   Number of used sites per body.
	// m_d_RigidGroups   Atom number for every slot.
	// m_d_SitesRigid    Site vectors; component c of site i of body b is at i + c*maxsite + b*3*maxsite.
	// d_grmi            Rotation matrices, nine entries per body, element [r + 3*c] = row r, column c.
	//
	// The body and site index are derived from the slot on every iteration, so the result
	// does not depend on the launch covering all slots with distinct threads.
	__global__ void transform2(double *d_x, int *d_cmax, double *d_xrigid, int *m_d_nRigidSites, int *m_d_RigidGroups, double *m_d_SitesRigid, int *d_maxsite, double *d_grmi)
	{
		const int maxsite = *d_maxsite;
		const int nslots = (*d_cmax)*maxsite;
		const int stride = blockDim.x * gridDim.x;

		for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < nslots; tid += stride){
			const int thisrigidbody = tid / maxsite;
			const int i = tid - thisrigidbody*maxsite;

			if (i < m_d_nRigidSites[thisrigidbody]){
				const int myatom = m_d_RigidGroups[tid];

				const double *rm = d_grmi + 9*thisrigidbody;
				const double *site = m_d_SitesRigid + i + thisrigidbody*3*maxsite;
				const double sx = site[0];
				const double sy = site[maxsite];
				const double sz = site[2*maxsite];

				d_x[3*myatom - 3] = d_xrigid[3*thisrigidbody] + rm[0]*sx + rm[3]*sy + rm[6]*sz;
				d_x[3*myatom - 2] = d_xrigid[3*thisrigidbody + 1] + rm[1]*sx + rm[4]*sy + rm[7]*sz;
				d_x[3*myatom - 1] = d_xrigid[3*thisrigidbody + 2] + rm[2]*sx + rm[5]*sy + rm[8]*sz;
			}
		}
	}

	// Copies the coordinates of the atoms listed in m_d_RigidSingles from the rigid-body
	// vector to the atomistic vector. Entry k of the list is a 1-based atom number; its
	// three coordinates are read from d_xrigid at offset 6*(*d_cmax) + 3*k and written to
	// d_x[3*atom - 3 .. 3*atom - 1]. The number of entries is the device variable singlesthreads.
	__global__ void singleatoms(double *d_x, double *d_xrigid, int *d_cmax, int *m_d_RigidSingles)
	{
		const int stride = blockDim.x * gridDim.x;

		for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < singlesthreads; k += stride){
			const int thisatom = m_d_RigidSingles[k];
			const int src = 6*(*d_cmax) + 3*k;
			const int dst = 3*thisatom - 3;

			d_x[dst] = d_xrigid[src];
			d_x[dst + 1] = d_xrigid[src + 1];
			d_x[dst + 2] = d_xrigid[src + 2];
		}
	}

	// Computes the three rotation-matrix derivative matrices of every rigid body.
	// Each loop iteration handles one body: its rotation vector is read from d_xrigid at
	// offset 3*(*d_cmax) + 3*body, rmdrvt is called with derivatives switched on, and the
	// results are stored at d_grmi1/2/3[9*body .. 9*body + 8]. The rotation matrix itself
	// is computed by rmdrvt but not stored.
	__global__ void gradtransform(int *d_cmax, double *d_xrigid, double *d_grmi1, double *d_grmi2, double *d_grmi3)
	{
		const int stride = blockDim.x * gridDim.x;

		for (int body = blockIdx.x * blockDim.x + threadIdx.x; body < (*d_cmax); body += stride){
			const double *rotvec = d_xrigid + 3*(*d_cmax) + 3*body;

			double3 pi;	// rotation vector
			pi.x = rotvec[0];
			pi.y = rotvec[1];
			pi.z = rotvec[2];

			double rmi[9];
			double drmi1[9];
			double drmi2[9];
			double drmi3[9];

			gpu_lbfgs::rmdrvt(pi, rmi, drmi1, drmi2, drmi3, true);

			const int base = 9*body;

			for (int k = 0; k < 9; ++k){
				d_grmi1[k + base] = drmi1[k];
			}
			for (int k = 0; k < 9; ++k){
				d_grmi2[k + base] = drmi2[k];
			}
			for (int k = 0; k < 9; ++k){
				d_grmi3[k + base] = drmi3[k];
			}
		}
	}

	// First step of the per-body gradient sum. For the body selected by the device variable
	// thisbody, every thread accumulates the gradient (d_gk) of the sites it visits; the
	// block then adds its threads' sums in shared memory and thread 0 writes the block
	// total to d_blockresult[3*blockIdx.x .. 3*blockIdx.x + 2].
	//
	// Needs sizeof(double3)*blockDim.x bytes of dynamic shared memory. The halving loop
	// only covers every element when blockDim.x is a power of two.
	// d_cmax, d_xrigid, m_d_SitesRigid, d_gkrigid and d_grmi1/2/3 are not used here.
	__global__ void gradtransform2(double *d_gk, int *d_cmax, double *d_xrigid, int *m_d_nRigidSites, int *m_d_RigidGroups, double *m_d_SitesRigid, int *d_maxsite, double *d_gkrigid, double *d_grmi1, double *d_grmi2, double *d_grmi3, double *d_blockresult)
	{
		extern __shared__ double3 smem[];

		const int stride = blockDim.x * gridDim.x;

		// Per-thread running sum over the sites this thread visits
		double3 partial;
		partial.x = 0.0;
		partial.y = 0.0;
		partial.z = 0.0;

		for (int site = blockIdx.x * blockDim.x + threadIdx.x; site < m_d_nRigidSites[thisbody]; site += stride){
			// Atom numbers are 1-based: atom a occupies d_gk[3a-3 .. 3a-1]
			const int myatom = m_d_RigidGroups[site+(*d_maxsite)*thisbody];

			partial.x += d_gk[3*myatom - 3];
			partial.y += d_gk[3*myatom - 2];
			partial.z += d_gk[3*myatom - 1];
		}

		smem[threadIdx.x] = partial;

		__syncthreads();

		// Pairwise summation within the block
		for (int half = blockDim.x/2; half != 0; half /= 2){
			if (threadIdx.x < half){
				smem[threadIdx.x].x += smem[threadIdx.x + half].x;
				smem[threadIdx.x].y += smem[threadIdx.x + half].y;
				smem[threadIdx.x].z += smem[threadIdx.x + half].z;
			}
			__syncthreads();
		}

		if (threadIdx.x == 0){
			double *out = d_blockresult + 3*blockIdx.x;
			out[0] = smem[0].x;
			out[1] = smem[0].y;
			out[2] = smem[0].z;
		}
	}

	// Second step of the per-body gradient sum. Thread t loads the partial sum of block t
	// from d_blockresult (threads at or beyond numaddblocks contribute zero), the block adds
	// everything up in shared memory, and thread 0 stores the total at
	// d_gkrigid[3*thisbody .. 3*thisbody + 2].
	//
	// Uses threadIdx.x only, so it is meant to run as a single block. Needs
	// sizeof(double3)*blockDim.x bytes of dynamic shared memory; the halving loop only
	// covers every element when blockDim.x is a power of two.
	__global__ void addblockresult(double *d_blockresult, double *d_gkrigid)
	{
		extern __shared__ double3 smem[];

		double3 mine;
		mine.x = 0.0;
		mine.y = 0.0;
		mine.z = 0.0;

		if (threadIdx.x < numaddblocks){
			const double *in = d_blockresult + 3*threadIdx.x;
			mine.x = in[0];
			mine.y = in[1];
			mine.z = in[2];
		}

		smem[threadIdx.x] = mine;
		__syncthreads();

		// Pairwise summation within the block
		for (int half = blockDim.x/2; half != 0; half /= 2){
			if (threadIdx.x < half){
				smem[threadIdx.x].x += smem[threadIdx.x + half].x;
				smem[threadIdx.x].y += smem[threadIdx.x + half].y;
				smem[threadIdx.x].z += smem[threadIdx.x + half].z;
			}
			__syncthreads();
		}

		if (threadIdx.x == 0){
			double *out = d_gkrigid + 3*thisbody;
			out[0] = smem[0].x;
			out[1] = smem[0].y;
			out[2] = smem[0].z;
		}
	}

	// For every rigid-body site, computes the three scalar products
	//     g_atom . (dRM_k * site),   k = 1, 2, 3
	// where g_atom is the atomistic gradient of the site's atom, dRM_k is the k-th
	// rotation-matrix derivative of the body (d_grmi1/2/3) and site is the site vector.
	// The results are stored at d_temparray[k-1 + 3*site + 3*maxsite*body].
	//
	// Work items are the (*d_cmax)*(*d_maxsite) slots of m_d_RigidGroups; slot s belongs to
	// body s / maxsite and site s % maxsite. Slots whose site index is not below
	// m_d_nRigidSites[body] are skipped and their d_temparray entries are left untouched.
	//
	// d_gk              Atomistic gradient; atom numbers in m_d_RigidGroups are 1-based
	//                   (atom a occupies d_gk[3a-3 .. 3a-1]).
	// m_d_SitesRigid    Site vectors; component c of site i of body b is at i + c*maxsite + b*3*maxsite.
	// d_grmi1/2/3       Nine entries per body, element [r + 3*c] = row r, column c.
	//
	// The body and site index are derived from the slot on every iteration, so the result
	// does not depend on the launch covering all slots with distinct threads.
	__global__ void intermediate(double *d_gk, int *d_maxsite, int *d_cmax, int *m_d_nRigidSites, double *d_grmi1, double *d_grmi2, double *d_grmi3, double *m_d_SitesRigid, int *m_d_RigidGroups, double *d_temparray)
	{
		const int maxsite = *d_maxsite;
		const int nslots = (*d_cmax)*maxsite;
		const int stride = blockDim.x * gridDim.x;

		for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < nslots; tid += stride){
			const int thisrigidbody = tid / maxsite;
			const int i = tid - thisrigidbody*maxsite;

			if (i < m_d_nRigidSites[thisrigidbody]){
				const int myatom = m_d_RigidGroups[tid];

				// Site vector of this slot
				const double *site = m_d_SitesRigid + i + thisrigidbody*3*maxsite;
				const double sx = site[0];
				const double sy = site[maxsite];
				const double sz = site[2*maxsite];

				// Atomistic gradient of the atom sitting on this site
				const double gx = d_gk[3*myatom - 3];
				const double gy = d_gk[3*myatom - 2];
				const double gz = d_gk[3*myatom - 1];

				const double *drm[3] = { d_grmi1 + 9*thisrigidbody, d_grmi2 + 9*thisrigidbody, d_grmi3 + 9*thisrigidbody };
				const int out = 3*i + maxsite*3*thisrigidbody;

				#pragma unroll
				for (int k = 0; k < 3; ++k){
					const double *m = drm[k];

					// Derivative matrix applied to the site vector
					const double drx = m[0]*sx + m[3]*sy + m[6]*sz;
					const double dry = m[1]*sx + m[4]*sy + m[7]*sz;
					const double drz = m[2]*sx + m[5]*sy + m[8]*sz;

					d_temparray[k + out] = gx*drx + gy*dry + gz*drz;
				}
			}
		}
	}

	// First step of the per-body sum over d_temparray. For the body selected by the device
	// variable thisbody, every thread accumulates the three d_temparray entries of the sites
	// it visits; the block then adds its threads' sums in shared memory and thread 0 writes
	// the block total to d_blockresult[3*blockIdx.x .. 3*blockIdx.x + 2].
	//
	// Needs sizeof(double3)*blockDim.x bytes of dynamic shared memory. The halving loop
	// only covers every element when blockDim.x is a power of two.
	// Only m_d_nRigidSites, d_maxsite, d_blockresult and d_temparray are used here.
	__global__ void finalreduction(double *d_gk, int *d_cmax, double *d_xrigid, int *m_d_nRigidSites, int *m_d_RigidGroups, double *m_d_SitesRigid, int *d_maxsite, double *d_gkrigid, double *d_grmi1, double *d_grmi2, double *d_grmi3, double *d_blockresult, double *d_temparray)
	{
		extern __shared__ double3 smem[];

		const int stride = blockDim.x * gridDim.x;

		// Per-thread running sum over the sites this thread visits
		double3 partial;
		partial.x = 0.0;
		partial.y = 0.0;
		partial.z = 0.0;

		for (int site = blockIdx.x * blockDim.x + threadIdx.x; site < m_d_nRigidSites[thisbody]; site += stride){
			const int base = 3*site + 3*(*d_maxsite)*thisbody;

			partial.x += d_temparray[base];
			partial.y += d_temparray[1 + base];
			partial.z += d_temparray[2 + base];
		}

		smem[threadIdx.x] = partial;

		__syncthreads();

		// Pairwise summation within the block
		for (int half = blockDim.x/2; half != 0; half /= 2){
			if (threadIdx.x < half){
				smem[threadIdx.x].x += smem[threadIdx.x + half].x;
				smem[threadIdx.x].y += smem[threadIdx.x + half].y;
				smem[threadIdx.x].z += smem[threadIdx.x + half].z;
			}
			__syncthreads();
		}

		if (threadIdx.x == 0){
			double *out = d_blockresult + 3*blockIdx.x;
			out[0] = smem[0].x;
			out[1] = smem[0].y;
			out[2] = smem[0].z;
		}
	}

	// Second step of the per-body sum over d_temparray. Thread t loads the partial sum of
	// block t from d_blockresult (threads at or beyond numaddblocks contribute zero), the
	// block adds everything up in shared memory, and thread 0 stores the total at
	// d_gkrigid[3*(*d_cmax) + 3*thisbody .. + 2].
	//
	// Uses threadIdx.x only, so it is meant to run as a single block. Needs
	// sizeof(double3)*blockDim.x bytes of dynamic shared memory; the halving loop only
	// covers every element when blockDim.x is a power of two.
	__global__ void addblockresult2(double *d_blockresult, double *d_gkrigid, int *d_cmax)
	{
		extern __shared__ double3 smem[];

		double3 mine;
		mine.x = 0.0;
		mine.y = 0.0;
		mine.z = 0.0;

		if (threadIdx.x < numaddblocks){
			const double *in = d_blockresult + 3*threadIdx.x;
			mine.x = in[0];
			mine.y = in[1];
			mine.z = in[2];
		}

		smem[threadIdx.x] = mine;
		__syncthreads();

		// Pairwise summation within the block
		for (int half = blockDim.x/2; half != 0; half /= 2){
			if (threadIdx.x < half){
				smem[threadIdx.x].x += smem[threadIdx.x + half].x;
				smem[threadIdx.x].y += smem[threadIdx.x + half].y;
				smem[threadIdx.x].z += smem[threadIdx.x + half].z;
			}
			__syncthreads();
		}

		if (threadIdx.x == 0){
			double *out = d_gkrigid + 3*(*d_cmax) + 3*thisbody;
			out[0] = smem[0].x;
			out[1] = smem[0].y;
			out[2] = smem[0].z;
		}
	}

	// Copies the gradient of the atoms listed in m_d_RigidSingles from the atomistic
	// gradient to the rigid-body gradient. Entry k of the list is a 1-based atom number; its
	// three components are read from d_gk[3*atom - 3 .. 3*atom - 1] and written to d_gkrigid
	// at offset 6*(*d_cmax) + 3*k. The number of entries is the device variable singlesthreads.
	__global__ void gradsingleatoms(double *d_gk, double *d_gkrigid, int *d_cmax, int *m_d_RigidSingles)
	{
		const int stride = blockDim.x * gridDim.x;

		for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < singlesthreads; k += stride){
			const int thisatom = m_d_RigidSingles[k];
			const int src = 3*thisatom - 3;
			const int dst = 6*(*d_cmax) + 3*k;

			d_gkrigid[dst] = d_gk[src];
			d_gkrigid[dst + 1] = d_gk[src + 1];
			d_gkrigid[dst + 2] = d_gk[src + 2];
		}
	}

}
