Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Zsgpu #14

Draft
wants to merge 24 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,22 @@ if(USE_ACC)
)
endif()

#LAPACK cmake not right.
#need to use this as a variable below
find_package(LAPACK)

find_package(BLAS)
enable_language(CUDA)
find_package(CUDA REQUIRED)
find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})

#enable_language(FORTRAN)
#find_library(FORTRANLIBS fortranlibs ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})

set_property(TARGET sgpu.exe PROPERTY CXX_STANDARD 14)
target_compile_options(sgpu.exe PRIVATE ${TEST_COMP_FLAGS} -O2)
target_link_libraries(sgpu.exe PRIVATE SlaterGPU io)
#target_link_libraries(sgpu.exe PRIVATE SlaterGPU io "${CUDART_LIBRARY}" "${CUDA_LIBRARIES}" )
target_link_libraries(sgpu.exe PRIVATE SlaterGPU io "${FORTRANLIBS}" "${CUDART_LIBRARY}" "${CUDA_LIBRARIES}" "${CUDA_TOOLKIT_ROOT_DIR}/../../math_libs/lib64/libcublas.so" "${CUDA_TOOLKIT_ROOT_DIR}/../../math_libs/lib64/libcusolver.so" "-L/home/paulzim/integrate/gpu/lib/lapack-3.9.0/lib/ -llapack -lblas" "-L/export/apps/RockyOS8/nvidia_hpc/2024_249/Linux_x86_64/24.9/ -fortranlibs" )
file(COPY geom_1 DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(COPY geom_2 DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(COPY geom_3 DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
Expand Down
30 changes: 15 additions & 15 deletions examples/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ int main(int argc, char* argv[]) {

int charge = 0;
int nup = 0;
int * atno = new int[100](); //
int * atno = new int[100]();
double * coords;
vector< vector< double > > basis;
vector< vector< double > > basis_aux;
Expand Down Expand Up @@ -153,17 +153,17 @@ int main(int argc, char* argv[]) {
double * T = new double[N2];
double * pVp = new double[N2];

//Setup for VdV and 4c integral tests
//Setup for VdV and 4c integral tests

double* Pao = new double[N2]();
//need Pao file!!
Pao[0] = 1.;
read_square(N,Pao,"Pao");

int nc = 1;
float* coordsc = new float[3*nc];
coordsc[0] = 0.; coordsc[1] = 0.; coordsc[2] = coordsf[2]+20.;

double* V = new double[nc];
double* dV = new double[3*nc];

Expand All @@ -179,15 +179,15 @@ int main(int argc, char* argv[]) {

if (check_PS() > 0)
compute_ps_integrals_to_disk(natoms,atno,coords,basis,basis_aux,prl);
else
else
{
printf("Computing Standard Integrals:\n");

auto t1 = chrono::high_resolution_clock::now();
compute_ST(natoms, atno, coordsf, basis, nrad, size_ang, ang_g, ang_w, S, T, prl);

auto t2 = chrono::high_resolution_clock::now();

bool do_2c_v1 = read_int("DO_2c_V1");
if (do_2c_v1)
{
Expand Down Expand Up @@ -216,12 +216,12 @@ int main(int argc, char* argv[]) {
else
if (prl > 0) printf(" using compute_all_3c_v2 \n");
compute_all_3c_v2(0,natoms,atno,coordsf,basis,basis_aux,nrad,size_ang,ang_g,ang_w,C,prl);
}
}
else
{
compute_Enp_para(ngpu,natoms,atno,coordsf,basis,nrad,size_ang,ang_g,ang_w,En,pVp,prl);
auto t4 = chrono::high_resolution_clock::now();

compute_all_3c_para(ngpu,0,natoms,atno,coordsf,basis,basis_aux,nrad,size_ang,ang_g,ang_w,C,prl);
}

Expand All @@ -231,26 +231,26 @@ int main(int argc, char* argv[]) {
auto t6 = chrono::high_resolution_clock::now();
if (c4 > 0)
compute_all_4c_v2(natoms,atno,coordsf,basis,nrad,size_ang,ang_g,ang_w,g,prl);

auto t7 = chrono::high_resolution_clock::now();

auto elapsed12 = chrono::duration_cast<chrono::nanoseconds>(t2-t1).count();
auto elapsed23 = chrono::duration_cast<chrono::nanoseconds>(t3-t2).count();
auto elapsed34 = chrono::duration_cast<chrono::nanoseconds>(t4-t3).count();
auto elapsed45 = chrono::duration_cast<chrono::nanoseconds>(t5-t4).count();
auto elapsed56 = chrono::duration_cast<chrono::nanoseconds>(t6-t5).count();
auto elapsed67 = chrono::duration_cast<chrono::nanoseconds>(t7-t6).count();
auto elapsed17 = chrono::duration_cast<chrono::nanoseconds>(t7-t1).count();
auto elapsed67 = chrono::duration_cast<chrono::nanoseconds>(t7-t6).count();
auto elapsed17 = chrono::duration_cast<chrono::nanoseconds>(t7-t1).count();

printf("-------------------------------\n");
printf("Integral ST time: %5.3e s\n",(double)elapsed12/1.e9);
printf("Integral 2c2e time: %5.3e s\n",(double)elapsed23/1.e9);

if (nomp == 1)
{
printf("Integral Vne time: %5.3e s\n",(double)elapsed34/1.e9);
printf("Integral 3c2e time: %5.3e s\n",(double)elapsed45/1.e9);
}
}
else
{
printf("Integral Vne (para) time: %5.3e s\n",(double)elapsed34/1.e9);
Expand All @@ -268,7 +268,7 @@ int main(int argc, char* argv[]) {
#pragma acc exit data copyout(A[0:Naux2],C[0:N2a],S[0:N2],En[0:N2],T[0:N2],pVp[0:N2])
#pragma acc exit data copyout(V[0:nc],dV[0:3*nc],Pao[0:N2],coordsc[0:3*nc],g[0:N2*N2])

if (prl > 0) printf("Printing Standard Integral Files:\n");
if (prl > 0) printf("Printing Standard Integral Files:\n");
write_S_En_T(N,S,En,T);
write_square(Naux,A,"A",2);
write_square(N,pVp,"pVp",2);
Expand Down
35 changes: 25 additions & 10 deletions include/becke.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,34 +11,49 @@ using std::vector;

void print_gradient(int natoms, double* grad);

void atomic_domain_cell_wt(const int ta, const float alpha, const float beta, const int natoms, const int gc, const int gs, float* grid1, float* wt1, int* atno, float* coords);
void atomic_domain_cell_wt(const int ta, const float alpha, const float beta, const int natoms, const int gc, const int gs, float* grid1, float* wt1, int* atno, double* coords);
void atomic_domain_cell_wt(const int ta, const float alpha, const float beta, const int natoms, const int gc, const int gs, double* grid1, double* wt1, int* atno, double* coords);
//MD void atomic_domain_cell_wt(const int ta, const float alpha, const float beta, const int natoms, const int gc, const int gs, double* grid1, double* wt1, int* atno, float* coordsf);

void becke_weight_nc(int natoms, float* grid1, float* wt1, float* coords);
void becke_weight_nc(int natoms, float* grid1, float* wt1, double* coords);
void becke_weight_nc(int natoms, float* grid1, float* wt1, float* coordsf);

void becke_weight_3c(int gs, float* grid1, float* wt1, float* grid2, float* wt2, float* grid3, float* wt3,
void becke_weight_3c(int gs, float* grid1, float* wt1, float* grid2, float* wt2, float* grid3, float* wt3,
int Z1, int Z2, int Z3, float A2, float B2, float C2, float A3, float B3, float C3);
void becke_weight_2c(int gs, float* grid1, float* wt1, float* grid2, float* wt2,
int Z1, int Z2, float A2, float B2, float C2);
void becke_weight_2d(int gs, double* grid1, double* wt1, double* grid2, double* wt2,
double zeta1, double zeta2, double A2, double B2, double C2);

void get_becke_grid_full(int natoms, int* atno, float* coords, int nrad, int nang, float* ang_g, float* ang_w, const int gc, float* grid, float* wt);
void get_becke_grid_full(int natoms, int* atno, float* coords, int nrad, int nang, double* ang_g, double* ang_w, const int gc, float* grid, float* wt);
void get_becke_grid_full(int natoms, int* atno, float* coords, int nrad, int nang, double* ang_g, double* ang_w, const int gc, double* grid, double* wt);
void get_becke_grid_full(float alpha, int natoms, int* atno, float* coords, int nrad, int nang, float* ang_g, float* ang_w, const int gc, float* grid, float* wt);
void get_becke_grid_full(int natoms, int* atno, double* coords, int nrad, int nang, float* ang_g, float* ang_w, const int gc, float* grid, float* wt);
void get_becke_grid_full(int natoms, int* atno, double* coords, int nrad, int nang, double* ang_g, double* ang_w, const int gc, float* grid, float* wt);
void get_becke_grid_full(int natoms, int* atno, double* coords, int nrad, int nang, double* ang_g, double* ang_w, const int gc, double* grid, double* wt);

void compute_rho(int natoms, int* atno, double* coords, vector<vector<double> > &basis, double* Pao, int nrad, int gsa, float* grid, double* rho, double* drho, int prl);
void compute_rhod(int natoms, int* atno, double* coords, vector<vector<double> > &basis, double* Pao, int nrad, int gsa, double* grid, double* rho, double* drho, double* Td, int prl);
void compute_rhod(int natoms, int* atno, double* coords, vector<vector<double> > &basis, double* Pao, int nrad, int gsa, float* grid, double* rho, double* drho, int prl);
void compute_rho(bool gbasis, int natoms, int* atno, double* coords, vector<vector<double> > &basis, double* Pao, int nrad, int gsa, float* grid, double* rho, double* drho, int prl);

void compute_delta(int natoms, int* atno, float* coords, vector<vector<double> > &basis1, vector<vector<double> > &basis2, int No, double* jCA, bool gga, bool tau, float* rho, int gsa, float* grid, float* wt, double* diff, int prl);
void compute_fxcd(int natoms, int* atno, double* coords, vector<vector<double> > &basis, bool gga, bool tau, bool need_wt, double* Pao, double* vxc, double* vxcs, int nrad, int gsa, double* grid, double* wt, double* fxc, int prl);
void compute_fxc(int natoms, int* atno, double* coords, vector<vector<double> > &basis, bool gga, bool tau, bool need_wt, double* Pao, double* vxc, double* vxcs, int nrad, int gsa, float* grid, float* wt, double* fxc, int prl);
void compute_fxc(int natoms, int* atno, double* coords, vector<vector<double> > &basis, bool gga, bool tau, bool need_wt, double* Pao, float* vxc, float* vxcs, int nrad, int gs, float* grid, float* wt, double* fxc, int prl);
void compute_fxc(bool gbasis, int natoms, int* atno, double* coords, vector<vector<double> > &basis, bool need_wt, int gsa, float* grid, float* wt, double* vxc, double* fxc, int prl);
void compute_fxc(bool gbasis, int natoms, int* atno, double* coords, vector<vector<double> > &basis, bool need_wt, int gsa, float* grid, float* wt, float* vxc, double* fxc, int prl);

void compute_dft_grad(int natoms, int* atno, float* coords, vector<vector<double> > &basis, double* Pao, bool is_gga, double* vxc, double* vxcs, int gs, float* grid, double* grad, int prl);
void compute_delta(int natoms, int* atno, double* coords, vector<vector<double> > &basis1, vector<vector<double> > &basis2, int No, double* jCA, bool gga, bool tau, float* rho, int gsa, float* grid, float* wt, double* diff, int prl);

void compute_dft_grad(int natoms, int* atno, double* coords, vector<vector<double> > &basis, double* Pao, bool is_gga, double* vxc, double* vxcs, int gs, float* grid, double* grad, int prl);

//void atomic_charges(int natoms, int gs, float* chg, double* rho, float** wta, int prl);
void atomic_charges(int natoms, int gs, float* chg, double* rho, float* zta, float** gridall, float** wta, int prl);
void batomic_charges(float alpha, int natoms, int* atno, float* coords, int nrad, int nang, float* ang_g, float* ang_w, float* chg, double* rho, int prl);
void batomic_charges(float alpha, int natoms, int* atno, double* coords, int nrad, int nang, float* ang_g, float* ang_w, float* chg, double* rho, int prl);

void becke_charges(int natoms, int gs, float* chg, double* rho, float* wt, int prl);
void becke_charges(int natoms, int gs, double* chg, double* rho, double* wt, int prl);
void density_in_basis2(int natoms, int* atno, float* coords, vector<vector<double> > &basis1, vector<vector<double> > &basis2, int No, double* jCA, int gsa, float* grid, float* wt, double* Paom, int prl);
void density_in_basis2(int natoms, int* atno, double* coords, vector<vector<double> > &basis1, vector<vector<double> > &basis2, int No, double* jCA, int gsa, float* grid, float* wt, double* Paom, int prl);

double becke_ad(int Z1, int Z2);
float becke_a(int Z1, int Z2);
float bf3(float f1);
double bf3d(double f1);
Expand Down
4 changes: 0 additions & 4 deletions include/braggslater.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,18 +36,14 @@ float get_radii(int Z1)
return val;
}

#if USE_ACC
//#pragma acc routine seq
#endif
float bsf(int a0, int a1, int a2)
{
float val = 0.35*a0 + 0.85*a1 + a2;
return val;
}

#if USE_ACC
//#pragma acc routine seq
#endif
float get_radii_2(int Z)
{
if (Z<=0) Z = 1;
Expand Down
File renamed without changes.
File renamed without changes.
8 changes: 6 additions & 2 deletions src/libcintw/cintprep.h → include/cintprep.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ extern "C" {

using namespace std;

//allow external access to normalization
double get_gto_norm(int shl1, double zeta1);

class CINTPrep {
public: //should probably make this private...
string xyzfile;
Expand Down Expand Up @@ -48,6 +51,7 @@ class CINTPrep {
void set_bas(int *&bas_in);
void set_env(double *&env_in);
void assign_coords(int natoms, int *atomlist, double *coords, bool in_bohr = true);
void assign_coords(int natoms, int *atomlist, float *coords, bool in_bohr = true);

//RI funcs
int get_var_dim_ri();
Expand All @@ -56,12 +60,12 @@ class CINTPrep {

void read_xyz(string inpxyz);
void read_bas(string inpbas);
void read_bas_ri(string inpbas);
bool read_bas_ri(string inpbas);
void prep_env();

void copy_atoms(vector< int > &atoms_copy);
void copy_coord(vector< double > &coord_copy);

unordered_map<short, short> anum_to_N;
};

Expand Down
8 changes: 7 additions & 1 deletion src/libcintw/cintwrapper.h → include/cintwrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ extern "C" {
int *atm, int natm, int *bas, int nbas, double *env);
int cint1e_ovlp_sph(double *buf, int *shls,
int *atm, int natm, int *bas, int nbas, double *env);
int cint1e_ovlpip_sph(double *buf, int *shls,
int *atm, int natm, int *bas, int nbas, double *env);
int cint2e_sph(double *buf, int *shls,
int *atm, int natm, int *bas, int nbas, double *env, CINTOpt *no_opt);

Expand Down Expand Up @@ -95,11 +97,15 @@ void get_hcore(double *hcore, int N,
int natm, int nbas, int nenv,
int *atm, int *bas, double *env);

void get_tcore(double *tcore, int N,
int natm, int nbas, int nenv,
int *atm, int *bas, double *env);

void gen_eri(double **eri, int N,
int natm, int nbas, int nenv,
int *atm, int *bas, double *env);

void gen_jMOI_gto(double **eri, int N,
void gen_jMOI_gto(double **eri, int N,
int natm, int nbas, int nenv,
int *atm, int *bas, double *env);

Expand Down
57 changes: 57 additions & 0 deletions include/cpu_util.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#ifndef CPU_UTILH
#define CPU_UTILH

#include <stdio.h>
#include <cstdlib>
#include <stdlib.h>

#include <algorithm>
#include <math.h>
#include <complex>

#include <ctime>
#include <sys/time.h>

using namespace std;

//#define lapack_complex_float complex< float >
//#define lapack_complex_double complex< double >

//#include <lapacke.h>
//#include <cblas.h>

double randomf(double a, double b);

void expmat_complex_cpu(int N, double* theta, double* thetai, double* etheta);
void expmat_cpu(int N, double *theta, double *etheta);

int la_diagR(int neig, double* A, double* eigen, double* eigeni);
void la_diag(int neig, int s1, double* A, double* Ae);
int invert_stable_cpu(double* A, int size, double delta);
int invert_stable_cpu(double* A, int size, double delta, int prl);
int invert_stable_cpu_2(double* A, int size, double delta);
int invert_stable_cpu_3(double* A, int size, double delta, bool root);
int invert_cpu(double* A, int size, int mode);
int mat_root_cpu(double* A, int size);
int mat_root_inv_cpu(double* A, int size);
int mat_root_inv_stable_cpu(double* A, int size, double inv_cutoff, int prl);

void trans_cpu(float* Bt, float* B, int m, int n);
void trans_cpu(double* Bt, double* B, int m, int n);
void cross(double* m, double* r1, double* r2);
int sign(double val);

void mat_times_mat_cpu(float* C, float* A, float* B, int M, int N, int K);
void mat_times_mat_cpu(float* C, float* A, float* B, int N);
void mat_times_mat_cpu(double* C, double* A, double* B, int M, int N, int K);
void mat_times_mat_ct_cpu(double* C, double* A, double* B, int M, int N, int K);
void mat_times_mat_cpu(double* C, double* A, double* B, int N);
void mat_times_mat_at_cpu(double* C, double* A, double* B, int N);
void mat_times_mat_bt_cpu(float* C, float* A, float* B, int M, int N, int K);
void mat_times_mat_bt_cpu(double* C, double* A, double* B, int M, int N, int K, int LDAB);
void mat_times_mat_bt_cpu(float* C, float* A, float* B, int N);
void mat_times_mat_bt_cpu(double* C, double* A, double* B, int M, int N, int K);
void mat_times_mat_bt_cpu(double* C, double* A, double* B, int N);


#endif
44 changes: 44 additions & 0 deletions include/cuda_util.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#ifndef CUDA_UTILH
#define CUDA_UTILH

//#include <cublas.h>
#include <cublas_v2.h>
#include <cusolverDn.h>

int invert_eigen_cusolver(int size, double* A, double eig_max, cusolverDnHandle_t& cu_hdl);
int invert_stable_cusolver(int size, double* A, double delta, cusolverDnHandle_t& cu_hdl);
int invert_stable_cusolver_2(int size, double* A, double delta, cusolverDnHandle_t& cu_hdl);
int invert_cusolver(int N, float* A, cusolverDnHandle_t& cu_hdl);
int invert_cusolver(int N, double* A, cusolverDnHandle_t& cu_hdl);
void diagonalize_cusolver(int N, float* A, float* Ae, cusolverDnHandle_t& cu_hdl);
void diagonalize_cusolver(int Ne, int N, double* A, double* Ae, cusolverDnHandle_t& cu_hdl);

int mat_root_inv_cusolver(double* A, int size, cusolverDnHandle_t& cu_hdl);
int mat_root_inv_stable_cusolver(double* A, int size, double delta, cusolverDnHandle_t& cu_hdl);

void expmat(int N, double* theta1, double* U, cusolverDnHandle_t cu_hdl, cublasHandle_t cublasH);
//double expmat_complex(int N, double* theta1, double* theta1i, double* U, cusolverDnHandle_t cu_hdl, cublasHandle_t cublasH);
double expmat_complex(int N, double* theta1, double* theta1i, double* jCA, double* U, cusolverDnHandle_t cu_hdl, cublasHandle_t cublasH);

void mat_times_mat(float* C, float* A, float* B, int N);
void mat_times_mat(float* C, float* A, float* B, int M, int N, int K);
void mat_times_mat_at(float* C, float* A, float* B, int M, int N, int K);
void mat_times_mat_bt(float* C, float* A, float* B, int N);

void mat_times_mat(double* C, double* A, double* B, int N);
void mat_times_mat(double* C, double* A, double* B, int M, int N, int K);
void mat_times_mat_at(double* C, double* A, double* B, int N);
void mat_times_mat_at(double* C, double* A, double* B, int M, int N, int K);
void mat_times_mat_bt(double* C, double* A, double* B, int N);
void mat_times_mat_bt(double* C, double* A, double* B, int M, int N, int K);

#pragma acc routine seq
void trans(float* Bt, float* B, int m, int n);
#pragma acc routine seq
void trans(double* Bt, double* B, int m, int n);

void copy_to_all_gpu(int ngpu, int s1, double* A, int include_first);
void copy_to_all_gpu(int ngpu, int s1, float* A, int include_first);
void copy_to_all_gpu(int ngpu, int s1, int s2, double** A, int include_first);

#endif
Loading