ZimmermanGroup · joshkamm · Nov 1, 2024 · Nov 1, 2024 · Nov 1, 2024 · Nov 1, 2024
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -16,9 +16,22 @@ if(USE_ACC)
         )
 endif()
 
+#LAPACK cmake not right.
+#need to use this as a variable below
+find_package(LAPACK)
+
+find_package(BLAS)
+enable_language(CUDA)
+find_package(CUDA REQUIRED)
+find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+
+#enable_language(FORTRAN)
+#find_library(FORTRANLIBS fortranlibs ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+
 set_property(TARGET sgpu.exe PROPERTY CXX_STANDARD 14)
 target_compile_options(sgpu.exe PRIVATE ${TEST_COMP_FLAGS} -O2)
-target_link_libraries(sgpu.exe PRIVATE SlaterGPU io)
+#target_link_libraries(sgpu.exe PRIVATE SlaterGPU io "${CUDART_LIBRARY}" "${CUDA_LIBRARIES}" )
+target_link_libraries(sgpu.exe PRIVATE SlaterGPU io "${FORTRANLIBS}" "${CUDART_LIBRARY}" "${CUDA_LIBRARIES}" "${CUDA_TOOLKIT_ROOT_DIR}/../../math_libs/lib64/libcublas.so" "${CUDA_TOOLKIT_ROOT_DIR}/../../math_libs/lib64/libcusolver.so" "-L/home/paulzim/integrate/gpu/lib/lapack-3.9.0/lib/ -llapack -lblas" "-L/export/apps/RockyOS8/nvidia_hpc/2024_249/Linux_x86_64/24.9/ -fortranlibs" )
 file(COPY geom_1 DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 file(COPY geom_2 DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 file(COPY geom_3 DESTINATION ${CMAKE_CURRENT_BINARY_DIR})

diff --git a/examples/main.cpp b/examples/main.cpp
@@ -108,7 +108,7 @@ int main(int argc, char* argv[]) {
 
   int charge = 0;
   int nup = 0;
-  int * atno = new int[100](); // 
+  int * atno = new int[100]();
   double * coords;
   vector< vector< double > > basis;
   vector< vector< double > > basis_aux;
@@ -153,17 +153,17 @@ int main(int argc, char* argv[]) {
     double * T = new double[N2];
     double * pVp = new double[N2];
 
-    //Setup for VdV and 4c integral tests 
-    
+    //Setup for VdV and 4c integral tests
+
     double* Pao = new double[N2]();
     //need Pao file!!
     Pao[0] = 1.;
     read_square(N,Pao,"Pao");
-    
+
     int nc = 1;
     float* coordsc = new float[3*nc];
     coordsc[0] = 0.; coordsc[1] = 0.; coordsc[2] = coordsf[2]+20.;
-    
+
     double* V = new double[nc];
     double* dV = new double[3*nc];
 
@@ -179,15 +179,15 @@ int main(int argc, char* argv[]) {
 
     if (check_PS() > 0)
       compute_ps_integrals_to_disk(natoms,atno,coords,basis,basis_aux,prl);
-    else 
+    else
     {
       printf("Computing Standard Integrals:\n");
 
       auto t1 = chrono::high_resolution_clock::now();
       compute_ST(natoms, atno, coordsf, basis, nrad, size_ang, ang_g, ang_w, S, T, prl);
 
       auto t2 = chrono::high_resolution_clock::now();
-      
+
       bool do_2c_v1 = read_int("DO_2c_V1");
       if (do_2c_v1)
       {
@@ -216,12 +216,12 @@ int main(int argc, char* argv[]) {
         else
           if (prl > 0) printf("  using compute_all_3c_v2 \n");
           compute_all_3c_v2(0,natoms,atno,coordsf,basis,basis_aux,nrad,size_ang,ang_g,ang_w,C,prl);
-      } 
+      }
       else
       {
         compute_Enp_para(ngpu,natoms,atno,coordsf,basis,nrad,size_ang,ang_g,ang_w,En,pVp,prl);
         auto t4 = chrono::high_resolution_clock::now();
-        
+
         compute_all_3c_para(ngpu,0,natoms,atno,coordsf,basis,basis_aux,nrad,size_ang,ang_g,ang_w,C,prl);
       }
 
@@ -231,26 +231,26 @@ int main(int argc, char* argv[]) {
       auto t6 = chrono::high_resolution_clock::now();
       if (c4 > 0)
         compute_all_4c_v2(natoms,atno,coordsf,basis,nrad,size_ang,ang_g,ang_w,g,prl);
-      
+
       auto t7 = chrono::high_resolution_clock::now();
 
       auto elapsed12 = chrono::duration_cast<chrono::nanoseconds>(t2-t1).count();
       auto elapsed23 = chrono::duration_cast<chrono::nanoseconds>(t3-t2).count();
       auto elapsed34 = chrono::duration_cast<chrono::nanoseconds>(t4-t3).count();
       auto elapsed45 = chrono::duration_cast<chrono::nanoseconds>(t5-t4).count();
       auto elapsed56 = chrono::duration_cast<chrono::nanoseconds>(t6-t5).count();
-      auto elapsed67 = chrono::duration_cast<chrono::nanoseconds>(t7-t6).count();    
-      auto elapsed17 = chrono::duration_cast<chrono::nanoseconds>(t7-t1).count();    
+      auto elapsed67 = chrono::duration_cast<chrono::nanoseconds>(t7-t6).count();
+      auto elapsed17 = chrono::duration_cast<chrono::nanoseconds>(t7-t1).count();
 
       printf("-------------------------------\n");
       printf("Integral ST   time: %5.3e s\n",(double)elapsed12/1.e9);
       printf("Integral 2c2e time: %5.3e s\n",(double)elapsed23/1.e9);
-      
+
       if (nomp == 1)
       {
         printf("Integral Vne  time: %5.3e s\n",(double)elapsed34/1.e9);
         printf("Integral 3c2e time: %5.3e s\n",(double)elapsed45/1.e9);
-      } 
+      }
       else
       {
         printf("Integral Vne  (para)  time: %5.3e s\n",(double)elapsed34/1.e9);
@@ -268,7 +268,7 @@ int main(int argc, char* argv[]) {
       #pragma acc exit data copyout(A[0:Naux2],C[0:N2a],S[0:N2],En[0:N2],T[0:N2],pVp[0:N2])
       #pragma acc exit data copyout(V[0:nc],dV[0:3*nc],Pao[0:N2],coordsc[0:3*nc],g[0:N2*N2])
 
-      if (prl > 0) printf("Printing Standard Integral Files:\n");  
+      if (prl > 0) printf("Printing Standard Integral Files:\n");
       write_S_En_T(N,S,En,T);
       write_square(Naux,A,"A",2);
       write_square(N,pVp,"pVp",2);

diff --git a/include/becke.h b/include/becke.h
@@ -11,34 +11,49 @@ using std::vector;
 
 void print_gradient(int	natoms,	double*	grad);
 
-void atomic_domain_cell_wt(const int ta, const float alpha, const float beta, const int natoms, const int gc, const int gs, float* grid1, float* wt1, int* atno, float* coords);
+void atomic_domain_cell_wt(const int ta, const float alpha, const float beta, const int natoms, const int gc, const int gs, float* grid1, float* wt1, int* atno, double* coords);
+void atomic_domain_cell_wt(const int ta, const float alpha, const float beta, const int natoms, const int gc, const int gs, double* grid1, double* wt1, int* atno, double* coords);
+//MD void atomic_domain_cell_wt(const int ta, const float alpha, const float beta, const int natoms, const int gc, const int gs, double* grid1, double* wt1, int* atno, float* coordsf);
 
-void becke_weight_nc(int natoms, float* grid1, float* wt1, float* coords);
+void becke_weight_nc(int natoms, float* grid1, float* wt1, double* coords);
+void becke_weight_nc(int natoms, float* grid1, float* wt1, float* coordsf);
 
-void becke_weight_3c(int gs, float* grid1, float* wt1, float* grid2, float* wt2, float* grid3, float* wt3, 
+void becke_weight_3c(int gs, float* grid1, float* wt1, float* grid2, float* wt2, float* grid3, float* wt3,
                      int Z1, int Z2, int Z3, float A2, float B2, float C2, float A3, float B3, float C3);
 void becke_weight_2c(int gs, float* grid1, float* wt1, float* grid2, float* wt2,
                      int Z1, int Z2, float A2, float B2, float C2);
 void becke_weight_2d(int gs, double* grid1, double* wt1, double* grid2, double* wt2,
                      double zeta1, double zeta2, double A2, double B2, double C2);
 
-void get_becke_grid_full(int natoms, int* atno, float* coords, int nrad, int nang, float* ang_g, float* ang_w, const int gc, float* grid, float* wt);
-void get_becke_grid_full(int natoms, int* atno, float* coords, int nrad, int nang, double* ang_g, double* ang_w, const int gc, float* grid, float* wt);
-void get_becke_grid_full(int natoms, int* atno, float* coords, int nrad, int nang, double* ang_g, double* ang_w, const int gc, double* grid, double* wt);
+void get_becke_grid_full(float alpha, int natoms, int* atno, float* coords, int nrad, int nang, float* ang_g, float* ang_w, const int gc, float* grid, float* wt);
+void get_becke_grid_full(int natoms, int* atno, double* coords, int nrad, int nang, float* ang_g, float* ang_w, const int gc, float* grid, float* wt);
+void get_becke_grid_full(int natoms, int* atno, double* coords, int nrad, int nang, double* ang_g, double* ang_w, const int gc, float* grid, float* wt);
 void get_becke_grid_full(int natoms, int* atno, double* coords, int nrad, int nang, double* ang_g, double* ang_w, const int gc, double* grid, double* wt);
 
+void compute_rho(int natoms, int* atno, double* coords, vector<vector<double> > &basis, double* Pao, int nrad, int gsa, float* grid, double* rho, double* drho, int prl);
+void compute_rhod(int natoms, int* atno, double* coords, vector<vector<double> > &basis, double* Pao, int nrad, int gsa, double* grid, double* rho, double* drho, double* Td, int prl);
+void compute_rhod(int natoms, int* atno, double* coords, vector<vector<double> > &basis, double* Pao, int nrad, int gsa, float* grid, double* rho, double* drho, int prl);
+void compute_rho(bool gbasis, int natoms, int* atno, double* coords, vector<vector<double> > &basis, double* Pao, int nrad, int gsa, float* grid, double* rho, double* drho, int prl);
 
-void compute_delta(int natoms, int* atno, float* coords, vector<vector<double> > &basis1, vector<vector<double> > &basis2, int No, double* jCA, bool gga, bool tau, float* rho, int gsa, float* grid, float* wt, double* diff, int prl);
+void compute_fxcd(int natoms, int* atno, double* coords, vector<vector<double> > &basis, bool gga, bool tau, bool need_wt, double* Pao, double* vxc, double* vxcs, int nrad, int gsa, double* grid, double* wt, double* fxc, int prl);
+void compute_fxc(int natoms, int* atno, double* coords, vector<vector<double> > &basis, bool gga, bool tau, bool need_wt, double* Pao, double* vxc, double* vxcs, int nrad, int gsa, float* grid, float* wt, double* fxc, int prl);
+void compute_fxc(int natoms, int* atno, double* coords, vector<vector<double> > &basis, bool gga, bool tau, bool need_wt, double* Pao, float* vxc, float* vxcs, int nrad, int gs, float* grid, float* wt, double* fxc, int prl);
+void compute_fxc(bool gbasis, int natoms, int* atno, double* coords, vector<vector<double> > &basis, bool need_wt, int gsa, float* grid, float* wt, double* vxc, double* fxc, int prl);
+void compute_fxc(bool gbasis, int natoms, int* atno, double* coords, vector<vector<double> > &basis, bool need_wt, int gsa, float* grid, float* wt, float* vxc, double* fxc, int prl);
 
-void compute_dft_grad(int natoms, int* atno, float* coords, vector<vector<double> > &basis, double* Pao, bool is_gga, double* vxc, double* vxcs, int gs, float* grid, double* grad, int prl);
+void compute_delta(int natoms, int* atno, double* coords, vector<vector<double> > &basis1, vector<vector<double> > &basis2, int No, double* jCA, bool gga, bool tau, float* rho, int gsa, float* grid, float* wt, double* diff, int prl);
 
+void compute_dft_grad(int natoms, int* atno, double* coords, vector<vector<double> > &basis, double* Pao, bool is_gga, double* vxc, double* vxcs, int gs, float* grid, double* grad, int prl);
+
+//void atomic_charges(int natoms, int gs, float* chg, double* rho, float** wta, int prl);
 void atomic_charges(int natoms, int gs, float* chg, double* rho, float* zta, float** gridall, float** wta, int prl);
-void batomic_charges(float alpha, int natoms, int* atno, float* coords, int nrad, int nang, float* ang_g, float* ang_w, float* chg, double* rho, int prl);
+void batomic_charges(float alpha, int natoms, int* atno, double* coords, int nrad, int nang, float* ang_g, float* ang_w, float* chg, double* rho, int prl);
 
 void becke_charges(int natoms, int gs, float* chg, double* rho, float* wt, int prl);
 void becke_charges(int natoms, int gs, double* chg, double* rho, double* wt, int prl);
-void density_in_basis2(int natoms, int* atno, float* coords, vector<vector<double> > &basis1, vector<vector<double> > &basis2, int No, double* jCA, int gsa, float* grid, float* wt, double* Paom, int prl);
+void density_in_basis2(int natoms, int* atno, double* coords, vector<vector<double> > &basis1, vector<vector<double> > &basis2, int No, double* jCA, int gsa, float* grid, float* wt, double* Paom, int prl);
 
+double becke_ad(int Z1, int Z2);
 float becke_a(int Z1, int Z2);
 float bf3(float f1);
 double bf3d(double f1);

diff --git a/include/braggslater.h b/include/braggslater.h
@@ -36,18 +36,14 @@ float get_radii(int Z1)
   return val;
 }
 
-#if USE_ACC
 //#pragma acc routine seq
-#endif
 float bsf(int a0, int a1, int a2)
 {
   float val = 0.35*a0 + 0.85*a1 + a2;
   return val;
 }
 
-#if USE_ACC
 //#pragma acc routine seq
-#endif
 float get_radii_2(int Z)
 {
   if (Z<=0) Z = 1;

diff --git a/src/libcintw/cint.h → include/cint.h b/src/libcintw/cint.h → include/cint.h
diff --git a/src/libcintw/cint_funcs.h → include/cint_funcs.h b/src/libcintw/cint_funcs.h → include/cint_funcs.h
diff --git a/src/libcintw/cintprep.h → include/cintprep.h b/src/libcintw/cintprep.h → include/cintprep.h
@@ -13,6 +13,9 @@ extern "C" {
 
 using namespace std;
 
+//allow external access to normalization
+double get_gto_norm(int shl1, double zeta1);
+
 class CINTPrep {
   public: //should probably make this private...
     string xyzfile;
@@ -48,6 +51,7 @@ class CINTPrep {
     void set_bas(int *&bas_in);
     void set_env(double *&env_in);
     void assign_coords(int natoms, int *atomlist, double *coords, bool in_bohr = true); 
+    void assign_coords(int natoms, int *atomlist, float *coords, bool in_bohr = true); 
 
     //RI funcs
     int get_var_dim_ri();
@@ -56,12 +60,12 @@ class CINTPrep {
 
     void read_xyz(string inpxyz);
     void read_bas(string inpbas);
-    void read_bas_ri(string inpbas);
+    bool read_bas_ri(string inpbas);
     void prep_env();
 
     void copy_atoms(vector< int > &atoms_copy);
     void copy_coord(vector< double > &coord_copy);
-
+ 
     unordered_map<short, short> anum_to_N;
 };
 

diff --git a/src/libcintw/cintwrapper.h → include/cintwrapper.h b/src/libcintw/cintwrapper.h → include/cintwrapper.h
@@ -26,6 +26,8 @@ extern "C" {
                          int *atm, int natm, int *bas, int nbas, double *env);
   int cint1e_ovlp_sph(double *buf, int *shls,
                           int *atm, int natm, int *bas, int nbas, double *env);
+  int cint1e_ovlpip_sph(double *buf, int *shls,
+                          int *atm, int natm, int *bas, int nbas, double *env);
   int cint2e_sph(double *buf, int *shls,
                      int *atm, int natm, int *bas, int nbas, double *env, CINTOpt *no_opt);
 
@@ -95,11 +97,15 @@ void get_hcore(double *hcore, int N,
                int natm, int nbas, int nenv,
                int *atm, int *bas, double *env);
 
+void get_tcore(double *tcore, int N,
+               int natm, int nbas, int nenv,
+               int *atm, int *bas, double *env);
+
 void gen_eri(double **eri, int N, 
              int natm, int nbas, int nenv,
              int *atm, int *bas, double *env);
 
-void gen_jMOI_gto(double **eri, int N, 
+void gen_jMOI_gto(double **eri, int N,
                   int natm, int nbas, int nenv,
                   int *atm, int *bas, double *env);
 

diff --git a/include/cpu_util.h b/include/cpu_util.h
@@ -0,0 +1,57 @@
+#ifndef CPU_UTILH
+#define CPU_UTILH
+
+#include <stdio.h>
+#include <cstdlib>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <math.h>
+#include <complex>
+
+#include <ctime>
+#include <sys/time.h>
+
+using namespace std;
+
+//#define lapack_complex_float complex< float >
+//#define lapack_complex_double complex< double >
+
+//#include <lapacke.h>
+//#include <cblas.h>
+
+double randomf(double a, double b);
+
+void expmat_complex_cpu(int N, double* theta, double* thetai, double* etheta);
+void expmat_cpu(int N, double *theta, double *etheta);
+
+int la_diagR(int neig, double* A, double* eigen, double* eigeni);
+void la_diag(int neig, int s1, double* A, double* Ae);
+int invert_stable_cpu(double* A, int size, double delta);
+int invert_stable_cpu(double* A, int size, double delta, int prl);
+int invert_stable_cpu_2(double* A, int size, double delta);
+int invert_stable_cpu_3(double* A, int size, double delta, bool root);
+int invert_cpu(double* A, int size, int mode);
+int mat_root_cpu(double* A, int size);
+int mat_root_inv_cpu(double* A, int size);
+int mat_root_inv_stable_cpu(double* A, int size, double inv_cutoff, int prl);
+
+void trans_cpu(float* Bt, float* B, int m, int n);
+void trans_cpu(double* Bt, double* B, int m, int n);
+void cross(double* m, double* r1, double* r2);
+int sign(double val);
+
+void mat_times_mat_cpu(float* C, float* A, float* B, int M, int N, int K);
+void mat_times_mat_cpu(float* C, float* A, float* B, int N);
+void mat_times_mat_cpu(double* C, double* A, double* B, int M, int N, int K);
+void mat_times_mat_ct_cpu(double* C, double* A, double* B, int M, int N, int K);
+void mat_times_mat_cpu(double* C, double* A, double* B, int N);
+void mat_times_mat_at_cpu(double* C, double* A, double* B, int N);
+void mat_times_mat_bt_cpu(float* C, float* A, float* B, int M, int N, int K);
+void mat_times_mat_bt_cpu(double* C, double* A, double* B, int M, int N, int K, int LDAB);
+void mat_times_mat_bt_cpu(float* C, float* A, float* B, int N);
+void mat_times_mat_bt_cpu(double* C, double* A, double* B, int M, int N, int K);
+void mat_times_mat_bt_cpu(double* C, double* A, double* B, int N);
+
+
+#endif
diff --git a/include/cuda_util.h b/include/cuda_util.h
@@ -0,0 +1,44 @@
+#ifndef CUDA_UTILH
+#define CUDA_UTILH
+
+//#include <cublas.h>
+#include <cublas_v2.h>
+#include <cusolverDn.h>
+
+int invert_eigen_cusolver(int size, double* A, double eig_max, cusolverDnHandle_t& cu_hdl);
+int invert_stable_cusolver(int size, double* A, double delta, cusolverDnHandle_t& cu_hdl);
+int invert_stable_cusolver_2(int size, double* A, double delta, cusolverDnHandle_t& cu_hdl);
+int invert_cusolver(int N, float* A, cusolverDnHandle_t& cu_hdl);
+int invert_cusolver(int N, double* A, cusolverDnHandle_t& cu_hdl);
+void diagonalize_cusolver(int N, float* A, float* Ae, cusolverDnHandle_t& cu_hdl);
+void diagonalize_cusolver(int Ne, int N, double* A, double* Ae, cusolverDnHandle_t& cu_hdl);
+
+int mat_root_inv_cusolver(double* A, int size, cusolverDnHandle_t& cu_hdl);
+int mat_root_inv_stable_cusolver(double* A, int size, double delta, cusolverDnHandle_t& cu_hdl);
+
+void expmat(int N, double* theta1, double* U, cusolverDnHandle_t cu_hdl, cublasHandle_t cublasH);
+//double expmat_complex(int N, double* theta1, double* theta1i, double* U, cusolverDnHandle_t cu_hdl, cublasHandle_t cublasH);
+double expmat_complex(int N, double* theta1, double* theta1i, double* jCA, double* U, cusolverDnHandle_t cu_hdl, cublasHandle_t cublasH);
+
+void mat_times_mat(float* C, float* A, float* B, int N);
+void mat_times_mat(float* C, float* A, float* B, int M, int N, int K);
+void mat_times_mat_at(float* C, float* A, float* B, int M, int N, int K);
+void mat_times_mat_bt(float* C, float* A, float* B, int N);
+
+void mat_times_mat(double* C, double* A, double* B, int N);
+void mat_times_mat(double* C, double* A, double* B, int M, int N, int K);
+void mat_times_mat_at(double* C, double* A, double* B, int N);
+void mat_times_mat_at(double* C, double* A, double* B, int M, int N, int K);
+void mat_times_mat_bt(double* C, double* A, double* B, int N);
+void mat_times_mat_bt(double* C, double* A, double* B, int M, int N, int K);
+
+#pragma acc routine seq
+void trans(float* Bt, float* B, int m, int n);
+#pragma acc routine seq
+void trans(double* Bt, double* B, int m, int n);
+
+void copy_to_all_gpu(int ngpu, int s1, double* A, int include_first);
+void copy_to_all_gpu(int ngpu, int s1, float* A, int include_first);
+void copy_to_all_gpu(int ngpu, int s1, int s2, double** A, int include_first);
+
+#endif