diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml
index f12bd19..64e19b3 100644
--- a/.github/workflows/python-test.yml
+++ b/.github/workflows/python-test.yml
@@ -17,14 +17,44 @@ jobs:
     - name: Install dependencies
       run: |
         #python -m pip install --upgrade pip
+        #conda create --name ENV python=3.9
+        #conda init
+        #conda activate ENV
+        #cd /opt/hostedtoolcache/Python/3.9.20/x64/bin/; ls
         if [ -f pytest_requirements.txt ]; then pip install -r pytest_requirements.txt; fi
-        pip install numpy
+        #pip install pytest-executable
+        #pip install pipdeptree
+        #pip install dask
+        export CONDA_ALWAYS_YES="true"
+        pip install 'numpy==1.24.0'
         pip install rdkit
         pip install scipy
+        pip install xtb
+        pip install joblib
+        pip install PyYAML
+        pip install openbabel-wheel
+        pip install pandas
+        pip install ase
+        #pip install xtb
+        conda install -y conda-forge::xtb
+        pip install xgboost
+        pip install h5py
+        pip install pysisyphus
+        conda install -y conda-forge::crest
+        unset CONDA_ALWAYS_YES
         pip install .
+        # CHECK: list the installed packages
+        pip freeze
+
+        # Prepare the pysis env: point .pysisyphusrc at the conda install
+        cp .pysisyphusrc /home/runner/.pysisyphusrc
+        sed -i "s|CONDA_ENV|${CONDA}|g" /home/runner/.pysisyphusrc
+        #which crest
+
         #if [ -f env.yaml ]; then
         #  #conda install --file env.yaml
         #  conda env create -f env.yaml
@@ -33,4 +63,22 @@ jobs:
     - name: Run pytest
       run: |
         #conda activate yarp
-        cd examples/; pytest -s # Assuming your tests are in the 'tests' directory
+        #cd examples/; pytest -s
+        home=$(pwd)
+        cd /opt/hostedtoolcache/Python/3.9.20/x64/bin/
+        ls
+        echo $CONDA
+        cd $CONDA/bin
+        ls
+        which pip; echo "GOOD1"
+        echo $(pwd)
+        $CONDA/bin/xtb --version
+        $CONDA/bin/crest --version
+        pysis --version
+        #obabel --version
+        cd $home/pyTEST_Example/
+        # xtb and crest live in the conda bin; the other tools are in the pip bin
+        sed -i "s|CONDA_PATH|${CONDA}|g" test_rxn.py
+        pytest -s
+
+      shell: bash
diff --git a/pyTEST_Example/ML.py b/pyTEST_Example/ML.py
new file mode 100644
index 0000000..943b7c7
--- /dev/null
+++ b/pyTEST_Example/ML.py
@@ -0,0 +1,40 @@
+from typing import Dict, List
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+
+elem_to_num = {'H': 1, 'C': 6, 'N': 7, 'O': 8}
+
+class EnsembledModel(nn.Module):
+    def __init__(self, models: List, x=['coord', 'numbers', 'charge'], out=['energy'], detach=False):
+        super().__init__()
+        self.models = nn.ModuleList(models)
+        self.x = x          # input keys forwarded to every sub-model
+        self.out = out      # output keys collected from every sub-model
+        self.detach = detach
+
+    def forward(self, data: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        res: List[Dict[str, Tensor]] = []
+        for model in self.models:
+            # pass only the requested input keys to each sub-model
+            _in = dict()
+            for k in data:
+                if k in self.x:
+                    _in[k] = data[k]
+            _out = model(_in)
+            # keep only the requested output keys
+            _r = dict()
+            for k in _out:
+                if k in self.out:
+                    _r[k] = _out[k]
+                    if self.detach:
+                        _r[k] = _r[k].detach()
+            res.append(_r)
+
+        # average each collected key over the ensemble and record its spread
+        for k in res[0]:
+            v = []
+            for x in res:
+                v.append(x[k])
+            vv = torch.stack(v, dim=0)
+            data[k] = vv.mean(dim=0)
+            data[k + '_std'] = vv.std(dim=0)
+        return data
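Reviewer note on ML.py: a minimal usage sketch of EnsembledModel (not part of the patch; the toy sub-model and tensor shapes are assumptions, chosen only to match the default x/out key names above):

    import torch
    import torch.nn as nn
    from ML import EnsembledModel

    class ToyPotential(nn.Module):
        """Hypothetical stand-in for a trained NN potential: reads 'coord', returns 'energy'."""
        def __init__(self, scale):
            super().__init__()
            self.scale = scale
        def forward(self, data):
            # one energy value per batch entry
            return {'energy': self.scale * data['coord'].pow(2).sum(dim=(1, 2))}

    ens = EnsembledModel([ToyPotential(1.0), ToyPotential(1.1)], x=['coord'], out=['energy'])
    batch = {'coord': torch.randn(4, 3, 3)}   # 4 structures, 3 atoms, xyz
    out = ens(batch)
    print(out['energy'].shape, out['energy_std'].shape)   # torch.Size([4]) torch.Size([4])

The ensemble mean is returned under each requested key and its standard deviation under the same key with an '_std' suffix, which gives a cheap uncertainty estimate per prediction.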
diff --git a/pyTEST_Example/TS_refinement.py b/pyTEST_Example/TS_refinement.py
new file mode 100644
index 0000000..c7c96d2
--- /dev/null
+++ b/pyTEST_Example/TS_refinement.py
@@ -0,0 +1,160 @@
+#!/bin/env python
+# Author: Hsuan-Hao Hsu (hsu205@purdue.edu)
+import os,sys
+import numpy as np
+import yaml
+import logging
+import time
+import json
+import pickle
+import pyjokes
+import fnmatch
+from xgboost import XGBClassifier
+
+from yarp.input_parsers import xyz_parse
+from wrappers.orca import ORCA
+from wrappers.crest import CREST
+from utils import *
+from constants import Constants
+from job_submission import *
+from wrappers.gaussian import Gaussian
+from job_mapping import *
+
+# This program refines TSs (.xyz files) at the DFT level.
+# No reactant/product information is assumed: only TS optimization and IRC calculations are performed.
+def main(args):
+    TS_dict=dict()
+    # read the initial TSs into a dictionary
+    if os.path.isfile(args["input"]):
+        E, G=xyz_parse(args["input"])
+        rxn_name=args["input"].split("/")[-1].split(".")[0]
+        TS_dict[rxn_name]=dict()
+        TS_dict[rxn_name]["E"]=E
+        TS_dict[rxn_name]["TSG"]=G
+    else:
+        xyz_files=[args["input"]+"/"+i for i in os.listdir(args["input"]) if fnmatch.fnmatch(i, "*.xyz")]
+        for i in xyz_files:
+            E, G=xyz_parse(i)
+            rxn_name=i.split("/")[-1].split(".")[0]
+            TS_dict[rxn_name]=dict()
+            TS_dict[rxn_name]["E"]=E
+            TS_dict[rxn_name]["TSG"]=G
+    # finished loading the initial TSs into a dict
+    scratch=args["scratch"]
+    if os.path.isdir(args["scratch"]) is False: os.mkdir(args["scratch"])
+    if len(args["dft_lot"].split()) > 1: dft_lot="/".join(args["dft_lot"].split())
+    else: dft_lot=args["dft_lot"]
+    # run TS optimization
+    job_list=dict()
+    running_jobs=[]
+    for i in TS_dict.keys():
+        wf=f"{scratch}/{i}"
+        if os.path.isdir(wf) is False: os.mkdir(wf)
+        xyz_file=f"{wf}/{i}.xyz"
+        xyz_write(xyz_file, TS_dict[i]["E"], TS_dict[i]["TSG"])
+        if args["package"]=="ORCA":
+            dft_job=ORCA(input_geo=xyz_file, work_folder=wf, nproc=int(args["dft_nprocs"]), mem=int(args["mem"])*1000, jobname=f"{i}-TSOPT",\
+                         jobtype="OptTS Freq", lot=args["dft_lot"], charge=args["charge"], multiplicity=args["multiplicity"], solvent=args["solvent"],\
+                         solvation_model=args["solvation_model"], dielectric=args["dielectric"], writedown_xyz=True)
+            dft_job.generate_geometry_settings(hess=True, hess_step=int(args["hess_recalc"]))
+            dft_job.generate_input()
+            job_list[i]=dft_job
+            if dft_job.calculation_terminated_normally() is False: running_jobs.append(i)
+        elif args["package"]=="Gaussian":
+            dft_job=Gaussian(input_geo=xyz_file, work_folder=wf, nproc=int(args["dft_nprocs"]), mem=int(args["mem"])*1000, jobname=f"{i}-TSOPT",\
+                             jobtype="tsopt", lot=dft_lot, charge=args["charge"], multiplicity=args["multiplicity"], solvent=args["solvent"],\
+                             solvation_model=args["solvation_model"], dielectric=args["dielectric"], dispersion=args["dispersion"])
+            dft_job.generate_input()
+            job_list[i]=dft_job
+            if dft_job.calculation_terminated_normally() is False: running_jobs.append(i)
+    if len(running_jobs)>0:
+        n_submit=len(running_jobs)//int(args["dft_njobs"])
+        if len(running_jobs)%int(args["dft_njobs"])>0: n_submit+=1
+        startid=0
+        slurm_jobs=[]
+        for i in range(n_submit):
+            slurmjob=SLURM_Job(jobname=f"TSOPT.{i}", ppn=int(args["ppn"]), partition=args["partition"], time=args["dft_wt"], mem_per_cpu=int(args["mem"])*1100)
+            endid=min(startid+int(args["dft_njobs"]), len(running_jobs))
+            if args["package"]=="ORCA": slurmjob.create_orca_jobs([job_list[ind] for ind in running_jobs[startid:endid]])
+            elif args["package"]=="Gaussian": slurmjob.create_gaussian_jobs([job_list[ind] for ind in running_jobs[startid:endid]])
+            slurmjob.submit()
+            startid=endid
+            slurm_jobs.append(slurmjob)
+        print(f"Running {len(slurm_jobs)} TS optimization jobs...")
+        monitor_jobs(slurm_jobs)
+        key=[i for i in job_list.keys()]
+        for i in key:
+            dft_opt=job_list[i]
+            if dft_opt.calculation_terminated_normally() and dft_opt.optimization_converged() and dft_opt.is_TS():
+                _, geo=dft_opt.get_final_structure()
+                if dft_lot not in TS_dict[i].keys(): TS_dict[i][dft_lot]=dict()
+                TS_dict[i][dft_lot]["geo"]=geo
+                TS_dict[i][dft_lot]["thermal"]=dft_opt.get_thermal()
+                #TS_dict[i][dft_lot]["SPE"]=dft_opt.get_energy()
+                TS_dict[i][dft_lot]["imag_mode"]=dft_opt.get_imag_freq_mode()
+    else:
+        print("No TS optimization jobs need to be performed...")
+
+    # TS optimization finished; prepare the IRC jobs
+    job_list=dict()
+    running_jobs=[]
+    for i in TS_dict.keys():
+        wf=f"{scratch}/{i}"
+        xyz_file=f"{wf}/{i}.xyz"
+        if dft_lot not in TS_dict[i].keys(): continue
+        xyz_write(xyz_file, TS_dict[i]["E"], TS_dict[i][dft_lot]["geo"])
+        if args["package"]=="ORCA":
+            dft_job=ORCA(input_geo=xyz_file, work_folder=wf, nproc=int(args["dft_nprocs"]), mem=int(args["mem"])*1000, jobname=f"{i}-IRC",\
+                         jobtype="IRC", lot=args["dft_lot"], charge=args["charge"], multiplicity=args["multiplicity"], solvent=args["solvent"],\
+                         solvation_model=args["solvation_model"], dielectric=args["dielectric"], writedown_xyz=True)
+            dft_job.generate_geometry_settings(hess=True, hess_step=int(args["hess_recalc"]))
+            dft_job.generate_input()
+            job_list[i]=dft_job
+            if dft_job.calculation_terminated_normally() is False: running_jobs.append(i)
+        elif args["package"]=="Gaussian":
+            dft_job=Gaussian(input_geo=xyz_file, work_folder=wf, nproc=int(args["dft_nprocs"]), mem=int(args["mem"])*1000, jobname=f"{i}-IRC",\
+                             jobtype="irc", lot=dft_lot, charge=args["charge"], multiplicity=args["multiplicity"], solvent=args["solvent"],\
+                             solvation_model=args["solvation_model"], dielectric=args["dielectric"], dispersion=args["dispersion"])
+            dft_job.generate_input()
+            job_list[i]=dft_job
+            if dft_job.calculation_terminated_normally() is False: running_jobs.append(i)
+    if len(running_jobs)>0:
+        n_submit=len(running_jobs)//int(args["dft_njobs"])
+        if len(running_jobs)%int(args["dft_njobs"])>0: n_submit+=1
+        startid=0
+        slurm_jobs=[]
+        for i in range(n_submit):
+            slurmjob=SLURM_Job(jobname=f"IRC.{i}", ppn=int(args["ppn"]), partition=args["partition"], time=args["dft_wt"], mem_per_cpu=int(args["mem"])*1100)
+            endid=min(startid+int(args["dft_njobs"]), len(running_jobs))
+            if args["package"]=="ORCA": slurmjob.create_orca_jobs([job_list[ind] for ind in running_jobs[startid:endid]])
+            elif args["package"]=="Gaussian": slurmjob.create_gaussian_jobs([job_list[ind] for ind in running_jobs[startid:endid]])
+            slurmjob.submit()
+            startid=endid
+            slurm_jobs.append(slurmjob)
+        print(f"Running {len(slurm_jobs)} IRC jobs...")
+        monitor_jobs(slurm_jobs)
+        key=[i for i in job_list.keys()]
+        for i in key:
+            dft_opt=job_list[i]
+            if dft_opt.calculation_terminated_normally():
+                job_success=False
+                try:
+                    E, G1, G2, TSG, barrier1, barrier2=dft_opt.analyze_IRC()
+                    job_success=True
+                except: pass
+                if job_success:
+                    TS_dict[i][dft_lot]["IRC"]=dict()
+                    TS_dict[i][dft_lot]["IRC"]["node"]=[G1, G2]
+                    TS_dict[i][dft_lot]["IRC"]["TS"]=TSG
+                    TS_dict[i][dft_lot]["barriers"]=[barrier2, barrier1]
+    else:
+        print("No IRC jobs need to be performed...")
+    with open(args["reaction_data"], 'wb') as f:
+        pickle.dump(TS_dict, f)
+    return
+
+if __name__=="__main__":
+    parameters = sys.argv[1]
+    parameters = yaml.load(open(parameters, "r"), Loader=yaml.FullLoader)
+    main(parameters)
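TS_refinement.py takes a single YAML file as its command-line argument. A minimal sketch of such a file, with placeholder values inferred from the args[...] lookups above (these are illustrative, not defaults shipped with the patch):

    # python TS_refinement.py parameters.yaml
    input: TS_guesses/          # one .xyz file, or a folder of .xyz TS guesses
    scratch: scratch/
    reaction_data: scratch/reaction.p
    package: ORCA               # or Gaussian
    dft_lot: wB97X-D3 def2-TZVP
    dft_nprocs: 8
    dft_njobs: 2
    dft_wt: 4                   # walltime in hours
    ppn: 8
    mem: 4                      # GB per cpu; scaled by 1000/1100 internally
    charge: 0
    multiplicity: 1
    solvent: false
    solvation_model: CPCM
    dielectric: 0.0
    dispersion: false
    hess_recalc: 5
    partition: standby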
diff --git a/pyTEST_Example/analyze_functions.py b/pyTEST_Example/analyze_functions.py
new file mode 100644
index 0000000..47a3889
--- /dev/null
+++ b/pyTEST_Example/analyze_functions.py
@@ -0,0 +1,314 @@
+import os,sys
+import numpy as np
+import pandas as pd                      # needed by return_indicator below
+import logging
+import pickle
+import time
+from copy import deepcopy
+from collections import Counter
+from itertools import combinations       # needed by check_multi_molecule below
+import multiprocessing as mp
+from multiprocessing import Queue
+from logging.handlers import QueueHandler
+from concurrent.futures import ProcessPoolExecutor, TimeoutError
+
+from ase import io
+from ase.build import minimize_rotation_and_translation
+from scipy.spatial.distance import cdist
+from xgboost import XGBClassifier
+from openbabel import pybel              # needed by seperate_mols below
+
+# eventually this will be replaced by yarp.wrappers
+import yarp as yp
+from yarp.input_parsers import xyz_parse
+from yarp.find_lewis import find_lewis
+from yarp.taffi_functions import xyz_write, table_generator, graph_seps
+from wrappers.xtb import XTB
+from utils import *
+from constants import Constants
+
+def check_dup_ts_pysis(tsopt_job_list, logger):
+    '''
+    For a given list of pysis TS optimization jobs, check for and exclude duplicated TSs.
+    '''
+    keep_list = []
+    reactions = dict()
+    for tsopt_job in tsopt_job_list:
+        # check whether the tsopt job finished
+        if tsopt_job.calculation_terminated_normally() is False:
+            logger.info(f"TSopt job {tsopt_job.jobname} fails, skip this reaction...")
+            print(f"TSopt job {tsopt_job.jobname} fails, skip this reaction...")
+            continue
+
+        # check whether this is a true TS
+        if tsopt_job.is_true_ts() is False:
+            logger.info(f"TSopt job {tsopt_job.jobname} fails to locate a true transition state, skip this reaction...")
+            print(f"TSopt job {tsopt_job.jobname} fails to locate a true transition state, skip this reaction...")
+            continue
+
+        # parse the output files
+        rxn_ind = tsopt_job.jobname
+        rxn_name= '_'.join(rxn_ind.split('_')[:-1])
+        TSenergy= tsopt_job.get_energy()
+        E,G = tsopt_job.get_final_ts()
+
+        # check for duplicated TSs (energy difference below 0.05 kcal/mol)
+        if rxn_name not in reactions:
+            reactions[rxn_name] = [TSenergy]
+            keep_list.append(tsopt_job)
+        else:
+            min_ene_diff = min([Constants.ha2kcalmol*abs(TSenergy-ene) for ene in reactions[rxn_name]])
+            if min_ene_diff < 0.05:
+                logger.info(f"TSopt job {tsopt_job.jobname} locates a duplicated transition state, skip this reaction...")
+                print(f"TSopt job {tsopt_job.jobname} locates a duplicated transition state, skip this reaction...")
+            else:
+                reactions[rxn_name].append(TSenergy)
+                keep_list.append(tsopt_job)
+
+    return keep_list
+
+def analyze_outputs(rxns):
+    scratch=rxns[0].args["scratch"]
+    nc_thresd=rxns[0].args["nconf_dft"]
+    charge=rxns[0].args["charge"]
+    select=rxns[0].args["select"]
+    g=open(f'{scratch}/IRC-record.txt', 'w')
+    g.write(f'{"reaction":40s} {"R":<60s} {"P":<60s} {"type":<15s} {"barrier":10s}\n')
+    for count, rxn in enumerate(rxns):
+        key=[i for i in rxn.IRC_xtb.keys()]
+        for conf_i in key:
+            rxn_ind=f'{rxn.reactant_inchi}_{rxn.id}_{conf_i}'
+            print(rxn_ind)
+            barrier=rxn.IRC_xtb[conf_i]["barriers"][0]
+            RG=rxn.IRC_xtb[conf_i]["node"][0]
+            PG=rxn.IRC_xtb[conf_i]["node"][1]
+            Type=rxn.IRC_xtb[conf_i]["type"]
+            R_adj=table_generator(rxn.reactant.elements, RG)
+            P_adj=table_generator(rxn.reactant.elements, PG)
+            R_bondmat, _=find_lewis(rxn.reactant.elements, R_adj, charge)
+            P_bondmat, _=find_lewis(rxn.reactant.elements, P_adj, charge)
+            R_smi=return_smi(rxn.reactant.elements, RG, R_bondmat[0])
+            P_smi=return_smi(rxn.reactant.elements, PG, P_bondmat[0])
+            rxns[count].IRC_xtb[conf_i]['smiles']=[R_smi, P_smi]
+            rxns[count].IRC_xtb[conf_i]['select']=1
+            if select=='network':
+                if Type not in ["intended", "P_unintended"]: rxns[count].IRC_xtb[conf_i]['select']=0
+            elif select=='tight':
+                if Type != 'intended': rxns[count].IRC_xtb[conf_i]['select']=0
+            else:
+                if Type not in ["intended", "R_unintended", "P_unintended"]:
+                    rxns[count].IRC_xtb[conf_i]['select']=0
+            g.write(f'{rxn_ind:40s} {R_smi:60s} {P_smi:60s} {Type:15s} {barrier:10f}\n')
+
+    return rxns
+
+def apply_IRC_model(rxns):
+    args=rxns[0].args
+    IRC_model=XGBClassifier()
+    IRC_model.load_model(os.path.join(args["model_path"], "IRC_model.json"))
+    for count, rxn in enumerate(rxns):
+        key=[i for i in rxn.TS_dft.keys()]
+        for conf_i in key:
+            rxns[count]=predict_TS(rxn, conf_i, IRC_model)
+    return rxns
+
+def predict_TS(rxn, conf_i, IRC_model):
+    E=rxn.reactant.elements
+    RG=rxn.reactant.geo
+    PG=rxn.product.geo
+    Radj=rxn.reactant.adj_mat
+    Padj=rxn.product.adj_mat
+    intend_prob=TS_prediction(rxn, conf_i, IRC_model)
+    thermal=rxn.TS_dft[conf_i]["thermal"]
+    SPE=rxn.TS_dft[conf_i]["SPE"]
+    barrier=rxn.IRC_xtb[conf_i]["barriers"][0]
+    if max(intend_prob)<0.5: rxn.IRC_xtb[conf_i]["RP"]=[False, 0]
+    else:
+        if intend_prob[0]>intend_prob[1]:
+            rxn.IRC_xtb[conf_i]["RP"]=[True, 0]
+        else:
+            rxn.IRC_xtb[conf_i]["RP"]=[True, 1]
+
+    return rxn
+
+def TS_prediction(rxn, conf_i, IRC_model):
+    Radii=yp.el_radii
+    E=rxn.reactant.elements
+    Radj=rxn.reactant.adj_mat
+    Padj=rxn.product.adj_mat
+    TS_G=rxn.TS_dft[conf_i]["geo"]
+    imag_mode=rxn.TS_dft[conf_i]["imag_mode"]
+    bond_break, bond_form=[], []
+    adj_change=Padj-Radj
+    for i in range(len(E)):
+        for j in range(i+1, len(E)):
+            if adj_change[i][j]==-1: bond_break+=[(i, j)]
+            if adj_change[i][j]==1: bond_form+=[(i, j)]
+    center_atoms=list(set(sum(bond_break, ())+sum(bond_form, ())))
+    if len(center_atoms)==0: return [0, 0]
+
+    # displace the TS along the imaginary mode in both directions
+    TS_f=TS_G+imag_mode*0.5
+    TS_b=TS_G-imag_mode*0.5
+
+    bonds=bond_break+bond_form
+    dist=0
+    for bond in bonds: dist=max(dist, np.linalg.norm(TS_G[bond[0]]-TS_G[bond[1]]))
+
+    # NOTE: the remainder of this function was garbled in the submitted patch.
+    # The lines below are a best-effort reconstruction: treat atom pairs within
+    # 1.2x the summed covalent radii as bonds at the TS, then feed the geometric
+    # indicators of the two displaced structures to the XGBoost classifier.
+    P_bonds=[]
+    Dist_Mat=np.triu(cdist(TS_G, TS_G))
+    x_ind, y_ind=np.where((Dist_Mat>0.0) & (Dist_Mat<4.0))
+    for x, y in zip(x_ind, y_ind):
+        if Dist_Mat[x][y]<1.2*(Radii[E[x]]+Radii[E[y]]): P_bonds.append((x, y))
+    indicators=return_indicator(E, TS_f, TS_b, namespace=f'TS_pred_{conf_i}')
+    return IRC_model.predict_proba(indicators)[0]
+
+def return_indicator(E, RG, PG, namespace='node'):
+    '''
+    Compute geometric indicators for classifying a reaction path between two nodes (RG, PG):
+    RMSD: mass-weighted RMSD between the two nodes, threshold < 1.6
+    max_dis: maximum bond length change between non-H atoms, threshold < 4.0
+    min_cross_dis: shortest distance between atoms' path to original bonds, threshold > 0.6
+    path_cross: if all atoms involved in bond changes are non-H, path_cross refers to the distance between two bond changes, threshold > 0.6
+    max_Hdis: maximum bond length change if contains H, threshold < 4.5 (* optional)
+    min_Hcross_dis: shortest distance between atoms' path (H atoms involved) to original bonds, threshold > 0.4 (* optional)
+    h = RMSD/1.6 + max_dis/4.0 + 0.6/min_cross_dis + 0.6/path_cross + 0.5 * max_Hdis/4.5 + 0.1/min_cross_dis
+    '''
+
+    # calculate adj_mat
+    Radj = table_generator(E, RG)
+    Padj = table_generator(E, PG)
+
+    # determine the bond changes
+    bond_break, bond_form=[], []
+    del_adj = Padj - Radj
+    for i in range(len(E)):
+        for j in range(i+1, len(E)):
+            if del_adj[i][j]==-1: bond_break+=[(i, j)]
+            if del_adj[i][j]==1: bond_form+=[(i, j)]
+
+    # identify the hydrogen atoms and the atoms involved in the reaction
+    H_index=[i for i, e in enumerate(E) if e=='H']
+    involve=list(set(list(sum(bond_break+bond_form, ()))))
+
+    # create the observed segments
+    bond_seg={i:[] for i in bond_break+bond_form}
+    for bond in bond_break:
+        bond_seg[bond]=(PG[bond[1]]-PG[bond[0]], np.linalg.norm(PG[bond[1]]-PG[bond[0]]))
+    for bond in bond_form:
+        bond_seg[bond]=(RG[bond[1]]-RG[bond[0]], np.linalg.norm(RG[bond[1]]-RG[bond[0]]))
+
+    # create the bond list to check for crossings
+    bond_dict={i: [] for i in bond_break+bond_form}
+    for i in range(len(E)):
+        for j in range(i+1, len(E)):
+            for bond in bond_break:
+                if Padj[i][j]>0 and i not in bond and j not in bond: bond_dict[bond]+=[(i, j)]
+            for bond in bond_form:
+                if Radj[i][j]>0 and i not in bond and j not in bond: bond_dict[bond]+=[(i, j)]
+
+    # compute the indicators
+    rmsd = return_RMSD(E,RG,PG,rotate=False,mass_weighted=True,namespace=namespace)
+    Hbond_dis = np.array([i[1] for bond,i in bond_seg.items() if (bond[0] in H_index or bond[1] in H_index)])
+    bond_dis = np.array([i[1] for bond,i in bond_seg.items() if (bond[0] not in H_index and bond[1] not in H_index)])
+    if len(Hbond_dis)>0:
+        max_Hdis=max(Hbond_dis)
+    else:
+        max_Hdis=2.0
+
+    if len(bond_dis)>0:
+        max_dis=max(bond_dis)
+    else:
+        max_dis=2.0
+
+    # compute the "cross" behaviour
+    min_cross, min_Hcross=[], []
+    for bond in bond_break:
+        cross_dis=[]
+        for ibond in bond_dict[bond]:
+            _,_,dis=closestDistanceBetweenLines(PG[bond[0]], PG[bond[1]], PG[ibond[0]], PG[ibond[1]])
+            cross_dis+=[dis]
+        if len(cross_dis)>0:
+            min_dis=min(cross_dis)
+        else:
+            min_dis=2.0
+
+        if bond[0] in H_index or bond[1] in H_index:
+            min_Hcross+=[min_dis]
+        else:
+            min_cross+=[min_dis]
+
+    for bond in bond_form:
+        cross_dis=[]
+        for ibond in bond_dict[bond]:
+            _,_,dis=closestDistanceBetweenLines(RG[bond[0]], RG[bond[1]], RG[ibond[0]], RG[ibond[1]])
+            cross_dis+=[dis]
+        if len(cross_dis) > 0:
+            min_dis=min(cross_dis)
+        else:
+            min_dis=2.0
+        if bond[0] in H_index or bond[1] in H_index:
+            min_Hcross+=[min_dis]
+        else:
+            min_cross+=[min_dis]
+
+    # find the smallest bond distance for each bond; if there is none, return 2.0
+    if len(min_cross) > 0:
+        min_cross_dis = min(min_cross)
+    else:
+        min_cross_dis = 2.0
+
+    if len(min_Hcross) > 0:
+        min_Hcross_dis = min(min_Hcross)
+    else:
+        min_Hcross_dis = 2.0
+
+    # find the cross distance between two bond changes
+    if len([ind for ind in involve if ind in H_index]) == 0:
+
+        if len(bond_break) == 2:
+            _,_,dis = closestDistanceBetweenLines(PG[bond_break[0][0]],PG[bond_break[0][1]],PG[bond_break[1][0]],PG[bond_break[1][1]],clampAll=True)
+        else:
+            dis = 2.0
+        path_cross = dis
+
+        if len(bond_form) == 2:
+            _,_,dis = closestDistanceBetweenLines(RG[bond_form[0][0]],RG[bond_form[0][1]],RG[bond_form[1][0]],RG[bond_form[1][1]],clampAll=True)
+        else:
+            dis = 2.0
+        path_cross = min(dis,path_cross)
+
+    else:
+        path_cross = 2.0
+
+    # return in dataframe format
+    indicators = [rmsd, max_dis, max_Hdis, min_cross_dis, min_Hcross_dis, path_cross]
+    feature_names = ['RMSD','max_dis','max_Hdis','min_cross_dis','min_Hcross_dis','path_cross']
+
+    return pd.DataFrame([indicators],columns=feature_names)
+
+def closestDistanceBetweenLines(a0,a1,b0,b1,clampAll=True,clampA0=False,clampA1=False,clampB0=False,clampB1=False):
+    '''
+    Calculate the spatial distance between two segments.
+    Input: two segments defined by numpy.array endpoint pairs (a0,a1) and (b0,b1)
+    Output: the closest points on each segment and their distance
+    '''
+    # if clampAll=True, set all clamps to True
+    if clampAll:
+        clampA0=True
+        clampA1=True
+        clampB0=True
+        clampB1=True
+
+    # calculate the denominator
+    A = a1 - a0
+    B = b1 - b0
+    magA = np.linalg.norm(A)
+    magB = np.linalg.norm(B)
+
+    _A = A / magA
+    _B = B / magB
+
+    cross = np.cross(_A, _B)
+    denom = np.linalg.norm(cross)**2
+
+    # If the lines are parallel (denom=0), test whether they overlap.
+    # If they don't overlap, there is a closest point solution;
+    # if they do overlap, there are infinitely many closest positions, but still a unique closest distance.
+    if not denom:
+        d0 = np.dot(_A,(b0-a0))
+
+        # overlap is only possible with clamping
+        if clampA0 or clampA1 or clampB0 or clampB1:
+            d1 = np.dot(_A,(b1-a0))
+
+            # is segment B before A?
+            if d0 <= 0 >= d1:
+                if clampA0 and clampB1:
+                    if np.absolute(d0) < np.absolute(d1):
+                        return a0,b0,np.linalg.norm(a0-b0)
+                    return a0,b1,np.linalg.norm(a0-b1)
+            # is segment B after A?
+            elif d0 >= magA <= d1:
+                if clampA1 and clampB0:
+                    if np.absolute(d0) < np.absolute(d1):
+                        return a1,b0,np.linalg.norm(a1-b0)
+                    return a1,b1,np.linalg.norm(a1-b1)
+
+        # segments overlap: return the distance between the parallel segments
+        return None,None,np.linalg.norm(((d0*_A)+a0)-b0)
+
+    # the lines criss-cross: calculate the projected closest points
+    t = (b0 - a0)
+    detA = np.linalg.det([t, _B, cross])
+    detB = np.linalg.det([t, _A, cross])
+
+    t0 = detA/denom
+    t1 = detB/denom
+
+    pA = a0 + (_A * t0) # projected closest point on segment A
+    pB = b0 + (_B * t1) # projected closest point on segment B
+
+    # clamp the projections onto the segments
+    if clampA0 or clampA1 or clampB0 or clampB1:
+        if clampA0 and t0 < 0:
+            pA = a0
+        elif clampA1 and t0 > magA:
+            pA = a1
+
+        if clampB0 and t1 < 0:
+            pB = b0
+        elif clampB1 and t1 > magB:
+            pB = b1
+
+        # clamp projection A
+        if (clampA0 and t0 < 0) or (clampA1 and t0 > magA):
+            dot = np.dot(_B,(pA-b0))
+            if clampB0 and dot < 0:
+                dot = 0
+            elif clampB1 and dot > magB:
+                dot = magB
+            pB = b0 + (_B * dot)
+
+        # clamp projection B
+        if (clampB0 and t1 < 0) or (clampB1 and t1 > magB):
+            dot = np.dot(_A,(pB-a0))
+            if clampA0 and dot < 0:
+                dot = 0
+            elif clampA1 and dot > magA:
+                dot = magA
+            pA = a0 + (_A * dot)
+
+    return pA,pB,np.linalg.norm(pA-pB)
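+# Reviewer sketch (not part of the original patch): a quick doctest-style sanity
+# check of the clamped segment-segment distance above, with answers that are
+# easy to verify by eye.
+# >>> a0, a1 = np.array([0., 0., 0.]), np.array([1., 0., 0.])
+# >>> b0, b1 = np.array([0., 0., 1.]), np.array([1., 0., 1.])
+# >>> closestDistanceBetweenLines(a0, a1, b0, b1)[2]   # parallel branch (points come back as None)
+# 1.0
+# >>> c0, c1 = np.array([0., -1., 1.]), np.array([0., 1., 1.])
+# >>> closestDistanceBetweenLines(a0, a1, c0, c1)[2]   # skew branch: closest approach is the z gap
+# 1.0
+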
+def seperate_mols(E,G,adj_mat=None,namespace='sep'):
+    ''' Separate molecules and return a dictionary of fragments keyed by inchikey '''
+    # generate the adj mat
+    if adj_mat is None: adj_mat = table_generator(E, G)
+    # separate the reactant(s)
+    gs = graph_seps(adj_mat)
+    groups = []
+    loop_ind= []
+    for i in range(len(gs)):
+        if i not in loop_ind:
+            new_group = [count_j for count_j,j in enumerate(gs[i,:]) if j >= 0]
+            loop_ind += new_group
+            groups += [new_group]
+
+    # determine the inchikey of all components in the reactant
+    mols = {}
+    for group in groups:
+        # parse the element and geometry of each fragment
+        N_atom = len(group)
+        frag_E = [E[ind] for ind in group]
+        frag_G = np.zeros([N_atom,3])
+        for count_i,i in enumerate(group):
+            frag_G[count_i,:] = G[i,:]
+        # generate the inchikey
+        xyz_write(f"{namespace}_input.xyz",frag_E,frag_G)
+        molecule = next(pybel.readfile("xyz", f"{namespace}_input.xyz"))
+        inchikey = molecule.write(format="inchikey").strip().split()[0]
+        os.system(f"rm {namespace}_input.xyz")
+        # store this fragment
+        if inchikey not in mols.keys():
+            mols[inchikey] = [[frag_E,frag_G]]
+        else:
+            mols[inchikey].append([frag_E,frag_G])
+
+    return mols
+
+def check_multi_molecule(adj,geo,factor='auto'):
+    ''' Identify whether two or more reactants are far away from each other in multi-molecular cases '''
+    # separate the molecule(s)
+    gs = graph_seps(adj)
+    groups = []
+    loop_ind= []
+    for i in range(len(gs)):
+        if i not in loop_ind:
+            new_group = [count_j for count_j,j in enumerate(gs[i,:]) if j >= 0]
+            loop_ind += new_group
+            groups += [new_group]
+
+    # if there is only one fragment, return True
+    if len(groups) == 1: return True
+
+    # compute the center and radius of each fragment
+    centers = []
+    radius = []
+    for group in groups:
+        center = np.array([0.0, 0.0, 0.0])
+        for i in group:
+            center += geo[i,:]/float(len(group))
+
+        centers += [center]
+        radius += [max([ np.linalg.norm(geo[i,:]-center) for i in group])]
+
+    # iterate over all pairs of centers
+    combs = combinations(range(len(centers)), 2)
+    max_dis = 0
+    satisfy = []
+
+    if factor == 'auto':
+        if len(adj) > 12: factor = 1.5
+        else: factor = 2.0
+        if min([len(j) for j in groups]) < 5: factor = 2.5
+
+    for comb in combs:
+        dis = np.linalg.norm(centers[comb[0]]-centers[comb[1]])
+        if dis > factor * (radius[comb[0]]+radius[comb[1]]):
+            satisfy += [False]
+        else:
+            satisfy += [True]
+
+    return (False not in satisfy)
+
+def return_RMSD(E,G1,G2,rotate=True,mass_weighted=False,namespace='node'):
+    ''' Calculate the RMSD (root-mean-square displacement) between two geometries '''
+    # initialize mass_dict (used for mass-weighting)
+    mass_dict = {'H':1.00794,'He':4.002602,'Li':6.941,'Be':9.012182,'B':10.811,'C':12.011,'N':14.00674,'O':15.9994,'F':18.9984032,'Ne':20.1797,\
+                 'Na':22.989768,'Mg':24.3050,'Al':26.981539,'Si':28.0855,'P':30.973762,'S':32.066,'Cl':35.4527,'Ar':39.948,\
+                 'K':39.0983,'Ca':40.078,'Sc':44.955910,'Ti':47.867,'V':50.9415,'Cr':51.9961,'Mn':54.938049,'Fe':55.845,'Co':58.933200,'Ni':58.6934,'Cu':63.546,'Zn':65.39,\
+                 'Ga':69.723,'Ge':72.61,'As':74.92159,'Se':78.96,'Br':79.904,'Kr':83.80,\
+                 'Rb':85.4678,'Sr':87.62,'Y':88.90585,'Zr':91.224,'Nb':92.90638,'Mo':95.94,'Tc':98.0,'Ru':101.07,'Rh':102.90550,'Pd':106.42,'Ag':107.8682,'Cd':112.411,\
+                 'In':114.818,'Sn':118.710,'Sb':121.760,'Te':127.60,'I':126.90447,'Xe':131.29,\
+                 'Cs':132.90545,'Ba':137.327,'La':138.9055,'Hf':178.49,'Ta':180.9479,'W':183.84,'Re':186.207,'Os':190.23,'Ir':192.217,'Pt':195.078,'Au':196.96655,'Hg':200.59,\
+                 'Tl':204.3833,'Pb':207.2,'Bi':208.98038,'Po':209.0,'At':210.0,'Rn':222.0}
+
+    if rotate:
+        # write the two geometries to xyz files
+        xyz_write('{}1.xyz'.format(namespace),E,G1)
+        xyz_write('{}2.xyz'.format(namespace),E,G2)
+        node1 = io.read('{}1.xyz'.format(namespace))
+        node2 = io.read('{}2.xyz'.format(namespace))
+        minimize_rotation_and_translation(node1,node2)
+        io.write('{}2.xyz'.format(namespace),node2)
+
+        # reload the node 2 geometry
+        _,G2 = xyz_parse('{}2.xyz'.format(namespace))
+
+        try:
+            os.remove('{}1.xyz'.format(namespace))
+            os.remove('{}2.xyz'.format(namespace))
+        except:
+            pass
+
+    # compute the RMSD
+    DG = G1 - G2
+    RMSD = 0
+    if mass_weighted:
+        for i in range(len(E)):
+            RMSD += sum(DG[i]**2)*mass_dict[E[i]]
+        return np.sqrt(RMSD / sum([mass_dict[Ei] for Ei in E]))
+    else:
+        for i in range(len(E)):
+            RMSD += sum(DG[i]**2)
+        return np.sqrt(RMSD / len(E))
+
+def check_duplicate(i,total_i,thresh=0.05):
+    ''' Check duplicate indicators; return True if unique '''
+    if len(total_i) == 0: return True
+    min_dis = min([np.linalg.norm(np.array(i)-np.array(j)) for j in total_i])
+    # if the minimum distance exceeds the threshold, this is a unique conformation
+    if min_dis > thresh: return True
+    else: return False
diff --git a/pyTEST_Example/constants.py b/pyTEST_Example/constants.py
new file mode 100644
index 0000000..12581ea
--- /dev/null
+++ b/pyTEST_Example/constants.py
@@ -0,0 +1,31 @@
+class Constants:
+
+    n_a = 6.022140857E23                      # molecules mol^-1
+
+    ha_to_kcalmol = ha2kcalmol = 627.509      # kcal mol^-1 Hartree^-1
+    ha_to_kJmol = ha2kJmol = 2625.50          # kJ mol^-1 Hartree^-1
+
+    ha_to_J = ha_to_kJmol * 1000 / n_a        # J Hartree^-1
+    J_to_ha = 1.0 / ha_to_J                   # Hartree J^-1
+
+    eV_to_ha = eV2ha = 0.0367493              # Hartree eV^-1
+    ha_to_eV = ha2eV = 1.0 / eV_to_ha         # eV Hartree^-1
+
+    kcal_to_kJ = kcal2kJ = 4.184              # kJ kcal^-1
+
+    rad_to_deg = 57.29577951308232087679815   # deg rad^-1
+
+    a0_to_ang = a02ang = 0.529177             # Å bohr^-1
+    ang_to_a0 = ang2a0 = 1.0 / a0_to_ang      # bohr Å^-1
+    ang_to_nm = 0.1                           # nm ang^-1
+    ang_to_pm = 100                           # pm ang^-1
+    ang_to_m = 1E-10                          # m ang^-1
+    a0_to_m = a0_to_ang * ang_to_m            # m bohr^-1
+
+    per_cm_to_hz = c_in_cm = 299792458 * 100  # cm s^-1
+
+    amu_to_kg = 1.66053906660E-27             # kg amu^-1
+    amu_to_me = 1822.888486209                # m_e amu^-1
+
+    atm_to_pa = 101325                        # Pa atm^-1
+    dm_to_m = 0.1                             # m dm^-1
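A usage note on constants.py (reviewer sketch, not part of the patch): each conversion factor is bound under two names, so either spelling works interchangeably; e.g., converting an electronic barrier from Hartree to kcal/mol the way writedown_result in main_dft.py does below:

    from constants import Constants

    # a TS-reactant single-point gap of 0.05 Hartree expressed in kcal/mol
    barrier_ha = 0.05
    print(Constants.ha2kcalmol * barrier_ha)      # 31.37545
    print(Constants.ha_to_kcalmol * barrier_ha)   # same value via the alias spelling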
diff --git a/pyTEST_Example/job_mapping.py b/pyTEST_Example/job_mapping.py
new file mode 100644
index 0000000..4864f97
--- /dev/null
+++ b/pyTEST_Example/job_mapping.py
@@ -0,0 +1,115 @@
+import os,sys
+import numpy as np
+import logging
+import pickle
+import time
+from copy import deepcopy
+from collections import Counter
+import multiprocessing as mp
+from multiprocessing import Queue
+from logging.handlers import QueueHandler
+from concurrent.futures import ProcessPoolExecutor, TimeoutError
+
+from ase import io
+from ase.build import minimize_rotation_and_translation
+from scipy.spatial.distance import cdist
+from xgboost import XGBClassifier
+from wrappers.xtb import XTB
+
+from yarp.taffi_functions import table_generator, xyz_write
+from yarp.find_lewis import find_lewis
+from utils import return_metal_constraint
+
+def logger_process(queue, logging_path):
+    """A child process that logs all information from the other processes"""
+    logger = logging.getLogger("YARPrun")
+    logger.addHandler(logging.FileHandler(logging_path))
+    logger.setLevel(logging.INFO)
+    while True:
+        message = queue.get()
+        if message is None:
+            break
+        logger.handle(message)
+
+def process_input_rxn(rxns, args={}):
+    job_mapping=dict()
+    process_id=mp.current_process().pid
+    for i in rxns:
+        count_i, rxn, args=i
+        RE=rxn.reactant.elements
+        PE=rxn.product.elements
+        RG=rxn.reactant.geo
+        PG=rxn.product.geo
+        R_inchi=rxn.reactant_inchi
+        P_inchi=rxn.product_inchi
+        R_constraint=return_metal_constraint(rxn.reactant)
+        P_constraint=return_metal_constraint(rxn.product)
+        if args["strategy"]!=0:
+            if P_inchi not in job_mapping:
+                job_mapping[P_inchi]={'jobs': [f'{count_i}-P'], 'id': len(job_mapping)}
+                xyz_write(f"{args['scratch_xtb']}/{process_id}_{len(job_mapping)}_init.xyz", PE, PG)
+                if args["low_solvation"]:
+                    solvation_model, solvent = args["low_solvation"].split("/")
+                    optjob=XTB(input_geo=f"{args['scratch_xtb']}/{process_id}_{len(job_mapping)}_init.xyz", work_folder=args["scratch_xtb"], lot=args["lot"], jobtype=["opt"],\
+                               solvent=solvent, solvation_model=solvation_model, jobname=f"{process_id}_{len(job_mapping)}_opt", charge=args["charge"], multiplicity=args["multiplicity"])
+                    if P_constraint!=[]: optjob.add_command(distance_constraints=P_constraint)
+                    optjob.execute()
+                else:
+                    optjob=XTB(input_geo=f"{args['scratch_xtb']}/{process_id}_{len(job_mapping)}_init.xyz", work_folder=args["scratch_xtb"], lot=args["lot"], jobtype=["opt"],\
+                               jobname=f"{process_id}_{len(job_mapping)}_opt", charge=args["charge"], multiplicity=args["multiplicity"])
+                    if P_constraint!=[]: optjob.add_command(distance_constraints=P_constraint)
+                    optjob.execute()
+                if optjob.optimization_success():
+                    E, G=optjob.get_final_structure()
+                    job_mapping[P_inchi]["E"], job_mapping[P_inchi]["G"]=E, G
+                else:
+                    sys.exit(f"xtb geometry optimization fails for {P_inchi}, please check or remove this reaction")
+            else: job_mapping[P_inchi]["jobs"].append(f"{count_i}-P")
+        if args["strategy"]!=1:
+            if R_inchi not in job_mapping:
+                job_mapping[R_inchi]={"jobs": [f"{count_i}-R"], "id": len(job_mapping)}
+                xyz_write(f"{args['scratch_xtb']}/{process_id}_{len(job_mapping)}_init.xyz", RE, RG)
+                if args["low_solvation"]:
+                    solvation_model, solvent = args["low_solvation"].split("/")
+                    optjob=XTB(input_geo=f"{args['scratch_xtb']}/{process_id}_{len(job_mapping)}_init.xyz", work_folder=args["scratch_xtb"], lot=args["lot"], jobtype=["opt"],\
+                               solvent=solvent, solvation_model=solvation_model, jobname=f"{process_id}_{len(job_mapping)}_opt", charge=args["charge"], multiplicity=args["multiplicity"])
+                    if R_constraint!=[]: optjob.add_command(distance_constraints=R_constraint)
+                    optjob.execute()
+                else:
+                    optjob=XTB(input_geo=f"{args['scratch_xtb']}/{process_id}_{len(job_mapping)}_init.xyz", lot=args["lot"], work_folder=args["scratch_xtb"], jobtype=["opt"],\
+                               jobname=f"{process_id}_{len(job_mapping)}_opt", charge=args["charge"], multiplicity=args["multiplicity"])
+                    if R_constraint!=[]: optjob.add_command(distance_constraints=R_constraint)
+                    optjob.execute()
+                if optjob.optimization_success():
+                    E, G=optjob.get_final_structure()
+                    job_mapping[R_inchi]["E"], job_mapping[R_inchi]["G"]=E, G
+                else:
+                    sys.exit(f"xtb geometry optimization fails for {R_inchi}, please check or remove this reaction")
+            else: job_mapping[R_inchi]["jobs"].append(f"{count_i}-R")
+    return job_mapping
+
+def merge_job_mappings(all_job_mappings):
+    merged_mapping = dict()
+    for job_mapping in all_job_mappings:
+        for inchi, jobi in job_mapping.items():
+            if inchi in merged_mapping.keys():
+                for idx in jobi["jobs"]: merged_mapping[inchi]["jobs"].append(idx)
+            else: merged_mapping[inchi]=jobi.copy()
+    id_mapping={inchi: sorted(info['jobs'])[0] for inchi, info in merged_mapping.items()}
+    job_list=sorted(list(id_mapping.values()))
+    for inchi, info in merged_mapping.items():
+        info['id']=job_list.index(id_mapping[inchi])+1
+    return merged_mapping
+
+def monitor_jobs(slurm_jobs):
+    '''
+    Sleep until none of the given slurm jobs is in a running or pending state in the queue
+    '''
+    while True:
+        # iterate over a copy so that finished jobs can be removed safely
+        for slurm_job in list(slurm_jobs):
+            if slurm_job.status() == 'FINISHED':
+                slurm_jobs.remove(slurm_job)
+        if not slurm_jobs:
+            break
+        time.sleep(60)
+    return
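For reviewers, a minimal sketch (assumed invocation, not part of the patch) of how these pieces compose: build wrapper jobs, pack them into batched SLURM scripts via job_submission.py below, and block on monitor_jobs, mirroring what main_dft.py does. Here `orca_jobs` is an assumed list of ORCA wrapper objects whose inputs were already generated:

    from job_submission import SLURM_Job
    from job_mapping import monitor_jobs

    slurm_jobs = []
    batches = [orca_jobs[j:j+2] for j in range(0, len(orca_jobs), 2)]  # two ORCA runs per script
    for i, batch in enumerate(batches):
        job = SLURM_Job(jobname=f"TSOPT.{i}", ppn=8, partition="standby", time=4, mem_per_cpu=1000)
        job.create_orca_jobs(batch)   # writes TSOPT.{i}.submit with one cd/orca line per job
        job.submit()
        slurm_jobs.append(job)

    monitor_jobs(slurm_jobs)          # poll squeue every 60 s until every batch finishes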
+ """ + if hasattr(self, "job_id") is False: + print("Haven't submitted this job yet, can not check the status of this job...") + return "UNSUBMITTED" + + try: + command = f"squeue -j {self.job_id} --noheader --format %T" + output = subprocess.run(command, shell=True, capture_output=True, text=True) + job_status = output.stdout.strip() + + if job_status == "": + # Job ID not found, indicating the job has completed + return "FINISHED" + else: + # Common status: RUNNING and PENDING + return job_status + except: + return "UNKNOWN" + + def create_job_head(self): + """ + Create a slurm job script for given settings + """ + with open(self.script_file, "w") as f: + f.write("#!/bin/bash\n") + if self.specify_array: f.write(f"#SBATCH --array={self.specify_array}\n") + f.write(f"#SBATCH --job-name={self.jobname}\n") + f.write(f"#SBATCH --output={self.jobname}.out\n") + f.write(f"#SBATCH --error={self.jobname}.err\n") + f.write(f"#SBATCH -A {self.partition}\n") + f.write(f"#SBATCH --nodes={self.node}\n") + f.write(f"#SBATCH --ntasks-per-node={self.ppn}\n") + f.write(f"#SBATCH --mem {self.mem*self.ppn}MB\n") + f.write(f"#SBATCH --time {self.time}:00:00\n\n") + f.write("echo Running on host `hostname`\n") + f.write("echo Start Time is `date`\n\n") + + def create_job_bottom(self): + """ + Print job finishing time + """ + with open(self.script_file, "a") as f: + f.write("\necho End Time is `date`\n\n") + + def setup_orca_script(self): + """ + Load in ORCA and OPENMPI + """ + with open(self.script_file, "a") as f: + f.write("\n# Load Orca\n") + f.write("# set up env for Orca\n") + f.write("module unload openmpi\n") + f.write("module load intel-mkl\n") + # f.write("module purge\n") + f.write('export PATH="/depot/bsavoie/apps/orca_5_0_1_openmpi411:$PATH"\n') + f.write('export LD_LIBRARY_PATH="/depot/bsavoie/apps/orca_5_0_1_openmpi411:$LD_LIBRARY_PATH"\n') + f.write('export PATH="/depot/bsavoie/apps/openmpi_4_1_1/bin:$PATH"\n') + f.write('export LD_LIBRARY_PATH="/depot/bsavoie/apps/openmpi_4_1_1/lib:$LD_LIBRARY_PATH"\n\n') + + def setup_qchem_script(self): + """ + Load in QChem + set for athena + """ + with open(self.script_file, "a") as f: + f.write("\n# Load QChem\n") + f.write("source /home/paulzim/qchem/trunk2022/paul.set.local0\n") + + def create_orca_jobs(self, orca_job_list, parallel=False, orcapath=None): + """ + Generate orca jobs + NOTE: a list of orca job objects needs to be provided + """ + self.create_job_head() + self.setup_orca_script() + + with open(self.script_file, "a") as f: + for orcajob in orca_job_list: + f.write("\n# cd into the submission directory\n") + f.write(f"cd {orcajob.work_folder}\n\n") + if orcapath is None: orcapath = '/depot/bsavoie/apps/orca_5_0_1_openmpi411/orca' + if parallel: f.write(f"{orcapath} {orcajob.orca_input} > {orcajob.output} &\n\n") + else: f.write(f"{orcapath} {orcajob.orca_input} > {orcajob.output} \n\n") + + f.write("wait\n") + + self.create_job_bottom() + + def create_qchem_jobs(self, qchem_job_list, parallel=False): + """ + Generate QChem jobs + NOTE: qchem_input_file needs to be provided with FULL path + """ + self.create_job_head() + self.setup_qchem_script() + + with open(self.script_file, "a") as f: + for qchemjob in qchem_job_list: + f.write("\n# cd into the submission directory\n") + f.write(f"cd {qchemjob.work_folder}\n\n") + if parallel: f.write(f"qchem -nt {qchemjob.nproc} {qchemjob.qchem_input} > {qchemjob.output} &\n\n") + else: f.write(f"qchem -nt {qchemjob.nproc} {qchemjob.qchem_input} > {qchemjob.output}\n\n") + + f.write("wait\n") + + 
+    def create_gaussian_jobs(self, job_list, parallel=False):
+        """
+        Generate Gaussian16 jobs.
+        NOTE: a list of Gaussian job objects needs to be provided.
+        """
+        self.create_job_head()
+        with open(self.script_file, "a") as f:
+            for job in job_list:
+                f.write("# cd into the submission directory\n")
+                f.write(f"cd {job.work_folder}\n\n")
+                f.write(f"module load gaussian16/B.01\n")
+                if parallel: f.write(f"g16 < {job.gjf} > {job.output} &\n")
+                else:
+                    f.write(f"g16 < {job.gjf} > {job.output}\n")
+            f.write("\nwait\n")
+
+    def create_gsm_jobs(self, gsm_job_list, gsm_thread=1):
+        """
+        Create a GSM job script using the specified script file.
+        Available methods: xTB, Orca, QChem
+        """
+
+        # check the input
+        if self.ppn % gsm_thread != 0:
+            print(f"Make sure your total number of cpus (ppn={self.ppn}) is divisible by the gsm thread count")
+            quit()
+
+        self.create_job_head()
+
+        # write the head of the GSM submission file
+        with open(self.script_file, "a") as f:
+            # specify the job array
+            f.write("item=$SLURM_ARRAY_TASK_ID\n")
+            f.write(f'ID=`printf "%0*d\\n" {gsm_thread} ${{item}}`\n')
+
+            # set up the orca/qchem path
+            if gsm_job_list[0].method.lower() =='orca':
+                self.setup_orca_script()
+                gsm_exe = 'gsm.orca'
+            elif gsm_job_list[0].method.lower() =='qchem':
+                self.setup_qchem_script()
+                gsm_exe = 'gsm.qchem'
+            else:
+                # GSM-xTB uses the Orca interface, with xTB generating pseudo-Orca output files
+                gsm_exe = 'gsm.orca'
+
+            # set the thread count and load packages
+            f.write(f"export OMP_NUM_THREADS={gsm_thread}\n")
+            f.write("module load intel-mkl\n\n")  # a specific setting for bell
+
+            for gsmjob in gsm_job_list:
+                f.write(f'cd {gsmjob.work_folder}\n')
+                f.write(f"./{gsm_exe} ${{item}} {self.ppn//gsm_thread} > {gsmjob.output}\nwait\n\n")
+
+        self.create_job_bottom()
+
+    def create_pysis_jobs(self, pysis_job_list, parallel=False):
+        """
+        Create a pysis job script from the given pysis jobs.
+        """
+        self.create_job_head()
+
+        with open(self.script_file, "a") as f:
+
+            # set the environment
+            f.write('export PATH="/depot/bsavoie/apps/anaconda3/envs/yarp/bin:$PATH"\n')
+
+            # set up the pysis commands
+            for pysisjob in pysis_job_list:
+                f.write("\n# cd into the submission directory\n")
+                f.write(f"cd {pysisjob.work_folder}\n\n")
+                if parallel: f.write(f"pysis {pysisjob.pysis_input} > {pysisjob.output} &\n\n")
+                else: f.write(f"pysis {pysisjob.pysis_input} > {pysisjob.output}\n\n")
+
+        self.create_job_bottom()
+
+    def create_crest_jobs(self, crest_job_list):
+        """
+        Create a job script for crest jobs.
+        """
+        self.create_job_head()
+
+        with open(self.script_file, "a") as f:
+
+            f.write(f"export OMP_STACKSIZE={crest_job_list[0].mem}M\n")
+            f.write(f"export OMP_NUM_THREADS={crest_job_list[0].nproc}\n")
+
+            # set up the crest commands (each task runs sequentially)
+            for crestjob in crest_job_list:
+                f.write("\n# cd into the submission directory\n")
+                f.write(f"cd {crestjob.work_folder}\n\n")
+                f.write("# Running crest jobs for the input file\n")
+                f.write(f"{crestjob.command} > {crestjob.output}\nwait\n\n")
+
+        self.create_job_bottom()
+
+    def create_auto3d_jobs(self, auto3d_job_list):
+        """
+        Create a slurm script to run auto3d jobs.
+        """
+        self.create_job_head()
+
+        with open(self.script_file, "a") as f:
+            for auto3djob in auto3d_job_list:
+                f.write("\n# cd into the submission directory\n")
+                f.write(f"cd {auto3djob.work_folder}\n\n")
+                f.write(f"{auto3djob.command}\nwait\n\n")
+
+        self.create_job_bottom()
+    def create_python_jobs(self, python_commands, anaconda_env_name, work_folders=None, thread=1):
+        """
+        Create a slurm script to run python jobs.
+        NOTE: input_file needs to be provided with a FULL path.
+        """
+        self.create_job_head()
+
+        if work_folders is None: work_folders = ['.'] * len(python_commands)
+        with open(self.script_file, "a") as f:
+            for jobid,python_command in enumerate(python_commands):
+                # first activate the anaconda env
+                f.write(f"source activate {anaconda_env_name}\n\n")
+                f.write("\n# cd into the submission directory\n")
+                f.write(f"cd {work_folders[jobid]}\n\n")
+                f.write(f"{python_command}\nwait\n\n")
+
+        self.create_job_bottom()
+
+    '''
+    def cleanup_files(self, file_paths):
+        """
+        Remove the specified files.
+        """
+        for file_path in file_paths:
+            try:
+                os.remove(file_path)
+            except OSError:
+                pass
+    '''
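For reference, the header that create_job_head writes for, e.g., SLURM_Job(jobname='TSOPT.0', partition='standby', time=4, ppn=8, mem_per_cpu=1000) looks like this (derived directly from the f.write calls above; the example parameter values are illustrative):

    #!/bin/bash
    #SBATCH --job-name=TSOPT.0
    #SBATCH --output=TSOPT.0.out
    #SBATCH --error=TSOPT.0.err
    #SBATCH -A standby
    #SBATCH --nodes=1
    #SBATCH --ntasks-per-node=8
    #SBATCH --mem 8000MB
    #SBATCH --time 4:00:00

    echo Running on host `hostname`
    echo Start Time is `date`

Each create_*_jobs method then appends the per-job cd/launch lines and a closing timestamp via create_job_bottom.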
+ if args["skip_low_TS"] is True: key=[i for i in rxn.TS_guess.keys()] + elif args["skip_low_IRC"] is True: key=[i for i in rxn.TS_xtb.keys()] + else: key=[i for i in rxn.IRC_xtb.keys() if rxn.IRC_xtb[i]["type"]=="intended" or rxn.IRC_xtb[i]["type"]=="P_unintended"] + for ind in key: + rxn_ind=f"{rxn.reactant_inchi}_{rxn.id}_{ind}" + wf=f"{scratch_dft}/{rxn.reactant_inchi}_{rxn.id}_{ind}" + if os.path.isdir(wf) is False: os.mkdir(wf) + inp_xyz=f"{wf}/{rxn_ind}.xyz" + if args["skip_low_TS"] is True: + xyz_write(inp_xyz, rxn.reactant.elements, rxn.TS_guess[ind]) + elif args["skip_low_IRC"] is True: + xyz_write(inp_xyz, rxn.reactant.elements, rxn.TS_xtb[ind]) + else: + xyz_write(inp_xyz, rxn.reactant.elements, rxn.TS_xtb[ind]) + constrained_bond, constrained_atoms=return_rxn_constraint(rxn.reactant, rxn.product) + if args["package"]=="ORCA": + dft_job=ORCA(input_geo=inp_xyz, work_folder=wf, nproc=int(args["dft_nprocs"]), mem=int(args["mem"])*1000, jobname=f"{rxn_ind}-COPT",\ + jobtype="OPT Freq", lot=args["dft_lot"], charge=args["charge"], multiplicity=args["multiplicity"], solvent=args["solvent"],\ + solvation_model=args["solvation_model"], dielectric=args["dielectric"], writedown_xyz=True) + constraints=[f'{{C {atom} C}}' for atom in constrained_atoms] + dft_job.generate_geometry_settings(hess=False, constraints=constraints) + dft_job.generate_input() + copt[rxn_ind]=dft_job + if dft_job.calculation_terminated_normally() is False: dft_jobs.append(rxn_ind) + elif args["package"]=="Gaussian": + dft_job=Gaussian(input_geo=inp_xyz, work_folder=wf, nproc=int(args["dft_nprocs"]), mem=int(args["mem"])*1000, jobname=f"{rxn_ind}-COPT",\ + jobtype="copt", lot=dft_lot, charge=args["charge"], multiplicity=args["multiplicity"], solvent=args["solvent"],\ + solvation_model=args["solvation_model"], dielectric=args["dielectric"], dispersion=args["dispersion"]) + dft_job.generate_input(constraints=constrained_atoms) + copt[rxn_ind]=dft_job + if dft_job.calculation_terminated_normally() is False: dft_jobs.append(rxn_ind) + dft_jobs=sorted(dft_jobs) + slurm_jobs=[] + if len(dft_jobs)>0: + n_submit=len(dft_jobs)//int(args["dft_njobs"]) + if len(dft_jobs)%int(args["dft_njobs"])>0: n_submit+=1 + startidx=0 + for i in range(n_submit): + slurmjob=SLURM_Job(jobname=f"COPT.{i}", ppn=args["ppn"], partition=args["partition"], time=args["dft_wt"], mem_per_cpu=int(int(args["mem"])*1100)) + endidx=min(startidx+int(args["dft_njobs"]), len(dft_jobs)) + if args["package"]=="ORCA": slurmjob.create_orca_jobs([copt[ind] for ind in dft_jobs[startidx:endidx]]) + elif args["package"]=="Gaussian": slurmjob.create_gaussian_jobs([copt[ind] for ind in dft_jobs[startidx:endidx]]) + slurmjob.submit() + startidx=endidx + slurm_jobs.append(slurmjob) + print(f"Running {len(slurm_jobs)} constrained optimization jobs on TS...") + monitor_jobs(slurm_jobs) + else: print(f"No constrained optimization jobs need to be performed...") + + key=[i for i in copt.keys()] + for i in key: + dft_opt=copt[i] + if dft_opt.calculation_terminated_normally() and dft_opt.optimization_converged() and len(dft_opt.get_imag_freq()[0])>0 and min(dft_opt.get_imag_freq()[0]) < -10: + _, geo=dft_opt.get_final_structure() + for count, rxn in enumerate(rxns): + inchi, ind, conf_i=i.split("_")[0], int(i.split("_")[1]), int(i.split("_")[2]) + if inchi in rxn.reactant_inchi and ind==rxn.id: + rxns[count].constrained_TS[conf_i]=geo + return rxns + +def run_dft_tsopt(rxns): + args=rxns[0].args + opt_jobs=dict() + running_jobs=[] + scratch_dft=args["scratch_dft"] + if 
len(args["dft_lot"].split()) > 1: dft_lot="/".join(args["dft_lot"].split()) + else: dft_lot=args["dft_lot"] + if args["constrained_TS"] is True: rxns=constrained_dft_geo_opt(rxns) + # Load TS from reaction class and prepare TS jobs + for rxn in rxns: + # Four cases: + # 1. skip_low_IRC: read TS_xtb. + # 2. skip_low_TS: read TS_guess. + # 3. constriaed_ts: read constrained_TS + # 3. Otherwise, read the intended TS. + if args["constrained_TS"] is True: key=[i for i in rxn.constrained_TS.keys()] + elif args["skip_low_TS"] is True: key=[i for i in rxn.TS_guess.keys()] + elif args["skip_low_IRC"] is True: key=[i for i in rxn.TS_xtb.keys()] + else: key=[i for i in rxn.IRC_xtb.keys() if rxn.IRC_xtb[i]["type"]=="Intended" or rxn.IRC_xtb[i]["type"]=="P_unintended"] + for ind in key: + rxn_ind=f"{rxn.reactant_inchi}_{rxn.id}_{ind}" + wf=f"{scratch_dft}/{rxn.reactant_inchi}_{rxn.id}_{ind}" + if os.path.isdir(wf) is False: os.mkdir(wf) + inp_xyz=f"{wf}/{rxn_ind}.xyz" + if args["constrained_TS"] is True: + xyz_write(inp_xyz, rxn.reactant.elements, rxn.constrained_TS[ind]) + elif args["skip_low_TS"] is True: + xyz_write(inp_xyz, rxn.reactant.elements, rxn.TS_guess[ind]) + elif args["skip_low_IRC"] is True: + xyz_write(inp_xyz, rxn.reactant.elements, rxn.TS_xtb[ind]) + else: + xyz_write(inp_xyz, rxn.reactant.elements, rxn.TS_xtb[ind]) + if args["package"]=="ORCA": + dft_job=ORCA(input_geo=inp_xyz, work_folder=wf, nproc=int(args["dft_nprocs"]), mem=int(args["mem"])*1000, jobname=f"{rxn_ind}-TSOPT",\ + jobtype="OptTS Freq", lot=args["dft_lot"], charge=args["charge"], multiplicity=args["multiplicity"], solvent=args["solvent"],\ + solvation_model=args["solvation_model"], dielectric=args["dielectric"], writedown_xyz=True) + dft_job.generate_geometry_settings(hess=True, hess_step=int(args["hess_recalc"])) + dft_job.generate_input() + opt_jobs[rxn_ind]=dft_job + if dft_job.calculation_terminated_normally() is False: running_jobs.append(rxn_ind) + elif args["package"]=="Gaussian": + dft_job=Gaussian(input_geo=inp_xyz, work_folder=wf, nproc=int(args["dft_nprocs"]), mem=int(args["mem"])*1000, jobname=f"{rxn_ind}-TSOPT",\ + jobtype="tsopt", lot=dft_lot, charge=args["charge"], multiplicity=args["multiplicity"], solvent=args["solvent"],\ + solvation_model=args["solvation_model"], dielectric=args["dielectric"], dispersion=args["dispersion"]) + dft_job.generate_input() + opt_jobs[rxn_ind]=dft_job + if dft_job.calculation_terminated_normally() is False: running_jobs.append(rxn_ind) + if len(running_jobs)>0: + n_submit=len(running_jobs)//int(args["dft_njobs"]) + if len(running_jobs)%int(args["dft_njobs"])>0: n_submit+=1 + startid=0 + slurm_jobs=[] + for i in range(n_submit): + slurmjob=SLURM_Job(jobname=f"TSOPT.{i}", ppn=int(args["ppn"]), partition=args["partition"], time=args["dft_wt"], mem_per_cpu=int(args["mem"]*1100)) + endid=min(startid+int(args["dft_njobs"]), len(running_jobs)) + if args["package"]=="ORCA": slurmjob.create_orca_jobs([opt_jobs[ind] for ind in running_jobs[startid:endid]]) + elif args["package"]=="Gaussian": slurmjob.create_gaussian_jobs([opt_jobs[ind] for ind in running_jobs[startid:endid]]) + slurmjob.submit() + startid=endid + slurm_jobs.append(slurmjob) + print(f"Running {len(slurm_jobs)} ts optimization jobs...") + monitor_jobs(slurm_jobs) + key=[i for i in opt_jobs.keys()] + for i in key: + dft_opt=opt_jobs[i] + if dft_opt.calculation_terminated_normally() and dft_opt.optimization_converged() and dft_opt.is_TS(): + _, geo=dft_opt.get_final_structure() + for count, rxn in enumerate(rxns): + 
+                    inchi, ind, conf_i=i.split("_")[0], int(i.split("_")[1]), int(i.split("_")[2])
+                    if dft_lot not in rxns[count].TS_dft.keys(): rxns[count].TS_dft[dft_lot]=dict()
+                    if inchi in rxn.reactant_inchi and ind==rxn.id:
+                        rxns[count].TS_dft[dft_lot][conf_i]=dict()
+                        rxns[count].TS_dft[dft_lot][conf_i]["geo"]=geo
+                        rxns[count].TS_dft[dft_lot][conf_i]["thermal"]=dft_opt.get_thermal()
+                        rxns[count].TS_dft[dft_lot][conf_i]["SPE"]=dft_opt.get_energy()
+                        rxns[count].TS_dft[dft_lot][conf_i]["imag_mode"]=dft_opt.get_imag_freq_mode()
+    else:
+        print("No TS optimization jobs need to be performed...")
+    return rxns
+
+def run_dft_irc(rxns, analyze=True):
+    args=rxns[0].args
+    scratch_dft=args["scratch_dft"]
+    irc_jobs=dict()
+    todo_list=[]
+    if len(args["dft_lot"].split()) > 1: dft_lot="/".join(args["dft_lot"].split())
+    else: dft_lot=args["dft_lot"]
+    # run the IRC prediction model first if needed
+    if args["skip_low_TS"] is False and args["skip_low_IRC"] is False: rxns=apply_IRC_model(rxns)
+    for count, rxn in enumerate(rxns):
+        if dft_lot in rxn.TS_dft.keys(): key=[i for i in rxn.TS_dft[dft_lot].keys()]
+        else: continue
+        for i in key:
+            rxn_ind=f"{rxn.reactant_inchi}_{rxn.id}_{i}"
+            RP=False
+            if args["skip_low_TS"] is False and args["skip_low_IRC"] is False: RP=rxn.IRC_xtb[i]["RP"][0]
+            if RP: continue
+            else:
+                # submit IRC jobs
+                if args["dft_irc"]:
+                    print(f"reaction {rxn_ind} is unpredictable, using IRC/DFT to verify the TS...")
+                    wf=f"{scratch_dft}/{rxn_ind}"
+                    inp_xyz=f"{wf}/{rxn_ind}-TS.xyz"
+                    xyz_write(inp_xyz, rxn.reactant.elements, rxn.TS_dft[dft_lot][i]["geo"])
+                    if args["package"]=="ORCA":
+                        dft_job=ORCA(input_geo=inp_xyz, work_folder=wf, nproc=int(args["dft_nprocs"]), mem=int(args["mem"])*1000, jobname=f"{rxn_ind}-IRC",\
+                                     jobtype="IRC", lot=args["dft_lot"], charge=args["charge"], multiplicity=args["multiplicity"], solvent=args["solvent"],\
+                                     solvation_model=args["solvation_model"], dielectric=args["dielectric"], writedown_xyz=True)
+                        dft_job.generate_irc_settings(max_iter=100)
+                        dft_job.generate_input()
+                        irc_jobs[rxn_ind]=dft_job
+                        if dft_job.calculation_terminated_normally() is False: todo_list.append(rxn_ind)
+                    elif args["package"]=="Gaussian":
+                        dft_job=Gaussian(input_geo=inp_xyz, work_folder=wf, nproc=int(args["dft_nprocs"]), mem=int(args["mem"])*1000, jobname=f"{rxn_ind}-IRC",\
+                                         jobtype="irc", lot=dft_lot, charge=args["charge"], multiplicity=args["multiplicity"], solvent=args["solvent"],\
+                                         solvation_model=args["solvation_model"], dielectric=args["dielectric"], dispersion=args["dispersion"])
+                        dft_job.generate_input()
+                        irc_jobs[rxn_ind]=dft_job
+                        if dft_job.calculation_terminated_normally() is False: todo_list.append(rxn_ind)
+    if args["dft_irc"] and len(todo_list)>0:
+        n_submit=len(todo_list)//int(args["dft_njobs"])
+        if len(todo_list)%int(args["dft_njobs"])>0: n_submit+=1
+        startidx=0
+        slurm_jobs=[]
+        for i in range(n_submit):
+            slurmjob=SLURM_Job(jobname=f"IRC.{i}", ppn=int(args["ppn"]), partition=args["partition"], time=args["dft_wt"], mem_per_cpu=int(int(args["mem"])*1100))
+            endidx=min(startidx+int(args["dft_njobs"]), len(todo_list))
+            if args["package"]=="ORCA": slurmjob.create_orca_jobs([irc_jobs[ind] for ind in todo_list[startidx:endidx]])
+            elif args["package"]=="Gaussian": slurmjob.create_gaussian_jobs([irc_jobs[ind] for ind in todo_list[startidx:endidx]])
+            slurmjob.submit()
+            startidx=endidx
+            slurm_jobs.append(slurmjob)
+        print(f"Running {len(slurm_jobs)} IRC jobs...")
+        monitor_jobs(slurm_jobs)
+
+    # read the results into the reaction class
+    key=[i for i in irc_jobs.keys()]
+    for rxn_ind in key:
+        irc_job=irc_jobs[rxn_ind]
+        if irc_job.calculation_terminated_normally() is False:
+            print(f"IRC job {irc_job.jobname} fails, skip this reaction...")
+            continue
+        job_success=False
+        rxn_ind=rxn_ind.split("_")
+        inchi, idx, conf_i=rxn_ind[0], int(rxn_ind[1]), int(rxn_ind[2])
+        try:
+            E, G1, G2, TSG, barrier1, barrier2=irc_job.analyze_IRC()
+            job_success=True
+        except: pass
+        if job_success is False: continue
+        for count, rxn in enumerate(rxns):
+            if inchi==rxn.reactant_inchi and idx==rxn.id:
+                if dft_lot not in rxns[count].IRC_dft.keys(): rxns[count].IRC_dft[dft_lot]=dict()
+                rxns[count].IRC_dft[dft_lot][conf_i]=dict()
+                rxns[count].IRC_dft[dft_lot][conf_i]["node"]=[G1, G2]
+                rxns[count].IRC_dft[dft_lot][conf_i]["TS"]=TSG
+                rxns[count].IRC_dft[dft_lot][conf_i]["barriers"]=[barrier2, barrier1]
+    if analyze==True: rxns=analyze_intended(rxns)
+    return rxns
+
+def analyze_intended(rxns):
+    args=rxns[0].args
+    if len(args["dft_lot"].split()) > 1: dft_lot="/".join(args["dft_lot"].split())
+    else: dft_lot=args["dft_lot"]
+    for count, rxn in enumerate(rxns):
+        P_adj_mat=rxn.product.adj_mat
+        R_adj_mat=rxn.reactant.adj_mat
+        if dft_lot not in rxn.IRC_dft.keys(): continue
+        for i in rxn.IRC_dft[dft_lot].keys():
+            G1=rxn.IRC_dft[dft_lot][i]["node"][0]
+            G2=rxn.IRC_dft[dft_lot][i]["node"][1]
+            adj_mat1=table_generator(rxn.reactant.elements, G1)
+            adj_mat2=table_generator(rxn.product.elements, G2)
+            barrier2=rxn.IRC_dft[dft_lot][i]["barriers"][0]
+            barrier1=rxn.IRC_dft[dft_lot][i]["barriers"][1]
+            adj_diff_r1=np.abs(adj_mat1-R_adj_mat)
+            adj_diff_r2=np.abs(adj_mat2-R_adj_mat)
+            adj_diff_p1=np.abs(adj_mat1-P_adj_mat)
+            adj_diff_p2=np.abs(adj_mat2-P_adj_mat)
+            if adj_diff_r1.sum()==0:
+                if adj_diff_p2.sum()==0:
+                    rxns[count].IRC_dft[dft_lot][i]["node"]=[G1, G2]
+                    rxns[count].IRC_dft[dft_lot][i]["barriers"]=[barrier2, barrier1]
+                    rxns[count].IRC_dft[dft_lot][i]["type"]="intended"
+                else:
+                    rxns[count].IRC_dft[dft_lot][i]["node"]=[G1, G2]
+                    rxns[count].IRC_dft[dft_lot][i]["barriers"]=[barrier2, barrier1]
+                    rxns[count].IRC_dft[dft_lot][i]["type"]="P_unintended"
+            elif adj_diff_p1.sum()==0:
+                if adj_diff_r2.sum()==0:
+                    rxns[count].IRC_dft[dft_lot][i]["node"]=[G2, G1]
+                    rxns[count].IRC_dft[dft_lot][i]["barriers"]=[barrier1, barrier2]
+                    rxns[count].IRC_dft[dft_lot][i]["type"]="intended"
+                else:
+                    rxns[count].IRC_dft[dft_lot][i]["node"]=[G2, G1]
+                    rxns[count].IRC_dft[dft_lot][i]["barriers"]=[barrier1, barrier2]
+                    rxns[count].IRC_dft[dft_lot][i]["type"]="R_unintended"
+            elif adj_diff_r2.sum()==0:
+                rxns[count].IRC_dft[dft_lot][i]["node"]=[G2, G1]
+                rxns[count].IRC_dft[dft_lot][i]["barriers"]=[barrier1, barrier2]
+                rxns[count].IRC_dft[dft_lot][i]["type"]="P_unintended"
+            elif adj_diff_p2.sum()==0:
+                rxns[count].IRC_dft[dft_lot][i]["node"]=[G1, G2]
+                rxns[count].IRC_dft[dft_lot][i]["barriers"]=[barrier2, barrier1]
+                rxns[count].IRC_dft[dft_lot][i]["type"]="R_unintended"
+            else:
+                rxns[count].IRC_dft[dft_lot][i]["node"]=[G1, G2]
+                rxns[count].IRC_dft[dft_lot][i]["barriers"]=[barrier2, barrier1]
+                rxns[count].IRC_dft[dft_lot][i]["type"]="unintended"
+    return rxns
+
+def writedown_result(rxns):
+    args=rxns[0].args
+    if len(args["dft_lot"].split()) > 1: dft_lot="/".join(args["dft_lot"].split())
+    else: dft_lot=args["dft_lot"]
+    with open(f'{args["scratch_dft"]}/yarp_result.txt', 'w') as f:
+        if args["backward_DE"]: f.write(f'{"reaction":40s} {"R":<60s} {"P":<60s} {"DE_F":<10s} {"DG_F":<10s} {"DE_B":<10s} {"DG_B":<10s} {"Type":<10s} {"Source":<10s}\n')
{"R":<60s} {"P":<60s} {"DE_F":<10s} {"DG_F":<10s} {"Type":<10s} {"Source":<10s}\n') + for rxn in rxns: + if dft_lot in rxn.IRC_dft.keys(): key=[i for i in rxn.IRC_dft[dft_lot].keys()] + else: continue + for conf_i in key: + rxn_ind=f"{rxn.reactant_inchi}_{rxn.id}_{conf_i}" + adj_mat=table_generator(rxn.reactant.elements, rxn.IRC_dft[dft_lot][conf_i]["node"][0]) + bond_mat, _=find_lewis(rxn.reactant.elements, adj_mat) + bond_mat=bond_mat[0] + rsmi=return_smi(rxn.reactant.elements, rxn.IRC_dft[dft_lot][conf_i]["node"][0], bond_mat=bond_mat) + adj_mat=table_generator(rxn.reactant.elements, rxn.IRC_dft[dft_lot][conf_i]["node"][1]) + bond_mat, _=find_lewis(rxn.reactant.elements, adj_mat) + bond_mat=bond_mat[0] + psmi=return_smi(rxn.reactant.elements, rxn.IRC_dft[dft_lot][conf_i]["node"][1], bond_mat=bond_mat) + try: + DE_F=Constants.ha2kcalmol*(rxn.TS_dft[dft_lot][conf_i]["SPE"]-rxn.reactant_dft_opt[dft_lot]["SPE"]) + DG_F=Constants.ha2kcalmol*(rxn.TS_dft[dft_lot][conf_i]["thermal"]["GibbsFreeEnergy"]-rxn.reactant_dft_opt[dft_lot]["thermal"]["GibbsFreeEnergy"]) + except: + DE_F=0.0 + DG_F=0.0 + if args["backward_DE"]: + try: + DE_B=Constants.ha2kcalmol*(rxn.TS_dft[dft_lot][conf_i]["SPE"]-rxn.product_dft_opt[dft_lot]["SPE"]) + DG_B=Constants.ha2kcalmol*(rxn.TS_dft[dft_lot][conf_i]["thermal"]["GibbsFreeEnergy"]-rxn.product_dft_opt[dft_lot]["thermal"]["GibbsFreeEnergy"]) + except: + DE_B=0.0 + DF_B=0.0 + f.write(f"{rxn_ind:40s} {rsmi:<60s} {psmi:<60s} {DE_F:<10.4f} {DG_F:<10.4f} {DE_B:<10.4f} {DG_B:<10.4f} {rxn.IRC_dft[dft_lot][conf_i]['type']:<10s} {dft_lot:<10s}\n") + else: + f.write(f"{rxn_ind:40s} {rsmi:<60s} {psmi:<60s} {DE_F:<10.4f} {DG_F:<10.4f} {rxn.IRC_dft[dft_lot][conf_i]['type']:<10s} {dft_lot:<10s}\n") + return + +def run_dft_opt(rxns): + args=rxns[0].args + crest_folder=args["scratch_crest"] + dft_folder=args["scratch_dft"] + if os.path.isdir(crest_folder) is False: os.mkdir(crest_folder) + if os.path.isdir(dft_folder) is False: os.mkdir(dft_folder) + stable_conf=dict() + inchi_dict=find_all_seps(rxns) + key=[i for i in inchi_dict.keys()] + for rxn in rxns: + if rxn.reactant_inchi not in stable_conf.keys(): + if bool(rxn.reactant_conf) is True and "-" not in rxn.reactant_inchi: + stable_conf[rxn.reactant_inchi]=[rxn.reactant.elements, rxn.reactant_conf[0]] + if rxn.product_inchi not in stable_conf.keys(): + if bool(rxn.product_conf) is True and "-" not in rxn.product_inchi: + stable_conf[rxn.product_inchi]=[rxn.product.elements, rxn.product_conf[0]] + # collect inchi from reaction classes + all_inchi=dict() + # collect missing DFT energy + missing_dft=[] + if len(args["dft_lot"].split()) > 1: dft_lot="/".join(args["dft_lot"].split()) + else: dft_lot=args["dft_lot"] + inchi_key=[i for i in inchi_dict.keys()] + for rxn in rxns: + if dft_lot not in rxn.reactant_dft_opt.keys(): + for i in inchi_key: + if i in rxn.reactant_inchi and i not in missing_dft: + missing_dft.append(i) + if dft_lot not in rxn.product_dft_opt.keys(): + for i in inchi_key: + if i in rxn.product_inchi and i not in missing_dft: + missing_dft.append(i) + + missing_conf=[] + for i in missing_dft: + if i not in stable_conf.keys(): + missing_conf.append(i) + njobs=int(args["dft_njobs"]) + if len(missing_conf) > 0: + CREST_job_list=[] + for inchi in missing_conf: + if inchi in missing_dft: + # print(inchi_dict[inchi]) + E, G=inchi_dict[inchi][0], inchi_dict[inchi][1] + wf=f'{crest_folder}/{inchi}' + if os.path.isdir(wf) is False: os.mkdir(wf) + inp_xyz=f"{wf}/{inchi}.xyz" + xyz_write(inp_xyz, E, G) + 
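+                # Each InChI still lacking a stable conformer gets its own CREST
+                # conformer search, seeded by the geometry written just above; the
+                # lowest-energy conformer is read back into stable_conf once the jobs finish.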
+                crest_job=CREST(input_geo=inp_xyz, work_folder=wf, nproc=int(args["c_nprocs"]), mem=int(args["mem"])*1000, quick_mode=args["crest_quick"],\
+                                opt_level=args["opt_level"], charge=args["charge"], multiplicity=args["multiplicity"], crest_path=args["crest"])
+                CREST_job_list.append(crest_job)
+
+        n_submit=len(CREST_job_list)//njobs
+        if len(CREST_job_list)%njobs>0: n_submit+=1
+        startidx=0
+        slurm_jobs=[]
+        for i in range(n_submit):
+            slurmjob=SLURM_Job(jobname=f'CREST.{i}', ppn=args["ppn"], partition=args["partition"], time=args["dft_wt"], mem_per_cpu=int(args["mem"])*1100)
+            endidx=min(startidx+njobs, len(CREST_job_list))
+            slurmjob.create_crest_jobs([job for job in CREST_job_list[startidx:endidx]])
+            slurmjob.submit()
+            startidx=endidx
+            slurm_jobs.append(slurmjob)
+        print(f"Running {len(slurm_jobs)} CREST jobs...")
+        monitor_jobs(slurm_jobs)
+        print("All CREST jobs finished...")
+
+        for crest_job in CREST_job_list:
+            inchi=crest_job.input_geo.split('/')[-1].split('.xyz')[0]
+            if crest_job.calculation_terminated_normally():
+                E, G, _ = crest_job.get_stable_conformer()
+                stable_conf[inchi]=[E, G]
+
+    # submit the missing DFT optimizations
+    dft_job_list=[]
+    if len(missing_dft)>0:
+        for inchi in missing_dft:
+            if inchi not in stable_conf.keys(): continue
+            E, G=stable_conf[inchi]
+            wf=f"{dft_folder}/{inchi}"
+            if os.path.isdir(wf) is False: os.mkdir(wf)
+            inp_xyz=f"{wf}/{inchi}.xyz"
+            xyz_write(inp_xyz, E, G)
+            if args["package"]=="ORCA":
+                dft_job=ORCA(input_geo=inp_xyz, work_folder=wf, nproc=int(args["dft_nprocs"]), mem=int(args["mem"])*1000, jobname=f"{inchi}-OPT", jobtype="OPT Freq", lot=args["dft_lot"],\
+                             charge=args["charge"], multiplicity=args["multiplicity"], solvent=args["solvent"], solvation_model=args["solvation_model"], dielectric=args["dielectric"], writedown_xyz=True)
+                dft_job.generate_input()
+                dft_job_list.append(dft_job)
+            elif args["package"]=="Gaussian":
+                dft_job=Gaussian(input_geo=inp_xyz, work_folder=wf, nproc=int(args["dft_nprocs"]), mem=int(args["mem"])*1000, jobname=f"{inchi}-OPT",\
+                                 jobtype="opt", lot=dft_lot, charge=args["charge"], multiplicity=args["multiplicity"], solvent=args["solvent"],\
+                                 solvation_model=args["solvation_model"], dielectric=args["dielectric"], dispersion=args["dispersion"])
+                dft_job.generate_input()
+                dft_job_list.append(dft_job)
+
+        n_submit=len(dft_job_list)//int(args["dft_njobs"])
+        if len(dft_job_list)%int(args["dft_njobs"])>0: n_submit+=1
+        startidx=0
+        slurm_jobs=[]
+        for i in range(n_submit):
+            slurmjob=SLURM_Job(jobname=f"OPT.{i}", ppn=args["ppn"], partition=args["partition"], time=args["dft_wt"], mem_per_cpu=int(args["mem"])*1100)
+            endidx=min(startidx+int(args["dft_njobs"]), len(dft_job_list))
+            if args["package"]=="ORCA": slurmjob.create_orca_jobs([job for job in dft_job_list[startidx:endidx]])
+            elif args["package"]=="Gaussian": slurmjob.create_gaussian_jobs([job for job in dft_job_list[startidx:endidx]])
+            slurmjob.submit()
+            startidx=endidx
+            slurm_jobs.append(slurmjob)
+
+        print(f"Running {len(slurm_jobs)} DFT optimization jobs...")
+        monitor_jobs(slurm_jobs)
+        print("DFT optimization finished.")
+    dft_dict=dict()
+    for dft_job in dft_job_list:
+        inchi=dft_job.jobname.split("-OPT")[0]
+        if dft_job.calculation_terminated_normally() and dft_job.optimization_converged():
+            imag_freq, _=dft_job.get_imag_freq()
+            if len(imag_freq) > 0:
+                print(f"WARNING: imaginary frequency identified for molecule {inchi}...")
+            SPE=dft_job.get_energy()
+            thermal=dft_job.get_thermal()
+            _, G=dft_job.get_final_structure()
+            dft_dict[inchi]=dict()
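+            # dft_dict caches one record per optimized species: the single-point
+            # energy (in Hartree, converted later via Constants.ha2kcalmol), the
+            # thermochemistry dictionary, and the final geometry.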
+            dft_dict[inchi]["SPE"]=SPE
+            dft_dict[inchi]["thermal"]=thermal
+            dft_dict[inchi]["geo"]=G
+    if len(args["dft_lot"].split()) > 1: dft_lot="/".join(args["dft_lot"].split())
+    else: dft_lot=args["dft_lot"]
+    key=[i for i in dft_dict.keys()]
+    for count, rxn in enumerate(rxns):
+        for i in key:
+            if i in rxn.reactant_inchi:
+                if dft_lot not in rxns[count].reactant_dft_opt.keys():
+                    rxns[count].reactant_dft_opt[dft_lot]=dict()
+                if "SPE" not in rxns[count].reactant_dft_opt[dft_lot].keys():
+                    rxns[count].reactant_dft_opt[dft_lot]["SPE"]=0.0
+                rxns[count].reactant_dft_opt[dft_lot]["SPE"]+=dft_dict[i]["SPE"]
+                if "thermal" not in rxns[count].reactant_dft_opt[dft_lot].keys():
+                    rxns[count].reactant_dft_opt[dft_lot]["thermal"]={}
+                    rxns[count].reactant_dft_opt[dft_lot]["thermal"]["GibbsFreeEnergy"]=0.0
+                    rxns[count].reactant_dft_opt[dft_lot]["thermal"]["Enthalpy"]=0.0
+                    rxns[count].reactant_dft_opt[dft_lot]["thermal"]["InnerEnergy"]=0.0
+                    rxns[count].reactant_dft_opt[dft_lot]["thermal"]["Entropy"]=0.0
+                rxns[count].reactant_dft_opt[dft_lot]["thermal"]["GibbsFreeEnergy"]+=dft_dict[i]["thermal"]["GibbsFreeEnergy"]
+                rxns[count].reactant_dft_opt[dft_lot]["thermal"]["Enthalpy"]+=dft_dict[i]["thermal"]["Enthalpy"]
+                rxns[count].reactant_dft_opt[dft_lot]["thermal"]["InnerEnergy"]+=dft_dict[i]["thermal"]["InnerEnergy"]
+                rxns[count].reactant_dft_opt[dft_lot]["thermal"]["Entropy"]+=dft_dict[i]["thermal"]["Entropy"]
+            if i in rxn.product_inchi and rxn.args["backward_DE"]:
+                if dft_lot not in rxns[count].product_dft_opt.keys():
+                    rxns[count].product_dft_opt[dft_lot]=dict()
+                if "SPE" not in rxns[count].product_dft_opt[dft_lot].keys():
+                    rxns[count].product_dft_opt[dft_lot]["SPE"]=0.0
+                rxns[count].product_dft_opt[dft_lot]["SPE"]+=dft_dict[i]["SPE"]
+                if "thermal" not in rxns[count].product_dft_opt[dft_lot].keys():
+                    rxns[count].product_dft_opt[dft_lot]["thermal"]={}
+                    rxns[count].product_dft_opt[dft_lot]["thermal"]["GibbsFreeEnergy"]=0.0
+                    rxns[count].product_dft_opt[dft_lot]["thermal"]["Enthalpy"]=0.0
+                    rxns[count].product_dft_opt[dft_lot]["thermal"]["InnerEnergy"]=0.0
+                    rxns[count].product_dft_opt[dft_lot]["thermal"]["Entropy"]=0.0
+                rxns[count].product_dft_opt[dft_lot]["thermal"]["GibbsFreeEnergy"]+=dft_dict[i]["thermal"]["GibbsFreeEnergy"]
+                rxns[count].product_dft_opt[dft_lot]["thermal"]["Enthalpy"]+=dft_dict[i]["thermal"]["Enthalpy"]
+                rxns[count].product_dft_opt[dft_lot]["thermal"]["InnerEnergy"]+=dft_dict[i]["thermal"]["InnerEnergy"]
+                rxns[count].product_dft_opt[dft_lot]["thermal"]["Entropy"]+=dft_dict[i]["thermal"]["Entropy"]
+    return rxns
+
+def find_all_seps(rxns):
+    inchi_dict=dict()
+    for rxn in rxns:
+        tmp_dict=seperate_mols(rxn.reactant.elements, rxn.reactant.geo)
+        key=[i for i in tmp_dict.keys()]
+        for i in key:
+            if i not in inchi_dict.keys():
+                inchi_dict[i]=tmp_dict[i]
+        if rxn.args["backward_DE"]:
+            tmp_dict=seperate_mols(rxn.reactant.elements, rxn.product.geo)
+            key=[i for i in tmp_dict.keys()]
+            for i in key:
+                if i not in inchi_dict.keys():
+                    inchi_dict[i]=tmp_dict[i]
+    return inchi_dict
+
+if __name__=="__main__":
+    parameters = sys.argv[1]
+    parameters = yaml.load(open(parameters, "r"), Loader=yaml.FullLoader)
+    main(parameters)
diff --git a/pyTEST_Example/main_xtb.py b/pyTEST_Example/main_xtb.py
new file mode 100644
index 0000000..0850262
--- /dev/null
+++ b/pyTEST_Example/main_xtb.py
@@ -0,0 +1,695 @@
+import yarp as yp
+import numpy as np
+import threading
+import pickle
+import multiprocessing as mp
+from multiprocessing import Queue
+from logging.handlers import
QueueHandler +from joblib import Parallel, delayed +from yarp.find_lewis import all_zeros +from yarp.find_lewis import bmat_unique +import os, sys, yaml, fnmatch +import logging +from openbabel import pybel +from utils import * +from wrappers.reaction import * +from job_mapping import * +from wrappers.crest import CREST +from qc_jobs import * +from conf import * +from analyze_functions import * +from wrappers.pysis import PYSIS +from wrappers.gsm import GSM + +# YARP methodology by Hsuan-Hao Hsu, Qiyuan Zhao, and Brett M. Savoie +def initialize(args): + keys=[i for i in args.keys()] + if "input" not in keys: + print("KEY ERROR: NO INPUT REACTANTS OR REACTIONS. Exit....") + exit() + if "scratch" not in keys: + args["scratch"]=f"{os.getcwd()}/yarp_run" + if "low_solvation" not in keys: + args["low_solvation"]=False + args["low_solvation_model"]="alpb" + args["solvent"]=False + else: + args["low_solvation_model"], args["solvent"]=args['low_solvation'].split('/') + if "method" not in keys: + args["method"]="crest" + if "reaction_data" not in keys: args["reaction_data"]="reaction.p" + if "form_all" not in keys: args["form_all"]=False + if "lewis_criteria" not in keys: args["lewis_criteria"]=0.0 + if "crest" not in keys: args["crest"]="crest" + if "xtb" not in keys: args["xtb"]="xtb" + if "charge" not in keys: + print("WARNING: Charge is not provided. Use neutral species (charge=0) as default...") + args["charge"]=0 + if "multiplicity" not in keys: + print("WARNING: Multiplicity is not provided. Use closed-shell species (multiplicity=1) as default...") + args["multiplicity"]=1 + if "enumeration" not in keys: + args["enumeration"]=True + if "n_break" not in keys: + args["n_break"]=2 + else: args["n_break"]=int(args['n_break']) + if "strategy" not in keys: + args["strategy"]=2 + else: args["strategy"]=int(args["strategy"]) + if "n_conf" not in keys: + args["n_conf"]=3 + else: args["n_conf"]=int(args["n_conf"]) + if "nprocs" not in keys: + args["nprocs"]=1 + else: args["nprocs"]=int(args["nprocs"]) + if "c_nprocs" not in keys: + args["c_nprocs"]=1 + else: args["c_nprocs"]=int(args["c_nprocs"]) + if "mem" not in keys: + args["mem"]=1 + if "restart" not in keys: + args["restart"]=False + args["scratch_xtb"]=f"{args['scratch']}/xtb_run" + args["scratch_crest"]=f"{args['scratch']}/conformer" + args["conf_output"]=f"{args['scratch']}/rxn_conf" + if os.path.exists(args["scratch"]) is False: os.makedirs(args["scratch"]) + if os.path.exists(args["scratch_xtb"]) is False: os.makedirs(args["scratch_xtb"]) + if os.path.exists(args["scratch_crest"]) is False: os.makedirs(args["scratch_crest"]) + if os.path.exists(args["conf_output"]) is False: os.makedirs(args["conf_output"]) + logging_path = os.path.join(args["scratch"], "YARPrun.log") + logging_queue = mp.Manager().Queue(999) + logger_p = mp.Process(target=logger_process, args=(logging_queue, logging_path), daemon=True) + logger_p.start() + start = time.time() + Tstart= time.time() + logger = logging.getLogger("main") + logger.addHandler(QueueHandler(logging_queue)) + logger.setLevel(logging.INFO) + return args, logger, logging_queue + +def main(args:dict): + args, logger, logging_queue=initialize(args) + print(f"""Welcome to + __ __ _ ____ ____ + \ \ / // \ | _ \| _ \ + \ V // _ \ | |_) | |_) | + | |/ ___ \| _ <| __/ + |_/_/ \_\_| \_\_| + // Yet Another Reaction Program + """) + if os.path.isfile(args["input"]) and fnmatch.fnmatch(args["input"], "*.smi") is True: # Read smiles in + mol=[i.split('\n')[0] for i in open(args["input"], 'r+').readlines()] + elif 
os.path.isfile(args["input"]) and fnmatch.fnmatch(args["input"], "*.xyz") is True:
+        mol=[args["input"]]
+    else:
+        mol=[args["input"]+"/"+i for i in os.listdir(args["input"]) if fnmatch.fnmatch(i, '*.xyz') or fnmatch.fnmatch(i, '*.mol')]
+
+    if os.path.isfile(args["reaction_data"]) is True:
+        rxns=pickle.load(open(args["reaction_data"], 'rb'))
+        for rxn in rxns: rxn.args=args
+
+    print("-----------------------")
+    print("------First Step-------")
+    print("------Enumeration------")
+    print("-----------------------")
+
+    if args["enumeration"]:
+        for i in mol: rxns=run_enumeration(i, args=args)
+    elif os.path.isfile(args["reaction_data"]) is False:
+        rxns=[]
+        for i in mol: rxns.append(read_rxns(i, args=args))
+
+    inchi_array=[]
+    for i in rxns:
+        if i.reactant_inchi not in inchi_array: inchi_array.append(i.reactant_inchi)
+    inchi_dict=dict()
+    for i in inchi_array: inchi_dict[i]=0
+    for i in rxns:
+        inchi=i.reactant_inchi
+        idx=inchi_dict[inchi]
+        i.id=idx
+        inchi_dict[inchi]=idx+1
+
+    print("-----------------------")
+    print("------Second Step------")
+    print("Conformational Sampling")
+    print("-----------------------")
+    if args["method"]=='rdkit':
+        for count_i, i in enumerate(rxns): rxns[count_i].conf_rdkit()
+    elif args["method"]=='crest':
+        rxns=conf_by_crest(rxns, logging_queue, logger)
+    with open(args["reaction_data"], "wb") as f:
+        pickle.dump(rxns, f)
+    print("-----------------------")
+    print("-------Third Step------")
+    print("Conformation Generation")
+    print("-----------------------")
+    rxns=select_rxn_conf(rxns, logging_queue)
+    with open(args["reaction_data"], "wb") as f:
+        pickle.dump(rxns, f)
+    print("-----------------------")
+    print("------Fourth Step------")
+    print("-Growing String Method-")
+    print("-----------------------")
+    rxns=run_gsm_by_pysis(rxns, logging_queue)
+    with open(args["reaction_data"], "wb") as f:
+        pickle.dump(rxns, f)
+    print("-----------------------")
+    print("-------Fifth Step------")
+    print("------Berny TS Opt-----")
+    print("-----------------------")
+    rxns=run_ts_opt_by_xtb(rxns, logging_queue, logger)
+    with open(args["reaction_data"], "wb") as f:
+        pickle.dump(rxns, f)
+    print("-----------------------")
+    print("-------Sixth Step------")
+    print("-----IRC Calculation---")
+    print("-----------------------")
+    rxns=run_irc_by_xtb(rxns, logging_queue)
+    with open(args["reaction_data"], "wb") as f:
+        pickle.dump(rxns, f)
+    print("-----------------------")
+    print("-----Print result------")
+    print("-----------------------")
+    rxns=analyze_outputs(rxns)
+    return
+
+def run_irc_by_xtb(rxns, logging_queue):
+    args=rxns[0].args
+    conf_output=args["conf_output"]
+    nprocs=args["nprocs"]
+    scratch=args["scratch"]
+    irc_jobs=dict()
+    for count, rxn in enumerate(rxns):
+        key=[j for j in rxn.TS_xtb.keys()]
+        for j in key:
+            rxn_ind=f"{rxn.reactant_inchi}_{int(rxn.id)}_{j}"
+            wf=f"{scratch}/{rxn_ind}"
+            if os.path.isdir(wf) is False: os.mkdir(wf)
+            xyz_write(f"{wf}/{rxn_ind}-TS.xyz", rxn.reactant.elements, rxn.TS_xtb[j])
+            if not args["solvent"]:
+                pysis_job=PYSIS(input_geo=f"{wf}/{rxn_ind}-TS.xyz", work_folder=wf, jobname=rxn_ind, jobtype="irc", charge=args["charge"], multiplicity=args["multiplicity"])
+            else:
+                if "alpb" in args["low_solvation_model"].lower():
+                    pysis_job=PYSIS(input_geo=f"{wf}/{rxn_ind}-TS.xyz", work_folder=wf, jobname=rxn_ind, jobtype="irc", charge=args["charge"], multiplicity=args["multiplicity"],\
+                                    alpb=args["solvent"])
+                else:
+                    pysis_job=PYSIS(input_geo=f"{wf}/{rxn_ind}-TS.xyz", work_folder=wf, 
jobname=rxn_ind, jobtype="irc", charge=args["charge"], multiplicity=args["multiplicity"],\ + gbsa=args["solvent"]) + if os.path.isfile(f"{wf}/ts_final_hessian.h5"): pysis_job.generate_input(calctype="xtb", hess_init=f"{wf}/ts_final_hessian.h5") + else: pysis_job.generate_input(calctype='xtb') + irc_jobs[rxn_ind]=pysis_job + irc_job_list=[irc_jobs[ind] for ind in sorted(irc_jobs.keys())] + irc_thread=min(nprocs, len(irc_job_list)) + input_job_list=[(irc_job, logging_queue, args["pysis_wt"]) for irc_job in irc_job_list] + Parallel(n_jobs=irc_thread)(delayed(run_pysis)(*task) for task in input_job_list) + + # Read result into reaction class + for irc_job in irc_job_list: + if irc_job.calculation_terminated_normally() is False: + print(f"IRC job {irc_job.jobname} fails, skip this reaction") + continue + job_success=False + rxn_ind=irc_job.jobname + rxn_ind=rxn_ind.split("_") + inchi, idx, conf_i=rxn_ind[0], int(rxn_ind[1]), int(rxn_ind[2]) + try: + E, G1, G2, TSG, barrier1, barrier2=irc_job.analyze_IRC() + _, TSE, _=irc_job.get_energies_from_IRC() + job_success=True + except: pass + if job_success is False: continue + adj_mat1, adj_mat2=table_generator(E, G1), table_generator(E, G2) + #bond_mat1, _=find_lewis(E, adj_mat1, args["charge"]) + #bond_mat2, _=find_lewis(E, adj_mat2, args["charge"]) + #bond_mat1=bond_mat1[0] + #bond_mat2=bond_mat2[0] + for count, rxn in enumerate(rxns): + if inchi==rxn.reactant_inchi and idx==rxn.id: + #rxns[count].IRC_xtb[conf_i]["node"]=[G1, G2] + #rxns[count].IRC_xtb[conf_i]["TS"]=TSG + #rxns[count].IRC_xtb[conf_i]["barriers"]=[barrier1, barrier2] + P_adj_mat=rxn.product.adj_mat + R_adj_mat=rxn.reactant.adj_mat + adj_diff_r1=np.abs(adj_mat1-R_adj_mat) + adj_diff_r2=np.abs(adj_mat2-R_adj_mat) + adj_diff_p1=np.abs(adj_mat1-P_adj_mat) + adj_diff_p2=np.abs(adj_mat2-P_adj_mat) + rxns[count].IRC_xtb[conf_i]=dict() + if adj_diff_r1.sum()==0: + if adj_diff_p2.sum()==0: + rxns[count].IRC_xtb[conf_i]["node"]=[G1, G2] + rxns[count].IRC_xtb[conf_i]["TS"]=TSG + rxns[count].IRC_xtb[conf_i]["barriers"]=[barrier2, barrier1] + rxns[count].IRC_xtb[conf_i]["type"]="intended" + else: + rxns[count].IRC_xtb[conf_i]["node"]=[G1, G2] + rxns[count].IRC_xtb[conf_i]["TS"]=TSG + rxns[count].IRC_xtb[conf_i]["barriers"]=[barrier2, barrier1] + rxns[count].IRC_xtb[conf_i]["type"]="P_unintended" + elif adj_diff_p1.sum()==0: + if adj_diff_r2.sum()==0: + rxns[count].IRC_xtb[conf_i]["node"]=[G2, G1] + rxns[count].IRC_xtb[conf_i]["TS"]=TSG + rxns[count].IRC_xtb[conf_i]["barriers"]=[barrier1, barrier2] + rxns[count].IRC_xtb[conf_i]["type"]="intended" + else: + rxns[count].IRC_xtb[conf_i]["node"]=[G2, G1] + rxns[count].IRC_xtb[conf_i]["TS"]=TSG + rxns[count].IRC_xtb[conf_i]["barriers"]=[barrier1, barrier2] + rxns[count].IRC_xtb[conf_i]["type"]="R_unintended" + elif adj_diff_r2.sum()==0: + rxns[count].IRC_xtb[conf_i]["node"]=[G2, G1] + rxns[count].IRC_xtb[conf_i]["TS"]=TSG + rxns[count].IRC_xtb[conf_i]["barriers"]=[barrier1, barrier2] + rxns[count].IRC_xtb[conf_i]["type"]="P_unintended" + elif adj_diff_p2.sum()==0: + rxns[count].IRC_xtb[conf_i]["node"]=[G1, G2] + rxns[count].IRC_xtb[conf_i]["TS"]=TSG + rxns[count].IRC_xtb[conf_i]["barriers"]=[barrier2, barrier1] + rxns[count].IRC_xtb[conf_i]["type"]="R_unintended" + else: + rxns[count].IRC_xtb[conf_i]["node"]=[G1, G2] + rxns[count].IRC_xtb[conf_i]["TS"]=TSG + rxns[count].IRC_xtb[conf_i]["barriers"]=[barrier2, barrier1] + rxns[count].IRC_xtb[conf_i]["type"]="unintended" + return rxns + +def run_opt_by_xtb(rxns, logging_queue, logger): + args=rxns[0].args 
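+    # Pre-optimize the separated reactant/product structures with xTB (via pysisyphus)
+    # before CREST conformer sampling: strategy!=0 covers products, strategy!=1 covers
+    # reactants, so the default strategy=2 optimizes both ends of each reaction.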
+    nprocs=args["nprocs"]
+    scratch=args["scratch"]
+    wf=f"{scratch}/xtb_run"
+    if os.path.isdir(wf) is False: os.mkdir(wf)
+    opt_jobs=dict()
+    for i in rxns:
+        RE=i.reactant.elements
+        PE=i.product.elements
+        RG=i.reactant.geo
+        PG=i.product.geo
+        R_inchi=i.reactant_inchi
+        P_inchi=i.product_inchi
+        R_constraint=return_metal_constraint(i.reactant)
+        P_constraint=return_metal_constraint(i.product)
+        if args['strategy']!=0:
+            if P_inchi not in opt_jobs.keys():
+                wf=f"{scratch}/xtb_run/{P_inchi}"
+                if os.path.isdir(wf) is False: os.mkdir(wf)
+                xyz_write(f"{wf}/{P_inchi}-init.xyz", PE, PG)
+                if args["solvent"]==False:
+                    pysis_job=PYSIS(input_geo=f"{wf}/{P_inchi}-init.xyz", work_folder=wf, jobname=P_inchi, jobtype='opt', charge=args["charge"], multiplicity=args["multiplicity"])
+                else:
+                    if args["low_solvation_model"].lower()=='alpb':
+                        pysis_job=PYSIS(input_geo=f"{wf}/{P_inchi}-init.xyz", work_folder=wf, jobname=P_inchi, jobtype='opt', charge=args["charge"], multiplicity=args["multiplicity"],\
+                                        alpb=args["solvent"])
+                    else:
+                        pysis_job=PYSIS(input_geo=f"{wf}/{P_inchi}-init.xyz", work_folder=wf, jobname=P_inchi, jobtype='opt', charge=args["charge"], multiplicity=args["multiplicity"],\
+                                        gbsa=args["solvent"])
+                pysis_job.generate_input(calctype='xtb', hess=True, hess_step=1)
+                opt_jobs[P_inchi]=pysis_job
+        if args["strategy"]!=1:
+            if R_inchi not in opt_jobs.keys():
+                wf=f"{scratch}/xtb_run/{R_inchi}"
+                if os.path.isdir(wf) is False: os.mkdir(wf)
+                xyz_write(f"{wf}/{R_inchi}-init.xyz", RE, RG)
+                if args["solvent"]==False:
+                    pysis_job=PYSIS(input_geo=f"{wf}/{R_inchi}-init.xyz", work_folder=wf, jobname=R_inchi, jobtype='opt', charge=args["charge"], multiplicity=args["multiplicity"])
+                else:
+                    if args["low_solvation_model"].lower()=='alpb':
+                        pysis_job=PYSIS(input_geo=f"{wf}/{R_inchi}-init.xyz", work_folder=wf, jobname=R_inchi, jobtype='opt', charge=args["charge"], multiplicity=args["multiplicity"],\
+                                        alpb=args["solvent"])
+                    else:
+                        pysis_job=PYSIS(input_geo=f"{wf}/{R_inchi}-init.xyz", work_folder=wf, jobname=R_inchi, jobtype='opt', charge=args["charge"], multiplicity=args["multiplicity"],\
+                                        gbsa=args["solvent"])
+                pysis_job.generate_input(calctype='xtb', hess=True, hess_step=1)
+                opt_jobs[R_inchi]=pysis_job
+    # finished creating pysis jobs; create a process pool
+    opt_job_list=[opt_jobs[ind] for ind in sorted(opt_jobs.keys())]
+    opt_thread=min(nprocs, len(opt_job_list))
+
+    input_job_list=[(opt_job, logging_queue, args["pysis_wt"]) for opt_job in opt_job_list]
+    Parallel(n_jobs=opt_thread)(delayed(run_pysis)(*task) for task in input_job_list)
+
+    # Read in the optimized geometries
+    for opt_job in opt_job_list:
+        if opt_job.optimization_converged(): E, G = opt_job.get_opt_geo()
+        else: continue
+        ind=opt_job.jobname
+        for rxn in rxns:
+            if args["strategy"]!=0:
+                inchi=rxn.product_inchi
+                if ind==inchi:
+                    rxn.product_xtb_opt={"E": E, "G": G}
+                    print(f"product opt, G: {G}\n")
+            if args["strategy"]!=1:
+                inchi=rxn.reactant_inchi
+                if ind==inchi:
+                    rxn.reactant_xtb_opt={"E": E, "G": G}
+                    print(f"reactant opt, G: {G}\n")
+    return rxns
+
+def conf_by_crest(rxns, logging_queue, logger):
+    rxns=run_opt_by_xtb(rxns, logging_queue, logger)
+    chunks=[]
+    args=rxns[0].args
+    nprocs=args["nprocs"]
+    c_nprocs=args["c_nprocs"]
+    scratch_crest=args["scratch_crest"]
+    mem=int(args["mem"])*1000
+    crest_job_list=[]
+    inchi_list=[]
+    thread=nprocs//c_nprocs
+    for rxn in rxns:
+        if args["strategy"]!=0:
+            if rxn.product_inchi not in inchi_list:
+                wf=f"{scratch_crest}/{rxn.product_inchi}"
+                if os.path.isdir(wf) is
False: os.mkdir(wf) + inchi_list.append(rxn.product_inchi) + inp_xyz=f"{wf}/{rxn.product_inchi}.xyz" + if bool(rxn.product_xtb_opt) is False: xyz_write(inp_xyz, rxn.product.elements, rxn.product.geo) + else: xyz_write(inp_xyz, rxn.product_xtb_opt["E"], rxn.product_xtb_opt["G"]) + crest_job=CREST(input_geo=inp_xyz, work_folder=wf, lot=args["lot"], nproc=c_nprocs, mem=mem, quick_mode=args['crest_quick'], opt_level=args['opt_level'],\ + solvent=args['solvent'], solvation_model=args['low_solvation_model'], charge=args['charge'], multiplicity=args['multiplicity'], crest_path=args["crest"]) + if args["crest_quick"]: crest_job.add_command(additional='-rthr 0.1 -ewin 8 ') + crest_job_list.append(crest_job) + if args["strategy"]!=1: + if rxn.reactant_inchi not in inchi_list: + wf=f"{scratch_crest}/{rxn.reactant_inchi}" + if os.path.isdir(wf) is False: os.mkdir(wf) + inchi_list.append(rxn.reactant_inchi) + inp_xyz=f"{wf}/{rxn.reactant_inchi}.xyz" + if bool(rxn.reactant_xtb_opt) is False: xyz_write(inp_xyz, rxn.reactant.elements, rxn.reactant.geo) + else: xyz_write(inp_xyz, rxn.reactant_xtb_opt["E"], rxn.reactant_xtb_opt["G"]) + crest_job=CREST(input_geo=inp_xyz, work_folder=wf, lot=args["lot"], nproc=c_nprocs, mem=mem, quick_mode=args['crest_quick'], opt_level=args['opt_level'],\ + solvent=args['solvent'], solvation_model=args['low_solvation_model'], charge=args['charge'], multiplicity=args['multiplicity'], crest_path=args["crest"]) + if args["crest_quick"]: crest_job.add_command(additional='-rthr 0.1 -ewin 8 ') + crest_job_list.append(crest_job) + input_job_list=[(crest_job, logging_queue) for crest_job in crest_job_list] + Parallel(n_jobs=thread)(delayed(run_crest)(*task) for task in input_job_list) + rxns=read_crest_in_class(rxns, scratch_crest) + return rxns + +def run_ts_opt_by_xtb(rxns, logging_queue, logger): + args=rxns[0].args + conf_output=args["conf_output"] + nprocs=args["nprocs"] + scratch=args["scratch"] + tsopt_jobs=dict() + for count_i, i in enumerate(rxns): + key=[j for j in i.TS_guess.keys()] + for j in key: + rxn_ind=f"{i.reactant_inchi}_{i.id}_{j}" + wf=f"{scratch}/{rxn_ind}" + if os.path.isdir(wf) is False: os.mkdir(wf) + xyz_write(f"{wf}/{rxn_ind}-TSguess.xyz", i.reactant.elements, i.TS_guess[j]) + if args["solvent"] is False: + pysis_job=PYSIS(input_geo=f"{wf}/{rxn_ind}-TSguess.xyz", work_folder=wf, jobname=rxn_ind, jobtype='tsopt', charge=args["charge"], multiplicity=args["multiplicity"]) + else: + if args["low_solvation_model"].lower()=='alpb': + pysis_job=PYSIS(input_geo=f"{wf}/{rxn_ind}-TSguess.xyz", work_folder=wf, jobname=rxn_ind, jobtype='tsopt', charge=args["charge"], multiplicity=args["multiplicity"],\ + alpb=args["solvent"]) + else: + pysis_job=PYSIS(input_geo=f"{wf}/{rxn_ind}-TSguess.xyz", work_folder=wf, jobname=rxn_ind, jobtype='tsopt', charge=args["charge"], multiplicity=args["multiplicity"],\ + gbsa=args["solvent"]) + pysis_job.generate_input(calctype='xtb', hess=True, hess_step=1) + tsopt_jobs[rxn_ind]=pysis_job + + # Create a process pool with gsm_thread processes + tsopt_job_list= [tsopt_jobs[ind] for ind in sorted(tsopt_jobs.keys())] + tsopt_thread = min(nprocs, len(tsopt_job_list)) + + # Run the tasks in parallel + input_job_list = [(tsopt_job, logging_queue, args['pysis_wt']) for tsopt_job in tsopt_job_list] + Parallel(n_jobs=tsopt_thread)(delayed(run_pysis)(*task) for task in input_job_list) + + # check tsopt jobs + tsopt_job_list = check_dup_ts_pysis(tsopt_job_list, logger) + for tsopt_job in tsopt_job_list: + TSE, TSG = tsopt_job.get_final_ts() + 
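+        # The pysis job name encodes "{reactant_inchi}_{reaction_id}_{conformer_index}",
+        # so splitting on "_" below maps each optimized TS back to its reaction entry.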
+        ind=tsopt_job.jobname
+        ind=ind.split('_')
+        inchi, idx, conf_i=ind[0], int(ind[1]), int(ind[2])
+        for count, rxn in enumerate(rxns):
+            if rxn.reactant_inchi in inchi and rxn.id == idx:
+                rxns[count].TS_xtb[conf_i]=TSG
+    return rxns
+
+def run_gsm_by_pysis(rxns, logging_queue):
+    args=rxns[0].args
+    conf_output=args["conf_output"]
+    nprocs=args["nprocs"]
+    scratch=args["scratch"]
+    rxn_folder=[]
+    # write the reaction xyz to conf_output for the following GSM calculations
+    for i in rxns:
+        key=[j for j in i.rxn_conf.keys()]
+        for j in key:
+            rxn_ind=f"{i.reactant_inchi}_{i.id}_{j}"
+            wf=f"{scratch}/{rxn_ind}"
+            rxn_folder.append(wf)
+            if os.path.isdir(wf) is False: os.mkdir(wf)
+            xyz_write(f"{wf}/R.xyz", i.reactant.elements, i.rxn_conf[j]["R"])
+            xyz_write(f"{wf}/P.xyz", i.reactant.elements, i.rxn_conf[j]["P"])
+    gsm_thread=min(nprocs, len(rxn_folder))
+    gsm_jobs={}
+    # preparing and running GSM-xTB
+    for count, rxn in enumerate(rxn_folder):
+        inp_xyz = [f"{rxn}/R.xyz", f"{rxn}/P.xyz"]
+        if not args["solvent"]:
+            gsm_job = PYSIS(inp_xyz, work_folder=rxn, jobname=rxn.split('/')[-1], jobtype="string", coord_type="cart", nproc=nprocs, charge=args["charge"], multiplicity=args["multiplicity"])
+        else:
+            if "alpb" in args["low_solvation_model"].lower():
+                gsm_job = PYSIS(inp_xyz, work_folder=rxn, jobname=rxn.split('/')[-1], jobtype="string", coord_type="cart", nproc=nprocs, charge=args["charge"], multiplicity=args["multiplicity"],\
+                                alpb=args["solvent"])
+            else:
+                gsm_job = PYSIS(inp_xyz, work_folder=rxn, jobname=rxn.split('/')[-1], jobtype="string", coord_type="cart", nproc=nprocs, charge=args["charge"], multiplicity=args["multiplicity"],\
+                                gbsa=args["solvent"])
+        gsm_job.generate_input(calctype="xtb")
+        gsm_jobs[rxn.split('/')[-1]] = gsm_job
+
+    # Create a process pool with gsm_thread processes
+    gsm_job_list = [gsm_jobs[ind] for ind in sorted(gsm_jobs.keys())]
+    # Run the tasks in parallel
+    input_job_list = [(gsm_job, logging_queue) for gsm_job in gsm_job_list]
+    Parallel(n_jobs=gsm_thread)(delayed(run_pysis)(*task) for task in input_job_list)
+    tsopt_jobs={}
+    for count, gsm_job in enumerate(gsm_job_list):
+        if gsm_job.calculation_terminated_normally() is False:
+            print(f'GSM job {gsm_job.jobname} fails to converge, please check this reaction...')
+        elif os.path.isfile(f"{gsm_job.work_folder}/splined_hei.xyz") is True:
+            print(f"GSM job {gsm_job.jobname} has converged!")
+            TSE, TSG=xyz_parse(f"{gsm_job.work_folder}/splined_hei.xyz")
+            # Read the guess TS into the reaction class
+            ind=gsm_job.jobname
+            ind=ind.split('_')
+            inchi, idx, conf_i = ind[0], int(ind[1]), int(ind[2])
+            for count_i, i in enumerate(rxns):
+                if i.reactant_inchi==inchi and i.id==idx:
+                    rxns[count_i].TS_guess[conf_i]=TSG
+    return rxns
+
+def run_gsm_by_xtb(rxns, logging_queue):
+    args=rxns[0].args
+    conf_output=args["conf_output"]
+    nprocs=args["nprocs"]
+    scratch=args["scratch"]
+    # write the reaction xyz to conf_output for the following GSM calculations
+    for i in rxns:
+        key=[j for j in i.rxn_conf.keys()]
+        for j in key:
+            name=f"{conf_output}/{i.reactant_inchi}_{i.id}_{j}.xyz"
+            write_reaction(i.reactant.elements, i.rxn_conf[j]["R"], i.rxn_conf[j]["P"], filename=name)
+    rxn_confs=[rxn for rxn in os.listdir(conf_output) if rxn[-4:]=='.xyz']
+    gsm_thread=min(nprocs, len(rxn_confs))
+    gsm_jobs={}
+
+    # preparing and running GSM-xTB
+    for count, rxn in enumerate(rxn_confs):
+        rxn_ind = rxn.split('.xyz')[0]
+        wf = f"{scratch}/{rxn_ind}"
+        if os.path.isdir(wf) is False: os.mkdir(wf)
+        inp_xyz = f"{conf_output}/{rxn}"
+        gsm_job = 
GSM(input_geo=inp_xyz,input_file=args['gsm_inp'],work_folder=wf,method='xtb', lot=args["lot"], jobname=rxn_ind, jobid=count, charge=args['charge'],\ + multiplicity=args['multiplicity'], solvent=args['solvent'], solvation_model=args['low_solvation_model']) + gsm_job.prepare_job() + gsm_jobs[rxn_ind] = gsm_job + + # Create a process pool with gsm_thread processes + gsm_job_list = [gsm_jobs[ind] for ind in sorted(gsm_jobs.keys())] + # Run the tasks in parallel + input_job_list = [(gsm_job, logging_queue) for gsm_job in gsm_job_list] + Parallel(n_jobs=gsm_thread)(delayed(run_gsm)(*task) for task in input_job_list) + tsopt_jobs={} + for count, gsm_job in enumerate(gsm_job_list): + if gsm_job.calculation_terminated_normally() is False: + print(f'GSM job {gsm_job.jobname} fails to converge, please check this reaction...') + elif gsm_job.find_correct_TS() is False: + print(f"GSM job {gsm_job.jobname} fails to locate a TS, skip this reaction...") + else: + TSE, TSG=gsm_job.get_TS() + # Read guess TS into reaction class + ind=gsm_job.jobname + ind=ind.split('_') + inchi, idx, conf_i = ind[0], int(ind[1]), int(ind[2]) + for count_i, i in enumerate(rxns): + if i.reactant_inchi==inchi and i.id==idx: + rxns[count_i].TS_guess[conf_i]=TSG + return rxns + +def select_rxn_conf(rxns, logging_queue): + args=rxns[0].args + conf_output=args["conf_output"] + nprocs=args["nprocs"] + if os.path.isdir(conf_output) is True and len(os.listdir(conf_output))>0: + print("Reaction conformation sampling has already been done in the target folder, skip this step...") + else: + + thread=min(nprocs, len(rxns)) + chunk_size=len(rxns)//thread + remainder=len(rxns)%thread + input_data_list=[(rxn, logging_queue) for rxn in rxns] + chunks=[] + startidx=0 + for i in range(thread): + endidx=startidx+chunk_size+(1 if i < remainder else 0) + chunks.append(input_data_list[startidx:endidx]) + startidx=endidx + Parallel(n_jobs=thread)(delayed(generate_rxn_conf)(chunk) for chunk in chunks) + #rxns=modified_rxns + + #for i in rxns: i.rxn_conf_generate(logging_queue) + print(f"Finish generating reaction conformations, the output conformations are stored in {conf_output}\n") + return rxns + +def conf_crest(rxns, logging_queue): + chunks=[] + input_rxns=[] + args=rxns[0].args + nprocs=args["nprocs"] + c_nprocs=args["c_nprocs"] + scratch_crest=args["scratch_crest"] + mem = int(args['mem'])*1000 + for count_i, i in enumerate(rxns): input_rxns.append((count_i, i, args)) + thread = min(nprocs, len(input_rxns)) + chunk_size= len(input_rxns) // thread + remainder = len(input_rxns) % thread + startidx=0 + for i in range(thread): + endidx = startidx + chunk_size + (1 if i < remainder else 0) + chunks.append(input_rxns[startidx:endidx]) + startidx = endidx + all_job_mappings = Parallel(n_jobs=thread)(delayed(process_input_rxn)(chunk) for chunk in chunks) + job_mappings = merge_job_mappings(all_job_mappings) + # print("Finish initialization") + # print(job_mappings) + crest_thread=nprocs//c_nprocs + track_crest={} + crest_job_list=[] + for inchi, jobi in job_mappings.items(): + wf=f"{scratch_crest}/{inchi}" + if os.path.isdir(wf) is False: os.mkdir(wf) + inp_xyz=f"{wf}/{inchi}.xyz" + xyz_write(inp_xyz, jobi["E"], jobi['G']) + crest_job=CREST(input_geo=inp_xyz, work_folder=wf, lot=args["lot"], nproc=c_nprocs, mem=mem, quick_mode=args['crest_quick'], opt_level=args['opt_level'],\ + solvent=args['solvent'], solvation_model=args['low_solvation_model'], charge=args['charge'], multiplicity=args['multiplicity']) + if args['crest_quick']: 
crest_job.add_command(additional='-rthr 0.1 -ewin 8 ')
+        crest_job_list.append(crest_job)
+        for jobid in jobi['jobs']: track_crest[jobid]=crest_job
+    input_job_list=[(crest_job, logging_queue) for crest_job in crest_job_list]
+    Parallel(n_jobs=crest_thread)(delayed(run_crest)(*task) for task in input_job_list)
+    rxns=read_crest_in_class(rxns, scratch_crest)
+    return rxns
+
+def read_crest_in_class(rxns, scratch_crest):
+    conf_inchi=[inchi for inchi in os.listdir(scratch_crest) if os.path.isdir(scratch_crest+'/'+inchi)]
+    for i in conf_inchi:
+        elements, geos = xyz_parse(f"{scratch_crest}/{i}/crest_conformers.xyz", multiple=True)
+        for count_j, j in enumerate(rxns):
+            if j.product_inchi in i:
+                for count_k, k in enumerate(geos):
+                    rxns[count_j].product_conf[count_k]=k
+            if j.reactant_inchi in i:
+                for count_k, k in enumerate(geos):
+                    rxns[count_j].reactant_conf[count_k]=k
+    return rxns
+
+def run_enumeration(input_mol, args=dict()):
+    nb=args["n_break"]
+    form_all=args["form_all"]
+    criteria=args["lewis_criteria"]
+    reactant=yp.yarpecule(input_mol)
+    mol=yp.yarpecule(input_mol)
+    print("Do the reaction enumeration on molecule: {} ({})".format(mol.hash,input_mol))
+    name=input_mol.split('/')[-1].split('.')[0]
+    # break bonds
+    break_mol=list(yp.break_bonds(mol, n=nb))
+
+    if form_all: products=yp.form_bonds_all(break_mol)
+    else: products=yp.form_n_bonds(break_mol, n=nb)
+
+    products=[_ for _ in products if _.bond_mat_scores[0]<=criteria and sum(np.abs(_.fc))<2.0]
+    product=[]
+    for _ in products:
+        if _.rings!=[]:
+            if len(_.rings[0])>4: product.append(_)
+        else: product.append(_)
+    products=product
+    print(f"{len(products)} cleaned products after find_lewis() filtering")
+    rxn=[]
+    for count_i, i in enumerate(products):
+        R=reaction(reactant, i, args=args, opt=True)
+        rxn.append(R)
+    return rxn
+
+def read_rxns(input_mol, args={}):
+    print(f"Read in reaction: {input_mol}")
+    elements, geo= xyz_parse(input_mol, multiple=True)
+    xyz_write(".tmp_R.xyz", elements[0], geo[0])
+    reactant=yp.yarpecule(".tmp_R.xyz", canon=False)
+    os.system('rm .tmp_R.xyz')
+    xyz_write(".tmp_P.xyz", elements[1], geo[1])
+    product=yp.yarpecule(".tmp_P.xyz", canon=False)
+    os.system('rm .tmp_P.xyz')
+    R=reaction(reactant, product, args=args, opt=False)
+    return R
+
+def write_reaction(elements, RG, PG, filename="reaction.xyz"):
+    out=open(filename, 'w+')
+    out.write("{}\n\n".format(len(elements)))
+    for count_i, i in enumerate(elements):
+        i=i.capitalize()
+        out.write("{:<20s} {:< 20.8f} {:< 20.8f} {:< 20.8f}\n".format(i, RG[count_i][0], RG[count_i][1], RG[count_i][2]))
+    out.write("{}\n\n".format(len(elements)))
+    for count_i, i in enumerate(elements):
+        i=i.capitalize()
+        out.write("{:<20s} {:< 20.8f} {:< 20.8f} {:< 20.8f}\n".format(i, PG[count_i][0], PG[count_i][1], PG[count_i][2]))
+    out.close()
+    return
+
+def write_reaction_yp(R, P, filename="reaction.xyz"):
+    out=open(filename, 'w+')
+    out.write('{}\n'.format(len(R.elements)))
+    out.write('q {}\n'.format(R.q))
+    for count_i, i in enumerate(R.elements):
+        if len(i)>1:
+            i=i.capitalize()
+            out.write("{:<20s} {:< 20.8f} {:< 20.8f} {:< 20.8f}\n".format(i, R.geo[count_i][0], R.geo[count_i][1], R.geo[count_i][2]))
+        else: out.write("{:<20s} {:< 20.8f} {:< 20.8f} {:< 20.8f}\n".format(i.upper(), R.geo[count_i][0], R.geo[count_i][1], R.geo[count_i][2]))
+    out.write('{}\n'.format(len(P.elements)))
+    out.write('q {}\n'.format(P.q))
+    for count_i, i in enumerate(P.elements):
+        if len(i)>1:
+            i=i.capitalize()
+            out.write("{:<20s} {:< 20.8f} {:< 20.8f} {:< 20.8f}\n".format(i, P.geo[count_i][0], P.geo[count_i][1], P.geo[count_i][2]))
+        else: out.write("{:<20s} {:< 20.8f} {:< 20.8f} {:< 20.8f}\n".format(i.upper(), P.geo[count_i][0], P.geo[count_i][1], P.geo[count_i][2]))
+    out.close()
+    return
+
+if __name__=="__main__":
+    parameters = sys.argv[1]
+    parameters = yaml.load(open(parameters, "r"), Loader=yaml.FullLoader)
+    main(parameters)
diff --git a/pyTEST_Example/model_reaction.py b/pyTEST_Example/model_reaction.py
new file mode 100644
index 0000000..8f312f8
--- /dev/null
+++ b/pyTEST_Example/model_reaction.py
@@ -0,0 +1,513 @@
+# this program handles model reaction problems; created by Hsuan-Hao Hsu (hsu205@purdue.edu).
+import sys, itertools, timeit, os, copy, math
+from itertools import combinations
+from openbabel import pybel
+from openbabel import openbabel as ob
+from collections import Counter
+import numpy as np
+import yarp as yp
+import yaml, fnmatch, pickle
+import scipy
+# from sklearn.preprocessing import normalize
+from yarp.taffi_functions import graph_seps,table_generator,return_rings,adjmat_to_adjlist,canon_order
+from yarp.properties import el_to_an,an_to_el,el_mass, el_radii
+from yarp.find_lewis import find_lewis,return_formals,return_n_e_accept,return_n_e_donate,return_connections,return_bo_dict
+from yarp.hashes import atom_hash,yarpecule_hash
+from yarp.input_parsers import xyz_parse,xyz_q_parse,xyz_from_smiles, mol_parse
+from yarp.misc import merge_arrays, prepare_list
+from rdkit import Chem
+from rdkit.Chem import EnumerateStereoisomers, AllChem, TorsionFingerprints, rdmolops, rdDistGeom
+from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions
+from rdkit.ML.Cluster import Butina
+from math import cos, sin
+from wrappers.reaction import *
+from utils import *
+from main_xtb import initialize
+
+def main(args:dict):
+    #args, logger=initialize(args)
+    args, logger, logging_queue=initialize(args)
+
+    if os.path.isfile(args["input"]) and fnmatch.fnmatch(args["input"], '*.smi'): # Read smiles in
+        mol=[i.split('\n')[0] for i in open(args["input"], 'r+').readlines()]
+    elif os.path.isfile(args["input"]) and fnmatch.fnmatch(args["input"], '*.xyz'):
+        mol=[args["input"]]
+    else:
+        mol=[args["input"]+"/"+i for i in os.listdir(args["input"]) if fnmatch.fnmatch(i, '*.xyz') or fnmatch.fnmatch(i, '*.mol')]
+    radical=[]
+    #print(mol)
+
+    for i in mol:
+        print("Generate uniradical for {}".format(i))
+        reactant=yp.yarpecule(i)
+        # check whether the reactant is a radical
+        heavy=0
+        CHNO=1
+        for _ in reactant.elements:
+            if _!="H" and _!="h": heavy+=1
+            if _!="H" and _!="h" and _!="C" and _!="c" and _!="N" and _!="n" and _!="O" and _!="o": CHNO=0
+        if CHNO==0: continue
+        if heavy>20: continue
+        bmat=reactant.bond_mats[0]
+        is_rad=0
+        for j in range(len(bmat)):
+            is_rad+=int(bmat[j, j])%2
+        if sum(np.abs(reactant.fc))>0:
+            print("Ionic species. 
Removing....") + continue + elif is_rad==1: + print("{} is a uni-radical.".format(i)) + radical.append(reactant) + elif is_rad==0: + tmp=generate_uniradical(reactant) + for _ in tmp: radical.append(_) + # running enumerations + print(f"We have {len(radical)} radicals.") + with open("radicals.p", "wb") as f: + pickle.dump(radical, f) + + + radical=pickle.load(open("radicals.p", "rb")) + + #radicals=[] + #for _ in radical: + # heavy=0 + # for i in _.elements: + # if i!="H" or i!="h": heavy+=1 + # if heavy<20: + # radicals.append(_) + #radical=[] + #radical=radicals + #reactions=[] + # print(radical) + # print(len(radical)) + # exit() + """ + count=0 + break_mol=list(yp.break_bonds(radical[0], n=1)) + products=yp.form_n_bonds(break_mol, n=1, def_only=True) + products=[_ for _ in products if _.bond_mat_scores[0]<=0.0] + for j in products: reactions.append(reaction(radical[0], j, args=args, opt=True)) + reactions=[i for i in reactions if "Error" not in i.product_inchi and "Error" not in i.reactant_inchi] + MR_rxns, MR_dict=create_model_reactions(reactions) + print(len(reactions)) + print(len(MR_rxns)) + exit() + + count=0 + + reactions=[] + for count_i, i in enumerate(radical): + break_mol=list(yp.break_bonds(i, n=args["n_break"])) + products=yp.form_n_bonds(break_mol, n=args["n_break"], def_only=True) + products=[_ for _ in products if _.bond_mat_scores[0]<=0.0] + for j in products: + reactions.append(reaction(i, j, args=args, opt=True)) + reactions=[_ for _ in reactions if "Error" not in i.product_inchi and "Error" not in i.reactant_inchi] + with open("All_true.p", "wb") as f: pickle.dump(reactions, f) + MR_rxns, MR_dict=create_model_reactions(reactions) + # with open("All_true.p", "wb") as f: pickle.dump(reactions, f) + with open("All_model.p", "wb") as f: pickle.dump(MR_rxns, f) + with open("All_dict.p", "wb") as f: pickle.dump(MR_dict, f) + """ + count=0 + while 1: + reactions=[] + # if count>=5: break + if count+10>len(radical): bound=len(radical) + else: bound=count+10 + if 0: + #if os.path.isfile(f"true_{count}_{bound}.p") is True: + reactions=pickle.load(open(f"true_{count}_{bound}.p", "rb")) + else: + for i in range(count, bound): + #print("RADICAL") + #for count_e, _ in enumerate(radical[i].geo): + # print(f"{radical[i].elements[count_e]} {_[0]} {_[1]} {_[2]}") + react=[] + rad=[] + for count_b, _ in enumerate(radical[i].bond_mats[0]): + if _[count_b]%2==0: react.append(count_b) + else: rad.append(count_b) + neb=[] + for _ in rad: + for count_bd, bd in enumerate(radical[i].adj_mat[_]): + if bd: neb.append(count_bd) + react=[_ for _ in react if _ not in neb] + break_mol=list(break_H_bonds(radical[i], react=react)) + #break_mol=list(yp.break_bonds(radical[i], n=args["n_break"])) + #products=yp.form_n_bonds(break_mol, n=args["n_break"], def_only=True) + products=form_radical_bonds(break_mol) + products=[_ for _ in products if _.bond_mat_scores[0]<=0.0] + for j in products: + print("Create reaction class....") + if np.sum(abs(radical[i].adj_mat-j.adj_mat))==0: continue + product_geo=opt_geo(j.elements, j.geo, j.bond_mats[0]) + if product_geo!=[]: + j.geo=product_geo + reactions.append(reaction(radical[i], j, args=args, opt=False)) + reactions=[i for i in reactions if "ERROR" not in i.product_inchi and "ERROR" not in i.reactant_inchi] + reactions=[i for i in reactions if i.product_inchi!=i.reactant_inchi] + with open(f"MR/true_{count}_{bound}.p", "wb") as f: + print(f"MR/true_{count}_{bound}.p") + pickle.dump(reactions,f) + print("Create model reaction....") + MR_rxns, 
MR_dict=create_model_reactions(reactions) + #print(len(reactions)) + #print(len(MR_rxns)) + #print(f"model_{count}_{bound}.p") + with open(f"MR/model_{count}_{bound}.p", "wb") as f: + pickle.dump(MR_rxns, f) + with open(f"MR/MRdict_{count}_{bound}.p", "wb") as f: + pickle.dump(MR_dict, f) + if count+10>=len(radical): break + count=count+10 + #break + #if count>=len(radical):break + """ + for i in radical: + break_mol=list(yp.break_bonds(i, n=args["n_break"])) + products=yp.form_n_bonds(break_mol, n=args["n_break"], def_only=True) + products=[_ for _ in products if _.bond_mat_scores[0]<=3.0] + for j in products: reactions.append(reaction(i, j, args=args, opt=True)) + with open("true_rxns.p", "wb") as f: + pickle.dump(reactions, f) + MR_rxns, MR_dict=create_model_reactions(reactions) + #for rxns in MR_rxns: + # for count_i, i in enumerate(rxns.reactant.elements): + # print(f"{i} {rxns.reactant.geo[count_i][0]} {rxns.reactant.geo[count_i][1]} {rxns.reactant.geo[count_i][2]}") + # print("\n") + with open(args["reaction_data"], "wb") as f: + pickle.dump(MR_rxns, f) + with open("MR_dict.p", "wb") as f: + pickle.dump(MR_dict, f) + """ + return + +def create_model_reactions(reaction): + # this function is given the set of true reaction and return a set of model reactions. + # this function will generate one list and one dictionary. + # the list will store all model reactions + # the dictionary will store the infomation between model reaction and true reaction. + MR_list=[] + MR_dict=dict() + for rxn in reaction: + MR=return_model_rxn(rxn) + if MR==[]: continue + if MR.hash not in MR_dict.keys(): + MR_dict[MR.hash]=[rxn.hash] + MR_list.append(MR) + else: + MR_dict[MR.hash].append(rxn.hash) + return MR_list, MR_dict + +def generate_uniradical(reactant): + # give a neutral species, this function will remove a hydrogen atom and generate a list of uni-radical + # find hydrogen atom first + hydrogen=[] + for count, atom in enumerate(reactant.elements): + if atom.lower()=='h': hydrogen.append(count) + #print(hydrogen) + product=list(break_H_bonds(reactant, react=hydrogen)) + # remove the hydrogen radical in reactant and product list + #print(len(product)) + #print(len(product)) + #for prod in product: + # print(len(prod.elements)) + # for count, i in enumerate(prod.geo): + # print(f"{prod.elements[count]} {i[0]} {i[1]} {i[2]}") + # print("\n") + radicals=[] + for prod in product: + elements=[] + P_geo=[] + #prod=geometry_opt(prod) + #print(prod.adj_mat) + for count_i, i in enumerate(range(len(prod.adj_mat))): + if prod.adj_mat[count_i].sum()==0: # the separate hydrogen atom + continue + else: + elements.append(prod.elements[count_i]) + P_geo.append(prod.geo[count_i]) + out=open(".tmp_P.xyz", "w+") + out.write(f"{len(elements)}\n\n") + for count_i, i in enumerate(P_geo): + out.write(f"{elements[count_i]} {i[0]} {i[1]} {i[2]}\n") + #print(f"{elements[count_i]} {i[0]} {i[1]} {i[2]}") + out.close() + P=yp.yarpecule(".tmp_P.xyz", canon=False) + os.system("rm .tmp_P.xyz") + radicals.append(P) + return radicals + +def return_model_rxn(rxn, depth=1): + # This function is written by Hsuan-Hao Hsu (hsu205@purdue.edu) + # Read in a true reaction and return a reaction class of model reaction + elements=rxn.reactant.elements + R_geo=rxn.reactant.geo + P_geo=rxn.product.geo + R_adj=rxn.reactant.adj_mat + P_adj=rxn.product.adj_mat + R_bond=rxn.reactant.bond_mats[0] + P_bond=rxn.product.bond_mats[0] + BE_change=P_bond-R_bond + adj_change=P_adj-R_adj + bn, fm, _=return_bnfm(BE_change) + bond_change, 
reactive_atoms=return_adj_change(adj_change) + gs=graph_seps(R_adj) + keep_idx=list(reactive_atoms) + edge_idx=[] # the atom we need to add hydrogens + for i in bond_change: + if i[0] not in keep_idx: keep_idx.append(i[0]) + if i[1] not in keep_idx: keep_idx.append(i[1]) + for count_j, j in enumerate(gs[i[0]]): + if j>0 and j<=depth and count_j not in keep_idx: keep_idx.append(count_j) + if j>0 and j==depth and count_j not in reactive_atoms and count_j not in edge_idx: edge_idx.append(count_j) + for count_j, j in enumerate(gs[i[1]]): + if j>0 and j<=depth and count_j not in keep_idx: keep_idx.append(count_j) + if j>0 and j==depth and count_j not in reactive_atoms and count_j not in edge_idx: edge_idx.append(count_j) + # keep_idx stores the info of the atoms we want to keep + # next step is adding hydrogens at the edge atom + # print(keep_idx) + # print(edge_idx) + new_R_E, new_R_geo, new_P_geo=return_model_geo(elements, R_geo, R_bond, BE_change, keep_idx, edge_idx) + if len(new_P_geo)<=2 or len(new_R_geo)<=2: + print("Failed to optimize geometry for model reaction.") + return [] + xyz_write(".tmp_R.xyz", new_R_E, new_R_geo) + reactant=yp.yarpecule(".tmp_R.xyz", canon=False) + os.system("rm .tmp_R.xyz") + xyz_write(".tmp_P.xyz", new_R_E, new_P_geo) + product=yp.yarpecule(".tmp_P.xyz", canon=False) + os.system("rm .tmp_P.xyz") + R=reaction(reactant, product, args=rxn.args, opt=True) + return R + +def return_model_geo(elements, geo, bondmat, BE_change, keep_idx, edge_idx): + # this function will generate the geometry for model rxn + new_E, new_geo, new_edge, new_bondmat, numbond, new_BE_change=[], [], [], [], [], [] + for count_i, i in enumerate(elements): + tmp=0 + if count_i in keep_idx: + if count_i in edge_idx: new_edge.append(len(new_E)) + new_E.append(i) + new_geo.append(geo[count_i]) + new_bondmat.append([j for count_j, j in enumerate(bondmat[count_i]) if count_j in keep_idx]) + new_BE_change.append([j for count_j, j in enumerate(BE_change[count_i]) if count_j in keep_idx]) + for count_j, j in enumerate(bondmat[count_i]): + if count_j != count_i: tmp+=j + numbond.append(tmp) + for i in new_edge: + # add hydrogen to the edge atoms + tot_bond=0 + for count_j, j in enumerate(new_bondmat[i]): + if count_j != i: tot_bond+=j + num_add_hydrogen=int(numbond[i]-tot_bond) + #print(num_add_hydrogen) + if num_add_hydrogen > 0: + bond_length=el_radii[new_E[i]]+el_radii["H"] + for j in range(num_add_hydrogen): + new_E.append("H") + #const=2*3.1415926/num_add_hydrogen*float(j) + #argu=1 + #cycle=0 + connect_ids=[count_k for count_k, k in enumerate(new_bondmat[i]) if count_k!=i and k>=1] + if len(connect_ids)==1: # don't need to use cross + vec=[new_geo[i][0]-new_geo[connect_ids[0]][0],new_geo[i][1]-new_geo[connect_ids[0]][1], new_geo[i][2]-new_geo[connect_ids[0]][2]] + #print(vec) + vec=vec/np.linalg.norm(vec) + if vec[2]>1E-6: imag_vec=np.array([1.0, 1.0, -(vec[0]+vec[1])/vec[2]]) + else: imag_vec=np.array([1.0, 1.0, 0.0]) + rotate=rotate_matrix(imag_vec, 1.0/6.0*math.pi+np.random.rand()/math.pi) + vec=np.dot(rotate, vec) + vec=vec/np.linalg.norm(vec) + #print(vec) + new_coord=new_geo[i]+vec*bond_length + else: + vecs=[] + # print(connect_ids) + for k in connect_ids: + vec=[new_geo[k][0]-new_geo[i][0], new_geo[k][1]-new_geo[i][1], new_geo[k][2]-new_geo[i][2]] + vecs.append(vec/np.linalg.norm(vec)) + for k in range(len(connect_ids)-1): + if k==0: + vec=np.cross(vecs[k], vecs[k+1]) + while np.linalg.norm(vec)<1E-6: # two vectors are parallel (sin(theta)=0) + vec=np.cross(vecs[k], 
vecs[k+1]+0.1*np.random.rand(3)) + vec=vec/np.linalg.norm(vec) + else: + vec=np.cross(vec, vecs[k+1]) + while np.linalg.norm(vec)<1E-6: + vec=np.cross(vec, vecs[k+1]+0.1*np.random.rand(3)) + vec=vec/np.linalg.norm(vec) + #vec=vec/np.linalg.norm(vec) + goal=1 + cycle=0 + while goal: + cycle+=1 + dot_product=[] + for k in vecs: + dot_product.append(np.dot(vec, k)) + if max(dot_product)>0.95: # Too close + goal=1 + abs_dot=[k for k in dot_product] + axis=vecs[abs_dot.index(min(abs_dot))] + deg=0.5*math.pi + rotate=rotate_matrix(axis, deg) + #print(cycle) + #print(rotate) + vec=np.dot(rotate, vec) + if max(dot_product)<0.95 or cycle>5: + goal=0 + #for k in vecs: + # if (np.dot(vec,k))>0.95: print("Close") + new_coord=new_geo[i]+vec*bond_length+0.01*np.random.rand(3) + #print(vecs) + #print(vec) + new_geo.append(new_coord) + ''' + while argu or cycle<10: + if cycle<3: vec=[bond_length*cos(const), bond_length*sin(const), 0.0] + elif cycle<6: vec=[0.0, bond_length*cos(const), bond_length*sin(const)] + else: vec=[bond_length*cos(const), 0.0, bond_length*sin(const)] + new_coord=new_geo[i]+[np.random.random()*0.01, np.random.random()*0.01, np.random.random()*0.01]+vec + argu=0 + cycle=cycle+1 + for old_coord in new_geo: + dist=(old_coord[0]-new_coord[0])**2.0+(old_coord[1]-new_coord[1])**2.0+(old_coord[2]-new_coord[2])**2.0 + dist=dist**0.5 + #print(dist) + if dist<0.2: argu=1 + new_geo.append(new_coord) + ''' + for count_k, k in enumerate(new_bondmat): + if count_k != i: + new_bondmat[count_k].append(0) + new_BE_change[count_k].append(0) + elif count_k == i: + new_bondmat[count_k].append(1) + new_BE_change[count_k].append(0) + bond_h=[] + change=[] + for count_k, k in enumerate(new_bondmat[0]): + if count_k != i: + bond_h.append(0) + change.append(0) + else: + bond_h.append(1) + change.append(0) + new_bondmat.append(bond_h) + new_BE_change.append(change) + if len(new_E)<=1: + return [], [], [] + new_bondmat=np.asarray(new_bondmat) + #print(new_E) + #print(new_bondmat) + # new_geo=opt_geo(new_E, new_geo, new_bondmat) + # exit() + try: + #print("A") + #print("Model Reactant") + #for count_i, i in enumerate(new_geo): + # print(f"{new_E[count_i]} {i[0]} {i[1]} {i[2]}") + + new_geo=opt_geo(new_E, new_geo, new_bondmat) + #print("After OPT") + #for count_i, i in enumerate(new_geo): + # print(f"{new_E[count_i]} {i[0]} {i[1]} {i[2]}") + except: + return [], [], [] + try: + #print("Model Product") + new_BE_change=np.asarray(new_BE_change) + new_bondmat=new_bondmat+new_BE_change + + #for count_i, i in enumerate(new_geo): + # print(f"{new_E[count_i]} {i[0]} {i[1]} {i[2]}") + #print("B") + + new_change_geo=opt_geo(new_E, new_geo, new_bondmat) + except: + return [], [], [] + return new_E, new_geo, new_change_geo + +def rotate_matrix(axis, deg): + rotate_matrix=scipy.linalg.expm(np.cross(np.eye(3), axis/scipy.linalg.norm(axis)*deg)) + return rotate_matrix + +def return_bnfm(bondmat): + break_bond=[] + form_bond=[] + reactive_atoms=[] + for i in range(len(bondmat)): + for j in range(len(bondmat)): + if i>j: + if bondmat[i][j]>0: + form_bond+=[(i, j)] + if i not in reactive_atoms: reactive_atoms.append(i) + if j not in reactive_atoms: reactive_atoms.append(j) + elif bondmat[i][j]<0: + break_bond+=[(i, j)] + if i not in reactive_atoms: reactive_atoms.append(i) + if j not in reactive_atoms: reactive_atoms.append(j) + return break_bond, form_bond, reactive_atoms + +def return_adj_change(adjmat): + keep_idx=[] + reactive_atoms=[] + for i in range(len(adjmat)): + for j in range(len(adjmat)): + if i > j: + if 
adjmat[i][j]!=0:
+                    keep_idx+=[(i, j)]
+                    reactive_atoms.append(i)
+                    reactive_atoms.append(j)
+    return keep_idx, reactive_atoms
+
+def break_H_bonds(mol, react=[]):
+    hashes=set([])
+    bonds=[(count_r, count_c) for count_r, row in enumerate(mol.adj_mat) for count_c, col in enumerate(row) if (count_r in react and col>0)]
+    for b in bonds:
+        adj_mat=copy.copy(mol.adj_mat)
+        adj_mat[b[0], b[1]]=0
+        adj_mat[b[1], b[0]]=0
+        bmat=copy.copy(mol.bond_mats[0])
+        bmat[b[0], b[1]]-=1
+        bmat[b[1], b[0]]-=1
+        bmat[b[0], b[0]]+=1
+        bmat[b[1], b[1]]+=1
+        tmp=yp.yarpecule((adj_mat, mol.geo, mol.elements, mol.q), canon=False)
+        tmp.bond_mats[0]=bmat
+        if tmp.hash not in hashes:
+            yield tmp
+            hashes.add(tmp.hash)
+
+def form_radical_bonds(mol):
+    yarpecules=prepare_list(mol)
+    hashes=set([_.hash for _ in yarpecules])
+    for y in yarpecules:
+        react=[]
+        for count, _ in enumerate(y.bond_mats[0]):
+            if _[count]%2!=0: react.append(count)
+        if len(react)!=3: continue
+        form_bonds=[(react[0], react[1]), (react[0], react[2]), (react[1], react[2])]
+        for i in form_bonds:
+            adj_mat=copy.copy(y.adj_mat)
+            bmat=copy.copy(y.bond_mats[0])
+            adj_mat[i[0], i[1]]=1
+            adj_mat[i[1], i[0]]=1
+            bmat[i[0], i[1]]+=1
+            bmat[i[1], i[0]]+=1
+            bmat[i[0], i[0]]-=1
+            bmat[i[1], i[1]]-=1
+            tmp=yp.yarpecule((adj_mat, y.geo, y.elements, y.q), canon=False)
+            tmp.bond_mats[0]=bmat
+            if tmp.hash not in hashes:
+                yield tmp
+                hashes.add(tmp.hash)
+
+if __name__=="__main__":
+    parameters = sys.argv[1]
+    parameters = yaml.load(open(parameters, "r"), Loader=yaml.FullLoader)
+    main(parameters)
diff --git a/pyTEST_Example/parameters.yaml b/pyTEST_Example/parameters.yaml
new file mode 100644
index 0000000..f3168ce
--- /dev/null
+++ b/pyTEST_Example/parameters.yaml
@@ -0,0 +1,48 @@
+# Note that a bimolecular reaction is best supplied as a .xyz or .mol file
+# Enumeration part
+#input: /scratch/negishi/li1724/110124-YARP-Github/private_YARP/pyTEST_Example//reaction_xyz/DA.xyz # (1) The input folder with .xyz or .mol files (2) a text file with multiple smiles
+input: reaction_xyz # (1) The input folder with .xyz or .mol files (2) a text file with multiple smiles
+scratch: /scratch/negishi/li1724/110124-YARP-Github/private_YARP/pyTEST_Example//RESULT/ # The output folder with reaction geometries for constructing the reaction network.
+reaction_data: DA.p
+n_break: 1 # Number of bonds to break
+form_all: False # Form all possible bonds (matters for lone-pair electrons on oxygen, sulfur, and similar atoms) (default: 0)
+lewis_criteria: 5.0 # the criteria to find the products
+ff: uff # force field for driving the reaction coordinate
+crest: crest
+lot: gfn2
+crest_quick: False
+xtb: xtb
+method: crest
+enumeration: False # if you only have a reactant, you will need to do enumeration. Otherwise, you need to provide the reaction geometry.
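+# Usage sketch: each driver script takes this file as its single command-line argument,
+# e.g. `python main_xtb.py parameters.yaml`; keys omitted here fall back to the defaults
+# assigned in initialize() (e.g. n_conf=3, nprocs=1, method=crest).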
+n_conf: 5 +nconf_dft: 5 +strategy: 2 +nprocs: 1 +c_nprocs: 1 +mem: 1 # in GB +opt_level: vtight +opt: False +pysis_wt: 3600 +select: network +charge: 0 +multiplicity: 1 +skip_low_IRC: False +skip_low_TS: False +constrained_TS: True +model_path: /scratch/negishi/li1724/110124-YARP-Github/private_YARP/pyTEST_Example//bin +gsm_inp: /scratch/negishi/li1724/110124-YARP-Github/private_YARP/pyTEST_Example//bin/inpfileq + +package: ORCA # Gaussian or ORCA +dft_nprocs: 4 +dft_lot: UHF 6-31G +dft_wt: 4 +ppn: 8 +partition: standby +dft_njobs: 1 +hess_recalc: 3 +dft_irc: True +backward_DE: False +dielectric: 95.3 +solvation: CPCM/read diff --git a/pyTEST_Example/qc_jobs.py b/pyTEST_Example/qc_jobs.py new file mode 100644 index 0000000..ed84ef3 --- /dev/null +++ b/pyTEST_Example/qc_jobs.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +import logging +import time + +import multiprocessing as mp +from multiprocessing import Queue +from logging.handlers import QueueHandler +from concurrent.futures import ProcessPoolExecutor, TimeoutError + +def run_xtb(xtb_job, logging_queue): + ''' subprocess for running xtb in parallel ''' + # set up logger + logger = logging.getLogger("main") + # Add handler only if it doesn't already exist + if not logger.hasHandlers(): + logger.addHandler(QueueHandler(logging_queue)) + logger.setLevel(logging.INFO) + + if xtb_job.calculation_terminated_normally(): + print(f"XTB job {xtb_job.jobname.split()[-1]} has already finished, skipping this job...") + logger.info(f"XTB job {xtb_job.jobname.split()[-1]} has already finished, skipping this job...") + else: + print(f"running XTB job {xtb_job.jobname.split()[-1]} on PID {mp.current_process().pid}") + logger.info(f"running XTB job {xtb_job.jobname.split()[-1]} on PID {mp.current_process().pid}") + xtb_job.execute() + +def run_crest(crest_job, logging_queue): + ''' subprocess for running crest in parallel ''' + # set up logger + logger = logging.getLogger("main") + # Add handler only if it doesn't already exist + if not logger.hasHandlers(): + logger.addHandler(QueueHandler(logging_queue)) + logger.setLevel(logging.INFO) + + if crest_job.calculation_terminated_normally(): + print(f"CREST job {crest_job.jobname} has already finished, skipping this job...") + logger.info(f"CREST job {crest_job.jobname} has already finished, skipping this job...") + else: + print(f"running CREST job {crest_job.jobname} on PID {mp.current_process().pid}") + logger.info(f"running CREST job {crest_job.jobname} on PID {mp.current_process().pid}") + result = crest_job.execute() + if result.returncode == 0: + print(f"CREST job {crest_job.jobname} is finished.") + logger.info(f"CREST job {crest_job.jobname} is finished.") + else: + print(f"Command failed for CREST job {crest_job.jobname}, check the job log file for detailed information") + logger.info(f"Command failed for CREST job {crest_job.jobname}, check job log file for detailed information") + +def run_gsm(gsm_job, logging_queue): + ''' subprocess for running gsm in parallel ''' + # set up logger + logger = logging.getLogger("main") + # Add handler only if it doesn't already exist + if not logger.hasHandlers(): + logger.addHandler(QueueHandler(logging_queue)) + logger.setLevel(logging.INFO) + + if gsm_job.calculation_terminated_normally(): + print(f"GSM job {gsm_job.jobname} has already finished, skipping this job...") + logger.info(f"GSM job {gsm_job.jobname} has already finished, skipping this job...") + else: + print(f"running GSM job {gsm_job.jobname} on PID {mp.current_process().pid}") + logger.info(f"running GSM job 
{gsm_job.jobname} on PID {mp.current_process().pid}") + start = time.time() + result = gsm_job.execute() + end = time.time() + if result.returncode == 0: + print(f"GSM job {gsm_job.jobname} is finished, with running time {end-start:.1f}s") + logger.info(f"GSM job {gsm_job.jobname} is finished, with running time {end-start:.1f}s") + else: + print(f"Command failed for GSM job {gsm_job.jobname} with the following error message:") + logger.info(f"Command failed for GSM job {gsm_job.jobname}, check job log file for detailed information") + print(result.stderr) + +def run_pysis(pysis_job, logging_queue, timeout=3600): + ''' subprocess for running pysis in parallel ''' + # set up logger + logger = logging.getLogger("main") + # Add handler only if it doesn't already exist + if not logger.hasHandlers(): + logger.addHandler(QueueHandler(logging_queue)) + logger.setLevel(logging.INFO) + + if pysis_job.calculation_terminated_normally(): + print(f"PYSIS job {pysis_job.jobname} has already finished, skipping this job...") + logger.info(f"PYSIS job {pysis_job.jobname} has already finished, skipping this job...") + else: + print(f"running PYSIS job {pysis_job.jobname} on PID {mp.current_process().pid}") + logger.info(f"running PYSIS job {pysis_job.jobname} on PID {mp.current_process().pid}") + result = pysis_job.execute(timeout=timeout) + if result.returncode == 0: + print(f"PYSIS job {pysis_job.jobname} is finished.") + logger.info(f"PYSIS job {pysis_job.jobname} is finished.") + else: + print(f"Command failed for PYSIS job {pysis_job.jobname} with the following error message:") + logger.info(f"Command failed for PYSIS job {pysis_job.jobname}, check job log file for detailed information") + print(result.stderr) + +def run_ssm(ssm_job): + ''' subprocess for running ssm in parallel ''' + if ssm_job.calculation_terminated_normally(): + print(f"SSM job {ssm_job.jobname} has already finished, skipping this job...") + else: + print(f"running SSM job {ssm_job.jobname} on PID {mp.current_process().pid}") + start = time.time() + result = ssm_job.execute() + end = time.time() + if result.returncode == 0: + print(f"SSM job {ssm_job.jobname} is finished, with running time {end-start:.1f}s") + else: + print(f"Command failed for SSM job {ssm_job.jobname} with the following error message:") + print(result.stderr) + +def run_gauxtb(gau_job): + ''' subprocess for running gau-xtb in parallel ''' + + if gau_job.job_finished(): + print(f"Gaussian job {gau_job.jobname} has already finished, skipping this job...") + else: + print(f"running Gaussian job {gau_job.jobname} on PID {mp.current_process().pid}") + start = time.time() + result = gau_job.execute() + end = time.time() + if result.returncode == 0: + print(f"Gaussian job {gau_job.jobname} is finished, with running time {end-start:.1f}s") + else: + print(f"Command failed for Gaussian job {gau_job.jobname} with the following error message:") + print(result.stderr) diff --git a/pyTEST_Example/reaction_xyz/DA.xyz b/pyTEST_Example/reaction_xyz/DA.xyz new file mode 100644 index 0000000..86016d8 --- /dev/null +++ b/pyTEST_Example/reaction_xyz/DA.xyz @@ -0,0 +1,36 @@ +16 + +C -8.2955392952 1.4486317718 -0.0002056327 +C -7.0080392547 1.8045410323 0.0002892918 +H -9.0346533886 2.0202725561 0.5522735322 +H -8.6357160149 0.5786080992 -0.5530445386 +H -6.6678758591 2.6745285006 0.5531965412 +H -6.2689193044 1.2328613693 -0.5521313611 +C 3.1574304884 3.0921327489 -0.2296538131 +C 4.4571289305 3.2340887809 0.0545094078 +C 5.2416674162 2.2886441018 0.8105293417 +C 5.2008724629 0.9605294355 0.6497505064 +H 2.6271567921 
3.8596607896 -0.7840855628 +H 2.5902028050 2.2218643999 0.0845936137 +H 4.9733513653 4.1415941533 -0.2525919674 +H 5.9443845946 2.7170197739 1.5226435930 +H 4.5425727422 0.4871509130 -0.0716425732 +H 5.8408892830 0.3135963756 1.2409664001 +16 + +C -1.0130998647 1.2188100173 0.4131533656 +C -1.2165513306 2.5155395913 -0.1903843632 +H -1.8343383803 0.9722823872 1.1317190614 +H -1.2350027141 0.4367750300 -0.3693100239 +H -1.9259362370 3.1208321785 0.4343687424 +H -1.8682970459 2.4014179170 -1.0996707941 +C -0.0706470891 3.3142655200 -0.5325170884 +C 1.2041191104 2.9167873587 -0.0673717451 +C 1.3729637658 1.7429627418 0.6634192445 +C 0.2756375600 0.9053310300 0.9679057377 +H -0.2284205994 4.4102378878 -0.3761047893 +H 0.0086180232 3.4221533822 -1.6517286587 +H 2.0707320972 3.5401940159 -0.2821124798 +H 2.3678522185 1.4707353948 1.0123078712 +H 0.5543546657 -0.1733362366 0.8792394704 +H 0.2029295834 0.8607365858 2.0924832284 diff --git a/pyTEST_Example/read_yarp_reaction.py b/pyTEST_Example/read_yarp_reaction.py new file mode 100644 index 0000000..2bef9ee --- /dev/null +++ b/pyTEST_Example/read_yarp_reaction.py @@ -0,0 +1,8 @@ +import pickle +import os, sys + +data=pickle.load(open(sys.argv[1], 'rb')) +for i in data: + print(i.product_inchi) + print(i.reactant_dft_opt) + print(i.TS_dft) diff --git a/pyTEST_Example/scripts/ard/create_gsm_jobs.py b/pyTEST_Example/scripts/ard/create_gsm_jobs.py new file mode 100755 index 0000000..72ac571 --- /dev/null +++ b/pyTEST_Example/scripts/ard/create_gsm_jobs.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- + +import argparse +import glob +import os +import re +import shutil + +from ard_gsm.qchem import QChem, QChemError, insert_into_qcinput +from ard_gsm.mol import MolGraph +from ard_gsm.driving_coords import generate_driving_coords +from config.limits import connection_limits + + +def main(): + args = parse_args() + config_qchem_start = args.config_qchem + config_qchem_end = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'config', 'qchem.gsm.end') + config_gsm = args.config_gsm + config_gscreate = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'config', 'gsm.gscreate') + + pdir = args.out_dir + if not os.path.exists(pdir): + os.makedirs(pdir) + num_regex = re.compile(r'\d+') + + with open(os.path.join(pdir, 'params.log'), 'w') as f: + if args.check_limits: + f.write('Connection limits:\n') + for symbol in connection_limits: + ll = connection_limits[symbol][0] + ul = connection_limits[symbol][1] + f.write(f' {symbol}: {ll}, {ul}\n') + f.write(f'maxbreak = {args.maxbreak}\n') + f.write(f'maxform = {args.maxform}\n') + f.write(f'maxchange = {args.maxchange}\n') + f.write(f'single_change = {not args.ignore_single_change}\n') + f.write(f'equiv_Hs = {args.equiv_Hs}\n') + f.write(f'minbreak = {args.minbreak}\n') + f.write(f'minform = {args.minform}\n') + f.write(f'minchange = {args.minchange}\n') + + with open(config_qchem_start) as f: + config_qchem_start = f.readlines() + if args.mem is not None: + config_qchem_start = insert_into_qcinput( + config_qchem_start, f'MEM_TOTAL {args.mem:d}\n', '$rem' + ) + + for log_idx, logfile in enumerate(glob.iglob(os.path.join(args.qlog_dir, '*.log'))): + try: + log = QChem(logfile=logfile) + except QChemError as e: + print(e) + continue + + # Check frequencies + try: + freqs = log.get_frequencies() + except QChemError as e: + if 'not found' in str(e): + print(f'Warning: Frequencies could not be found in {logfile}') + else: + raise + else: + if any(freq < 0.0 for 
freq in freqs): + raise Exception(f'Negative frequency in {logfile}! Not optimized') + + symbols, coords = log.get_geometry() + mol = MolGraph(symbols=symbols, coords=coords) + mol.infer_connections() + + print(f'Making driving coordinates from {logfile}') + driving_coords_set = generate_driving_coords( + mol, + maxbreak=args.maxbreak, + maxform=args.maxform, + maxchange=args.maxchange, + single_change=not args.ignore_single_change, + equiv_Hs=args.equiv_Hs, + minbreak=args.minbreak, + minform=args.minform, + minchange=args.minchange, + check_limits=args.check_limits + ) + + try: + num = int(num_regex.search(os.path.basename(logfile)).group(0)) + except AttributeError: + # Couldn't find number in filename + num = log_idx + + out_dir = os.path.join(pdir, f'gsm{num}') + scr_dir = os.path.join(out_dir, 'scratch') + if not os.path.exists(out_dir): + os.mkdir(out_dir) + if not os.path.exists(scr_dir): + os.mkdir(scr_dir) + + # Use charge and multiplicity from reactant job + config_qchem_start_tmp = insert_into_qcinput( + config_qchem_start, f'{log.get_charge()} {log.get_multiplicity()}\n', '$molecule' + ) + with open(os.path.join(out_dir, 'qstart'), 'w') as f: + f.writelines(config_qchem_start_tmp) + + shutil.copy(config_qchem_end, os.path.join(out_dir, 'qend')) + shutil.copy(config_gsm, os.path.join(out_dir, 'inpfileq')) + + gscreate_path = os.path.join(out_dir, 'gscreate') + shutil.copy(config_gscreate, gscreate_path) + os.chmod(gscreate_path, 0o755) # Make executable + + for idx, driving_coords in enumerate(driving_coords_set): + isomers_file = os.path.join(scr_dir, f'ISOMERS{idx:04}') + initial_file = os.path.join(scr_dir, f'initial{idx:04}.xyz') + with open(isomers_file, 'w') as f: + f.write(str(driving_coords)) + with open(initial_file, 'w') as f: + f.write(str(len(symbols)) + '\n') + f.write('\n') + for symbol, xyz in zip(symbols, coords): + f.write(f'{symbol} {xyz[0]: .10f} {xyz[1]: .10f} {xyz[2]: .10f}\n') + + +def parse_args(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('qlog_dir', metavar='QDIR', help='Path to directory containing geometry optimization outputs') + parser.add_argument('out_dir', metavar='ODIR', help='Path to output directory') + parser.add_argument('--maxbreak', type=int, default=3, metavar='B', help='Maximum number of connections to break') + parser.add_argument('--maxform', type=int, default=3, metavar='F', help='Maximum number of connections to form') + parser.add_argument('--maxchange', type=int, default=5, metavar='C', help='Maximum number of connections to change') + parser.add_argument('--ignore_single_change', action='store_true', + help='Do not consider single connection changes (e.g., nbreak=1, nform=0)') + parser.add_argument('--consider_equivalent_hydrogens', action='store_true', dest='equiv_Hs', + help='Create equivalent driving coordinates for the same reaction with different but ' + 'equivalent hydrogens, i.e., hydrogens in methyl groups') + parser.add_argument('--minbreak', type=int, default=0, metavar='B', help='Minimum number of connections to break') + parser.add_argument('--minform', type=int, default=0, metavar='F', help='Minimum number of connections to form') + parser.add_argument('--minchange', type=int, default=1, metavar='F', help='Minimum number of connections to change') + parser.add_argument('--check_limits', action='store_true', help='Check valencies of expected products') + parser.add_argument('--mem', type=int, metavar='MEM', help='Q-Chem memory') + 
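# Note (editorial, hedged): the --config_qchem and --config_gsm defaults below resolve to <script_dir>/../config via os.pardir (see the os.path.join calls); pass explicit paths if your config files live elsewhere. +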
parser.add_argument( + '--config_qchem', metavar='FILE', + default=os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'config', 'qchem.gsm.start'), + help='Configuration file for Q-Chem calls in GSM' + ) + parser.add_argument( + '--config_gsm', metavar='FILE', + default=os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'config', 'gsm.inpfileq'), + help='Settings for GSM calculations' + ) + return parser.parse_args() + + +if __name__ == '__main__': + main() diff --git a/pyTEST_Example/scripts/ard/create_prod_optfreq_jobs.py b/pyTEST_Example/scripts/ard/create_prod_optfreq_jobs.py new file mode 100755 index 0000000..298897a --- /dev/null +++ b/pyTEST_Example/scripts/ard/create_prod_optfreq_jobs.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- + +import argparse +import glob +import os +import re + +from ard_gsm.qchem import QChem +from ard_gsm.util import iter_sub_dirs, read_xyz_file + + +def main(): + args = parse_args() + num_regex = re.compile(r'\d+') + maxnum = float('inf') if args.maxnum is None else args.maxnum + + for gsm_sub_dir in iter_sub_dirs(args.gsm_dir, pattern=r'gsm\d+'): + gsm_num = int(num_regex.search(os.path.basename(gsm_sub_dir)).group(0)) + if gsm_num > maxnum: + continue + + out_dir = os.path.join(args.out_dir, os.path.basename(gsm_sub_dir)) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + elif not args.overwrite: + continue + + qstart_file = os.path.join(gsm_sub_dir, 'qstart') + qtmp = QChem(logfile=qstart_file) + charge, multiplicity = qtmp.get_charge(), qtmp.get_multiplicity() + + print(f'Extracting from {gsm_sub_dir}...') + for gsm_log in glob.iglob(os.path.join(gsm_sub_dir, 'gsm*.out')): + num = int(num_regex.search(os.path.basename(gsm_log)).group(0)) + string_file = os.path.join(gsm_sub_dir, f'stringfile.xyz{num:04}') + + if not (os.path.isfile(string_file) and os.path.getsize(string_file) > 0): + continue + if args.ignore_errors and has_error(gsm_log): + continue + + if args.ignore_errors or is_successful(gsm_log): + # Optimize van-der-Waals wells instead of separated products + # Also check if product optimization during GSM failed + xyzs = read_xyz_file(string_file, with_energy=True) + last_energy = xyzs[-1][-1] + second_to_last_energy = xyzs[-2][-1] + if last_energy > second_to_last_energy: # Something went wrong in product optimization + continue + path = os.path.join(out_dir, f'prod_optfreq{num:04}.in') + q = QChem(config_file=args.config) + q.make_input_from_coords(path, *xyzs[-1][:-1], charge=charge, multiplicity=multiplicity, mem=args.mem) + + +def is_successful(gsm_log): + """ + Success is defined as having converged to a transition state. + """ + with open(gsm_log) as f: + for line in reversed(f.readlines()): + if '-XTS-' in line or '-TS-' in line: + return True + return False + + +def has_error(gsm_log): + """ + Check if last node is high in energy or if the path is dissociative. 
+ """ + with open(gsm_log) as f: + for line in reversed(f.readlines()): + if 'high energy' in line and '-exit early-' in line: + return True + if 'terminating due to dissociation' in line: + return True + return False + + +def parse_args(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('gsm_dir', metavar='GSMDIR', help='Path to directory containing GSM folders') + parser.add_argument('out_dir', metavar='ODIR', help='Path to output directory') + parser.add_argument('--mem', type=int, metavar='MEM', help='Q-Chem memory') + parser.add_argument('--overwrite', action='store_true', help='Overwrite input files in existing directories') + parser.add_argument('--maxnum', type=int, metavar='NUM', help='Only make jobs from GSM folders up to this number') + parser.add_argument('--ignore_errors', action='store_true', + help='Extract from all GSM calculations ignoring (most) errors') + parser.add_argument( + '--config', metavar='FILE', + default=os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'config', 'qchem.opt_freq'), + help='Configuration file for product optfreq jobs in Q-Chem' + ) + return parser.parse_args() + + +if __name__ == '__main__': + main() diff --git a/pyTEST_Example/scripts/ard/create_ts_optfreq_jobs.py b/pyTEST_Example/scripts/ard/create_ts_optfreq_jobs.py new file mode 100755 index 0000000..2d22b86 --- /dev/null +++ b/pyTEST_Example/scripts/ard/create_ts_optfreq_jobs.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- + +import argparse +import glob +import os +import re + +from ard_gsm.mol import MolGraph +from ard_gsm.qchem import QChem, QChemError +from ard_gsm.reaction import Reaction, group_reactions_by_products, group_reactions_by_connection_changes +from ard_gsm.util import iter_sub_dirs, read_xyz_file + + +def main(): + args = parse_args() + num_regex = re.compile(r'\d+') + maxnum = float('inf') if args.maxnum is None else args.maxnum + + for prod_sub_dir in iter_sub_dirs(args.prod_dir, pattern=r'gsm\d+'): + sub_dir_name = os.path.basename(prod_sub_dir) + gsm_num = int(num_regex.search(sub_dir_name).group(0)) + if gsm_num > maxnum: + continue + + out_dir = os.path.join(args.out_dir, sub_dir_name) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + elif not args.overwrite: + continue + + qstart_file = os.path.join(args.gsm_dir, sub_dir_name, 'qstart') + qtmp = QChem(logfile=qstart_file) + charge, multiplicity = qtmp.get_charge(), qtmp.get_multiplicity() # Doesn't change for products in same folder + + print(f'Extracting from {sub_dir_name}...') + + reactions = {} + for prod_file in glob.iglob(os.path.join(prod_sub_dir, 'prod_optfreq*.out')): + num = int(num_regex.search(os.path.basename(prod_file)).group(0)) + string_file = os.path.join(args.gsm_dir, sub_dir_name, f'stringfile.xyz{num:04}') + + try: + qp = QChem(logfile=prod_file) + freqs = qp.get_frequencies() + except QChemError as e: + print(e) + continue + if any(freq < 0.0 for freq in freqs): + print(f'Ignored {prod_file} because of negative frequency') + continue + + xyzs = read_xyz_file(string_file, with_energy=True) + ts_xyz = max(xyzs[1:-1], key=lambda x: x[2]) + product_symbols, product_coords = qp.get_geometry() + + # Reactant and TS energies are based on string and are relative to the reactant + reactant = MolGraph(symbols=xyzs[0][0], coords=xyzs[0][1], energy=xyzs[0][2]) + ts = MolGraph(symbols=ts_xyz[0], coords=ts_xyz[1], energy=ts_xyz[2]) + product = MolGraph(symbols=product_symbols, 
coords=product_coords) # Don't bother assigning energy + reactant.infer_connections() + product.infer_connections() + if not args.keep_isomorphic_reactions and reactant.is_isomorphic(product): + print(f'Ignored {prod_file} because product is isomorphic with reactant') + continue + reactions[num] = Reaction(reactant, product, ts) + + if args.group_by_connection_changes: + reaction_groups = group_reactions_by_connection_changes(reactions) + else: + reaction_groups = group_reactions_by_products(reactions) + + for group in reaction_groups: + # Only consider TS energies instead of "barriers" b/c energies are relative to reactant + reactions_in_group = list(group.items()) # Make list + reactions_in_group.sort(key=lambda r: r[1].ts.energy) + + for num, rxn in reactions_in_group[:args.nextract]: + path = os.path.join(out_dir, f'ts_optfreq{num:04}.in') + qts = QChem(mol=rxn.ts, config_file=args.config) + qts.make_input(path, charge=charge, multiplicity=multiplicity, mem=args.mem) + + +def parse_args(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('gsm_dir', metavar='GSMDIR', help='Path to directory containing GSM folders') + parser.add_argument('prod_dir', metavar='PDIR', help='Path to directory containing folders with product opts') + parser.add_argument('out_dir', metavar='ODIR', help='Path to output directory') + parser.add_argument('--mem', type=int, metavar='MEM', help='Q-Chem memory') + parser.add_argument('-n', '--nextract', type=int, default=4, metavar='N', + help='Number of duplicate reactions of the same type to extract (sorted by lowest barrier)') + parser.add_argument('--group_by_connection_changes', action='store_true', + help='Use connection changes instead of product identities to distinguish reactions') + parser.add_argument('--keep_isomorphic_reactions', action='store_true', + help='Consider reactions where the product is isomorphic with the reactant') + parser.add_argument('--overwrite', action='store_true', help='Overwrite input files in existing directories') + parser.add_argument('--maxnum', type=int, metavar='NUM', help='Only make jobs from GSM folders up to this number') + parser.add_argument( + '--config', metavar='FILE', + default=os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'config', 'qchem.ts_opt_freq'), + help='Configuration file for TS optfreq jobs in Q-Chem' + ) + return parser.parse_args() + + +if __name__ == '__main__': + main() diff --git a/pyTEST_Example/scripts/ard/extract_reactions.py b/pyTEST_Example/scripts/ard/extract_reactions.py new file mode 100755 index 0000000..219f411 --- /dev/null +++ b/pyTEST_Example/scripts/ard/extract_reactions.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- + +import argparse +import csv +import glob +import os +import re + +from ard_gsm.mol import SanitizationError +from ard_gsm.extract import qchem2molgraph, parse_reaction, remove_duplicates, rxn2xyzfile +from ard_gsm.util import iter_sub_dirs + + +def main(): + args = parse_args() + num_regex = re.compile(r'\d+') + out_file = open(args.out_file, 'w') + + if args.xyz_dir is not None: + if not os.path.exists(args.xyz_dir): + os.makedirs(args.xyz_dir) + + writer = csv.writer(out_file) + header = ['rsmi', 'psmi', 'ea', 'dh'] + if args.write_file_info: + header.extend(['rfile', 'pfile', 'tsfile']) + writer.writerow(header) + + if args.all_ts: + # This will not filter any reactions other than incorrect frequencies + parsing_settings = dict( + keep_isomorphic=True, + 
edist_max=float('inf'), + gdist_max=float('inf'), + normal_mode_check=False, + soft_check=False, + negative_barrier_check=False + ) + else: + parsing_settings = dict( + keep_isomorphic=args.keep_isomorphic_reactions, + edist_max=args.edist, + gdist_max=args.gdist, + normal_mode_check=args.check_normal_mode, + soft_check=args.soft_check, + negative_barrier_check=True + ) + + rxn_num = 0 + for ts_sub_dir in iter_sub_dirs(args.ts_dir, pattern=r'gsm\d+'): + sub_dir_name = os.path.basename(ts_sub_dir) + print(f'Extracting from {sub_dir_name}...') + reactant_num = int(num_regex.search(sub_dir_name).group(0)) + reactant_file = os.path.abspath(os.path.join(args.reac_dir, f'molopt{reactant_num}.log')) + + reactant, qr = qchem2molgraph(reactant_file, freq_only=True, print_msg=False, return_qobj=True) + if reactant is None: + raise Exception(f'Negative frequency for reactant in {reactant_file}!') + + if args.reactant_smiles_from_comment: + reactant_smiles = qr.get_comment() + if args.atommap: + try: + reactant_smiles = reactant.assign_atom_map_numbers_to_smiles(reactant_smiles) + except SanitizationError: + print(f'Error during Smiles conversion in {reactant_file}') + raise + else: + try: + reactant_smiles = reactant.perceive_smiles(atommap=args.atommap) + except SanitizationError: + print(f'Error during Smiles conversion in {reactant_file}') + raise + + reactions = {} + for ts_file in glob.iglob(os.path.join(ts_sub_dir, 'ts_optfreq*.out')): + ts_file = os.path.abspath(ts_file) + num = int(num_regex.search(os.path.basename(ts_file)).group(0)) + prod_file = os.path.abspath(os.path.join(args.prod_dir, sub_dir_name, f'prod_optfreq{num:04}.out')) + + rxn = parse_reaction( + reactant, + prod_file, + ts_file, + **parsing_settings + ) + if rxn is not None: + rxn.reactant_smiles = reactant_smiles + rxn.reactant_file = reactant_file + reactions[num] = rxn + + if not args.all_ts: + reactions = remove_duplicates( + reactions, + group_by_connection_changes=args.group_by_connection_changes, + atommap=args.atommap + ) + + for num, rxn in reactions.items(): + row = [rxn.reactant_smiles, rxn.product_smiles, rxn.barrier, rxn.enthalpy] + if args.write_file_info: + row.extend([rxn.reactant_file, rxn.product_file, rxn.ts_file]) + writer.writerow(row) + if args.xyz_dir is not None: + path = os.path.join(args.xyz_dir, f'rxn{rxn_num:06}.xyz') + rxn2xyzfile(rxn, path) + rxn_num += 1 + + if args.include_reverse: + # For reverse reactions, it's technically possible that some of + # them are the same as already extracted reactions in a different + # sub dir, but it's unlikely + rxn = rxn.reverse() + row = [rxn.reactant_smiles, rxn.product_smiles, rxn.barrier, rxn.enthalpy] + if args.write_file_info: + row.extend([rxn.reactant_file, rxn.product_file, rxn.ts_file]) + writer.writerow(row) + if args.xyz_dir is not None: + path = os.path.join(args.xyz_dir, f'rxn{rxn_num:06}.xyz') + rxn2xyzfile(rxn, path) + rxn_num += 1 + + print(f'Wrote {rxn_num} reactions to {args.out_file}.') + out_file.close() + + +def parse_args(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('reac_dir', help='Path to directory containing optimized reactant structures') + parser.add_argument('prod_dir', help='Path to directory containing optimized product structures') + parser.add_argument('ts_dir', help='Path to directory containing optimized TS structures') + parser.add_argument('out_file', help='Path to output file') + parser.add_argument('--all_ts', action='store_true', + help='Do not 
filter reactions or remove duplicates and store only TS file info') + parser.add_argument('--reactant_smiles_from_comment', action='store_true', + help='Use the SMILES in the reactant geometry optimization comment instead of inferring it') + parser.add_argument('--xyz_dir', help='If specified, write the geometries for each reaction to this directory') + parser.add_argument('--include_reverse', action='store_true', help='Also extract reverse reactions') + parser.add_argument('--write_file_info', action='store_true', help='Write file paths to output file') + parser.add_argument('--edist', type=float, default=5.0, + help='Ignore TS files with energy differences (kcal/mol) larger than this') + parser.add_argument('--gdist', type=float, default=1.0, + help='Ignore TS files with Cartesian RMSD (Angstrom) between first and last geometries' + ' larger than this') + parser.add_argument('--check_normal_mode', action='store_true', + help='Perform a normal mode analysis to identify if the TS structure is correct (make sure to' + ' check the warnings in the normal_mode_analysis function before using this option)') + parser.add_argument('--soft_check', action='store_true', + help='If checking normal modes, only perform a soft check, i.e., only check that the largest' + ' TS variation is the largest overall') + parser.add_argument('--group_by_connection_changes', action='store_true', + help='Use connection changes instead of product identities to distinguish reactions') + parser.add_argument('--keep_isomorphic_reactions', action='store_true', + help='Consider reactions where the product is isomorphic with the reactant') + parser.add_argument('--no_atommap', action='store_false', dest='atommap', + help='Do not include atom mapping in parsed SMILES') + return parser.parse_args() + + +if __name__ == '__main__': + main() diff --git a/pyTEST_Example/scripts/ard/filter_reactants.py b/pyTEST_Example/scripts/ard/filter_reactants.py new file mode 100755 index 0000000..ebca172 --- /dev/null +++ b/pyTEST_Example/scripts/ard/filter_reactants.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python + +import argparse +import glob +import os +import shutil + +from ard_gsm.mol import MolGraph, SanitizationError +from ard_gsm.qchem import QChem, QChemError + + +def main(): + args = parse_args() + os.makedirs(args.out_dir, exist_ok=True) + + for logfile in glob.iglob(os.path.join(args.qlog_dir, '*.log')): + try: + log = QChem(logfile=logfile) + except QChemError as e: + print(e) + continue + + try: + freqs = log.get_frequencies() + except QChemError as e: + print(e) + continue + else: + if any(freq < 0.0 for freq in freqs): + print(f'Imaginary frequency in {logfile}') + continue + + symbols, coords = log.get_geometry(first=True) + mol_preopt = MolGraph(symbols=symbols, coords=coords) + mol_preopt.infer_connections() + symbols, coords = log.get_geometry() + mol_postopt = MolGraph(symbols=symbols, coords=coords) + mol_postopt.infer_connections() + if not mol_postopt.has_same_connectivity(mol_preopt): + print(f'Changed connectivity in {logfile}') + continue + + if args.check_smiles: + try: + _ = mol_postopt.assign_atom_map_numbers_to_smiles(log.get_comment()) + except QChemError: + print(f'WARNING: Missing SMILES in {logfile}') + except SanitizationError: + print(f'Incorrect SMILES in {logfile}') + continue + + shutil.copy(logfile, args.out_dir) + + +def parse_args(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description='Remove all geometry optimizations that had errors, ' 'imaginary 
frequencies, or changed connectivity during optimization.') + parser.add_argument('qlog_dir', metavar='QDIR', help='Path to directory containing geometry optimization outputs') + parser.add_argument('out_dir', metavar='ODIR', help='Path to output directory') + parser.add_argument('--check_smiles', action='store_true', help='Check that the SMILES from the comment matches') + return parser.parse_args() + + +if __name__ == '__main__': + main() diff --git a/pyTEST_Example/scripts/ard/gsm_job_stats.py b/pyTEST_Example/scripts/ard/gsm_job_stats.py new file mode 100755 index 0000000..b14a308 --- /dev/null +++ b/pyTEST_Example/scripts/ard/gsm_job_stats.py @@ -0,0 +1,269 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- + +import argparse +import csv +import glob +import os +import re + +from ard_gsm.mol import MolGraph +from ard_gsm.driving_coords import DrivingCoords +from ard_gsm.util import read_xyz_file + + +def main(): + args = parse_args() + + gsm_dir = args.gsm_dir + scr_dir = os.path.join(gsm_dir, 'scratch') + num_regex = re.compile(r'\d+') + + results = [] + + for gsm_log in glob.iglob(os.path.join(gsm_dir, 'gsm*.out')): + num = int(num_regex.search(os.path.basename(gsm_log)).group(0)) + + slurm_log = os.path.join(gsm_dir, f'{num}.log') + string_file = os.path.join(gsm_dir, f'stringfile.xyz{num:04}') + isomers_file = os.path.join(scr_dir, f'ISOMERS{num:04}') + ts_file = os.path.join(scr_dir, f'tsq{num:04}.xyz') + + try: + (niter, + ngrad, + overlap, + ts_type, + scf_error, + too_many_nodes, + high_energy, + geometry_error, + dissociative, + bad_spacings, + growth_limit) = get_gsm_stats(gsm_log) + except: + print(f'Error in {gsm_log}:') + raise + time_limit, bus_error = check_for_slurm_error(slurm_log) + + error = None + if time_limit: + error = 'time' + elif bus_error: + error = 'bus' + elif too_many_nodes: + error = 'nodes' + elif high_energy: + error = 'highE' + elif geometry_error: + error = 'geometry' + elif dissociative: + error = 'dissociative' + elif scf_error: + error = 'scf' + elif bad_spacings: + error = 'spacing' + elif growth_limit: + error = 'growth_limit' + # ##### Temporary ##### + elif ts_type != '-FL-' and not os.path.exists(ts_file): + raise Exception(f'Other error in {gsm_log}!') + ##### + + if ts_type in {'-XTS-', '-TS-'} and os.path.exists(string_file): + xyzs = read_xyz_file(string_file, with_energy=True) + energies = [xyz[2] for xyz in xyzs] + barrier = max(energies[1:-1]) - energies[0] + rxn_energy = energies[-1] - energies[0] + # If DFT fails during final geometry optimization then the final node might have an invalid energy + if rxn_energy >= barrier: + rxn_energy = energies[-2] - energies[0] + if rxn_energy >= barrier: + print(f'Warning: Ignored {gsm_log} because of invalid reaction energy') + continue + intended = check_bond_changes(isomers_file, xyzs) + stats = Stats(num=num, + niter=niter, + ngrad=ngrad, + error=error, + ts_type=ts_type, + barrier=barrier, + rxn_energy=rxn_energy, + overlap=overlap, + intended=intended) + results.append(stats) + else: + stats = Stats(num=num, + niter=niter, + ngrad=ngrad, + error=error, + ts_type=ts_type) + results.append(stats) + + with open(args.out_file, 'w') as csvfile: + stats_writer = csv.writer(csvfile, quoting=csv.QUOTE_NONNUMERIC) + header = ['num', 'niter', 'ngrad', 'error', 'ts_type', 'barrier', 'rxn_energy', 'overlap', 'intended'] + stats_writer.writerow(header) + + results.sort(key=lambda s: s.num) + for stats in results: + stats_writer.writerow([getattr(stats, name, None) for name in header]) + + # Print summary 
statistics (ignoring bus errors) + ngrads = [stats.ngrad for stats in results if stats.error != 'bus'] + ngrads_success = [stats.ngrad for stats in results if stats.ts_type in {'-XTS-', '-TS-'}] + avg_grad = sum(ngrads) / len(ngrads) + max_grad = max(ngrads) + avg_grad_success = sum(ngrads_success) / len(ngrads_success) + max_grad_success = max(ngrads_success) + frac_success = len(ngrads_success) / len(ngrads) + frac_intended = (sum(1 for stats in results if stats.ts_type in {'-XTS-', '-TS-'} and stats.intended) + / len(ngrads_success)) + print(f'Average number of gradients: {avg_grad:.0f}') + print(f'Maximum number of gradients: {max_grad}') + print(f'Average number of gradients in successful jobs: {avg_grad_success:.0f}') + print(f'Maximum number of gradients in successful jobs: {max_grad_success}') + print(f'Fraction of jobs that succeeded: {frac_success:.4f}') + print(f'Fraction of successful jobs that were intended: {frac_intended:.4f}') + print('Lowest barriers:') + results_with_barrier = [stats for stats in results if hasattr(stats, 'barrier')] + results_with_barrier.sort(key=lambda s: s.barrier) + for i in range(min(10, len(results_with_barrier))): + print(f'{results_with_barrier[i].num}: {results_with_barrier[i].barrier:.2f}') + + +class Stats(object): + def __init__(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + + +def get_gsm_stats(gsm_log): + """ + Returns a tuple of the number of iterations, number of gradients, overlap + between TS and Hessian vector (None if not found), type of TS/reaction path + (-XTS-, -TS-, -multistep-, -FL-, '-diss growth', None if not found), + whether an SCF error occurred, whether there are too many nodes, whether + the maximum reaction energy was exceeded, whether a geometry error + occurred, and whether the profile is dissociative. + """ + niter = 0 + ngrad = 0 + overlap = None + ts_type = None + scf_error = False + too_many_nodes = False + high_energy = False + geometry_error = False + dissociative = False + bad_spacings = False + growth_limit = False + + with open(gsm_log) as f: + for line in f: + if 'totalgrad' in line: + if 'tgrads:' in line: + line_split = line.split() + idx = line_split.index('tgrads:') + ngrad = int(line_split[idx + 1]) + if 'opt_iter:' in line: + niter += 1 + elif 'opt_iters over:' in line: + line_split = line.split() + for ol_num in range(100): + try: + ol_idx = line_split.index(f'ol({ol_num}):') + except ValueError as e: + ol_error = e + continue + else: + break + else: + raise ol_error + overlap = float(line_split[ol_idx+1]) + ts_type = line_split[-1] + if ts_type == 'growth-': + ts_type = '-diss growth-' + # Handle cases where nothing was printed at the end of the line + if len(ts_type) == 1: + ts_type = None + elif 'SCF failed' in line: + scf_error = True + elif 'cannot add node' in line: + too_many_nodes = True + elif 'high energy' in line and '-exit early-' in line: + high_energy = True + elif 'ERROR: Geometry contains NaN' in line: + geometry_error = True + elif 'terminating due to dissociation' in line: + dissociative = True + elif 'ERROR: bad spacings' in line: + bad_spacings = True + elif 'at limit of growth' in line: + growth_limit = True + + return (niter, ngrad, overlap, ts_type, + scf_error, too_many_nodes, high_energy, geometry_error, dissociative, bad_spacings, growth_limit) + + +def check_for_slurm_error(slurm_log): + """ + Returns a tuple of flags indicating whether SLURM reached the time limit or + encountered a bus error. 
+ """ + time_limit = bus_error = False + + with open(slurm_log) as f: + for line in reversed(f.readlines()): + if 'TIME LIMIT' in line: + time_limit = True + elif 'Bus error' in line: + bus_error = True + + return time_limit, bus_error + + +def check_bond_changes(isomers_file, string): + """ + Check if desired bond changes were obtained. + """ + with open(isomers_file) as f: + intended_changes = DrivingCoords() + intended_changes.reconstruct_from_str(f.read()) + + r_symbols, r_coords = string[0][:2] + p_symbols, p_coords = string[-1][:2] + reactant = MolGraph(symbols=r_symbols, coords=r_coords) + product = MolGraph(symbols=p_symbols, coords=p_coords) + reactant.infer_connections() + product.infer_connections() + + # Extract connection changes + r_connections = reactant.get_all_connections() + p_connections = product.get_all_connections() + break_idxs, form_idxs = [], [] + for connection in r_connections: + if connection not in p_connections: + idxs = (connection.atom1.idx, connection.atom2.idx) + break_idxs.append(idxs) + for connection in p_connections: + if connection not in r_connections: + idxs = (connection.atom1.idx, connection.atom2.idx) + form_idxs.append(idxs) + actual_changes = DrivingCoords(break_idxs=break_idxs, form_idxs=form_idxs) + + if intended_changes.is_subset(actual_changes): + return True + else: + return False + + +def parse_args(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('gsm_dir', metavar='DIR', help='Path to GSM directory') + parser.add_argument('out_file', metavar='FILE', help='Path to output file') + return parser.parse_args() + + +if __name__ == '__main__': + main() diff --git a/pyTEST_Example/scripts/ard/make_opt_jobs.py b/pyTEST_Example/scripts/ard/make_opt_jobs.py new file mode 100755 index 0000000..37a412e --- /dev/null +++ b/pyTEST_Example/scripts/ard/make_opt_jobs.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- + +import argparse +import os +import random + +from ard_gsm.qchem import QChem +from ard_gsm.util import pickle_load + + +def main(): + args = parse_args() + + print('Loading data...') + data = pickle_load(args.mol_data, compressed=True) + + if args.ignore is not None: + ignore = set() + with open(args.ignore) as f: + for line in f: + for s in line.strip().split(): + if 'dsgdb9nsd' in s: + ignore.add(s) + break + data = [mol_data for mol_data in data if mol_data.file_name not in ignore] + + if args.max_heavy > 0: + data = [mol_data for mol_data in data if sum(1 for s in mol_data.elements if s != 'H') <= args.max_heavy] + if args.min_heavy > 0: + data = [mol_data for mol_data in data if sum(1 for s in mol_data.elements if s != 'H') >= args.min_heavy] + if args.random: + random.shuffle(data) + if args.num > 0: + data = data[:args.num] + + print('Generating geometries and searching conformers...') + mols, names = [], [] + for mol_data in data: + try: + mol = mol_data.to_rdkit(nconf=args.nconf) + except: # Ignore RDKit sanitization errors + continue + else: + mols.append(mol) + names.append(mol_data.file_name) + + if not os.path.exists(args.out_dir): + os.makedirs(args.out_dir) + + print('Making input files...') + for i, mol in enumerate(mols): + q = QChem(mol, config_file=args.config) + q.make_input(os.path.join(args.out_dir, f'molopt{i}.in'), mem=args.mem) + + with open(os.path.join(args.out_dir, 'names.txt'), 'w') as f: + for i, name in enumerate(names): + f.write(f'{i}: {name}\n') + + +def parse_args(): + parser = 
argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('mol_data', metavar='FILE', help='Path to pickled and zipped list of MolData objects') + parser.add_argument('out_dir', metavar='DIR', help='Path to output directory') + parser.add_argument('--max_heavy', type=int, default=-1, metavar='MAXH', help='Maximum number of heavy atoms') + parser.add_argument('--min_heavy', type=int, default=-1, metavar='MINH', help='Minimum number of heavy atoms') + parser.add_argument('--num', type=int, default=-1, metavar='N', help='Number of molecules to choose from mol_data') + parser.add_argument('--not_random', action='store_false', dest='random', + help='Select molecules in order instead of randomly') + parser.add_argument('--nconf', type=int, default=100, metavar='C', + help='Number of conformers to generate for lowest-energy conformer search') + parser.add_argument('--ignore', metavar='FILE', help='File containing list of QM9 file names to ignore') + parser.add_argument('--mem', type=int, metavar='MEM', help='Q-Chem memory') + parser.add_argument( + '--config', metavar='FILE', + default=os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'config', 'qchem.opt_freq'), + help='Configuration file for Q-Chem input files.' + ) + return parser.parse_args() + + +if __name__ == '__main__': + main() diff --git a/pyTEST_Example/scripts/ard/make_opt_jobs_rad.py b/pyTEST_Example/scripts/ard/make_opt_jobs_rad.py new file mode 100755 index 0000000..1fa8651 --- /dev/null +++ b/pyTEST_Example/scripts/ard/make_opt_jobs_rad.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- + +import argparse +import os + +import pandas as pd +from rdkit import Chem + +from ard_gsm.qchem import QChem + + +def main(): + args = parse_args() + seed = 7 + + print('Loading data...') + data = pd.read_csv(args.csv) + + if args.rad_only: + data = data[data['type'] == 'fragment'] + + if args.max_heavy > 0: + data = data[data['heavy_atoms'] <= args.max_heavy] + if args.min_heavy > 0: + data = data[data['heavy_atoms'] >= args.min_heavy] + + if args.num > 0: + if args.random: + data = data.sample(n=args.num, random_state=seed).reset_index(drop=True) + else: + data = data[:args.num] + elif args.random: + data = data.sample(frac=1, random_state=seed).reset_index(drop=True) + + if not os.path.exists(args.out_dir): + os.makedirs(args.out_dir) + + print(f'Writing {len(data)} input files...') + for i, (smi, mol_block, mol_type) in enumerate(zip(data['smiles'], data['mol'], data['type'])): + mol = Chem.MolFromMolBlock(mol_block, removeHs=False) + multiplicity = 1 if mol_type == 'molecule' else 2 + + q = QChem(mol, config_file=args.config) + q.make_input(os.path.join(args.out_dir, f'molopt{i}.in'), multiplicity=multiplicity, mem=args.mem, comment=smi) + + +def parse_args(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('csv', metavar='FILE', + help='CSV file containing columns with SMILES, mol block, and type (molecule or fragment)') + parser.add_argument('out_dir', metavar='DIR', help='Path to output directory') + parser.add_argument('--max_heavy', type=int, default=-1, metavar='MAXH', help='Maximum number of heavy atoms') + parser.add_argument('--min_heavy', type=int, default=-1, metavar='MINH', help='Minimum number of heavy atoms') + parser.add_argument('--rad_only', action='store_true', help='Only select radicals') + parser.add_argument('--num', type=int, default=-1, metavar='N', help='Number of molecules to 
choose from mol_data') + parser.add_argument('--not_random', action='store_false', dest='random', + help='Select molecules in order instead of randomly') + parser.add_argument('--mem', type=int, metavar='MEM', help='Q-Chem memory') + parser.add_argument( + '--config', metavar='FILE', + default=os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'config', 'qchem.opt_freq'), + help='Configuration file for Q-Chem input files.' + ) + return parser.parse_args() + + +if __name__ == '__main__': + main() diff --git a/pyTEST_Example/scripts/ard/parse_qm9.py b/pyTEST_Example/scripts/ard/parse_qm9.py new file mode 100755 index 0000000..926e51b --- /dev/null +++ b/pyTEST_Example/scripts/ard/parse_qm9.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- + +import argparse +import glob +import os + +from ard_gsm.mol import MolData +from ard_gsm.util import pickle_dump + + +def main(): + args = parse_args() + + ignore = set() + if args.ignore_file is not None: + with open(args.ignore_file) as f: + for line in f: + try: + idx = int(line.split()[0]) + except (IndexError, ValueError): + continue + else: + ignore.add(idx) + + print('Parsing files...') + files = glob.iglob(os.path.join(args.data_dir, '*.xyz')) + data = [] + + for path in files: + d = MolData(path=path) + if d.index in ignore: + continue + elif not args.fluorine and d.contains_element('F'): + continue + else: + data.append(d) + + out_dir = os.path.dirname(os.path.abspath(args.out_file)) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + + pickle_dump(args.out_file, data, compress=True) + print(f'Dumped {len(data)} molecules to {args.out_file}') + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('data_dir', metavar='DIR', help='Path to 134k data directory') + parser.add_argument('out_file', metavar='FILE', help='Path to output file') + parser.add_argument('--ignore', metavar='FILE', dest='ignore_file', + help='Path to file containing list of indices to ignore. 
Indices should be in the first column') + parser.add_argument('--no_fluorine', action='store_false', dest='fluorine', + help='Ignore molecules containing fluorine') + return parser.parse_args() + + +if __name__ == '__main__': + main() diff --git a/pyTEST_Example/scripts/ard/refine_products_and_ts.py b/pyTEST_Example/scripts/ard/refine_products_and_ts.py new file mode 100755 index 0000000..1001937 --- /dev/null +++ b/pyTEST_Example/scripts/ard/refine_products_and_ts.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- + +import argparse +import glob +import os +import re + +from ard_gsm.extract import qchem2molgraph, parse_reaction, remove_duplicates +from ard_gsm.qchem import QChem +from ard_gsm.util import iter_sub_dirs + + +def main(): + args = parse_args() + num_regex = re.compile(r'\d+') + maxnum = float('inf') if args.maxnum is None else args.maxnum + + for ts_sub_dir in iter_sub_dirs(args.ts_dir, pattern=r'gsm\d+'): + sub_dir_name = os.path.basename(ts_sub_dir) + gsm_num = int(num_regex.search(sub_dir_name).group(0)) + if gsm_num > maxnum: + continue + + ts_sub_out_dir = os.path.join(args.ts_out_dir, sub_dir_name) + if os.path.exists(ts_sub_out_dir) and not args.overwrite: + continue + + print(f'Extracting from {sub_dir_name}...') + + reactant_num = int(num_regex.search(sub_dir_name).group(0)) + reactant_file = os.path.join(args.reac_dir, f'molopt{reactant_num}.log') + reactant, qr = qchem2molgraph(reactant_file, return_qobj=True, freq_only=True, print_msg=False) + if reactant is None: + raise Exception(f'Negative frequency for reactant in {reactant_file}!') + charge, multiplicity = qr.get_charge(), qr.get_multiplicity() + + reactions = {} + for ts_file in glob.iglob(os.path.join(ts_sub_dir, 'ts_optfreq*.out')): + num = int(num_regex.search(os.path.basename(ts_file)).group(0)) + prod_file = os.path.join(args.prod_dir, sub_dir_name, f'prod_optfreq{num:04}.out') + + rxn = parse_reaction( + reactant, + prod_file, + ts_file, + keep_isomorphic=args.keep_isomorphic_reactions, + edist_max=args.edist, + gdist_max=args.gdist, + normal_mode_check=args.check_normal_mode, + soft_check=args.soft_check + ) + if rxn is not None: + rxn.reactant_file = reactant_file + reactions[num] = rxn + + reactions = remove_duplicates( + reactions, + ndup=args.ndup, + group_by_connection_changes=args.group_by_connection_changes, + set_smiles=False + ) + + prod_sub_out_dir = os.path.join(args.prod_out_dir, sub_dir_name) + os.makedirs(ts_sub_out_dir, exist_ok=True) + os.makedirs(prod_sub_out_dir, exist_ok=True) + + for num, rxn in reactions.items(): + ts_file = os.path.join(ts_sub_out_dir, f'ts_optfreq{num:04}.in') + prod_file = os.path.join(prod_sub_out_dir, f'prod_optfreq{num:04}.in') + qts = QChem(mol=rxn.ts, config_file=args.config_ts) + qp = QChem(mol=rxn.product, config_file=args.config_prod) + qts.make_input(ts_file, charge=charge, multiplicity=multiplicity, mem=args.mem) + qp.make_input(prod_file, charge=charge, multiplicity=multiplicity, mem=args.mem) + + +def parse_args(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('reac_dir', help='Directory containing optimized reactant structures (same level as products)') + parser.add_argument('prod_dir', help='Directory containing optimized product structures') + parser.add_argument('ts_dir', help='Directory containing optimized TS structures') + parser.add_argument('prod_out_dir', help='Output directory for product jobs') + parser.add_argument('ts_out_dir', help='Output directory 
for TS jobs') + parser.add_argument('--maxnum', type=int, metavar='NUM', help='Only make jobs from GSM folders up to this number') + parser.add_argument('--overwrite', action='store_true', help='Overwrite input files in existing directories') + parser.add_argument('--mem', type=int, metavar='MEM', help='Q-Chem memory') + parser.add_argument('--ndup', type=int, default=1, + help='Number of duplicate reactions of the same type to extract (sorted by lowest barrier)') + parser.add_argument('--edist', type=float, default=5.0, + help='Ignore TS files with energy differences (kcal/mol) larger than this') + parser.add_argument('--gdist', type=float, default=1.0, + help='Ignore TS files with Cartesian RMSD (Angstrom) between first and last geometries' + ' larger than this') + parser.add_argument('--check_normal_mode', action='store_true', + help='Perform a normal mode analysis to identify if the TS structure is correct (make sure to' + ' check the warnings in the normal_mode_analysis function before using this option)') + parser.add_argument('--soft_check', action='store_true', + help='If checking normal modes, only perform a soft check, i.e., only check that the largest' + ' TS variation is the largest overall') + parser.add_argument('--group_by_connection_changes', action='store_true', + help='Use connection changes instead of product identities to distinguish reactions') + parser.add_argument('--keep_isomorphic_reactions', action='store_true', + help='Consider reactions where the product is isomorphic with the reactant') + parser.add_argument( + '--config_prod', + default=os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'config', 'qchem.opt_freq_high'), + help='Configuration file for Q-Chem product optfreq jobs' + ) + parser.add_argument( + '--config_ts', + default=os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'config', 'qchem.ts_opt_freq_high'), + help='Configuration file for Q-Chem TS optfreq jobs' + ) + return parser.parse_args() + + +if __name__ == '__main__': + main() diff --git a/pyTEST_Example/scripts/ard/refine_reactants.py b/pyTEST_Example/scripts/ard/refine_reactants.py new file mode 100755 index 0000000..ffbeeb4 --- /dev/null +++ b/pyTEST_Example/scripts/ard/refine_reactants.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- + +import argparse +import glob +import os + +from ard_gsm.qchem import QChem, QChemError + + +def main(): + args = parse_args() + if not os.path.exists(args.out_dir): + os.makedirs(args.out_dir) + + for logfile in glob.iglob(os.path.join(args.qlog_dir, '*.log')): + try: + log = QChem(logfile=logfile) + except QChemError as e: + print(e) + continue + + # Check frequencies + try: + freqs = log.get_frequencies() + except QChemError as e: + if 'not found' in str(e): + print(f'Warning: Frequencies could not be found in {logfile}') + else: + raise + else: + if any(freq < 0.0 for freq in freqs): + raise Exception(f'Negative frequency in {logfile}! 
Not optimized') + + symbols, coords = log.get_geometry() + charge = log.get_charge() + mult = log.get_multiplicity() + fname = os.path.splitext(os.path.basename(logfile))[0] + '.in' + path = os.path.join(args.out_dir, fname) + + q = QChem(config_file=args.config) + q.make_input_from_coords(path, symbols, coords, charge=charge, multiplicity=mult, mem=args.mem) + + +def parse_args(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('qlog_dir', help='Directory containing geometry optimization outputs') + parser.add_argument('out_dir', help='Output directory') + parser.add_argument('--mem', type=int, metavar='MEM', help='Q-Chem memory') + parser.add_argument( + '--config', + default=os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'config', 'qchem.opt_freq_high'), + help='Configuration file for Q-Chem input files' + ) + return parser.parse_args() + + +if __name__ == '__main__': + main() diff --git a/pyTEST_Example/scripts/auto3D.py b/pyTEST_Example/scripts/auto3D.py new file mode 100755 index 0000000..fca900f --- /dev/null +++ b/pyTEST_Example/scripts/auto3D.py @@ -0,0 +1,158 @@ +import argparse +import sys +import yaml +import logging +import Auto3D +from Auto3D.auto3D import options, main + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # using yaml input + parameters_yaml = sys.argv[1] + parameters = yaml.load(open(parameters_yaml, "r"), Loader=yaml.FullLoader) + # change 'None' to None + for key, val in parameters.items(): + if val == "None": + parameters[key] = None + + path = parameters["path"] + k = parameters["k"] + window = parameters["window"] + memory = parameters["memory"] + capacity = parameters["capacity"] + enumerate_tautomer = parameters["enumerate_tautomer"] + tauto_engine = parameters["tauto_engine"] + pKaNorm = parameters["pKaNorm"] + isomer_engine = parameters["isomer_engine"] + max_confs = parameters["max_confs"] + enumerate_isomer = parameters["enumerate_isomer"] + mode_oe = parameters["mode_oe"] + mpi_np = parameters["mpi_np"] + optimizing_engine = parameters["optimizing_engine"] + use_gpu = parameters["use_gpu"] + gpu_idx = parameters["gpu_idx"] + opt_steps = parameters["opt_steps"] + convergence_threshold = parameters["convergence_threshold"] + patience = parameters["patience"] + threshold = parameters["threshold"] + verbose = parameters["verbose"] + job_name = parameters["job_name"] + + else: + # using argparse + parser = argparse.ArgumentParser( + prog="Auto3D", + description="Automatic generation of the low-energy 3D structures from ANI neural network potentials" + ) + + parser.add_argument('path', type=str, + help='a path of .smi file to store all SMILES and IDs') + parser.add_argument('--k', type=int, default=False, + help='Outputs the top-k structures for each SMILES.') + parser.add_argument('--window', type=float, default=False, + help=('Outputs the structures whose energies are within ' + 'window (kcal/mol) from the lowest energy')) + parser.add_argument('--memory', type=int, default=None, + help='The RAM size assigned to Auto3D (unit GB)') + parser.add_argument('--capacity', type=int, default=40, + help='This is the number of SMILES that each 1 GB of memory can handle') + parser.add_argument('--enumerate_tautomer', default=False, type=lambda x: (str(x).lower() == 'true'), + help="When True, enumerate tautomers for the input") + parser.add_argument('--tauto_engine', type=str, default='rdkit', + help="Programs to enumerate tautomers, either 'rdkit' or 'oechem'") + 
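# Note (editorial, hedged): boolean options in this parser use type=lambda x: (str(x).lower() == 'true') so they can be toggled explicitly from the CLI (e.g. --use_gpu False), which plain store_true flags would not allow. +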
parser.add_argument('--pKaNorm', default=True, type=lambda x: (str(x).lower() == 'true'), + help="When True, the ionization state of each tautomer will be assigned to a predominant state at ~7.4 (Only works when tauto_engine='oechem')") + parser.add_argument('--isomer_engine', type=str, default='rdkit', + help=('The program for generating 3D isomers for each ' + 'SMILES. This parameter is either ' + 'rdkit or omega')) + parser.add_argument('--max_confs', type=int, default=None, + help=("Maximum number of isomers for each configuration of the SMILES.", + "Default is None, and Auto3D will uses a dynamic conformer number for each SMILES.")) + parser.add_argument('--enumerate_isomer', default=True, type=lambda x: (str(x).lower() == 'true'), + help='When True, cis/trans and r/s isomers are enumerated.') + parser.add_argument('--mode_oe', type=str, default='classic', + help=("The mode that omega program will take. It can be either 'classic', 'macrocycle', 'dense', 'pose', 'rocs' or 'fast_rocs'. By default, the 'classic' mode is used.")) + parser.add_argument('--mpi_np', type=int, default=4, + help="Number of CPU cores for the isomer generation step.") + parser.add_argument('--optimizing_engine', type=str, default='AIMNET', + help=("Choose either 'ANI2x', 'ANI2xt', or 'AIMNET' for energy " + "calculation and geometry optimization.")) + parser.add_argument('--use_gpu', default=True, type=lambda x: (str(x).lower() == 'true'), + help="If True, the program will use GPU.") + parser.add_argument('--gpu_idx', default=0, type=int, + help="GPU index. It only works when --use_gpu=True") + parser.add_argument('--opt_steps', type=int, default=5000, + help="Maximum optimization steps for each structure.") + parser.add_argument('--convergence_threshold', type=float, default=0.003, + help="Optimization is considered as converged if maximum force is below this threshold.") + parser.add_argument('--patience', type=int, default=1000, + help="If the force does not decrease for a continuous patience steps, the conformer will be dropped out of the optimization loop.") + parser.add_argument('--threshold', type=float, default=0.3, + help=("If the RMSD between two conformers are within threhold, " + "they are considered as duplicates. One of them will be removed.")) + parser.add_argument('--verbose', default=False, type=lambda x: (str(x).lower() == 'true'), + help='When True, save all meta data while running.') + parser.add_argument('--job_name', default="", type=str, + help='A folder that stores all the results. 
By default, the name is the current date and time.') + + args = parser.parse_args() + + path = args.path + k = args.k + window = args.window + memory = args.memory + capacity = args.capacity + enumerate_tautomer = args.enumerate_tautomer + tauto_engine = args.tauto_engine + pKaNorm = args.pKaNorm + isomer_engine = args.isomer_engine + max_confs = args.max_confs + enumerate_isomer = args.enumerate_isomer + mode_oe = args.mode_oe + mpi_np = args.mpi_np + optimizing_engine = args.optimizing_engine + use_gpu = args.use_gpu + gpu_idx = args.gpu_idx + opt_steps = args.opt_steps + convergence_threshold = args.convergence_threshold + patience = args.patience + threshold = args.threshold + verbose = args.verbose + job_name = args.job_name + + arguments = options( + path, + k=k, + window=window, + verbose=verbose, + job_name=job_name, + enumerate_tautomer=enumerate_tautomer, + tauto_engine=tauto_engine, + pKaNorm=pKaNorm, + isomer_engine=isomer_engine, + enumerate_isomer=enumerate_isomer, + mode_oe=mode_oe, + mpi_np=mpi_np, + max_confs=max_confs, + use_gpu=use_gpu, + gpu_idx=gpu_idx, + capacity=capacity, + optimizing_engine=optimizing_engine, + opt_steps=opt_steps, + convergence_threshold=convergence_threshold, + patience=patience, + threshold=threshold, + memory=memory + ) + + print(f""" + _ _ _____ ____ + / \ _ _ | |_ ___ |___ / | _ \ + / _ \ | | | | | __| / _ \ |_ \ | | | | + / ___ \ | |_| | | |_ | (_) | ___) | | |_| | + /_/ \_\ \__,_| \__| \___/ |____/ |____/ {str(Auto3D.__version__)} + // Automatic generation of the low-energy 3D structures + """) + out = main(arguments) diff --git a/pyTEST_Example/scripts/gscreate b/pyTEST_Example/scripts/gscreate new file mode 100755 index 0000000..119b132 --- /dev/null +++ b/pyTEST_Example/scripts/gscreate @@ -0,0 +1,14 @@ +#!/bin/bash + +file=molecule + +#nl $file > tmp.mole +#nl link > tmp.link +#join tmp.mole tmp.link > tmp.geom2 +#awk '{ print $2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10 }' tmp.geom2 > tmp.geom + +#cat qstart tmp.geom qend > qcin +#cat qstart molecule$1 qend > scratch/qcin$1 +cat qstart $QCSCRATCH/molecule$1 qend > $QCSCRATCH/qcin$1 + +#rm tmp.geom tmp.geom2 tmp.link tmp.mole diff --git a/pyTEST_Example/scripts/ograd b/pyTEST_Example/scripts/ograd new file mode 100755 index 0000000..58c87d4 --- /dev/null +++ b/pyTEST_Example/scripts/ograd @@ -0,0 +1,32 @@ +#!/bin/bash +# The path to ORCA should be added to .bashrc or exported in command line + +if [ -z $2 ] +then + echo " need two arguments! " + exit +fi + +#echo " in ograd: $1 $2 " + +ofile=$SLURM_LOCAL_SCRATCH/orcain$1.in +ofileout=scratch/orcain$1.out +molfile=scratch/structure$1 +ncpu=$2 + +#echo " ofile: $ofile ofileout: $ofileout molfile: $molfile ncpu: $ncpu" + +########## DFT settings: ################# +#echo '! DFT B3LYP ENGRAD 6-31G*' > $ofile +echo '! B97-3c ENGRAD' > $ofile +echo '! nomoprint' >> $ofile +echo '%scf' >> $ofile +echo ' maxiter 200' >> $ofile +echo 'end' >> $ofile +# charge and spin multiplicity +echo '* xyz 0 1' >> $ofile +cat $molfile >> $ofile +echo '*' >> $ofile + +#echo "running: orca $ofile > $ofileout" +orca $ofile > $ofileout diff --git a/pyTEST_Example/scripts/ograd_xtb b/pyTEST_Example/scripts/ograd_xtb new file mode 100755 index 0000000..b66126a --- /dev/null +++ b/pyTEST_Example/scripts/ograd_xtb @@ -0,0 +1,22 @@ +#!/bin/bash + +if [ -z $2 ] +then + echo " need two arguments! 
" + exit +fi + +ofile=orcain$1.in +ofileout=orcain$1.out +molfile=structure$1 +ncpu=$2 + +basename="${ofile%.*}" + +########## XTB settings: ################# +cd scratch +wc -l < $molfile > $ofile.xyz +echo "Dummy for XTB/TM calculation" >> $ofile.xyz +cat $molfile >> $ofile.xyz + + diff --git a/pyTEST_Example/scripts/qend b/pyTEST_Example/scripts/qend new file mode 100755 index 0000000..b39a0e1 --- /dev/null +++ b/pyTEST_Example/scripts/qend @@ -0,0 +1 @@ +$end diff --git a/pyTEST_Example/scripts/qstart b/pyTEST_Example/scripts/qstart new file mode 100755 index 0000000..c3401b0 --- /dev/null +++ b/pyTEST_Example/scripts/qstart @@ -0,0 +1,16 @@ +$rem +JOBTYPE FORCE +METHOD B3LYP +SCF_MAX_CYCLES 150 +BASIS 6-31G* +WAVEFUNCTION_ANALYSIS FALSE +GEOM_OPT_MAX_CYCLES 300 +xc_grid 1 +scf_convergence 6 +SYM_IGNORE TRUE +SYMMETRY FALSE +molden_format true +$end + +$molecule +0 1 diff --git a/pyTEST_Example/scripts/tm2orca.py b/pyTEST_Example/scripts/tm2orca.py new file mode 100755 index 0000000..3c7ef69 --- /dev/null +++ b/pyTEST_Example/scripts/tm2orca.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +Created on 26.06.2017 +Fixed on 01.06.2020 by Sebastian Ehlert + +@author: dohm +''' + +from __future__ import print_function + +import sys, os +element_symbols_lc=["bq","h","he","li","be","b","c","n","o","f","ne", + "na","mg","al","si","p","s","cl","ar","k","ca","sc", + "ti","v","cr","mn","fe","co","ni","cu","zn","ga","ge", + "as","se","br","kr","rb","sr","y","zr","nb","mo","tc", + "ru","rh","pd","ag","cd","in","sn","sb","te","i","xe", + "cs","ba","la","ce","pr","nd","pm","sm","eu","gd","tb", + "dy","ho","er","tm","yb","lu","hf","ta","w","re","os", + "ir","pt","au","hg","tl","pb","bi","po","at","rn","fr", + "ra","ac","th","pa","u","np","pu","am","cm","bk","cf", + "es","fm","md","no","lr","rf","db","sg","bh","hs","mt", + "ds","rg","cn","ut","fl","up","lv","us","uo"] + +#element_symbols_lc = map(lambda x:x.lower(),element_symbols) + +argcount = len(sys.argv) +if (argcount != 2): + print(u"need Basename, exiting...") + quit() +#assign cli to strings +basename = str(sys.argv[1]) + +engrad = open(basename + ".engrad", 'w') +gradient = open('gradient', 'r') +energy = open('energy', 'r') +outfile = open(basename + ".out", 'w') + +enline = energy.readlines()[-2] +energy_value = float(enline.strip().split()[1]) +gradlines = gradient.readlines() +natoms = int((len(gradlines) - 3) / 2) +coordlines = gradlines[2:2+natoms] +gradvallines= gradlines[2+natoms:2+2*natoms] + +#write engrad file: +engrad.write(u"#\n# Number of atoms\n#\n") +engrad.write(u"{:3d}".format(natoms) +u"\n") +engrad.write(u"#\n# The current total energy in Eh\n#"+u"\n") +engrad.write(u'{:20.12f}'.format(energy_value)+u"\n") +engrad.write(u"#\n# The current gradient in Eh/bohr\n#"+u"\n") +for gl in gradvallines: + glval = gl.strip().split() + for gv in glval: + gv = gv.replace("D","E") + engrad.write(u'{:21.12f}'.format(float(gv))+u"\n") +engrad.write(u"#\n# The atomic numbers and current coordinates in Bohr\n#"+u"\n") +for cl in coordlines: + cl = cl.replace("D","E") + + cls = cl.strip().split() + coords = [float(cls[0]),float(cls[1]),float(cls[2])] + atn = element_symbols_lc.index(cls[3].lower()) + engrad.write(u'{:4d} {:13.7f}{:13.7f}{:13.7f}'.format(atn, coords[0],coords[1],coords[2])+u"\n") +engrad.close() + +#write .out file: +outfile.write(u"ORCA-Dummy output for GSM TS Optimizer. 
Not a real ORCA-Output"+u"\n")
+outfile.write(u"Total Energy : {:20.8f}".format(energy_value)+u"\n")
+outfile.write(u"------------------\nCARTESIAN GRADIENT\n------------------\n\n")
+
+for i in range(natoms):
+    gl = gradvallines[i].replace("D","E")
+    cl = coordlines[i].replace("D","E")
+    cls = cl.strip().split()
+    gls = gl.strip().split()
+    grads = [float(gls[0]),float(gls[1]),float(gls[2])]
+    outfile.write(u'{:4d}{:>4} :{:15.9f}{:15.9f}{:15.9f}'.format(i, cls[3],grads[0],grads[1],grads[2])+u"\n")
+
+gradient.close()
+energy.close()
+
+os.rename(u"energy", basename+u".energy")
+os.rename(u"gradient", basename+u".gradient")
diff --git a/pyTEST_Example/template.yaml b/pyTEST_Example/template.yaml
new file mode 100644
index 0000000..faecb30
--- /dev/null
+++ b/pyTEST_Example/template.yaml
@@ -0,0 +1,48 @@
+# Note: for a bimolecular reaction, providing a .xyz or .mol file is recommended
+# Enumeration part
+#input: {current_path}/reaction_xyz/DA.xyz # (1) an input folder with .xyz or .mol files, or (2) a text file with multiple SMILES
+input: reaction_xyz # (1) an input folder with .xyz or .mol files, or (2) a text file with multiple SMILES
+scratch: {current_path}/RESULT/ # output folder holding the reaction geometries used to construct the reaction network
+reaction_data: DA.p
+n_break: 1 # number of bonds to break
+form_all: False # form all possible bonds (matters for atoms with lone pairs, e.g. oxygen and sulfur) (default: 0)
+lewis_criteria: 5.0 # criterion used to identify products
+ff: uff # force field for driving the reaction coordinate
+crest: crest # duplicated below; PyYAML keeps the later {conda_path}/crest value
+lot: gfn2
+crest_quick: False # also duplicated below
+xtb: xtb
+method: crest
+enumeration: False # if you only have the reactant, enumeration is required; otherwise, provide the reaction geometry.
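A caveat on this template: placeholders such as {current_path} and {conda_path} make the file invalid YAML until they are substituted, which is why test_rxn.py later in this diff performs a plain textual replace before calling yaml.safe_load. A quick check, assuming PyYAML as used elsewhere in this patch:

    import yaml

    try:
        yaml.safe_load("scratch: {current_path}/RESULT/")  # raw template line
    except yaml.YAMLError as err:
        print("parse fails before substitution:", type(err).__name__)
    print(yaml.safe_load("scratch: /home/user/RESULT/"))   # fine once substituted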
+n_conf: 5 +nconf_dft: 5 +strategy: 2 +nprocs: 1 +c_nprocs: 1 +mem: 1 # in GB +crest: {conda_path}/crest +opt_level: vtight +crest_quick: False +opt: False +pysis_wt: 3600 +select: network +charge: 0 +multiplicity: 1 +skip_low_IRC: False +skip_low_TS: False +constrained_TS: True +model_path: {model_path} +gsm_inp: {gsm_file_path} + +package: ORCA #Gaussian or ORCA +dft_nprocs: 4 +dft_lot: UHF 6-31G +dft_wt: 4 +ppn: 8 +partition: standby +dft_njobs: 1 +hess_recalc: 3 +dft_irc: True +backward_DE: False +dielectric: 95.3 +solvation: CPCM/read diff --git a/pyTEST_Example/test_rxn.py b/pyTEST_Example/test_rxn.py new file mode 100644 index 0000000..a0778ae --- /dev/null +++ b/pyTEST_Example/test_rxn.py @@ -0,0 +1,107 @@ +import pytest, os, re, yaml +import shutil +import subprocess +#import yarp as yp +#from calculator import add +''' +import yarp as yp +import numpy as np +import threading +import pickle +import multiprocessing as mp +from multiprocessing import Queue +from logging.handlers import QueueHandler +from joblib import Parallel, delayed +from yarp.find_lewis import all_zeros +from yarp.find_lewis import bmat_unique +import os, sys, yaml, fnmatch +import logging +from openbabel import pybel +from utils import * +from wrappers.reaction import * +from job_mapping import * +from wrappers.crest import CREST +from qc_jobs import * +from conf import * +from analyze_functions import * +from wrappers.pysis import PYSIS +from wrappers.gsm import GSM +''' +def truthy(value): + return bool(value) +def falsy(value): + return not bool(value) + +def check_metal(xyz): + + finish = False + FeCO5 = yp.yarpecule(xyz) + # first check adj_mat + nBonds = 20 + nE = 58 + nDative= 5 + if(FeCO5.adj_mat.sum() == nBonds and FeCO5.bond_mats[0].sum() == nE): + # then check bmat + if(FeCO5.adj_mat.sum(axis=1)[0]==nDative): + finish = True + return finish + +def form_bond(a, hashes, nform): + mols = [a] + for i in range(0, nform): + mols = list(set([ y for y in yp.form_bonds(mols,hashes=hashes)])) + hashes.update([ _.hash for _ in mols ]) + print(f"form {i} bond resulted in {len(mols)} new products") + +def break_bond(a, hashes, nbreak): + mols = [a] + mols = list(set([ y for y in yp.break_bonds(mols,n=nbreak)])) + hashes.update([ _.hash for _ in mols ]) + print(f"break {nbreak} bond resulted in {len(mols)} new products") + +def rxn_setYAML(current_path, model_path, gsm_path, conda_path): + if not os.path.isfile("template.yaml"): return + shutil.copyfile("template.yaml", "parameters.yaml") + if not os.path.isfile("parameters.yaml"): return + + with open('parameters.yaml', 'r') as file: filedata = file.read() + # Replace the target string + filedata = filedata.replace('{current_path}', current_path) + filedata = filedata.replace('{model_path}', model_path) + filedata = filedata.replace('{gsm_file_path}', gsm_path) + filedata = filedata.replace('{conda_path}', conda_path) + with open('parameters.yaml', 'w') as file: file.write(filedata) + +def rxn_xtb(): + #subprocess.call("crest ", shell=True) + #subprocess.call("pysis ", shell=True) + #subprocess.call("xtb " , shell=True) + + subprocess.call("python main_xtb.py parameters.yaml", shell=True) + #exec(open("main_xtb.py").read()) + + +def test_file(): + current_directory = os.getcwd() + '/' + CONDA="CONDA_PATH" + rxn_setYAML(current_path = current_directory, + model_path = f"{current_directory}/bin", + gsm_path = f"{current_directory}/bin/inpfileq", + conda_path = f"{CONDA}/bin") + + rxn_xtb() + with open('parameters.yaml', 'rb') as f: conf = yaml.safe_load(f.read()) + 
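Since RUN(conf) stays commented out just below, this test only verifies that main_xtb.py runs end to end. A hedged sketch of an extra sanity check that could follow the yaml.safe_load call above, with key names taken from template.yaml in this diff:

    # Sketch only: confirms placeholder substitution and a sane package choice.
    assert conf["package"] in ("ORCA", "Gaussian")
    assert "{current_path}" not in str(conf["scratch"])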
#RUN(conf) + ''' + assert os.path.exists('FeCO5.xyz') + assert check_metal("FeCO5.xyz") + print("Organometallics CHECK FINISHED\n") + reactant="C=CC=C.C=C" + a = yp.yarpecule(reactant) + hashes = set([a.hash]) + print(f"reactant: {reactant}") + form_bond(a, hashes, 2) + break_bond(a, hashes, 2) + assert len(hashes) == 29 + ''' + diff --git a/pyTEST_Example/utils.py b/pyTEST_Example/utils.py new file mode 100644 index 0000000..7506aa3 --- /dev/null +++ b/pyTEST_Example/utils.py @@ -0,0 +1,451 @@ +import sys, itertools, timeit, os, copy +from openbabel import pybel +from openbabel import openbabel as ob +from collections import Counter +import numpy as np +from yarp.taffi_functions import graph_seps,table_generator,return_rings,adjmat_to_adjlist,canon_order +from yarp.properties import el_to_an,an_to_el,el_mass, el_radii +from yarp.find_lewis import find_lewis,return_formals,return_n_e_accept,return_n_e_donate,return_formals,return_connections,return_bo_dict +from yarp.hashes import atom_hash,yarpecule_hash +from yarp.input_parsers import xyz_parse,xyz_q_parse,xyz_from_smiles, mol_parse +from yarp.misc import merge_arrays, prepare_list +from openbabel import pybel +from rdkit import Chem +from rdkit.Chem import EnumerateStereoisomers, AllChem, TorsionFingerprints, rdmolops, rdDistGeom +from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions +from rdkit.ML.Cluster import Butina +import fnmatch +from wrappers.xtb import * +def geometry_opt(molecule): + ''' + geometry optimization on yarp class + ''' + mol_file='.tmp.mol' + mol_write_yp(mol_file, molecule, append_opt=False) + mol=next(pybel.readfile("mol", mol_file)) + mol.localopt(forcefield='uff') + for count_i, i in enumerate(molecule.geo): + molecule.geo[count_i]=mol.atoms[count_i].coords + os.system("rm {}".format(mol_file)) + return molecule + +def opt_geo_xtb(elements, geo, bond_mat, q=0, filename='tmp'): + ''' + Apply xTB to find product/reactant geometry from reactant/product geometry. 
+    elements: the list of elements corresponding to geo
+    geo: the geometry of the product or reactant
+    bond_mat: the bond-electron matrix of the reactant or product
+    q: the charge state
+    '''
+    tmp_xyz_file=f".{filename}.xyz"
+    tmp_inp_file=f".{filename}.inp"
+    bond=return_bond_info(bond_mat)
+    length=[]
+    constraints=[]
+    xyz_write(tmp_xyz_file, elements, geo)
+    for i in bond: length.append(el_radii[elements[i[0]]]+el_radii[elements[i[1]]])
+    for count_i, i in enumerate(bond): constraints+=[(i[0]+1, i[1]+1, length[count_i])]
+    optjob = XTB(input_geo=tmp_xyz_file, work_folder='.', jobtype=['opt'], jobname='opt', charge=q)
+    optjob.add_command(distance_constraints=constraints, force_constant=1.0)
+    optjob.execute()
+    if optjob.optimization_success():
+        _, Gr = optjob.get_final_structure()
+    else:
+        print("xTB failed to locate a reactant/product pair for this conformer.")
+        return []
+    adj_mat_o = bondmat_to_adjmat(bond_mat)
+    adj_mat_n = table_generator(elements, Gr)
+
+    # clean up temporary files
+    try:
+        files=[i for i in os.listdir(".") if fnmatch.fnmatch(i, f".{filename}*")]
+        for i in files: os.remove(i)
+    except Exception:
+        pass
+
+    if np.abs(adj_mat_o-adj_mat_n).sum() == 0:
+        return Gr
+    else:
+        print("xTB optimization changed the bonding pattern for this conformer.")
+        return []
+# def generate_xtb_constraint(bond, length, filename=".tmp.inp")
+def return_bond_info(mat):
+    info=[]
+    for i in range(len(mat)-1):
+        for j in range(i+1, len(mat)):
+            if mat[i][j]>0:
+                info+=[(i, j)]
+    return info
+def opt_geo(elements,geo,bond_mat,q=0,ff='mmff94',step=1000,filename='tmp',constraints=[]):
+    '''
+    Use Open Babel to perform a force-field geometry optimization
+    The constraints option will be supported in the near future
+    '''
+    # Write a temporary molfile for obminimize to use
+    tmp_filename = '.{}.mol'.format(filename)
+    tmp_xyz_file = '.{}.xyz'.format(filename)
+    count = 0
+    while os.path.isfile(tmp_filename):
+        count += 1
+        if count == 10:
+            print("ERROR in opt_geo: could not find a suitable filename for the tmp geometry. Exiting...")
+            return geo
+        else:
+            tmp_filename = ".{}".format(filename) + tmp_filename
+
+    counti = 0
+    while os.path.isfile(tmp_xyz_file):
+        counti += 1
+        if counti == 10:
+            print("ERROR in opt_geo: could not find a suitable filename for the tmp geometry. Exiting...")
+            return geo
+        else:
+            tmp_xyz_file = ".{}".format(filename) + tmp_xyz_file
+
+    # write down mol file
+    mol_write(tmp_filename,elements,geo,bond_mat,q=q)
+
+    # set up openbabel
+    conv = ob.OBConversion()
+    conv.SetInAndOutFormats('mol','xyz')
+    mol = ob.OBMol()
+    conv.ReadFile(mol,tmp_filename)
+
+    # Set up the force field (fall back to UFF if the requested one is unavailable)
+    forcefield = ob.OBForceField.FindForceField(ff)
+    success = forcefield.Setup(mol)
+    if not success:
+        forcefield = ob.OBForceField.FindForceField("uff")
+        forcefield.Setup(mol)
+    #forcefield.Setup(mol, constraints)
+    #forcefield.SetConstraints(constraints)
+
+    # Run a given number of conjugate-gradient minimization steps and save the coordinates to mol (a pybel one-liner equivalent is sketched just below).
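As an aside to the two calls that follow: geometry_opt at the top of this file drives the same machinery through pybel's high-level wrapper, so this minimization could also be written as a one-liner. A sketch, with the input filename as a placeholder:

    from openbabel import pybel

    mol = next(pybel.readfile("mol", "input.mol"))  # placeholder input file
    mol.localopt(forcefield="uff", steps=1000)      # wraps OBForceField conjugate-gradient steps
    mol.write("xyz", "opt.xyz", overwrite=True)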
+ forcefield.ConjugateGradients(step) + forcefield.GetCoordinates(mol) + # Write the mol to a file + conv.WriteFile(mol,tmp_xyz_file) + + _,G = xyz_parse(tmp_xyz_file) + # Remove the tmp file that was read by obminimize + try: + files=[i for i in os.listdir(".") if fnmatch.fnmatch(i, f".{filename}*")] + for i in files: os.remove(i) + except: + pass + + # check if geo opt returns desired geometry + adj_mat_o = bondmat_to_adjmat(bond_mat) + adj_mat_n = table_generator(elements, G) + if np.abs(adj_mat_o-adj_mat_n).sum() == 0: + return G + else: + # print(adj_mat_o-adj_mat_n) + print("Error: geometry optimization by uff is failed.") + return [] + +def bondmat_to_adjmat(bond_mat): + adj_mat=copy.deepcopy(bond_mat) + for count_i, i in enumerate(bond_mat): + for count_j, j in enumerate(i): + if j and count_i!=count_j: adj_mat[count_i][count_j]=1.0 + if count_i==count_j: adj_mat[count_i][count_i]=0.0 + return adj_mat + +def mol_write(name, elements, geo, bond_mat, q=0, append_opt=False): + adj_mat=bondmat_to_adjmat(bond_mat) + if len(elements) >= 1000: + print( "ERROR in mol_write: the V2000 format can only accomodate up to 1000 atoms per molecule.") + return + mol_dict={3:1, 2:2, 1:3, -1:5, -2:6, -3:7, 0:0} + # Check for append vs overwrite condition + if append_opt == True: + open_cond = 'a' + else: + open_cond = 'w' + + # Parse the basename for the mol header + base_name = name.split(".") + if len(base_name) > 1: + base_name = ".".join(base_name[:-1]) + else: + base_name = base_name[0] + + keep_lone=[count_i for count_i, i in enumerate(bond_mat) if i[count_i]%2==1] + # deal with radicals + fc = list(return_formals(bond_mat, elements)) + # deal with charges + chrg = len([i for i in fc if i != 0]) + valence=[] # count the number of bonds for mol file + for count_i, i in enumerate(bond_mat): + bond=0 + for count_j, j in enumerate(i): + if count_i!=count_j: bond=bond+int(j) + valence.append(bond) + # Write the file + with open(name,open_cond) as f: + # Write the header + f.write('{}\nGenerated by mol_write.py\n\n'.format(base_name)) + + # Write the number of atoms and bonds + f.write("{:>3d}{:>3d} 0 0 0 0 0 0 0 0 1 V2000\n".format(len(elements),int(np.sum(adj_mat/2.0)))) + + # Write the geometry + for count_i,i in enumerate(elements): + f.write(" {:> 9.4f} {:> 9.4f} {:> 9.4f} {:<3s} 0 {:>2d} 0 0 0 {:>2d} 0 0 0 0 0 0\n".format(geo[count_i][0],geo[count_i][1],geo[count_i][2], i.capitalize(), mol_dict[fc[count_i]], valence[count_i])) + # Write the bonds + bonds = [ (count_i,count_j) for count_i,i in enumerate(adj_mat) for count_j,j in enumerate(i) if j == 1 and count_j > count_i ] + for i in bonds: + + # Calculate bond order from the bond_mat + bond_order = int(bond_mat[i[0],i[1]]) + if bond_order==0: bond_order=1 + f.write("{:>3d}{:>3d}{:>3d} 0 0 0 0\n".format(i[0]+1,i[1]+1,bond_order)) + + # write radical info if exist + if len(keep_lone) > 0: + if len(keep_lone) == 1: + f.write("M RAD{:>3d}{:>4d}{:>4d}\n".format(1,keep_lone[0]+1,2)) + elif len(keep_lone) == 2: + f.write("M RAD{:>3d}{:>4d}{:>4d}{:>4d}{:>4d}\n".format(2,keep_lone[0]+1,2,keep_lone[1]+1,2)) + else: + print("Only support one/two radical containing compounds, radical info will be skip in the output mol file...") + + if chrg > 0: + if chrg == 1: + charge = [i for i in fc if i != 0][0] + f.write("M CHG{:>3d}{:>4d}{:>4d}\n".format(1,fc.index(charge)+1,int(charge))) + else: + info = "" + fc_counter = 0 + for count_c,charge in enumerate(fc): + if charge != 0: + if(fc_counter % 8 == 0): #Only 8 items a line# + info += "\nM 
CHG{:>3d}".format(chrg - fc_counter if chrg - fc_counter <= 8 else 8) + info += '{:>4d}{:>4d}'.format(count_c+1,int(charge)) + fc_counter += 1 + info += '\n' + f.write(info) + + f.write("M END\n$$$$\n") + + return + +def xyz_write(name, element, geo, append_opt=False): + if append_opt==False: out=open(name, 'w+') + else: out=open(name, 'a+') + out.write('{}\n\n'.format(len(element))) + for count_i, i in enumerate(element): + out.write('{} {} {} {}\n'.format(i, geo[count_i][0], geo[count_i][1], geo[count_i][2])) + out.close() + return + +def mol_write_yp(name,molecule,append_opt=False): + elements=molecule.elements + geo=molecule.geo + bond_mat=molecule.bond_mats[0] + q=molecule.q + adj_mat=molecule.adj_mat + # Consistency check + if len(elements) >= 1000: + print( "ERROR in mol_write: the V2000 format can only accomodate up to 1000 atoms per molecule.") + return + mol_dict={3:1, 2:2, 1:3, -1:5, -2:6, -3:7, 0:0} + # Check for append vs overwrite condition + if append_opt == True: + open_cond = 'a' + else: + open_cond = 'w' + + # Parse the basename for the mol header + base_name = name.split(".") + if len(base_name) > 1: + base_name = ".".join(base_name[:-1]) + else: + base_name = base_name[0] + + keep_lone=[count_i for count_i, i in enumerate(bond_mat) if i[count_i]%2==1] + # deal with radicals + fc = list(return_formals(bond_mat, elements)) + # deal with charges + chrg = len([i for i in fc if i != 0]) + valence=[] # count the number of bonds for mol file + for count_i, i in enumerate(bond_mat): + bond=0 + for count_j, j in enumerate(i): + if count_i!=count_j: bond=bond+int(j) + valence.append(bond) + # Write the file + with open(name,open_cond) as f: + # Write the header + f.write('{}\nGenerated by mol_write.py\n\n'.format(base_name)) + + # Write the number of atoms and bonds + f.write("{:>3d}{:>3d} 0 0 0 0 0 0 0 0 1 V2000\n".format(len(elements),int(np.sum(adj_mat/2.0)))) + + # Write the geometry + for count_i,i in enumerate(elements): + f.write(" {:> 9.4f} {:> 9.4f} {:> 9.4f} {:<3s} 0 {:>2d} 0 0 0 {:>2d} 0 0 0 0 0 0\n".format(geo[count_i][0],geo[count_i][1],geo[count_i][2], i.capitalize(), mol_dict[fc[count_i]], valence[count_i])) + + # Write the bonds + bonds = [ (count_i,count_j) for count_i,i in enumerate(adj_mat) for count_j,j in enumerate(i) if j == 1 and count_j > count_i ] + for i in bonds: + + # Calculate bond order from the bond_mat + bond_order = int(bond_mat[i[0],i[1]]) + if bond_order==0: bond_order=1 + f.write("{:>3d}{:>3d}{:>3d} 0 0 0 0\n".format(i[0]+1,i[1]+1,bond_order)) + + # write radical info if exist + if len(keep_lone) > 0: + if len(keep_lone) == 1: + f.write("M RAD{:>3d}{:>4d}{:>4d}\n".format(1,keep_lone[0]+1,2)) + elif len(keep_lone) == 2: + f.write("M RAD{:>3d}{:>4d}{:>4d}{:>4d}{:>4d}\n".format(2,keep_lone[0]+1,2,keep_lone[1]+1,2)) + else: + print("Only support one/two radical containing compounds, radical info will be skip in the output mol file...") + + if chrg > 0: + if chrg == 1: + charge = [i for i in fc if i != 0][0] + f.write("M CHG{:>3d}{:>4d}{:>4d}\n".format(1,fc.index(charge)+1,int(charge))) + else: + info = "" + fc_counter = 0 + for count_c,charge in enumerate(fc): + if charge != 0: + if(fc_counter % 8 == 0): #Only 8 items a line# + info += "M CHG{:>3d}".format(chrg - fc_counter if chrg - fc_counter <= 8 else 8) + info += '{:>4d}{:>4d}'.format(count_c+1,int(charge)) + fc_counter += 1 + info += '\n' + f.write(info) + + f.write("M END\n$$$$\n") + + return + +def return_smi(E,G,bond_mat=None,namespace='obabel'): + ''' Function to Return smiles string using 
openbabel (pybel) ''' + if bond_mat is None: + xyz_write(f"{namespace}_input.xyz",E,G) + # Read the XYZ file using Open Babel + molecule = next(pybel.readfile("xyz", f"{namespace}_input.xyz")) + # Generate the canonical SMILES string directly + smile = molecule.write(format="can").strip().split()[0] + # Clean up the temporary file + os.remove(f"{namespace}_input.xyz") + return smile + + else: + mol_write(f"{namespace}_input.mol",E,G,bond_mat) + # Read the mol file using Open Babel + molecule = next(pybel.readfile("mol", f"{namespace}_input.mol")) + # Generate the canonical SMILES string directly + smile = molecule.write(format="can").strip().split()[0] + # Clean up the temporary file + os.remove(f"{namespace}_input.mol") + + return smile + +def return_smi_yp(molecule, namespace="obabel"): + mol_write_yp(f"{namespace}_input.mol",molecule) + mol=next(pybel.readfile("mol", f"{namespace}_input.mol")) + smile=mol.write(format="can").strip().split()[0] + os.remove(f"{namespace}_input.mol") + return smile + +def return_rxn_constraint(mol1, mol2): + adj1=mol1.adj_mat + adj2=mol2.adj_mat + bond_change=[] + d_adj=np.abs(adj2-adj1) + for i in range(len(mol1.elements)): + for j in range(i+1, len(mol1.elements)): + if d_adj[i][j]!=0: bond_change+=[(i, j)] + reactive_atoms=list(set([atom for bond in bond_change for atom in bond])) + # if there are other atoms next to at least two reactive atom in either Reactant or Product, identify them also as reactive atoms + gs1=graph_seps(adj1) + gs2=graph_seps(adj2) + n1=Counter([indj for indj in range(len(mol1.elements)) for indi in reactive_atoms if gs1[indi][indj]==1]) + n2=Counter([indj for indj in range(len(mol1.elements)) for indi in reactive_atoms if gs2[indi][indj]==1]) + reactive_atoms+=list(set([ind for ind, count in n1.items() if count>1]+[ind for ind, count in n2.items() if count>1])) + + return bond_change, reactive_atoms + +def return_metal_constraint(molecule): + # this function will return the bond constraint for metallic bonds + adj_mat=molecule.adj_mat + elements=molecule.elements + dis_constraint=[] + metal_list=['li', 'be',\ + 'na', 'mg', 'al',\ + 'k', 'ca', 'sc', 'ti', 'v', 'cr', 'mn', 'fe', 'co', 'ni', 'cu', 'zn', 'ga', 'ge',\ + 'rb', 'sr', 'y', 'zr', 'nb', 'mo', 'tc', 'ru', 'rh', 'pd', 'ag', 'cd', 'in', 'sn', 'sb',\ + 'cs', 'ba', 'lu', 'hf', 'ta', 'w', 're', 'os', 'ir', 'pt', 'au', 'hg', 'tl', 'pb', 'bi', 'po',\ + 'fr', 'ra', 'lr', 'rf', 'db', 'sg', 'bh', 'hs', 'mt', 'ds', 'rg', 'cn',\ + 'la', 'ce', 'pr', 'nd', 'pm', 'sm', 'eu', 'gd', 'tb', 'dy', 'ho', 'er', 'tm', 'yb',\ + 'ac', 'th', 'pa', 'u', 'np', 'pu', 'am', 'cm', 'bk', 'cf', 'es', 'fm', 'md', 'no'] + for count_e, e in enumerate(elements): + if e.lower() in metal_list: + for count_i, i in enumerate(adj_mat[count_e]): + if i: + dis_constraint.append([count_e+1, count_i+1, el_radii[e.capitalize()]+el_radii[elements[count_i].capitalize()]]) + return dis_constraint + +def return_model_rxn(reaction, depth=1): + # This function is written by Hsuan-Hao Hsu (hsu205@purdue.edu) + # Read in a true reaction and return a reaction class of model reaction + elements=reaction.reactant.elements + R_adj=reaction.reactant.adj_mat + P_adj=reaction.product.adj_mat + R_bond=reaction.reactant.bond_mats[0] + P_bond=reaction.product.bond_mats[0] + for ind in range(len(bond_mat_2)): + BE_change=P_bond-R_bond + print(BE_change) + bond_break=[] + bond_form=[] + return + +def return_inchikey(molecule): + E=molecule.elements + G=molecule.geo + bond_mat=molecule.bond_mats[0] + q=molecule.q + 
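return_inchikey below splits the system into connected components and keeps only the first 14 characters of each component's InChIKey, i.e. the connectivity block, so stereochemistry and protonation layers are ignored when components are compared. A standalone RDKit illustration (RDKit is already a dependency of this module):

    from rdkit import Chem

    key = Chem.MolToInchiKey(Chem.MolFromSmiles("CCO"))  # ethanol
    print(key)       # LFQSCWFLJHTTHZ-UHFFFAOYSA-N
    print(key[:14])  # LFQSCWFLJHTTHZ, the connectivity block only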
gs=graph_seps(molecule.adj_mat) + adj_mat=molecule.adj_mat + groups=[] + loop_ind=[] + for i in range(len(gs)): + if i not in loop_ind: + new_group=[count_j for count_j, j in enumerate(gs[i, :]) if j>=0] + loop_ind += new_group + groups+=[new_group] + inchikey=[] + mol=copy.deepcopy(molecule) + for group in groups: + N_atom=len(group) + mol=copy.deepcopy(molecule) + mol.elements=[E[ind] for ind in group] + mol.bond_mats=[bond_mat[group][:, group]] + mol.geo=np.zeros([N_atom, 3]) + mol.adj_mat=adj_mat[group][:, group] + for count_i, i in enumerate(group): mol.geo[count_i, :]=G[i, :] + mol_write_yp(".tmp.mol", mol) + mol=next(pybel.readfile("mol", ".tmp.mol")) + try: + inchi=mol.write(format='inchikey').strip().split()[0] + except: + print(f"{mol.write(format='inchikey')}") + continue + inchikey+=[inchi] + os.system("rm .tmp.mol") + if len(inchikey)==0: + return "ERROR" + elif len(groups) == 1: + return inchikey[0][:14] + else: + return '-'.join(sorted([i[:14] for i in inchikey])) diff --git a/pyTEST_Example/wrappers/.reaction.py.swp b/pyTEST_Example/wrappers/.reaction.py.swp new file mode 100644 index 0000000..0d4a4f1 Binary files /dev/null and b/pyTEST_Example/wrappers/.reaction.py.swp differ diff --git a/pyTEST_Example/wrappers/crest.py b/pyTEST_Example/wrappers/crest.py new file mode 100755 index 0000000..275ef1e --- /dev/null +++ b/pyTEST_Example/wrappers/crest.py @@ -0,0 +1,192 @@ +#!/bin/env python +# Author: Qiyuan Zhao (zhaoqy1996@gmail.com) + +import subprocess +import os,sys +import numpy as np +from yarp.input_parsers import xyz_parse +sys.path.append('/'.join(os.path.abspath(__file__).split('/')[:-2])) +from constants import Constants + +class CREST: + def __init__(self, input_geo, work_folder=os.getcwd(), lot='gfn2', nproc=1, mem=2000, solvent=False, solvation_model='alpb', opt_level='vtight', charge=0, multiplicity=1, quick_mode=False, xtb_path=None, crest_path=None): + """ + Initialize a xTB job class + input_geo: a xyz file containing the input geometry + work_folder: space for running xTB and saving outputfiles, if is not specified, will use the path of input_geo + level of theory (lot): gfn1, gfn2, gfnff + memory: specify in MB, per cpu + Solvation model: --alpb: analytical linearized Poisson-Boltzmann (ALPB) model, available solvents are acetone, acetonitrile, aniline, benzaldehyde, benzene, ch2cl2, chcl3, cs2, dioxane, dmf, dmso, + ether, ethylacetate, furane, hexandecane, hexane, methanol, nitromethane, octanol, woctanol, phenol, toluene, thf, water.. + --gbsa: generalized born (GB) model with solvent accessable surface (SASA) model, available solvents are acetone, acetonitrile, benzene (only GFN1-xTB), CH2Cl2, CHCl3, CS2, DMF (only GFN2-xTB), + DMSO, ether, H2O, methanol, n-hexane (only GFN2-xTB), THF and toluene. 
+ opt_level: vloose,loose,normal,tight,vtight + quick_mode: False, quick, squick, vquick + """ + # set basic + self.input_geo = input_geo + self.jobname = input_geo.split('/')[-1].split('.xyz')[0] + self.mem = int(mem) + self.nproc = int(nproc) + + # set flag + self.charge = f'-chrg {charge}' + self.unpair = f'-uhf {multiplicity-1}' + self.lot = f'-{lot}' + + # set solvent + if solvation_model.lower() == 'alpb': solvation_model = 'alpb' + else: solvation_model = 'g' # use GBSA implicit solvent + if solvent: self.solvent = f'-{solvation_model} {solvent} ' + else: self.solvent = solvent + + # set working folder + self.work_folder = work_folder + self.xcontrol = os.path.join(self.work_folder,f'{self.jobname}.xcontrol') + self.output = os.path.join(self.work_folder,f'{self.jobname}-crest.out') + + # create work folder + if os.path.isdir(self.work_folder) is False: os.mkdir(self.work_folder) + + # set xtb and crest path + if xtb_path is None: xtb_path = os.popen('which xtb').read().rstrip() + if crest_path is None: crest_path = os.popen('which crest').read().rstrip() + + # crest calculation basic command + self.command = f'{crest_path} {self.input_geo} -xname {xtb_path} {self.charge} {self.unpair} {self.lot} -nozs -T {self.nproc} ' + if quick_mode: self.command += f'-{quick_mode} ' + if self.solvent: self.command += self.solvent + print(f"CREST COMMAND: {self.command}\n") + def generate_xcontrol(self, distance_constraints=[], cartesian_constraints=[], force_constant=0.5): + """ + Generate an XTB input file with constraints + Each element in distance_constraints should be [atomi,atomj,distance] -- index start from 1 + cartesian_constraints should be a list of atoms that need to be constrained + """ + with open(self.xcontrol, 'w') as f: + if len(distance_constraints) > 0: + f.write(f'$constrain\nforce constant={force_constant}\n') + for dis in distance_constraints: + f.write(f'distance:{dis[0]}, {dis[1]}, {dis[2]:.4f}\n') + f.write('$\n\n') + + if len(cartesian_constraints) > 0: + list_of_ranges, used_atoms = [], [] + for i in sorted(cartesian_constraints): + atom_range = [] + if i not in used_atoms: + while i in cartesian_constraints: + used_atoms.append(i) + atom_range.append(i) + i += 1 + if len(atom_range) == 1: + list_of_ranges += str(atom_range[0]) + else: + list_of_ranges.append(f'{atom_range[0]}-{atom_range[-1]}') + + # write into constraints + f.write(f'$constrain\nforce constant={force_constant}\natoms: {",".join(list_of_ranges)}\n$\n\n') + + return + + def add_command(self, additional=False, distance_constraints=[], cartesian_constraints=[], force_constant=0.5): + """ + Add in additional command and cpnstraints + Examples of additional commands: + -ewin : set energy window in kcal/mol, + [default: 6.0 kcal/mol] + -rthr : set RMSD threshold in Ang, + [default: 0.125 Ang] + -ethr : set E threshold in kcal/mol, + [default: 0.05 kcal/mol] + -bthr : set Rot. const. 
threshold , + [default: 0.01 (= 1%)] + -pthr : Boltzmann population threshold + [default: 0.05 (= 5%)] + -temp : set temperature in cregen, [default: 298.15 K] + """ + # add other commands if is needed: + if additional: self.command += additional + if len(distance_constraints) > 0 or len(cartesian_constraints) > 0: + self.generate_xcontrol(distance_constraints, cartesian_constraints, force_constant) + self.command += f' -cinp {self.xcontrol}' + + def execute(self): + """ + Execute a CREST calculation using the runtime flags + """ + + # obtain current path + current_path = os.getcwd() + + # go into the work folder and run the command + os.chdir(self.work_folder) + env = os.environ.copy() + env['OMP_NUM_THREADS'] = str(self.nproc) + result = subprocess.run(f"{self.command} > {self.output}", shell=True, env=env, capture_output=True, text=True) + os.chdir(current_path) + + return result + + def calculation_terminated_normally(self) -> bool: + """ + Check if the calculation terminate normally + """ + # load in crest output + if os.path.isfile(self.output) is False: return False + + try: lines = open(self.output, 'r', encoding="utf-8").readlines() + except: + print(f"{self.output} is failed to read. please check it!") + return False + + for n_line, line in enumerate(reversed(lines)): + if 'CREST terminated normally.' in line: + return True + + return False + + def get_stable_conformer(self): + """ + Get the final set of geometry (and elements) from crest output files + """ + # First try the .xyz file generated + xyz_file_name = f'{self.work_folder}/crest_best.xyz' + if os.path.exists(xyz_file_name): + E,G = xyz_parse(xyz_file_name) + line= open(xyz_file_name, 'r', encoding="utf-8").readlines()[1] + if "energy" not in line: SPE = float(line.split()[0]) + else: SPE=float(line.split()[1]) + return E, G, SPE + else: + return False + + def get_all_conformers(self): + """ + Get the entire set of geometry (and elements) from crest output files + """ + # First try the .xyz file generated + xyz_file_name = f'{self.work_folder}/crest_conformers.xyz' + ene_file_name = f'{self.work_folder}/crest.energies' + if os.path.exists(xyz_file_name) and os.path.exists(ene_file_name): + mols=[] + elements, geometries = xyz_parse(xyz_file_name,multiple=True) + for count_i, i in enumerate(elements): + mols.append((i, geometries[count_i])) + lines = open(ene_file_name, 'r', encoding="utf-8").readlines() + ene_list = [] + for line in lines: + if len(line.split()) == 0: break + ene_list.append(float(line.split()[-1])) + # check consistency of these two files + if len(ene_list) != len(mols): + print("Inconsistent output energies and conformers") + return False + else: + conf = {} + for ind,mol in enumerate(mols): + conf[ind] = {'G':mol[1],'E_ref':ene_list[ind]} + return conf + else: + return False + diff --git a/pyTEST_Example/wrappers/gaussian.py b/pyTEST_Example/wrappers/gaussian.py new file mode 100644 index 0000000..0a865bd --- /dev/null +++ b/pyTEST_Example/wrappers/gaussian.py @@ -0,0 +1,342 @@ +#!/bin/env python +# Author: Qiyuan Zhao (zhaoqy1996@gmail.com) + +import subprocess +import os,sys +import time +import numpy as np + +sys.path.append('/'.join(os.path.abspath(__file__).split('/')[:-2])) +from yarp.input_parsers import xyz_parse +from constants import Constants +from utils import xyz_write + +class Gaussian: + def __init__(self, input_geo, work_folder=os.getcwd(), lot='B3LYP/6-31G*', jobtype='OPT', nproc=1, mem=1000, jobname='gaussianjob', charge=0, multiplicity=1, \ + solvent=False, 
dielectric=0.0,solvation_model='PCM', dispersion=False): + """ + Initialize an Gaussian job class + input_geo: a xyz file containing the input geometry + work_folder: working directory for running the gaussian task + jobtype: can be single (e.g., "OPT") or multiple jobs (e.g., "OPT FREQ") or with additional specification (e.g., "OPT=(TS, CALCALL, NOEIGEN, maxcycles=100)") + lot: Level of theory, e.g., "B3LYP/TZVP" + mem: unit in MB, per core + solvent: if False, will not call solvation model, otherwise specify water, THF, etc. + solvation_model: select from PCM, CPCM, SMD + """ + self.input_geo = input_geo + self.work_folder = work_folder + self.gjf = f'{work_folder}/{jobname}.gjf' + self.jobtype = jobtype + self.lot = lot + self.nproc = int(nproc) + self.mem = int(mem) + self.jobname = jobname + self.output = f'{work_folder}/{jobname}.out' + self.additional = False + self.dielectric = float(dielectric) + self.dispersion=dispersion + if solvent=="read": + self.solvation = f"SCRF=(Read)" + elif solvent: + self.solvation = f"SCRF=({solvation_model}, solvent={solvent})" + else: + self.solvation = False + + # create work folder + if os.path.isdir(self.work_folder) is False: os.mkdir(self.work_folder) + + # prepare_input_geometry(self): + elements, geometry = xyz_parse(input_geo) + self.natoms = len(elements) + self.elements = elements + self.geometry = geometry + self.charge=int(charge) + self.multiplicity=int(multiplicity) + self.xyz = f'{charge} {multiplicity}\n' + for ind in range(len(elements)): + self.xyz += f'{elements[ind]:<3} {geometry[ind][0]:^12.8f} {geometry[ind][1]:^12.8f} {geometry[ind][2]:^12.8f}\n' + self.xyz += '\n' + + def generate_input(self, constraints=[]): + """ + Create an Gaussian job script for given settings + """ + with open(self.gjf, "w") as f: + f.write(f"%NProcShared={self.nproc}\n") + f.write(f"%Mem={self.mem*self.nproc}MB\n") + if self.dispersion: + command = f"#{self.lot} EmpiricalDispersion=GD3 " + else: + command = f"#{self.lot} " + if self.solvation: + command += f" {self.solvation}" + # jobtype settings + if self.jobtype.lower()=="opt": + if self.natoms==1: command += f"Int=UltraFine Opt=(maxcycles=300) SCF=QC\n\n" + else: command += f"Opt=(maxcycles=300) Int=UltraFine SCF=QC Freq\n\n" + elif self.jobtype.lower()=="tsopt": + command+=f" OPT=(TS, CALCALL, NOEIGEN, maxcycles=300) Freq\n\n" + elif self.jobtype.lower()=='irc': + command+=f" IRC=(LQA, recorrect=never, CalcFC, maxpoints=100, Stepsize=10, maxcycles=300, Report=Cartesians)\n\n" + elif self.jobtype.lower()=='copt': + command+=f" Opt geom=connectivity\n\n" + # add constraints as the following form: + # C -1 x y z + # C -1 x y z + # H 0 x y z + # H 0 x y z + # constraint on C and fully optimize on H + self.xyz=f"{self.charge} {self.multiplicity}\n" + for count_i, i in enumerate(self.elements): + if count_i in constraints: + self.xyz+=f"{i:<3} -1 {self.geometry[count_i][0]:^12.8f} {self.geometry[count_i][3]:^12.8f} {self.geometry[count_i][2]:^12.8f}\n" + self.xyz+="\n" + f.write(command) + f.write(f"{self.jobname} {self.jobtype}\n\n") + f.write(self.xyz) + if self.solvation and self.dielectric>0.0: + f.write("solventname=newsolvent\n") + f.write(f"eps={self.dielectric}\n\n") + def execute(self): + """ + Execute a Gaussian calculation using the runtime flags + """ + + # obtain current path + current_path = os.getcwd() + + # go into the work folder and run the command + os.chdir(self.work_folder) + env = os.environ.copy() + env['OMP_NUM_THREADS'] = str(self.nproc) + result = subprocess.run(f"module load 
gaussian16;g16 < {self.gjf} > {self.output}", shell=True, env=env, capture_output=True, text=True) + + # go back to the original folder + os.chdir(current_path) + + return result + + def job_finished(self) -> bool: + """ + Check if the gaussian job has been finished + """ + if os.path.isfile(self.output) is False: return False + # load gaussian output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + + # set termination indicators + termination_strings = ['Normal termination', 'Error termination'] + + for n_line, line in enumerate(reversed(lines)): + if any(substring in line for substring in termination_strings): return True + # The above lines are pretty close to the end of the file – so skip parsing it all + if n_line > 30: return False + + def calculation_terminated_normally(self) -> bool: + """ + Check if the calculation terminate normally + """ + if os.path.isfile(self.output) is False: return False + # load gaussian output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + + for n_line, line in enumerate(reversed(lines)): + if 'Normal termination' in line: return True + # The above lines are pretty close to the end of the file – so skip parsing it all + if n_line > 30: return False + + def optimization_converged(self) -> bool: + """ + Check if the optimization converges + """ + # load Gaussian output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + + for line in reversed(lines): + if 'Optimization completed' in line: + return True + + return False + + def get_imag_freq(self): + """ + Obtain all imaginary frequencies + """ + imag_freq, imag_ind = [],[] + # load Gaussian output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + N_imag= 0 + # identify the position of the final frequencies + for count, line in enumerate(reversed(lines)): + if 'imaginary frequencies (negative' in line: + N_imag = int(line.split()[1]) + imag_line = len(lines) - count - 1 + break + + if N_imag == 0: + return [], N_imag + else: + freq_line = lines[imag_line+9].split() + imag_freq = [float(freq_line[ind+2]) for ind in range(N_imag)] + return imag_freq, N_imag + + def is_TS(self) -> bool: + """ + Check if this is a ture transition state after TS optimization + """ + imag_freq, N_imag = self.get_imag_freq() + if N_imag == 1 and abs(imag_freq[0]) > 10: return True + else: return False + + def get_final_structure(self): + """ + Get the final set of geometry (and elements) from an Gaussian output file + """ + # load Gaussian output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + + # Initialize periodic table + periodic = { "H": 1, "He": 2,\ + "Li":3, "Be":4, "B":5, "C":6, "N":7, "O":8, "F":9, "Ne":10,\ + "Na":11, "Mg":12, "Al":13, "Si":14, "P":15, "S":16, "Cl":17, "Ar":18,\ + "K":19, "Ca":20, "Sc":21, "Ti":22, "V":23, "Cr":24, "Mn":25, "Fe":26, "Co":27, "Ni":28, "Cu":29, "Zn":30, "Ga":31, "Ge":32, "As":33, "Se":34, "Br":35, "Kr":36,\ + "rb":37, "sr":38, "y":39, "zr":40, "nb":41, "mo":42, "tc":43, "ru":44, "rh":45, "pd":46, "ag":47, "cd":48, "in":49, "sn":50, "sb":51, "te":52, "i":53, "xe":54,\ + "cs":55, "ba":56, "hf":72, "ta":73, "w":74, "re":75, "os":76, "ir":77, "pt":78, "au":79, "hg":80, "tl":81, "pb":82, "bi":83, "po":84, "at":85, "rn":86} + + # create an inverse periodic table + invert_periodic = {} + for p in periodic.keys(): + invert_periodic[periodic[p]]=p + + # identify the position of the final frequencies + split_ind = [] + for count, line in enumerate(reversed(lines)): + if 
'---------------------------------------------------------------------' in line: + split_ind.append(len(lines) - count - 1) + if 'Standard orientation:' in line: + start_ind = len(lines) - count + 4 + end_ind = split_ind[-3] + break + + # initialize E and G + E,G = [],[] + for count in range(start_ind,end_ind): + fields = lines[count].split() + E += [invert_periodic[float(fields[1])]] + G += [[float(fields[3]),float(fields[4]),float(fields[5])]] + + return E, np.array(G) + + def get_imag_freq_mode(self) -> np.ndarray: + """ + Get the imaginary frequency mode + """ + # load Gaussian output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + + # identify the position of the final frequencies + for count, line in enumerate(reversed(lines)): + if 'imaginary frequencies (negative' in line: + imag_line = len(lines) - count - 1 + break + + # initialize imag_freq mode + mode = [] + for count in range(imag_line+14,imag_line+self.natoms+14): + fields = lines[count].split() + mode += [[float(fields[2]),float(fields[3]),float(fields[4])]] + + return np.array(mode) + + def analyze_IRC(self, return_internal=False): + """ + Analyze IRC output, return two end points + """ + # load output job + lines = open(self.output, 'r', encoding="utf-8").readlines() + + for lc,line in enumerate(lines): + fields = line.split() + # find the summary of IRC + if len(fields)== 5 and fields[0] == 'Summary' and fields[1] == 'of' and fields[2] == 'reaction': + count_start = lc + 3 + # locate the end of summary + if len(fields)== 5 and fields[0]=='Total' and fields[1]=='number' and fields[2]=='of' and fields[3]=='points:': + N_image = int(fields[4]) + 1 + count_end = lc - 2 + + # initialize the geometry dictionary + geo_dict={} + for i in range(N_image+1)[1:]: + geo_dict[str(i)]=[] + + for count in range(count_start,count_end): + fields = lines[count].split() + if fields[0] in geo_dict.keys(): + geo_dict[fields[0]] += [float(value) for value in fields[1:]] + + # parse energy, iternal coord, and geometry + Energy = [] + ITC = [] + traj = [] + for i in range(N_image+1)[1:]: + coords = geo_dict[str(i)] + Energy.append(float(coords[0])) + ITC.append(coords[1]) + traj.append(np.array(coords[2:]).reshape((self.natoms,3))) + barrier=[max(Energy)-Energy[0], max(Energy)-Energy[-1]] + TS=traj[Energy.index(max(Energy))] + # Write trajectory + out=open(f"{self.work_folder}/{self.jobname}_traj.xyz", "w+") + for count_i, i in enumerate(Energy): + out.write(f"{self.natoms}\n") + out.write(f"Image: {count_i} Energy: {i}\n") + for count_j, j in enumerate(traj[count_i]): + out.write(f"{self.elements[count_j]} {j[0]} {j[1]} {j[2]}\n") + out.close() + if not return_internal: + return self.elements, traj[0], traj[-1], TS, barrier[0], barrier[1] + else: + return self.elements, traj[0], traj[-1], TS, barrier[0], barrier[1], ITC + + def get_thermal(self) -> dict: + """ + Get thermochemistry properties, including Gibbs free energy, enthalpy, entropy, and inner enenrgy, from Gaussian output file + """ + # load Gaussian output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + + ZPE_corr,zero_E,H_298,F_298=0,0,0,0 + grad_lines = [] + + for lc,line in enumerate(lines): + fields = line.split() + if len(fields) == 4 and fields[0] == 'Zero-point' and fields[1] == 'correction=' and fields[3] == '(Hartree/Particle)': ZPE_corr = float(fields[-2]) + if len(fields) == 7 and fields[0] == 'Sum' and fields[2] == 'electronic' and fields[4] == 'zero-point': zero_E = float(fields[-1]) + if len(fields) == 7 and fields[0] == 'Sum' 
and fields[2] == 'electronic' and fields[5] == 'Enthalpies=': H_298 = float(fields[-1]) + if len(fields) == 8 and fields[0] == 'Sum' and fields[2] == 'electronic' and fields[5] == 'Free' and fields[6] == 'Energies=': F_298 = float(fields[-1]) + + # parse thermal properties from output + thermal = {'GibbsFreeEnergy':F_298,'Enthalpy':H_298,'InnerEnergy':zero_E,'SPE':zero_E-ZPE_corr, "Entropy": 0.0} + + return thermal + + def get_energy(self): + SPE=0.0 + # load Gaussian output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + + ZPE_corr,zero_E,H_298,F_298=0,0,0,0 + grad_lines = [] + + for lc,line in enumerate(lines): + fields = line.split() + if len(fields) == 4 and fields[0] == 'Zero-point' and fields[1] == 'correction=' and fields[3] == '(Hartree/Particle)': ZPE_corr = float(fields[-2]) + if len(fields) == 7 and fields[0] == 'Sum' and fields[2] == 'electronic' and fields[4] == 'zero-point': zero_E = float(fields[-1]) + if len(fields) == 7 and fields[0] == 'Sum' and fields[2] == 'electronic' and fields[5] == 'Enthalpies=': H_298 = float(fields[-1]) + if len(fields) == 8 and fields[0] == 'Sum' and fields[2] == 'electronic' and fields[5] == 'Free' and fields[6] == 'Energies=': F_298 = float(fields[-1]) + + # parse thermal properties from output + SPE=zero_E-ZPE_corr + return SPE diff --git a/pyTEST_Example/wrappers/gsm.py b/pyTEST_Example/wrappers/gsm.py new file mode 100755 index 0000000..a8efdd2 --- /dev/null +++ b/pyTEST_Example/wrappers/gsm.py @@ -0,0 +1,215 @@ +#!/bin/env python +# Author: Qiyuan Zhao (zhaoqy1996@gmail.com) +import subprocess +import os,sys +import numpy as np + +sys.path.append('/'.join(os.path.abspath(__file__).split('/')[:-2])) +from yarp.input_parsers import xyz_parse +from constants import Constants + +class GSM: + def __init__(self, input_geo, input_file, work_folder=os.getcwd(), method= 'xtb',lot="gfn2", jobname='gsmjob', jobid=1, nprocs=1, charge=0, multiplicity=1, solvent=False, solvation_model='alpb'): + """ + Initialize a GSM job class + input_geo: a xyz file containing the input geometry of reactant and product + input_file: To control and generation of GSM task easier, please edit and provide this input file to generate GSM jobs (example in wrappers/GSM/inpfile) + method: select from xtb, orca, and qchem + lot, charge, multiplicity: molecular properties + + Notes: + 1. GSM binaries are stored in bin/, gsm.orca works for orca and xTB calculations while gsm.qchem works for qchem + 2. If you are using Orca for GSM, you need to edit scripts/ograd to specify the level of theory and other settings of orca for this task + 3. If you are using QChem for GSM, you need to edit scripts/qstart to specify the level of theory and other settings of qchem for this task + 4. tm2orca.py, ograd_xtb, gscreate and qend are for xTB and QChem, please do not touch them. 
+ + """ + self.input_geo = input_geo + self.input_file = input_file + self.work_folder = work_folder + self.jobname = jobname + self.jobid = jobid + self.nprocs = nprocs + self.method = method + self.source_path = '/'.join(os.path.abspath(__file__).split('/')[:-2]) + self.output = f'{work_folder}/scratch/paragsm{jobid:04d}' + self.charge = int(charge) + self.multiplicity = int(multiplicity) + self.lot=lot[-1] + if solvent: + if solvation_model.lower() == 'alpb': self.solvation = f'--alpb {solvent}' + else: self.solvation = f'--gbsa {solvent}' # use GBSA implicit solvent + else: + self.solvation = False + + def write_ograd(self): + """ + Write down/copy grad files for GSM + """ + if self.method.lower() == 'xtb': + os.system(f'cp {self.source_path}/scripts/ograd_xtb {self.work_folder}/ograd') + with open(f'{self.work_folder}/ograd','a') as f: + if self.solvation: + f.write(f'xtb $ofile.xyz --grad --chrg {self.charge} --uhf {self.multiplicity-1} --gfn {self.lot} {self.solvation} > $ofile.xtbout\n\n') + else: + f.write(f'xtb $ofile.xyz --grad --chrg {self.charge} --uhf {self.multiplicity-1} --gfn {self.lot} > $ofile.xtbout\n\n') + f.write('python tm2orca.py $basename\n') + f.write('rm xtbrestart\ncd ..\n') + elif self.method.lower() == 'orca': + os.system(f'cp {self.source_path}/scripts/ograd {self.work_folder}') + elif self.method.lower() == 'qchem': + os.system(f'cp {self.source_path}/scripts/gscreate {self.work_folder}') + + def prepare_job(self): + """ + Prepare necessary files for running GSM + """ + # make a scratch folder in working folder + if os.path.isdir(self.work_folder) is False: + os.mkdir(self.work_folder) + + if os.path.isdir(f'{self.work_folder}/scratch') is False: + os.mkdir(f'{self.work_folder}/scratch') + + # copy input geometry to scratch + os.system(f'cp {self.input_geo} {self.work_folder}/scratch/initial{self.jobid:04d}.xyz') + + # prepare necessary files + if self.method.lower() == 'xtb': + self.write_ograd() + os.system(f'cp {self.source_path}/scripts/tm2orca.py {self.work_folder}/scratch') + os.system(f'cp {self.source_path}/bin/gsm.orca {self.work_folder}') + os.system(f'cp {self.input_file} {self.work_folder}/inpfileq') + print(f"Finish preparing working environment for GSM-xTB job {self.jobname}") + elif self.method.lower() == 'orca': + self.write_ograd() + os.system(f'cp {self.source_path}/bin/gsm.orca {self.work_folder}') + os.system(f'cp {self.input_file} {self.work_folder}') + print(f"Finish preparing working environment for GSM-Orca job {self.jobname}") + elif self.method.lower() == 'qchem': + self.write_ograd() + os.system(f'cp {self.source_path}/scripts/qstart {self.work_folder}') + os.system(f'cp {self.source_path}/scripts/qend {self.work_folder}') + os.system(f'cp {self.source_path}/bin/gsm.qchem {self.work_folder}') + os.system(f'cp {self.input_file} {self.work_folder}') + print(f"Finish preparing working environment for GSM-QChem job {self.jobname}") + else: + print("Current version only supports xTB/Orca/QChem as QC Engines, other packages will be added in the future") + + def execute(self): + """ + Execute a GSM calculation using the runtime flags + """ + + # obtain current path + current_path = os.getcwd() + + # go into the work folder and run the command + os.chdir(self.work_folder) + env = os.environ.copy() + env['OMP_NUM_THREADS'] = str(self.nprocs) + #env["LD_LIBRARY_PATH"] = "/export/apps/CentOS7/intel/oneapi/mkl/2021.1.1/lib/intel64:" + env.get("LD_LIBRARY_PATH", "") # special for athena + #env["LD_LIBRARY_PATH"] = 
"/sw/pkgs/arc/intel/2022.1.2/mkl/2022.0.2/lib/intel64:" + env.get("LD_LIBRARY_PATH", "") # special for great lakes + result = subprocess.run(f"./gsm.orca {self.jobid} 1 > {self.output}", shell=True, env=env, capture_output=True, text=True) + + # cleanup files + tmp_scratch = f"{self.work_folder}/scratch" + files = [os.path.join(tmp_scratch,filei) for filei in os.listdir(tmp_scratch) if 'orca' in filei or 'structure' in filei] + if len(files) > 0: [os.remove(filei) for filei in files] + + # go back to the original folder + os.chdir(current_path) + + return result + + def calculation_terminated_normally(self) -> bool: + """ + Check if the calculation terminate normally + """ + if os.path.isfile(self.output) is False: return False + + # load gsm output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + + # set termination indicator + for line in reversed(lines): + if 'about to write tsq.xyz' in line or 'exiting' in line or 'creating final string file' in line: + return True + + return False + + def find_correct_TS(self) -> bool: + """ + Check if the SSM task successfully locate a TS + """ + # load sm output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + energies = [] + for line in reversed(lines): + if 'string E (kcal/mol)' in line: + energies = [float(i) for i in line.split()[3:]] + break + if 'V_profile:' in line: + energies = [float(i) for i in line.split()[1:]] + break + + if len(energies) == 0: return False + + # check energies + peaks = [] + for i in range(1, len(energies) - 1): + if energies[i] > energies[i-1] and energies[i] > energies[i+1]: + peaks.append(i) + + if len(peaks) != 1: return False + else: return peaks[0] + + def get_barrier(self) -> float: + """ + Get single point energy from Orca output file + """ + # load orca output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + + for line in reversed(lines): + if 'string E (kcal/mol)' in line: + energies = [float(i) for i in line.split()[3:]] + break + if 'V_profile:' in line: + energies = [float(i) for i in line.split()[1:]] + break + + try: + TS_E = max(energies[1:]) + TS_ind = energies.index(TS_E, 1) + R_E = min(energies[:TS_ind]) + barrier = TS_E - R_E + return barrier + except: + return False + + def get_strings(self): + """ + Get the final optimized string of images + """ + strings_xyz = f'{self.work_folder}/stringfile.xyz{self.jobid:04d}' + if os.path.exists(strings_xyz): + elements, geometries = xyz_parse(strings_xyz,multiple=True) + mol=[] + for count_i, i in enumerate(elements): + mol.append((i, geometries[count_i])) + return mol + else: + return False + + def get_TS(self): + """ + Get the ts geometry (and elements) from a gsm output file + """ + if not self.calculation_terminated_normally(): return False, [] + if not self.find_correct_TS(): return False, [] + if not self.get_strings(): return False, [] + images = self.get_strings() + ts_ind = self.find_correct_TS() + ts = images[ts_ind] + return ts[0], ts[1] diff --git a/pyTEST_Example/wrappers/model_reaction.py b/pyTEST_Example/wrappers/model_reaction.py new file mode 100644 index 0000000..cf17b01 --- /dev/null +++ b/pyTEST_Example/wrappers/model_reaction.py @@ -0,0 +1,572 @@ +import sys, itertools, timeit, os, copy, math +from itertools import combinations +from openbabel import pybel +from openbabel import openbabel as ob +from collections import Counter +import numpy as np +import yarp as yp +import yaml, fnmatch, pickle +import scipy +# from sklearn.preprocessing import normalize 
+sys.path.append('/'.join(os.path.abspath(__file__).split('/')[:-2]))
+from yarp.taffi_functions import graph_seps,table_generator,return_rings,adjmat_to_adjlist,canon_order
+from yarp.properties import el_to_an,an_to_el,el_mass, el_radii
+from yarp.find_lewis import find_lewis,return_formals,return_n_e_accept,return_n_e_donate,return_connections,return_bo_dict
+from yarp.hashes import atom_hash,yarpecule_hash
+from yarp.input_parsers import xyz_parse,xyz_q_parse,xyz_from_smiles, mol_parse
+from yarp.misc import merge_arrays, prepare_list
+from rdkit import Chem
+from rdkit.Chem import EnumerateStereoisomers, AllChem, TorsionFingerprints, rdmolops, rdDistGeom
+from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions
+from rdkit.ML.Cluster import Butina
+from math import cos, sin
+from wrappers.reaction import *
+from utils import *
+
+class MODEL(object):
+    def __init__(self, reaction, depth=1, ff='mmff94'):
+        self.reaction=reaction
+        self.depth=depth
+        self.ff=ff
+        self.model=None
+
+    def return_model_rxn(self):
+        rxn=self.reaction
+        depth=self.depth
+        elements=rxn.reactant.elements
+        R_geo=rxn.reactant.geo
+        P_geo=rxn.product.geo
+        R_adj=rxn.reactant.adj_mat
+        P_adj=rxn.product.adj_mat
+        R_bond=rxn.reactant.bond_mats[0]
+        P_bond=rxn.product.bond_mats[0]
+        BE_change=P_bond-R_bond
+        adj_change=P_adj-R_adj
+        bond_change, reactive_atoms=return_adj_change(adj_change)
+        gs=graph_seps(R_adj)
+        keep_idx=list(reactive_atoms)
+        edge_idx=[]
+        for i in bond_change:
+            if i[0] not in keep_idx: keep_idx.append(i[0])
+            if i[1] not in keep_idx: keep_idx.append(i[1])
+            for count_j, j in enumerate(gs[i[0]]):
+                if j>0 and j<=depth and count_j not in keep_idx: keep_idx.append(count_j)
+                if j>0 and j==depth and reactive_atoms and count_j not in edge_idx: edge_idx.append(count_j)
+            for count_j, j in enumerate(gs[i[1]]):
+                if j>0 and j<=depth and count_j not in keep_idx: keep_idx.append(count_j)
+                if j>0 and j==depth and reactive_atoms and count_j not in edge_idx: edge_idx.append(count_j)
+        # collect the heavy (non-hydrogen) atoms; if every heavy atom is already kept,
+        # the full reaction is its own model reaction
+        tmp_E=[]
+        for i in elements:
+            if i != "H" and i!="h": tmp_E.append(i)
+        if len(tmp_E)==len(keep_idx):
+            print(f"This reaction is a model reaction with depth {depth}.")
+            self.model=rxn
+            return
+        new_R_E, new_R_geo, new_P_geo=self.return_model_geo(elements, R_geo, R_bond, BE_change, keep_idx, edge_idx)
+        if len(new_P_geo)==0 or len(new_R_geo)==0:
+            print(f"This reaction failed to optimize with {self.ff}.")
+            self.model=None
+            return
+        xyz_write(".tmp_R.xyz", new_R_E, new_R_geo)
+        reactant=yp.yarpecule(".tmp_R.xyz", canon=False)
+        os.system("rm .tmp_R.xyz")
+        xyz_write(".tmp_P.xyz", new_R_E, new_P_geo)
+        product=yp.yarpecule(".tmp_P.xyz", canon=False)
+        os.system("rm .tmp_P.xyz")
+        self.model=reaction(reactant, product, args=rxn.args, opt=True)
+        return
+
+    def return_model_geo(self, elements, geo, bondmat, BE_change, keep_idx, edge_idx):
+        new_E, new_geo, new_edge, new_bondmat, numbond, new_BE_change=[], [], [], [], [], []
+        for count_i, i in enumerate(elements):
+            tmp=0
+            if count_i in keep_idx:
+                if count_i in edge_idx: new_edge.append(len(new_E))
+                new_E.append(i)
+                new_geo.append(geo[count_i])
+                new_bondmat.append([j for count_j, j in enumerate(bondmat[count_i]) if count_j in keep_idx])
+                new_BE_change.append([j for count_j, j in enumerate(BE_change[count_i]) if count_j in keep_idx])
+                for count_j, j in enumerate(bondmat[count_i]):
+                    if count_j != count_i: tmp+=j
+                numbond.append(tmp)
+
+        #for count_, _ in enumerate(new_E):
+        #    print(f"{_} 
{new_geo[count_][0]} {new_geo[count_][1]} {new_geo[count_][2]}") + + for i in new_edge: + tot_bond=0 + double_bond=0 + for count_j, j in enumerate(new_bondmat[i]): + if count_j != i: tot_bond+=j + if j>1: double_bond+=(j-1) + num_add_hydrogen=numbond[i]-tot_bond + if num_add_hydrogen!=0: + bond_length=el_radii[new_E[i]]+el_radii["H"] + connect_ids=[count_k for count_k, k in enumerate(new_bondmat[i]) if count_k!=i and k>=1] + if numbond[i]-double_bond==1: + # A-H condition: just add the hydrogen randomly + new_coord=new_geo[i]+bond_length*np.array([1.02, 0.0, 0.0]) + new_E.append("H") + new_geo.append(new_coord) + for count_k, k in enumerate(new_bondmat): + if count_k != i: + new_bondmat[count_k].append(0) + new_BE_change[count_k].append(0) + elif count_k == i: + new_bondmat[count_k].append(1) + new_BE_change[count_k].append(0) + bond_h=[] + change=[] + for count_k, k in enumerate(new_bondmat[0]): + if count_k != i: + bond_h.append(0) + change.append(0) + else: + bond_h.append(1) + change.append(0) + new_bondmat.append(bond_h) + new_BE_change.append(change) + elif numbond[i]-double_bond==2: + if num_add_hydrogen==1: + # B-A-H condition: add hydrogen along AB vector + vec=[new_geo[i][0]-new_geo[connect_ids[0]][0], new_geo[i][1]-new_geo[connect_ids[0]][1], new_geo[i][2]-new_geo[connect_ids[0]][2]] + vec=vec/np.linalg.norm(vec) + new_coord=new_geo[i]+vec*bond_length*1.02 + new_E.append("H") + new_geo.append(new_coord) + for count_k, k in enumerate(new_bondmat): + if count_k != i: + new_bondmat[count_k].append(0) + new_BE_change[count_k].append(0) + elif count_k == i: + new_bondmat[count_k].append(1) + new_BE_change[count_k].append(0) + bond_h=[] + change=[] + for count_k, k in enumerate(new_bondmat[0]): + if count_k != i: + bond_h.append(0) + change.append(0) + else: + bond_h.append(1) + change.append(0) + new_bondmat.append(bond_h) + new_BE_change.append(change) + elif num_add_hydrogen==2: + # H-A-H condition: add hydrogen randomly for first hydrogen and add another one along vecton A-H1 + # First H + new_coord=new_geo[i]+bond_length*np.array([1.02, 0.0, 0.0]) + new_E.append("H") + new_geo.append(new_coord) + for count_k, k in enumerate(new_bondmat): + if count_k != i: + new_bondmat[count_k].append(0) + new_BE_change[count_k].append(0) + elif count_k == i: + new_bondmat[count_k].append(1) + new_BE_change[count_k].append(0) + bond_h=[] + change=[] + for count_k, k in enumerate(new_bondmat[0]): + if count_k != i: + bond_h.append(0) + change.append(0) + else: + bond_h.append(1) + change.append(0) + new_bondmat.append(bond_h) + new_BE_change.append(change) + # Second H + new_coord=new_geo[i]+bond_length*np.array([-1.02, 0.0, 0.0]) + new_E.append("H") + new_geo.append(new_coord) + for count_k, k in enumerate(new_bondmat): + if count_k != i: + new_bondmat[count_k].append(0) + new_BE_change[count_k].append(0) + elif count_k == i: + new_bondmat[count_k].append(1) + new_BE_change[count_k].append(0) + bond_h=[] + change=[] + for count_k, k in enumerate(new_bondmat[0]): + if count_k != i: + bond_h.append(0) + change.append(0) + else: + bond_h.append(1) + change.append(0) + new_bondmat.append(bond_h) + new_BE_change.append(change) + elif numbond[i]-double_bond==3: + if num_add_hydrogen==1: + # B-A(H)-C condition: find the middle point between BC. + # find vector A and the point. Then, locate hydrogen. 
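+                    # The append-one-hydrogen bookkeeping below is repeated for every capping H;
+                    # a commented sketch of the shared pattern (helper name hypothetical, not part
+                    # of this file):
+                    #
+                    #   def append_capping_H(center, unit_vec, bond_length):
+                    #       new_geo.append(new_geo[center] + unit_vec*bond_length*1.02)  # 2% stretched A-H bond
+                    #       new_E.append("H")
+                    #       for row_i in range(len(new_bondmat)):                        # new column in old rows
+                    #           new_bondmat[row_i].append(1 if row_i == center else 0)
+                    #           new_BE_change[row_i].append(0)                           # capping H never reacts
+                    #       new_bondmat.append([1 if c == center else 0 for c in range(len(new_E))])
+                    #       new_BE_change.append([0]*len(new_E))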
+ vec_BA=[new_geo[i][0]-new_geo[connect_ids[0]][0], new_geo[i][1]-new_geo[connect_ids[0]][1], new_geo[i][2]-new_geo[connect_ids[0]][2]] + vec_CA=[new_geo[i][0]-new_geo[connect_ids[1]][0], new_geo[i][1]-new_geo[connect_ids[1]][1], new_geo[i][2]-new_geo[connect_ids[1]][2]] + vec_BA=vec_BA/np.linalg.norm(vec_BA) + vec_CA=vec_CA/np.linalg.norm(vec_CA) + cross_vec=np.cross(vec_BA, vec_CA) + if np.linalg.norm(cross_vec)<1E-5: # angle (BAC) is nearly 180 deg. Contruct the plane perpendcular to vec_BA + # the plane would be vec[0]*x+vec[1]*y+vec[2]*z=vec*A + dot=np.dot(vec_BA, new_geo[i]) + vec=[abs(vec_BA[0]), abs(vec_BA[1]), abs(vec_BA[2])] + max_idx=vec.index(max(vec)) + if max_idx==0: + point=[dot/vec_BA[0], 0.0, 0.0] + elif max_idx==1: + point=[0.0, dot/vec_BA[1], 0.0] + else: + point=[0.0, 0.0, dot/vec_BA[2]] + vec=np.array([point[0]-new_geo[i][0], point[1]-new_geo[i][1], point[2]-new_geo[i][2]]) + else: + middle=[(new_geo[connect_ids[0]][0]+new_geo[connect_ids[1]][0])/2.0, (new_geo[connect_ids[0]][1]+new_geo[connect_ids[1]][1])/2.0, (new_geo[connect_ids[0]][2]+new_geo[connect_ids[1]][2])/2.0] + vec=np.array([new_geo[i][0]-middle[0], new_geo[i][1]-middle[1], new_geo[i][2]-middle[2]]) + new_coord=new_geo[i]+(vec/np.linalg.norm(vec))*bond_length*1.02 + new_E.append("H") + new_geo.append(new_coord) + for count_k, k in enumerate(new_bondmat): + if count_k != i: + new_bondmat[count_k].append(0) + new_BE_change[count_k].append(0) + elif count_k == i: + new_bondmat[count_k].append(1) + new_BE_change[count_k].append(0) + bond_h=[] + change=[] + for count_k, k in enumerate(new_bondmat[0]): + if count_k != i: + bond_h.append(0) + change.append(0) + else: + bond_h.append(1) + change.append(0) + new_bondmat.append(bond_h) + new_BE_change.append(change) + elif num_add_hydrogen==2: + # H-A(H)-C condition: add first one by vector AC and insert another one follow the rule above.. 
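+                        # The collinear-case construction above (and reused below) picks a direction
+                        # orthogonal to vec_BA by intersecting the plane n.x = n.A (n = vec_BA,
+                        # A = atom position) with a coordinate axis; standalone commented sketch
+                        # (variable names hypothetical):
+                        #
+                        #   n = vec_BA / np.linalg.norm(vec_BA)
+                        #   d = np.dot(n, A)                      # plane through A with normal n
+                        #   k = int(np.argmax(np.abs(n)))         # largest component avoids division by ~0
+                        #   point = np.zeros(3); point[k] = d / n[k]
+                        #   perp = point - A                      # n.(point - A) = 0, so perp is orthogonal to n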
+ vec=[new_geo[i][0]-new_geo[connect_ids[0]][0], new_geo[i][1]-new_geo[connect_ids[0]][1], new_geo[i][2]-new_geo[connect_ids[0]][2]] + vec=np.array(vec)/np.linalg.norm(vec) + new_coord=new_geo[i]+vec*bond_length*1.02 + new_E.append("H") + new_geo.append(new_coord) + for count_k, k in enumerate(new_bondmat): + if count_k != i: + new_bondmat[count_k].append(0) + new_BE_change[count_k].append(0) + elif count_k == i: + new_bondmat[count_k].append(1) + new_BE_change[count_k].append(0) + bond_h=[] + change=[] + for count_k, k in enumerate(new_bondmat[0]): + if count_k != i: + bond_h.append(0) + change.append(0) + else: + bond_h.append(1) + change.append(0) + new_bondmat.append(bond_h) + new_BE_change.append(change) + dot=np.dot(vec, new_geo[i]) + tmp=[abs(vec[0]), abs(vec[1]), abs(vec[2])] + max_idx=tmp.index(max(tmp)) + if max_idx==0: + point=[dot/vec[0], 0.0, 0.0] + elif max_idx==1: + point=[0.0, dot/vec[1], 0.0] + else: + point=[0.0, 0.0, dot/vec[2]] + vec=np.array([point[0]-new_geo[i][0], point[1]-new_geo[i][1], point[2]-new_geo[i][2]]) + vec=np.array(vec)/np.linalg.norm(vec) + new_coord=new_geo[i]+vec*bond_length*1.02 + new_E.append("H") + new_geo.append(new_coord) + for count_k, k in enumerate(new_bondmat): + if count_k != i: + new_bondmat[count_k].append(0) + new_BE_change[count_k].append(0) + elif count_k == i: + new_bondmat[count_k].append(1) + new_BE_change[count_k].append(0) + bond_h=[] + change=[] + for count_k, k in enumerate(new_bondmat[0]): + if count_k != i: + bond_h.append(0) + change.append(0) + else: + bond_h.append(1) + change.append(0) + new_bondmat.append(bond_h) + new_BE_change.append(change) + elif num_add_hydrogen==3: + # AH3 condition: use regular triangle to construct this molecule. + new_coord=[new_geo[i][0], new_geo[i][1]+1.0*bond_length*1.02, new_geo[i][2]] + new_E.append("H") + new_geo.append(new_coord) + for count_k, k in enumerate(new_bondmat): + if count_k != i: + new_bondmat[count_k].append(0) + new_BE_change[count_k].append(0) + elif count_k == i: + new_bondmat[count_k].append(1) + new_BE_change[count_k].append(0) + bond_h=[] + change=[] + for count_k, k in enumerate(new_bondmat[0]): + if count_k != i: + bond_h.append(0) + change.append(0) + else: + bond_h.append(1) + change.append(0) + new_bondmat.append(bond_h) + new_BE_change.append(change) + new_coord=[new_geo[i][0]-1.732/2.0*bond_length*1.02, new_geo[i][1]-1.0/2.0*bond_length*1.02, new_geo[i][2]] + new_E.append("H") + new_geo.append(new_coord) + for count_k, k in enumerate(new_bondmat): + if count_k != i: + new_bondmat[count_k].append(0) + new_BE_change[count_k].append(0) + elif count_k == i: + new_bondmat[count_k].append(1) + new_BE_change[count_k].append(0) + bond_h=[] + change=[] + for count_k, k in enumerate(new_bondmat[0]): + if count_k != i: + bond_h.append(0) + change.append(0) + else: + bond_h.append(1) + change.append(0) + new_bondmat.append(bond_h) + new_BE_change.append(change) + new_coord=[new_geo[i][0]+1.732/2.0*bond_length*1.02, new_geo[i][1]-1.0/2.0*bond_length*1.02, new_geo[i][2]] + new_E.append("H") + new_geo.append(new_coord) + for count_k, k in enumerate(new_bondmat): + if count_k != i: + new_bondmat[count_k].append(0) + new_BE_change[count_k].append(0) + elif count_k == i: + new_bondmat[count_k].append(1) + new_BE_change[count_k].append(0) + bond_h=[] + change=[] + for count_k, k in enumerate(new_bondmat[0]): + if count_k != i: + bond_h.append(0) + change.append(0) + else: + bond_h.append(1) + change.append(0) + new_bondmat.append(bond_h) + new_BE_change.append(change) + elif 
numbond[i]-double_bond==4: + if num_add_hydrogen==1: + # (B)A(C)(D)H condition + middle=[(new_geo[connect_ids[0]][0]+new_geo[connect_ids[1]][0]+new_geo[connect_ids[2]][0])/3.0,\ + (new_geo[connect_ids[0]][1]+new_geo[connect_ids[1]][1]+new_geo[connect_ids[2]][1])/3.0,\ + (new_geo[connect_ids[0]][2]+new_geo[connect_ids[1]][2]+new_geo[connect_ids[2]][2])/3.0] + vec=[new_geo[i][0]-middle[0], new_geo[i][1]-middle[1], new_geo[i][2]-middle[2]] + if np.linalg.norm(vec)<1E-5: # point A is to close to the plane + vec1=[new_geo[connect_ids[0]][0]-new_geo[connect_ids[1]][0],\ + new_geo[connect_ids[0]][1]-new_geo[connect_ids[1]][1],\ + new_geo[connect_ids[0]][2]-new_geo[connect_ids[1]][2]] + vec2=[new_geo[connect_ids[0]][0]-new_geo[connect_ids[2]][0],\ + new_geo[connect_ids[0]][1]-new_geo[connect_ids[2]][1],\ + new_geo[connect_ids[0]][2]-new_geo[connect_ids[2]][2]] + vec=np.cross(vec1, vec2) + new_coord=new_geo[i]+(vec/np.linalg.norm(vec))*bond_length*1.02 + new_E.append("H") + new_geo.append(new_coord) + for count_k, k in enumerate(new_bondmat): + if count_k != i: + new_bondmat[count_k].append(0) + new_BE_change[count_k].append(0) + elif count_k == i: + new_bondmat[count_k].append(1) + new_BE_change[count_k].append(0) + bond_h=[] + change=[] + for count_k, k in enumerate(new_bondmat[0]): + if count_k != i: + bond_h.append(0) + change.append(0) + else: + bond_h.append(1) + change.append(0) + new_bondmat.append(bond_h) + new_BE_change.append(change) + elif num_add_hydrogen==2: + # (B)AH2(C) + vec_BA=[new_geo[i][0]-new_geo[connect_ids[0]][0], new_geo[i][1]-new_geo[connect_ids[0]][1], new_geo[i][2]-new_geo[connect_ids[0]][2]] + vec_CA=[new_geo[i][0]-new_geo[connect_ids[1]][0], new_geo[i][1]-new_geo[connect_ids[1]][1], new_geo[i][2]-new_geo[connect_ids[1]][2]] + vec_BA=vec_BA/np.linalg.norm(vec_BA) + vec_CA=vec_CA/np.linalg.norm(vec_CA) + cross_vec=np.cross(vec_BA, vec_CA) + if np.linalg.norm(cross_vec)<1E-5: # angle (BAC) is nearly 180 deg. 
Contruct the plane perpendcular to vec_BA + # the plane would be vec[0]*x+vec[1]*y+vec[2]*z=vec*A + dot=np.dot(vec_BA, new_geo[i]) + vec=[abs(vec_BA[0]), abs(vec_BA[1]), abs(vec_BA[2])] + max_idx=vec.index(max(vec)) + if max_idx==0: + point=[dot/vec_BA[0], 0.0, 0.0] + elif max_idx==1: + point=[0.0, dot/vec_BA[1], 0.0] + else: + point=[0.0, 0.0, dot/vec_BA[2]] + vec=np.array([point[0]-new_geo[i][0], point[1]-new_geo[i][1], point[2]-new_geo[i][2]]) + else: + middle=[(new_geo[connect_ids[0]][0]+new_geo[connect_ids[1]][0])/2.0, (new_geo[connect_ids[0]][1]+new_geo[connect_ids[1]][1])/2.0, (new_geo[connect_ids[0]][2]+new_geo[connect_ids[1]][2])/2.0] + vec=np.array([new_geo[i][0]-middle[0], new_geo[i][1]-middle[1], new_geo[i][2]-middle[2]]) + new_coord=new_geo[i]+(vec/np.linalg.norm(vec))*bond_length*1.02 + new_E.append("H") + new_geo.append(new_coord) + for count_k, k in enumerate(new_bondmat): + if count_k != i: + new_bondmat[count_k].append(0) + new_BE_change[count_k].append(0) + elif count_k == i: + new_bondmat[count_k].append(1) + new_BE_change[count_k].append(0) + bond_h=[] + change=[] + for count_k, k in enumerate(new_bondmat[0]): + if count_k != i: + bond_h.append(0) + change.append(0) + else: + bond_h.append(1) + change.append(0) + new_bondmat.append(bond_h) + new_BE_change.append(change) + new_coord=new_geo[i]-(vec/np.linalg.norm(vec))*bond_length*1.02 + new_E.append("H") + new_geo.append(new_coord) + for count_k, k in enumerate(new_bondmat): + if count_k != i: + new_bondmat[count_k].append(0) + new_BE_change[count_k].append(0) + elif count_k == i: + new_bondmat[count_k].append(1) + new_BE_change[count_k].append(0) + bond_h=[] + change=[] + for count_k, k in enumerate(new_bondmat[0]): + if count_k != i: + bond_h.append(0) + change.append(0) + else: + bond_h.append(1) + change.append(0) + new_bondmat.append(bond_h) + new_BE_change.append(change) + elif num_add_hydrogen==3: + # BAH3 condition + vec=[new_geo[i][0]-new_geo[connect_ids[0]][0], new_geo[i][1]-new_geo[connect_ids[0]][1], new_geo[i][2]-new_geo[connect_ids[0]][2]] + new_coord=new_geo[i]+(vec/np.linalg.norm(vec))*bond_length*1.02 + new_E.append("H") + new_geo.append(new_coord) + for count_k, k in enumerate(new_bondmat): + if count_k != i: + new_bondmat[count_k].append(0) + new_BE_change[count_k].append(0) + elif count_k == i: + new_bondmat[count_k].append(1) + new_BE_change[count_k].append(0) + bond_h=[] + change=[] + for count_k, k in enumerate(new_bondmat[0]): + if count_k != i: + bond_h.append(0) + change.append(0) + else: + bond_h.append(1) + change.append(0) + new_bondmat.append(bond_h) + new_BE_change.append(change) + dot=np.dot(vec, new_geo[i]) + tmp=[abs(vec[0]),abs(vec[1]), abs(vec[2])] + max_idx=tmp.index(max(tmp)) + if max_idx==0: + point=[dot/vec[0], 0.0, 0.0] + elif max_idx==1: + point=[0.0, dot/vec[1], 0.0] + else: + point=[0.0, 0.0, dot/vec[2]] + vec=np.array([point[0]-new_geo[i][0], point[1]-new_geo[i][1], point[2]-new_geo[i][2]]) + new_coord=new_geo[i]+(vec/np.linalg.norm(vec))*bond_length*1.02 + new_E.append("H") + new_geo.append(new_coord) + for count_k, k in enumerate(new_bondmat): + if count_k != i: + new_bondmat[count_k].append(0) + new_BE_change[count_k].append(0) + elif count_k == i: + new_bondmat[count_k].append(1) + new_BE_change[count_k].append(0) + bond_h=[] + change=[] + for count_k, k in enumerate(new_bondmat[0]): + if count_k != i: + bond_h.append(0) + change.append(0) + else: + bond_h.append(1) + change.append(0) + new_bondmat.append(bond_h) + new_BE_change.append(change) + 
new_coord=new_geo[i]-(vec/np.linalg.norm(vec))*bond_length*1.02 + new_E.append("H") + new_geo.append(new_coord) + for count_k, k in enumerate(new_bondmat): + if count_k != i: + new_bondmat[count_k].append(0) + new_BE_change[count_k].append(0) + elif count_k == i: + new_bondmat[count_k].append(1) + new_BE_change[count_k].append(0) + bond_h=[] + change=[] + for count_k, k in enumerate(new_bondmat[0]): + if count_k != i: + bond_h.append(0) + change.append(0) + else: + bond_h.append(1) + change.append(0) + new_bondmat.append(bond_h) + new_BE_change.append(change) + new_bondmat=np.asarray(new_bondmat) + try: + #for count_i, i in enumerate(new_E): + # print(f"{i} {new_geo[count_i][0]} {new_geo[count_i][1]} {new_geo[count_i][2]}") + #print(new_bondmat) + new_geo=opt_geo(new_E, new_geo, new_bondmat) + + except: + print("Model reactant is failed to optimize.") + return [], [], [] + try: + new_BE_change=np.asarray(new_BE_change) + new_bondmat=new_bondmat+new_BE_change + new_change_geo=opt_geo(new_E, new_geo, new_bondmat) + except: + print("Model product is failed to optimize.") + return [], [], [] + return new_E, new_geo, new_change_geo + +def return_adj_change(adjmat): + keep_idx=[] + reactive_atoms=[] + for i in range(len(adjmat)): + for j in range(len(adjmat)): + if i > j: + if adjmat[i][j]!=0: + keep_idx+=[(i, j)] + reactive_atoms.append(i) + reactive_atoms.append(j) + return keep_idx, reactive_atoms diff --git a/pyTEST_Example/wrappers/orca.py b/pyTEST_Example/wrappers/orca.py new file mode 100755 index 0000000..37e6761 --- /dev/null +++ b/pyTEST_Example/wrappers/orca.py @@ -0,0 +1,429 @@ +#!/bin/env python +# Author: Qiyuan Zhao (zhaoqy1996@gmail.com) + +import subprocess +import os,sys +import time +import numpy as np + +sys.path.append('/'.join(os.path.abspath(__file__).split('/')[:-2])) + +from yarp.input_parsers import xyz_parse +from constants import Constants +from utils import xyz_write + +# prepare corresponding input files for each calculator +class ORCA: + def __init__(self, input_geo, work_folder=os.getcwd(), lot='B97-3c', jobtype='ENGRAD', nproc=1, mem=4000, scf_iters=500, jobname='orcajob', charge=0, multiplicity=1,\ + defgrid=2, solvent=False, solvation_model='CPCM', dielectric=0.0, writedown_xyz=False): + """ + Initialize an Orca job class + input_geo: a xyz file containing the input geometry + work_folder: working directory for running the orca task + orca_input: this ORCA class will generate an orca input file; Please specify full path, since it implies the working folder + jobtype: can be single (e.g., "TSOPT") or multiple jobs (e.g., "OptTS Freq MOREAD") + lot: Level of theory, e.g., "B3LYP TZVP" + mem: unit in MB, per core + defgrid: grid size in Orca, default is 2 in orca but 1 here + writedown_xyz: if True, will write xyz information into the orca input file; if False, specify the input_geo path as xyz input + """ + self.input_geo = input_geo + self.work_folder = work_folder + self.orca_input = f'{work_folder}/{jobname}.in' + self.jobtype = jobtype + self.lot = lot + self.nproc = int(nproc) + self.mem = int(mem) + self.scf_iters = int(scf_iters) + self.jobname = jobname + self.defgrid = f"defgrid{defgrid}" + self.output = f'{work_folder}/{jobname}.out' + self.geom = False + self.irc = False + self.additional = False + self.dielectric = float(dielectric) + self.solvation = False + if solvent=="read": + self.solvation = f"{solvation_model}" + elif solvent: + self.solvation = f"{solvation_model}({solvent})" + else: + self.solvation = False + + # create work folder + if 
os.path.isdir(self.work_folder) is False: os.mkdir(self.work_folder) + + # prepare_input_geometry(self): + if writedown_xyz is False: + if input_geo[0] == '/': # Full path + self.xyz = f'*xyzfile {charge} {multiplicity} {input_geo}\n' + else: + self.xyz = f'*xyzfile {charge} {multiplicity} {os.path.join(os.getcwd(),input_geo)}\n' + else: + self.xyz = f'*xyz {charge} {multiplicity}\n' + elements, geometry = xyz_parse(input_geo) + for ind in range(len(elements)): + self.xyz += f'{elements[ind]:<3} {geometry[ind][0]:^12.8f} {geometry[ind][1]:^12.8f} {geometry[ind][2]:^12.8f}\n' + self.xyz += '*\n' + + def generate_geometry_settings(self, hess=True, hess_step=10, constraints=[], oldhess=False): + """ + Specific info block for geometry optimization + For constraints, please use orca constraint type # Note atom index starting from 0 + {B 0 1 C} # B for Bond, C for Constraint (with bond length as current input geometry) + {B 0 1 1.25 C} # B for Bond, C for Constraint (with specified bond length [1.25]) + {A 0 1 2 C } #A for Angle + {D 0 1 2 3 C } # D for Dihedral angle + {C 5 C} # Constraining atom no. 5 in space. + A valid constraints example: constraints=['{B 66 72 C}','{B 35 72 C}','{B 32 68 C}'] + """ + + info = '%geom\n' + if hess: info += f' Calc_Hess true\n Recalc_Hess {hess_step}\n' + if oldhess: info += f' inhess Read\n InHessName "{oldhess}"\n' + if len(constraints) > 0: + info += ' Constraints\n' + for constraint in constraints: + info += f' {constraint}\n' + info += ' end\n' + info += 'end\n\n' + + self.geom = info + + def generate_irc_settings(self, max_iter=60, print_level=1, oldhess=False): + """ + Specific info block for IRC job + """ + info = f'%irc\n MaxIter {max_iter}\n PrintLevel 1\n Direction both\n Follow_CoordType cartesian\n Scale_Displ_SD 0.15\n Adapt_Scale_Displ true\n' + if oldhess: info += f' InitHess Read\n Hess_Filename "{oldhess}"\n' + else: info += f'InitHess calc_anfreq\n' + info += 'end\n\n' + self.irc = info + + def parse_additional_infoblock(self,commands): + """ + Specific other special info block for Orca jobs + Note: this should be entire orca commands in string format that can be directly parsed to orca job writter + Single line example: commands = '%moinp "RRS_7-opt.gbw"\n\n' + multiple lines example: commands = '%block1\n block-specific keywords\nend\n\n%block2\n block-specific keywords\nend\n\n' + """ + self.additional = commands + + def generate_input(self): + """ + Create an orca job script for given settings + """ + with open(self.orca_input, "w") as f: + if self.solvation: + f.write(f"! {self.lot} {self.solvation} {self.defgrid} {self.jobtype}\n\n") + else: + f.write(f"! 
{self.lot} {self.defgrid} {self.jobtype}\n\n") + if self.dielectric != 0.0: + f.write(f"%cpcm\n epsilon {self.dielectric}\nend\n\n") + f.write(f"%scf\n MaxIter {self.scf_iters}\nend\n\n") + f.write(f"%pal\n nproc {self.nproc}\nend\n\n") + f.write(f"%maxcore {self.mem}\n\n") + if self.geom: f.write(f"{self.geom}") + if self.irc: f.write(f"{self.irc}") + if self.additional: f.write(f"{self.additional}\n\n") + f.write(f'%base "{self.jobname}"\n\n') + f.write(self.xyz) + + def calculation_terminated_normally(self) -> bool: + """ + Check if the calculation terminate normally + """ + if os.path.isfile(self.output) is False: return False + # load orca output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + + # set termination indicators + termination_strings = ['ORCA TERMINATED NORMALLY', 'ORCA finished with error'] + + for n_line, line in enumerate(reversed(lines)): + + if any(substring in line for substring in termination_strings): + return True + + if n_line > 30: + # The above lines are pretty close to the end of the file – so skip parsing it all + return False + + return False + + def get_energy(self) -> float: + """ + Get single point energy from Orca output file + """ + # load orca output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + + for line in reversed(lines): + if 'FINAL SINGLE POINT ENERGY' in line: + return float(line.split()[4]) + + return False + + def analyze_IRC(self, return_traj=False): + """ + Analyze IRC output, return two end points + """ + # load output job + lines = open(self.output, 'r', encoding="utf-8").readlines() + + # find barriers + for lc, line in enumerate(lines): + if 'IRC PATH SUMMARY' in line: barrier_left = -float(lines[lc+5].split()[2]) + if 'Timings for individual modules:' in line: barrier_right = -float(lines[lc-2].split()[2]) + + # find output files + backward_traj = f'{self.work_folder}/{self.jobname}_IRC_B_trj.xyz' + forward_traj = f'{self.work_folder}/{self.jobname}_IRC_F_trj.xyz' + TS_xyz = f'{self.work_folder}/{self.jobname}.xyz'.replace('-IRC','-TS') + + # load geometries + E,TSG = xyz_parse(TS_xyz) + _, traj_F = xyz_parse(forward_traj, multiple=True) + _, tmp_traj_B = xyz_parse(backward_traj, multiple=True) + traj_B=[] + for k in tmp_traj_B: + traj_B.append(k) + traj_B.append(TSG) + traj=traj_B + for k in traj_F: traj.append(k) + + # write down traj + for imag in traj: + xyz_write(f'{self.work_folder}/{self.jobname}_IRC_T_trj.xyz',E, imag, append_opt=True) + + if not return_traj: + return E, traj[0], traj[-1], TSG, barrier_left, barrier_right + else: + return E, traj[0], traj[-1], TSG, barrier_left, barrier_right, traj + + def optimization_converged(self) -> bool: + """ + Check if the optimization converges + """ + # load orca output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + + for line in reversed(lines): + if 'THE OPTIMIZATION HAS CONVERGED' in line: + return True + + return False + + def get_imag_freq(self): + """ + Obtain all imaginary frequencies + """ + imag_freq, imag_ind = [],[] + # load orca output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + + # identify the position of the final frequencies + for line in reversed(lines): + if 'imaginary mode' in line: + imag_freq.append(float(line.split()[1])) + imag_ind.append(int(line.split()[0].split(':')[0])) + if 'VIBRATIONAL FREQUENCIES' in line: + break + return imag_freq, imag_ind + + def is_TS(self) -> bool: + """ + Check if this is a ture transition state after TS optimization + """ + imag_freq,_ = 
self.get_imag_freq() + if len(imag_freq) == 1 and abs(imag_freq[0]) > 10: return True + else: return False + + def get_final_structure(self): + """ + Get the final set of geometry (and elements) from an ORCA output file + """ + # First try the .xyz file generated + xyz_file_name = f'{self.work_folder}/{self.jobname}.xyz' + if os.path.exists(xyz_file_name): + E,G = xyz_parse(xyz_file_name) + return E, G + + # if xyz file does not exist, go to potentially long .out file + # load orca output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + + # identify number of atoms + for line in lines: + if 'Number of atoms' in line: + n_atoms = int(line.split()[-1]) + break + + # identify the position of the final geometry + for i, line in enumerate(reversed(lines)): + if 'CARTESIAN COORDINATES (ANGSTROEM)' in line: + n_line = len(lines)-i-1 + break + + # initialize elements and gepmetry + E, G = [], np.zeros([n_atoms,3]) + + # parse E and G + for oline in lines[n_line+2:n_line+2+n_atoms]: + label, x, y, z = oline.split() + G[len(E),:] = np.array([x, y, z]) + E.append(label) + + return E, G + + def get_imag_freq_mode(self) -> np.ndarray: + """ + Get the imaginary frequency mode + """ + geo_lines,freq_lines, mode_lines, imag_ind = [],[],[],[] + lines = open(self.output, 'r', encoding="utf-8").readlines() + + # identify the position of final normal mode + for i, line in enumerate(reversed(lines)): + if 'NORMAL MODES' in line: + mode_line = len(lines)-i-1 + break + + # obtain imag_mode number and elements + imag_freq,imag_ind = self.get_imag_freq() + E, _ = self.get_final_structure() + + # parse imaginary mode + imag_mode = [] + for lc in range(mode_line+6,len(lines)): + if str(imag_ind[0]) not in lines[lc]: continue + fields = lines[lc].split() + if len(fields) == 6: + start_line = lc+1 + position = fields.index(str(imag_ind[0]))+1 + break + + for lc in range(start_line,start_line+len(E)*3): + fields = lines[lc].split() + imag_mode += [float(fields[position])] + + # reshape and normalize imag_mode + # first time massi**0.5 to convert to normal displacement + imag_mode = np.array(imag_mode) + imag_mode = imag_mode.reshape((len(E),3)) + + return imag_mode + + def get_gradients(self) -> np.ndarray: + """ + e.g. + + #------------------ + CARTESIAN GRADIENT <- i + #------------------ + + 1 C : -0.011390275 -0.000447412 0.000552736 <- j + """ + # load orca output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + + # identify number of atoms + for line in lines: + if 'Number of atoms' in line: + n_atoms = int(line.split()[-1]) + break + + # identify the position of the final gradient + for i, line in enumerate(reversed(lines)): + if 'CARTESIAN GRADIENT' in line: + first,last = len(lines)-i+2,len(lines)-i+2+n_atoms + break + if 'CARTESIAN GRADIENT (NUMERICAL)' in line: + first, last = len(lines)-i+1,len(lines)-i+1+n_atoms + + # parse gradient + gradients = [] + for grad_line in lines[first:last]: + + if len(grad_line.split()) <= 3: + continue + + dadx, dady, dadz = grad_line.split()[-3:] + gradients.append([float(dadx), float(dady), float(dadz)]) + + # Convert from Ha a0^-1 to Ha A-1 + return np.array(gradients) / Constants.a0_to_ang + + def get_hessian(self) -> np.ndarray: + """Grab the Hessian from the output .hess file + + e.g.:: + + $hessian + 9 + 0 1 + 2 3 4 + 0 6.48E-01 4.376E-03 2.411E-09 -3.266E-01 -2.5184E-01 + . . . . . . 
+ """ + # locate the .hess file generated + hess_file = f'{self.work_folder}/{self.jobname}.hess' + start_line = False + if os.path.exists(hess_file): + # load in the hessian file + lines = open(hess_file, 'r', encoding="utf-8").readlines() + for i, line in enumerate(lines): + if '$hessian' in line: + start_line = i + 3 + break + + if not start_line: + print("Wrong hessian file!") + return False + + # obtain number of atoms + n_atoms = int(lines[start_line - 2].split()[0]) // 3 + + # pasre hessian + hessian_blocks = [] + + for j, h_line in enumerate(lines[start_line:]): + + if len(h_line.split()) == 0: + # Assume we're at the end of the Hessian + break + + # Skip blank lines in the file, marked by one or more fewer items than the previous + if len(h_line.split()) < len(lines[start_line+j-1].split()): + continue + + # First item is the coordinate number, thus append all others + hessian_blocks.append([float(v) for v in h_line.split()[1:]]) + + # reshape hessian + hessian = [block for block in hessian_blocks[:3*n_atoms]] + + for i, block in enumerate(hessian_blocks[3*n_atoms:]): + hessian[i % (3 * n_atoms)] += block + + # Hessians printed in Ha/a0^2, so convert to base Ha/Å^2 + return np.array(hessian, dtype='f8') / Constants.a0_to_ang**2 + + def get_thermal(self) -> dict: + """ + Get thermochemistry properties, including Gibbs free energy, enthalpy, entropy, and inner enenrgy, from Orca output file + """ + # load orca output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + + # parse thermal properties from output + thermal = {'GibbsFreeEnergy':False,'Enthalpy':False,'InnerEnergy':False,'Entropy':False} + for line in reversed(lines): + if 'Final Gibbs free energy' in line: thermal['GibbsFreeEnergy'] = float(line.split()[-2]) + if 'Total Enthalpy' in line: thermal['Enthalpy'] = float(line.split()[-2]) + if 'Total thermal energy' in line: thermal['InnerEnergy'] = float(line.split()[-2]) + if 'Final entropy term' in line: thermal['Entropy'] = float(line.split()[-4]) + if 'THERMOCHEMISTRY AT' in line: break + + return thermal + diff --git a/pyTEST_Example/wrappers/pysis.py b/pyTEST_Example/wrappers/pysis.py new file mode 100755 index 0000000..84c40ea --- /dev/null +++ b/pyTEST_Example/wrappers/pysis.py @@ -0,0 +1,336 @@ +#!/bin/env python +# Author: Qiyuan Zhao (zhaoqy1996@gmail.com) + +import subprocess +import os,sys,shutil +import time +import numpy as np +import h5py + +sys.path.append('/'.join(os.path.abspath(__file__).split('/')[:-2])) +from yarp.input_parsers import xyz_parse +from constants import Constants + +class PYSIS: + def __init__(self, input_geo, work_folder=os.getcwd(), jobname='pysis', jobtype='tsopt', coord_type='cart', nproc=1, mem=4000, charge=0, multiplicity=1, alpb=False, gbsa=False): + """ + Initialize a pysis job class + input_geo: a xyz file containing the input geometry. Full path recommended + work_folder: work folder for running pysis job + jobtype: select from tsopt, irc, opt. 
+ orca_input: this ORCA class will generate an orca input file; Please specify full path, since it implies the working folder + jobtype: can be single (e.g., "TSOPT") or multiple jobs (e.g., "OptTS Freq MOREAD") + mem: unit in MB, per core + defgrid: grid size in Orca, default is 2 in orca but 1 here + writedown_xyz: if True, will write xyz information into the orca input file; if False, specify the input_geo path as xyz input + """ + self.input_geo = input_geo + self.work_folder = work_folder + self.pysis_input = os.path.join(work_folder, f'{jobname}_input.yaml') + self.output = os.path.join(work_folder, f'{jobname}-{jobtype}.out') + self.nproc = int(nproc) + self.mem = int(mem) + self.jobname = jobname + self.jobtype = jobtype + self.coord_type = coord_type + self.charge = charge + self.multiplicity = multiplicity + self.alpb = alpb + self.gbsa = gbsa + # create work folder + if os.path.isdir(self.work_folder) is False: os.mkdir(self.work_folder) + + def generate_calculator_settings(self, calctype='xtb'): + """ + Specific info block for setting up calculators + Current version only support xtb, will add psi4, pyscf, in the future + """ + if calctype == 'xtb': + with open(self.pysis_input,'a') as f: + if self.alpb: + f.write(f'calc:\n type: {calctype}\n pal: {self.nproc}\n mem: {self.mem}\n charge: {self.charge}\n mult: {self.multiplicity}\n alpb: {self.alpb}\n') + elif self.gbsa: + f.write(f'calc:\n type: {calctype}\n pal: {self.nproc}\n mem: {self.mem}\n charge: {self.charge}\n mult: {self.multiplicity}\n gbsa: {self.gbsa}\n') + else: + f.write(f'calc:\n type: {calctype}\n pal: {self.nproc}\n mem: {self.mem}\n charge: {self.charge}\n mult: {self.multiplicity}\n') + else: + print("Supports for other packages are underway") + return False + + def generate_job_settings(self, method=None, thresh='gau', hess=True, hess_step=3, hess_init=False): + """ + Default and available method for different jobs: + preopt + OPT: will be added in the near FUTURE + COS: will be added in the near FUTURE + TSOPT: rsirfo, rsprfo (default), trim + IRC: euler, eulerpc (default), dampedvelocityverlet, gonzalezschlegel, lqa, imk, rk4 + Thresh: Convergence threshold, select from gau_loose, gau, gau_tight, gau_vtight + """ + # For TS-opt + if self.jobtype.lower() == 'tsopt': + if method is None: method = 'rsprfo' + with open(f'{self.pysis_input}','a') as f: + if hess: f.write(f'tsopt:\n type: {method}\n do_hess: True\n hessian_recalc: {hess_step}\n thresh: {thresh}\n max_cycles: 50\n') + else: f.write(f'tsopt:\n type: {method}\n do_hess: False\n thresh: {thresh}\n max_cycles: 300\n') + # For IRC calculation + elif self.jobtype.lower()== 'irc': + if method is None: method = 'eulerpc' + with open(f'{self.pysis_input}','a') as f: + f.write(f'irc:\n type: {method}\n forward: True\n backward: True\n downhill: False\n') + if hess_init: f.write(f' hessian_init: {hess_init}\n') + f.write(f'endopt:\n fragments: False\n do_hess: False\n thresh: {thresh}\n max_cycles: 300\n') + # For geometry optimization + elif self.jobtype.lower() == 'opt': + if method is None: method='rfo' + with open(f'{self.pysis_input}', 'a') as f: + if hess: f.write(f'opt:\n type: {method}\n max_cycles: 50\n overachieve_factor: 3\n hessian_recalc: {hess_step}\n do_hess: True\n') + else: f.write(f'opt:\n type: {method}\n max_cycles: 300\n overachieve_factor: 3\n') + # For string methods + elif self.jobtype.lower()=="string": + if method is None: method='gs' + with open(f'{self.pysis_input}', 'a') as f: + f.write(f'cos:\n type: {method}\n max_nodes: 9\n 
climb: True\n climb_rms: 0.005\n climb_lanczos: False\n reparam_check: rms\n reparam_every: 1\n reparam_every_full: 1\n') + f.write(f'opt:\n type: string\n stop_in_when_full: -1\n align: True\n scale_step: global\n') + + else: + print("Supports for other job types are underway") + return False + + def generate_input(self, calctype='xtb', method=None, thresh='gau', hess=True, hess_step=3, hess_init=False): + """ + Create a pysis input job based on input settings + """ + with open(self.pysis_input, "w") as f: + f.write(f'geom:\n type: {self.coord_type}\n fn: {self.input_geo}\n') + + # generate calc + self.generate_calculator_settings(calctype=calctype) + + # generate job + self.generate_job_settings(method=method, thresh=thresh, hess=hess, hess_step=hess_step, hess_init=hess_init) + + # def execute(self, timeout=3600): + # """ + # Execute a PYSIS calculation using the runtime flags + # """ + + # # obtain current path + # current_path = os.getcwd() + + # # go into the work folder and run the command + # os.chdir(self.work_folder) + # env = os.environ.copy() + # env['OMP_NUM_THREADS'] = str(self.nproc) + # try: + # result = subprocess.run(f'pysis {self.pysis_input} > {self.output}', shell=True, env=env, capture_output=True, text=True, timeout=timeout) + # except: + # result = subprocess.CompletedProcess(args=f'pysis {self.pysis_input} > {self.output}', returncode=1, stdout='', stderr=f"PYSIS job {self.jobname} timed out") + + # # go back to the original folder + # os.chdir(current_path) + + # return result + + def execute(self, timeout=3600): + """ + Execute a PYSIS calculation using the runtime flags + """ + + # obtain current path + current_path = os.getcwd() + + # go into the work folder and run the command + os.chdir(self.work_folder) + env = os.environ.copy() + env['OMP_NUM_THREADS'] = str(self.nproc) + + # running job and count time + start_time = time.time() + process = subprocess.Popen(f'pysis {self.pysis_input} > {self.output}', shell=True, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + while True: + if process.poll() is not None: # process has terminated + result = subprocess.CompletedProcess(args=f'pysis {self.pysis_input} > {self.output}', returncode=process.returncode, stdout=process.stdout.read(), stderr=process.stderr.read()) + break + elif time.time() - start_time > timeout: + process.kill() # send SIGKILL signal to the process + result = subprocess.CompletedProcess(args=f'pysis {self.pysis_input} > {self.output}', returncode=1, stdout='', stderr=f"PYSIS job {self.jobname} timed out") + break + time.sleep(1) # wait a bit before checking again + + # cleanup files + tmp_scratch = f"{self.work_folder}/qm_calcs" + files = [os.path.join(tmp_scratch,filei) for filei in os.listdir(tmp_scratch)] + if len(files) > 0: [os.remove(filei) for filei in files] + + # go back to the original folder + if process.poll() is None: process.kill() # make sure this process has been killed + os.chdir(current_path) + + return result + + def calculation_terminated_normally(self) -> bool: + """ + Check if the calculation terminate normally + """ + if os.path.isfile(self.output) is False: return False + + # load orca output file + lines = open(self.output, 'r', encoding="utf-8").readlines() + + # find termination indicators + for line in reversed(lines): + if 'pysisyphus run took' in line: + return True + + return False + + def get_energy(self) -> float: + """ + Get single point energy from the output file + """ + # load output file + lines = open(self.output, 'r', encoding="utf-8").readlines() 
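+        # pysisyphus logs the electronic energy in lines like "energy: -34.123456 au"
+        # (format assumed from the split()[-2] parse below); scanning reversed(lines)
+        # therefore returns the last energy reported in the run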
+
+        for line in reversed(lines):
+            if 'energy:' in line:
+                return float(line.split()[-2])
+
+        return False
+
+    def optimization_converged(self) -> bool:
+        """
+        Check if the optimization converged
+        """
+        # load output file
+        lines = open(self.output, 'r', encoding="utf-8").readlines()
+
+        for line in reversed(lines):
+            if 'Converged!' in line:
+                return True
+
+        return False
+
+    def get_final_ts(self):
+        """
+        Get the final set of geometry (and elements) from a pysis TS optimization
+        """
+        # First try the .xyz file generated
+        xyz_file_name = f'{self.work_folder}/ts_opt.xyz'
+        if os.path.exists(xyz_file_name):
+            E,G = xyz_parse(xyz_file_name)
+            return E, G
+        else:
+            xyz_file_name = f'{self.work_folder}/ts_final_geometry.xyz'
+            if os.path.exists(xyz_file_name):
+                E,G = xyz_parse(xyz_file_name)
+                return E, G
+            else:
+                print("No final TS xyz file has been found!")
+                return False
+
+    def get_opt_geo(self):
+        """
+        Get the optimized geometry and elements from pysis
+        """
+        xyz_file_name=f"{self.work_folder}/final_geometry.xyz"
+        if os.path.exists(xyz_file_name):
+            E, G=xyz_parse(xyz_file_name)
+            return E, G
+        else:
+            print(f"Failed to read {xyz_file_name}.")
+            return False
+
+    def is_true_ts(self):
+        """
+        Check that the TS has one and only one imaginary frequency that is not too small (>50 cm-1)
+        """
+        import re
+        # load output file
+        lines = open(self.output, 'r', encoding="utf-8").readlines()
+
+        for line in reversed(lines):
+            if 'Imaginary frequencies:' in line:
+                freqs = re.findall(r'-?\d+\.\d+', line)
+                if len(freqs) == 0: freqs = re.findall(r'-?\d+\.+', line)
+                freqs = [float(freq) for freq in freqs]
+                #if len(freqs) == 1 and abs(freqs[0]) > 50: return True
+                if len(freqs) == 1: return True
+                else: return False
+        return False
+
+    def load_final_hess(self, return_freq = False):
+        """
+        Get the final Hessian (optionally with the vibrational frequencies)
+        """
+        # read the .h5 Hessian file generated by pysis
+        hess_file_name = f'{self.work_folder}/ts_final_hessian.h5'
+        if os.path.exists(hess_file_name):
+            data = h5py.File(hess_file_name, 'r')
+            hessian = np.array(data['hessian']) / Constants.a0_to_ang**2
+            freq = np.array(data['vibfreqs'])
+            if return_freq:
+                return hessian, freq
+            else:
+                return hessian
+        else:
+            return False
+
+    def load_imag_freq_mode(self):
+        """
+        Load the (largest) imaginary frequency mode
+        """
+        mode_file = f'{self.work_folder}/ts_imaginary_mode_000.trj'
+        elements, geometries = xyz_parse(mode_file,multiple=True)
+        imag_freq_mode=[]
+        for count_i, i in enumerate(elements):
+            imag_freq_mode.append((i, geometries[count_i]))
+        return imag_freq_mode
+
+    def get_energies_from_IRC(self):
+        """
+        Get single point energies of reactant, product and TS from the output file
+        """
+        # load output file
+        lines = open(self.output, 'r', encoding="utf-8").readlines()
+
+        for lc,line in enumerate(lines):
+            if 'File' in line and 'E_el' in line:
+                E1 = float(lines[lc+2].split()[-1])
+                E2 = float(lines[lc+3].split()[-1])
+                E3 = float(lines[lc+4].split()[-1])
+                return E1,E2,E3
+        return False
+
+    def analyze_IRC(self, return_traj=False):
+        """
+        Analyze IRC output, return the two end points
+        """
+        # load output job
+        lines = open(self.output, 'r', encoding="utf-8").readlines()
+
+        # find barriers
+        for lc, line in enumerate(lines):
+            if 'Minimum energy of' in line:
+                barrier_left = float(lines[lc+3].split()[1]) - float(lines[lc+2].split()[1])
+                barrier_right= float(lines[lc+3].split()[1]) - float(lines[lc+4].split()[1])
+                break
+
+        # find output files
+        backward_end_xyz = f'{self.work_folder}/backward_end_opt.xyz'
+        forward_end_xyz = f'{self.work_folder}/forward_end_opt.xyz'
+        IRC_traj_xyz = f'{self.work_folder}/finished_irc.trj'
+        TS_xyz = f'{self.work_folder}/ts_final_geometry.xyz'
+
+        # load geometries
+        E, G1 = xyz_parse(backward_end_xyz)
+        _, G2 = xyz_parse(forward_end_xyz)
+        _,TSG = xyz_parse(TS_xyz)
+
+        if not return_traj:
+            return E, G1, G2, TSG, barrier_left / Constants.kcal2kJ, barrier_right / Constants.kcal2kJ
+        else:
+            traj = []
+            elements, geometries = xyz_parse(IRC_traj_xyz, multiple=True)
+            for count_i, i in enumerate(elements): traj.append((i, geometries[count_i]))
+            return E, G1, G2, TSG, barrier_left / Constants.kcal2kJ, barrier_right / Constants.kcal2kJ, traj
+
diff --git a/pyTEST_Example/wrappers/reaction.py b/pyTEST_Example/wrappers/reaction.py
new file mode 100644
index 0000000..d1a33e5
--- /dev/null
+++ b/pyTEST_Example/wrappers/reaction.py
@@ -0,0 +1,264 @@
+import sys, itertools, timeit, os
+import logging
+from logging.handlers import QueueHandler
+import numpy as np
+import pickle
+from yarp.taffi_functions import table_generator,return_rings,adjmat_to_adjlist,canon_order
+from yarp.properties import el_to_an,an_to_el,el_mass
+from yarp.find_lewis import find_lewis,return_formals,return_n_e_accept,return_n_e_donate,return_connections,return_bo_dict
+from yarp.hashes import atom_hash,yarpecule_hash
+from yarp.input_parsers import xyz_parse,xyz_q_parse,xyz_from_smiles, mol_parse
+from yarp.misc import merge_arrays, prepare_list
+from openbabel import pybel
+from rdkit import Chem
+from rdkit.Chem import EnumerateStereoisomers, AllChem, TorsionFingerprints, rdmolops, rdDistGeom
+from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions
+from rdkit.ML.Cluster import Butina
+from copy import deepcopy
+sys.path.append('/'.join(os.path.abspath(__file__).split('/')[:-2]))
+from wrappers.xtb import *
+from utils import *
+from conf import *
+
+class reaction:
+    """
+    Base class for storing information of a reaction and performing conformational sampling
+
+    Attributes
+    ----------
+
+    reactant: yarpecule class for reactant
+
+    product: yarpecule class for product
+
+    opt: perform initial geometry optimization on the product side. (default: True)
+
+    """
+    def __init__(self, reactant, product, args=dict(), opt=True):
+
+        self.reactant=reactant
+        self.product=product
+        self.args=args
+        if "scratch_crest" in args.keys(): self.conf_path=self.args["scratch_crest"]
+        else: self.conf_path="conformer"
+        if "n_conf" in args.keys():
+            self.n_conf=self.args["n_conf"]
+        else:
+            self.n_conf=0
+        # safety check
+        for count_i, i in enumerate(reactant.elements): reactant.elements[count_i]=i.capitalize()
+        for count_i, i in enumerate(product.elements): product.elements[count_i]=i.capitalize()
+        for count_i, i in enumerate(reactant.elements):
+            if i != product.elements[count_i]:
+                print("Fatal error: reactant and product are not the same. Please check the input.")
+                exit()
+        if opt: self.product=geometry_opt(self.product)
+        self.reactant_xtb_opt=dict()
+        self.product_xtb_opt=dict()
+        self.reactant_dft_opt=dict()
+        self.product_dft_opt=dict()
+        self.reactant_conf=dict()
+        self.product_conf=dict()
+        self.reactant_energy=dict()
+        self.product_energy=dict()
+        self.reactant_inchi=return_inchikey(self.reactant)
+        self.product_inchi=return_inchikey(self.product)
+        self.reactant_smiles=return_smi_yp(self.reactant)
+        self.product_smiles=return_smi_yp(self.product)
+        self.rxn_conf=dict()
+        self.id=0
+        self.TS_guess=dict()
+        self.TS_xtb=dict()
+        self.TS_dft=dict()
+        self.IRC_xtb=dict()
+        self.IRC_dft=dict()
+        self.constrained_TS=dict()
+        if os.path.isdir(self.conf_path) is False: os.system('mkdir {}'.format(self.conf_path))
+        self.hash=f"{reactant.hash}-{product.hash}"
+
+    def conf_rdkit(self):
+        if self.args["strategy"]==0 or self.args["strategy"]==2:
+            if os.path.isdir('{}/{}'.format(self.conf_path, self.reactant_inchi)) is False: os.system('mkdir {}/{}'.format(self.conf_path, self.reactant_inchi))
+            if os.path.isfile('{}/{}/rdkit_conf.xyz'.format(self.conf_path, self.reactant_inchi)) is False:
+                # sampling on reactant side
+                mol_file='.reactant.tmp.mol'
+                mol_write_yp(mol_file, self.reactant, append_opt=False)
+                mol=Chem.rdmolfiles.MolFromMolFile(mol_file, removeHs=False)
+                ids=AllChem.EmbedMultipleConfs(mol, useRandomCoords=True, numConfs=50, maxAttempts=1000000, pruneRmsThresh=0.1,\
+                    useExpTorsionAnglePrefs=False, useBasicKnowledge=True, enforceChirality=False)
+                ids=list(ids)
+                out=open('{}/{}/rdkit_conf.xyz'.format(self.conf_path, self.reactant_inchi), 'w+')
+                os.system('rm .reactant.tmp.mol')
+                for count_i, i in enumerate(ids):
+                    geo=mol.GetConformer(i).GetPositions()
+                    # check the connectivity table
+                    adj_mat=table_generator(self.reactant.elements, geo, verbose=False)
+                    adj_diff=np.abs(adj_mat-self.reactant.adj_mat)
+                    if adj_diff.sum()==0:
+                        self.reactant_conf[count_i]=geo
+                        out.write('{}\n\n'.format(len(self.reactant.elements)))
+                        for count, e in enumerate(self.reactant.elements):
+                            out.write('{} {} {} {}\n'.format(e.capitalize(), geo[count][0], geo[count][1], geo[count][2]))
+            else:
+                _, geo=xyz_parse('{}/{}/rdkit_conf.xyz'.format(self.conf_path, self.reactant_inchi), multiple=True)
+                for count_i, i in enumerate(geo):
+                    self.reactant_conf[count_i]=i
+        if self.args["strategy"]==1 or self.args["strategy"]==2:
+            if os.path.isdir('{}/{}'.format(self.conf_path, self.product_inchi)) is False: os.system('mkdir {}/{}'.format(self.conf_path, self.product_inchi))
+            if os.path.isfile('{}/{}/rdkit_conf.xyz'.format(self.conf_path, self.product_inchi)) is False:
+                # sampling on product side
+                mol_file='.product.tmp.mol'
+                mol_write_yp(mol_file, self.product, append_opt=False)
+                mol=Chem.rdmolfiles.MolFromMolFile(mol_file, removeHs=False)
+                ids=AllChem.EmbedMultipleConfs(mol, useRandomCoords=True, numConfs=50, maxAttempts=1000000, pruneRmsThresh=0.1,\
+                    useExpTorsionAnglePrefs=False, useBasicKnowledge=True, enforceChirality=False)
+                ids=list(ids)
+                out=open('{}/{}/rdkit_conf.xyz'.format(self.conf_path, self.product_inchi), 'w+')
+                os.system('rm .product.tmp.mol')
+                for count_i, i in enumerate(ids):
+                    geo=mol.GetConformer(i).GetPositions()
+                    adj_mat=table_generator(self.reactant.elements, geo, verbose=False)
+                    adj_diff=np.abs(adj_mat-self.product.adj_mat)
+                    # check the connectivity table
+                    if adj_diff.sum()==0:
+                        self.product_conf[count_i]=geo
+                        out.write('{}\n\n'.format(len(self.product.elements)))
+                        for count, e in enumerate(self.product.elements):
+                            out.write('{} 
{} {} {}\n'.format(e.capitalize(), geo[count][0], geo[count][1], geo[count][2])) + else: + _, geo=xyz_parse('{}/{}/rdkit_conf.xyz'.format(self.conf_path, self.product_inchi), multiple=True) + for count_i, i in enumerate(geo): + self.product_conf[count_i]=i + + def rxn_conf_generation(self, logging_queue): + # set up logger + logger = logging.getLogger("main") + # Add handler only if it doesn't already exist + if not logger.hasHandlers(): + logger.addHandler(QueueHandler(logging_queue)) + logger.setLevel(logging.INFO) + job_id=f"{self.reactant_inchi}_{self.id}" + + RG=self.reactant.geo + RE=self.reactant.elements + R_adj=self.reactant.adj_mat + R_bond_mats=self.reactant.bond_mats + + PG=self.product.geo + PE=self.product.elements + P_adj=self.product.adj_mat + P_bond_mats=self.product.bond_mats + + tmp_rxn_dict=dict() + count=0 + print(self.reactant_inchi) + if bool(self.product_conf)==False and self.args["strategy"]!=0: + logger.info("Warning: No conformers for product. Just use input geometry") + print("Warning: No conformers for product. Just use input geometry") + self.product_conf[0]=self.product.geo + #print(self.product.elements) + #print(self.product_conf[0]) + if bool(self.reactant_conf)==False and self.args["strategy"]!=1: + logger.info("Warning: No conformers for reactant. Just use input geometry") + print("Warning: No conformers for reactant. Just use input geometry") + self.reactant_conf[0]=self.reactant.geo + #print(self.reactant.elements) + #print(self.reactant_conf[0]) + # Create a dictionary to store the conformers and product/reactant bond mat. + if self.args["strategy"]!=0: + for i in self.product_conf.keys(): + tmp_rxn_dict[count]={"E": RE, "bond_mat_r": R_bond_mats[0], "G": deepcopy(self.product_conf[i]), 'direct':'B'} + count=count+1 + if self.args["strategy"]!=1: + for i in self.reactant_conf.keys(): + tmp_rxn_dict[count]={"E": RE, "bond_mat_r": P_bond_mats[0], "G": deepcopy(self.reactant_conf[i]), 'direct': "F"} + count=count+1 + # load ML model to find conformers + if len(tmp_rxn_dict)>3*self.n_conf: model=pickle.load(open(os.path.join(self.args['model_path'],'rich_model.sav'), 'rb')) + else: model=pickle.load(open(os.path.join(self.args['model_path'],'poor_model.sav'), 'rb')) + ind_list, pass_obj_values=[], [] + for conf_ind, conf_entry in tmp_rxn_dict.items(): + # apply force-field optimization + # apply xTB-restrained optimization soon! 
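+            # Per-candidate scoring flow implemented below:
+            #   1. opt_geo: force-field relax the conformer under the *other* side's bond matrix
+            #   2. return_indicator: featurize the (start geometry, relaxed end) pair for the classifier
+            #   3. ase minimize_rotation_and_translation: align the two frames and keep whichever
+            #      alignment the classifier scores higher
+            #   4. model.predict_proba(...)[0][1]: probability the channel is "intended";
+            #      check_duplicate filters near-identical candidates before xTB refinement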
+            Gr = opt_geo(conf_entry['E'],conf_entry['G'],conf_entry['bond_mat_r'],ff=self.args['ff'],step=100,filename=f'tmp_{job_id}')
+            if len(Gr)==0 or len(Gr)!=len(conf_entry["G"]):
+                print("Failed to optimize")
+                logger.info("Failed to optimize")
+                continue
+            tmp_xyz_p = f"{self.args['scratch_xtb']}/{job_id}_p.xyz"
+            logger.info(f"{self.args['scratch_xtb']}/{job_id}_p.xyz")
+            xyz_write(tmp_xyz_p,conf_entry['E'],Gr)
+            tmp_xyz_r = f"{self.args['scratch_xtb']}/{job_id}_r.xyz"
+            xyz_write(tmp_xyz_r,conf_entry['E'],conf_entry['G'])
+            logger.info(f"{self.args['scratch_xtb']}/{job_id}_r.xyz")
+            # calculate indicator
+            #logger.info(f"{len(conf_entry['E'])}")
+            #logger.info(f"{len(conf_entry['G'])}")
+            #logger.info(f"{len(Gr)}")
+            indicators = return_indicator(conf_entry['E'],conf_entry['G'],Gr,namespace=f'tmp_{job_id}')
+            reactant=io.read(tmp_xyz_r)
+            product=io.read(tmp_xyz_p)
+            minimize_rotation_and_translation(reactant,product)
+            io.write(tmp_xyz_p,product)
+            _,Gr_opt = xyz_parse(tmp_xyz_p)
+            indicators_opt = return_indicator(conf_entry['E'],conf_entry['G'],Gr_opt,namespace=f'tmp_{job_id}')
+            # if applying ase minimize_rotation_and_translation increases the intended probability, use the rotated geometry
+            if model.predict_proba(indicators)[0][1] < model.predict_proba(indicators_opt)[0][1]: indicators, Gr = indicators_opt, Gr_opt
+            # check whether the channel is classified as intended and check uniqueness
+            if model.predict_proba(indicators)[0][1] > 0.0 and check_duplicate(indicators,ind_list,thresh=0.025):
+                ind_list.append(indicators)
+                pass_obj_values.append((model.predict_proba(indicators)[0][0],deepcopy(conf_entry['G']),Gr,deepcopy(conf_entry['direct'])))
+            # remove tmp files
+            if os.path.isfile(tmp_xyz_r): os.remove(tmp_xyz_r)
+            if os.path.isfile(tmp_xyz_p): os.remove(tmp_xyz_p)
+        pass_obj_values=sorted(pass_obj_values, key=lambda x: x[0])
+        N_conf=0
+        for item in pass_obj_values:
+            input_type=item[3]
+            tmp_xyz_r = f"{self.args['scratch_xtb']}/{job_id}_r.xyz"
+            tmp_xyz_p = f"{self.args['scratch_xtb']}/{job_id}_p.xyz"
+            xyz_write(tmp_xyz_r, RE, item[1])
+            xyz_write(tmp_xyz_p, RE, item[2])
+            if self.args['opt']:
+                if self.args['low_solvation']:
+                    solvation_model, solvent = self.args['low_solvation'].split('/')
+                    optjob = XTB(input_geo=tmp_xyz_p,work_folder=self.args['scratch_xtb'],jobtype=['opt'],jobname=f'opt_{job_id}_p',solvent=solvent,\
+                                 solvation_model=solvation_model,charge=self.args['charge'],multiplicity=self.args['multiplicity'])
+                else:
+                    optjob = XTB(input_geo=tmp_xyz_p,work_folder=self.args['scratch_xtb'],jobtype=['opt'],jobname=f'opt_{job_id}_p',charge=self.args['charge'],multiplicity=self.args['multiplicity'])
+
+                optjob.execute()
+
+                if optjob.optimization_success():
+                    _, Gr = optjob.get_final_structure()
+                else:
+                    #logger.info(f"xtb geometry optimization failed for the other end of {job_id} (conf: {conf_ind}), will use the force-field optimized geometry instead")
+                    #Gr = item[2]
+                    continue
+
+                xyz_write(tmp_xyz_p,conf_entry['E'],Gr)
+
+            if input_type=='F':
+                _, rg=xyz_parse(tmp_xyz_r)
+                _, pg=xyz_parse(tmp_xyz_p)
+                self.rxn_conf[N_conf]={"R": rg, "P": pg}
+                os.system(f"rm {tmp_xyz_r}")
+                os.system(f"rm {tmp_xyz_p}")
+                #os.system(f"cp {tmp_xyz_r} {self.args['conf_output']}/{job_id}_{N_conf}.xyz; cat {tmp_xyz_p} >> {self.args['conf_output']}/{job_id}_{N_conf}.xyz;rm {tmp_xyz_r} {tmp_xyz_p}")
+            else:
+                _, rg=xyz_parse(tmp_xyz_p)
+                _, pg=xyz_parse(tmp_xyz_r)
+                self.rxn_conf[N_conf]={"R": rg, "P": pg}
+                os.system(f"rm {tmp_xyz_r}")
+                os.system(f"rm {tmp_xyz_p}")
+                #os.system(f"cp 
+            N_conf=N_conf+1
+            if N_conf>=self.args["n_conf"]: break
+
+        if len(pass_obj_values) == 0:
+            print(f"WARNING: no reaction conformation could be generated for the input reaction {job_id}. Please check that this reaction is a valid one.")
+
+        # TODO: add a joint-opt alignment if too few alignments pass the criteria
+        return
+
diff --git a/pyTEST_Example/wrappers/xtb.py b/pyTEST_Example/wrappers/xtb.py
new file mode 100755
index 0000000..c38687b
--- /dev/null
+++ b/pyTEST_Example/wrappers/xtb.py
@@ -0,0 +1,278 @@
+#!/bin/env python
+# Author: Qiyuan Zhao (zhaoqy1996@gmail.com)
+
+import subprocess
+import os,sys
+import numpy as np
+
+sys.path.append('/'.join(os.path.abspath(__file__).split('/')[:-2]))
+from yarp.input_parsers import xyz_parse
+from constants import Constants
+from yarp.taffi_functions import table_generator
+
+class XTB:
+    def __init__(self, input_geo, work_folder=os.getcwd(), lot='gfn2', jobtype=['opt'], nproc=1, scf_iters=300, jobname='xtbjob', solvent=False, solvation_model='alpb', charge=0, multiplicity=1, xtb_path='xtb'):
+        """
+        Initialize an xTB job class.
+        input_geo: an xyz file containing the input geometry
+        work_folder: directory for running xTB and saving output files; defaults to the current working directory
+        jobtype: select from ['', 'opt', 'grad', 'hess']; '' requests a single-point energy calculation
+        Solvation models:
+        --alpb: analytical linearized Poisson-Boltzmann (ALPB) model; available solvents are acetone, acetonitrile, aniline, benzaldehyde, benzene, ch2cl2, chcl3, cs2, dioxane, dmf, dmso,
+                ether, ethylacetate, furane, hexandecane, hexane, methanol, nitromethane, octanol, woctanol, phenol, toluene, thf, water.
+        --gbsa: generalized Born (GB) model with a solvent-accessible surface area (SASA) term; available solvents are acetone, acetonitrile, benzene (GFN1-xTB only), CH2Cl2, CHCl3, CS2, DMF (GFN2-xTB only),
+                DMSO, ether, H2O, methanol, n-hexane (GFN2-xTB only), THF and toluene.
+ """ + # set basic + self.input_geo = input_geo + self.scf_iters = f'--iterations {scf_iters}' + self.jobname = f'--namespace {jobname}' + self.charge = f'--chrg {charge}' + self.unpair = f'--uhf {multiplicity-1}' + self.nproc = f'--parallel {nproc}' + + # set level of theory + if lot == 'gfnff': self.lot = '--gfnff' + else: self.lot = f'--gfn {lot[-1]}' + + # set solvent + if solvation_model.lower() == 'alpb': solvation_model = 'alpb' + else: solvation_model = 'gbsa' # use GBSA implicit solvent + if solvent: self.solvent = f'--{solvation_model} {solvent} ' + else: self.solvent = solvent + + # set job + self.jobtype = '' + if 'opt' in jobtype: self.jobtype += '--opt ' + if 'grad' in jobtype: self.jobtype += '--grad ' + if 'hess' in jobtype: self.jobtype += '--hess ' + + # set working folder + self.work_folder = work_folder + self.xcontrol = os.path.join(self.work_folder,f'{jobname}.xcontrol') + self.output = os.path.join(self.work_folder,f'{jobname}-xtb.out') + + # create work folder + if os.path.isdir(self.work_folder) is False: os.mkdir(self.work_folder) + + # XTB calculation basic command + self.command = f'{xtb_path} {self.input_geo} {self.scf_iters} {self.charge} {self.unpair} {self.jobname} {self.lot} {self.jobtype} {self.nproc} ' + if self.solvent is True: self.command += self.solvent + + def generate_xcontrol(self, distance_constraints=[], cartesian_constraints=[], force_constant=0.5): + """ + Generate an XTB input file with constraints + Each element in distance_constraints should be [atomi,atomj,distance] -- index start from 1 + cartesian_constraints should be a list of atoms that need to be constrained + """ + with open(self.xcontrol, 'w') as f: + if len(distance_constraints) > 0: + for dis in distance_constraints: + f.write(f'$constrain\nforce constant={force_constant}\ndistance: {dis[0]}, {dis[1]}, {dis[2]:.4f}\n$\n\n') + + if len(cartesian_constraints) > 0: + list_of_ranges, used_atoms = [], [] + for i in sorted(cartesian_constraints): + atom_range = [] + if i not in used_atoms: + while i in cartesian_constraints: + used_atoms.append(i) + atom_range.append(i) + i += 1 + if len(atom_range) == 1: + list_of_ranges += str(atom_range[0]) + else: + list_of_ranges.append(f'{atom_range[0]}-{atom_range[-1]}') + + # write into constraints + f.write(f'$constrain\nforce constant={force_constant}\natoms: {",".join(list_of_ranges)}\n$\n\n') + + return + + def add_command(self, additional=False, distance_constraints=[], cartesian_constraints=[], force_constant=0.5): + """ + Add in additional command and cpnstraints + """ + # add other commands if is needed: + if additional: self.command += additional + if len(distance_constraints) > 0 or len(cartesian_constraints) > 0: + self.generate_xcontrol(distance_constraints, cartesian_constraints, force_constant) + self.command += f' --input {self.xcontrol}' + #print(self.command) + def execute(self): + """ + Execute a XTB calculation using the runtime flags + """ + + # obtain current path + current_path = os.getcwd() + + # go into the work folder and run the command + os.chdir(self.work_folder) + result = subprocess.run(f'{self.command} > {self.output}', shell=True, capture_output=True, text=True) + print(self.command) + print(self.output) + # print(self.command) + # print(result) + # go back to the original folder + os.chdir(current_path) + # os.system("sleep 10") + return None + + def calculation_terminated_normally(self) -> bool: + """ + Check if the calculation terminate normally + """ + # load in xtb output + print("NORMAL") + 
+        if os.path.isfile(self.output) is False: return False
+        lines = open(self.output, 'r', encoding="utf-8").readlines()
+
+        for line in reversed(lines):
+            if 'finished run' in line:
+                return True
+
+            if 'ERROR' in line:
+                return False
+
+        return False
+
+    def get_energy(self) -> float:
+        """
+        Get the single-point energy from the xTB output file
+        """
+        # load in the xtb output
+        lines = open(self.output, 'r', encoding="utf-8").readlines()
+        for line in reversed(lines):
+            if 'TOTAL ENERGY' in line:
+                return float(line.split()[-3])
+
+        return False
+
+    def optimization_converged(self) -> bool:
+        """
+        Check if the optimization converged
+        """
+        # load in the xtb output
+        lines = open(self.output, 'r', encoding="utf-8").readlines()
+
+        for line in reversed(lines):
+            if 'GEOMETRY OPTIMIZATION CONVERGED' in line:
+                return True
+
+        return False
+
+    def get_final_structure(self):
+        """
+        Get the final geometry (and elements) from the xTB output files
+        """
+        # first try the .xyz file generated by xtb
+        xyz_file_name = self.output.replace('-xtb.out','.xtbopt.xyz')
+        if os.path.exists(xyz_file_name):
+            E,G = xyz_parse(xyz_file_name)
+            return E, G
+
+        # if the xyz file does not exist, fall back to the (potentially long) .out file
+        lines = open(self.output, 'r', encoding="utf-8").readlines()
+        # locate the geometry
+        for i, line in enumerate(lines):
+            if 'final structure' in line:
+                n_atoms = int(lines[i+2].split()[0])
+                E, G = [], np.zeros([n_atoms,3])
+                for xyz_line in lines[i+4:i+4+n_atoms]:
+                    label, x, y, z = xyz_line.split()
+                    G[len(E),:] = np.array([x, y, z])
+                    E.append(label)
+                return E, G
+
+        return False
+
+    def optimization_success(self) -> bool:
+        """
+        Check if the optimization converged and the bonding pattern did not change
+        """
+        if not self.optimization_converged(): return False
+        E, G = xyz_parse(self.input_geo)
+        _, optG = self.get_final_structure()
+        adj_mat_i = table_generator(E, G)
+        adj_mat_o = table_generator(E, optG)
+        if np.sum(abs(adj_mat_i-adj_mat_o)) != 0:
+            # tolerate connectivity changes that only involve the metals Zn, Mg, Li
+            rows, cols = np.where(adj_mat_i != adj_mat_o)
+            contain_metal = [E[rows[ind]] in ['Zn','Mg','Li'] or E[cols[ind]] in ['Zn','Mg','Li'] for ind in range(len(rows))]
+            return False not in contain_metal
+        else:
+            return True
+
+    def get_gradients(self) -> np.ndarray:
+        """
+        Grab the gradient from the .gradient file, e.g.
+
+        $grad
+        cycle = 1 SCF energy = -116.98066838318 |dE/dxyz| = 0.000789
+        9.38359090261906 0.81176045977317 1.76153019659726 C
+        7.93119990853722 -1.04153755943172 2.82916916631817 C
+        ..............
+        ..............
+        1.6047114675284E-06 1.8306743536636E-06 -1.6882288831211E-05
+        4.6690618608144E-06 -6.9694786356967E-07 -7.4404905013152E-06
+        -4.0275148550147E-05 2.3942665105975E-05 -1.8011064350936E-05
+        """
+        gradients = []
+        grad_file_name = self.output.replace('-xtb.out','.gradient')
+
+        if os.path.exists(grad_file_name):
+            with open(grad_file_name, 'r') as grad_file:
+                for lc,line in enumerate(grad_file):
+                    # stop at the $end marker (checked first, since it has a single field)
+                    if '$end' in line: break
+                    # skip the header and the coordinate lines (which have 4 fields)
+                    if lc < 2 or len(line.split()) != 3: continue
+                    x, y, z = line.split()
+                    gradients.append(np.array([float(x), float(y), float(z)]))
+
+            # convert from Ha/a0 to Ha/Å
+            gradients = [grad / Constants.a0_to_ang for grad in gradients]
+            return np.array(gradients)
+        else:
+            return False
+
+    def get_hessian(self) -> np.ndarray:
+        """Grab the Hessian from the output .hessian file
+
+        e.g.::
+            $hessian
+            0.6826504303 0.0274199974 -0.0259468432 -0.1835741403 -0.1456143386
+            0.0850759357 -0.0398480235 -0.0214075717 0.0129469348 0.0039869468
+            -0.0019274759 0.0014269885 -0.0263103022 0.0241808385 -0.0135337523
+            ........
+        """
+        # locate the .hessian file generated by xtb
+        hess_file_name = self.output.replace('-xtb.out','.hessian')
+
+        hessian_blocks = []
+        if os.path.exists(hess_file_name):
+            # load in the hessian file
+            lines = open(hess_file_name, 'r', encoding="utf-8").readlines()
+            for i, line in enumerate(lines):
+                if '$hessian' in line: continue
+                if len(line.split()) == 0: break
+                hessian_blocks += line.split()
+
+            # convert into a numpy array and reshape
+            hessian_blocks = np.array(hessian_blocks, dtype='f8')
+            n_atoms_3 = int(round(len(hessian_blocks)**0.5))
+
+            # check that the flattened Hessian is a perfect square (3N x 3N)
+            if n_atoms_3 * n_atoms_3 != len(hessian_blocks):
+                print("Wrong hessian file...")
+                return False
+            else:
+                hessian = hessian_blocks.reshape([n_atoms_3, n_atoms_3])
+
+            # Hessians are printed in Ha/a0^2, so convert to Ha/Å^2
+            return hessian / Constants.a0_to_ang**2
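
Usage note: the sketch below shows how the XTB wrapper introduced in this patch might be driven end to end; the input file name, scratch folder, and water/ALPB settings are illustrative assumptions rather than values taken from the patch, and only methods defined in the class above are used.

# --- illustrative sketch, not part of the diff ---
from wrappers.xtb import XTB

# geometry optimization at GFN2-xTB in implicit water (ALPB); 'reactant.xyz'
# and 'scratch_xtb' are placeholder paths
job = XTB(input_geo='reactant.xyz', work_folder='scratch_xtb', lot='gfn2',
          jobtype=['opt'], jobname='demo_opt', solvent='water',
          solvation_model='alpb', charge=0, multiplicity=1)

# optionally restrain the distance between atoms 1 and 2 at 1.50 Å
# (indices are 1-based, as documented in generate_xcontrol)
job.add_command(distance_constraints=[[1, 2, 1.50]], force_constant=0.5)

job.execute()
if job.calculation_terminated_normally() and job.optimization_success():
    E, G = job.get_final_structure()   # element list and Nx3 coordinate array
    print('final energy (Ha):', job.get_energy())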