-
Notifications
You must be signed in to change notification settings - Fork 1
/
gauss_reader.py
84 lines (67 loc) · 2.87 KB
/
gauss_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#gauss_reader reads in a .out file from gaussian 09 and extracts
#data as desired for further analysis. A pandas dataframe should be generated
#which can be easily written off to an excel file
#author Trevor Sleight: twsleight@gmail.# COMBAK:
#Initially drafted Summer 2020
import os
import pandas as pd
import glob
#need to show lots of precision because we're working in Hartrees
pd.set_option('precision', 7)
cwd = os.getcwd()
print(cwd)
namelist=[]
prevLine = []
allData = pd.DataFrame(columns = ['SMILES', 'TA98', 'filename'])
#this is a reference file that provides the truth data that will be used for
#training algorithms later.
ta98 = pd.read_excel(cwd +r"//newta98.xlsx", sheet_name = 'Sheet1')
for filename in glob.glob('*gas.out'):
print(filename)
namelist.append(filename)
termFlag = 0 # make sure the file terminated normally
errorFlag = 0 # make sure that there are no errors
with open(filename)as searchfile:
#************************
#initialized with error codes
p = 'not found'
c = 'not found'
t = 'not found'
for line in searchfile:
#error checking
if 'Normal termination of Gaussian 09' in line:
termFlag = 1
if 'slurmstepd: error:' in line:
errorFlag = 1
#***************************************************************
#Get the smiles code that we have in the top of every input card
left,sep,right=line.partition('Symbolic Z-matrix:')
if sep:
# print(sep)
# print(prevLine)
parsedLine = prevLine2. split()
for p in parsedLine:
#restricting the smiles code to just this limits the
#dataset to hydrocarbons
if set(p).issubset('CcOo=()-123456789\/') and ('c' or 'C') in set(p):
# print(p)
curSmiles = p
prevLine2 = prevLine
prevLine = line
#end of the searchfile loop
#add the filename data, and the TA98 data
if termFlag == 1 and errorFlag == 0:
#********************
if len(ta98.loc[ta98['SMILES'] == p]['result']) == 1:
ta98Result = int(ta98.loc[ta98['SMILES'] == p]['result'])
else:
ta98Result = 'error'
saveName = filename.split('_')
tempSlice = pd.DataFrame(data = {'SMILES':[p],'TA98': ta98Result, 'filename': saveName[0] })
allData = allData.append(tempSlice)
#********************
else:
print(filename, ' did not terminate correctly')
#clean up and drop duplicate smiles codes
allData.drop_duplicates(subset='SMILES', keep="first", inplace = True)
print(allData.to_string())