Skip to content

Commit

Permalink
Merge pull request #8 from lcmd-epfl/regression-read-kernel
Browse files Browse the repository at this point in the history
Global kernels!
  • Loading branch information
briling authored Oct 25, 2022
2 parents 5dcc624 + 8ee3510 commit 2db024b
Show file tree
Hide file tree
Showing 10 changed files with 283 additions and 46 deletions.
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ dependencies:
- wheel=0.37.0=pyhd3eb1b0_1
- xz=5.2.5=h7b6447c_0
- zlib=1.2.11=h7f8727e_4
- scikit-learn
- pip:
- attrs==21.4.0
- h5py==3.6.0
Expand Down
29 changes: 21 additions & 8 deletions qstack/regression/hyperparameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
import numpy as np
import scipy
from sklearn.model_selection import train_test_split, KFold
from qstack.regression.kernel_utils import get_kernel, defaults
from qstack.regression.kernel_utils import get_kernel, defaults, ParseKwargs
from qstack.tools import correct_num_threads

def hyperparameters(X, y,
sigma=defaults.sigmaarr, eta=defaults.etaarr,
sigma=defaults.sigmaarr, eta=defaults.etaarr, gkernel=defaults.gkernel, gdict=defaults.gdict,
akernel=defaults.kernel, test_size=defaults.test_size, splits=defaults.splits,
printlevel=0, adaptive=False):
printlevel=0, adaptive=False, read_kernel=False):

def k_fold_opt(K_all):
kfold = KFold(n_splits=splits, shuffle=False)
Expand All @@ -32,7 +32,11 @@ def k_fold_opt(K_all):
def hyper_loop(sigma, eta):
errors = []
for s in sigma:
K_all = kernel(X_train, X_train, 1.0/s)
if read_kernel is False:
K_all = kernel(X_train, X_train, 1.0/s)
else:
K_all = X_train

for e in eta:
K_all[np.diag_indices_from(K_all)] += e
mean, std = k_fold_opt(K_all)
Expand All @@ -43,8 +47,13 @@ def hyper_loop(sigma, eta):
errors.append((mean, std, e, s))
return errors

kernel = get_kernel(akernel)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0)
kernel = get_kernel(akernel, [gkernel, gdict])
if read_kernel is False:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0)
else:
idx_train, idx_test, y_train, y_test = train_test_split(np.arange(len(y)), y, test_size=test_size, random_state=0)
X_train = X[np.ix_(idx_train,idx_train)]
sigma = [np.nan]

work_sigma = sigma
errors = []
Expand Down Expand Up @@ -84,20 +93,24 @@ def main():
parser.add_argument('--x', type=str, dest='repr', required=True, help='path to the representations file')
parser.add_argument('--y', type=str, dest='prop', required=True, help='path to the properties file')
parser.add_argument('--test', type=float, dest='test_size', default=defaults.test_size, help='test set fraction (default='+str(defaults.test_size)+')')
parser.add_argument('--kernel', type=str, dest='kernel', default=defaults.kernel, help='kernel type (G for Gaussian, L for Laplacian, myL for Laplacian for open-shell systems) (default '+defaults.kernel+')')
parser.add_argument('--akernel', type=str, dest='akernel', default=defaults.kernel, help='local kernel type (G for Gaussian, L for Laplacian, myL for Laplacian for open-shell systems) (default '+defaults.kernel+')')
parser.add_argument('--gkernel', type=str, dest='gkernel', default=defaults.gkernel, help='global kernel type (avg for average kernel, rem for REMatch kernel) (default '+defaults.gkernel+')')
parser.add_argument('--gdict', nargs='*', action=ParseKwargs, dest='gdict', default=defaults.gdict, help='dictionary like input string to initialize global kernel parameters')
parser.add_argument('--splits', type=int, dest='splits', default=defaults.splits, help='k in k-fold cross validation (default='+str(defaults.n_rep)+')')
parser.add_argument('--print', type=int, dest='printlevel', default=0, help='printlevel')
parser.add_argument('--eta', type=float, dest='eta', default=defaults.etaarr, nargs='+', help='eta array')
parser.add_argument('--sigma', type=float, dest='sigma', default=defaults.sigmaarr, nargs='+', help='sigma array')
parser.add_argument('--ll', action='store_true', dest='ll', default=False, help='whether to correct for the number of threads')
parser.add_argument('--ada', action='store_true', dest='adaptive', default=False, help='if adapt sigma')
parser.add_argument('--readkernel', action='store_true', dest='readk', default=False, help='if X is kernel')
args = parser.parse_args()
if(args.readk): args.sigma = [np.nan]
print(vars(args))
if(args.ll): correct_num_threads()

X = np.load(args.repr)
y = np.loadtxt(args.prop)
errors = hyperparameters(X, y, sigma=args.sigma, eta=args.eta, akernel=args.kernel, test_size=args.test_size, splits=args.splits, printlevel=args.printlevel, adaptive=args.adaptive)
errors = hyperparameters(X, y, read_kernel=args.readk, sigma=args.sigma, eta=args.eta, akernel=args.kernel, test_size=args.test_size, splits=args.splits, printlevel=args.printlevel, adaptive=args.adaptive)

print()
print('error stdev eta sigma')
Expand Down
25 changes: 17 additions & 8 deletions qstack/regression/kernel.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
#!/usr/bin/env python3
import numpy as np
from qstack.regression.kernel_utils import get_kernel, defaults
from qstack.regression.kernel_utils import get_kernel, defaults, ParseKwargs
from qstack.tools import correct_num_threads
import sys

def kernel(X, sigma=defaults.sigma, akernel=defaults.kernel):
kernel = get_kernel(akernel)
K = kernel(X, X, 1.0/sigma)
def kernel(X, Y=None, sigma=defaults.sigma, akernel=defaults.kernel, gkernel=defaults.gkernel, gdict=defaults.gdict):
    """Compute the kernel matrix between representations X and Y.

    Args:
        X: representations (array of vectors, or list of molecules for global kernels).
        Y: optional second set of representations; None or empty means
           "compute the symmetric kernel K(X, X)".
        sigma: kernel width hyperparameter (passed as gamma = 1/sigma).
        akernel: local (atomic) kernel name, resolved by get_kernel.
        gkernel: global kernel name ('avg', 'rem'), or None for a local kernel.
        gdict: dict of global-kernel options.

    Returns:
        The kernel matrix K(X, Y).
    """
    # Y=None replaces the original mutable default argument (Y=[]),
    # which is shared across calls; behavior is unchanged for callers.
    if Y is None or len(Y) == 0:
        Y = X
    # Use a distinct local name: the original bound `kernel`, shadowing
    # this function inside its own body.
    kernel_fn = get_kernel(akernel, [gkernel, gdict])
    K = kernel_fn(X, Y, 1.0 / sigma)
    return K

def main():
Expand All @@ -14,15 +17,21 @@ def main():
parser = argparse.ArgumentParser(description='This program computes kernel.')
parser.add_argument('--x', type=str, dest='repr', required=True, help='path to the representations file')
parser.add_argument('--sigma', type=float, dest='sigma', default=defaults.sigma, help='sigma hyperparameter (default='+str(defaults.sigma)+')')
parser.add_argument('--kernel', type=str, dest='kernel', default=defaults.kernel, help='kernel type (G for Gaussian, L for Laplacian, myL for Laplacian for open-shell systems) (default '+defaults.kernel+')')
parser.add_argument('--akernel', type=str, dest='akernel', default=defaults.kernel, help='kernel type (G for Gaussian, L for Laplacian, myL for Laplacian for open-shell systems) (default '+defaults.kernel+')')
parser.add_argument('--gdict', nargs='*', action=ParseKwargs, dest='gdict', default=defaults.gdict, help='dictionary like input string to initialize global kernel parameters')
parser.add_argument('--gkernel', type=str, dest='gkernel', default=None, help='global kernel type (avg for average, rem for REMatch kernel, None for local kernels) (default None)')
parser.add_argument('--dir', type=str, dest='dir', default='./', help='directory to save the output in (default=current dir)')
parser.add_argument('--ll', action='store_true', dest='ll', default=False, help='whether to correct for the number of threads')
args = parser.parse_args()
print(vars(args))
if(args.ll): correct_num_threads()
X = np.load(args.repr)
K = kernel(X, args.sigma, args.kernel)
np.save(args.dir+'/K_'+os.path.basename(args.repr)+'_'+args.kernel, K)
if os.path.isfile(args.repr):
X = np.load(args.repr)
else:
x_files = [os.path.join(args.repr, f) for f in os.listdir(args.repr) if os.path.isfile(os.path.join(args.repr, f))]
X = [np.load(f, allow_pickle=True) for f in x_files]
K = kernel(X, sigma=args.sigma, akernel=args.akernel, gkernel=args.gkernel, gdict=args.gdict)
np.save(args.dir+'/K_'+os.path.splitext(os.path.basename(args.repr))[0]+'_'+args.akernel+'_'+f"{args.gkernel}"+f"_norm{'_'.join([str(v) for v in args.gdict.values()])}_"+"%e"%args.sigma, K)

if __name__ == "__main__":
main()
192 changes: 181 additions & 11 deletions qstack/regression/kernel_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,30 @@
import sysconfig
from types import SimpleNamespace
import qstack
import argparse

class ParseKwargs(argparse.Action):
    """argparse action that parses repeated ``key=value`` tokens into a dict.

    Example: ``--gdict alpha=0.5 normalize=1`` -> ``{'alpha': 0.5, 'normalize': 1}``.
    Values are coerced to int, then float, falling back to the raw string.
    Unspecified keys keep their values from ``defaults.gdict``.
    """
    def __call__(self, parser, namespace, values, option_string=None):
        # Start from a *copy* of the defaults: the original code stored
        # defaults.gdict itself and then mutated it in place, so parsed
        # options leaked back into the shared defaults namespace.
        setattr(namespace, self.dest, dict(defaults.gdict))
        for item in values:
            key, value = item.split('=')
            for cast in (int, float):
                try:
                    value = cast(value)
                    break
                except ValueError:  # narrow except instead of a bare one
                    continue
            getattr(namespace, self.dest)[key] = value




defaults = SimpleNamespace(
sigma=32.0,
eta=1e-5,
kernel='L',
gkernel='avg',
gdict={'alpha':1.0, 'normalize':1},
test_size=0.2,
n_rep=5,
splits=5,
Expand All @@ -16,6 +35,133 @@
sigmaarr_mult=list(numpy.logspace(0,2, 5))
)




def mol_to_dict(mol, species):
    """Group the per-atom feature vectors of one molecule by element.

    Args:
        mol: iterable of atoms, each a (symbol, feature_vector) pair.
        species: element symbols to use as keys; an element absent from
                 the molecule maps to an empty array.

    Returns:
        dict mapping each symbol in species to a 2D numpy array of its
        atoms' feature vectors (stacked in input order).
    """
    grouped = {element: [] for element in species}
    for atom in mol:
        grouped[atom[0]].append(atom[1])
    return {element: numpy.array(vectors) for element, vectors in grouped.items()}

def avg_kernel(kernel, options):
    """Average global kernel: the mean of all local-kernel entries.

    `options` is accepted for interface uniformity with other global
    kernels and is not used here.
    """
    return numpy.mean(kernel)


def rematch_kernel(kernel, options):
    """REMatch global kernel: entropy-regularized best-match score of a
    local-kernel matrix, computed with Sinkhorn fixed-point iterations.

    Args:
        kernel: (n, m) local covariance matrix between two molecules.
        options: dict with key 'alpha' — entropy regularization strength
                 (smaller alpha -> closer to the best-match limit).

    Returns:
        Scalar global kernel value K_rem.
    """
    alpha = options['alpha']
    thresh = 1e-6  # relative-change convergence threshold for u, v
    n, m = kernel.shape

    # Gibbs-like transformation of similarities into the Sinkhorn kernel
    K = numpy.exp(-(1 - kernel) / alpha)

    # uniform target marginals over the two atom sets
    en = numpy.ones((n,)) / float(n)
    em = numpy.ones((m,)) / float(m)

    u = en.copy()
    v = em.copy()

    niter = 0
    error = 1  # start above thresh so the loop runs at least once

    while error > thresh:
        v_prev = v
        u_prev = u

        # alternate marginal-matching updates (Sinkhorn scaling)
        u = numpy.divide(en, numpy.dot(K, v))
        v = numpy.divide(em, numpy.dot(K.T, u))

        # NOTE(review): `niter % 5` is truthy on iterations NOT divisible
        # by 5, so the error is recomputed on 4 of every 5 iterations and
        # skipped on multiples of 5 (including iteration 0); presumably
        # `niter % 5 == 0` ("check every 5th") was intended — confirm.
        if niter % 5:
            error = numpy.sum((u - u_prev) ** 2) / numpy.sum(u ** 2) + numpy.sum((v - v_prev) ** 2) / numpy.sum(v **2)

        niter += 1
    # optimal transport plan P_alpha = diag(u) K diag(v)
    p_alpha = numpy.multiply(numpy.multiply(K, u.reshape((-1, 1))) , v)
    # REMatch value: <P_alpha, kernel> (Frobenius inner product)
    K_rem = numpy.sum(numpy.multiply(p_alpha, kernel))
    return K_rem

def normalize_kernel(kernel, self_x=None, self_y=None):
    """Normalize a kernel matrix in place: K[n,m] /= sqrt(K(x_n,x_n) * K(y_m,y_m)).

    Args:
        kernel: (n, m) kernel matrix (float dtype); modified in place.
        self_x: self-similarities of the row entries; if both self_x and
                self_y are None, the diagonal of `kernel` is used for both
                (i.e. a square symmetric kernel is assumed).
        self_y: self-similarities of the column entries.

    Returns:
        The normalized kernel (the same array object, mutated in place).
    """
    print("Normalizing kernel.")
    # `is None` instead of `== None`: comparing a numpy array to None
    # with `==` broadcasts elementwise and breaks this boolean test.
    if self_x is None and self_y is None:
        self_cov = numpy.diag(kernel).copy()
        self_x = self_cov
        self_y = self_cov
    # Vectorized equivalent of the original elementwise double loop:
    # divide every entry by sqrt(self_x[n] * self_y[m]).
    kernel /= numpy.sqrt(numpy.outer(self_x, self_y))
    return kernel

def get_covariance(mol1, mol2, max_sizes, kernel, sigma=None):
    """Build the block-diagonal local covariance matrix between two molecules.

    Atoms are grouped by element; the local kernel is evaluated only
    within each element block (cross-element entries stay zero), and each
    block is zero-padded to the dataset-wide maximum atom count so every
    molecule pair yields a matrix of identical size.

    Args:
        mol1, mol2: molecules as lists of (symbol, feature_vector) pairs.
        max_sizes: dict mapping element symbol -> maximum atom count of
                   that element over the whole dataset.
        kernel: local kernel callable k(x1, x2, sigma).
        sigma: width hyperparameter forwarded to `kernel`.

    Returns:
        (max_size, max_size) block-diagonal covariance matrix, where
        max_size = sum(max_sizes.values()).
    """
    # NOTE: removed an unused re-import of get_kernel from this very
    # module; the kernel callable is passed in as an argument.
    species = sorted(max_sizes.keys())
    mol1_dict = mol_to_dict(mol1, species)
    mol2_dict = mol_to_dict(mol2, species)
    max_size = sum(max_sizes.values())
    K_covar = numpy.zeros((max_size, max_size))
    idx = 0
    for s in species:
        n1 = len(mol1_dict[s])
        n2 = len(mol2_dict[s])
        s_size = max_sizes[s]
        # If either molecule lacks this element, its block stays zero.
        if n1 == 0 or n2 == 0:
            idx += s_size
            continue
        x1 = numpy.pad(mol1_dict[s], ((0, s_size - n1), (0, 0)), 'constant')
        x2 = numpy.pad(mol2_dict[s], ((0, s_size - n2), (0, 0)), 'constant')
        K_covar[idx:idx+s_size, idx:idx+s_size] = kernel(x1, x2, sigma)
        idx += s_size
    return K_covar


def get_global_K(X, Y, sigma, local_kernel, global_kernel, options):
    """Compute the molecule-level (global) kernel matrix between two sets
    of molecules by reducing per-pair local covariance matrices with a
    global kernel (e.g. average or REMatch).

    Args:
        X, Y: sequences of molecules, each a list of (symbol, vector) atoms.
        sigma: width hyperparameter forwarded to the local kernel.
        local_kernel: callable k(x1, x2, sigma) on per-element blocks.
        global_kernel: callable reducing a covariance matrix to a scalar.
        options: dict of global-kernel options; key 'normalize' controls
                 post-hoc normalization.

    Returns:
        (len(X), len(Y)) global kernel matrix.
    """
    n_x = len(X)
    n_y = len(Y)
    # all element symbols occurring anywhere in either set
    species = sorted(list(set([s[0] for m in numpy.concatenate((X, Y), axis=0) for s in m])))

    # per-molecule element counts over the combined set
    mol_counts = []
    for m in numpy.concatenate((X, Y), axis=0):
        count_species = {s:0 for s in species}
        for a in m:
            count_species[a[0]] += 1
        mol_counts.append(count_species)

    # dataset-wide maximum atom count per element (block padding sizes)
    max_atoms = {s:0 for s in species}
    for m in mol_counts:
        for k, v in m.items():
            max_atoms[k] = max([v, max_atoms[k]])
    max_size = sum(max_atoms.values())
    print(max_atoms, max_size, flush=True)
    K_global = numpy.zeros((n_x, n_y))
    print("Computing global kernel elements:\n[", sep='', end='', flush=True)
    # NOTE(review): `self` here shadows the conventional instance name and
    # compares the first molecules for equality; if atom vectors are numpy
    # arrays, `==` on distinct objects can raise on the truth test —
    # presumably an identity check (X is Y) was intended. Confirm.
    if X[0] == Y[0]:
        self = True
    else:
        self = False
    self_X = []
    self_Y = []
    for m in range(0, n_x):
        if not self:
            # self-similarity of X[m], needed for normalization
            K_self = get_covariance(X[m], X[m], max_atoms, local_kernel, sigma=sigma)
            self_X.append(global_kernel(K_self, options))
        for n in range(0, n_y):
            if not self:
                # NOTE(review): this uses Y[m], not Y[n], and appends once
                # per (m, n) pair, so self_Y grows to n_x*n_y entries of
                # the wrong molecules — looks like it should compute Y[n]'s
                # self-similarity once per column, outside this loop. Confirm.
                K_self = get_covariance(Y[m], Y[m], max_atoms, local_kernel, sigma=sigma)
                self_Y.append(global_kernel(K_self, options))
            K_pair = get_covariance(X[m], Y[n], max_atoms, local_kernel, sigma=sigma)
            K_global[m][n] = global_kernel(K_pair, options)
        # NOTE(review): float division makes this modulo-10 test rarely
        # exactly zero, so the progress marker mostly never prints. Confirm
        # whether integer-percent progress was intended.
        if ((m+1) / len(X) * 100)%10 == 0:
            print(f"##### {(m+1) / len(X) * 100}% #####", sep='', end='', flush=True)
    print("]", flush=True)
    if options['normalize'] == True:
        if self :
            # square symmetric case: diagonal of K_global supplies the norms
            K_global = normalize_kernel(K_global, self_x=None, self_y=None)
        else:
            K_global = normalize_kernel(K_global, self_x=self_X, self_y=self_Y)
    print(f"Final global kernel has size : {K_global.shape}", flush=True)
    return K_global

def my_laplacian_kernel(X, Y, gamma):
""" Compute Laplacian kernel between X and Y """
def cdist(X, Y):
Expand Down Expand Up @@ -57,16 +203,40 @@ def my_laplacian_kernel_c(X, Y, gamma):



def get_kernel(arg):
def get_local_kernel(arg):
    """Resolve a local (atom-wise) kernel name to a callable k(X, Y, gamma).

    Known names: 'G' (Gaussian/RBF), 'dot' (linear), 'L' (Laplacian),
    'myL' (own Laplacian, for open-shell systems), 'myLfast' (C-accelerated).
    Raises for any other name. sklearn is imported lazily, only for the
    kernel actually requested.
    """
    def _gaussian():
        from sklearn.metrics.pairwise import rbf_kernel
        return rbf_kernel

    def _dot():
        from sklearn.metrics.pairwise import linear_kernel
        # the linear kernel takes no width parameter; accept and ignore it
        # so all local kernels share the (x, y, sigma) signature
        return lambda x, y, s: linear_kernel(x, y)

    def _laplacian():
        from sklearn.metrics.pairwise import laplacian_kernel
        return laplacian_kernel

    loaders = {
        'G': _gaussian,
        'dot': _dot,
        'L': _laplacian,
        'myL': lambda: my_laplacian_kernel,
        'myLfast': lambda: my_laplacian_kernel_c,
    }
    if arg not in loaders:
        raise Exception(f'{arg} kernel is not implemented') # TODO
    return loaders[arg]()

def get_global_kernel(arg, local_kernel):
    """Wrap a local kernel into a molecule-level (global) kernel.

    Args:
        arg: pair (gkernel, options) — global kernel name ('avg' for the
             average kernel, 'rem' for REMatch) and its options dict.
        local_kernel: local kernel callable k(x1, x2, sigma).

    Returns:
        A callable K(x, y, sigma) computing the global kernel matrix.
    """
    gkernel, options = arg
    if gkernel == 'avg':
        reducer = avg_kernel
    elif gkernel == 'rem':
        reducer = rematch_kernel
    else:
        raise Exception(f'{gkernel} kernel is not implemented') # TODO

    def global_k(x, y, s):
        return get_global_K(x, y, s, local_kernel, reducer, options)

    return global_k


def get_kernel(arg, arg2=None):
    """ Returns the kernel function depending on the cli argument """
    # arg: local kernel name ('G', 'dot', 'L', 'myL', 'myLfast').
    # arg2: optional [gkernel, gdict] pair; when given, the local kernel
    #       is wrapped into a molecule-level global kernel.
    # (This span interleaved the removed pre-refactor dispatch body with
    # the new implementation — the dead leftover branches are dropped.)
    local_kernel = get_local_kernel(arg)
    if arg2 is None:
        return local_kernel
    return get_global_kernel(arg2, local_kernel)
Loading

0 comments on commit 2db024b

Please sign in to comment.