diff --git a/environment.yml b/environment.yml index 8bd5d1c..b03042c 100644 --- a/environment.yml +++ b/environment.yml @@ -33,6 +33,7 @@ dependencies: - wheel=0.37.0=pyhd3eb1b0_1 - xz=5.2.5=h7b6447c_0 - zlib=1.2.11=h7f8727e_4 + - scikit-learn - pip: - attrs==21.4.0 - h5py==3.6.0 diff --git a/qstack/regression/hyperparameters.py b/qstack/regression/hyperparameters.py index 7b40ee8..2b71886 100755 --- a/qstack/regression/hyperparameters.py +++ b/qstack/regression/hyperparameters.py @@ -4,13 +4,13 @@ import numpy as np import scipy from sklearn.model_selection import train_test_split, KFold -from qstack.regression.kernel_utils import get_kernel, defaults +from qstack.regression.kernel_utils import get_kernel, defaults, ParseKwargs from qstack.tools import correct_num_threads def hyperparameters(X, y, - sigma=defaults.sigmaarr, eta=defaults.etaarr, + sigma=defaults.sigmaarr, eta=defaults.etaarr, gkernel=defaults.gkernel, gdict=defaults.gdict, akernel=defaults.kernel, test_size=defaults.test_size, splits=defaults.splits, - printlevel=0, adaptive=False): + printlevel=0, adaptive=False, read_kernel=False): def k_fold_opt(K_all): kfold = KFold(n_splits=splits, shuffle=False) @@ -32,7 +32,11 @@ def k_fold_opt(K_all): def hyper_loop(sigma, eta): errors = [] for s in sigma: - K_all = kernel(X_train, X_train, 1.0/s) + if read_kernel is False: + K_all = kernel(X_train, X_train, 1.0/s) + else: + K_all = X_train + for e in eta: K_all[np.diag_indices_from(K_all)] += e mean, std = k_fold_opt(K_all) @@ -43,8 +47,13 @@ def hyper_loop(sigma, eta): errors.append((mean, std, e, s)) return errors - kernel = get_kernel(akernel) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0) + kernel = get_kernel(akernel, [gkernel, gdict]) + if read_kernel is False: + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0) + else: + idx_train, idx_test, y_train, y_test = train_test_split(np.arange(len(y)), y, test_size=test_size, random_state=0) + X_train = X[np.ix_(idx_train,idx_train)] + sigma = [np.nan] work_sigma = sigma errors = [] @@ -84,20 +93,24 @@ def main(): parser.add_argument('--x', type=str, dest='repr', required=True, help='path to the representations file') parser.add_argument('--y', type=str, dest='prop', required=True, help='path to the properties file') parser.add_argument('--test', type=float, dest='test_size', default=defaults.test_size, help='test set fraction (default='+str(defaults.test_size)+')') - parser.add_argument('--kernel', type=str, dest='kernel', default=defaults.kernel, help='kernel type (G for Gaussian, L for Laplacian, myL for Laplacian for open-shell systems) (default '+defaults.kernel+')') + parser.add_argument('--akernel', type=str, dest='akernel', default=defaults.kernel, help='local kernel type (G for Gaussian, L for Laplacian, myL for Laplacian for open-shell systems) (default '+defaults.kernel+')') + parser.add_argument('--gkernel', type=str, dest='gkernel', default=defaults.gkernel, help='global kernel type (avg for average kernel, rem for REMatch kernel) (default '+defaults.gkernel+')') + parser.add_argument('--gdict', nargs='*', action=ParseKwargs, dest='gdict', default=defaults.gdict, help='dictionary like input string to initialize global kernel parameters') parser.add_argument('--splits', type=int, dest='splits', default=defaults.splits, help='k in k-fold cross validation (default='+str(defaults.n_rep)+')') parser.add_argument('--print', type=int, dest='printlevel', default=0, help='printlevel') parser.add_argument('--eta', type=float, dest='eta', default=defaults.etaarr, nargs='+', help='eta array') parser.add_argument('--sigma', type=float, dest='sigma', default=defaults.sigmaarr, nargs='+', help='sigma array') parser.add_argument('--ll', action='store_true', dest='ll', default=False, help='if correct for the numper of threads') parser.add_argument('--ada', action='store_true', dest='adaptive', default=False, help='if adapt sigma') + parser.add_argument('--readkernel', action='store_true', dest='readk', default=False, help='if X is kernel') args = parser.parse_args() + if(args.readk): args.sigma = [np.nan] print(vars(args)) if(args.ll): correct_num_threads() X = np.load(args.repr) y = np.loadtxt(args.prop) - errors = hyperparameters(X, y, sigma=args.sigma, eta=args.eta, akernel=args.kernel, test_size=args.test_size, splits=args.splits, printlevel=args.printlevel, adaptive=args.adaptive) + errors = hyperparameters(X, y, read_kernel=args.readk, sigma=args.sigma, eta=args.eta, akernel=args.kernel, test_size=args.test_size, splits=args.splits, printlevel=args.printlevel, adaptive=args.adaptive) print() print('error stdev eta sigma') diff --git a/qstack/regression/kernel.py b/qstack/regression/kernel.py index 1901b86..5662d01 100755 --- a/qstack/regression/kernel.py +++ b/qstack/regression/kernel.py @@ -1,11 +1,14 @@ #!/usr/bin/env python3 import numpy as np -from qstack.regression.kernel_utils import get_kernel, defaults +from qstack.regression.kernel_utils import get_kernel, defaults, ParseKwargs from qstack.tools import correct_num_threads +import sys -def kernel(X, sigma=defaults.sigma, akernel=defaults.kernel): - kernel = get_kernel(akernel) - K = kernel(X, X, 1.0/sigma) +def kernel(X, Y=[], sigma=defaults.sigma, akernel=defaults.kernel, gkernel=defaults.gkernel, gdict=defaults.gdict): + if len(Y) == 0 : + Y = X + kernel = get_kernel(akernel, [gkernel, gdict]) + K = kernel(X, Y, 1.0/sigma) return K def main(): @@ -14,15 +17,21 @@ def main(): parser = argparse.ArgumentParser(description='This program computes kernel.') parser.add_argument('--x', type=str, dest='repr', required=True, help='path to the representations file') parser.add_argument('--sigma', type=float, dest='sigma', default=defaults.sigma, help='sigma hyperparameter (default='+str(defaults.sigma)+')') - parser.add_argument('--kernel', type=str, dest='kernel', default=defaults.kernel, help='kernel type (G for Gaussian, L for Laplacian, myL for Laplacian for open-shell systems) (default '+defaults.kernel+')') + parser.add_argument('--akernel', type=str, dest='akernel', default=defaults.kernel, help='kernel type (G for Gaussian, L for Laplacian, myL for Laplacian for open-shell systems) (default '+defaults.kernel+')') + parser.add_argument('--gdict', nargs='*', action=ParseKwargs, dest='gdict', default=defaults.gdict, help='dictionary like input string to initialize global kernel parameters') + parser.add_argument('--gkernel', type=str, dest='gkernel', default=None, help='global kernel type (agv for average, rem for REMatch kernel, None for local kernels) (default None)') parser.add_argument('--dir', type=str, dest='dir', default='./', help='directory to save the output in (default=current dir)') parser.add_argument('--ll', action='store_true', dest='ll', default=False, help='if correct for the numper of threads') args = parser.parse_args() print(vars(args)) if(args.ll): correct_num_threads() - X = np.load(args.repr) - K = kernel(X, args.sigma, args.kernel) - np.save(args.dir+'/K_'+os.path.basename(args.repr)+'_'+args.kernel, K) + if os.path.isfile(args.repr): + X = np.load(args.repr) + else: + x_files = [os.path.join(args.repr, f) for f in os.listdir(args.repr) if os.path.isfile(os.path.join(args.repr, f))] + X = [np.load(f, allow_pickle=True) for f in x_files] + K = kernel(X, sigma=args.sigma, akernel=args.akernel, gkernel=args.gkernel, gdict=args.gdict) + np.save(args.dir+'/K_'+os.path.splitext(os.path.basename(args.repr))[0]+'_'+args.akernel+'_'+f"{args.gkernel}"+f"_norm{'_'.join([str(v) for v in args.gdict.values()])}_"+"%e"%args.sigma, K) if __name__ == "__main__": main() diff --git a/qstack/regression/kernel_utils.py b/qstack/regression/kernel_utils.py index 273764b..0456e79 100644 --- a/qstack/regression/kernel_utils.py +++ b/qstack/regression/kernel_utils.py @@ -2,11 +2,30 @@ import sysconfig from types import SimpleNamespace import qstack +import argparse + +class ParseKwargs(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, defaults.gdict) + for value in values: + key, value = value.split('=') + for t in [int, float]: + try: + value = t(value) + break + except: + continue + getattr(namespace, self.dest)[key] = value + + + defaults = SimpleNamespace( sigma=32.0, eta=1e-5, kernel='L', + gkernel='avg', + gdict={'alpha':1.0, 'normalize':1}, test_size=0.2, n_rep=5, splits=5, @@ -16,6 +35,133 @@ sigmaarr_mult=list(numpy.logspace(0,2, 5)) ) + + + +def mol_to_dict(mol, species): + mol_dict = {s:[] for s in species} + for a in mol: + mol_dict[a[0]].append(a[1]) + for k in mol_dict.keys(): + mol_dict[k] = numpy.array(mol_dict[k]) + return mol_dict + +def avg_kernel(kernel, options): + avg = numpy.sum(kernel) / (kernel.shape[0] * kernel.shape[1]) + return avg + + +def rematch_kernel(kernel, options): + alpha = options['alpha'] + thresh = 1e-6 + n, m = kernel.shape + + K = numpy.exp(-(1 - kernel) / alpha) + + en = numpy.ones((n,)) / float(n) + em = numpy.ones((m,)) / float(m) + + u = en.copy() + v = em.copy() + + niter = 0 + error = 1 + + while error > thresh: + v_prev = v + u_prev = u + + u = numpy.divide(en, numpy.dot(K, v)) + v = numpy.divide(em, numpy.dot(K.T, u)) + + if niter % 5: + error = numpy.sum((u - u_prev) ** 2) / numpy.sum(u ** 2) + numpy.sum((v - v_prev) ** 2) / numpy.sum(v **2) + + niter += 1 + p_alpha = numpy.multiply(numpy.multiply(K, u.reshape((-1, 1))) , v) + K_rem = numpy.sum(numpy.multiply(p_alpha, kernel)) + return K_rem + +def normalize_kernel(kernel, self_x=None, self_y=None): + print("Normalizing kernel.") + if self_x == None and self_y == None: + self_cov = numpy.diag(kernel).copy() + self_x = self_cov + self_y = self_cov + for n in range(kernel.shape[0]): + for m in range(kernel.shape[1]): + kernel[n][m] /= numpy.sqrt(self_x[n]*self_y[m]) + return kernel + +def get_covariance(mol1, mol2, max_sizes, kernel , sigma=None): + from qstack.regression.kernel_utils import get_kernel + species = sorted(max_sizes.keys()) + mol1_dict = mol_to_dict(mol1, species) + mol2_dict = mol_to_dict(mol2, species) + max_size = sum(max_sizes.values()) + K_covar = numpy.zeros((max_size, max_size)) + idx = 0 + for s in species: + n1 = len(mol1_dict[s]) + n2 = len(mol2_dict[s]) + s_size = max_sizes[s] + if n1 == 0 or n2 == 0: + idx += s_size + continue + x1 = numpy.pad(mol1_dict[s], ((0, s_size - n1),(0,0)), 'constant') + x2 = numpy.pad(mol2_dict[s], ((0, s_size - n2),(0,0)), 'constant') + K_covar[idx:idx+s_size, idx:idx+s_size] = kernel(x1, x2, sigma) + idx += s_size + return K_covar + + +def get_global_K(X, Y, sigma, local_kernel, global_kernel, options): + n_x = len(X) + n_y = len(Y) + species = sorted(list(set([s[0] for m in numpy.concatenate((X, Y), axis=0) for s in m]))) + + mol_counts = [] + for m in numpy.concatenate((X, Y), axis=0): + count_species = {s:0 for s in species} + for a in m: + count_species[a[0]] += 1 + mol_counts.append(count_species) + + max_atoms = {s:0 for s in species} + for m in mol_counts: + for k, v in m.items(): + max_atoms[k] = max([v, max_atoms[k]]) + max_size = sum(max_atoms.values()) + print(max_atoms, max_size, flush=True) + K_global = numpy.zeros((n_x, n_y)) + print("Computing global kernel elements:\n[", sep='', end='', flush=True) + if X[0] == Y[0]: + self = True + else: + self = False + self_X = [] + self_Y = [] + for m in range(0, n_x): + if not self: + K_self = get_covariance(X[m], X[m], max_atoms, local_kernel, sigma=sigma) + self_X.append(global_kernel(K_self, options)) + for n in range(0, n_y): + if not self: + K_self = get_covariance(Y[m], Y[m], max_atoms, local_kernel, sigma=sigma) + self_Y.append(global_kernel(K_self, options)) + K_pair = get_covariance(X[m], Y[n], max_atoms, local_kernel, sigma=sigma) + K_global[m][n] = global_kernel(K_pair, options) + if ((m+1) / len(X) * 100)%10 == 0: + print(f"##### {(m+1) / len(X) * 100}% #####", sep='', end='', flush=True) + print("]", flush=True) + if options['normalize'] == True: + if self : + K_global = normalize_kernel(K_global, self_x=None, self_y=None) + else: + K_global = normalize_kernel(K_global, self_x=self_X, self_y=self_Y) + print(f"Final global kernel has size : {K_global.shape}", flush=True) + return K_global + def my_laplacian_kernel(X, Y, gamma): """ Compute Laplacian kernel between X and Y """ def cdist(X, Y): @@ -57,16 +203,40 @@ def my_laplacian_kernel_c(X, Y, gamma): -def get_kernel(arg): +def get_local_kernel(arg): + if arg=='G': + from sklearn.metrics.pairwise import rbf_kernel + return rbf_kernel + elif arg=='dot': + from sklearn.metrics.pairwise import linear_kernel + return lambda x,y,s: linear_kernel(x, y) + elif arg=='L': + from sklearn.metrics.pairwise import laplacian_kernel + return laplacian_kernel + elif arg=='myL': + return my_laplacian_kernel + elif arg=='myLfast': + return my_laplacian_kernel_c + else: + raise Exception(f'{arg} kernel is not implemented') # TODO + +def get_global_kernel(arg, local_kernel): + gkernel, options = arg + if gkernel == 'avg': + global_kernel = avg_kernel + elif gkernel == 'rem': + global_kernel = rematch_kernel + else: + raise Exception(f'{gkernel} kernel is not implemented') # TODO + return lambda x, y, s: get_global_K(x, y, s, local_kernel, global_kernel, options) + + +def get_kernel(arg, arg2=None): """ Returns the kernel function depending on the cli argument """ - if arg=='G': - from sklearn.metrics.pairwise import rbf_kernel - return rbf_kernel - elif arg=='L': - from sklearn.metrics.pairwise import laplacian_kernel - return laplacian_kernel - elif arg=='myL': - return my_laplacian_kernel - elif arg=='myLfast': - return my_laplacian_kernel_c + local_kernel = get_local_kernel(arg) + + if arg2 is None: + return local_kernel + else: + return get_global_kernel(arg2, local_kernel) diff --git a/qstack/regression/regression.py b/qstack/regression/regression.py index 62a53b2..199a905 100755 --- a/qstack/regression/regression.py +++ b/qstack/regression/regression.py @@ -3,23 +3,29 @@ import numpy as np import scipy from sklearn.model_selection import train_test_split -from qstack.regression.kernel_utils import get_kernel, defaults +from qstack.regression.kernel_utils import get_kernel, defaults, ParseKwargs from qstack.tools import correct_num_threads -def regression(X, y, sigma=defaults.sigma, eta=defaults.eta, akernel=defaults.kernel, test_size=defaults.test_size, train_size=defaults.train_size, n_rep=defaults.n_rep, debug=False): - kernel = get_kernel(akernel) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0) - all_indices_train = np.arange(X_train.shape[0]) - K_all = kernel(X_train, X_train, 1.0/sigma) - Ks_all = kernel(X_test, X_train, 1.0/sigma) +def regression(X, y, read_kernel=False, sigma=defaults.sigma, eta=defaults.eta, akernel=defaults.kernel, gkernel=defaults.gkernel, gdict=defaults.gdict, test_size=defaults.test_size, train_size=defaults.train_size, n_rep=defaults.n_rep, debug=False): + if read_kernel is False: + kernel = get_kernel(akernel, [gkernel, gdict]) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0) + K_all = kernel(X_train, X_train, 1.0/sigma) + Ks_all = kernel(X_test, X_train, 1.0/sigma) + else: + idx_train, idx_test, y_train, y_test = train_test_split(np.arange(len(y)), y, test_size=test_size, random_state=0) + K_all = X[np.ix_(idx_train,idx_train)] + Ks_all = X[np.ix_(idx_test, idx_train)] + K_all[np.diag_indices_from(K_all)] += eta + all_indices_train = np.arange(len(y_train)) if debug: np.random.seed(666) maes_all = [] for size in train_size: - size_train = int(np.floor(X_train.shape[0]*size)) + size_train = int(np.floor(len(y_train)*size)) maes = [] for rep in range(n_rep): train_idx = np.random.choice(all_indices_train, size = size_train, replace=False) @@ -35,22 +41,25 @@ def regression(X, y, sigma=defaults.sigma, eta=defaults.eta, akernel=defaults.ke def main(): import argparse parser = argparse.ArgumentParser(description='This program computes the learning curve.') - parser.add_argument('--x', type=str, dest='repr', required=True, help='path to the representations file') - parser.add_argument('--y', type=str, dest='prop', required=True, help='path to the properties file') - parser.add_argument('--test', type=float, dest='test_size', default=defaults.test_size, help='test set fraction (default='+str(defaults.test_size)+')') - parser.add_argument('--eta', type=float, dest='eta', default=defaults.eta, help='eta hyperparameter (default='+str(defaults.eta)+')') - parser.add_argument('--sigma', type=float, dest='sigma', default=defaults.sigma, help='sigma hyperparameter (default='+str(defaults.sigma)+')') - parser.add_argument('--kernel', type=str, dest='kernel', default=defaults.kernel, help='kernel type (G for Gaussian, L for Laplacian, myL for Laplacian for open-shell systems) (default '+defaults.kernel+')') - parser.add_argument('--splits', type=int, dest='splits', default=defaults.n_rep, help='number of splits (default='+str(defaults.n_rep)+')') - parser.add_argument('--train', type=float, dest='train_size', default=defaults.train_size, nargs='+', help='training set fractions') - parser.add_argument('--debug', action='store_true', dest='debug', default=False, help='enable debug') - parser.add_argument('--ll', action='store_true', dest='ll', default=False, help='if correct for the numper of threads') + parser.add_argument('--x', type=str, dest='repr', required=True, help='path to the representations file') + parser.add_argument('--y', type=str, dest='prop', required=True, help='path to the properties file') + parser.add_argument('--test', type=float, dest='test_size', default=defaults.test_size, help='test set fraction (default='+str(defaults.test_size)+')') + parser.add_argument('--eta', type=float, dest='eta', default=defaults.eta, help='eta hyperparameter (default='+str(defaults.eta)+')') + parser.add_argument('--sigma', type=float, dest='sigma', default=defaults.sigma, help='sigma hyperparameter (default='+str(defaults.sigma)+')') + parser.add_argument('--akernel', type=str, dest='akernel', default=defaults.kernel, help='local kernel type (G for Gaussian, L for Laplacian, myL for Laplacian for open-shell systems) (default '+defaults.kernel+')') + parser.add_argument('--gkernel', type=str, dest='gkernel', default=defaults.gkernel, help='global kernel type (avg for average kernel, rem for REMatch kernel) (default '+defaults.gkernel+')') + parser.add_argument('--gdict', nargs='*', action=ParseKwargs, dest='gdict', default=defaults.gdict, help='dictionary like input string to initialize global kernel parameters') + parser.add_argument('--splits', type=int, dest='splits', default=defaults.n_rep, help='number of splits (default='+str(defaults.n_rep)+')') + parser.add_argument('--train', type=float, dest='train_size', default=defaults.train_size, nargs='+', help='training set fractions') + parser.add_argument('--debug', action='store_true', dest='debug', default=False, help='enable debug') + parser.add_argument('--ll', action='store_true', dest='ll', default=False, help='if correct for the numper of threads') + parser.add_argument('--readkernel', action='store_true', dest='readk', default=False, help='if X is kernel') args = parser.parse_args() print(vars(args)) if(args.ll): correct_num_threads() X = np.load(args.repr) y = np.loadtxt(args.prop) - maes_all = regression(X, y, sigma=args.sigma, eta=args.eta, akernel=args.kernel, + maes_all = regression(X, y, read_kernel=args.readk, sigma=args.sigma, eta=args.eta, akernel=args.kernel, test_size=args.test_size, train_size=args.train_size, n_rep=args.splits, debug=args.debug) for size_train, meanerr, stderr in maes_all: print("%d\t%e\t%e" % (size_train, meanerr, stderr)) diff --git a/requirements.txt b/requirements.txt index 22b9d55..9951ece 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,5 @@ pytest==6.2.5 numpy===1.22.3 scipy==1.7.3 toml==0.10.2 +scikit-learn git+https://github.com/lab-cosmo/equistore.git#egg=equistore diff --git a/tests/data/SPAHM_a_H2O/X_H2O.npy b/tests/data/SPAHM_a_H2O/X_H2O.npy new file mode 100644 index 0000000..cd5abfc Binary files /dev/null and b/tests/data/SPAHM_a_H2O/X_H2O.npy differ diff --git a/tests/data/SPAHM_a_H2O/X_H2O_dist.npy b/tests/data/SPAHM_a_H2O/X_H2O_dist.npy new file mode 100644 index 0000000..a11e28c Binary files /dev/null and b/tests/data/SPAHM_a_H2O/X_H2O_dist.npy differ diff --git a/tests/data/SPAHM_a_H2O/X_rotated_H2O.npy b/tests/data/SPAHM_a_H2O/X_rotated_H2O.npy new file mode 100644 index 0000000..dc6700b Binary files /dev/null and b/tests/data/SPAHM_a_H2O/X_rotated_H2O.npy differ diff --git a/tests/test_global.py b/tests/test_global.py new file mode 100644 index 0000000..ed84f09 --- /dev/null +++ b/tests/test_global.py @@ -0,0 +1,34 @@ +import os +import numpy as np +from qstack.regression import kernel, kernel_utils + + + +def test_avg_kernel(): + + path = os.path.dirname(os.path.realpath(__file__)) + X_dir = os.path.join(path, 'data/SPAHM_a_H2O/') + mols = [np.load(os.path.join(X_dir, f), allow_pickle=True) for f in os.listdir(X_dir) if os.path.isfile(os.path.join(X_dir,f))] + K = kernel.kernel(mols, akernel='L', gkernel='avg', sigma=1.0) + + true_K = np.array( [[1. , 1. , 0.79179528], \ + [1. , 1. , 0.79179528] , \ + [0.79179528, 0.79179528, 1. ]]) + + + assert(K.shape == (3,3)) + assert(np.abs(np.sum(K-true_K)) < 1e-05) + +def test_rem_kernel(): + path = os.path.dirname(os.path.realpath(__file__)) + X_dir = os.path.join(path, 'data/SPAHM_a_H2O/') + mols = [np.load(os.path.join(X_dir, f), allow_pickle=True) for f in os.listdir(X_dir) if os.path.isfile(os.path.join(X_dir,f))] + K = kernel.kernel(mols, akernel='L', gkernel='rem', sigma=1.0, gdict={'alpha':1.0, 'normalize':1}) + + true_K = np.array( [[1. , 0.6528238, 1. ], \ + [0.6528238,1. ,0.6528238], \ + [1. ,0.6528238 ,1. ]]) + + assert(K.shape == (3,3)) + assert(np.abs(np.sum(K-true_K)) < 1e-05) +