GEKPLS version with SK Learn PLS #357

Closed
399 changes: 399 additions & 0 deletions src/GEKPLS.jl
@@ -0,0 +1,399 @@
using LinearAlgebra
using ScikitLearn
using Statistics

@sk_import cross_decomposition: PLSRegression #todo - replace SK Learn's PLS with a pure Julia alternative

mutable struct GEKPLS <: AbstractSurrogate
    x::Matrix{Float64} #1
    y::Matrix{Float64} #2
    grads::Matrix{Float64} #3
    xl::Matrix{Float64} #4 - xlimits
    delta::Float64 #5
    extra_points::Int64 #6
    num_components::Int64 #7
    beta::Vector{Float64} #8
    gamma::Matrix{Float64} #9
    theta::Vector{Float64} #10
    reduced_likelihood_function_value::Float64 #11
    X_offset::Matrix{Float64} #12
    X_scale::Matrix{Float64} #13
    X_after_std::Matrix{Float64} #14 - X after standardization
    pls_mean::Matrix{Float64} #15
    y_mean::Float64 #16
    y_std::Float64 #17
end

# constructor for the GEKPLS struct
function GEKPLS(X, y, grads, n_comp, delta_x, xlimits, extra_points, θ)
    theta = [θ for i in 1:n_comp]
    pls_mean, X_after_PLS, y_after_PLS = _ge_compute_pls(X, y, n_comp, grads, delta_x,
        xlimits, extra_points)
    X_after_std, y_after_std, X_offset, y_mean, X_scale, y_std = standardization(X_after_PLS,
        y_after_PLS)
    D, ij = cross_distances(X_after_std)
    pls_mean_reshaped = reshape(pls_mean, (size(X, 2), n_comp))
    d = componentwise_distance_PLS(D, "squar_exp", n_comp, pls_mean_reshaped)
    nt, nd = size(X_after_PLS)
    beta, gamma, reduced_likelihood_function_value = _reduced_likelihood_function(theta,
        "squar_exp", d, nt, ij, y_after_std)
    return GEKPLS(X, y, grads, xlimits, delta_x, extra_points, n_comp, beta, gamma, theta,
        reduced_likelihood_function_value, X_offset, X_scale, X_after_std,
        pls_mean_reshaped, y_mean, y_std)
end


# predictor
function (g::GEKPLS)(X_test)
    n_eval, n_features_x = size(X_test)
    X_cont = (X_test .- g.X_offset) ./ g.X_scale # standardize with the training offset and scale
    dx = differences(X_cont, g.X_after_std)
    pred_d = componentwise_distance_PLS(dx, "squar_exp", g.num_components, g.pls_mean)
    nt = size(g.X_after_std, 1)
    r = transpose(reshape(squar_exp(g.theta, pred_d), (nt, n_eval))) # correlations between test and training points
    f = ones(n_eval, 1)
    y_ = (f * g.beta) + (r * g.gamma) # BLUP: mean term plus correlation-weighted residuals
    y = g.y_mean .+ g.y_std * y_ # undo the output standardization
    return y
end
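
# Illustrative usage sketch (not part of this PR; assumes a simple quadratic
# test function with analytic gradients):
function _gekpls_usage_example()
    n = 50
    X = -2.0 .+ 4.0 .* rand(n, 3) # n samples in [-2, 2]^3
    y = sum(X .^ 2, dims = 2) # f(x) = x1^2 + x2^2 + x3^2, as an n x 1 matrix
    grads = 2.0 .* X # analytic gradient of f at each sample
    xlimits = [-2.0 2.0; -2.0 2.0; -2.0 2.0]
    g = GEKPLS(X, y, grads, 2, 0.0001, xlimits, 2, 0.01) # n_comp = 2, delta_x = 0.0001, extra_points = 2, θ = 0.01
    return g([0.5 0.5 0.5]) # prediction at one test point; the true value is 0.75
end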

"""
    _ge_compute_pls(X, y, n_comp, grads, delta_x, xlimits, extra_points)

Gradient-enhanced PLS coefficients.

Parameters
----------
X: [n_obs, dim] - The input variables.
y: [n_obs, ny] - The output variable.
n_comp: Int - Number of principal components used.
grads: [n_obs, dim] - The gradient values.
delta_x: Real - The step used in the first-order Taylor approximation.
xlimits: [dim, 2] - The lower and upper variable bounds.
extra_points: Int - The number of extra points per training point.

Returns
-------
pls_mean: [dim, n_comp, 1] - The mean of the absolute PLS coefficients across training points.
X: Concatenation of X and the extra points XX: [extra_points*nt, dim] (when extra_points > 0).
y: Concatenation of y and the extra responses yy: [extra_points*nt, 1] (when extra_points > 0).
"""
function _ge_compute_pls(X, y, n_comp, grads, delta_x, xlimits, extra_points)
    # this function is equivalent to a combination of
    # https://github.com/SMTorg/smt/blob/f124c01ffa78c04b80221dded278a20123dac742/smt/utils/kriging_utils.py#L1036
    # and https://github.com/SMTorg/smt/blob/f124c01ffa78c04b80221dded278a20123dac742/smt/surrogate_models/gekpls.py#L48
    nt, dim = size(X)
    XX = zeros(0, dim)
    yy = zeros(0, size(y)[2])
    _pls = PLSRegression(n_comp)
    coeff_pls = zeros((dim, n_comp, nt))
    for i in 1:nt
        if dim >= 3 #todo: consistent with SMT but inefficient; hoist the dim >= 3 check out of the loop
            bb_vals = circshift(boxbehnken(dim, 1), 1)
            bb_vals = bb_vals .* (delta_x * (xlimits[:, 2] - xlimits[:, 1]))' #smt calls this sign. I've called it bb_vals
            _X = X[i, :]' .+ bb_vals # Box-Behnken pattern of points around the i-th training point
            bb_vals = bb_vals .* grads[i, :]'
            _y = y[i, :] .+ sum(bb_vals, dims = 2) # first-order Taylor approximation of y at those points
        else
            error("GEKPLS for fewer than 3 dimensions is not yet implemented")
        end
        _pls.fit(_X, _y) # fit a local PLS model around the i-th training point
        coeff_pls[:, :, i] = _pls.x_rotations_ #size of _pls.x_rotations_ is (dim, n_comp)
        if extra_points != 0
            start_index = max(1, length(coeff_pls[:, 1, i]) - extra_points + 1) #todo: evaluate just once
            max_coeff = sortperm(broadcast(abs, coeff_pls[:, 1, i]))[start_index:end] # indices of the largest absolute PLS coefficients
            for ii in max_coeff
                XX = [XX; transpose(X[i, :])]
                XX[end, ii] += delta_x * (xlimits[ii, 2] - xlimits[ii, 1])
                yy = [yy; y[i]]
                yy[end] += grads[i, ii] * delta_x * (xlimits[ii, 2] - xlimits[ii, 1])
            end
        end
    end
    if extra_points != 0
        X = [X; XX]
        y = [y; yy]
    end
    pls_mean = mean(broadcast(abs, coeff_pls), dims = 3)
    return pls_mean, X, y
end
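
# Minimal numeric sketch of the first-order Taylor step used for the extra
# points above (all values below are made up for illustration):
let y0 = 2.0, grad = 3.0, delta_x = 0.01, lo = 0.0, hi = 10.0
    step = delta_x * (hi - lo) # perturbation along one input, scaled by its range
    y_extra = y0 + grad * step # extrapolated response, as in yy[end] above
    @assert isapprox(y_extra, 2.3)
end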

######start of bbdesign######


#
# Adapted from 'ExperimentalDesign.jl: Design of Experiments in Julia'
# https://github.com/phrb/ExperimentalDesign.jl

# MIT License

# ExperimentalDesign.jl: Design of Experiments in Julia
# Copyright (C) 2019 Pedro Bruel <[email protected]>

# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:

# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#

function boxbehnken(matrix_size::Int)
    boxbehnken(matrix_size, matrix_size)
end

function boxbehnken(matrix_size::Int, center::Int)
    @assert matrix_size >= 3

    A_fact = explicit_fullfactorial(Tuple([-1, 1] for i in 1:2))

    rows = floor(Int, (0.5 * matrix_size * (matrix_size - 1)) * size(A_fact)[1])

    A = zeros(rows, matrix_size)

    l = 0
    for i in 1:(matrix_size - 1)
        for j in (i + 1):matrix_size
            l = l + 1
            A[(max(0, (l - 1) * size(A_fact)[1]) + 1):(l * size(A_fact)[1]), i] = A_fact[:, 1]
            A[(max(0, (l - 1) * size(A_fact)[1]) + 1):(l * size(A_fact)[1]), j] = A_fact[:, 2]
        end
    end

    if center == matrix_size
        if matrix_size <= 16
            points = [0, 0, 3, 3, 6, 6, 6, 8, 9, 10, 12, 12, 13, 14, 15, 16]
            center = points[matrix_size]
        end
    end

    # append `center` rows of zeros (the center points of the design)
    A = vcat(A, zeros(center, matrix_size))
end

function explicit_fullfactorial(factors::Tuple)
    explicit_fullfactorial(fullfactorial(factors))
end

function explicit_fullfactorial(iterator::Base.Iterators.ProductIterator)
    hcat(vcat.(collect(iterator)...)...)
end

function fullfactorial(factors::Tuple)
    Base.Iterators.product(factors...)
end
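
# Sanity sketch for the Box-Behnken helper: for 3 factors the design has
# 12 edge-midpoint runs plus 3 center rows (from the lookup table above).
let bb = boxbehnken(3)
    @assert size(bb) == (15, 3)
    @assert all(v -> v in (-1.0, 0.0, 1.0), bb)
end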


######end of bb design######

"""
    standardization(X, y)

We subtract the mean from each variable. Then, we divide the values of each
variable by its standard deviation.

Parameters
----------
X - The input variables.
y - The output variable.

Returns
-------
X: [n_obs, dim]
    The standardized input matrix.
y: [n_obs, 1]
    The standardized output vector.
X_offset: The mean of each input variable.
y_mean: The mean of the output variable.
X_scale: The standard deviation of each input variable.
y_std: The standard deviation of the output variable.
"""
function standardization(X, y)
    # equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L21
    X_offset = mean(X, dims = 1)
    X_scale = std(X, dims = 1)
    X_scale = map(x -> x == 0.0 ? 1.0 : x, X_scale) # prevent division by zero below
    y_mean = mean(y)
    y_std = std(y)
    y_std = y_std == 0.0 ? 1.0 : y_std # prevent division by zero below
    X = (X .- X_offset) ./ X_scale
    y = (y .- y_mean) ./ y_std
    return X, y, X_offset, y_mean, X_scale, y_std
end
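
# Sanity sketch: each standardized column has mean ≈ 0 and std ≈ 1, and the
# returned offset and scale invert the transform.
let X = [1.0 2.0; 3.0 4.0; 5.0 12.0], y = reshape([1.0, 2.0, 3.0], 3, 1)
    Xs, ys, X_offset, y_mean, X_scale, y_std = standardization(X, y)
    @assert all(isapprox.(mean(Xs, dims = 1), 0.0; atol = 1e-12))
    @assert all(isapprox.(std(Xs, dims = 1), 1.0))
    @assert isapprox(Xs .* X_scale .+ X_offset, X)
end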

"""
    cross_distances(X)

Computes the nonzero componentwise cross-distances between the vectors in X.

Parameters
----------
X: [n_obs, dim]

Returns
-------
D: [n_obs * (n_obs - 1) / 2, dim]
    - The cross-distances between the vectors in X.
ij: [n_obs * (n_obs - 1) / 2, 2]
    - The indices i and j of the vectors in X associated with the
      cross-distances in D.
"""
function cross_distances(X)
    # equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L86
    n_samples, n_features = size(X)
    n_nonzero_cross_dist = (n_samples * (n_samples - 1)) ÷ 2
    ij = zeros(Int, n_nonzero_cross_dist, 2)
    D = zeros(n_nonzero_cross_dist, n_features)
    ll_1 = 0
    for k in 1:(n_samples - 1)
        ll_0 = ll_1 + 1
        ll_1 = ll_0 + n_samples - k - 1
        ij[ll_0:ll_1, 1] .= k
        ij[ll_0:ll_1, 2] = (k + 1):n_samples
        D[ll_0:ll_1, :] = -(X[(k + 1):n_samples, :] .- X[k, :]')
    end
    return D, ij
end
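
# Sanity sketch: for 3 points there are 3 * 2 / 2 = 3 pairwise rows, and
# row k of D holds X[i, :] - X[j, :] for the index pair in ij[k, :].
let X = [0.0 0.0; 1.0 0.0; 1.0 2.0]
    D, ij = cross_distances(X)
    @assert size(D) == (3, 2) && size(ij) == (3, 2)
    @assert ij[1, :] == [1, 2]
    @assert D[1, :] == X[1, :] .- X[2, :]
end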

"""
    componentwise_distance_PLS(D, corr, n_comp, coeff_pls)

Computes the nonzero componentwise cross-spatial-correlation-distance
between the vectors in X.

Equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L1257
with some simplifications (theta and return_derivative removed, as they are not required for GEKPLS).

Parameters
----------
D: [n_obs * (n_obs - 1) / 2, dim]
    - The L1 cross-distances between the vectors in X.
corr: str
    - Name of the correlation function used: squar_exp or abs_exp.
n_comp: int
    - Number of principal components used.
coeff_pls: [dim, n_comp]
    - The PLS coefficients.

Returns
-------
D_corr: [n_obs * (n_obs - 1) / 2, n_comp]
    - The componentwise cross-spatial-correlation-distance between the
      vectors in X.
"""
function componentwise_distance_PLS(D, corr, n_comp, coeff_pls)
    #todo: figure out how to handle this computation in the case of very large
    #matrices, similar to what SMT does at
    #https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L1257
    if corr == "squar_exp"
        D_corr = D .^ 2 * coeff_pls .^ 2
    else #abs_exp
        D_corr = abs.(D) * abs.(coeff_pls)
    end
    return D_corr
end
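
# Sanity sketch: with the squar_exp kernel, each component's distance is a
# PLS-coefficient-weighted sum of squared input distances.
let D = [1.0 2.0], coeff_pls = [0.5 0.0; 0.0 0.5]
    D_corr = componentwise_distance_PLS(D, "squar_exp", 2, coeff_pls)
    @assert isapprox(D_corr, [0.25 1.0]) # [1^2 * 0.5^2, 2^2 * 0.5^2]
end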

"""
    squar_exp(theta, d)

Squared exponential correlation model.
Equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L604

Parameters
----------
theta: Hyperparameters of the correlation model.
d: Componentwise distances from componentwise_distance_PLS.

Returns
-------
r: Array containing the values of the autocorrelation model.
"""
function squar_exp(theta, d)
    n_components = size(d)[2]
    theta = reshape(theta, (1, n_components))
    return exp.(-sum(theta .* d, dims = 2))
end
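
# Sanity sketch: zero distance gives correlation 1, and the correlation
# decays exponentially in the theta-weighted distances.
let d = [0.0 0.0; 1.0 1.0], theta = [0.5, 0.5]
    r = squar_exp(theta, d)
    @assert isapprox(r[1], 1.0)
    @assert isapprox(r[2], exp(-1.0))
end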

function differences(X, Y)
    # equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L392
    # code credit: Elias Carvalho - https://stackoverflow.com/questions/72392010/row-wise-operations-between-matrices-in-julia
    Rx = repeat(X, inner = (size(Y, 1), 1)) # each row of X repeated size(Y, 1) times
    Ry = repeat(Y, size(X, 1)) # the rows of Y cycled size(X, 1) times
    return Rx - Ry
end
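
# Sanity sketch: `differences` forms every (row of X) - (row of Y), with the
# rows of Y cycled fastest.
let X = [1.0 1.0; 2.0 2.0], Y = [0.0 0.0; 1.0 0.0]
    dx = differences(X, Y)
    @assert size(dx) == (4, 2)
    @assert dx[2, :] == X[1, :] .- Y[2, :]
end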

"""
    _reduced_likelihood_function(theta, kernel_type, d, nt, ij, y_norma, noise = 0.0)

This function is a loose translation of SMT code from
https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/surrogate_models/krg_based.py#L247
It determines the BLUP parameters and evaluates the reduced likelihood function for the given theta.

Parameters
----------
theta: Array containing the parameters at which the Gaussian process model parameters should be determined.
kernel_type: Name of the correlation function.
d: The componentwise cross-spatial-correlation-distance between the vectors in X.
nt: Number of training points.
ij: The indices i and j of the vectors in X associated with the cross-distances in D.
y_norma: Standardized y values.
noise: Noise hyperparameter; increasing noise reduces reduced_likelihood_function_value.

Returns
-------
beta: Generalized least-squares regression weights.
gamma: Gaussian process weights.
reduced_likelihood_function_value: Real
    - The value of the reduced likelihood function associated with the given autocorrelation parameters theta.
"""
function _reduced_likelihood_function(theta, kernel_type, d, nt, ij, y_norma, noise = 0.0)
    nugget = 1000000.0 * eps() # jitter for numerical stability; reducing the multiplier causes a positive-definiteness error in the Cholesky decomposition
    if kernel_type == "squar_exp" #todo: add other kernel types like abs_exp, matern52, etc.
        r = squar_exp(theta, d)
    else
        error("kernel type $kernel_type is not supported yet")
    end
    R = Matrix((1.0 + nugget + noise) * I, nt, nt) # correlation matrix, filled in from the pairwise correlations below
    for k in 1:size(ij)[1] #todo: speed up using @inbounds / @simd where required
        R[ij[k, 1], ij[k, 2]] = r[k]
        R[ij[k, 2], ij[k, 1]] = r[k]
    end
    C = cholesky(R).L #todo: values diverge at this point from the SMT code; verify impact
    F = ones(nt, 1) #todo: examine whether this should be a parameter of this function
    Ft = C \ F
    Q, G = qr(Ft)
    Q = Array(Q)
    Yt = C \ y_norma
    #todo: SMT checks whether the matrix is ill-conditioned using SVD; verify and include if necessary
    beta = G \ [(transpose(Q) ⋅ Yt)] # F has a single column, so Q'Yt reduces to a scalar
    rho = Yt .- (Ft .* beta)
    gamma = transpose(C) \ rho
    sigma2 = sum(rho .^ 2, dims = 1) / nt
    detR = prod(diag(C) .^ (2.0 / nt))
    reduced_likelihood_function_value = -nt * log10(sum(sigma2)) - nt * log10(detR)
    return beta, gamma, reduced_likelihood_function_value
end
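
# Sanity sketch on a two-point problem: R is 2 x 2 with (1 + nugget) on the
# diagonal and the pairwise correlation off the diagonal; the BLUP weights
# come back with the expected shapes.
let X = reshape([0.0, 1.0], 2, 1), y_norma = reshape([-1.0, 1.0], 2, 1)
    D, ij = cross_distances(X)
    d = componentwise_distance_PLS(D, "squar_exp", 1, reshape([1.0], 1, 1))
    beta, gamma, rlfv = _reduced_likelihood_function([0.5], "squar_exp", d, 2, ij, y_norma)
    @assert length(beta) == 1 && size(gamma) == (2, 1)
    @assert isfinite(rlfv)
end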

