diff --git a/Project.toml b/Project.toml
index 79a481846..d87569e97 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,6 +11,7 @@ IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

 [compat]
diff --git a/docs/pages.jl b/docs/pages.jl
index d4ce6dd70..b53c2d048 100644
--- a/docs/pages.jl
+++ b/docs/pages.jl
@@ -14,6 +14,7 @@ pages = ["index.md"
         "Polynomial Chaos" => "polychaos.md",
         "Variable Fidelity" => "variablefidelity.md",
         "Gradient Enhanced Kriging" => "gek.md",
+        "GEKPLS" => "gekpls.md",
     ]
     "User guide" => [
         "Samples" => "samples.md",
diff --git a/docs/src/gekpls.md b/docs/src/gekpls.md
new file mode 100644
index 000000000..7a5095d98
--- /dev/null
+++ b/docs/src/gekpls.md
@@ -0,0 +1,84 @@
+## GEKPLS Surrogate Tutorial
+
+Gradient Enhanced Kriging with Partial Least Squares Method (GEKPLS) is a surrogate modeling technique that reduces computation time and improves accuracy for high-dimensional problems. The Julia implementation of GEKPLS is adapted from the Python version by [SMT](https://github.com/SMTorg), which is based on this [paper](https://arxiv.org/pdf/1708.02663.pdf).
+
+The following are the inputs when building a GEKPLS surrogate (a minimal constructor sketch follows the list):
+
+1. X - The matrix containing the training points
+2. y - The vector containing the training outputs associated with each of the training points
+3. grads - The gradients at each of the input X training points
+4. n_comp - Number of components to retain for the partial least squares (PLS) regression
+5. delta_x - The step size to use for the first-order Taylor approximation
+6. xlimits - The lower and upper bounds for the training points
+7. extra_points - The number of additional points to use for the PLS
+8. theta - The hyperparameter to use for the correlation model
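+
+In code, these map onto the positional arguments of the `GEKPLS` constructor in the order listed. As a minimal sketch (the names below are placeholders for the inputs described above):
+
+```julia
+g = GEKPLS(X, y, grads, n_comp, delta_x, xlimits, extra_points, theta)
+```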
+
+The following example illustrates how to use GEKPLS:
+
+```@example gekpls_water_flow
+
+using Surrogates
+using Zygote
+
+function vector_of_tuples_to_matrix(v)
+    #helper function to convert training data generated by surrogate sampling into a matrix suitable for GEKPLS
+    num_rows = length(v)
+    num_cols = length(first(v))
+    K = zeros(num_rows, num_cols)
+    for row in 1:num_rows
+        for col in 1:num_cols
+            K[row, col] = v[row][col]
+        end
+    end
+    return K
+end
+
+function vector_of_tuples_to_matrix2(v)
+    #helper function to convert gradients into matrix form
+    num_rows = length(v)
+    num_cols = length(first(first(v)))
+    K = zeros(num_rows, num_cols)
+    for row in 1:num_rows
+        for col in 1:num_cols
+            K[row, col] = v[row][1][col]
+        end
+    end
+    return K
+end
+
+function water_flow(x)
+    r_w = x[1]
+    r = x[2]
+    T_u = x[3]
+    H_u = x[4]
+    T_l = x[5]
+    H_l = x[6]
+    L = x[7]
+    K_w = x[8]
+    log_val = log(r / r_w)
+    return (2 * pi * T_u * (H_u - H_l)) /
+           (log_val * (1 + (2 * L * T_u / (log_val * r_w^2 * K_w)) + T_u / T_l))
+end
+
+n = 1000
+d = 8
+lb = [0.05, 100, 63070, 990, 63.1, 700, 1120, 9855]
+ub = [0.15, 50000, 115600, 1110, 116, 820, 1680, 12045]
+x = sample(n, lb, ub, SobolSample())
+X = vector_of_tuples_to_matrix(x)
+grads = vector_of_tuples_to_matrix2(gradient.(water_flow, x))
+y = reshape(water_flow.(x), (size(x, 1), 1))
+xlimits = hcat(lb, ub)
+n_test = 100
+x_test = sample(n_test, lb, ub, GoldenSample())
+X_test = vector_of_tuples_to_matrix(x_test)
+y_true = water_flow.(x_test)
+n_comp = 2
+delta_x = 0.0001
+extra_points = 2
+initial_theta = 0.01
+g = GEKPLS(X, y, grads, n_comp, delta_x, xlimits, extra_points, initial_theta)
+y_pred = g(X_test)
+rmse = sqrt(sum(((y_pred - y_true) .^ 2) / n_test)) #root mean squared error
+println(rmse)
+```
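+
+An existing GEKPLS surrogate can also be updated in place with additional samples via `add_point!`. The following is a sketch (not executed as part of these docs) that reuses the helpers and bounds from the example above; `LatinHypercubeSample` is chosen here only so that the new points are unlikely to duplicate the existing Sobol samples:
+
+```julia
+x_new = sample(5, lb, ub, LatinHypercubeSample())
+X_new = vector_of_tuples_to_matrix(x_new)
+grads_new = vector_of_tuples_to_matrix2(gradient.(water_flow, x_new))
+y_new = reshape(water_flow.(x_new), (size(x_new, 1), 1))
+add_point!(g, X_new, y_new, grads_new) #rebuilds the PLS projection and kriging weights
+y_pred_updated = g(X_test)
+```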
diff --git a/src/GEKPLS.jl b/src/GEKPLS.jl
new file mode 100644
index 000000000..255cbc588
--- /dev/null
+++ b/src/GEKPLS.jl
@@ -0,0 +1,537 @@
+using LinearAlgebra
+using Statistics
+
+mutable struct GEKPLS{T <: AbstractFloat} <: AbstractSurrogate
+    x::Matrix{T} #1
+    y::Matrix{T} #2
+    grads::Matrix{T} #3
+    xl::Matrix{T} #xlimits #4
+    delta::T #5
+    extra_points::Int #6
+    num_components::Int #7
+    beta::Vector{T} #8
+    gamma::Matrix{T} #9
+    theta::Vector{T} #10
+    reduced_likelihood_function_value::T #11
+    X_offset::Matrix{T} #12
+    X_scale::Matrix{T} #13
+    X_after_std::Matrix{T} #14 - X after standardization
+    pls_mean::Matrix{T} #15
+    y_mean::T #16
+    y_std::T #17
+end
+
+function bounds_error(x, xl)
+    num_x_rows = size(x, 1)
+    num_dim = size(xl, 1)
+    for i in 1:num_x_rows
+        for j in 1:num_dim
+            if (x[i, j] < xl[j, 1] || x[i, j] > xl[j, 2])
+                return true
+            end
+        end
+    end
+    return false
+end
+
+#constructor for the GEKPLS struct
+function GEKPLS(X, y, grads, n_comp, delta_x, xlimits, extra_points, θ)
+
+    #ensure that X values are within the upper and lower bounds
+    if bounds_error(X, xlimits)
+        println("X values outside bounds")
+        return
+    end
+
+    theta = [θ for i in 1:n_comp]
+    pls_mean, X_after_PLS, y_after_PLS = _ge_compute_pls(X, y, n_comp, grads, delta_x,
+                                                         xlimits, extra_points)
+    X_after_std, y_after_std, X_offset, y_mean, X_scale, y_std = standardization(X_after_PLS,
+                                                                                 y_after_PLS)
+    D, ij = cross_distances(X_after_std)
+    pls_mean_reshaped = reshape(pls_mean, (size(X, 2), n_comp))
+    d = componentwise_distance_PLS(D, "squar_exp", n_comp, pls_mean_reshaped)
+    nt, nd = size(X_after_PLS)
+    beta, gamma, reduced_likelihood_function_value = _reduced_likelihood_function(theta,
+                                                                                  "squar_exp",
+                                                                                  d, nt, ij,
+                                                                                  y_after_std)
+    return GEKPLS(X, y, grads, xlimits, delta_x, extra_points, n_comp, beta, gamma, theta,
+                  reduced_likelihood_function_value,
+                  X_offset, X_scale, X_after_std, pls_mean_reshaped, y_mean, y_std)
+end
+
+# predictor
+function (g::GEKPLS)(X_test)
+    n_eval, n_features_x = size(X_test)
+    X_cont = (X_test .- g.X_offset) ./ g.X_scale
+    dx = differences(X_cont, g.X_after_std)
+    pred_d = componentwise_distance_PLS(dx, "squar_exp", g.num_components, g.pls_mean)
+    nt = size(g.X_after_std, 1)
+    r = transpose(reshape(squar_exp(g.theta, pred_d), (nt, n_eval)))
+    f = ones(n_eval, 1)
+    y_ = (f * g.beta) + (r * g.gamma)
+    y = g.y_mean .+ g.y_std * y_
+    return y
+end
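+
+# Note on the predictor above: it implements the kriging mean prediction
+# ŷ = μ_y + σ_y * (F*β + r*γ), where F is the constant regression basis
+# (a column of ones) and r holds the correlations between the test points
+# and the training points in the reduced PLS space.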
+
+function add_point!(g::GEKPLS, new_x, new_y, new_grads)
+    if new_x in g.x
+        println("Adding a sample that already exists. Cannot build GEKPLS")
+        return
+    end
+
+    if bounds_error(new_x, g.xl)
+        println("x values outside bounds")
+        return
+    end
+
+    g.x = vcat(g.x, new_x)
+    g.y = vcat(g.y, new_y)
+    g.grads = vcat(g.grads, new_grads)
+    pls_mean, X_after_PLS, y_after_PLS = _ge_compute_pls(g.x, g.y, g.num_components,
+                                                         g.grads, g.delta, g.xl,
+                                                         g.extra_points)
+    g.X_after_std, y_after_std, g.X_offset, g.y_mean, g.X_scale, g.y_std = standardization(X_after_PLS,
+                                                                                           y_after_PLS)
+    D, ij = cross_distances(g.X_after_std)
+    g.pls_mean = reshape(pls_mean, (size(g.x, 2), g.num_components))
+    d = componentwise_distance_PLS(D, "squar_exp", g.num_components, g.pls_mean)
+    nt, nd = size(X_after_PLS)
+    g.beta, g.gamma, g.reduced_likelihood_function_value = _reduced_likelihood_function(g.theta,
+                                                                                        "squar_exp",
+                                                                                        d,
+                                                                                        nt,
+                                                                                        ij,
+                                                                                        y_after_std)
+end
+
+function _ge_compute_pls(X, y, n_comp, grads, delta_x, xlimits, extra_points)
+    """
+    Gradient-enhanced PLS coefficients.
+    Parameters
+    ----------
+    X: [n_obs, dim] - The input variables.
+    y: [n_obs, ny] - The output variable.
+    n_comp: int - Number of principal components used.
+    grads: [n_obs, dim] - The gradient values.
+    delta_x: real - The step used in the first-order Taylor approximation.
+    xlimits: [dim, 2] - The upper and lower variable bounds.
+    extra_points: int - The number of extra points per training point.
+    Returns
+    -------
+    pls_mean: [dim, n_comp] - The mean of the absolute PLS coefficients across training points.
+    X: Concatenation of X and XX: [extra_points*nt, dim] - extra points added when extra_points > 0.
+    y: Concatenation of y and yy: [extra_points*nt, 1] - extra points added when extra_points > 0.
+    """
+    # this function is equivalent to a combination of
+    # https://github.com/SMTorg/smt/blob/f124c01ffa78c04b80221dded278a20123dac742/smt/utils/kriging_utils.py#L1036
+    # and https://github.com/SMTorg/smt/blob/f124c01ffa78c04b80221dded278a20123dac742/smt/surrogate_models/gekpls.py#L48
+
+    nt, dim = size(X)
+    XX = zeros(0, dim)
+    yy = zeros(0, size(y)[2])
+    coeff_pls = zeros((dim, n_comp, nt))
+
+    for i in 1:nt
+        if dim >= 3
+            bb_vals = circshift(boxbehnken(dim, 1), 1)
+        else
+            bb_vals = [0.0 0.0; #center
+                       1.0 0.0; #right
+                       0.0 1.0; #up
+                       -1.0 0.0; #left
+                       0.0 -1.0; #down
+                       1.0 1.0; #right up
+                       -1.0 1.0; #left up
+                       -1.0 -1.0; #left down
+                       1.0 -1.0] #right down
+        end
+        _X = zeros((size(bb_vals)[1], dim))
+        _y = zeros((size(bb_vals)[1], 1))
+        bb_vals = bb_vals .* (delta_x * (xlimits[:, 2] - xlimits[:, 1]))' #smt calls this sign. I've called it bb_vals
+        _X = X[i, :]' .+ bb_vals
+        bb_vals = bb_vals .* grads[i, :]'
+        _y = y[i, :] .+ sum(bb_vals, dims = 2)
+
+        #_pls.fit(_X, _y) # relic from the sklearn version; retained for future reference.
+        #coeff_pls[:, :, i] = _pls.x_rotations_ # relic from the sklearn version; retained for future reference.
+
+        coeff_pls[:, :, i] = _modified_pls(_X, _y, n_comp) #_modified_pls returns the equivalent of sklearn's _pls.x_rotations_
+        if extra_points != 0
+            start_index = max(1, length(coeff_pls[:, 1, i]) - extra_points + 1)
+            max_coeff = sortperm(broadcast(abs, coeff_pls[:, 1, i]))[start_index:end]
+            for ii in max_coeff
+                XX = [XX; transpose(X[i, :])]
+                XX[end, ii] += delta_x * (xlimits[ii, 2] - xlimits[ii, 1])
+                yy = [yy; y[i]]
+                yy[end] += grads[i, ii] * delta_x * (xlimits[ii, 2] - xlimits[ii, 1])
+            end
+        end
+    end
+    if extra_points != 0
+        X = [X; XX]
+        y = [y; yy]
+    end
+
+    pls_mean = mean(broadcast(abs, coeff_pls), dims = 3)
+    return pls_mean, X, y
+end
+
+######start of bbdesign######
+
+#
+# Adapted from 'ExperimentalDesign.jl: Design of Experiments in Julia'
+# https://github.com/phrb/ExperimentalDesign.jl
+
+# MIT License
+
+# ExperimentalDesign.jl: Design of Experiments in Julia
+# Copyright (C) 2019 Pedro Bruel
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+
+function boxbehnken(matrix_size::Int)
+    boxbehnken(matrix_size, matrix_size)
+end
+
+function boxbehnken(matrix_size::Int, center::Int)
+    @assert matrix_size >= 3
+
+    A_fact = explicit_fullfactorial(Tuple([-1, 1] for i in 1:2))
+
+    rows = floor(Int, (0.5 * matrix_size * (matrix_size - 1)) * size(A_fact)[1])
+
+    A = zeros(rows, matrix_size)
+
+    l = 0
+    for i in 1:(matrix_size - 1)
+        for j in (i + 1):matrix_size
+            l = l + 1
+            A[(max(0, (l - 1) * size(A_fact)[1]) + 1):(l * size(A_fact)[1]), i] = A_fact[:, 1]
+            A[(max(0, (l - 1) * size(A_fact)[1]) + 1):(l * size(A_fact)[1]), j] = A_fact[:, 2]
+        end
+    end
+
+    if center == matrix_size
+        if matrix_size <= 16
+            points = [0, 0, 3, 3, 6, 6, 6, 8, 9, 10, 12, 12, 13, 14, 15, 16]
+            center = points[matrix_size]
+        end
+    end
+
+    A = transpose(hcat(transpose(A), transpose(zeros(center, matrix_size))))
+end
+
+function explicit_fullfactorial(factors::Tuple)
+    explicit_fullfactorial(fullfactorial(factors))
+end
+
+function explicit_fullfactorial(iterator::Base.Iterators.ProductIterator)
+    hcat(vcat.(collect(iterator)...)...)
+end
+
+function fullfactorial(factors::Tuple)
+    Base.Iterators.product(factors...)
+end
+
+######end of bb design######
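+
+# Note: boxbehnken(3, 1) yields a 13×3 design, namely the 12 (+/-1, +/-1) pairs
+# placed on the three coordinate planes plus a single all-zero center row; this
+# shape follows from the row-count formula above and is stated here only as a
+# reading aid for _ge_compute_pls, which calls circshift(boxbehnken(dim, 1), 1).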
+
+function standardization(X, y)
+    """
+    We subtract the mean from each variable, then divide the values of each
+    variable by its standard deviation.
+
+    Parameters
+    ----------
+
+    X - The input variables.
+    y - The output variable.
+
+    Returns
+    -------
+
+    X: [n_obs, dim]
+    - The standardized input matrix.
+
+    y: [n_obs, 1]
+    - The standardized output vector.
+
+    X_offset: The mean (or the min if scale_X_to_unit=True) of each input variable.
+
+    y_mean: The mean of the output variable.
+
+    X_scale: The standard deviation of each input variable.
+
+    y_std: The standard deviation of the output variable.
+
+    """
+    #Equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L21
+    X_offset = mean(X, dims = 1)
+    X_scale = std(X, dims = 1)
+    X_scale = map(x -> x == 0.0 ? 1.0 : x, X_scale) #to prevent division by 0 below
+    y_mean = mean(y)
+    y_std = std(y)
+    y_std = (y_std == 0) ? 1.0 : y_std #to prevent division by 0 below
+    X = (X .- X_offset) ./ X_scale
+    y = (y .- y_mean) ./ y_std
+    return X, y, X_offset, y_mean, X_scale, y_std
+end
+
+function cross_distances(X)
+    """
+    Computes the nonzero componentwise cross-distances between the vectors
+    in X.
+
+    Parameters
+    ----------
+
+    X: [n_obs, dim]
+
+    Returns
+    -------
+    D: [n_obs * (n_obs - 1) / 2, dim]
+    - The cross-distances between the vectors in X.
+
+    ij: [n_obs * (n_obs - 1) / 2, 2]
+    - The indices i and j of the vectors in X associated with the
+      cross-distances in D.
+    """
+    # equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L86
+    n_samples, n_features = size(X)
+    n_nonzero_cross_dist = (n_samples * (n_samples - 1)) ÷ 2
+    ij = zeros((n_nonzero_cross_dist, 2))
+    D = zeros((n_nonzero_cross_dist, n_features))
+    ll_1 = 0
+
+    for k in 1:(n_samples - 1)
+        ll_0 = ll_1 + 1
+        ll_1 = ll_0 + n_samples - k - 1
+        ij[ll_0:ll_1, 1] .= k
+        ij[ll_0:ll_1, 2] = (k + 1):1:n_samples
+        D[ll_0:ll_1, :] = -(X[(k + 1):n_samples, :] .- X[k, :]')
+    end
+    return D, Int.(ij)
+end
+
+function componentwise_distance_PLS(D, corr, n_comp, coeff_pls)
+    """
+    Computes the nonzero componentwise cross-spatial-correlation-distance
+    between the vectors in X.
+
+    Equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L1257
+    with some simplifications (theta and return_derivative are removed, as they are not required for GEKPLS).
+
+    Parameters
+    ----------
+
+    D: [n_obs * (n_obs - 1) / 2, dim]
+    - The L1 cross-distances between the vectors in X.
+
+    corr: str
+    - Name of the correlation function used: squar_exp or abs_exp.
+
+    n_comp: int
+    - Number of principal components used.
+
+    coeff_pls: [dim, n_comp]
+    - The PLS coefficients.
+
+    Returns
+    -------
+
+    D_corr: [n_obs * (n_obs - 1) / 2, n_comp]
+    - The componentwise cross-spatial-correlation-distance between the
+      vectors in X.
+
+    """
+    #todo
+    #figure out how to handle this computation in the case of very large matrices,
+    #similar to what SMT has done
+    #at https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L1257
+    D_corr = zeros((size(D)[1], n_comp))
+
+    if corr == "squar_exp"
+        D_corr = D .^ 2 * coeff_pls .^ 2
+    else #abs_exp
+        D_corr = abs.(D) * abs.(coeff_pls)
+    end
+
+    return D_corr
+end
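+
+# Worked shape example (illustrative): for 3 training points in 8 dimensions,
+# cross_distances returns D of size 3×8 (one row per pair (1,2), (1,3), (2,3))
+# together with ij = [1 2; 1 3; 2 3]; componentwise_distance_PLS then projects
+# D onto the PLS components, giving a 3×n_comp matrix of working distances.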
+
+function squar_exp(theta, d)
+    """
+    Squared exponential correlation model.
+    Equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L604
+    Parameters
+    ----------
+    theta: hyperparameters of the correlation model.
+    d: componentwise distances from componentwise_distance_PLS.
+
+    Returns
+    -------
+    r: array containing the values of the autocorrelation model.
+
+    """
+    n_components = size(d)[2]
+    theta = reshape(theta, (1, n_components))
+    return exp.(-sum(theta .* d, dims = 2))
+end
+
+function differences(X, Y)
+    #equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L392
+    #code credit: Elias Carvalho - https://stackoverflow.com/questions/72392010/row-wise-operations-between-matrices-in-julia
+    Rx = repeat(X, inner = (size(Y, 1), 1))
+    Ry = repeat(Y, size(X, 1))
+    return Rx - Ry
+end
+
+function _reduced_likelihood_function(theta, kernel_type, d, nt, ij, y_norma, noise = 0.0)
+    """
+    This function is a loose translation of the SMT code from
+    https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/surrogate_models/krg_based.py#L247
+    It determines the BLUP parameters and evaluates the reduced likelihood function for the given theta.
+
+    Parameters
+    ----------
+    theta: array containing the parameters at which the Gaussian process model parameters should be determined.
+    kernel_type: name of the correlation function.
+    d: the componentwise cross-spatial-correlation-distance between the vectors in X.
+    nt: number of training points.
+    ij: the indices i and j of the vectors in X associated with the cross-distances in D.
+    y_norma: standardized y values.
+    noise: noise hyperparameter - increasing noise reduces reduced_likelihood_function_value.
+
+    Returns
+    -------
+    beta: generalized least-squares regression weights.
+    gamma: Gaussian process weights.
+    reduced_likelihood_function_value: real
+    - The value of the reduced likelihood function associated with the given autocorrelation parameters theta.
+
+    """
+    reduced_likelihood_function_value = -Inf
+    nugget = 1000000.0 * eps() #a jitter for numerical stability; reducing the multiplier below 1000000.0 leads to a positive-definiteness error in the Cholesky decomposition
+    if kernel_type == "squar_exp" #todo - add other kernel types, e.g. abs_exp
+        r = squar_exp(theta, d)
+    end
+    R = (I + zeros(nt, nt)) .* (1.0 + nugget + noise)
+
+    for k in 1:size(ij)[1]
+        R[ij[k, 1], ij[k, 2]] = r[k]
+        R[ij[k, 2], ij[k, 1]] = r[k]
+    end
+
+    C = cholesky(R).L #todo - values diverge at this point from the SMT code; verify impact
+    F = ones(nt, 1) #todo - examine if this should be a parameter for this function
+    Ft = C \ F
+    Q, G = qr(Ft)
+    Q = Array(Q)
+    Yt = C \ y_norma
+    #todo - in SMT, they check if the matrix is ill-conditioned using SVD. Verify and include if necessary.
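+    # The remaining steps follow standard kriging GLS algebra: beta solves the
+    # generalized least-squares problem through the QR factors of Ft, the
+    # kriging weights are gamma = C' \ (Yt - Ft*beta), sigma2 estimates the
+    # process variance, and the determinant term in the reduced likelihood is
+    # taken from the Cholesky diagonal for numerical stability.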
+    beta = G \ [(transpose(Q) ⋅ Yt)]
+    rho = Yt .- (Ft .* beta)
+    gamma = transpose(C) \ rho
+    sigma2 = sum((rho) .^ 2, dims = 1) / nt
+    detR = prod(diag(C) .^ (2.0 / nt))
+    reduced_likelihood_function_value = -nt * log10(sum(sigma2)) - nt * log10(detR)
+    return beta, gamma, reduced_likelihood_function_value
+end
+
+### MODIFIED PLS BELOW ###
+
+# The code below is a simplified version of
+# sklearn's PLS
+# https://github.com/scikit-learn/scikit-learn/blob/80598905e/sklearn/cross_decomposition/_pls.py
+
+function _center_scale(X, Y)
+    x_mean = mean(X, dims = 1)
+    X .-= x_mean
+    y_mean = mean(Y, dims = 1)
+    Y .-= y_mean
+    x_std = std(X, dims = 1)
+    x_std[x_std .== 0] .= 1.0
+    X ./= x_std
+    y_std = std(Y, dims = 1)
+    y_std[y_std .== 0] .= 1.0
+    Y ./= y_std
+    return X, Y
+end
+
+function _svd_flip_1d(u, v)
+    # equivalent of https://github.com/scikit-learn/scikit-learn/blob/80598905e517759b4696c74ecc35c6e2eb508cff/sklearn/cross_decomposition/_pls.py#L149
+    biggest_abs_val_idx = findmax(abs.(vec(u)))[2]
+    sign_ = sign(u[biggest_abs_val_idx])
+    u .*= sign_
+    v .*= sign_
+end
+
+function _get_first_singular_vectors_power_method(X, Y)
+    my_eps = eps()
+    y_score = vec(Y)
+    x_weights = transpose(X) * y_score / dot(y_score, y_score)
+    x_weights ./= (sqrt(dot(x_weights, x_weights)) + my_eps)
+    x_score = X * x_weights
+    y_weights = transpose(Y) * x_score / dot(x_score, x_score)
+    y_score = Y * y_weights / (dot(y_weights, y_weights) + my_eps)
+    #equivalent in intent to https://github.com/scikit-learn/scikit-learn/blob/80598905e517759b4696c74ecc35c6e2eb508cff/sklearn/cross_decomposition/_pls.py#L66
+    if any(isnan.(x_weights)) || any(isnan.(y_weights))
+        return false, false
+    end
+    return x_weights, y_weights
+end
+
+function _modified_pls(X, Y, n_components)
+    x_weights_ = zeros(size(X, 2), n_components)
+    _x_scores = zeros(size(X, 1), n_components)
+    x_loadings_ = zeros(size(X, 2), n_components)
+    Xk, Yk = _center_scale(X, Y)
+
+    for k in 1:n_components
+        x_weights, y_weights = _get_first_singular_vectors_power_method(Xk, Yk)
+
+        if x_weights == false
+            break
+        end
+
+        _svd_flip_1d(x_weights, y_weights)
+        x_scores = Xk * x_weights
+        x_loadings = transpose(x_scores) * Xk / dot(x_scores, x_scores)
+        Xk = Xk - (x_scores * x_loadings)
+        y_loadings = transpose(x_scores) * Yk / dot(x_scores, x_scores)
+        Yk = Yk - x_scores * y_loadings
+        x_weights_[:, k] = x_weights
+        _x_scores[:, k] = x_scores
+        x_loadings_[:, k] = vec(x_loadings)
+    end
+
+    x_rotations_ = x_weights_ * pinv(transpose(x_loadings_) * x_weights_)
+    return x_rotations_
+end
+
+### MODIFIED PLS ABOVE ###
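+
+# Note: _modified_pls runs one power-method pass per component to approximate
+# the leading singular-vector pair of Xk' * Yk, deflates Xk and Yk, and forms
+# x_rotations_, the analogue of sklearn's `x_rotations_` that _ge_compute_pls
+# consumes above.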
diff --git a/src/Surrogates.jl b/src/Surrogates.jl
index ef2137fdf..eaf50ed44 100644
--- a/src/Surrogates.jl
+++ b/src/Surrogates.jl
@@ -12,12 +12,12 @@ include("Lobachevsky.jl")
 include("LinearSurrogate.jl")
 include("InverseDistanceSurrogate.jl")
 include("SecondOrderPolynomialSurrogate.jl")
-
 include("Wendland.jl")
 include("MOE.jl") #rewrite gaussian mixture with own algorithm to fix deps issue
 include("VariableFidelity.jl")
 include("Earth.jl")
 include("GEK.jl")
+include("GEKPLS.jl")

 current_surrogates = ["Kriging", "LinearSurrogate", "LobachevskySurrogate",
                       "NeuralSurrogate",
@@ -81,6 +81,7 @@ function PolyChaosStructure(; op)
 end

 export current_surrogates
+export GEKPLS
 export RadialBasisStructure, KrigingStructure, LinearStructure, InverseDistanceStructure
 export LobachevskyStructure, NeuralStructure, RandomForestStructure,
        SecondOrderPolynomialStructure
diff --git a/test/GEKPLS.jl b/test/GEKPLS.jl
new file mode 100644
index 000000000..26f33f39b
--- /dev/null
+++ b/test/GEKPLS.jl
@@ -0,0 +1,239 @@
+using Surrogates
+using Zygote
+
+function vector_of_tuples_to_matrix(v)
+    #convert training data generated by surrogate sampling into a matrix suitable for GEKPLS
+    num_rows = length(v)
+    num_cols = length(first(v))
+    K = zeros(num_rows, num_cols)
+    for row in 1:num_rows
+        for col in 1:num_cols
+            K[row, col] = v[row][col]
+        end
+    end
+    return K
+end
+
+function vector_of_tuples_to_matrix2(v)
+    #convert gradients into matrix form
+    num_rows = length(v)
+    num_cols = length(first(first(v)))
+    K = zeros(num_rows, num_cols)
+    for row in 1:num_rows
+        for col in 1:num_cols
+            K[row, col] = v[row][1][col]
+        end
+    end
+    return K
+end
+
+## water flow function tests
+function water_flow(x)
+    r_w = x[1]
+    r = x[2]
+    T_u = x[3]
+    H_u = x[4]
+    T_l = x[5]
+    H_l = x[6]
+    L = x[7]
+    K_w = x[8]
+    log_val = log(r / r_w)
+    return (2 * pi * T_u * (H_u - H_l)) /
+           (log_val * (1 + (2 * L * T_u / (log_val * r_w^2 * K_w)) + T_u / T_l))
+end
+
+n = 1000
+d = 8
+lb = [0.05, 100, 63070, 990, 63.1, 700, 1120, 9855]
+ub = [0.15, 50000, 115600, 1110, 116, 820, 1680, 12045]
+x = sample(n, lb, ub, SobolSample())
+X = vector_of_tuples_to_matrix(x)
+grads = vector_of_tuples_to_matrix2(gradient.(water_flow, x))
+y = reshape(water_flow.(x), (size(x, 1), 1))
+xlimits = hcat(lb, ub)
+n_test = 100
+x_test = sample(n_test, lb, ub, GoldenSample())
+X_test = vector_of_tuples_to_matrix(x_test)
+y_true = water_flow.(x_test)
+
+@testset "Test 1: Water Flow Function Test (dimensions = 8; n_comp = 2; extra_points = 2)" begin
+    n_comp = 2
+    delta_x = 0.0001
+    extra_points = 2
+    initial_theta = 0.01
+    g = GEKPLS(X, y, grads, n_comp, delta_x, xlimits, extra_points, initial_theta)
+    y_pred = g(X_test)
+    rmse = sqrt(sum(((y_pred - y_true) .^ 2) / n_test))
+    @test isapprox(rmse, 0.03, atol = 0.02) #rmse: 0.039
+end
+
+@testset "Test 2: Water Flow Function Test (dimensions = 8; n_comp = 3; extra_points = 2)" begin
+    n_comp = 3
+    delta_x = 0.0001
+    extra_points = 2
+    initial_theta = 0.01
+    g = GEKPLS(X, y, grads, n_comp, delta_x, xlimits, extra_points, initial_theta)
+    y_pred = g(X_test)
+    rmse = sqrt(sum(((y_pred - y_true) .^ 2) / n_test))
+    @test isapprox(rmse, 0.02, atol = 0.01) #rmse: 0.027
+end
+
+@testset "Test 3: Water Flow Function Test (dimensions = 8; n_comp = 3; extra_points = 3)" begin
+    n_comp = 3
+    delta_x = 0.0001
+    extra_points = 3
+    initial_theta = 0.01
+    g = GEKPLS(X, y, grads, n_comp, delta_x, xlimits, extra_points, initial_theta)
+    y_pred = g(X_test)
+    rmse = sqrt(sum(((y_pred - y_true) .^ 2) / n_test))
+    @test isapprox(rmse, 0.02, atol = 0.01) #rmse: 0.027
+end
+
+## welded beam tests
+function welded_beam(x)
+    h = x[1]
+    l = x[2]
+    t = x[3]
+    a = 6000 / (sqrt(2) * h * l)
+    b = (6000 * (14 + 0.5 * l) * sqrt(0.25 * (l^2 + (h + t)^2))) /
+        (2 * (0.707 * h * l * (l^2 / 12 + 0.25 * (h + t)^2)))
+    return (sqrt(a^2 + b^2 + l * a * b)) / (sqrt(0.25 * (l^2 + (h + t)^2)))
+end
+
+n = 1000
+d = 3
+lb = [0.125, 5.0, 5.0]
+ub = [1.0, 10.0, 10.0]
+x = sample(n, lb, ub, SobolSample())
+X = vector_of_tuples_to_matrix(x)
+grads = vector_of_tuples_to_matrix2(gradient.(welded_beam, x))
+y = reshape(welded_beam.(x), (size(x, 1), 1))
+xlimits = hcat(lb, ub)
+n_test = 100
+x_test = sample(n_test, lb, ub, GoldenSample())
+X_test = vector_of_tuples_to_matrix(x_test)
+y_true = welded_beam.(x_test)
+
+@testset "Test 4: Welded Beam Function Test (dimensions = 3; n_comp = 3; extra_points = 2)" begin
+    n_comp = 3
+    delta_x = 0.0001
+    extra_points = 2
+    initial_theta = 0.01
+    g = GEKPLS(X, y, grads, n_comp, delta_x, xlimits, extra_points, initial_theta)
+    y_pred = g(X_test)
+    rmse = sqrt(sum(((y_pred - y_true) .^ 2) / n_test))
+    @test isapprox(rmse, 39.0, atol = 0.5) #rmse: 38.988
+end
+
+@testset "Test 5: Welded Beam Function Test (dimensions = 3; n_comp = 2; extra_points = 2)" begin
+    n_comp = 2
+    delta_x = 0.0001
+    extra_points = 2
+    initial_theta = 0.01
+    g = GEKPLS(X, y, grads, n_comp, delta_x, xlimits, extra_points, initial_theta)
+    y_pred = g(X_test)
+    rmse = sqrt(sum(((y_pred - y_true) .^ 2) / n_test))
+    @test isapprox(rmse, 39.5, atol = 0.5) #rmse: 39.481
+end
+
+## increasing extra points increases accuracy
+@testset "Test 6: Welded Beam Function Test (dimensions = 3; n_comp = 2; extra_points = 4)" begin
+    n_comp = 2
+    delta_x = 0.0001
+    extra_points = 4
+    initial_theta = 0.01
+    g = GEKPLS(X, y, grads, n_comp, delta_x, xlimits, extra_points, initial_theta)
+    y_pred = g(X_test)
+    rmse = sqrt(sum(((y_pred - y_true) .^ 2) / n_test))
+    @test isapprox(rmse, 37.5, atol = 0.5) #rmse: 37.87
+end
+
+## sphere function tests
+function sphere_function(x)
+    return sum(x .^ 2)
+end
+
+## 3D
+n = 100
+lb = [-5.0, -5.0, -5.0]
+ub = [5.0, 5.0, 5.0]
+x = sample(n, lb, ub, SobolSample())
+X = vector_of_tuples_to_matrix(x)
+grads = vector_of_tuples_to_matrix2(gradient.(sphere_function, x))
+y = reshape(sphere_function.(x), (size(x, 1), 1))
+xlimits = hcat(lb, ub)
+n_test = 100
+x_test = sample(n_test, lb, ub, GoldenSample())
+X_test = vector_of_tuples_to_matrix(x_test)
+y_true = sphere_function.(x_test)
+
+@testset "Test 7: Sphere Function Test (dimensions = 3; n_comp = 2; extra_points = 2)" begin
+    n_comp = 2
+    delta_x = 0.0001
+    extra_points = 2
+    initial_theta = 0.01
+    g = GEKPLS(X, y, grads, n_comp, delta_x, xlimits, extra_points, initial_theta)
+    y_pred = g(X_test)
+    rmse = sqrt(sum(((y_pred - y_true) .^ 2) / n_test))
+    @test isapprox(rmse, 0.001, atol = 0.05) #rmse: 0.00083
+end
+
+## 2D
+n = 50
+d = 2
+lb = [-10.0, -10.0]
+ub = [10.0, 10.0]
+x = sample(n, lb, ub, SobolSample())
+X = vector_of_tuples_to_matrix(x)
+grads = vector_of_tuples_to_matrix2(gradient.(sphere_function, x))
+y = reshape(sphere_function.(x), (size(x, 1), 1))
+xlimits = hcat(lb, ub)
+n_test = 10
+x_test = sample(n_test, lb, ub, GoldenSample())
+X_test = vector_of_tuples_to_matrix(x_test)
+y_true = sphere_function.(x_test)
+
+@testset "Test 8: Sphere Function Test (dimensions = 2; n_comp = 2; extra_points = 2)" begin
+    n_comp = 2
+    delta_x = 0.0001
+    extra_points = 2
+    initial_theta = 0.01
+    g = GEKPLS(X, y, grads, n_comp, delta_x, xlimits, extra_points, initial_theta)
+    y_pred = g(X_test)
+    rmse = sqrt(sum(((y_pred - y_true) .^ 2) / n_test))
+    @test isapprox(rmse, 0.1, atol = 0.5) #rmse: 0.0022
+end
+
+@testset "Test 9: Add Point Test (dimensions = 3; n_comp = 2; extra_points = 2)" begin
+    #first we create a surrogate model with just 3 input points
+    initial_x_vec = [(1.0, 2.0, 3.0), (4.0, 5.0, 6.0), (7.0, 8.0, 9.0)]
+    initial_y = reshape(sphere_function.(initial_x_vec), (size(initial_x_vec, 1), 1))
+    initial_X = vector_of_tuples_to_matrix(initial_x_vec)
+    initial_grads = vector_of_tuples_to_matrix2(gradient.(sphere_function, initial_x_vec))
+    lb = [-5.0, -5.0, -5.0]
+    ub = [10.0, 10.0, 10.0]
+    xlimits = hcat(lb, ub)
+    n_comp = 2
+    delta_x = 0.0001
+    extra_points = 2
+    initial_theta = 0.01
+    g = GEKPLS(initial_X, initial_y, initial_grads, n_comp, delta_x, xlimits, extra_points,
+               initial_theta)
+    n_test = 100
+    x_test = sample(n_test, lb, ub, GoldenSample())
+    X_test = vector_of_tuples_to_matrix(x_test)
+    y_true = sphere_function.(x_test)
+    y_pred1 = g(X_test)
+    rmse1 = sqrt(sum(((y_pred1 - y_true) .^ 2) / n_test)) #rmse1 = 31.91
+
+    #then we update the model with more points to see if performance improves
+    n = 100
+    x = sample(n, lb, ub, SobolSample())
+    X = vector_of_tuples_to_matrix(x)
+    grads = vector_of_tuples_to_matrix2(gradient.(sphere_function, x))
+    y = reshape(sphere_function.(x), (size(x, 1), 1))
+    add_point!(g, X, y, grads)
+    y_pred2 = g(X_test)
+    rmse2 = sqrt(sum(((y_pred2 - y_true) .^ 2) / n_test)) #rmse2 = 0.0015
+    @test (rmse2 < rmse1)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 9a5353848..e435b486f 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -15,6 +15,7 @@ for pkg in ["SurrogatesAbstractGPs", "SurrogatesFlux", "SurrogatesPolyChaos",
     end
 end

+@time @safetestset "GEKPLS.jl" begin include("GEKPLS.jl") end
 @time @safetestset "Radials.jl" begin include("Radials.jl") end
 @time @safetestset "Kriging.jl" begin include("Kriging.jl") end
 @time @safetestset "Sampling" begin include("sampling.jl") end