diff --git a/README.md b/README.md index 41fc08d..fb0e1e5 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,7 @@ Note how a few "unlucky" outliers can bias the least squares estimate (LS), but - October 2014 by Z. Danziger: Original version. - September 2015 by Z. Danziger: Updated help, speed increase for 2D case - March 2022 by J. Keyser: Adjusted formatting, added documentation, improved example and added plot, replaced `nanmedian(X)` with `median(X, 'omitnan')`, removed 2D special case, restructured input and output parameters. +- August 2022 by J. Keyser: Explicit omission of identical x coordinates, added coefficient of determination (unadjusted R²) as optional output. ## Contributing and project status diff --git a/TheilSen.m b/TheilSen.m index 7d2f7df..adf0270 100644 --- a/TheilSen.m +++ b/TheilSen.m @@ -1,4 +1,4 @@ -function coef = TheilSen(X, y) +function [coef, rsqrd] = TheilSen(X, y) % THEILSEN performs Theil-Sen robust simple linear regression(s) of X on y. % % For convenience, the input X may contain more than one predictor variable; @@ -18,13 +18,14 @@ % y: A column vector containing the observations of the response variable. % % OUTPUT -% coef: Estimated regression coefficients for each predictor column in X, with -% respect to the response variable y. Each column in coef corresponds -% to one predictor in X, i.e. it will have as many columns as X. -% The first row, i.e. coef(1, :), contains the estimated offset(s). -% The second row, i.e. coef(2, :), contains the estimated slope(s). -% (This output format was chosen to avoid confusion, e.g. with previous -% versions of this code.) +% coef: Estimated regression coefficients for each predictor column in X, +% with respect to the response variable y. Each column in coef +% corresponds to one predictor in X, i.e. it has as many columns as X. +% The first row, i.e. coef(1, :), contains the estimated offset(s). +% The second row, i.e. coef(2, :), contains the estimated slope(s). +% (This output format was chosen to avoid confusion, e.g. with previous +% versions of this code.) +% rsqrd: Ordinary R² (coefficient of determination) per predictor column in X. % % EXAMPLE % See accompanying file example.m. @@ -109,4 +110,17 @@ warning('TheilSen:NaNoutput', ... 'Output contains NaN; check for degenerate inputs.') end + +% If requested, output the ordinary/unadjusted R², per predictor column in X. +if nargout > 1 + % compute total sum of squares relative to the mean + sum_sq_total = sum(power(y - mean(y), 2)); + sums_sq_total = repmat(sum_sq_total, 1, Num_Pred); + % compute the residual sum(s) of squares relative to the regression line(s) + ys_model = b0s + b1s .* X; + ys_data = repmat(y, 1, Num_Pred); + sums_sq_resid = sum(power(ys_data - ys_model, 2)); + % compute R² as 1 - SSresid / SStotal (i.e., the unadjusted version of R²) + rsqrd = 1 - sums_sq_resid ./ sums_sq_total; +end end diff --git a/example.m b/example.m index f3c4412..ad781c5 100644 --- a/example.m +++ b/example.m @@ -30,7 +30,7 @@ est_ls = [ones(N_total, 1), data_x] \ data_y; % Estimate Theil-Sen parameters. -est_ts = TheilSen(data_x, data_y); +[est_ts, ~] = TheilSen(data_x, data_y); % Plot everything and add comparison of estimates to title. figure() @@ -52,9 +52,9 @@ data_x2 = +10 - 2 * data_x1 + randn(size(data_x1)) * SDx_usual; data_x3 = -50 + 5 * data_x1 + randn(size(data_x1)) * SDx_usual; X = [data_x1, data_x2, data_x3]; -est_ts = TheilSen(X, data_y); +[est_ts, r_sqrd] = TheilSen(X, data_y); -% plot both simple regressions; note mainly the difference between x-axes +% plot all simple regressions; note mainly the difference between x-axes num_pred = size(X, 2); figure() for pp = 1:num_pred