From 4f2ec714eb84f5ba8af4e149ba4afce1c02539e2 Mon Sep 17 00:00:00 2001 From: Ian Grooms Date: Fri, 12 Jul 2024 12:09:20 -0600 Subject: [PATCH 01/14] Add KQCEF First pass at adding the kernel density estimation based QCEF into latest version of DART. --- .../assimilation/algorithm_info_mod.f90 | 15 +- .../modules/assimilation/assim_tools_mod.f90 | 169 ++- .../assimilation/distribution_params_mod.f90 | 4 +- .../assimilation/kde_distribution_mod.f90 | 996 ++++++++++++++++++ .../assimilation/probit_transform_mod.f90 | 181 +++- .../modules/assimilation/rootfinding_mod.f90 | 267 +++++ .../programs/test_kde_dist/test_kde_dist.f90 | 8 + .../programs/test_kde_dist/test_kde_dist.rst | 1 + 8 files changed, 1632 insertions(+), 9 deletions(-) create mode 100644 assimilation_code/modules/assimilation/kde_distribution_mod.f90 create mode 100644 assimilation_code/modules/assimilation/rootfinding_mod.f90 create mode 100644 assimilation_code/programs/test_kde_dist/test_kde_dist.f90 create mode 100644 assimilation_code/programs/test_kde_dist/test_kde_dist.rst diff --git a/assimilation_code/modules/assimilation/algorithm_info_mod.f90 b/assimilation_code/modules/assimilation/algorithm_info_mod.f90 index 42d5f0c218..1eb01207cd 100644 --- a/assimilation_code/modules/assimilation/algorithm_info_mod.f90 +++ b/assimilation_code/modules/assimilation/algorithm_info_mod.f90 @@ -22,7 +22,7 @@ module algorithm_info_mod use distribution_params_mod, only : NORMAL_DISTRIBUTION, BOUNDED_NORMAL_RH_DISTRIBUTION, & GAMMA_DISTRIBUTION, BETA_DISTRIBUTION, LOG_NORMAL_DISTRIBUTION, UNIFORM_DISTRIBUTION, & - PARTICLE_FILTER_DISTRIBUTION + PARTICLE_FILTER_DISTRIBUTION, KDE_DISTRIBUTION implicit none private @@ -40,12 +40,13 @@ module algorithm_info_mod integer, parameter :: OBS_PARTICLE = 4 integer, parameter :: UNBOUNDED_RHF = 8 integer, parameter :: GAMMA_FILTER = 11 -integer, parameter :: BOUNDED_NORMAL_RHF = 101 +integer, parameter :: BOUNDED_NORMAL_RHF = 101 +integer, parameter :: KERNEL_QCEF = 102 public :: obs_error_info, probit_dist_info, obs_inc_info, & init_algorithm_info_mod, end_algorithm_info_mod, & EAKF, ENKF, BOUNDED_NORMAL_RHF, UNBOUNDED_RHF, & - GAMMA_FILTER, KERNEL, OBS_PARTICLE + GAMMA_FILTER, KERNEL, OBS_PARTICLE, KERNEL_QCEF ! type definitions for the QCF table type obs_error_info_type @@ -223,6 +224,8 @@ subroutine read_qceff_table(qceff_table_filename) qceff_table_data(row)%probit_inflation%dist_type = LOG_NORMAL_DISTRIBUTION case ('UNIFORM_DISTRIBUTION') qceff_table_data(row)%probit_inflation%dist_type = UNIFORM_DISTRIBUTION + case ('KDE_DISTRIBUTION') + qceff_table_data(row)%probit_inflation%dist_type = KDE_DISTRIBUTION !!!case ('PARTICLE_FILTER_DISTRIBUTION') !!!qceff_table_data(row)%probit_inflation%dist_type = PARTICLE_FILTER_DISTRIBUTION case default @@ -246,6 +249,8 @@ subroutine read_qceff_table(qceff_table_filename) qceff_table_data(row)%probit_state%dist_type = LOG_NORMAL_DISTRIBUTION case ('UNIFORM_DISTRIBUTION') qceff_table_data(row)%probit_state%dist_type = UNIFORM_DISTRIBUTION + case ('KDE_DISTRIBUTION') + qceff_table_data(row)%probit_inflation%dist_type = KDE_DISTRIBUTION !!!case ('PARTICLE_FILTER_DISTRIBUTION') !!!qceff_table_data(row)%probit_state%dist_type = PARTICLE_FILTER_DISTRIBUTION case default @@ -268,6 +273,8 @@ subroutine read_qceff_table(qceff_table_filename) qceff_table_data(row)%probit_extended_state%dist_type = LOG_NORMAL_DISTRIBUTION case ('UNIFORM_DISTRIBUTION') qceff_table_data(row)%probit_extended_state%dist_type = UNIFORM_DISTRIBUTION + case ('KDE_DISTRIBUTION') + qceff_table_data(row)%probit_inflation%dist_type = KDE_DISTRIBUTION !!!case ('PARTICLE_FILTER_DISTRIBUTION') !!!qceff_table_data(row)%probit_extended_state%dist_type = PARTICLE_FILTER_DISTRIBUTION case default @@ -290,6 +297,8 @@ subroutine read_qceff_table(qceff_table_filename) qceff_table_data(row)%obs_inc_info%filter_kind = GAMMA_FILTER case ('BOUNDED_NORMAL_RHF') qceff_table_data(row)%obs_inc_info%filter_kind = BOUNDED_NORMAL_RHF + case ('KERNEL_QCEF') + qceff_table_data(row)%obs_inc_info%filter_kind = KERNEL_QCEF case default write(errstring, *) 'Invalid filter kind: ', trim(filter_kind_string(row)) call error_handler(E_ERR, 'read_qceff_table:', errstring, source) diff --git a/assimilation_code/modules/assimilation/assim_tools_mod.f90 b/assimilation_code/modules/assimilation/assim_tools_mod.f90 index d1b6e2bbfc..6054c0b9b9 100644 --- a/assimilation_code/modules/assimilation/assim_tools_mod.f90 +++ b/assimilation_code/modules/assimilation/assim_tools_mod.f90 @@ -78,13 +78,16 @@ module assim_tools_mod use algorithm_info_mod, only : probit_dist_info, obs_inc_info, EAKF, ENKF, & BOUNDED_NORMAL_RHF, UNBOUNDED_RHF, GAMMA_FILTER, & - KERNEL, OBS_PARTICLE + KERNEL, OBS_PARTICLE, KERNEL_QCEF use gamma_distribution_mod, only : gamma_cdf, inv_gamma_cdf, gamma_mn_var_to_shape_scale, & gamma_gamma_prod use bnrh_distribution_mod, only : inv_bnrh_cdf, bnrh_cdf, inv_bnrh_cdf_like +use kde_distribution_mod, only : kde_cdf_params, inv_kde_cdf_params, obs_dist_types, & + pack_kde_params, likelihood_function, separate_ensemble + use distribution_params_mod, only : distribution_params_type, deallocate_distribution_params @@ -887,7 +890,7 @@ end subroutine filter_assim !------------------------------------------------------------- subroutine obs_increment(ens_in, ens_size, obs, obs_var, obs_kind, obs_inc, & - inflate, my_cov_inflate, my_cov_inflate_sd, net_a) + inflate, my_cov_inflate, my_cov_inflate_sd, net_a, obs_dist_type) ! Given the ensemble prior for an observation, the observation, and ! the observation error variance, computes increments and adjusts @@ -900,6 +903,7 @@ subroutine obs_increment(ens_in, ens_size, obs, obs_var, obs_kind, obs_inc, & type(adaptive_inflate_type), intent(inout) :: inflate real(r8), intent(inout) :: my_cov_inflate, my_cov_inflate_sd real(r8), intent(out) :: net_a +integer, optional, intent(in) :: obs_dist_type real(r8) :: ens(ens_size), inflate_inc(ens_size) real(r8) :: prior_mean, prior_var, new_val(ens_size) @@ -1025,8 +1029,28 @@ subroutine obs_increment(ens_in, ens_size, obs, obs_var, obs_kind, obs_inc, & !!!endif !-------------------------------------------------------------------------- + + else if(filter_kind == KERNEL_QCEF) then + ! obs_dist_type is an integer whose value is provided by kde_distribution_mod + ! Options are uninformative, normal, binomial, gamma, + ! inv_gamma, lognormal, and truncated_normal. Details can be found in the + ! definition of the likelihood function in kde_distribution_mod.f90. Different + ! types of observations have different interpretations for `obs_var'. + if (.not. present(obs_dist_type)) then +! call error_handler(E_ERR,'obs_increment', & +! 'For kde filter, must specify obs_dist_type', source) + ! IG: Hack. Assume that the obs dist type is truncated normal with the same + ! bounds as the observed variable. + call obs_increment_kde(ens, ens_size, obs, obs_var, & + obs_dist_types%truncated_normal, bounded_below, & + bounded_above, lower_bound, upper_bound, obs_inc) + else + call obs_increment_kde(ens, ens_size, obs, obs_var, obs_dist_type, & + bounded_below, bounded_above, lower_bound, upper_bound, obs_inc) + end if + else - call error_handler(E_ERR,'obs_increment', & + call error_handler(E_ERR,'obs_increment', & 'Illegal value of filter_kind', source) endif endif @@ -1167,7 +1191,144 @@ subroutine obs_increment_bounded_norm_rhf(ens, ens_like, ens_size, prior_var, & end subroutine obs_increment_bounded_norm_rhf +subroutine obs_increment_kde(ens, ens_size, y, obs_param, obs_dist_type, & + bounded_below, bounded_above, lower_bound, upper_bound, obs_inc) + integer, intent(in) :: ens_size + real(r8), intent(in) :: ens(ens_size) + real(r8), intent(in) :: y + real(r8), intent(in) :: obs_param + integer, intent(in) :: obs_dist_type + logical, intent(in) :: bounded_below, bounded_above + real(r8), intent(in) :: lower_bound, upper_bound + real(r8), intent(out) :: obs_inc(ens_size) + + ! Applies a QCEF based on kernel density estimation & quadrature + + real(r8) :: q + real(r8) :: p_lower_prior, p_int_prior, p_upper_prior + real(r8) :: p_lower_post, p_int_post, p_upper_post + real(r8) :: like_ens_mean ! ensemble mean of the likelihood + real(r8) :: ens_interior(ens_size) + real(r8) :: unif + real(r8) :: d(ens_size), d_max + type(distribution_params_type) :: params_interior_prior + type(distribution_params_type) :: params_interior_posterior + integer :: ens_size_interior + integer :: i, count_lower, count_upper + + ! If this is first time through, need to initialize the random sequence. + ! This will reproduce exactly for multiple runs with the same task count, + ! but WILL NOT reproduce for a different number of MPI tasks. + ! To make it independent of the number of MPI tasks, it would need to + ! use the global ensemble number or something else that remains constant + ! as the processor count changes. this is not currently an argument to + ! this function and so we are not trying to make it task-count invariant. + if(first_inc_ran_call) then + call random_seed() + call random_number(q) + i = int(q * 1000) + call init_random_seq(inc_ran_seq, my_task_id() + i) + first_inc_ran_call = .false. + endif + + ! If all ensemble members are identical, then there is no update + d(:) = abs( ens(:) - ens(1) ) + d_max = maxval(d) + if(d_max .le. 0.0_r8) then + obs_inc(:) = 0._r8 + return + endif + + ! Get mixture component probabilities for the prior, and the interior ensemble, and its params + call separate_ensemble(ens, ens_size, bounded_below, bounded_above, & + lower_bound, upper_bound, ens_interior, ens_size_interior, & + p_lower_prior, p_int_prior, p_upper_prior) + if (ens_size_interior .gt. 1) then + d(1:ens_size_interior) = abs( ens_interior(1:ens_size_interior) - ens_interior(1) ) + d_max = maxval(d(1:ens_size_interior)) + call pack_kde_params(ens_size_interior, bounded_below, bounded_above, lower_bound, upper_bound, & + ens_interior, y, obs_param, obs_dist_types%uninformative, & + params_interior_prior) + call pack_kde_params(ens_size_interior, bounded_below, bounded_above, lower_bound, upper_bound, & + ens_interior, y, obs_param, obs_dist_type, & + params_interior_posterior) + else + d_max = 0._r8 + endif + + ! Get mixture component probabilities for the posterior + if (bounded_below) then + p_lower_post = p_lower_prior * likelihood_function(lower_bound, y, obs_param, obs_dist_type) + else + p_lower_post = 0._r8 + end if + if (bounded_above) then + p_upper_post = p_upper_prior * likelihood_function(upper_bound, y, obs_param, obs_dist_type) + else + p_upper_post = 0._r8 + end if + p_int_post = 0._r8 + do i=1,ens_size_interior + p_int_post = p_int_post + likelihood_function(ens_interior(i), y, obs_param, obs_dist_type) + end do + p_int_post = p_int_prior * p_int_post / real(ens_size_interior, r8) + ! Get prior ensemble mean of likelihood + like_ens_mean = p_lower_post + p_int_post + p_upper_post + ! If likelihood underflow, assume flat likelihood, so no increments + if(like_ens_mean .le. 0.0_r8) then + obs_inc(:) = 0.0_r8 + return + else ! Finish getting mixture component probabilities for the posterior + p_lower_post = p_lower_post / like_ens_mean + p_upper_post = p_upper_post / like_ens_mean + p_int_post = 1._r8 - p_lower_post - p_upper_post + end if + + ! Update ensemble members -> get observation increments + + count_lower = 0 + count_upper = 0 + unif = random_uniform(inc_ran_seq) / real(ens_size, r8) + do i=1,ens_size + ! Map each ensemble member to a quantile using the prior cdf. Members + ! on the boundary get random quantiles. + if (bounded_below .and. (ens(i) .le. lower_bound)) then + ! Rather than draw a random for each member on the boundary, + ! I draw one random (above) then add 1/Nb to each subsequent value + q = unif + real(count_lower, r8) / real(ens_size, r8) + count_lower = count_lower + 1 + write(*,*) 'lower boundary quantile ', q + elseif (bounded_above .and. (ens(i) .ge. upper_bound)) then + ! As above, I draw one random then add 1/Nb to each subsequent value + q = 1._r8 - (unif + real(count_upper, r8) / real(ens_size, r8)) + count_upper = count_upper + 1 + write(*,*) 'upper boundary quantile ', q + elseif ((ens_size_interior .eq. 1) .or. (d_max .le. 0._r8)) then + ! Can't use kde with only one ensemble member (or identical ensemble members), so assign a random quantile + q = (p_int_prior * random_uniform(inc_ran_seq)) + p_lower_prior + else ! Use the interior cdf obtained using kde. + q = kde_cdf_params(ens(i), params_interior_prior) + q = (p_int_prior * q) + p_lower_prior + end if + ! Invert the posterior kde cdf to get the updated ensemble member, then + ! subtract to get the increment. + if (q .le. p_lower_post) then ! Posterior value on the lower boundary + obs_inc(i) = lower_bound - ens(i) + elseif (q .ge. (1._r8 - p_upper_post)) then ! Posterior value on the upper boundary + obs_inc(i) = upper_bound - ens(i) + elseif ((ens_size_interior .eq. 1) .or. (d_max .le. 0._r8)) then + ! posterior value in the interior, but there's only one prior ensemble member/value + ! in the interior, so we just assign the posterior ensemble member equal to the prior one + obs_inc(i) = ens_interior(1) - ens(i) + else ! posterior value in the interior, can use kde + ! Rescale q to be between 0 and 1 before inverting interior cdf + q = (q - p_lower_post) / p_int_post + obs_inc(i) = inv_kde_cdf_params(q, params_interior_posterior) + obs_inc(i) = obs_inc(i) - ens(i) + end if + end do +end subroutine obs_increment_kde ! Computes a normal or truncated normal (above and/or below) likelihood. function get_truncated_normal_like(x, obs, obs_var, & @@ -1420,6 +1581,8 @@ subroutine obs_increment_kernel(ens, ens_size, obs, obs_var, obs_inc) endif ! Generate a uniform random number and a Gaussian for each new member +!! IG: This could be done with a single random number and in a way that +!! preserves the order of the ensemble members do i = 1, ens_size unif = random_uniform(inc_ran_seq) ! Figure out which kernel it's in diff --git a/assimilation_code/modules/assimilation/distribution_params_mod.f90 b/assimilation_code/modules/assimilation/distribution_params_mod.f90 index 7951a7c7bd..43849f7fbd 100644 --- a/assimilation_code/modules/assimilation/distribution_params_mod.f90 +++ b/assimilation_code/modules/assimilation/distribution_params_mod.f90 @@ -25,10 +25,12 @@ module distribution_params_mod integer, parameter :: LOG_NORMAL_DISTRIBUTION = 5 integer, parameter :: UNIFORM_DISTRIBUTION = 6 integer, parameter :: PARTICLE_FILTER_DISTRIBUTION = 7 +integer, parameter :: KDE_DISTRIBUTION = 8 public :: distribution_params_type, deallocate_distribution_params, & NORMAL_DISTRIBUTION, BOUNDED_NORMAL_RH_DISTRIBUTION, GAMMA_DISTRIBUTION, BETA_DISTRIBUTION, & - LOG_NORMAL_DISTRIBUTION, UNIFORM_DISTRIBUTION, PARTICLE_FILTER_DISTRIBUTION + LOG_NORMAL_DISTRIBUTION, UNIFORM_DISTRIBUTION, PARTICLE_FILTER_DISTRIBUTION, & + KDE_DISTRIBUTION contains diff --git a/assimilation_code/modules/assimilation/kde_distribution_mod.f90 b/assimilation_code/modules/assimilation/kde_distribution_mod.f90 new file mode 100644 index 0000000000..0b1ef1a042 --- /dev/null +++ b/assimilation_code/modules/assimilation/kde_distribution_mod.f90 @@ -0,0 +1,996 @@ +module kde_distribution_mod + +use types_mod, only : r8, missing_r8 + +use utilities_mod, only : E_ERR, E_MSG, error_handler + +use sort_mod, only : sort + +use distribution_params_mod, only : distribution_params_type, deallocate_distribution_params, & + KDE_DISTRIBUTION + +use rootfinding_mod, only : inv_cdf + +use normal_distribution_mod, only : normal_cdf + +implicit none +private + +public :: kde_cdf, kde_cdf_params, inv_kde_cdf, inv_kde_cdf_params, & + test_kde, obs_dist_types, likelihood_function, pack_kde_params, & + separate_ensemble + +type available_obs_dist_types + integer :: uninformative, normal, binomial, gamma, & + inv_gamma, lognormal, truncated_normal +end type + +type(available_obs_dist_types), parameter :: obs_dist_types = & +available_obs_dist_types(uninformative=999, normal=998, binomial=997, & + gamma=996, inv_gamma=995, lognormal=994, truncated_normal=993) +character(len=512) :: errstring +character(len=*), parameter :: source = 'kde_distribution_mod.f90' + +contains + +!--------------------------------------------------------------------------- + +function likelihood_function(x, y, obs_param, obs_dist_type, & + bounded_above, bounded_below, upper_bound, lower_bound) result(l) + real(r8) :: l ! likelihood value + real(r8), intent(in) :: x ! state value + real(r8), intent(in) :: y ! obs value + real(r8), intent(in) :: obs_param ! meaning depends on obs_dist_type + integer, intent(in) :: obs_dist_type ! obs distribution type + logical, optional, intent(in) :: bounded_above, bounded_below + real(r8), optional, intent(in) :: upper_bound, lower_bound + + ! Evaluates the likelihood pdf(y | x) for various kinds of observation + ! distributions. The value returned is not equal to the observation + ! pdf evaluated at y, because normalization constants that don't depend + ! on x are omitted. + + real(r8) :: gamma_shape, gamma_scale + real(r8) :: inv_gamma_shape, inv_gamma_scale + real(r8) :: cdf(2) + logical :: bounded_above_val, bounded_below_val + + l = 1._r8 ! Initialize + + select case (obs_dist_type) + case (obs_dist_types%uninformative) + ! Uninformative observations have a likelihood equal to one. + l = 1._r8 + case (obs_dist_types%normal) + ! For a normal obs distribution, like_param is the obs error variance + if (obs_param <= 0._r8) then + write(errstring, *) 'obs error sd <= 0', obs_param + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + else + l = exp( -0.5_r8 * (x - y)**2 / obs_param ) + end if + case (obs_dist_types%binomial) + ! For a binomial obs distribution 0<=x<=1 is the probability of + ! observing y successes of obs_param total trials + if (y < 0._r8) then + write(errstring, *) 'y value is negative with a binomial obs model ', y + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + elseif (y > obs_param) then + write(errstring, *) 'successes greater than total trials with a binomial obs model ', y, obs_param + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + elseif ((x < 0._r8) .or. (x > 1._r8)) then + write(errstring, *) 'x outside [0,1] with a binomial obs model ', x + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + else + l = (x**y) * (1._r8 - x)**(obs_param - y) + end if + case (obs_dist_types%gamma) + ! For a gamma obs distribution, the mean is x and the variance is obs_param * x^2, i.e. + ! the obs error sd is sqrt(obs_param) times the true value. If the error sd is p% of x, + ! set obs_param = (p/100._r8)**2. + ! For a gamma obs distribution, the likelihood is inverse gamma. + if (x < 0._r8) then + write(errstring, *) 'x value is negative with a gamma obs model ', x + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + elseif (y <= 0._r8) then + write(errstring, *) 'y value is non-positive with a gamma obs model ', y + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + elseif (obs_param <= 0._r8) then + write(errstring, *) 'obs variance is non-positive with a gamma obs model ', obs_param + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + else + gamma_shape = 1._r8 / obs_param ! k + gamma_scale = x * obs_param ! theta + if (x .eq. 0._r8) then + l = 0._r8 ! Technically x must be > 0, but just setting l=0 is convenient. + else + l = (y / gamma_scale)**gamma_shape * exp(-y / gamma_scale) + end if + end if + case (obs_dist_types%inv_gamma) + ! For an inverse gamma obs distribution, the mean is x and the variance is obs_param * x^2, + ! i.e. the obs error sd is sqrt(obs_param) times the true value. If the error sd is p% of x, + ! set obs_param = (p/100._r8)**2. + ! For an inverse gamma obs distribution, the likelihood is gamma. + if (x < 0._r8) then + write(errstring, *) 'x value is negative with an inverse gamma obs model ', x + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + elseif (y <= 0._r8) then + write(errstring, *) 'y value is non-positive with an inverse gamma obs model ', y + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + elseif (obs_param <= 0._r8) then + write(errstring, *) 'obs variance is non-positive with an inverse gamma obs model ', obs_param + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + else + inv_gamma_shape = (1._r8 / obs_param) + 2._r8 ! alpha + inv_gamma_scale = x * (inv_gamma_shape - 1._r8) ! beta + if (x .eq. 0._r8) then + l = 0._r8 ! Technically x must be > 0, but just setting l=0 is convenient. + else + l = (inv_gamma_scale**inv_gamma_shape) * exp( - inv_gamma_scale / y ) + end if + end if + case (obs_dist_types%lognormal) + ! For a lognormal obs distribution, ln(y) is normal with mean x and variance obs_param. + ! The likelihood is normal. + if (y <= 0._r8) then + write(errstring, *) 'y value is non-positive with a lognormal obs model', y + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + elseif (obs_param <= 0._r8) then + write(errstring, *) 'obs error sd <= 0', obs_param + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + else + l = exp( -0.5_r8 * (x - log(y))**2 / obs_param ) + end if + case (obs_dist_types%truncated_normal) + ! Code based on the version in assim_tools_mod.f90 + ! This implicitly assumes that the bounds on the y distribution are the same + ! as the bounds on x, which makes sense in observation space. The factor of + ! sigma * sqrt(2 * pi) is ignored. + ! A zero observation error variance is a degenerate case + if(obs_param <= 0.0_r8) then + if(x == y) then + l = 1.0_r8 + else + l = 0.0_r8 + endif + return + endif + + cdf = [0._r8, 1._r8] + if (present(bounded_above)) then + bounded_above_val = bounded_above + else + bounded_above_val = .false. + end if + + if (present(bounded_below)) then + bounded_below_val = bounded_below + else + bounded_below_val = .false. + end if + + if (bounded_below_val) cdf(1) = normal_cdf(lower_bound, x, sqrt(obs_param)) + if (bounded_above_val) cdf(2) = normal_cdf(upper_bound, x, sqrt(obs_param)) + + l = exp( -0.5_r8 * (x - y)**2 / obs_param ) / (cdf(2) - cdf(1)) + case DEFAULT + write(errstring, *) 'likelihood called with unrecognized obs_dist_type ', obs_dist_type + call error_handler(E_MSG, 'kde_distribution_mod:likelihood_function', errstring, source) + end select + +end function likelihood_function + +!--------------------------------------------------------------------------- + +elemental function epanechnikov_kernel(x) ! The fact that this is elemental is not (yet) used + real(r8) :: epanechnikov_kernel + real(r8), intent(in) :: x + real(r8), parameter :: norm_const = 3._r8 / 4._r8 + epanechnikov_kernel = norm_const * max(0._r8, 1._r8 - x**2) + +end function epanechnikov_kernel + +!--------------------------------------------------------------------------- + +elemental function epanechnikov_cdf(x) ! The fact that this is elemental is not (yet) used + real(r8) :: epanechnikov_cdf + real(r8), intent(in) :: x + real(r8), parameter :: norm_const = 1._r8 / 4._r8 + if (x <= -1._r8) then + epanechnikov_cdf = 0._r8 + elseif (x <= 1._r8) then + epanechnikov_cdf = min(1._r8, max(0._r8, norm_const * (2._r8 + 3._r8 * x - x**3))) + else + epanechnikov_cdf = 1._r8 + end if + +end function epanechnikov_cdf + +!--------------------------------------------------------------------------- + +elemental subroutine boundary_correction(x, lx, mx) + real(r8), intent(in) :: x + real(r8), intent(out) :: lx, mx + + ! Boundary correction for kde, from Jones (1993) and Jones & Foster (1996) + ! The fact that this is elemental is not (yet) used + + real(r8) :: denom + + denom = ((1._r8 + x)**4) * (19._r8 + 3._r8 * x * (x - 6._r8)) + lx =-64._r8 * (-2._r8 + x * (4._r8 + 3._r8 * x * (x - 2._r8))) / denom + mx = 240._r8 * (x - 1._r8)**2 / denom + +end subroutine boundary_correction + +!--------------------------------------------------------------------------- + +function kde_pdf(x, p) + real(r8) :: kde_pdf + real(r8), intent(in) :: x + type(distribution_params_type), intent(in) :: p + + ! Returns the kernel estimate of the pdf evaluated at x. + + integer :: i + real(r8) :: u_lower, lx_lower, mx_lower + real(r8) :: u_upper, lx_upper, mx_upper + + kde_pdf = 0._r8 ! Initialize + if (p%bounded_below .and. (x <= p%lower_bound)) return + if (p%bounded_above .and. (x >= p%upper_bound)) return + do i=1, p%ens_size ! Reduction loop - parallelizable + u_lower = 0._r8; lx_lower = 1._r8 ; mx_lower = 0._r8 + u_upper = 0._r8; lx_upper = 1._r8 ; mx_upper = 0._r8 + if (p%bounded_below) then ! Bounded below + u_lower = min( 1._r8, max( 0._r8, (x - p%lower_bound) / p%more_params(i) ) ) ! p%more_params(i) holds kernel width for ensemble member i + call boundary_correction(u_lower, lx_lower, mx_lower) + end if + if (p%bounded_above) then ! Bounded above + u_upper = min( 1._r8, max( 0._r8, (p%upper_bound - x) / p%more_params(i) ) ) + call boundary_correction(u_upper, lx_upper, mx_upper) + end if + u_lower = (x - p%ens(i)) / p%more_params(i) ! Not u_lower any more, just (x-x_i)/h_i + kde_pdf = kde_pdf + (1._r8 / p%more_params(i)) * & + (lx_lower + mx_lower * u_lower) * & + (lx_upper - mx_upper * u_lower) * & + epanechnikov_kernel( u_lower ) + end do + kde_pdf = max(0._r8, kde_pdf) / (p%ens_size * p%more_params(p%ens_size + 1)) ! p%more_params(ens_size + 1) normalizes the pdf + +end function kde_pdf + +!----------------------------------------------------------------------- + +subroutine get_kde_bandwidths(ens_size, ens, bandwidths) + integer, intent(in) :: ens_size + real(r8), intent(in) :: ens(ens_size) + real(r8), intent(out) :: bandwidths(ens_size) + + real(r8) :: ens_mean, ens_sd, h0, g, d_max + real(r8), dimension(ens_size) :: f_tilde, d, lambda + integer :: i, j, k + + + d(:) = abs( ens(:) - ens(1) ) + d_max = maxval(d(:)) + if (d_max <= 0._r8) then + errstring = 'Bandwidth = 0 ' + call error_handler(E_ERR, 'get_kde_bandwidths', errstring, source) + end if + ens_mean = sum(ens) / ens_size + ens_sd = sqrt( sum( (ens - ens_mean)**2 ) / (ens_size - 1._r8) ) + h0 = 2._r8 * ens_sd / (ens_size**0.2_r8) ! This would be the kernel width if the widths were not adaptive. + ! It would be better to use min(sd, iqr/1.34) but don't want to compute iqr + k = floor( sqrt( real(ens_size) ) ) ! distance to kth nearest neighbor used to set bandwidth + do i=1,ens_size + d(:) = sort( abs( ens(:) - ens(i) ) ) + d_max = maxval(d(:)) + where (d < 1.e-3_r8 * d_max) ! set small distances to zero + d = 0._r8 + end where + j = 1 + do while ((d(k+j) <= 0._r8) .and. (k+j < ens_size)) + j = j + 1 + end do + f_tilde(i) = 0.5_r8 * real(k+j-1, r8) / (real(ens_size, r8) * d(k+j)) ! Initial density estimate + end do + f_tilde(:) = f_tilde(:) / maxval(f_tilde(:)) ! Avoids overflow in the next line + g = product( f_tilde )**( 1._r8 / real(ens_size, r8) ) + lambda(:) = sqrt( g / f_tilde(:) ) + bandwidths(:) = h0 * lambda(:) + if (maxval(bandwidths(:)) <= 0._r8) then + write(*,*) 'Bandwidth breakdown' + write(*,*) h0, g, lambda(:), f_tilde(:), ens(:) + end if + +end subroutine get_kde_bandwidths + +!--------------------------------------------------------------------------- + +function gq5(left, right, p) result(q) + real(r8) :: q + real(r8), intent(in) :: left, right + type(distribution_params_type), intent(in) :: p + + ! Uses 5-point Gauss quadrature to approximate \int_left^right l(s; y) p(s) ds + ! where p(x) is the prior pdf and l(x; y) is the likelihood. This only works + ! correctly if left >= lower_bound and right <= upper_bound. The result is + ! **Not Normalized**. + + real(r8) :: y + real(r8) :: obs_param ! See likelihood function for interpretation + integer :: obs_dist_type ! See likelihood function for interpretation + real(r8) :: xi ! quadrature point + real(r8), save :: chi(5) = [-sqrt(5._r8 + 2._r8 * sqrt(10._r8 / 7._r8)) / 3._r8, & + -sqrt(5._r8 - 2._r8 * sqrt(10._r8 / 7._r8)) / 3._r8, & + 0._r8, & + sqrt(5._r8 - 2._r8 * sqrt(10._r8 / 7._r8)) / 3._r8, & + sqrt(5._r8 + 2._r8 * sqrt(10._r8 / 7._r8)) / 3._r8] ! Gauss quadrature points + real(r8), save :: w(5) = [(322._r8 - 13._r8 * sqrt(70._r8)) / 900._r8, & + (322._r8 + 13._r8 * sqrt(70._r8)) / 900._r8, & + 128._r8/225._r8, & + (322._r8 + 13._r8 * sqrt(70._r8)) / 900._r8, & + (322._r8 - 13._r8 * sqrt(70._r8)) / 900._r8] ! GQ weights + real(r8) :: l ! value of the likelihood function + integer :: k + + ! Unpack obs info from param struct + y = p%more_params(p%ens_size + 2) + obs_param = p%more_params(p%ens_size + 3) + obs_dist_type = nint(p%more_params(p%ens_size + 4)) + + q = 0._r8 + do k=1,5 + xi = 0.5_r8 * ((right - left) * chi(k) + left + right) + if (obs_dist_type .eq. obs_dist_types%truncated_normal) then + l = likelihood_function(xi, y, obs_param, obs_dist_type, & + bounded_above=p%bounded_above, bounded_below=p%bounded_below, & + upper_bound=p%upper_bound, lower_bound=p%lower_bound) + else + l = likelihood_function(xi, y, obs_param, obs_dist_type) + end if + q = q + 0.5_r8 * (right - left) * w(k) * kde_pdf(xi, p) * l + end do + +end function gq5 + +!--------------------------------------------------------------------------- + +function integrate_pdf(x, p) result(q) + real(r8) :: q + real(r8), intent(in) :: x + type(distribution_params_type), intent(in) :: p + + ! Uses quadrature to approximate \int_{-\infty}^x l(x; y) p(s) ds where + ! p(s) is the prior pdf and l(x; y) is the likelihood. The interval is + ! broken up into sub-intervals whose boundaries are either one of the bounds, + ! or the edge of support of one of the kernels, or the value of x. On each + ! sub-interval the integral is approximated using Gauss-Legendre quadrature + ! with 5 points. When the likelihood is flat and the boundaries are far + ! from the ensemble, the result is exact up to roundoff error. + + real(r8) :: y + real(r8) :: obs_param ! See likelihood function for interpretation + integer :: obs_dist_type ! See likelihood function for interpretation + real(r8) :: edges(2*p%ens_size) + real(r8) :: left, right ! edges of current sub-interval, quadrature point + integer :: i + + ! Unpack obs info from param struct + y = p%more_params(p%ens_size + 2) + obs_param = p%more_params(p%ens_size + 3) + obs_dist_type = nint(p%more_params(p%ens_size + 4)) + + edges(1:p%ens_size) = p%ens(1:p%ens_size) - p%more_params(1:p%ens_size) + edges(p%ens_size + 1:2*p%ens_size) = p%ens(1:p%ens_size) + p%more_params(1:p%ens_size) + edges(:) = sort(edges(:)) ! If bandwidths were constant we would not need to sort + + ! If x is outside the support of the pdf then we can skip the quadrature. + left = edges(1) + if (p%bounded_below) left = max(left, p%lower_bound) + if (x <= left) then + q = 0._r8 + return + end if + + ! Important to use x > upper_bound here because I use + ! x = upper_bound to compute the normalization constant. + if ((p%bounded_above) .and. (x > p%upper_bound)) then + q = 1._r8 + return + end if + + ! If we haven't returned yet, then there is at least one subinterval. + i = 1 + right = min(x, edges(2)) ! left was computed above + q = gq5(left, right, p) + do while ((x > right) .and. (i+1 < 2*p%ens_size)) + i = i + 1 + left = right + right = min(x, edges(i+1)) + q = q + gq5(left, right, p) + end do + ! Note that it is possible to have maxval(edges) < x < upper_bound, + ! but that last sub-interval from maxval(edges) to x has zero integral, + ! so it can be safely skipped. + +end function integrate_pdf + +!----------------------------------------------------------------------- + +subroutine pack_kde_params(ens_size, bounded_below, bounded_above, lower_bound, upper_bound, & + ens, y, obs_param, obs_dist_type, p) + integer, intent(in) :: ens_size + logical, intent(in) :: bounded_below, bounded_above + real(r8), intent(in) :: lower_bound, upper_bound + real(r8), intent(in) :: ens(ens_size) + real(r8), intent(in) :: y + real(r8), intent(in) :: obs_param + integer, intent(in) :: obs_dist_type + type(distribution_params_type), intent(out) :: p + + real(r8) :: bandwidths(ens_size) + real(r8) :: edge, edges(2*ens_size + 2) + integer :: i + + ! Set the fixed storage parameters in the distribution_params_type + p%ens_size = ens_size + p%distribution_type = KDE_DISTRIBUTION + + ! Allocate space needed for the parameters + allocate(p%ens(1:ens_size), source=0._r8) + allocate(p%more_params(1:5*ens_size + 8), source=0._r8) + + ! p%more_params(1:ens_size) are the kernel bandwidths + ! p%more_params(ens_size + 1) is the normalization constant for the pdf + ! p%more_params(ens_size + 2) is the observation value y; not used for prior + ! p%more_params(ens_size + 3) is the observation parameter (see likelihood for details); not used for prior + ! p%more_params(ens_size + 4) is the observation error distribution type. For prior set to uninformative + ! p%more_params(ens_size + 5:3*ens_size + 6) are the edges. + ! p%more_params(3*ens_size+7:5*ens_size+8) are the cdf evaluated at the edges. + + ! Save the ensemble values, sorted now so that they don't have to be re-sorted for the first guess + p%ens(:) = sort(ens(:)) + + ! Store the kernel bandwidths in the more_params array + ! Important to make sure that p%ens and p%more_params are sorted in same order by passing p%ens rather than ens + call get_kde_bandwidths(ens_size, p%ens, bandwidths) + p%more_params(1:ens_size) = bandwidths(:) + + ! Pack obs information + p%more_params(ens_size + 2) = y + p%more_params(ens_size + 3) = obs_param + p%more_params(ens_size + 4) = real(obs_dist_type, r8) ! This is not ideal because it involves a type conversion from int to real + + ! Get the edges of the subintervals on which the pdf is smooth + edges(1:ens_size) = p%ens(:) - bandwidths(:) + edges(ens_size+1:2*ens_size) = p%ens(:) + bandwidths(:) + if (bounded_below) then + edges(2*ens_size+1) = lower_bound + else + edges(2*ens_size+1) = minval(edges(1:ens_size)) + end if + if (bounded_above) then + edges(2*ens_size+2) = upper_bound + else + edges(2*ens_size+2) = maxval(edges(ens_size+1:2*ens_size)) + end if + edges(:) = sort(edges(:)) + p%more_params(ens_size+5:3*ens_size+6) = edges(:) + + ! If the ensemble is sufficiently far from the boundary, that we can use the unbounded code, which is cheaper. + edge = minval(p%ens(:) - 2*bandwidths(:)) + if (bounded_below .and. (edge > lower_bound)) then + p%bounded_below = .false. + p%lower_bound = minval(p%ens(:) - bandwidths(:)) + else + p%bounded_below = bounded_below + p%lower_bound = lower_bound + end if + + ! If the ensemble is sufficiently far from the boundary, that we can use the unbounded code, which is cheaper. + edge = maxval(p%ens(:) + 2*bandwidths(:)) + if (bounded_above .and. (edge < upper_bound)) then + p%bounded_above = .false. + p%upper_bound = maxval(p%ens(:) + bandwidths(:)) + else + p%bounded_above = bounded_above + p%upper_bound = upper_bound + end if + + ! Integrate across all subintervals + p%more_params(ens_size+1) = 1._r8 ! Temporary value + p%more_params(3*ens_size+7) = 0._r8 ! 0 at left edge + do i=2,2*ens_size+2 + if ((p%bounded_below .and. (edges(i) <= p%lower_bound)) .or. & + (edges(i) <= edges(i-1)) .or. & + (p%bounded_above .and. (edges(i) > p%upper_bound))) then + ! The subinterval [edges(i-1), edges(i)] is empty or outside the bounds + p%more_params(3*ens_size+6+i) = p%more_params(3*ens_size+6+i-1) + 0._r8 + else + p%more_params(3*ens_size+6+i) = p%more_params(3*ens_size+6+i-1) + gq5(edges(i-1), edges(i), p) + end if + end do + p%more_params(ens_size+1) = maxval(p%more_params(3*ens_size+7:5*ens_size+8)) + p%more_params(3*ens_size+7:5*ens_size+8) = p%more_params(3*ens_size+7:5*ens_size+8) / p%more_params(ens_size+1) + +end subroutine pack_kde_params + +!----------------------------------------------------------------------- + +function kde_cdf_params(x, p) result(quantile) + real(r8), intent(in) :: x + type(distribution_params_type), intent(in) :: p + real(r8) :: quantile + + ! Returns the cumulative distribution of the kernel density estimate + ! at the value x. + + integer :: i + logical :: use_analytical_cdf + ! It is a waste of memory to unpack p%more_params, but it aids readability + real(r8) :: bandwidths(p%ens_size) + real(r8) :: y, obs_param + integer :: obs_dist_type + real(r8), dimension(2*p%ens_size+2) :: edges, cdfs + + bandwidths(:) = p%more_params(1:p%ens_size) + y = p%more_params(p%ens_size + 2) + obs_param = p%more_params(p%ens_size + 3) + obs_dist_type = nint(p%more_params(p%ens_size + 4)) + edges(:) = p%more_params( p%ens_size+5:3*p%ens_size+6) + cdfs(:) = p%more_params(3*p%ens_size+7:5*p%ens_size+8) + + quantile = 0._r8 + ! If the likelihood is uninformative and the distribution is unbounded, we can evaluate + ! the cdf analytically (instead of using quadrature) to save computation. + use_analytical_cdf = ( (.not.p%bounded_below) .and. (.not.p%bounded_above) .and. & + (obs_dist_type .eq. obs_dist_types%uninformative) ) + if (use_analytical_cdf) then + do i=1,p%ens_size + quantile = quantile + epanechnikov_cdf( (x - p%ens(i)) / bandwidths(i) ) / p%ens_size + end do + else ! Compute cdf using quadrature + !quantile = min(1._r8, max(0._r8, integrate_pdf(x, p))) + if (x < edges(1)) then + return + end if + do i=2,2*p%ens_size+2 + if ((edges(i-1) <= x) .and. (x <= edges(i))) then + quantile = min(1._r8, max(0._r8, cdfs(i-1) + gq5(edges(i-1), x, p))) + return + else + cycle + end if + end do + quantile = 1._r8 + end if + +end function kde_cdf_params + +!--------------------------------------------------------------------------- + +function kde_cdf(x, ens, ens_size, bounded_below, bounded_above, & + lower_bound, upper_bound, y, obs_param, obs_dist_type) result(quantile) + real(r8), intent(in) :: x + integer, intent(in) :: ens_size + real(r8), intent(in) :: ens(ens_size) + logical, intent(in) :: bounded_below, bounded_above + real(r8), intent(in) :: lower_bound, upper_bound + real(r8), intent(in) :: y + real(r8), intent(in) :: obs_param + integer, intent(in) :: obs_dist_type + real(r8) :: quantile + + ! Returns the cumulative distribution of the kernel density estimate + ! at the value x. + + type(distribution_params_type) :: p + + call pack_kde_params(ens_size, bounded_below, bounded_above, lower_bound, upper_bound, & + ens, y, obs_param, obs_dist_type, p) + quantile = kde_cdf_params(x,p) + +end function kde_cdf + +!--------------------------------------------------------------------------- + +function inv_kde_cdf(quantile, ens, ens_size, bounded_below, bounded_above, & + lower_bound, upper_bound, y, obs_param, obs_dist_type) result(x) + real(r8), intent(in) :: quantile + integer, intent(in) :: ens_size + real(r8), intent(in) :: ens(ens_size) + logical, intent(in) :: bounded_below, bounded_above + real(r8), intent(in) :: lower_bound, upper_bound + real(r8), intent(in) :: y + real(r8), intent(in) :: obs_param + integer, intent(in) :: obs_dist_type + real(r8) :: x + + ! Returns the value x such that cdf(x) = quantile. + + type(distribution_params_type) :: p + + call pack_kde_params(ens_size, bounded_below, bounded_above, lower_bound, upper_bound, & + ens, y, obs_param, obs_dist_type, p) + + x = inv_cdf(quantile, kde_cdf_params, inv_kde_first_guess_params, p) + +end function inv_kde_cdf + +!--------------------------------------------------------------------------- + +function inv_kde_cdf_params(quantile, p) result(x) + real(r8) :: x + real(r8), intent(in) :: quantile + type(distribution_params_type), intent(in) :: p + + ! Returns the value x such that cdf(x) = quantile. + + x = inv_cdf(quantile, kde_cdf_params, inv_kde_first_guess_params, p) + +end function inv_kde_cdf_params + +!--------------------------------------------------------------------------- + +function inv_kde_first_guess_params(quantile, p) result(x) + real(r8) :: x + real(r8), intent(in) :: quantile + type(distribution_params_type), intent(in) :: p + real(r8) :: edge ! edge of support of the cdf + + ! This first-guess subroutine evaluates the cdf at the ensemble members, + ! then finds a pair of ensemble members whose quantiles bracket the + ! target quantile, then sets the first guess to a convex combination of + ! these two ensemble members. If the target quantile is outside the + ! ensemble, the first guess is a combination of the nearest ensemble + ! member and the edge of the support of the pdf. If the + ! target quantile is 0 or 1, the appropriate bound is returned. + + integer :: i + real(r8) :: dq + real(r8), dimension(2*p%ens_size+2) :: edges, cdfs + + edges(:) = p%more_params( p%ens_size+5:3*p%ens_size+6) + cdfs(:) = p%more_params(3*p%ens_size+7:5*p%ens_size+8) + + if (quantile <= 0._r8) then + x = minval(edges(:)) + if (p%bounded_below) then + x = max(x, p%lower_bound) + end if + return + end if + + if (quantile >= 1._r8) then + x = maxval(edges(:)) + if (p%bounded_above) then + x = min(x, p%upper_bound) + end if + return + end if + + do i=2,2*p%ens_size+2 + if (quantile <= cdfs(i)) then + dq = cdfs(i) - cdfs(i-1) + x = (cdfs(i ) - quantile) * edges(i-1) / dq & + + (quantile - cdfs(i-1)) * edges(i ) / dq + return + end if + end do + +end function inv_kde_first_guess_params + +!----------------------------------------------------------------------- + +subroutine separate_ensemble(ens, ens_size, bounded_below, bounded_above, & + lower_bound, upper_bound, ens_interior, ens_size_interior, & + q_lower, q_int, q_upper) + + ! Extracts the ensemble members that are not on the boundaries, and + ! also returns the fraction of ensemble members on each boundary and in + ! the interior. + + integer, intent(in) :: ens_size + real(r8), intent(in) :: ens(ens_size) + logical, intent(in) :: bounded_below, bounded_above + real(r8), intent(in) :: lower_bound, upper_bound + real(r8), intent(out) :: ens_interior(ens_size) + integer, intent(out) :: ens_size_interior + real(r8), intent(out) :: q_lower, q_int, q_upper + + ! local variables + integer :: i, j + logical :: is_interior(ens_size) + real(r8), parameter :: eps = 0._r8 !epsilon(1._r8) + + ! Initialize + ens_interior(:) = missing_r8 + + q_lower = 0._r8 + q_int = 0._r8 + q_upper = 0._r8 + is_interior(:) = .true. + j = 0 + do i=1,ens_size + if (bounded_below) then + if (ens(i) <= lower_bound + eps) then + q_lower = q_lower + 1._r8 + is_interior(i) = .false. + end if + end if + if (bounded_above) then + if (ens(i) >= upper_bound - eps) then + q_upper = q_upper + 1._r8 + is_interior(i) = .false. + end if + end if + if (is_interior(i)) then + j = j + 1 + ens_interior(j) = ens(i) + end if + end do + ens_size_interior = ens_size - nint(q_lower + q_upper) + if (j .ne. ens_size_interior) then + errstring = 'Total number of interior ensemble members incorrect ' + call error_handler(E_ERR, 'separate_ensemble', errstring, source) + end if + q_lower = q_lower / real(ens_size, r8) + q_upper = q_upper / real(ens_size, r8) + q_int = 1._r8 - (q_lower + q_upper) + +end subroutine separate_ensemble + +!----------------------------------------------------------------------- + +subroutine test_kde + ! This routine provides limited tests of the numerics in this module. It tests + ! the boundary correction function, the likelihood, the bandwidth selection, + ! and the cdf inversion. It uses an ensemble [-1, 1]. It tests the bandwidth + ! selection and the cdf inversion with zero, one, or two bounds at [-2, 2]. + ! Failing these tests suggests a serious problem. Passing them does not indicate + ! that there are acceptable results for all possible inputs. + + integer, parameter :: ens_size = 2 + real(r8), dimension(ens_size) :: ensemble, bandwidths, target_bandwidths + type(distribution_params_type) :: p + real(r8) :: x, y, inv, max_diff, lx, mx, like, obs_param + integer :: i, obs_dist_type + + ! Test the boundary correction code against a Mathematica implementation + x = 0.5_r8 + call boundary_correction(x, lx, mx) + write(*, *) '----------------------------' + write(*, *) 'test boundary correction' + write(*, *) 'abs difference in lx is ', abs(1.322997416020672_r8 - lx) + write(*, *) 'abs difference in mx is ', abs(1.102497846683893_r8 - mx) + write(*, *) 'abs differences should be less than 1e-15' + + ! Test uninformative likelihood + max_diff = -1.0_r8 + y = 1._r8 + obs_param = 1._r8 + obs_dist_type = obs_dist_types%uninformative + do i = 0, 1000 + x = (real(i, r8) - 500.0_r8) / 500.0_r8 + like = likelihood_function(x, y, obs_param, obs_dist_type) + max_diff = max(abs(1._r8 - like), max_diff) + end do + write(*, *) '----------------------------' + write(*, *) 'Uninformative likelihood test' + write(*, *) 'max difference in likelihood is ', max_diff + write(*, *) 'max difference should be less than 1e-15' + + ! Test normal likelihood + x = 0.5_r8 + y = 0._r8 + obs_param = 1._r8 + obs_dist_type = obs_dist_types%normal + like = likelihood_function(x, y, obs_param, obs_dist_type) + write(*, *) '----------------------------' + write(*, *) 'Normal likelihood test' + write(*, *) 'abs difference in likelihood is ', abs(0.8824969025845955_r8 - like) + write(*, *) 'abs difference should be less than 1e-15' + + ! Test binomial obs distribution (beta likelihood) + x = 0.25_r8 + y = 3._r8 + obs_param = 5._r8 + obs_dist_type = obs_dist_types%binomial + like = likelihood_function(x, y, obs_param, obs_dist_type) + write(*, *) '----------------------------' + write(*, *) 'Binomial obs distribution test' + write(*, *) 'abs difference in likelihood is ', abs(0.0087890625_r8 - like) + write(*, *) 'abs difference should be less than 1e-15' + + ! Test gamma obs distribution (inverse gamma likelihood) + y = 1._r8 + x = 2._r8 + obs_param = 3._r8 + obs_dist_type = obs_dist_types%gamma + like = likelihood_function(x, y, obs_param, obs_dist_type) + write(*, *) '----------------------------' + write(*, *) 'Gamma obs distribution test' + write(*, *) 'abs difference in likelihood is ', abs(0.4658368455179406_r8 - like) + write(*, *) 'abs difference should be less than 1e-15' + + ! Test inverse gamma obs distribution (gamma likelihood) + y = 3._r8 + x = 2._r8 + obs_param = 4._r8 + obs_dist_type = obs_dist_types%inv_gamma + like = likelihood_function(x, y, obs_param, obs_dist_type) + write(*, *) '----------------------------' + write(*, *) 'Inverse gamma obs distribution test' + write(*, *) 'abs difference in likelihood is ', abs(3.415489474106968_r8 - like) + write(*, *) 'abs difference should be less than 1e-15' + + ! Test lognormal obs distribution (normal likelihood) + y = 1._r8 + x = 2._r8 + obs_param = 4._r8 + obs_dist_type = obs_dist_types%lognormal + like = likelihood_function(x, y, obs_param, obs_dist_type) + write(*, *) '----------------------------' + write(*, *) 'Lognormal obs distribution test' + write(*, *) 'abs difference in likelihood is ', abs(0.6065306597126334_r8 - like) + write(*, *) 'abs difference should be less than 1e-15' + + ! Test truncated-normal obs distribution + y = 1._r8 + x = 2._r8 + obs_param = 4._r8 + obs_dist_type = obs_dist_types%truncated_normal + like = likelihood_function(x, y, obs_param, obs_dist_type, & + bounded_above=.true.,bounded_below=.true.,upper_bound=4._r8,lower_bound=0._r8) + write(*, *) '----------------------------' + write(*, *) 'Truncated-normal obs distribution test, doubly-bounded' + write(*, *) 'abs difference in likelihood is ', abs(1.292676850528392_r8 - like) + write(*, *) 'abs difference should be less than 1e-15' + like = likelihood_function(x, y, obs_param, obs_dist_type, & + bounded_above=.false.,bounded_below=.true.,upper_bound=4._r8,lower_bound=0._r8) + write(*, *) '----------------------------' + write(*, *) 'Truncated-normal obs distribution test, bounded below' + write(*, *) 'abs difference in likelihood is ', abs(1.048912359301403_r8 - like) + write(*, *) 'abs difference should be less than 1e-15' + like = likelihood_function(x, y, obs_param, obs_dist_type, & + bounded_above=.true.,bounded_below=.false.,upper_bound=4._r8,lower_bound=0._r8) + write(*, *) '----------------------------' + write(*, *) 'Truncated-normal obs distribution test, bounded above' + write(*, *) 'abs difference in likelihood is ', abs(1.048912359301403_r8 - like) + write(*, *) 'abs difference should be less than 1e-15' + + ! Test bandwidth selection + target_bandwidths(:) = 2.462288826689833_r8 * [1._r8, 1._r8] + + ensemble(:) = [-1._r8, 1._r8] + + call get_kde_bandwidths(ens_size, ensemble, bandwidths) + + ! Compare computed bandwidths to exact + write(*, *) '----------------------------' + write(*, *) 'kde bandwidths test: Absolute value of differences should be less than 1e-15' + do i = 1, ens_size + write(*, *) i, abs(bandwidths(i) - target_bandwidths(i)) + end do + + ! Test the inversion of the cdf over the support of the pdf, unbounded + write(*, *) '----------------------------' + write(*, *) 'Unbounded cdf/icdf test' + call pack_kde_params(ens_size, .false., .false., 0._r8, 0._r8, & + ensemble, 0._r8, 1._r8, obs_dist_types%uninformative, p) + + max_diff = -1.0_r8 + do i = 0, 1000 + x = (real(i - 500, r8) / 500.0_r8) * (1._r8 + target_bandwidths(1)) + y = kde_cdf_params(x, p) + inv = inv_kde_cdf_params(y, p) + ! write(*,*) 'Quantile: ', y, 'Exact x: ', x, 'Inverted x: ', inv, 'Error: ', abs(inv - x) + max_diff = max(abs(x-inv), max_diff) + end do + + write(*, *) 'max difference in inversion is ', max_diff + write(*, *) 'Errors should be small, max around 1E-8' + ! The accuracy degrades near the boundary. The slope of the cdf goes to zero on the boundary + ! But at least the second derivative is nonzero. So near the boundary it behaves like a + ! double root, and the accuracy of the rootfinding degrades. + + call deallocate_distribution_params(p) + + ! Test the inversion of the cdf over the entire support of the pdf, bounded below + write(*, *) '----------------------------' + write(*, *) 'cdf/icdf test bounded below' + call pack_kde_params(ens_size, .true., .false., -2._r8, 0._r8, & + ensemble, 0._r8, 1._r8, obs_dist_types%uninformative, p) + + max_diff = -1.0_r8 + do i = 0, 1000 + x = (real(i - 500, r8) / 500.0_r8) * (1._r8 + target_bandwidths(1)) + if (x <= -2._r8) then + cycle + else + y = kde_cdf_params(x, p) + inv = inv_kde_cdf_params(y, p) + ! write(*,*) 'Quantile: ', y, 'Exact x: ', x, 'Inverted x: ', inv, 'Error: ', abs(inv - x) + max_diff = max(abs(x-inv), max_diff) + end if + end do + + write(*, *) 'max difference in inversion is ', max_diff + write(*, *) 'Errors should be small, max below 1E-8' + + call deallocate_distribution_params(p) + + + ! Test the inversion of the cdf over the entire support of the pdf, bounded above + write(*, *) '----------------------------' + write(*, *) 'cdf/icdf test bounded above' + call pack_kde_params(ens_size, .false., .true., 0._r8, 2._r8, & + ensemble, 0._r8, 1._r8, obs_dist_types%uninformative, p) + + max_diff = -1.0_r8 + do i = 0, 1000 + x = ((real(i, r8) - 500.0_r8) / 500.0_r8) * (1._r8 + target_bandwidths(1)) + if (x >= 2._r8) then + cycle + else + y = kde_cdf_params(x, p) + inv = inv_kde_cdf_params(y, p) + ! write(*,*) 'Quantile: ', y, 'Exact x: ', x, 'Inverted x: ', inv, 'Error: ', abs(inv - x) + max_diff = max(abs(x-inv), max_diff) + end if + end do + + write(*, *) 'max difference in inversion is ', max_diff + write(*, *) 'Errors should be small, max below 1E-8' + + call deallocate_distribution_params(p) + + ! Test the inversion of the cdf over the entire support of the pdf, doubly bounded + write(*, *) '----------------------------' + write(*, *) 'cdf/icdf test doubly-bounded' + call pack_kde_params(ens_size, .true., .true., -2._r8, 2._r8, & + ensemble, 0._r8, 1._r8, obs_dist_types%uninformative, p) + + max_diff = -1.0_r8 + do i = 0, 1000 + x = ((real(i, r8) - 500.0_r8) / 500.0_r8) * (1._r8 + target_bandwidths(1)) + if ((x <= -2._r8) .or. (x >= 2._r8)) then + cycle + else + y = kde_cdf_params(x, p) + inv = inv_kde_cdf_params(y, p) + ! write(*,*) 'Quantile: ', y, 'Exact x: ', x, 'Inverted x: ', inv, 'Error: ', abs(inv - x) + max_diff = max(abs(x-inv), max_diff) + end if + end do + + write(*, *) 'max difference in inversion is ', max_diff + write(*, *) 'Errors should be small, max below 1E-8' + + call deallocate_distribution_params(p) + + ! Test the quadrature: Construct a case with bounds, but where the bounds are + ! far enough from the data that they are not used. In this case the kernel + ! density estimate should integrate exactly to one. + call pack_kde_params(ens_size, .true., .true., -2._r8, 2._r8, & + ensemble, 0._r8, 1._r8, obs_dist_types%uninformative, p) + p%lower_bound = -20._r8 + p%upper_bound = 20._r8 + p%more_params(ens_size + 1) = 1._r8 + + y = integrate_pdf(0._r8, p) + write(*, *) '----------------------------' + write(*, *) 'test quadrature' + write(*, *) 'abs difference is ', abs(0.5_r8 - y) + write(*, *) 'abs difference should be less than 1e-15' + + call deallocate_distribution_params(p) + +end subroutine test_kde + +!------------------------------------------------------------------------ + +end module kde_distribution_mod diff --git a/assimilation_code/modules/assimilation/probit_transform_mod.f90 b/assimilation_code/modules/assimilation/probit_transform_mod.f90 index f0ea534a27..03cf0401b3 100644 --- a/assimilation_code/modules/assimilation/probit_transform_mod.f90 +++ b/assimilation_code/modules/assimilation/probit_transform_mod.f90 @@ -18,7 +18,7 @@ module probit_transform_mod NORMAL_DISTRIBUTION, BOUNDED_NORMAL_RH_DISTRIBUTION, & GAMMA_DISTRIBUTION, BETA_DISTRIBUTION, & LOG_NORMAL_DISTRIBUTION, UNIFORM_DISTRIBUTION, & - PARTICLE_FILTER_DISTRIBUTION + PARTICLE_FILTER_DISTRIBUTION, KDE_DISTRIBUTION use normal_distribution_mod, only : normal_cdf, inv_std_normal_cdf @@ -31,6 +31,9 @@ module probit_transform_mod use bnrh_distribution_mod, only : bnrh_cdf_initialized_vector, bnrh_cdf_params, & inv_bnrh_cdf_params, get_bnrh_sd +use kde_distribution_mod, only : kde_cdf_params, inv_kde_cdf_params, pack_kde_params, & + obs_dist_types, separate_ensemble + implicit none private @@ -183,7 +186,9 @@ subroutine transform_to_probit(ens_size, state_ens_in, distribution_type, p, & endif !---------------------------------------------------------------------------------- - +elseif(p%distribution_type == KDE_DISTRIBUTION) then + call to_probit_kde(ens_size, state_ens, p, probit_ens, & + use_input_p, bounded_below, bounded_above, lower_bound, upper_bound) !!!elseif(p%distribution_type == PARTICLE_FILTER_DISTRIBUTION) then !!!call to_probit_particle(ens_size, state_ens, p, probit_ens, use_input_p, & !!!bounded_below, bounded_above, lower_bound, upper_bound) @@ -446,6 +451,101 @@ end subroutine to_probit_bounded_normal_rh !!! !------------------------------------------------------------------------ +subroutine to_probit_kde(ens_size, state_ens, p, probit_ens, use_input_p, & + bounded_below, bounded_above, lower_bound, upper_bound) + + ! Transforms the values in state_ens. The transform is either defined + ! by state_ens (when use_input_p is false), or by p%ens (when use_input_p + ! is true). Handles the case where ensemble members are on the boundary. + + integer, intent(in) :: ens_size + real(r8), intent(in) :: state_ens(ens_size) + type(distribution_params_type), intent(inout) :: p + real(r8), intent(out) :: probit_ens(ens_size) + logical, intent(in) :: use_input_p + logical, intent(in) :: bounded_below, bounded_above + real(r8), intent(in) :: lower_bound, upper_bound + + ! local variables + real(r8) :: quantile + integer :: i + real(r8) :: y = 0._r8 ! Dummy value, not used + real(r8) :: obs_param = 1._r8 ! Dummy value, not used + real(r8) :: ens(ens_size) ! Ensemble that defines the transform + real(r8) :: ens_interior(ens_size) ! Ensemble members that are not on the boundaries + integer :: ens_size_interior ! Number of ensemble members that are not on the boundaries + type(distribution_params_type) :: p_interior + real(r8) :: d(ens_size), d_max + real(r8) :: p_lower, p_int, p_upper + real(r8), parameter :: eps = 0._r8 !epsilon(1._r8) + + ! Get the ensemble that defines the transform + if(use_input_p) then + ens(:) = p%ens(:) + else + ! The input ensemble (state_ens) defines the transform. + ens(:) = state_ens(:) + endif + + ! If all ensemble members that define the transform are identical, then we can't + ! really define the transform, so we simply set all probit values to 0, pack the + ! ensemble members into p, and return. + d(:) = abs( ens(:) - ens(1) ) + d_max = maxval(d) + if(d_max .le. 0.0_r8) then + probit_ens(:) = 0._r8 + if(.not. use_input_p) then + allocate(p%ens(1:ens_size)) + p%ens(:) = ens(:) + endif + return + endif + + ! If we reach this point then the ensemble members that define the transform are + ! not all identical, and if we are not using the input p then we need to define it. + if (.not. use_input_p) then + call pack_kde_params(ens_size, bounded_below, bounded_above, lower_bound, & + upper_bound, ens, y, obs_param, obs_dist_types%uninformative, p) + endif + + ! Get mixture component probabilities and interior ensemble + call separate_ensemble(ens, ens_size, bounded_below, bounded_above, & + lower_bound, upper_bound, ens_interior, ens_size_interior, & + p_lower, p_int, p_upper) + + ! Get parameters of the kde distribution for the interior + if (ens_size_interior .gt. 1) then + d(1:ens_size_interior) = abs( ens_interior(1:ens_size_interior) - ens_interior(1) ) + d_max = maxval(d(1:ens_size_interior)) + if (d_max .gt. 0._r8) then + call pack_kde_params(ens_size_interior, bounded_below, bounded_above, lower_bound, & + upper_bound, ens_interior(1:ens_size_interior), y, obs_param, & + obs_dist_types%uninformative, p_interior) + endif + else + d_max = 0._r8 + endif + do i=1,ens_size + ! Map each state_ens member to a quantile using the prior cdf. Members + ! on the boundary map to p_lower and p_upper. + if (bounded_below .and. (state_ens(i) .le. lower_bound + eps)) then + quantile = p_lower + elseif (bounded_above .and. (state_ens(i) .ge. upper_bound - eps)) then + quantile = 1._r8 - p_upper + elseif ((ens_size_interior .le. 1) .or. (d_max .le. 0._r8)) then + ! Can't use kde with only one ensemble member, so assign to middle of interior range + quantile = (p_int * 0.5_r8) + p_lower + else ! Use the interior cdf obtained using kde. + quantile = kde_cdf_params(state_ens(i), p_interior) + quantile = (p_int * quantile) + p_lower + endif + ! Transform to probit/logit space + probit_ens(i) = probit_or_logit_transform(quantile) + end do + +end subroutine to_probit_kde + + subroutine transform_all_from_probit(ens_size, num_vars, probit_ens, p, state_ens) integer, intent(in) :: ens_size @@ -493,6 +593,8 @@ subroutine transform_from_probit(ens_size, probit_ens, p, state_ens) call from_probit_bounded_normal_rh(ens_size, probit_ens, p, state_ens) !!!elseif(p%distribution_type == PARTICLE_FILTER_DISTRIBUTION) then !!!call from_probit_particle(ens_size, probit_ens, p, state_ens) +elseif(p%distribution_type == KDE_DISTRIBUTION) then + call from_probit_kde(ens_size, probit_ens, p, state_ens) else write(errstring, *) 'Illegal distribution type', p%distribution_type call error_handler(E_ERR, 'transform_from_probit', errstring, source) @@ -643,6 +745,81 @@ end subroutine from_probit_bounded_normal_rh !------------------------------------------------------------------------ +subroutine from_probit_kde(ens_size, probit_ens, p, state_ens) + + ! Handles the case where ensemble members are on the boundary. + + integer, intent(in) :: ens_size + real(r8), intent(in) :: probit_ens(ens_size) + type(distribution_params_type), intent(inout) :: p + real(r8), intent(out) :: state_ens(ens_size) + + ! local variables + integer :: i + real(r8) :: quantile + real(r8) :: ens(ens_size) ! Ensemble that defines the transform + real(r8) :: ens_interior(ens_size) ! Ensemble members that are not on the boundaries + integer :: ens_size_interior ! Number of ensemble members that are not on the boundaries + type(distribution_params_type) :: p_interior + real(r8) :: p_lower, p_int, p_upper + real(r8) :: y, obs_param + real(r8) :: d(ens_size), d_max + + ! The ensemble that defines the transform is contained in p, so unpack it + ens(:) = p%ens(:) + + ! If all ensemble members are identical, then there is no update + d(:) = abs( ens(:) - ens(1) ) + d_max = maxval(d) + if(d_max .le. 0.0_r8) then + state_ens(:) = ens(1) + return + endif + ! Get mixture components on the boundaries and interior ensemble + call separate_ensemble(ens, ens_size, p%bounded_below, p%bounded_above, & + p%lower_bound, p%upper_bound, ens_interior, ens_size_interior, & + p_lower, p_int, p_upper) + + ! Get parameters of the kde distribution for the interior + if (ens_size_interior .gt. 1) then + d(1:ens_size_interior) = abs( ens_interior(1:ens_size_interior) - ens_interior(1) ) + d_max = maxval(d(1:ens_size_interior)) + if (d_max .gt. 0._r8) then + ! Unpack obs info from param struct + y = p%more_params(p%ens_size + 2) + obs_param = p%more_params(p%ens_size + 3) + call pack_kde_params(ens_size_interior, p%bounded_below, p%bounded_above, p%lower_bound, & + p%upper_bound, ens_interior(1:ens_size_interior), y, obs_param, & + obs_dist_types%uninformative, p_interior) + endif + else + d_max = 0._r8 + endif + + ! Transform each probit ensemble member back to physical space + do i = 1, ens_size + ! First, invert the probit/logit to get quantiles + quantile = inv_probit_or_logit_transform(probit_ens(i)) + ! Next invert the mixture CDF to get the physical space ensemble + if (p%bounded_below .and. (quantile .le. p_lower)) then + state_ens(i) = p%lower_bound + elseif (p%bounded_above .and. (quantile .ge. 1._r8-p_upper)) then + state_ens(i) = p%upper_bound + elseif ((ens_size_interior .eq. 1) .or. (d_max .le. 0._r8)) then + ! If there is only one interior ensemble member in the prior then any + ! ensemble member that moves into the interior just gets mapped to the + ! one interior value from the prior. + state_ens(i) = ens_interior(1) + else + quantile = (quantile - p_lower) / p_int + state_ens(i) = inv_kde_cdf_params(quantile, p_interior) + endif + end do + +end subroutine from_probit_kde + +!------------------------------------------------------------------------ + function probit_or_logit_transform(quantile) real(r8) :: probit_or_logit_transform diff --git a/assimilation_code/modules/assimilation/rootfinding_mod.f90 b/assimilation_code/modules/assimilation/rootfinding_mod.f90 new file mode 100644 index 0000000000..226637bb68 --- /dev/null +++ b/assimilation_code/modules/assimilation/rootfinding_mod.f90 @@ -0,0 +1,267 @@ +module rootfinding_mod + +use types_mod, only : r8 + +use utilities_mod, only : E_ERR, E_MSG, error_handler + +use distribution_params_mod, only : distribution_params_type, deallocate_distribution_params + +implicit none +private + +public :: inv_cdf + +character(len=512) :: errstring +character(len=*), parameter :: source = 'rootfinding_mod.f90' + +contains + +!------------------------------------------------------------------------ + +function inv_cdf(quantile_in, cdf, first_guess, p) result(x) + + interface + function cdf(x, p) + use types_mod, only : r8 + use distribution_params_mod, only : distribution_params_type + real(r8) :: cdf + real(r8), intent(in) :: x + type(distribution_params_type), intent(in) :: p + end function + end interface + + interface + function first_guess(quantile, p) + use types_mod, only : r8 + use distribution_params_mod, only : distribution_params_type + real(r8) :: first_guess + real(r8), intent(in) :: quantile + type(distribution_params_type), intent(in) :: p + end function + end interface + + real(r8) :: x + real(r8), intent(in) :: quantile_in + type(distribution_params_type), intent(in) :: p + + ! Given a quantile q, finds the value of x for which cdf(x) is approximately q + ! Starts with Newton's method using an approximate gradient until either convergence + ! or overshoot. If it overshoots then it switches to the bisection-like ITP method + ! from Oliveira & Takahashi, ACM Trans Math Soft, 2020. Here f(x) = cdf(x) - quantile + + ! Local variables: + integer, parameter :: max_iterations = 50 ! Limit on the total number of iterations. + real(r8), parameter :: min_quantile = 0.0_r8, max_quantile = 0.999999999999999_r8 + real(r8), parameter :: tol = epsilon(1._r8)**(0.75_r8) ! Absolute on q, relative on x + real(r8) :: quantile, q_err + real(r8) :: delta_x, delta_f + real(r8) :: x_guess, q_guess, x0, x1, f0, f1 + real(r8) :: a, b, fa, fb + integer :: iter + + real(r8) :: lower_bound, upper_bound + logical :: bounded_below, bounded_above + + ! Extract the required information from the p type + bounded_below = p%bounded_below; bounded_above = p%bounded_above + lower_bound = p%lower_bound; upper_bound = p%upper_bound + + quantile = quantile_in + + ! Do a test for illegal values on the quantile + if(quantile < 0.0_r8 .or. quantile > 1.0_r8) then + write(errstring, *) 'Illegal Quantile input', quantile + call error_handler(E_ERR, 'inv_cdf', errstring, source) + endif + + ! If the distribution is bounded, quantiles at the limits have values at the bounds + if(bounded_below .and. quantile == 0.0_r8) then + x = lower_bound + return + endif + if(bounded_above .and. quantile == 1.0_r8) then + x = upper_bound + return + endif + + ! If input quantiles are outside the numerically supported range, move them to the extremes + quantile = min(quantile, max_quantile) + ! code tests stably for many distributions with min_quantile of 0.0, could remove this + quantile = max(quantile, min_quantile) + + ! Get first guess from functional approximation + x_guess = first_guess(quantile, p) + + ! Evaluate the cdf + q_guess = cdf(x_guess, p) + + ! Check if first guess is close enough + q_err = quantile - q_guess + if (abs(q_err) .le. tol) then + x = x_guess + return + end if + + ! If bounded below and target quantile is between 0 and q_guess, go to ITP + if (bounded_below .and. (quantile .lt. q_guess)) then + a = lower_bound + b = x_guess + fa = 0._r8 - quantile + fb = q_guess - quantile + x = inv_cdf_ITP(cdf, quantile, a, b, fa, fb, max_iterations, p) + return + end if + + ! If bounded above and target quantile is between q_guess and 1, go to ITP + if (bounded_above .and. (q_guess .lt. quantile)) then + a = x_guess + b = upper_bound + fa = q_guess + fb = 1._r8 + x = inv_cdf_ITP(cdf, quantile, a, b, fa, fb, max_iterations, p) + return + end if + + ! If we reach this line, then we can't yet bracket the true value of x, so we start + ! doing steps of the secant method, either until convergence or until we bracket + ! the root. If we bracket the root then we finish using the ITP method. + x0 = x_guess + f0 = q_guess - quantile + delta_x = max(1e-2_r8, 1e-2_r8 * abs(x_guess)) + if (f0 .gt. 0._r8) then + delta_x = -delta_x + end if + do iter=1,7 + x1 = x_guess + delta_x + if(bounded_below .and. (x1 .lt. lower_bound)) x1 = lower_bound + if(bounded_above .and. (x1 .gt. upper_bound)) x1 = upper_bound + f1 = cdf(x1, p) - quantile + delta_f = abs(f1 - f0) + if (delta_f .le. 0._r8) then + delta_x = 2._r8 * delta_x + else + exit + end if + end do + do iter = 1, max_iterations + ! If f0 * f1 < 0 then x0 and x1 bracket the root, so go to ITP + if (f0 * f1 .lt. 0._r8 ) then + a = min(x0, x1) + b = max(x0, x1) + fa = min(f0, f1) + fb = max(f0, f1) + x = inv_cdf_ITP(cdf, quantile, a, b, fa, fb, max_iterations - iter + 1, p) + return + else ! Do a secant step + delta_f = f1 - f0 + if (abs(delta_f) .eq. 0._r8) then ! Stop. Failed step: Flat CDF + write(errstring, *) 'Failed to converge; flat cdf for quantile ', quantile + write(*,*) p%bounded_below, p%bounded_above + write(*,*) 'More params:', p%more_params(:) + write(*,*) 'iter, x0, x1, f0, f1:', iter, x0, x1, f0, f1 + write(*,*) 'ens:', p%ens(:) + call error_handler(E_MSG, 'rootfinding_mod:inv_cdf', errstring, source) + x = x1 + return + else + x_guess = x1 - f1 * delta_x / delta_f + if (bounded_above .and. (x_guess .gt. upper_bound)) x_guess = upper_bound + if (bounded_below .and. (x_guess .lt. lower_bound)) x_guess = lower_bound + x0 = x1 + f0 = f1 + x1 = x_guess + delta_x = x1 - x0 + f1 = cdf(x1, p) - quantile + x = x1 + end if + end if + + ! Finished secant step. Check for convergence. + if (abs(delta_x) .le. tol * max(abs(x0), abs(x1)) .or. (abs(f1) .le. tol)) then + return + endif + end do + + ! For now, have switched a failed convergence to return the latest guess + ! This has implications for stability of probit algorithms that require further study + ! Not currently happening for any of the test cases on gfortran + write(errstring, *) 'Failed to converge for quantile ', quantile + call error_handler(E_MSG, 'inv_cdf', errstring, source) + !!!call error_handler(E_ERR, 'inv_cdf', errstring, source) + +end function inv_cdf + +!----------------------------------------------------------------------- + +function inv_cdf_ITP(cdf, quantile, a, b, fa, fb, max_iterations, p) result(x) + interface + function cdf(x, p) + use types_mod, only : r8 + use distribution_params_mod, only : distribution_params_type + real(r8) :: cdf + real(r8), intent(in) :: x + type(distribution_params_type), intent(in) :: p + end function + end interface + + real(r8), intent(in) :: quantile + real(r8), intent(inout) :: a, b, fa, fb + integer, intent(in) :: max_iterations + type(distribution_params_type), intent(in) :: p + real(r8) :: x + + ! Given a quantile q, finds the value of x for which cdf(x) is approximately q + ! Uses the bisection-like ITP method from Oliveira & Takahashi, ACM Trans Math + ! Soft, 2020. Here f(x) = cdf(x) - quantile. Assumes f(a) < 0 and f(b) > 0 on + ! input. + + ! Local variables: + integer, parameter :: n0 = 1._r8 + real(r8), parameter :: kappa2 = 2._r8 + real(r8), parameter :: phi = 0.5_r8 * (1._r8 + sqrt(5._r8)) ! Golden ratio + real(r8), parameter :: tol = epsilon(1._r8)**(0.75_r8) ! abs on f, rel on x + real(r8) :: kappa1 + real(r8) :: delta + real(r8) :: x_t, x_f, x_half, f_ITP + real(r8) :: eps, r + integer :: i, n_max, n_half + + kappa1 = 0.2_r8 / (b - a) + eps = tol * max(abs(a), abs(b)) + ! Max number of iterations is either (i) input max, or (ii) number of bisection + ! iterations (plus n0) needed to achieve the desired relative tolerance on x. + n_half = max(0, ceiling(log(0.5_r8 * (b - a) / eps) / log(2._r8))) + n_max = min(max_iterations, n0 + n_half) + + do i=1,n_max + x_half = 0.5_r8 * (a + b) ! Bisection guess + x_f = (b* fa - a * fb) / (fa - fb) ! Secant/Regula-Falsi guess + delta = kappa1 * abs(b - a)**kappa2 + if (delta .le. abs(x_half - x_f)) then + x_t = x_f + sign(delta, x_half - x_f) + else + x_t = x_half + end if + r = max(0._r8, eps * 2**(n_half - i) - 0.5_r8 * (b - a)) + if (abs(x_t - x_half) .le. r) then + x = x_t + else + x = x_half - sign(r, x_half - x_f) + end if + f_ITP = cdf(x, p) - quantile + if (abs(f_ITP) .le. tol) then + return + elseif (f_ITP .gt. 0._r8) then + b = x + fb = f_ITP + else + a = x + fa = f_ITP + end if + end do + +end function inv_cdf_ITP + +!----------------------------------------------------------------------- + +end module rootfinding_mod diff --git a/assimilation_code/programs/test_kde_dist/test_kde_dist.f90 b/assimilation_code/programs/test_kde_dist/test_kde_dist.f90 new file mode 100644 index 0000000000..b190390e1a --- /dev/null +++ b/assimilation_code/programs/test_kde_dist/test_kde_dist.f90 @@ -0,0 +1,8 @@ +program test_kde_mod + use utilities_mod, only : initialize_utilities, finalize_utilities + use kde_distribution_mod, only : test_kde + call initialize_utilities() + call test_kde() + call finalize_utilities() + +end program test_kde_mod \ No newline at end of file diff --git a/assimilation_code/programs/test_kde_dist/test_kde_dist.rst b/assimilation_code/programs/test_kde_dist/test_kde_dist.rst new file mode 100644 index 0000000000..47401fe34a --- /dev/null +++ b/assimilation_code/programs/test_kde_dist/test_kde_dist.rst @@ -0,0 +1 @@ +documentation should be here From d95b6a4127ece64d1e16688cf7e6544d84b6acd1 Mon Sep 17 00:00:00 2001 From: Ian Grooms Date: Fri, 12 Jul 2024 15:00:39 -0600 Subject: [PATCH 02/14] Change names Fixes how the word 'quantile' is used in the code, along with variable names. The output of the inverse cdf (aka the quantile function) is a quantile. The input to the quantile function is a probability. --- .../modules/assimilation/assim_tools_mod.f90 | 37 ++++--- .../assimilation/probit_transform_mod.f90 | 30 ++--- .../modules/assimilation/rootfinding_mod.f90 | 104 +++++++++--------- 3 files changed, 86 insertions(+), 85 deletions(-) diff --git a/assimilation_code/modules/assimilation/assim_tools_mod.f90 b/assimilation_code/modules/assimilation/assim_tools_mod.f90 index 6054c0b9b9..af20abf925 100644 --- a/assimilation_code/modules/assimilation/assim_tools_mod.f90 +++ b/assimilation_code/modules/assimilation/assim_tools_mod.f90 @@ -1204,7 +1204,7 @@ subroutine obs_increment_kde(ens, ens_size, y, obs_param, obs_dist_type, & ! Applies a QCEF based on kernel density estimation & quadrature - real(r8) :: q + real(r8) :: u real(r8) :: p_lower_prior, p_int_prior, p_upper_prior real(r8) :: p_lower_post, p_int_post, p_upper_post real(r8) :: like_ens_mean ! ensemble mean of the likelihood @@ -1225,8 +1225,8 @@ subroutine obs_increment_kde(ens, ens_size, y, obs_param, obs_dist_type, & ! this function and so we are not trying to make it task-count invariant. if(first_inc_ran_call) then call random_seed() - call random_number(q) - i = int(q * 1000) + call random_number(unif) + i = int(unif * 1000) call init_random_seq(inc_ran_seq, my_task_id() + i) first_inc_ran_call = .false. endif @@ -1290,40 +1290,41 @@ subroutine obs_increment_kde(ens, ens_size, y, obs_param, obs_dist_type, & count_upper = 0 unif = random_uniform(inc_ran_seq) / real(ens_size, r8) do i=1,ens_size - ! Map each ensemble member to a quantile using the prior cdf. Members - ! on the boundary get random quantiles. + ! Map each ensemble member to a probability u in [0,1] using the prior cdf. Members + ! on the boundary get random probabilities. if (bounded_below .and. (ens(i) .le. lower_bound)) then ! Rather than draw a random for each member on the boundary, ! I draw one random (above) then add 1/Nb to each subsequent value - q = unif + real(count_lower, r8) / real(ens_size, r8) + u = unif + real(count_lower, r8) / real(ens_size, r8) count_lower = count_lower + 1 - write(*,*) 'lower boundary quantile ', q + write(*,*) 'lower boundary probability ', u elseif (bounded_above .and. (ens(i) .ge. upper_bound)) then ! As above, I draw one random then add 1/Nb to each subsequent value - q = 1._r8 - (unif + real(count_upper, r8) / real(ens_size, r8)) + u = 1._r8 - (unif + real(count_upper, r8) / real(ens_size, r8)) count_upper = count_upper + 1 - write(*,*) 'upper boundary quantile ', q + write(*,*) 'upper boundary probability ', u elseif ((ens_size_interior .eq. 1) .or. (d_max .le. 0._r8)) then - ! Can't use kde with only one ensemble member (or identical ensemble members), so assign a random quantile - q = (p_int_prior * random_uniform(inc_ran_seq)) + p_lower_prior + ! Can't use kde with only one ensemble member (or identical ensemble members), so assign a + ! random probability + u = (p_int_prior * random_uniform(inc_ran_seq)) + p_lower_prior else ! Use the interior cdf obtained using kde. - q = kde_cdf_params(ens(i), params_interior_prior) - q = (p_int_prior * q) + p_lower_prior + u = kde_cdf_params(ens(i), params_interior_prior) + u = (p_int_prior * u) + p_lower_prior end if ! Invert the posterior kde cdf to get the updated ensemble member, then ! subtract to get the increment. - if (q .le. p_lower_post) then ! Posterior value on the lower boundary + if (u .le. p_lower_post) then ! Posterior value on the lower boundary obs_inc(i) = lower_bound - ens(i) - elseif (q .ge. (1._r8 - p_upper_post)) then ! Posterior value on the upper boundary + elseif (u .ge. (1._r8 - p_upper_post)) then ! Posterior value on the upper boundary obs_inc(i) = upper_bound - ens(i) elseif ((ens_size_interior .eq. 1) .or. (d_max .le. 0._r8)) then ! posterior value in the interior, but there's only one prior ensemble member/value ! in the interior, so we just assign the posterior ensemble member equal to the prior one obs_inc(i) = ens_interior(1) - ens(i) else ! posterior value in the interior, can use kde - ! Rescale q to be between 0 and 1 before inverting interior cdf - q = (q - p_lower_post) / p_int_post - obs_inc(i) = inv_kde_cdf_params(q, params_interior_posterior) + ! Rescale u to be between 0 and 1 before inverting interior cdf + u = (u - p_lower_post) / p_int_post + obs_inc(i) = inv_kde_cdf_params(u, params_interior_posterior) obs_inc(i) = obs_inc(i) - ens(i) end if end do diff --git a/assimilation_code/modules/assimilation/probit_transform_mod.f90 b/assimilation_code/modules/assimilation/probit_transform_mod.f90 index 03cf0401b3..c574e11680 100644 --- a/assimilation_code/modules/assimilation/probit_transform_mod.f90 +++ b/assimilation_code/modules/assimilation/probit_transform_mod.f90 @@ -467,7 +467,7 @@ subroutine to_probit_kde(ens_size, state_ens, p, probit_ens, use_input_p, & real(r8), intent(in) :: lower_bound, upper_bound ! local variables - real(r8) :: quantile + real(r8) :: u integer :: i real(r8) :: y = 0._r8 ! Dummy value, not used real(r8) :: obs_param = 1._r8 ! Dummy value, not used @@ -526,21 +526,21 @@ subroutine to_probit_kde(ens_size, state_ens, p, probit_ens, use_input_p, & d_max = 0._r8 endif do i=1,ens_size - ! Map each state_ens member to a quantile using the prior cdf. Members + ! Map each state_ens member to a probability u using the prior cdf. Members ! on the boundary map to p_lower and p_upper. if (bounded_below .and. (state_ens(i) .le. lower_bound + eps)) then - quantile = p_lower + u = p_lower elseif (bounded_above .and. (state_ens(i) .ge. upper_bound - eps)) then - quantile = 1._r8 - p_upper + u = 1._r8 - p_upper elseif ((ens_size_interior .le. 1) .or. (d_max .le. 0._r8)) then ! Can't use kde with only one ensemble member, so assign to middle of interior range - quantile = (p_int * 0.5_r8) + p_lower + u = (p_int * 0.5_r8) + p_lower else ! Use the interior cdf obtained using kde. - quantile = kde_cdf_params(state_ens(i), p_interior) - quantile = (p_int * quantile) + p_lower + u = kde_cdf_params(state_ens(i), p_interior) + u = (p_int * u) + p_lower endif ! Transform to probit/logit space - probit_ens(i) = probit_or_logit_transform(quantile) + probit_ens(i) = probit_or_logit_transform(u) end do end subroutine to_probit_kde @@ -756,7 +756,7 @@ subroutine from_probit_kde(ens_size, probit_ens, p, state_ens) ! local variables integer :: i - real(r8) :: quantile + real(r8) :: u real(r8) :: ens(ens_size) ! Ensemble that defines the transform real(r8) :: ens_interior(ens_size) ! Ensemble members that are not on the boundaries integer :: ens_size_interior ! Number of ensemble members that are not on the boundaries @@ -798,12 +798,12 @@ subroutine from_probit_kde(ens_size, probit_ens, p, state_ens) ! Transform each probit ensemble member back to physical space do i = 1, ens_size - ! First, invert the probit/logit to get quantiles - quantile = inv_probit_or_logit_transform(probit_ens(i)) + ! First, invert the probit/logit to get probabilities u + u = inv_probit_or_logit_transform(probit_ens(i)) ! Next invert the mixture CDF to get the physical space ensemble - if (p%bounded_below .and. (quantile .le. p_lower)) then + if (p%bounded_below .and. (u .le. p_lower)) then state_ens(i) = p%lower_bound - elseif (p%bounded_above .and. (quantile .ge. 1._r8-p_upper)) then + elseif (p%bounded_above .and. (u .ge. 1._r8-p_upper)) then state_ens(i) = p%upper_bound elseif ((ens_size_interior .eq. 1) .or. (d_max .le. 0._r8)) then ! If there is only one interior ensemble member in the prior then any @@ -811,8 +811,8 @@ subroutine from_probit_kde(ens_size, probit_ens, p, state_ens) ! one interior value from the prior. state_ens(i) = ens_interior(1) else - quantile = (quantile - p_lower) / p_int - state_ens(i) = inv_kde_cdf_params(quantile, p_interior) + u = (u - p_lower) / p_int + state_ens(i) = inv_kde_cdf_params(u, p_interior) endif end do diff --git a/assimilation_code/modules/assimilation/rootfinding_mod.f90 b/assimilation_code/modules/assimilation/rootfinding_mod.f90 index 226637bb68..715dece906 100644 --- a/assimilation_code/modules/assimilation/rootfinding_mod.f90 +++ b/assimilation_code/modules/assimilation/rootfinding_mod.f90 @@ -18,7 +18,7 @@ module rootfinding_mod !------------------------------------------------------------------------ -function inv_cdf(quantile_in, cdf, first_guess, p) result(x) +function inv_cdf(cdf_in, cdf, first_guess, p) result(quantile) interface function cdf(x, p) @@ -31,31 +31,32 @@ function cdf(x, p) end interface interface - function first_guess(quantile, p) + function first_guess(u, p) use types_mod, only : r8 use distribution_params_mod, only : distribution_params_type real(r8) :: first_guess - real(r8), intent(in) :: quantile + real(r8), intent(in) :: u type(distribution_params_type), intent(in) :: p end function end interface - real(r8) :: x - real(r8), intent(in) :: quantile_in + real(r8) :: quantile + real(r8), intent(in) :: cdf_in type(distribution_params_type), intent(in) :: p - ! Given a quantile q, finds the value of x for which cdf(x) is approximately q - ! Starts with Newton's method using an approximate gradient until either convergence + ! Given a probability cdf_in, finds the value of x (i.e. the quantile) for which cdf(x) + ! is approximately cdf_in = u. + ! Starts with Newton's method using an approximate derivative until either convergence ! or overshoot. If it overshoots then it switches to the bisection-like ITP method - ! from Oliveira & Takahashi, ACM Trans Math Soft, 2020. Here f(x) = cdf(x) - quantile + ! from Oliveira & Takahashi, ACM Trans Math Soft, 2020. Here f(x) = cdf(x) - cdf_in ! Local variables: integer, parameter :: max_iterations = 50 ! Limit on the total number of iterations. - real(r8), parameter :: min_quantile = 0.0_r8, max_quantile = 0.999999999999999_r8 + real(r8), parameter :: min_probability = 0.0_r8, max_probability = 0.999999999999999_r8 real(r8), parameter :: tol = epsilon(1._r8)**(0.75_r8) ! Absolute on q, relative on x - real(r8) :: quantile, q_err + real(r8) :: u, u_err real(r8) :: delta_x, delta_f - real(r8) :: x_guess, q_guess, x0, x1, f0, f1 + real(r8) :: x_guess, u_guess, x0, x1, f0, f1 real(r8) :: a, b, fa, fb integer :: iter @@ -66,59 +67,59 @@ function first_guess(quantile, p) bounded_below = p%bounded_below; bounded_above = p%bounded_above lower_bound = p%lower_bound; upper_bound = p%upper_bound - quantile = quantile_in + u = cdf_in - ! Do a test for illegal values on the quantile - if(quantile < 0.0_r8 .or. quantile > 1.0_r8) then - write(errstring, *) 'Illegal Quantile input', quantile + ! Do a test for illegal values on the probability u + if(u < 0.0_r8 .or. u > 1.0_r8) then + write(errstring, *) 'Illegal cdf input', u call error_handler(E_ERR, 'inv_cdf', errstring, source) endif - ! If the distribution is bounded, quantiles at the limits have values at the bounds - if(bounded_below .and. quantile == 0.0_r8) then - x = lower_bound + ! If the distribution is bounded, probabilities of 0 and 1 have values at the bounds + if(bounded_below .and. u == 0.0_r8) then + quantile = lower_bound return endif - if(bounded_above .and. quantile == 1.0_r8) then - x = upper_bound + if(bounded_above .and. u == 1.0_r8) then + quantile = upper_bound return endif - ! If input quantiles are outside the numerically supported range, move them to the extremes - quantile = min(quantile, max_quantile) - ! code tests stably for many distributions with min_quantile of 0.0, could remove this - quantile = max(quantile, min_quantile) + ! If input probabilities are outside the numerically supported range, move them to the extremes + u = min(u, max_probability) + ! code tests stably for many distributions with min_probability of 0.0, could remove this + u = max(u, min_probability) ! Get first guess from functional approximation - x_guess = first_guess(quantile, p) + x_guess = first_guess(u, p) ! Evaluate the cdf - q_guess = cdf(x_guess, p) + u_guess = cdf(x_guess, p) ! Check if first guess is close enough - q_err = quantile - q_guess - if (abs(q_err) .le. tol) then - x = x_guess + u_err = u - u_guess + if (abs(u_err) .le. tol) then + quantile = x_guess return end if - ! If bounded below and target quantile is between 0 and q_guess, go to ITP - if (bounded_below .and. (quantile .lt. q_guess)) then + ! If bounded below and target probability is between 0 and u_guess, go to ITP + if (bounded_below .and. (u .lt. u_guess)) then a = lower_bound b = x_guess - fa = 0._r8 - quantile - fb = q_guess - quantile - x = inv_cdf_ITP(cdf, quantile, a, b, fa, fb, max_iterations, p) + fa = 0._r8 - u + fb = u_guess - u + quantile = inv_cdf_ITP(cdf, u, a, b, fa, fb, max_iterations, p) return end if - ! If bounded above and target quantile is between q_guess and 1, go to ITP - if (bounded_above .and. (q_guess .lt. quantile)) then + ! If bounded above and target probability is between u_guess and 1, go to ITP + if (bounded_above .and. (u_guess .lt. u)) then a = x_guess b = upper_bound - fa = q_guess + fa = u_guess fb = 1._r8 - x = inv_cdf_ITP(cdf, quantile, a, b, fa, fb, max_iterations, p) + quantile = inv_cdf_ITP(cdf, u, a, b, fa, fb, max_iterations, p) return end if @@ -126,7 +127,7 @@ function first_guess(quantile, p) ! doing steps of the secant method, either until convergence or until we bracket ! the root. If we bracket the root then we finish using the ITP method. x0 = x_guess - f0 = q_guess - quantile + f0 = u_guess - u delta_x = max(1e-2_r8, 1e-2_r8 * abs(x_guess)) if (f0 .gt. 0._r8) then delta_x = -delta_x @@ -135,7 +136,7 @@ function first_guess(quantile, p) x1 = x_guess + delta_x if(bounded_below .and. (x1 .lt. lower_bound)) x1 = lower_bound if(bounded_above .and. (x1 .gt. upper_bound)) x1 = upper_bound - f1 = cdf(x1, p) - quantile + f1 = cdf(x1, p) - u delta_f = abs(f1 - f0) if (delta_f .le. 0._r8) then delta_x = 2._r8 * delta_x @@ -150,18 +151,18 @@ function first_guess(quantile, p) b = max(x0, x1) fa = min(f0, f1) fb = max(f0, f1) - x = inv_cdf_ITP(cdf, quantile, a, b, fa, fb, max_iterations - iter + 1, p) + quantile = inv_cdf_ITP(cdf, u, a, b, fa, fb, max_iterations - iter + 1, p) return else ! Do a secant step delta_f = f1 - f0 if (abs(delta_f) .eq. 0._r8) then ! Stop. Failed step: Flat CDF - write(errstring, *) 'Failed to converge; flat cdf for quantile ', quantile + write(errstring, *) 'Failed to converge; flat cdf' write(*,*) p%bounded_below, p%bounded_above write(*,*) 'More params:', p%more_params(:) write(*,*) 'iter, x0, x1, f0, f1:', iter, x0, x1, f0, f1 write(*,*) 'ens:', p%ens(:) call error_handler(E_MSG, 'rootfinding_mod:inv_cdf', errstring, source) - x = x1 + quantile = x1 return else x_guess = x1 - f1 * delta_x / delta_f @@ -171,8 +172,8 @@ function first_guess(quantile, p) f0 = f1 x1 = x_guess delta_x = x1 - x0 - f1 = cdf(x1, p) - quantile - x = x1 + f1 = cdf(x1, p) - u + quantile = x1 end if end if @@ -185,7 +186,7 @@ function first_guess(quantile, p) ! For now, have switched a failed convergence to return the latest guess ! This has implications for stability of probit algorithms that require further study ! Not currently happening for any of the test cases on gfortran - write(errstring, *) 'Failed to converge for quantile ', quantile + write(errstring, *) 'Failed to converge for probability ', u call error_handler(E_MSG, 'inv_cdf', errstring, source) !!!call error_handler(E_ERR, 'inv_cdf', errstring, source) @@ -193,7 +194,7 @@ end function inv_cdf !----------------------------------------------------------------------- -function inv_cdf_ITP(cdf, quantile, a, b, fa, fb, max_iterations, p) result(x) +function inv_cdf_ITP(cdf, u, a, b, fa, fb, max_iterations, p) result(x) interface function cdf(x, p) use types_mod, only : r8 @@ -204,16 +205,15 @@ function cdf(x, p) end function end interface - real(r8), intent(in) :: quantile + real(r8), intent(in) :: u real(r8), intent(inout) :: a, b, fa, fb integer, intent(in) :: max_iterations type(distribution_params_type), intent(in) :: p real(r8) :: x - ! Given a quantile q, finds the value of x for which cdf(x) is approximately q + ! Given a probability u, finds the value of x for which cdf(x) is approximately u ! Uses the bisection-like ITP method from Oliveira & Takahashi, ACM Trans Math - ! Soft, 2020. Here f(x) = cdf(x) - quantile. Assumes f(a) < 0 and f(b) > 0 on - ! input. + ! Soft, 2020. Here f(x) = cdf(x) - u. Assumes f(a) < 0 and f(b) > 0 on input. ! Local variables: integer, parameter :: n0 = 1._r8 @@ -248,7 +248,7 @@ function cdf(x, p) else x = x_half - sign(r, x_half - x_f) end if - f_ITP = cdf(x, p) - quantile + f_ITP = cdf(x, p) - u if (abs(f_ITP) .le. tol) then return elseif (f_ITP .gt. 0._r8) then From 11619e2cc22d5bb48fe3b3bf0c494d0006073779 Mon Sep 17 00:00:00 2001 From: Ian Grooms Date: Fri, 19 Jul 2024 15:47:39 -0600 Subject: [PATCH 03/14] Move KDE test --- .../test_kde_dist/test_kde_dist.f90 | 4 ++ .../test_kde_dist/test_kde_dist.rst | 0 developer_tests/test_kde_dist/work/input.nml | 5 +++ .../test_kde_dist/work/quickbuild.sh | 37 +++++++++++++++++++ 4 files changed, 46 insertions(+) rename {assimilation_code/programs => developer_tests}/test_kde_dist/test_kde_dist.f90 (55%) rename {assimilation_code/programs => developer_tests}/test_kde_dist/test_kde_dist.rst (100%) create mode 100644 developer_tests/test_kde_dist/work/input.nml create mode 100755 developer_tests/test_kde_dist/work/quickbuild.sh diff --git a/assimilation_code/programs/test_kde_dist/test_kde_dist.f90 b/developer_tests/test_kde_dist/test_kde_dist.f90 similarity index 55% rename from assimilation_code/programs/test_kde_dist/test_kde_dist.f90 rename to developer_tests/test_kde_dist/test_kde_dist.f90 index b190390e1a..ae071996cc 100644 --- a/assimilation_code/programs/test_kde_dist/test_kde_dist.f90 +++ b/developer_tests/test_kde_dist/test_kde_dist.f90 @@ -1,3 +1,7 @@ +! DART software - Copyright UCAR. This open source software is provided +! by UCAR, "as is", without charge, subject to all terms of use at +! http://www.image.ucar.edu/DAReS/DART/DART_download + program test_kde_mod use utilities_mod, only : initialize_utilities, finalize_utilities use kde_distribution_mod, only : test_kde diff --git a/assimilation_code/programs/test_kde_dist/test_kde_dist.rst b/developer_tests/test_kde_dist/test_kde_dist.rst similarity index 100% rename from assimilation_code/programs/test_kde_dist/test_kde_dist.rst rename to developer_tests/test_kde_dist/test_kde_dist.rst diff --git a/developer_tests/test_kde_dist/work/input.nml b/developer_tests/test_kde_dist/work/input.nml new file mode 100644 index 0000000000..0d3e6e2380 --- /dev/null +++ b/developer_tests/test_kde_dist/work/input.nml @@ -0,0 +1,5 @@ +&utilities_nml + TERMLEVEL = 1, + module_details = .false. + logfilename = 'dart_log.out' + / diff --git a/developer_tests/test_kde_dist/work/quickbuild.sh b/developer_tests/test_kde_dist/work/quickbuild.sh new file mode 100755 index 0000000000..1a31852f1b --- /dev/null +++ b/developer_tests/test_kde_dist/work/quickbuild.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +# DART software - Copyright UCAR. This open source software is provided +# by UCAR, "as is", without charge, subject to all terms of use at +# http://www.image.ucar.edu/DAReS/DART/DART_download + +main() { + + +export DART=$(git rev-parse --show-toplevel) +source "$DART"/build_templates/buildfunctions.sh + +MODEL="none" +EXTRA="$DART"/models/template/threed_model_mod.f90 +dev_test=1 +LOCATION="threed_sphere" +TEST="test_kde_dist" + +serial_programs=( +test_kde_dist +) + +# quickbuild arguments +arguments "$@" + +# clean the directory +\rm -f -- *.o *.mod Makefile .cppdefs + +# build +buildit + +# clean up +\rm -f -- *.o *.mod + +} + +main "$@" From 61ac931ef1b7f20fa599bfae02dffd61830bc8c7 Mon Sep 17 00:00:00 2001 From: Ian Grooms Date: Fri, 19 Jul 2024 17:06:21 -0600 Subject: [PATCH 04/14] Move obs_increment_kde This commit moves the obs_increment_kde subroutine out of assim_tools and into the kde distribution module. It also changes whitespace, adds the copyright header, and fixes minor bugs in the probit mod. --- .../assimilation/algorithm_info_mod.f90 | 4 +- .../modules/assimilation/assim_tools_mod.f90 | 167 +---------------- .../assimilation/distribution_params_mod.f90 | 2 +- .../assimilation/kde_distribution_mod.f90 | 172 +++++++++++++++++- .../modules/assimilation/rootfinding_mod.f90 | 8 + 5 files changed, 187 insertions(+), 166 deletions(-) diff --git a/assimilation_code/modules/assimilation/algorithm_info_mod.f90 b/assimilation_code/modules/assimilation/algorithm_info_mod.f90 index 1eb01207cd..ad095a205b 100644 --- a/assimilation_code/modules/assimilation/algorithm_info_mod.f90 +++ b/assimilation_code/modules/assimilation/algorithm_info_mod.f90 @@ -250,7 +250,7 @@ subroutine read_qceff_table(qceff_table_filename) case ('UNIFORM_DISTRIBUTION') qceff_table_data(row)%probit_state%dist_type = UNIFORM_DISTRIBUTION case ('KDE_DISTRIBUTION') - qceff_table_data(row)%probit_inflation%dist_type = KDE_DISTRIBUTION + qceff_table_data(row)%probit_state%dist_type = KDE_DISTRIBUTION !!!case ('PARTICLE_FILTER_DISTRIBUTION') !!!qceff_table_data(row)%probit_state%dist_type = PARTICLE_FILTER_DISTRIBUTION case default @@ -274,7 +274,7 @@ subroutine read_qceff_table(qceff_table_filename) case ('UNIFORM_DISTRIBUTION') qceff_table_data(row)%probit_extended_state%dist_type = UNIFORM_DISTRIBUTION case ('KDE_DISTRIBUTION') - qceff_table_data(row)%probit_inflation%dist_type = KDE_DISTRIBUTION + qceff_table_data(row)%probit_extended_state%dist_type = KDE_DISTRIBUTION !!!case ('PARTICLE_FILTER_DISTRIBUTION') !!!qceff_table_data(row)%probit_extended_state%dist_type = PARTICLE_FILTER_DISTRIBUTION case default diff --git a/assimilation_code/modules/assimilation/assim_tools_mod.f90 b/assimilation_code/modules/assimilation/assim_tools_mod.f90 index af20abf925..7c25290a45 100644 --- a/assimilation_code/modules/assimilation/assim_tools_mod.f90 +++ b/assimilation_code/modules/assimilation/assim_tools_mod.f90 @@ -86,7 +86,8 @@ module assim_tools_mod use bnrh_distribution_mod, only : inv_bnrh_cdf, bnrh_cdf, inv_bnrh_cdf_like use kde_distribution_mod, only : kde_cdf_params, inv_kde_cdf_params, obs_dist_types, & - pack_kde_params, likelihood_function, separate_ensemble + pack_kde_params, likelihood_function, separate_ensemble, & + obs_increment_kde use distribution_params_mod, only : distribution_params_type, deallocate_distribution_params @@ -890,7 +891,7 @@ end subroutine filter_assim !------------------------------------------------------------- subroutine obs_increment(ens_in, ens_size, obs, obs_var, obs_kind, obs_inc, & - inflate, my_cov_inflate, my_cov_inflate_sd, net_a, obs_dist_type) + inflate, my_cov_inflate, my_cov_inflate_sd, net_a) ! Given the ensemble prior for an observation, the observation, and ! the observation error variance, computes increments and adjusts @@ -903,7 +904,6 @@ subroutine obs_increment(ens_in, ens_size, obs, obs_var, obs_kind, obs_inc, & type(adaptive_inflate_type), intent(inout) :: inflate real(r8), intent(inout) :: my_cov_inflate, my_cov_inflate_sd real(r8), intent(out) :: net_a -integer, optional, intent(in) :: obs_dist_type real(r8) :: ens(ens_size), inflate_inc(ens_size) real(r8) :: prior_mean, prior_var, new_val(ens_size) @@ -1031,24 +1031,8 @@ subroutine obs_increment(ens_in, ens_size, obs, obs_var, obs_kind, obs_inc, & !-------------------------------------------------------------------------- else if(filter_kind == KERNEL_QCEF) then - ! obs_dist_type is an integer whose value is provided by kde_distribution_mod - ! Options are uninformative, normal, binomial, gamma, - ! inv_gamma, lognormal, and truncated_normal. Details can be found in the - ! definition of the likelihood function in kde_distribution_mod.f90. Different - ! types of observations have different interpretations for `obs_var'. - if (.not. present(obs_dist_type)) then -! call error_handler(E_ERR,'obs_increment', & -! 'For kde filter, must specify obs_dist_type', source) - ! IG: Hack. Assume that the obs dist type is truncated normal with the same - ! bounds as the observed variable. - call obs_increment_kde(ens, ens_size, obs, obs_var, & - obs_dist_types%truncated_normal, bounded_below, & - bounded_above, lower_bound, upper_bound, obs_inc) - else - call obs_increment_kde(ens, ens_size, obs, obs_var, obs_dist_type, & - bounded_below, bounded_above, lower_bound, upper_bound, obs_inc) - end if - + call obs_increment_kde(ens, ens_size, obs, obs_var, bounded_below, & + bounded_above, lower_bound, upper_bound, obs_inc) else call error_handler(E_ERR,'obs_increment', & 'Illegal value of filter_kind', source) @@ -1190,147 +1174,6 @@ subroutine obs_increment_bounded_norm_rhf(ens, ens_like, ens_size, prior_var, & end subroutine obs_increment_bounded_norm_rhf - -subroutine obs_increment_kde(ens, ens_size, y, obs_param, obs_dist_type, & - bounded_below, bounded_above, lower_bound, upper_bound, obs_inc) - integer, intent(in) :: ens_size - real(r8), intent(in) :: ens(ens_size) - real(r8), intent(in) :: y - real(r8), intent(in) :: obs_param - integer, intent(in) :: obs_dist_type - logical, intent(in) :: bounded_below, bounded_above - real(r8), intent(in) :: lower_bound, upper_bound - real(r8), intent(out) :: obs_inc(ens_size) - - ! Applies a QCEF based on kernel density estimation & quadrature - - real(r8) :: u - real(r8) :: p_lower_prior, p_int_prior, p_upper_prior - real(r8) :: p_lower_post, p_int_post, p_upper_post - real(r8) :: like_ens_mean ! ensemble mean of the likelihood - real(r8) :: ens_interior(ens_size) - real(r8) :: unif - real(r8) :: d(ens_size), d_max - type(distribution_params_type) :: params_interior_prior - type(distribution_params_type) :: params_interior_posterior - integer :: ens_size_interior - integer :: i, count_lower, count_upper - - ! If this is first time through, need to initialize the random sequence. - ! This will reproduce exactly for multiple runs with the same task count, - ! but WILL NOT reproduce for a different number of MPI tasks. - ! To make it independent of the number of MPI tasks, it would need to - ! use the global ensemble number or something else that remains constant - ! as the processor count changes. this is not currently an argument to - ! this function and so we are not trying to make it task-count invariant. - if(first_inc_ran_call) then - call random_seed() - call random_number(unif) - i = int(unif * 1000) - call init_random_seq(inc_ran_seq, my_task_id() + i) - first_inc_ran_call = .false. - endif - - ! If all ensemble members are identical, then there is no update - d(:) = abs( ens(:) - ens(1) ) - d_max = maxval(d) - if(d_max .le. 0.0_r8) then - obs_inc(:) = 0._r8 - return - endif - - ! Get mixture component probabilities for the prior, and the interior ensemble, and its params - call separate_ensemble(ens, ens_size, bounded_below, bounded_above, & - lower_bound, upper_bound, ens_interior, ens_size_interior, & - p_lower_prior, p_int_prior, p_upper_prior) - if (ens_size_interior .gt. 1) then - d(1:ens_size_interior) = abs( ens_interior(1:ens_size_interior) - ens_interior(1) ) - d_max = maxval(d(1:ens_size_interior)) - call pack_kde_params(ens_size_interior, bounded_below, bounded_above, lower_bound, upper_bound, & - ens_interior, y, obs_param, obs_dist_types%uninformative, & - params_interior_prior) - call pack_kde_params(ens_size_interior, bounded_below, bounded_above, lower_bound, upper_bound, & - ens_interior, y, obs_param, obs_dist_type, & - params_interior_posterior) - else - d_max = 0._r8 - endif - - ! Get mixture component probabilities for the posterior - if (bounded_below) then - p_lower_post = p_lower_prior * likelihood_function(lower_bound, y, obs_param, obs_dist_type) - else - p_lower_post = 0._r8 - end if - if (bounded_above) then - p_upper_post = p_upper_prior * likelihood_function(upper_bound, y, obs_param, obs_dist_type) - else - p_upper_post = 0._r8 - end if - p_int_post = 0._r8 - do i=1,ens_size_interior - p_int_post = p_int_post + likelihood_function(ens_interior(i), y, obs_param, obs_dist_type) - end do - p_int_post = p_int_prior * p_int_post / real(ens_size_interior, r8) - ! Get prior ensemble mean of likelihood - like_ens_mean = p_lower_post + p_int_post + p_upper_post - ! If likelihood underflow, assume flat likelihood, so no increments - if(like_ens_mean .le. 0.0_r8) then - obs_inc(:) = 0.0_r8 - return - else ! Finish getting mixture component probabilities for the posterior - p_lower_post = p_lower_post / like_ens_mean - p_upper_post = p_upper_post / like_ens_mean - p_int_post = 1._r8 - p_lower_post - p_upper_post - end if - - ! Update ensemble members -> get observation increments - - count_lower = 0 - count_upper = 0 - unif = random_uniform(inc_ran_seq) / real(ens_size, r8) - do i=1,ens_size - ! Map each ensemble member to a probability u in [0,1] using the prior cdf. Members - ! on the boundary get random probabilities. - if (bounded_below .and. (ens(i) .le. lower_bound)) then - ! Rather than draw a random for each member on the boundary, - ! I draw one random (above) then add 1/Nb to each subsequent value - u = unif + real(count_lower, r8) / real(ens_size, r8) - count_lower = count_lower + 1 - write(*,*) 'lower boundary probability ', u - elseif (bounded_above .and. (ens(i) .ge. upper_bound)) then - ! As above, I draw one random then add 1/Nb to each subsequent value - u = 1._r8 - (unif + real(count_upper, r8) / real(ens_size, r8)) - count_upper = count_upper + 1 - write(*,*) 'upper boundary probability ', u - elseif ((ens_size_interior .eq. 1) .or. (d_max .le. 0._r8)) then - ! Can't use kde with only one ensemble member (or identical ensemble members), so assign a - ! random probability - u = (p_int_prior * random_uniform(inc_ran_seq)) + p_lower_prior - else ! Use the interior cdf obtained using kde. - u = kde_cdf_params(ens(i), params_interior_prior) - u = (p_int_prior * u) + p_lower_prior - end if - ! Invert the posterior kde cdf to get the updated ensemble member, then - ! subtract to get the increment. - if (u .le. p_lower_post) then ! Posterior value on the lower boundary - obs_inc(i) = lower_bound - ens(i) - elseif (u .ge. (1._r8 - p_upper_post)) then ! Posterior value on the upper boundary - obs_inc(i) = upper_bound - ens(i) - elseif ((ens_size_interior .eq. 1) .or. (d_max .le. 0._r8)) then - ! posterior value in the interior, but there's only one prior ensemble member/value - ! in the interior, so we just assign the posterior ensemble member equal to the prior one - obs_inc(i) = ens_interior(1) - ens(i) - else ! posterior value in the interior, can use kde - ! Rescale u to be between 0 and 1 before inverting interior cdf - u = (u - p_lower_post) / p_int_post - obs_inc(i) = inv_kde_cdf_params(u, params_interior_posterior) - obs_inc(i) = obs_inc(i) - ens(i) - end if - end do - -end subroutine obs_increment_kde - ! Computes a normal or truncated normal (above and/or below) likelihood. function get_truncated_normal_like(x, obs, obs_var, & bounded_below, bounded_above, lower_bound, upper_bound) diff --git a/assimilation_code/modules/assimilation/distribution_params_mod.f90 b/assimilation_code/modules/assimilation/distribution_params_mod.f90 index 43849f7fbd..0507810780 100644 --- a/assimilation_code/modules/assimilation/distribution_params_mod.f90 +++ b/assimilation_code/modules/assimilation/distribution_params_mod.f90 @@ -25,7 +25,7 @@ module distribution_params_mod integer, parameter :: LOG_NORMAL_DISTRIBUTION = 5 integer, parameter :: UNIFORM_DISTRIBUTION = 6 integer, parameter :: PARTICLE_FILTER_DISTRIBUTION = 7 -integer, parameter :: KDE_DISTRIBUTION = 8 +integer, parameter :: KDE_DISTRIBUTION = 8 public :: distribution_params_type, deallocate_distribution_params, & NORMAL_DISTRIBUTION, BOUNDED_NORMAL_RH_DISTRIBUTION, GAMMA_DISTRIBUTION, BETA_DISTRIBUTION, & diff --git a/assimilation_code/modules/assimilation/kde_distribution_mod.f90 b/assimilation_code/modules/assimilation/kde_distribution_mod.f90 index 0b1ef1a042..7386b918b5 100644 --- a/assimilation_code/modules/assimilation/kde_distribution_mod.f90 +++ b/assimilation_code/modules/assimilation/kde_distribution_mod.f90 @@ -1,3 +1,11 @@ +! DART software - Copyright UCAR. This open source software is provided +! by UCAR, "as is", without charge, subject to all terms of use at +! http://www.image.ucar.edu/DAReS/DART/DART_download + +! This module originally written by I. Grooms and described (briefly) +! in "A Quantile-Conserving Ensemble Filter Based on Kernel-Density Estimation" +! by Grooms & Riedel (Remote Sensing, 2024; DOI:10.3390/rs16132377). + module kde_distribution_mod use types_mod, only : r8, missing_r8 @@ -6,6 +14,11 @@ module kde_distribution_mod use sort_mod, only : sort +use random_seq_mod, only : random_seq_type, random_gaussian, init_random_seq, & + random_uniform + +use mpi_utilities_mod, only : my_task_id + use distribution_params_mod, only : distribution_params_type, deallocate_distribution_params, & KDE_DISTRIBUTION @@ -18,7 +31,7 @@ module kde_distribution_mod public :: kde_cdf, kde_cdf_params, inv_kde_cdf, inv_kde_cdf_params, & test_kde, obs_dist_types, likelihood_function, pack_kde_params, & - separate_ensemble + separate_ensemble, obs_increment_kde type available_obs_dist_types integer :: uninformative, normal, binomial, gamma, & @@ -31,6 +44,10 @@ module kde_distribution_mod character(len=512) :: errstring character(len=*), parameter :: source = 'kde_distribution_mod.f90' +! True if random sequence needs to be initialized +logical :: first_inc_ran_call = .true. +type (random_seq_type) :: inc_ran_seq + contains !--------------------------------------------------------------------------- @@ -745,6 +762,159 @@ end subroutine separate_ensemble !----------------------------------------------------------------------- +subroutine obs_increment_kde(ens, ens_size, y, obs_param, bounded_below, & + bounded_above, lower_bound, upper_bound, obs_inc) + integer, intent(in) :: ens_size + real(r8), intent(in) :: ens(ens_size) + real(r8), intent(in) :: y + real(r8), intent(in) :: obs_param + logical, intent(in) :: bounded_below, bounded_above + real(r8), intent(in) :: lower_bound, upper_bound + real(r8), intent(out) :: obs_inc(ens_size) + + ! Applies a QCEF based on kernel density estimation & quadrature + ! Assumes a truncated normal likelihood whose bounds are the + ! same as the state variable. If the state variable is unbounded + ! then it uses a normal likelihood. Obviously this assumtion is + ! limiting. It could eventually be replaced with logic based on + ! obs_kind, which would be added as an input argument. + + real(r8) :: u + real(r8) :: p_lower_prior, p_int_prior, p_upper_prior + real(r8) :: p_lower_post, p_int_post, p_upper_post + real(r8) :: like_ens_mean ! ensemble mean of the likelihood + real(r8) :: ens_interior(ens_size) + real(r8) :: unif + real(r8) :: d(ens_size), d_max + type(distribution_params_type) :: params_interior_prior + type(distribution_params_type) :: params_interior_posterior + integer :: ens_size_interior + integer :: i, count_lower, count_upper + integer :: obs_dist_type + + if (bounded_below .or. bounded_above) then + obs_dist_type = obs_dist_types%truncated_normal + else + obs_dist_type = obs_dist_types%normal + endif + + ! If this is first time through, need to initialize the random sequence. + ! This will reproduce exactly for multiple runs with the same task count, + ! but WILL NOT reproduce for a different number of MPI tasks. + ! To make it independent of the number of MPI tasks, it would need to + ! use the global ensemble number or something else that remains constant + ! as the processor count changes. this is not currently an argument to + ! this function and so we are not trying to make it task-count invariant. + if (first_inc_ran_call) then + call random_seed() + call random_number(unif) + i = int(unif * 1000) + call init_random_seq(inc_ran_seq, my_task_id() + i) + first_inc_ran_call = .false. + endif + + ! If all ensemble members are identical, then there is no update + d(:) = abs( ens(:) - ens(1) ) + d_max = maxval(d) + if(d_max .le. 0.0_r8) then + obs_inc(:) = 0._r8 + return + endif + + ! Get mixture component probabilities for the prior, and the interior ensemble, and its params + call separate_ensemble(ens, ens_size, bounded_below, bounded_above, & + lower_bound, upper_bound, ens_interior, ens_size_interior, & + p_lower_prior, p_int_prior, p_upper_prior) + if (ens_size_interior .gt. 1) then + d(1:ens_size_interior) = abs( ens_interior(1:ens_size_interior) - ens_interior(1) ) + d_max = maxval(d(1:ens_size_interior)) + call pack_kde_params(ens_size_interior, bounded_below, bounded_above, lower_bound, upper_bound, & + ens_interior, y, obs_param, obs_dist_types%uninformative, & + params_interior_prior) + call pack_kde_params(ens_size_interior, bounded_below, bounded_above, lower_bound, upper_bound, & + ens_interior, y, obs_param, obs_dist_type, & + params_interior_posterior) + else + d_max = 0._r8 + endif + + ! Get mixture component probabilities for the posterior + if (bounded_below) then + p_lower_post = p_lower_prior * likelihood_function(lower_bound, y, obs_param, obs_dist_type) + else + p_lower_post = 0._r8 + end if + if (bounded_above) then + p_upper_post = p_upper_prior * likelihood_function(upper_bound, y, obs_param, obs_dist_type) + else + p_upper_post = 0._r8 + end if + p_int_post = 0._r8 + do i=1,ens_size_interior + p_int_post = p_int_post + likelihood_function(ens_interior(i), y, obs_param, obs_dist_type) + end do + p_int_post = p_int_prior * p_int_post / real(ens_size_interior, r8) + ! Get prior ensemble mean of likelihood + like_ens_mean = p_lower_post + p_int_post + p_upper_post + ! If likelihood underflow, assume flat likelihood, so no increments + if(like_ens_mean .le. 0.0_r8) then + obs_inc(:) = 0.0_r8 + return + else ! Finish getting mixture component probabilities for the posterior + p_lower_post = p_lower_post / like_ens_mean + p_upper_post = p_upper_post / like_ens_mean + p_int_post = 1._r8 - p_lower_post - p_upper_post + end if + + ! Update ensemble members -> get observation increments + + count_lower = 0 + count_upper = 0 + unif = random_uniform(inc_ran_seq) / real(ens_size, r8) + do i=1,ens_size + ! Map each ensemble member to a probability u in [0,1] using the prior cdf. Members + ! on the boundary get random probabilities. + if (bounded_below .and. (ens(i) .le. lower_bound)) then + ! Rather than draw a random for each member on the boundary, + ! I draw one random (above) then add 1/Nb to each subsequent value + u = unif + real(count_lower, r8) / real(ens_size, r8) + count_lower = count_lower + 1 + write(*,*) 'lower boundary probability ', u + elseif (bounded_above .and. (ens(i) .ge. upper_bound)) then + ! As above, I draw one random then add 1/Nb to each subsequent value + u = 1._r8 - (unif + real(count_upper, r8) / real(ens_size, r8)) + count_upper = count_upper + 1 + write(*,*) 'upper boundary probability ', u + elseif ((ens_size_interior .eq. 1) .or. (d_max .le. 0._r8)) then + ! Can't use kde with only one ensemble member (or identical ensemble members), so assign a + ! random probability + u = (p_int_prior * random_uniform(inc_ran_seq)) + p_lower_prior + else ! Use the interior cdf obtained using kde. + u = kde_cdf_params(ens(i), params_interior_prior) + u = (p_int_prior * u) + p_lower_prior + end if + ! Invert the posterior kde cdf to get the updated ensemble member, then + ! subtract to get the increment. + if (u .le. p_lower_post) then ! Posterior value on the lower boundary + obs_inc(i) = lower_bound - ens(i) + elseif (u .ge. (1._r8 - p_upper_post)) then ! Posterior value on the upper boundary + obs_inc(i) = upper_bound - ens(i) + elseif ((ens_size_interior .eq. 1) .or. (d_max .le. 0._r8)) then + ! posterior value in the interior, but there's only one prior ensemble member/value + ! in the interior, so we just assign the posterior ensemble member equal to the prior one + obs_inc(i) = ens_interior(1) - ens(i) + else ! posterior value in the interior, can use kde + ! Rescale u to be between 0 and 1 before inverting interior cdf + u = (u - p_lower_post) / p_int_post + obs_inc(i) = inv_kde_cdf_params(u, params_interior_posterior) + obs_inc(i) = obs_inc(i) - ens(i) + end if + end do + +end subroutine obs_increment_kde + +!----------------------------------------------------------------------- + subroutine test_kde ! This routine provides limited tests of the numerics in this module. It tests ! the boundary correction function, the likelihood, the bandwidth selection, diff --git a/assimilation_code/modules/assimilation/rootfinding_mod.f90 b/assimilation_code/modules/assimilation/rootfinding_mod.f90 index 715dece906..34c0a9cf75 100644 --- a/assimilation_code/modules/assimilation/rootfinding_mod.f90 +++ b/assimilation_code/modules/assimilation/rootfinding_mod.f90 @@ -1,3 +1,11 @@ +! DART software - Copyright UCAR. This open source software is provided +! by UCAR, "as is", without charge, subject to all terms of use at +! http://www.image.ucar.edu/DAReS/DART/DART_download + +! This module originally written by I. Grooms and described (briefly) +! in "A Quantile-Conserving Ensemble Filter Based on Kernel-Density Estimation" +! by Grooms & Riedel (Remote Sensing, 2024; DOI:10.3390/rs16132377). + module rootfinding_mod use types_mod, only : r8 From 212ff4a3fb6b230d73c70c5d9cfefa0e1181b29c Mon Sep 17 00:00:00 2001 From: Ian Grooms Date: Tue, 23 Jul 2024 16:18:56 -0600 Subject: [PATCH 05/14] Add KQCEF to docs --- guide/qceff_probit.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/guide/qceff_probit.rst b/guide/qceff_probit.rst index dddd4fbf7f..e7bfa4ce72 100644 --- a/guide/qceff_probit.rst +++ b/guide/qceff_probit.rst @@ -119,6 +119,7 @@ Available filter kinds * UNBOUNDED_RHF * GAMMA_FILTER * BOUNDED_NORMAL_RHF + * KERNEL_QCEF .. _Distributions: @@ -131,7 +132,7 @@ Available distributions * BETA_DISTRIBUTION * LOG_NORMAL_DISTRIBUTION * UNIFORM_DISTRIBUTION - + * KDE_DISTRIBUTION .. _Default values: From 6990fa999d89e5197f63ad74d0a630a503501e92 Mon Sep 17 00:00:00 2001 From: Ian Grooms Date: Wed, 24 Jul 2024 16:39:53 -0600 Subject: [PATCH 06/14] Error mesage cleanup Based on suggestions by @hkershaw-brown, I changed some errors from messages to fatal errors. Also replaced the case where the observation distribution is truncated-normal and the observation error variance is non-positive with a fatal error. --- .../modules/assimilation/assim_tools_mod.f90 | 2 +- .../modules/assimilation/kde_distribution_mod.f90 | 14 +++++--------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/assimilation_code/modules/assimilation/assim_tools_mod.f90 b/assimilation_code/modules/assimilation/assim_tools_mod.f90 index 7c25290a45..55a79b867c 100644 --- a/assimilation_code/modules/assimilation/assim_tools_mod.f90 +++ b/assimilation_code/modules/assimilation/assim_tools_mod.f90 @@ -1034,7 +1034,7 @@ subroutine obs_increment(ens_in, ens_size, obs, obs_var, obs_kind, obs_inc, & call obs_increment_kde(ens, ens_size, obs, obs_var, bounded_below, & bounded_above, lower_bound, upper_bound, obs_inc) else - call error_handler(E_ERR,'obs_increment', & + call error_handler(E_ERR,'obs_increment', & 'Illegal value of filter_kind', source) endif endif diff --git a/assimilation_code/modules/assimilation/kde_distribution_mod.f90 b/assimilation_code/modules/assimilation/kde_distribution_mod.f90 index 7386b918b5..402da5fd37 100644 --- a/assimilation_code/modules/assimilation/kde_distribution_mod.f90 +++ b/assimilation_code/modules/assimilation/kde_distribution_mod.f90 @@ -8,7 +8,7 @@ module kde_distribution_mod -use types_mod, only : r8, missing_r8 +use types_mod, only : r8, MISSING_R8 use utilities_mod, only : E_ERR, E_MSG, error_handler @@ -82,7 +82,7 @@ function likelihood_function(x, y, obs_param, obs_dist_type, & ! For a normal obs distribution, like_param is the obs error variance if (obs_param <= 0._r8) then write(errstring, *) 'obs error sd <= 0', obs_param - call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + call error_handler(E_ERR, 'kde_distribution_mod:likelihood', errstring, source) else l = exp( -0.5_r8 * (x - y)**2 / obs_param ) end if @@ -166,12 +166,8 @@ function likelihood_function(x, y, obs_param, obs_dist_type, & ! sigma * sqrt(2 * pi) is ignored. ! A zero observation error variance is a degenerate case if(obs_param <= 0.0_r8) then - if(x == y) then - l = 1.0_r8 - else - l = 0.0_r8 - endif - return + write(errstring, *) 'obs error sd <= 0', obs_param + call error_handler(E_ERR, 'kde_distribution_mod:likelihood', errstring, source) endif cdf = [0._r8, 1._r8] @@ -193,7 +189,7 @@ function likelihood_function(x, y, obs_param, obs_dist_type, & l = exp( -0.5_r8 * (x - y)**2 / obs_param ) / (cdf(2) - cdf(1)) case DEFAULT write(errstring, *) 'likelihood called with unrecognized obs_dist_type ', obs_dist_type - call error_handler(E_MSG, 'kde_distribution_mod:likelihood_function', errstring, source) + call error_handler(E_ERR, 'kde_distribution_mod:likelihood_function', errstring, source) end select end function likelihood_function From 3d48c4fca5f3d8a57cb6293334d824afbd65b117 Mon Sep 17 00:00:00 2001 From: Ian Grooms Date: Wed, 24 Jul 2024 18:01:51 -0600 Subject: [PATCH 07/14] Code cleanup Many minor changes following suggestions from @hkershaw-brown. I also removed an unused parameter in rootfinding.mod --- .../assimilation/kde_distribution_mod.f90 | 128 ++++++++++++++---- .../assimilation/probit_transform_mod.f90 | 5 +- .../modules/assimilation/rootfinding_mod.f90 | 45 +++--- .../test_kde_dist/test_kde_dist.f90 | 2 +- 4 files changed, 130 insertions(+), 50 deletions(-) diff --git a/assimilation_code/modules/assimilation/kde_distribution_mod.f90 b/assimilation_code/modules/assimilation/kde_distribution_mod.f90 index 402da5fd37..7afa415c7b 100644 --- a/assimilation_code/modules/assimilation/kde_distribution_mod.f90 +++ b/assimilation_code/modules/assimilation/kde_distribution_mod.f90 @@ -118,7 +118,7 @@ function likelihood_function(x, y, obs_param, obs_dist_type, & else gamma_shape = 1._r8 / obs_param ! k gamma_scale = x * obs_param ! theta - if (x .eq. 0._r8) then + if (x == 0._r8) then l = 0._r8 ! Technically x must be > 0, but just setting l=0 is convenient. else l = (y / gamma_scale)**gamma_shape * exp(-y / gamma_scale) @@ -196,7 +196,7 @@ end function likelihood_function !--------------------------------------------------------------------------- -elemental function epanechnikov_kernel(x) ! The fact that this is elemental is not (yet) used +elemental function epanechnikov_kernel(x) real(r8) :: epanechnikov_kernel real(r8), intent(in) :: x real(r8), parameter :: norm_const = 3._r8 / 4._r8 @@ -206,7 +206,7 @@ end function epanechnikov_kernel !--------------------------------------------------------------------------- -elemental function epanechnikov_cdf(x) ! The fact that this is elemental is not (yet) used +elemental function epanechnikov_cdf(x) real(r8) :: epanechnikov_cdf real(r8), intent(in) :: x real(r8), parameter :: norm_const = 1._r8 / 4._r8 @@ -227,7 +227,6 @@ elemental subroutine boundary_correction(x, lx, mx) real(r8), intent(out) :: lx, mx ! Boundary correction for kde, from Jones (1993) and Jones & Foster (1996) - ! The fact that this is elemental is not (yet) used real(r8) :: denom @@ -717,10 +716,6 @@ subroutine separate_ensemble(ens, ens_size, bounded_below, bounded_above, & ! local variables integer :: i, j logical :: is_interior(ens_size) - real(r8), parameter :: eps = 0._r8 !epsilon(1._r8) - - ! Initialize - ens_interior(:) = missing_r8 q_lower = 0._r8 q_int = 0._r8 @@ -729,13 +724,13 @@ subroutine separate_ensemble(ens, ens_size, bounded_below, bounded_above, & j = 0 do i=1,ens_size if (bounded_below) then - if (ens(i) <= lower_bound + eps) then + if (ens(i) <= lower_bound) then q_lower = q_lower + 1._r8 is_interior(i) = .false. end if end if if (bounded_above) then - if (ens(i) >= upper_bound - eps) then + if (ens(i) >= upper_bound) then q_upper = q_upper + 1._r8 is_interior(i) = .false. end if @@ -797,10 +792,6 @@ subroutine obs_increment_kde(ens, ens_size, y, obs_param, bounded_below, & ! If this is first time through, need to initialize the random sequence. ! This will reproduce exactly for multiple runs with the same task count, ! but WILL NOT reproduce for a different number of MPI tasks. - ! To make it independent of the number of MPI tasks, it would need to - ! use the global ensemble number or something else that remains constant - ! as the processor count changes. this is not currently an argument to - ! this function and so we are not trying to make it task-count invariant. if (first_inc_ran_call) then call random_seed() call random_number(unif) @@ -933,6 +924,11 @@ subroutine test_kde write(*, *) 'abs difference in lx is ', abs(1.322997416020672_r8 - lx) write(*, *) 'abs difference in mx is ', abs(1.102497846683893_r8 - mx) write(*, *) 'abs differences should be less than 1e-15' + if ((abs(1.322997416020672_r8 - lx) > 1E-15) .or. (abs(1.102497846683893_r8 - mx) > 1E-15)) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif ! Test uninformative likelihood max_diff = -1.0_r8 @@ -948,6 +944,11 @@ subroutine test_kde write(*, *) 'Uninformative likelihood test' write(*, *) 'max difference in likelihood is ', max_diff write(*, *) 'max difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif ! Test normal likelihood x = 0.5_r8 @@ -955,10 +956,16 @@ subroutine test_kde obs_param = 1._r8 obs_dist_type = obs_dist_types%normal like = likelihood_function(x, y, obs_param, obs_dist_type) + max_diff = abs(0.8824969025845955_r8 - like) write(*, *) '----------------------------' write(*, *) 'Normal likelihood test' - write(*, *) 'abs difference in likelihood is ', abs(0.8824969025845955_r8 - like) + write(*, *) 'abs difference in likelihood is ', max_diff write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif ! Test binomial obs distribution (beta likelihood) x = 0.25_r8 @@ -966,10 +973,16 @@ subroutine test_kde obs_param = 5._r8 obs_dist_type = obs_dist_types%binomial like = likelihood_function(x, y, obs_param, obs_dist_type) + max_diff = abs(0.0087890625_r8 - like) write(*, *) '----------------------------' write(*, *) 'Binomial obs distribution test' - write(*, *) 'abs difference in likelihood is ', abs(0.0087890625_r8 - like) + write(*, *) 'abs difference in likelihood is ', max_diff write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif ! Test gamma obs distribution (inverse gamma likelihood) y = 1._r8 @@ -977,10 +990,16 @@ subroutine test_kde obs_param = 3._r8 obs_dist_type = obs_dist_types%gamma like = likelihood_function(x, y, obs_param, obs_dist_type) + max_diff = abs(0.4658368455179406_r8 - like) write(*, *) '----------------------------' write(*, *) 'Gamma obs distribution test' - write(*, *) 'abs difference in likelihood is ', abs(0.4658368455179406_r8 - like) + write(*, *) 'abs difference in likelihood is ', max_diff write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif ! Test inverse gamma obs distribution (gamma likelihood) y = 3._r8 @@ -988,10 +1007,16 @@ subroutine test_kde obs_param = 4._r8 obs_dist_type = obs_dist_types%inv_gamma like = likelihood_function(x, y, obs_param, obs_dist_type) + max_diff = abs(3.415489474106968_r8 - like) write(*, *) '----------------------------' write(*, *) 'Inverse gamma obs distribution test' - write(*, *) 'abs difference in likelihood is ', abs(3.415489474106968_r8 - like) + write(*, *) 'abs difference in likelihood is ', max_diff write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif ! Test lognormal obs distribution (normal likelihood) y = 1._r8 @@ -999,10 +1024,16 @@ subroutine test_kde obs_param = 4._r8 obs_dist_type = obs_dist_types%lognormal like = likelihood_function(x, y, obs_param, obs_dist_type) + max_diff = abs(0.6065306597126334_r8 - like) write(*, *) '----------------------------' write(*, *) 'Lognormal obs distribution test' - write(*, *) 'abs difference in likelihood is ', abs(0.6065306597126334_r8 - like) + write(*, *) 'abs difference in likelihood is ', max_diff write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif ! Test truncated-normal obs distribution y = 1._r8 @@ -1011,22 +1042,40 @@ subroutine test_kde obs_dist_type = obs_dist_types%truncated_normal like = likelihood_function(x, y, obs_param, obs_dist_type, & bounded_above=.true.,bounded_below=.true.,upper_bound=4._r8,lower_bound=0._r8) + max_diff = abs(1.292676850528392_r8 - like) write(*, *) '----------------------------' write(*, *) 'Truncated-normal obs distribution test, doubly-bounded' - write(*, *) 'abs difference in likelihood is ', abs(1.292676850528392_r8 - like) + write(*, *) 'abs difference in likelihood is ', max_diff write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif like = likelihood_function(x, y, obs_param, obs_dist_type, & bounded_above=.false.,bounded_below=.true.,upper_bound=4._r8,lower_bound=0._r8) + max_diff = abs(1.048912359301403_r8 - like) write(*, *) '----------------------------' write(*, *) 'Truncated-normal obs distribution test, bounded below' - write(*, *) 'abs difference in likelihood is ', abs(1.048912359301403_r8 - like) + write(*, *) 'abs difference in likelihood is ', max_diff write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif like = likelihood_function(x, y, obs_param, obs_dist_type, & bounded_above=.true.,bounded_below=.false.,upper_bound=4._r8,lower_bound=0._r8) + max_diff = abs(1.048912359301403_r8 - like) write(*, *) '----------------------------' write(*, *) 'Truncated-normal obs distribution test, bounded above' - write(*, *) 'abs difference in likelihood is ', abs(1.048912359301403_r8 - like) + write(*, *) 'abs difference in likelihood is ', max_diff write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif ! Test bandwidth selection target_bandwidths(:) = 2.462288826689833_r8 * [1._r8, 1._r8] @@ -1038,9 +1087,16 @@ subroutine test_kde ! Compare computed bandwidths to exact write(*, *) '----------------------------' write(*, *) 'kde bandwidths test: Absolute value of differences should be less than 1e-15' + max_diff = 0._r8 do i = 1, ens_size + max_diff = max(abs(bandwidths(i) - target_bandwidths(i)), max_diff) write(*, *) i, abs(bandwidths(i) - target_bandwidths(i)) end do + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif ! Test the inversion of the cdf over the support of the pdf, unbounded write(*, *) '----------------------------' @@ -1062,6 +1118,11 @@ subroutine test_kde ! The accuracy degrades near the boundary. The slope of the cdf goes to zero on the boundary ! But at least the second derivative is nonzero. So near the boundary it behaves like a ! double root, and the accuracy of the rootfinding degrades. + if (max_diff > 1E-8_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif call deallocate_distribution_params(p) @@ -1086,6 +1147,11 @@ subroutine test_kde write(*, *) 'max difference in inversion is ', max_diff write(*, *) 'Errors should be small, max below 1E-8' + if (max_diff > 1E-8_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif call deallocate_distribution_params(p) @@ -1111,6 +1177,11 @@ subroutine test_kde write(*, *) 'max difference in inversion is ', max_diff write(*, *) 'Errors should be small, max below 1E-8' + if (max_diff > 1E-8_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif call deallocate_distribution_params(p) @@ -1135,6 +1206,11 @@ subroutine test_kde write(*, *) 'max difference in inversion is ', max_diff write(*, *) 'Errors should be small, max below 1E-8' + if (max_diff > 1E-8_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif call deallocate_distribution_params(p) @@ -1148,10 +1224,16 @@ subroutine test_kde p%more_params(ens_size + 1) = 1._r8 y = integrate_pdf(0._r8, p) + max_diff = abs(0.5_r8 - y) write(*, *) '----------------------------' write(*, *) 'test quadrature' - write(*, *) 'abs difference is ', abs(0.5_r8 - y) + write(*, *) 'abs difference is ', max_diff write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif call deallocate_distribution_params(p) diff --git a/assimilation_code/modules/assimilation/probit_transform_mod.f90 b/assimilation_code/modules/assimilation/probit_transform_mod.f90 index c574e11680..c5645c15a1 100644 --- a/assimilation_code/modules/assimilation/probit_transform_mod.f90 +++ b/assimilation_code/modules/assimilation/probit_transform_mod.f90 @@ -477,7 +477,6 @@ subroutine to_probit_kde(ens_size, state_ens, p, probit_ens, use_input_p, & type(distribution_params_type) :: p_interior real(r8) :: d(ens_size), d_max real(r8) :: p_lower, p_int, p_upper - real(r8), parameter :: eps = 0._r8 !epsilon(1._r8) ! Get the ensemble that defines the transform if(use_input_p) then @@ -528,9 +527,9 @@ subroutine to_probit_kde(ens_size, state_ens, p, probit_ens, use_input_p, & do i=1,ens_size ! Map each state_ens member to a probability u using the prior cdf. Members ! on the boundary map to p_lower and p_upper. - if (bounded_below .and. (state_ens(i) .le. lower_bound + eps)) then + if (bounded_below .and. (state_ens(i) .le. lower_bound)) then u = p_lower - elseif (bounded_above .and. (state_ens(i) .ge. upper_bound - eps)) then + elseif (bounded_above .and. (state_ens(i) .ge. upper_bound)) then u = 1._r8 - p_upper elseif ((ens_size_interior .le. 1) .or. (d_max .le. 0._r8)) then ! Can't use kde with only one ensemble member, so assign to middle of interior range diff --git a/assimilation_code/modules/assimilation/rootfinding_mod.f90 b/assimilation_code/modules/assimilation/rootfinding_mod.f90 index 34c0a9cf75..b0db8a2c13 100644 --- a/assimilation_code/modules/assimilation/rootfinding_mod.f90 +++ b/assimilation_code/modules/assimilation/rootfinding_mod.f90 @@ -10,7 +10,7 @@ module rootfinding_mod use types_mod, only : r8 -use utilities_mod, only : E_ERR, E_MSG, error_handler +use utilities_mod, only : E_ERR, E_MSG, E_ALLMSG, error_handler use distribution_params_mod, only : distribution_params_type, deallocate_distribution_params @@ -59,9 +59,9 @@ function first_guess(u, p) ! from Oliveira & Takahashi, ACM Trans Math Soft, 2020. Here f(x) = cdf(x) - cdf_in ! Local variables: - integer, parameter :: max_iterations = 50 ! Limit on the total number of iterations. - real(r8), parameter :: min_probability = 0.0_r8, max_probability = 0.999999999999999_r8 - real(r8), parameter :: tol = epsilon(1._r8)**(0.75_r8) ! Absolute on q, relative on x + integer, parameter :: MAX_ITERATIONS = 50 ! Limit on the total number of iterations. + real(r8), parameter :: MIN_PROBABILITY = 0.0_r8, MAX_PROBABILITY = 0.999999999999999_r8 + real(r8), parameter :: TOL = epsilon(1._r8)**(0.75_r8) ! Absolute on q, relative on x real(r8) :: u, u_err real(r8) :: delta_x, delta_f real(r8) :: x_guess, u_guess, x0, x1, f0, f1 @@ -94,9 +94,9 @@ function first_guess(u, p) endif ! If input probabilities are outside the numerically supported range, move them to the extremes - u = min(u, max_probability) + u = min(u, MAX_PROBABILITY) ! code tests stably for many distributions with min_probability of 0.0, could remove this - u = max(u, min_probability) + u = max(u, MIN_PROBABILITY) ! Get first guess from functional approximation x_guess = first_guess(u, p) @@ -106,7 +106,7 @@ function first_guess(u, p) ! Check if first guess is close enough u_err = u - u_guess - if (abs(u_err) .le. tol) then + if (abs(u_err) .le. TOL) then quantile = x_guess return end if @@ -117,7 +117,7 @@ function first_guess(u, p) b = x_guess fa = 0._r8 - u fb = u_guess - u - quantile = inv_cdf_ITP(cdf, u, a, b, fa, fb, max_iterations, p) + quantile = inv_cdf_ITP(cdf, u, a, b, fa, fb, MAX_ITERATIONS, p) return end if @@ -127,7 +127,7 @@ function first_guess(u, p) b = upper_bound fa = u_guess fb = 1._r8 - quantile = inv_cdf_ITP(cdf, u, a, b, fa, fb, max_iterations, p) + quantile = inv_cdf_ITP(cdf, u, a, b, fa, fb, MAX_ITERATIONS, p) return end if @@ -152,14 +152,14 @@ function first_guess(u, p) exit end if end do - do iter = 1, max_iterations + do iter = 1, MAX_ITERATIONS ! If f0 * f1 < 0 then x0 and x1 bracket the root, so go to ITP if (f0 * f1 .lt. 0._r8 ) then a = min(x0, x1) b = max(x0, x1) fa = min(f0, f1) fb = max(f0, f1) - quantile = inv_cdf_ITP(cdf, u, a, b, fa, fb, max_iterations - iter + 1, p) + quantile = inv_cdf_ITP(cdf, u, a, b, fa, fb, MAX_ITERATIONS - iter + 1, p) return else ! Do a secant step delta_f = f1 - f0 @@ -169,7 +169,7 @@ function first_guess(u, p) write(*,*) 'More params:', p%more_params(:) write(*,*) 'iter, x0, x1, f0, f1:', iter, x0, x1, f0, f1 write(*,*) 'ens:', p%ens(:) - call error_handler(E_MSG, 'rootfinding_mod:inv_cdf', errstring, source) + call error_handler(E_MSG, 'inv_cdf:', errstring, source) quantile = x1 return else @@ -186,7 +186,7 @@ function first_guess(u, p) end if ! Finished secant step. Check for convergence. - if (abs(delta_x) .le. tol * max(abs(x0), abs(x1)) .or. (abs(f1) .le. tol)) then + if (abs(delta_x) .le. TOL * max(abs(x0), abs(x1)) .or. (abs(f1) .le. TOL)) then return endif end do @@ -195,7 +195,7 @@ function first_guess(u, p) ! This has implications for stability of probit algorithms that require further study ! Not currently happening for any of the test cases on gfortran write(errstring, *) 'Failed to converge for probability ', u - call error_handler(E_MSG, 'inv_cdf', errstring, source) + call error_handler(E_ALLMSG, 'inv_cdf', errstring, source) !!!call error_handler(E_ERR, 'inv_cdf', errstring, source) end function inv_cdf @@ -224,10 +224,9 @@ function cdf(x, p) ! Soft, 2020. Here f(x) = cdf(x) - u. Assumes f(a) < 0 and f(b) > 0 on input. ! Local variables: - integer, parameter :: n0 = 1._r8 - real(r8), parameter :: kappa2 = 2._r8 - real(r8), parameter :: phi = 0.5_r8 * (1._r8 + sqrt(5._r8)) ! Golden ratio - real(r8), parameter :: tol = epsilon(1._r8)**(0.75_r8) ! abs on f, rel on x + integer, parameter :: N0 = 1._r8 + real(r8), parameter :: KAPPA2 = 2._r8 + real(r8), parameter :: TOL = epsilon(1._r8)**(0.75_r8) ! abs on f, rel on x real(r8) :: kappa1 real(r8) :: delta real(r8) :: x_t, x_f, x_half, f_ITP @@ -235,16 +234,16 @@ function cdf(x, p) integer :: i, n_max, n_half kappa1 = 0.2_r8 / (b - a) - eps = tol * max(abs(a), abs(b)) + eps = TOL * max(abs(a), abs(b)) ! Max number of iterations is either (i) input max, or (ii) number of bisection - ! iterations (plus n0) needed to achieve the desired relative tolerance on x. + ! iterations (plus N0) needed to achieve the desired relative tolerance on x. n_half = max(0, ceiling(log(0.5_r8 * (b - a) / eps) / log(2._r8))) - n_max = min(max_iterations, n0 + n_half) + n_max = min(max_iterations, N0 + n_half) do i=1,n_max x_half = 0.5_r8 * (a + b) ! Bisection guess x_f = (b* fa - a * fb) / (fa - fb) ! Secant/Regula-Falsi guess - delta = kappa1 * abs(b - a)**kappa2 + delta = kappa1 * abs(b - a)**KAPPA2 if (delta .le. abs(x_half - x_f)) then x_t = x_f + sign(delta, x_half - x_f) else @@ -257,7 +256,7 @@ function cdf(x, p) x = x_half - sign(r, x_half - x_f) end if f_ITP = cdf(x, p) - u - if (abs(f_ITP) .le. tol) then + if (abs(f_ITP) .le. TOL) then return elseif (f_ITP .gt. 0._r8) then b = x diff --git a/developer_tests/test_kde_dist/test_kde_dist.f90 b/developer_tests/test_kde_dist/test_kde_dist.f90 index ae071996cc..8e4d1e9e62 100644 --- a/developer_tests/test_kde_dist/test_kde_dist.f90 +++ b/developer_tests/test_kde_dist/test_kde_dist.f90 @@ -9,4 +9,4 @@ program test_kde_mod call test_kde() call finalize_utilities() -end program test_kde_mod \ No newline at end of file +end program test_kde_mod From 3d3e09981943c2c8a013c1e08c83c9183104b5d7 Mon Sep 17 00:00:00 2001 From: Helen Kershaw Date: Wed, 7 Aug 2024 07:57:07 -0600 Subject: [PATCH 08/14] style: remame test direcory test_kde_dist -> kde_test. This is for convention and for gitignore (program and directory not the same name) --- developer_tests/{test_kde_dist => kde_dist}/test_kde_dist.f90 | 0 developer_tests/{test_kde_dist => kde_dist}/test_kde_dist.rst | 0 developer_tests/{test_kde_dist => kde_dist}/work/input.nml | 0 developer_tests/{test_kde_dist => kde_dist}/work/quickbuild.sh | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename developer_tests/{test_kde_dist => kde_dist}/test_kde_dist.f90 (100%) rename developer_tests/{test_kde_dist => kde_dist}/test_kde_dist.rst (100%) rename developer_tests/{test_kde_dist => kde_dist}/work/input.nml (100%) rename developer_tests/{test_kde_dist => kde_dist}/work/quickbuild.sh (100%) diff --git a/developer_tests/test_kde_dist/test_kde_dist.f90 b/developer_tests/kde_dist/test_kde_dist.f90 similarity index 100% rename from developer_tests/test_kde_dist/test_kde_dist.f90 rename to developer_tests/kde_dist/test_kde_dist.f90 diff --git a/developer_tests/test_kde_dist/test_kde_dist.rst b/developer_tests/kde_dist/test_kde_dist.rst similarity index 100% rename from developer_tests/test_kde_dist/test_kde_dist.rst rename to developer_tests/kde_dist/test_kde_dist.rst diff --git a/developer_tests/test_kde_dist/work/input.nml b/developer_tests/kde_dist/work/input.nml similarity index 100% rename from developer_tests/test_kde_dist/work/input.nml rename to developer_tests/kde_dist/work/input.nml diff --git a/developer_tests/test_kde_dist/work/quickbuild.sh b/developer_tests/kde_dist/work/quickbuild.sh similarity index 100% rename from developer_tests/test_kde_dist/work/quickbuild.sh rename to developer_tests/kde_dist/work/quickbuild.sh From 0e9af1fe02bdeff4a5f182945ebd609fd0fb70a4 Mon Sep 17 00:00:00 2001 From: Helen Kershaw Date: Wed, 7 Aug 2024 08:17:13 -0600 Subject: [PATCH 09/14] fix: Added preprocess to quickbuild.sh so test_kde_dist compiles Updated input.nml for preprocess. Use default term level for errors Added test_kde_dist executable to .gitignore --- .gitignore | 1 + developer_tests/kde_dist/work/input.nml | 11 +++++++++-- developer_tests/kde_dist/work/quickbuild.sh | 7 +++++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 5d1f27c778..f2344a85b9 100644 --- a/.gitignore +++ b/.gitignore @@ -197,6 +197,7 @@ test_quad_irreg_interp test_quad_reg_interp test_table_read test_ran_unif +test_kde_dist # Directories to NOT IGNORE ... same as executable names # as far as I know, these must be listed after the executables diff --git a/developer_tests/kde_dist/work/input.nml b/developer_tests/kde_dist/work/input.nml index 0d3e6e2380..ab9ab5619b 100644 --- a/developer_tests/kde_dist/work/input.nml +++ b/developer_tests/kde_dist/work/input.nml @@ -1,5 +1,12 @@ &utilities_nml - TERMLEVEL = 1, module_details = .false. - logfilename = 'dart_log.out' / + +&preprocess_nml + input_obs_qty_mod_file = '../../../assimilation_code/modules/observations/DEFAULT_obs_kind_mod.F90' + output_obs_qty_mod_file = '../../../assimilation_code/modules/observations/obs_kind_mod.f90' + input_obs_def_mod_file = '../../../observations/forward_operators/DEFAULT_obs_def_mod.F90' + output_obs_def_mod_file = '../../../observations/forward_operators/obs_def_mod.f90' + obs_type_files = '../../../observations/forward_operators/obs_def_gps_mod.f90' + quantity_files = '../../../assimilation_code/modules/observations/default_quantities_mod.f90' + / diff --git a/developer_tests/kde_dist/work/quickbuild.sh b/developer_tests/kde_dist/work/quickbuild.sh index 1a31852f1b..94912c0029 100755 --- a/developer_tests/kde_dist/work/quickbuild.sh +++ b/developer_tests/kde_dist/work/quickbuild.sh @@ -14,7 +14,7 @@ MODEL="none" EXTRA="$DART"/models/template/threed_model_mod.f90 dev_test=1 LOCATION="threed_sphere" -TEST="test_kde_dist" +TEST="kde_dist" serial_programs=( test_kde_dist @@ -26,7 +26,10 @@ arguments "$@" # clean the directory \rm -f -- *.o *.mod Makefile .cppdefs -# build +# build and run preprocess before making any other DART executables +buildpreprocess + +# build DART buildit # clean up From 4b9f50320bdd6204c152c031b895bfce697998a3 Mon Sep 17 00:00:00 2001 From: Helen Kershaw Date: Wed, 7 Aug 2024 08:24:15 -0600 Subject: [PATCH 10/14] remove empty docs tests print pass/fail https://github.com/NCAR/DART/pull/706#discussion_r1691508023 --- developer_tests/kde_dist/test_kde_dist.rst | 1 - 1 file changed, 1 deletion(-) delete mode 100644 developer_tests/kde_dist/test_kde_dist.rst diff --git a/developer_tests/kde_dist/test_kde_dist.rst b/developer_tests/kde_dist/test_kde_dist.rst deleted file mode 100644 index 47401fe34a..0000000000 --- a/developer_tests/kde_dist/test_kde_dist.rst +++ /dev/null @@ -1 +0,0 @@ -documentation should be here From a8b05f534fe9b7b6d737faf6a8dd70187561f6a9 Mon Sep 17 00:00:00 2001 From: Ian Grooms Date: Thu, 8 Aug 2024 15:13:11 -0600 Subject: [PATCH 11/14] Remove write --- assimilation_code/modules/assimilation/kde_distribution_mod.f90 | 2 -- 1 file changed, 2 deletions(-) diff --git a/assimilation_code/modules/assimilation/kde_distribution_mod.f90 b/assimilation_code/modules/assimilation/kde_distribution_mod.f90 index 7afa415c7b..a007070a2e 100644 --- a/assimilation_code/modules/assimilation/kde_distribution_mod.f90 +++ b/assimilation_code/modules/assimilation/kde_distribution_mod.f90 @@ -866,12 +866,10 @@ subroutine obs_increment_kde(ens, ens_size, y, obs_param, bounded_below, & ! I draw one random (above) then add 1/Nb to each subsequent value u = unif + real(count_lower, r8) / real(ens_size, r8) count_lower = count_lower + 1 - write(*,*) 'lower boundary probability ', u elseif (bounded_above .and. (ens(i) .ge. upper_bound)) then ! As above, I draw one random then add 1/Nb to each subsequent value u = 1._r8 - (unif + real(count_upper, r8) / real(ens_size, r8)) count_upper = count_upper + 1 - write(*,*) 'upper boundary probability ', u elseif ((ens_size_interior .eq. 1) .or. (d_max .le. 0._r8)) then ! Can't use kde with only one ensemble member (or identical ensemble members), so assign a ! random probability From dbb491cde527774b95aa2232a8d4202e2c53b8c9 Mon Sep 17 00:00:00 2001 From: Ian Grooms Date: Fri, 16 Aug 2024 15:53:15 -0600 Subject: [PATCH 12/14] Obs increment kde Adds logic to handle the case where the observation is bounded, and all the observations _in the interior_ (i.e. not on the boundaries) are identical. This commit only fixes this case for the obs increment subroutine; the probit transform will be handled separately. --- .../assimilation/kde_distribution_mod.f90 | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/assimilation_code/modules/assimilation/kde_distribution_mod.f90 b/assimilation_code/modules/assimilation/kde_distribution_mod.f90 index a007070a2e..3dee62db75 100644 --- a/assimilation_code/modules/assimilation/kde_distribution_mod.f90 +++ b/assimilation_code/modules/assimilation/kde_distribution_mod.f90 @@ -780,7 +780,7 @@ subroutine obs_increment_kde(ens, ens_size, y, obs_param, bounded_below, & type(distribution_params_type) :: params_interior_prior type(distribution_params_type) :: params_interior_posterior integer :: ens_size_interior - integer :: i, count_lower, count_upper + integer :: i, count_lower, count_int, count_upper integer :: obs_dist_type if (bounded_below .or. bounded_above) then @@ -815,12 +815,14 @@ subroutine obs_increment_kde(ens, ens_size, y, obs_param, bounded_below, & if (ens_size_interior .gt. 1) then d(1:ens_size_interior) = abs( ens_interior(1:ens_size_interior) - ens_interior(1) ) d_max = maxval(d(1:ens_size_interior)) - call pack_kde_params(ens_size_interior, bounded_below, bounded_above, lower_bound, upper_bound, & - ens_interior, y, obs_param, obs_dist_types%uninformative, & - params_interior_prior) - call pack_kde_params(ens_size_interior, bounded_below, bounded_above, lower_bound, upper_bound, & - ens_interior, y, obs_param, obs_dist_type, & - params_interior_posterior) + if (d_max .gt. 0._r8) then + call pack_kde_params(ens_size_interior, bounded_below, bounded_above, lower_bound, upper_bound, & + ens_interior, y, obs_param, obs_dist_types%uninformative, & + params_interior_prior) + call pack_kde_params(ens_size_interior, bounded_below, bounded_above, lower_bound, upper_bound, & + ens_interior, y, obs_param, obs_dist_type, & + params_interior_posterior) + endif else d_max = 0._r8 endif @@ -856,6 +858,7 @@ subroutine obs_increment_kde(ens, ens_size, y, obs_param, bounded_below, & ! Update ensemble members -> get observation increments count_lower = 0 + count_int = 0 count_upper = 0 unif = random_uniform(inc_ran_seq) / real(ens_size, r8) do i=1,ens_size @@ -872,8 +875,9 @@ subroutine obs_increment_kde(ens, ens_size, y, obs_param, bounded_below, & count_upper = count_upper + 1 elseif ((ens_size_interior .eq. 1) .or. (d_max .le. 0._r8)) then ! Can't use kde with only one ensemble member (or identical ensemble members), so assign a - ! random probability - u = (p_int_prior * random_uniform(inc_ran_seq)) + p_lower_prior + ! random probability. As above, I draw one random then add 1/Nb to each subsequent value + u = unif + p_lower_prior + real(count_int, r8) / real(ens_size, r8) + count_int = count_int + 1 else ! Use the interior cdf obtained using kde. u = kde_cdf_params(ens(i), params_interior_prior) u = (p_int_prior * u) + p_lower_prior @@ -886,7 +890,8 @@ subroutine obs_increment_kde(ens, ens_size, y, obs_param, bounded_below, & obs_inc(i) = upper_bound - ens(i) elseif ((ens_size_interior .eq. 1) .or. (d_max .le. 0._r8)) then ! posterior value in the interior, but there's only one prior ensemble member/value - ! in the interior, so we just assign the posterior ensemble member equal to the prior one + ! in the interior (or prior ensemble members in the interior are identical), so we + ! just assign the posterior ensemble member equal to the prior one obs_inc(i) = ens_interior(1) - ens(i) else ! posterior value in the interior, can use kde ! Rescale u to be between 0 and 1 before inverting interior cdf From b310b35419d2880510bc1f6854b26b7450edbc0a Mon Sep 17 00:00:00 2001 From: Ian Grooms Date: Fri, 23 Aug 2024 07:45:21 -0600 Subject: [PATCH 13/14] Update filter name --- .../assimilation/algorithm_info_mod.f90 | 8 ++-- .../modules/assimilation/assim_tools_mod.f90 | 4 +- guide/qceff_probit.rst | 42 +++++++++---------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/assimilation_code/modules/assimilation/algorithm_info_mod.f90 b/assimilation_code/modules/assimilation/algorithm_info_mod.f90 index ad095a205b..41887d4cad 100644 --- a/assimilation_code/modules/assimilation/algorithm_info_mod.f90 +++ b/assimilation_code/modules/assimilation/algorithm_info_mod.f90 @@ -41,12 +41,12 @@ module algorithm_info_mod integer, parameter :: UNBOUNDED_RHF = 8 integer, parameter :: GAMMA_FILTER = 11 integer, parameter :: BOUNDED_NORMAL_RHF = 101 -integer, parameter :: KERNEL_QCEF = 102 +integer, parameter :: KDE_FILTER = 102 public :: obs_error_info, probit_dist_info, obs_inc_info, & init_algorithm_info_mod, end_algorithm_info_mod, & EAKF, ENKF, BOUNDED_NORMAL_RHF, UNBOUNDED_RHF, & - GAMMA_FILTER, KERNEL, OBS_PARTICLE, KERNEL_QCEF + GAMMA_FILTER, KERNEL, OBS_PARTICLE, KDE_FILTER ! type definitions for the QCF table type obs_error_info_type @@ -297,8 +297,8 @@ subroutine read_qceff_table(qceff_table_filename) qceff_table_data(row)%obs_inc_info%filter_kind = GAMMA_FILTER case ('BOUNDED_NORMAL_RHF') qceff_table_data(row)%obs_inc_info%filter_kind = BOUNDED_NORMAL_RHF - case ('KERNEL_QCEF') - qceff_table_data(row)%obs_inc_info%filter_kind = KERNEL_QCEF + case ('KDE_FILTER') + qceff_table_data(row)%obs_inc_info%filter_kind = KDE_FILTER case default write(errstring, *) 'Invalid filter kind: ', trim(filter_kind_string(row)) call error_handler(E_ERR, 'read_qceff_table:', errstring, source) diff --git a/assimilation_code/modules/assimilation/assim_tools_mod.f90 b/assimilation_code/modules/assimilation/assim_tools_mod.f90 index 55a79b867c..5787c0bebd 100644 --- a/assimilation_code/modules/assimilation/assim_tools_mod.f90 +++ b/assimilation_code/modules/assimilation/assim_tools_mod.f90 @@ -78,7 +78,7 @@ module assim_tools_mod use algorithm_info_mod, only : probit_dist_info, obs_inc_info, EAKF, ENKF, & BOUNDED_NORMAL_RHF, UNBOUNDED_RHF, GAMMA_FILTER, & - KERNEL, OBS_PARTICLE, KERNEL_QCEF + KERNEL, OBS_PARTICLE, KDE_FILTER use gamma_distribution_mod, only : gamma_cdf, inv_gamma_cdf, gamma_mn_var_to_shape_scale, & gamma_gamma_prod @@ -1030,7 +1030,7 @@ subroutine obs_increment(ens_in, ens_size, obs, obs_var, obs_kind, obs_inc, & !-------------------------------------------------------------------------- - else if(filter_kind == KERNEL_QCEF) then + else if(filter_kind == KDE_FILTER) then call obs_increment_kde(ens, ens_size, obs, obs_var, bounded_below, & bounded_above, lower_bound, upper_bound, obs_inc) else diff --git a/guide/qceff_probit.rst b/guide/qceff_probit.rst index e7bfa4ce72..c46cf47bd6 100644 --- a/guide/qceff_probit.rst +++ b/guide/qceff_probit.rst @@ -4,8 +4,8 @@ Quantile-Conserving Ensemble Filter Framework ============================================== The Quantile-Conserving Ensemble Filter Framework (QCEFF) tools are available in DART -as of version v11. -The DART development team (dart@ucar.edu) would be happy to hear about your experiences +as of version v11. +The DART development team (dart@ucar.edu) would be happy to hear about your experiences and is anxious to build scientific collaborations using these new capabilities. The QCEFF options are set using a :ref:`qceff table ` given as a namelist option to &algorithm_info_nml. @@ -21,7 +21,7 @@ The QCEFF options are set using a :ref:`qceff table ` given as a na QCEFF options -------------- -QCEFF options are per quantity. For a given quantity, you specify the following +QCEFF options are per quantity. For a given quantity, you specify the following options as columns of the qceff_table: * Observation error information @@ -29,19 +29,19 @@ options as columns of the qceff_table: Provides information about boundedness constraints that control the likelihood distribution associated with an observed variable when using perfect_model_obs to generate noisy observations. - - * bounded_below (default .false.) + + * bounded_below (default .false.) * bounded_above (default .false.) - * lower_bound + * lower_bound * upper_bound -* Probit distribution information +* Probit distribution information Used in the computation of the probit transform. The values needed are the bounds and the distribution type. These can be different for all three cases (inflation, state, and extended_state priors) - + * distribution (one of :ref:`Distributions`) * bounded_below (default .false.) * bounded_above (default .false.) @@ -65,17 +65,17 @@ Creating a qceff table ----------------------- The table has two headers, row 1 and 2. -The first row is the version number. The second row describes the :ref:`QCEFF options` corresponding to each column of the table. -These two headers must be present in your qceff table. -The :ref:`qcf trunc table` below shows the table headers, -and an example quantity QTY_TRACER_CONCENTRATION for the first 5 columns of the table. +The first row is the version number. The second row describes the :ref:`QCEFF options` corresponding to each column of the table. +These two headers must be present in your qceff table. +The :ref:`qcf trunc table` below shows the table headers, +and an example quantity QTY_TRACER_CONCENTRATION for the first 5 columns of the table. There is a complete table with all 25 columns in `Google Sheets `_. You can copy and edit this table as needed. To add a quantity, add a row to the table. -For any quantity not listed in the table, the :ref:`Default values` values will be used for all 25 options. +For any quantity not listed in the table, the :ref:`Default values` values will be used for all 25 options. You only have to add rows for quantities that use non-default values for any of the input options. Ensure that there are no empty rows in between the quantities listed in the spreadsheet. -Save your spreadsheet as a .csv file. +Save your spreadsheet as a .csv file. To run filter or perfect_model_obs, put the .csv file in the directory where you are running. Edit input.nml to set the algorithm_info_nml option qceff_table_filename, for example: @@ -89,14 +89,14 @@ Edit input.nml to set the algorithm_info_nml option qceff_table_filename, for ex .. _qcf trunc table: -.. list-table:: truncated table +.. list-table:: truncated table :header-rows: 2 * - QCF table version: 1 - - - - - - - - + - + - + - + - * - QTY - bounded_below - bounded_above @@ -119,7 +119,7 @@ Available filter kinds * UNBOUNDED_RHF * GAMMA_FILTER * BOUNDED_NORMAL_RHF - * KERNEL_QCEF + * KDE_FILTER .. _Distributions: @@ -128,7 +128,7 @@ Available distributions * NORMAL_DISTRIBUTION (default) * BOUNDED_NORMAL_RH_DISTRIBUTION - * GAMMA_DISTRIBUTION + * GAMMA_DISTRIBUTION * BETA_DISTRIBUTION * LOG_NORMAL_DISTRIBUTION * UNIFORM_DISTRIBUTION From 4a6836972ae4c822ecea621b5e44c73d37f3adae Mon Sep 17 00:00:00 2001 From: Helen Kershaw Date: Fri, 23 Aug 2024 13:07:03 -0400 Subject: [PATCH 14/14] bump version and changlog for release --- CHANGELOG.rst | 8 ++++++++ conf.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index db39b19100..b0ed5bc384 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -22,6 +22,14 @@ individual files. The changes are now listed with the most recent at the top. +**August 26 2024 :: KQCEF. Tag 11.7.0** + +- Adds a Quantile-Conserving Ensemble Filter Based on Kernel-Density Estimation to DART. +- New distribution module kde_distribution_mod. +- New module rootfinding_mod that provides a different implementation of inv_cdf. + +*Contributed by Ian Grooms* + **August 15 2024 :: WRF fwd operator bug fixes. Tag v11.6.1** WRF-DART bug-fixes: diff --git a/conf.py b/conf.py index fdfe8997c0..c0c136b518 100644 --- a/conf.py +++ b/conf.py @@ -21,7 +21,7 @@ author = 'Data Assimilation Research Section' # The full version, including alpha/beta/rc tags -release = '11.6.1' +release = '11.7.0' root_doc = 'index' # -- General configuration ---------------------------------------------------