diff --git a/.gitignore b/.gitignore index 3fb7987..9182fb8 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ docs/build docs/src/examples/generated /docs/Manifest.toml +Manifest.toml diff --git a/Project.toml b/Project.toml index a539814..3672bd5 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "LineSearches" uuid = "d3d80556-e9d4-5f37-9878-2ab0fcc64255" -version = "7.2.0" +version = "7.3.0" [deps] LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -13,6 +13,8 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" DoubleFloats = "1" NLSolversBase = "7" NaNMath = "1" +Optim = "1" +OptimTestProblems = "2" Parameters = "0.10, 0.11, 0.12" julia = "1.6" diff --git a/docs/src/index.md b/docs/src/index.md index baf4397..a11ca9e 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -46,6 +46,14 @@ using LineSearches ``` to load the package. +## Debugging + +If you suspect a method of suboptimal performance or find that your code errors, +create a [`LineSearchCache`](@ref) to record intermediate values for later +inspection and analysis. If you're using this via Optim.jl, configure it inside +the method, e.g., `Newton(linesearch=LineSearches.MoreThuente(; cache))`. The +value stored in the cache will reflect the final iteration of line search during +optimization. ## References diff --git a/docs/src/reference/linesearch.md b/docs/src/reference/linesearch.md index 96f3c9e..f0b228e 100644 --- a/docs/src/reference/linesearch.md +++ b/docs/src/reference/linesearch.md @@ -11,3 +11,9 @@ MoreThuente Static StrongWolfe ``` + +## Debugging + +```@docs +LineSearchCache +``` diff --git a/src/LineSearches.jl b/src/LineSearches.jl index 7e462d1..4b3005a 100644 --- a/src/LineSearches.jl +++ b/src/LineSearches.jl @@ -1,5 +1,3 @@ -__precompile__() - module LineSearches using Printf @@ -9,13 +7,14 @@ using Parameters, NaNMath import NLSolversBase import NLSolversBase: AbstractObjective -export LineSearchException +export LineSearchException, LineSearchCache -export BackTracking, HagerZhang, Static, MoreThuente, StrongWolfe +export AbstractLineSearch, BackTracking, HagerZhang, Static, MoreThuente, StrongWolfe export InitialHagerZhang, InitialStatic, InitialPrevious, InitialQuadratic, InitialConstantChange + function make_ϕ(df, x_new, x, s) function ϕ(α) # Move a distance of alpha in the direction of s @@ -91,6 +90,26 @@ end include("types.jl") +# The following don't extend `empty!` and `push!` because we want implementations for `nothing` +# and that would be piracy +emptycache!(cache::LineSearchCache) = begin + empty!(cache.alphas) + empty!(cache.values) + empty!(cache.slopes) +end +emptycache!(::Nothing) = nothing +pushcache!(cache::LineSearchCache, α, val, slope) = begin + push!(cache.alphas, α) + push!(cache.values, val) + push!(cache.slopes, slope) +end +pushcache!(cache::LineSearchCache, α, val) = begin + push!(cache.alphas, α) + push!(cache.values, val) +end +pushcache!(::Nothing, α, val, slope) = nothing +pushcache!(::Nothing, α, val) = nothing + # Line Search Methods include("backtracking.jl") include("strongwolfe.jl") diff --git a/src/backtracking.jl b/src/backtracking.jl index 6fca2b6..1eb98db 100644 --- a/src/backtracking.jl +++ b/src/backtracking.jl @@ -8,13 +8,14 @@ there exists a factor ρ = ρ(c₁) such that α' ≦ ρ α. This is a modification of the algorithm described in Nocedal Wright (2nd ed), Sec. 3.5. """ -@with_kw struct BackTracking{TF, TI} +@with_kw struct BackTracking{TF, TI} <: AbstractLineSearch c_1::TF = 1e-4 ρ_hi::TF = 0.5 ρ_lo::TF = 0.1 iterations::TI = 1_000 order::TI = 3 maxstep::TF = Inf + cache::Union{Nothing,LineSearchCache{TF}} = nothing end BackTracking{TF}(args...; kwargs...) where TF = BackTracking{TF,Int}(args...; kwargs...) @@ -37,7 +38,9 @@ end # TODO: Should we deprecate the interface that only uses the ϕ argument? function (ls::BackTracking)(ϕ, αinitial::Tα, ϕ_0, dϕ_0) where Tα - @unpack c_1, ρ_hi, ρ_lo, iterations, order = ls + @unpack c_1, ρ_hi, ρ_lo, iterations, order, cache = ls + emptycache!(cache) + pushcache!(cache, 0, ϕ_0, dϕ_0) # backtracking doesn't use the slope except here iterfinitemax = -log2(eps(real(Tα))) @@ -68,6 +71,8 @@ function (ls::BackTracking)(ϕ, αinitial::Tα, ϕ_0, dϕ_0) where Tα ϕx_1 = ϕ(α_2) end + pushcache!(cache, αinitial, ϕx_1) + # TODO: check if value is finite (maybe iterfinite > iterfinitemax) # Backtrack until we satisfy sufficient decrease condition while ϕx_1 > ϕ_0 + c_1 * α_2 * dϕ_0 @@ -112,6 +117,7 @@ function (ls::BackTracking)(ϕ, αinitial::Tα, ϕ_0, dϕ_0) where Tα # Evaluate f(x) at proposed position ϕx_0, ϕx_1 = ϕx_1, ϕ(α_2) + pushcache!(cache, α_2, ϕx_1) end return α_2, ϕx_1 diff --git a/src/hagerzhang.jl b/src/hagerzhang.jl index cf17dbd..9d1be56 100644 --- a/src/hagerzhang.jl +++ b/src/hagerzhang.jl @@ -80,7 +80,7 @@ Conjugate gradient line search implementation from: conjugate gradient method with guaranteed descent. ACM Transactions on Mathematical Software 32: 113–137. """ -@with_kw struct HagerZhang{T, Tm} +@with_kw struct HagerZhang{T, Tm} <: AbstractLineSearch delta::T = DEFAULTDELTA # c_1 Wolfe sufficient decrease condition sigma::T = DEFAULTSIGMA # c_2 Wolfe curvature condition (Recommend 0.1 for GradientDescent) alphamax::T = Inf @@ -91,6 +91,7 @@ Conjugate gradient line search implementation from: psi3::T = 0.1 display::Int = 0 mayterminate::Tm = Ref{Bool}(false) + cache::Union{Nothing,LineSearchCache{T}} = nothing end HagerZhang{T}(args...; kwargs...) where T = HagerZhang{T, Base.RefValue{Bool}}(args...; kwargs...) @@ -109,9 +110,11 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, phi_0::Real, dphi_0::Real) where T # Should c and phi_0 be same type? @unpack delta, sigma, alphamax, rho, epsilon, gamma, - linesearchmax, psi3, display, mayterminate = ls + linesearchmax, psi3, display, mayterminate, cache = ls + emptycache!(cache) zeroT = convert(T, 0) + pushcache!(cache, zeroT, phi_0, dphi_0) if !(isfinite(phi_0) && isfinite(dphi_0)) throw(LineSearchException("Value and slope at step length = 0 must be finite.", T(0))) end @@ -124,9 +127,13 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, # Prevent values of x_new = x+αs that are likely to make # ϕ(x_new) infinite iterfinitemax::Int = ceil(Int, -log2(eps(T))) - alphas = [zeroT] # for bisection - values = [phi_0] - slopes = [dphi_0] + if cache !== nothing + @unpack alphas, values, slopes = cache + else + alphas = [zeroT] # for bisection + values = [phi_0] + slopes = [dphi_0] + end if display & LINESEARCH > 0 println("New linesearch") end @@ -203,10 +210,10 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, else # We'll still going downhill, expand the interval and try again. # Reaching this branch means that dphi_c < 0 and phi_c <= phi_0 + ϵ_k - # So cold = c has a lower objective than phi_0 up to epsilon. + # So cold = c has a lower objective than phi_0 up to epsilon. # This makes it a viable step to return if bracketing fails. - # Bracketing can fail if no cold < c <= alphamax can be found with finite phi_c and dphi_c. + # Bracketing can fail if no cold < c <= alphamax can be found with finite phi_c and dphi_c. # Going back to the loop with c = cold will only result in infinite cycling. # So returning (cold, phi_cold) and exiting the line search is the best move. cold = c diff --git a/src/morethuente.jl b/src/morethuente.jl index 27c4ec3..79576d9 100644 --- a/src/morethuente.jl +++ b/src/morethuente.jl @@ -138,13 +138,14 @@ The line search implementation from: Line search algorithms with guaranteed sufficient decrease. ACM Transactions on Mathematical Software (TOMS) 20.3 (1994): 286-307. """ -@with_kw struct MoreThuente{T} +@with_kw struct MoreThuente{T} <: AbstractLineSearch f_tol::T = 1e-4 # c_1 Wolfe sufficient decrease condition gtol::T = 0.9 # c_2 Wolfe curvature condition (Recommend 0.1 for GradientDescent) x_tol::T = 1e-8 alphamin::T = 1e-16 alphamax::T = 65536.0 maxfev::Int = 100 + cache::Union{Nothing,LineSearchCache{T}} = nothing end function (ls::MoreThuente)(df::AbstractObjective, x::AbstractArray{T}, @@ -161,13 +162,15 @@ function (ls::MoreThuente)(ϕdϕ, alpha::T, ϕ_0, dϕ_0) where T - @unpack f_tol, gtol, x_tol, alphamin, alphamax, maxfev = ls + @unpack f_tol, gtol, x_tol, alphamin, alphamax, maxfev, cache = ls + emptycache!(cache) iterfinitemax = -log2(eps(T)) info = 0 info_cstep = 1 # Info from step zeroT = convert(T, 0) + pushcache!(cache, zeroT, ϕ_0, dϕ_0) # # Check the input parameters for errors. @@ -236,7 +239,9 @@ function (ls::MoreThuente)(ϕdϕ, # Make stmax = (3/2)*alpha < 2alpha in the first iteration below stx = (convert(T, 7)/8)*alpha end + pushcache!(cache, alpha, f, dg) # END: Ensure that the initial step provides finite function values + # TODO: check if value is finite (maybe iterfinite > iterfinitemax) while true # @@ -282,6 +287,7 @@ function (ls::MoreThuente)(ϕdϕ, # and compute the directional derivative. # f, dg = ϕdϕ(alpha) + pushcache!(cache, alpha, f, dg) nfev += 1 # This includes calls to f() and g!() if isapprox(dg, 0, atol=eps(T)) # Should add atol value to MoreThuente diff --git a/src/static.jl b/src/static.jl index 3e68a27..62b244a 100644 --- a/src/static.jl +++ b/src/static.jl @@ -3,7 +3,7 @@ `Static` is intended for methods with well-scaled updates; i.e. Newton, on well-behaved problems. """ -struct Static end +struct Static <: AbstractLineSearch end function (ls::Static)(df::AbstractObjective, x, s, α, x_new = similar(x), ϕ_0 = nothing, dϕ_0 = nothing) ϕ = make_ϕ(df, x_new, x, s) diff --git a/src/strongwolfe.jl b/src/strongwolfe.jl index 6fb0792..335f248 100644 --- a/src/strongwolfe.jl +++ b/src/strongwolfe.jl @@ -14,10 +14,11 @@ use `MoreThuente`, `HagerZhang` or `BackTracking`. * `c_2 = 0.9` : second (strong) Wolfe condition * `ρ = 2.0` : bracket growth """ -@with_kw struct StrongWolfe{T} +@with_kw struct StrongWolfe{T} <: AbstractLineSearch c_1::T = 1e-4 c_2::T = 0.9 ρ::T = 2.0 + cache::Union{Nothing,LineSearchCache{T}} = nothing end """ @@ -49,9 +50,11 @@ Both `alpha` and `ϕ(alpha)` are returned. """ function (ls::StrongWolfe)(ϕ, dϕ, ϕdϕ, alpha0::T, ϕ_0, dϕ_0) where T<:Real - @unpack c_1, c_2, ρ = ls + @unpack c_1, c_2, ρ, cache = ls + emptycache!(cache) zeroT = convert(T, 0) + pushcache!(cache, zeroT, ϕ_0, dϕ_0) # Step-sizes a_0 = zeroT @@ -71,17 +74,21 @@ function (ls::StrongWolfe)(ϕ, dϕ, ϕdϕ, while a_i < a_max ϕ_a_i = ϕ(a_i) + pushcache!(cache, a_i, ϕ_a_i) # Test Wolfe conditions if (ϕ_a_i > ϕ_0 + c_1 * a_i * dϕ_0) || (ϕ_a_i >= ϕ_a_iminus1 && i > 1) a_star = zoom(a_iminus1, a_i, dϕ_0, ϕ_0, - ϕ, dϕ, ϕdϕ) + ϕ, dϕ, ϕdϕ, cache) return a_star, ϕ(a_star) end dϕ_a_i = dϕ(a_i) + if cache !== nothing + push!(cache.slopes, dϕ_a_i) + end # Check condition 2 if abs(dϕ_a_i) <= -c_2 * dϕ_0 @@ -91,7 +98,7 @@ function (ls::StrongWolfe)(ϕ, dϕ, ϕdϕ, # Check condition 3 if dϕ_a_i >= zeroT # FIXME untested! a_star = zoom(a_i, a_iminus1, - dϕ_0, ϕ_0, ϕ, dϕ, ϕdϕ) + dϕ_0, ϕ_0, ϕ, dϕ, ϕdϕ, cache) return a_star, ϕ(a_star) end @@ -117,6 +124,7 @@ function zoom(a_lo::T, ϕ, dϕ, ϕdϕ, + cache, c_1::Real = convert(T, 1)/10^4, c_2::Real = convert(T, 9)/10) where T @@ -133,8 +141,10 @@ function zoom(a_lo::T, iteration += 1 ϕ_a_lo, ϕprime_a_lo = ϕdϕ(a_lo) + pushcache!(cache, a_lo, ϕ_a_lo, ϕprime_a_lo) ϕ_a_hi, ϕprime_a_hi = ϕdϕ(a_hi) + pushcache!(cache, a_hi, ϕ_a_hi, ϕprime_a_hi) # Interpolate a_j if a_lo < a_hi @@ -150,6 +160,7 @@ function zoom(a_lo::T, # Evaluate ϕ(a_j) ϕ_a_j = ϕ(a_j) + pushcache!(cache, a_j, ϕ_a_j) # Check Armijo if (ϕ_a_j > ϕ_0 + c_1 * a_j * dϕ_0) || @@ -158,6 +169,9 @@ function zoom(a_lo::T, else # Evaluate ϕprime(a_j) ϕprime_a_j = dϕ(a_j) + if cache !== nothing + push!(cache.slopes, ϕprime_a_j) + end if abs(ϕprime_a_j) <= -c_2 * dϕ_0 return a_j diff --git a/src/types.jl b/src/types.jl index dd84f40..98e041d 100644 --- a/src/types.jl +++ b/src/types.jl @@ -2,3 +2,39 @@ mutable struct LineSearchException{T<:Real} <: Exception message::AbstractString alpha::T end + +abstract type AbstractLineSearch end + +# For debugging +struct LineSearchCache{T} + alphas::Vector{T} + values::Vector{T} + slopes::Vector{T} +end +""" + cache = LineSearchCache{T}() + +Initialize an empty cache for storing intermediate results during line search. +The `α`, `ϕ(α)`, and possibly `dϕ(α)` values computed during line search are +available in `cache.alphas`, `cache.values`, and `cache.slopes`, respectively. + +# Example + +```jldoctest +julia> ϕ(x) = (x - π)^4; dϕ(x) = 4*(x-π)^3; + +julia> cache = LineSearchCache{Float64}(); + +julia> ls = BackTracking(; cache); + +julia> ls(ϕ, 10.0, ϕ(0), dϕ(0)) +(1.8481462933284658, 2.7989406670901373) + +julia> cache +LineSearchCache{Float64}([0.0, 10.0, 1.8481462933284658], [97.40909103400242, 2212.550050116452, 2.7989406670901373], [-124.02510672119926]) +``` + +Because `BackTracking` doesn't use derivatives except at `α=0`, only the initial slope was stored in the cache. +Other methods may store all three. +""" +LineSearchCache{T}() where T = LineSearchCache{T}(T[], T[], T[]) diff --git a/test/REQUIRE b/test/REQUIRE deleted file mode 100644 index 8910520..0000000 --- a/test/REQUIRE +++ /dev/null @@ -1,3 +0,0 @@ -OptimTestProblems -DoubleFloats 0.1.10 -Optim 0.16 diff --git a/test/TestCases.jl b/test/TestCases.jl new file mode 100644 index 0000000..142daee --- /dev/null +++ b/test/TestCases.jl @@ -0,0 +1,75 @@ +module TestCases + +# A module for reproducing simplified versions of objective functions that cause trouble for line search methods + +using NLSolversBase + +export LineSearchTestCase + +struct LineSearchTestCase + alphas::Vector{Float64} + values::Vector{Float64} + slopes::Vector{Float64} + + function LineSearchTestCase(alphas, values, slopes) + Base.require_one_based_indexing(alphas, values, slopes) + n = length(alphas) + if n != length(values) || n != length(slopes) + throw(ArgumentError("Lengths of alphas, values, and slopes must match")) + end + # Ensure ordered & unique + perm = sortperm(alphas) + delidx = Int[] + for i in firstindex(perm)+1:lastindex(perm) + if alphas[perm[i]] == alphas[perm[i-1]] + push!(delidx, i) + end + end + deleteat!(perm, delidx) + alphas, values, slopes = alphas[perm], values[perm], slopes[perm] + # For interpolation, add a dummy value at the end + push!(alphas, alphas[end]+1) + push!(values, values[end]) + push!(slopes, 0.0) + return new(alphas, values, slopes) + end +end + +Base.minimum(tc::LineSearchTestCase) = minimum(tc.values) + +function NLSolversBase.OnceDifferentiable(tc::LineSearchTestCase) + function fdf(x) + x < zero(x) && throw(ArgumentError("x must be nonnegative, got $x")) + i = findfirst(>(x), tc.alphas) + (i === nothing || x > tc.alphas[end-1]) && throw(ArgumentError("x must be <=$(tc.alphas[end-1]), got $x")) + i -= 1 + xk = tc.alphas[i] + xkp1 = tc.alphas[i + 1] + dx = xkp1 - xk + t = (x - xk) / dx + h00t = 2t^3 - 3t^2 + 1 + h10t = t * (1 - t)^2 + h01t = t^2 * (3 - 2t) + h11t = t^2 * (t - 1) + val = + h00t * tc.values[i] + + h10t * dx * tc.slopes[i] + + h01t * tc.values[i + 1] + + h11t * dx * tc.slopes[i + 1] + h00tp = 6t^2 - 6t + h10tp = 3t^2 - 4t + 1 + h01tp = -6t^2 + 6 * t + h11tp = 3t^2 - 2t + slope = + ( + h00tp * tc.values[i] + + h10tp * dx * tc.slopes[i] + + h01tp * tc.values[i + 1] + + h11tp * dx * tc.slopes[i + 1] + ) / dx + return val, slope + end + return OnceDifferentiable(α -> fdf(α)[1], α -> fdf(α)[2], fdf, [0.0]) +end + +end \ No newline at end of file diff --git a/test/captured.jl b/test/captured.jl new file mode 100644 index 0000000..05b81fa --- /dev/null +++ b/test/captured.jl @@ -0,0 +1,35 @@ +using LineSearches +using NLSolversBase +using Test + +if !isdefined(@__MODULE__, :LineSearchTestCase) + include("TestCases.jl") + using .TestCases +end + +@testset "Capturing data" begin + cache = LineSearchCache{Float64}() + lsalgs = (HagerZhang(; cache), StrongWolfe(; cache), MoreThuente(; cache), + BackTracking(; cache), BackTracking(; order=2, cache) ) + ϕ(x) = (x - π)^4 + dϕ(x) = 4*(x-π)^3 + ϕdϕ(x) = (ϕ(x), dϕ(x)) + for ls in lsalgs + α, val = ls(ϕ, dϕ, ϕdϕ, 10.0, ϕ(0), dϕ(0)) + @test α < 10 + @test length(cache.alphas) == length(cache.values) && length(cache.alphas) > 1 + end +end + +# From PR#174 +@testset "PR#174" begin + tc = LineSearchTestCase( + [0.0, 1.0, 5.0, 3.541670844449739], + [3003.592409634743, 2962.0378569864743, 2891.4462095232184, 3000.9760725116876], + [-22332.321416890798, -20423.214551925797, 11718.185026267562, -22286.821227217057] + ) + fdf = OnceDifferentiable(tc) + hz = HagerZhang() + α, val = hz(fdf.f, fdf.fdf, 1.0, fdf.fdf(0.0)...) + @test_broken val <= minimum(tc) +end diff --git a/test/runtests.jl b/test/runtests.jl index 656c352..4910d53 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -14,7 +14,8 @@ my_tests = [ "initial.jl", "alphacalc.jl", "arbitrary_precision.jl", - "examples.jl" + "examples.jl", + "captured.jl" ] mutable struct StateDummy