From aa203ba277567738224ef6591efcde3102b10726 Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Tue, 20 Feb 2024 05:19:50 +0000 Subject: [PATCH 01/28] docs: update callback and qualify Adam from OptimizationOptimisers --- docs/src/examples/3rd.md | 4 +--- docs/src/examples/linear_parabolic.md | 4 ++-- docs/src/examples/nonlinear_elliptic.md | 6 +++--- docs/src/examples/nonlinear_hyperbolic.md | 4 ++-- docs/src/examples/wave.md | 6 ++---- docs/src/tutorials/constraints.md | 6 +++--- docs/src/tutorials/dae.md | 4 ++-- docs/src/tutorials/derivative_neural_network.md | 8 ++++---- docs/src/tutorials/low_level.md | 4 ++-- docs/src/tutorials/systems.md | 14 ++++---------- 10 files changed, 25 insertions(+), 35 deletions(-) diff --git a/docs/src/examples/3rd.md b/docs/src/examples/3rd.md index 59c68644e2..829cd3f7e2 100644 --- a/docs/src/examples/3rd.md +++ b/docs/src/examples/3rd.md @@ -47,7 +47,7 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, Adam(0.01); callback = callback, maxiters = 2000) +res = Optimization.solve(prob, OptimizationOptimisers.Adam(0.01); callback = callback, maxiters = 2000) phi = discretization.phi ``` @@ -67,5 +67,3 @@ x_plot = collect(xs) plot(x_plot, u_real, title = "real") plot!(x_plot, u_predict, title = "predict") ``` - -![hodeplot](https://user-images.githubusercontent.com/12683885/90276340-69bc3e00-de6c-11ea-89a7-7d291123a38b.png) diff --git a/docs/src/examples/linear_parabolic.md b/docs/src/examples/linear_parabolic.md index 0ae1432763..b83970f9b3 100644 --- a/docs/src/examples/linear_parabolic.md +++ b/docs/src/examples/linear_parabolic.md @@ -85,8 +85,8 @@ global iteration = 0 callback = function (p, l) if iteration % 10 == 0 println("loss: ", l) - println("pde_losses: ", map(l_ -> l_(p), pde_inner_loss_functions)) - println("bcs_losses: ", map(l_ -> l_(p), bcs_inner_loss_functions)) + println("pde_losses: ", map(l_ -> l_(p.u), pde_inner_loss_functions)) + println("bcs_losses: ", map(l_ -> l_(p.u), bcs_inner_loss_functions)) end global iteration += 1 return false diff --git a/docs/src/examples/nonlinear_elliptic.md b/docs/src/examples/nonlinear_elliptic.md index db8d6e228f..d1bfa1f015 100644 --- a/docs/src/examples/nonlinear_elliptic.md +++ b/docs/src/examples/nonlinear_elliptic.md @@ -99,9 +99,9 @@ global iteration = 0 callback = function (p, l) if iteration % 10 == 0 println("loss: ", l) - println("pde_losses: ", map(l_ -> l_(p), pde_inner_loss_functions)) - println("bcs_losses: ", map(l_ -> l_(p), bcs_inner_loss_functions)) - println("der_losses: ", map(l_ -> l_(p), aprox_derivative_loss_functions)) + println("pde_losses: ", map(l_ -> l_(p.u), pde_inner_loss_functions)) + println("bcs_losses: ", map(l_ -> l_(p.u), bcs_inner_loss_functions)) + println("der_losses: ", map(l_ -> l_(p.u), aprox_derivative_loss_functions)) end global iteration += 1 return false diff --git a/docs/src/examples/nonlinear_hyperbolic.md b/docs/src/examples/nonlinear_hyperbolic.md index 3f6896599d..6ac3401dbb 100644 --- a/docs/src/examples/nonlinear_hyperbolic.md +++ b/docs/src/examples/nonlinear_hyperbolic.md @@ -94,8 +94,8 @@ bcs_inner_loss_functions = sym_prob.loss_functions.bc_loss_functions callback = function (p, l) println("loss: ", l) - println("pde_losses: ", map(l_ -> l_(p), pde_inner_loss_functions)) - println("bcs_losses: ", map(l_ -> l_(p), bcs_inner_loss_functions)) + println("pde_losses: ", map(l_ -> l_(p.u), pde_inner_loss_functions)) + println("bcs_losses: ", map(l_ -> l_(p.u), bcs_inner_loss_functions)) return false end diff --git 
a/docs/src/examples/wave.md b/docs/src/examples/wave.md index 6d99f0fa56..d65a7b7225 100644 --- a/docs/src/examples/wave.md +++ b/docs/src/examples/wave.md @@ -81,8 +81,6 @@ p3 = plot(ts, xs, diff_u, linetype = :contourf, title = "error"); plot(p1, p2, p3) ``` -![waveplot](https://user-images.githubusercontent.com/12683885/101984293-74a7a380-3c91-11eb-8e78-72a50d88e3f8.png) - ## 1D Damped Wave Equation with Dirichlet boundary conditions Now let's solve the 1-dimensional wave equation with damping. @@ -159,8 +157,8 @@ bcs_inner_loss_functions = sym_prob.loss_functions.bc_loss_functions callback = function (p, l) println("loss: ", l) - println("pde_losses: ", map(l_ -> l_(p), pde_inner_loss_functions)) - println("bcs_losses: ", map(l_ -> l_(p), bcs_inner_loss_functions)) + println("pde_losses: ", map(l_ -> l_(p.u), pde_inner_loss_functions)) + println("bcs_losses: ", map(l_ -> l_(p.u), bcs_inner_loss_functions)) return false end diff --git a/docs/src/tutorials/constraints.md b/docs/src/tutorials/constraints.md index 68aa391af7..4b790e24f0 100644 --- a/docs/src/tutorials/constraints.md +++ b/docs/src/tutorials/constraints.md @@ -78,9 +78,9 @@ aprox_derivative_loss_functions = sym_prob.loss_functions.bc_loss_functions cb_ = function (p, l) println("loss: ", l) - println("pde_losses: ", map(l_ -> l_(p), pde_inner_loss_functions)) - println("bcs_losses: ", map(l_ -> l_(p), bcs_inner_loss_functions)) - println("additional_loss: ", norm_loss_function(phi, p, nothing)) + println("pde_losses: ", map(l_ -> l_(p.u), pde_inner_loss_functions)) + println("bcs_losses: ", map(l_ -> l_(p.u), bcs_inner_loss_functions)) + println("additional_loss: ", norm_loss_function(phi, p.u, nothing)) return false end diff --git a/docs/src/tutorials/dae.md b/docs/src/tutorials/dae.md index 24a7880a6e..389df7ec2f 100644 --- a/docs/src/tutorials/dae.md +++ b/docs/src/tutorials/dae.md @@ -16,8 +16,8 @@ Let's solve a simple DAE system: ```@example dae using NeuralPDE using Random, Flux -using OrdinaryDiffEq, Optimisers, Statistics -import Lux, OptimizationOptimisers, OptimizationOptimJL +using OrdinaryDiffEq, Statistics +import Lux, OptimizationOptimisers example = (du, u, p, t) -> [cos(2pi * t) - du[1], u[2] + cos(2pi * t) - du[2]] u₀ = [1.0, -1.0] diff --git a/docs/src/tutorials/derivative_neural_network.md b/docs/src/tutorials/derivative_neural_network.md index 3432d58688..51792bdc6b 100644 --- a/docs/src/tutorials/derivative_neural_network.md +++ b/docs/src/tutorials/derivative_neural_network.md @@ -108,13 +108,13 @@ aprox_derivative_loss_functions = sym_prob.loss_functions.bc_loss_functions[9:en callback = function (p, l) println("loss: ", l) - println("pde_losses: ", map(l_ -> l_(p), pde_inner_loss_functions)) - println("bcs_losses: ", map(l_ -> l_(p), bcs_inner_loss_functions)) - println("der_losses: ", map(l_ -> l_(p), aprox_derivative_loss_functions)) + println("pde_losses: ", map(l_ -> l_(p.u), pde_inner_loss_functions)) + println("bcs_losses: ", map(l_ -> l_(p.u), bcs_inner_loss_functions)) + println("der_losses: ", map(l_ -> l_(p.u), aprox_derivative_loss_functions)) return false end -res = Optimization.solve(prob, Adam(0.01); callback = callback, maxiters = 2000) +res = Optimization.solve(prob, OptimizationOptimisers.Adam(0.01); callback = callback, maxiters = 2000) prob = remake(prob, u0 = res.u) res = Optimization.solve(prob, BFGS(); callback = callback, maxiters = 10000) diff --git a/docs/src/tutorials/low_level.md b/docs/src/tutorials/low_level.md index aad1347971..6935da90d4 100644 --- 
a/docs/src/tutorials/low_level.md +++ b/docs/src/tutorials/low_level.md @@ -53,8 +53,8 @@ bc_loss_functions = sym_prob.loss_functions.bc_loss_functions callback = function (p, l) println("loss: ", l) - println("pde_losses: ", map(l_ -> l_(p), pde_loss_functions)) - println("bcs_losses: ", map(l_ -> l_(p), bc_loss_functions)) + println("pde_losses: ", map(l_ -> l_(p.u), pde_loss_functions)) + println("bcs_losses: ", map(l_ -> l_(p.u), bc_loss_functions)) return false end diff --git a/docs/src/tutorials/systems.md b/docs/src/tutorials/systems.md index 9e20bc7f73..a2422a1b2a 100644 --- a/docs/src/tutorials/systems.md +++ b/docs/src/tutorials/systems.md @@ -79,8 +79,8 @@ bcs_inner_loss_functions = sym_prob.loss_functions.bc_loss_functions callback = function (p, l) println("loss: ", l) - println("pde_losses: ", map(l_ -> l_(p), pde_inner_loss_functions)) - println("bcs_losses: ", map(l_ -> l_(p), bcs_inner_loss_functions)) + println("pde_losses: ", map(l_ -> l_(p.u), pde_inner_loss_functions)) + println("bcs_losses: ", map(l_ -> l_(p.u), bcs_inner_loss_functions)) return false end @@ -138,8 +138,8 @@ bc_loss_functions = sym_prob.loss_functions.bc_loss_functions callback = function (p, l) println("loss: ", l) - println("pde_losses: ", map(l_ -> l_(p), pde_loss_functions)) - println("bcs_losses: ", map(l_ -> l_(p), bc_loss_functions)) + println("pde_losses: ", map(l_ -> l_(p.u), pde_loss_functions)) + println("bcs_losses: ", map(l_ -> l_(p.u), bc_loss_functions)) return false end @@ -183,12 +183,6 @@ for i in 1:3 end ``` -![sol_uq1](https://user-images.githubusercontent.com/12683885/122979254-03634e80-d3a0-11eb-985b-d3bae2dddfde.png) - -![sol_uq2](https://user-images.githubusercontent.com/12683885/122979278-09592f80-d3a0-11eb-8fee-de3652f138d8.png) - -![sol_uq3](https://user-images.githubusercontent.com/12683885/122979288-0e1de380-d3a0-11eb-9005-bfb501959b83.png) - Notice here that the solution is represented in the `OptimizationSolution` with `u` as the parameters for the trained neural network. 
But, for the case where the neural network is from Lux.jl, it's given as a `ComponentArray` where `res.u.depvar.x` corresponds to the result From 5ff91ad5c841e5e8429ea6642f2d36070762b21c Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Tue, 20 Feb 2024 05:20:21 +0000 Subject: [PATCH 02/28] build(docs): add LineSearches in Project.toml --- docs/Project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/Project.toml b/docs/Project.toml index 2c81ad0d60..c394a32569 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -7,6 +7,7 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" DomainSets = "5b8099bc-c8ec-5219-889f-1d9e522a28bf" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Integrals = "de52edbc-65ea-441a-8357-d3a637375a31" +LineSearches = "d3d80556-e9d4-5f37-9878-2ab0fcc64255" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Lux = "b2108857-7c20-44ae-9111-449ecde12c47" ModelingToolkit = "961ee093-0014-501f-94e3-6117800e7a78" @@ -31,6 +32,7 @@ Documenter = "1" DomainSets = "0.6, 0.7" Flux = "0.14" Integrals = "4" +LineSearches = "7.2" Lux = "0.5" ModelingToolkit = "8.33" MonteCarloMeasurements = "1" From 285c8675f32ac3c9ecadaaf9ec6c1b424e23bc88 Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Wed, 21 Feb 2024 09:30:04 +0000 Subject: [PATCH 03/28] docs: use GridTraining for a couple examples which trains better and faster --- docs/src/examples/linear_parabolic.md | 10 +++------- docs/src/examples/nonlinear_elliptic.md | 12 ++---------- docs/src/tutorials/constraints.md | 6 ++---- 3 files changed, 7 insertions(+), 21 deletions(-) diff --git a/docs/src/examples/linear_parabolic.md b/docs/src/examples/linear_parabolic.md index b83970f9b3..6a584d5f3e 100644 --- a/docs/src/examples/linear_parabolic.md +++ b/docs/src/examples/linear_parabolic.md @@ -24,7 +24,7 @@ w(t, 1) = \frac{e^{\lambda_1} cos(\frac{x}{a})-e^{\lambda_2}cos(\frac{x}{a})}{\l with a physics-informed neural network. 
```@example -using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL +using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL, LineSearches using Plots import ModelingToolkit: Interval, infimum, supremum @@ -71,7 +71,7 @@ input_ = length(domains) n = 15 chain = [Lux.Chain(Dense(input_, n, Lux.σ), Dense(n, n, Lux.σ), Dense(n, 1)) for _ in 1:2] -strategy = QuadratureTraining() +strategy = GridTraining(0.01) discretization = PhysicsInformedNN(chain, strategy) @named pdesystem = PDESystem(eqs, bcs, domains, [t, x], [u(t, x), w(t, x)]) @@ -92,7 +92,7 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, BFGS(); callback = callback, maxiters = 5000) +res = Optimization.solve(prob, LBFGS(linesearch = BackTracking()); callback = callback, maxiters = 500) phi = discretization.phi @@ -110,9 +110,5 @@ for i in 1:2 p2 = plot(ts, xs, u_predict[i], linetype = :contourf, title = "predict") p3 = plot(ts, xs, diff_u[i], linetype = :contourf, title = "error") plot(p1, p2, p3) - savefig("sol_u$i") end ``` - -![linear_parabolic_sol_u1](https://user-images.githubusercontent.com/26853713/125745625-49c73760-0522-4ed4-9bdd-bcc567c9ace3.png) -![linear_parabolic_sol_u2](https://user-images.githubusercontent.com/26853713/125745637-b12e1d06-e27b-46fe-89f3-076d415fcd7e.png) diff --git a/docs/src/examples/nonlinear_elliptic.md b/docs/src/examples/nonlinear_elliptic.md index d1bfa1f015..8ac44080cd 100644 --- a/docs/src/examples/nonlinear_elliptic.md +++ b/docs/src/examples/nonlinear_elliptic.md @@ -79,7 +79,7 @@ input_ = length(domains) n = 15 chain = [Lux.Chain(Dense(input_, n, Lux.σ), Dense(n, n, Lux.σ), Dense(n, 1)) for _ in 1:6] # 1:number of @variables -strategy = QuadratureTraining() +strategy = GridTraining(0.01) discretization = PhysicsInformedNN(chain, strategy) vars = [u(x, y), w(x, y), Dxu(x, y), Dyu(x, y), Dxw(x, y), Dyw(x, y)] @@ -87,10 +87,6 @@ vars = [u(x, y), w(x, y), Dxu(x, y), Dyu(x, y), Dxw(x, y), Dyw(x, y)] prob = NeuralPDE.discretize(pdesystem, discretization) sym_prob = NeuralPDE.symbolic_discretize(pdesystem, discretization) -strategy = NeuralPDE.QuadratureTraining() -discretization = PhysicsInformedNN(chain, strategy) -sym_prob = NeuralPDE.symbolic_discretize(pdesystem, discretization) - pde_inner_loss_functions = sym_prob.loss_functions.pde_loss_functions bcs_inner_loss_functions = sym_prob.loss_functions.bc_loss_functions[1:6] aprox_derivative_loss_functions = sym_prob.loss_functions.bc_loss_functions[7:end] @@ -107,7 +103,7 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, BFGS(); callback = callback, maxiters = 5000) +res = Optimization.solve(prob, BFGS(); callback = callback, maxiters = 100) phi = discretization.phi @@ -125,9 +121,5 @@ for i in 1:2 p2 = plot(xs, ys, u_predict[i], linetype = :contourf, title = "predict") p3 = plot(xs, ys, diff_u[i], linetype = :contourf, title = "error") plot(p1, p2, p3) - savefig("non_linear_elliptic_sol_u$i") end ``` - -![non_linear_elliptic_sol_u1](https://user-images.githubusercontent.com/26853713/125745550-0b667c10-b09a-4659-a543-4f7a7e025d6c.png) -![non_linear_elliptic_sol_u2](https://user-images.githubusercontent.com/26853713/125745571-45a04739-7838-40ce-b979-43b88d149028.png) diff --git a/docs/src/tutorials/constraints.md b/docs/src/tutorials/constraints.md index 4b790e24f0..6c05b32492 100644 --- a/docs/src/tutorials/constraints.md +++ b/docs/src/tutorials/constraints.md @@ -63,7 +63,7 @@ function norm_loss_function(phi, θ, p) end discretization = 
PhysicsInformedNN(chain, - QuadratureTraining(), + GridTraining(0.01), additional_loss = norm_loss_function) @named pdesystem = PDESystem(eq, bcs, domains, [x], [p(x)]) @@ -86,7 +86,7 @@ end res = Optimization.solve(prob, LBFGS(), callback = cb_, maxiters = 400) prob = remake(prob, u0 = res.u) -res = Optimization.solve(prob, BFGS(), callback = cb_, maxiters = 2000) +res = Optimization.solve(prob, BFGS(), callback = cb_, maxiters = 500) ``` And some analysis: @@ -103,5 +103,3 @@ u_predict = [first(phi(x, res.u)) for x in xs] plot(xs, u_real, label = "analytic") plot!(xs, u_predict, label = "predict") ``` - -![fp](https://user-images.githubusercontent.com/12683885/129405830-3d00c24e-adf1-443b-aa36-6af0e5305821.png) From 0299a63552b690b6622e0b9671b310ac66f71d80 Mon Sep 17 00:00:00 2001 From: Christopher Rackauckas Date: Tue, 5 Mar 2024 02:38:07 -0500 Subject: [PATCH 04/28] Update ode_parameter_estimation.md --- docs/src/tutorials/ode_parameter_estimation.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/tutorials/ode_parameter_estimation.md b/docs/src/tutorials/ode_parameter_estimation.md index 2d3cba5517..ab95cf8216 100644 --- a/docs/src/tutorials/ode_parameter_estimation.md +++ b/docs/src/tutorials/ode_parameter_estimation.md @@ -10,6 +10,7 @@ We start by defining the problem: using NeuralPDE, OrdinaryDiffEq using Lux, Random using OptimizationOptimJL, LineSearches +using Plots using Test # hide function lv(u, p, t) From e6c3685588992a6836bde44d4db4e2711fda3b72 Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Thu, 7 Mar 2024 04:46:22 +0000 Subject: [PATCH 05/28] docs: update tutorials --- docs/Project.toml | 6 + docs/src/tutorials/constraints.md | 8 +- docs/src/tutorials/dae.md | 14 +-- .../tutorials/derivative_neural_network.md | 27 +---- docs/src/tutorials/gpu.md | 39 +++---- docs/src/tutorials/integro_diff.md | 10 +- docs/src/tutorials/low_level.md | 6 +- docs/src/tutorials/neural_adapter.md | 106 +++++++++--------- .../src/tutorials/ode_parameter_estimation.md | 4 +- docs/src/tutorials/param_estim.md | 25 +---- docs/src/tutorials/pdesystem.md | 32 +++--- docs/src/tutorials/systems.md | 32 ++---- 12 files changed, 123 insertions(+), 186 deletions(-) diff --git a/docs/Project.toml b/docs/Project.toml index c394a32569..bb7441e6b5 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,6 @@ [deps] AdvancedHMC = "0bf59076-c3b1-5ca4-86bd-e02cd72cde3d" +ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66" Cubature = "667455a9-e2ce-5579-9412-b964f529a492" DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" @@ -10,6 +11,7 @@ Integrals = "de52edbc-65ea-441a-8357-d3a637375a31" LineSearches = "d3d80556-e9d4-5f37-9878-2ab0fcc64255" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Lux = "b2108857-7c20-44ae-9111-449ecde12c47" +LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda" ModelingToolkit = "961ee093-0014-501f-94e3-6117800e7a78" MonteCarloMeasurements = "0987c9cc-fe09-11e8-30f0-b96dd679fdca" NeuralPDE = "315f7962-48a3-4962-8226-d0f33b1235f0" @@ -20,11 +22,13 @@ OptimizationPolyalgorithms = "500b13db-7e66-49ce-bda4-eed966be6282" OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" [compat] AdvancedHMC = "0.6" +ComponentArrays = "0.15" Cubature = "1.5" 
DiffEqBase = "6.106" Distributions = "0.25" @@ -34,6 +38,7 @@ Flux = "0.14" Integrals = "4" LineSearches = "7.2" Lux = "0.5" +LuxCUDA = "0.3" ModelingToolkit = "8.33" MonteCarloMeasurements = "1" NeuralPDE = "5.3" @@ -44,5 +49,6 @@ OptimizationPolyalgorithms = "0.2" OrdinaryDiffEq = "6.31" Plots = "1.36" QuasiMonteCarlo = "0.3" +Random = "1" Roots = "2.0" SpecialFunctions = "2.1" diff --git a/docs/src/tutorials/constraints.md b/docs/src/tutorials/constraints.md index 6c05b32492..5d96bff3f1 100644 --- a/docs/src/tutorials/constraints.md +++ b/docs/src/tutorials/constraints.md @@ -23,7 +23,7 @@ with Physics-Informed Neural Networks. ```@example fokkerplank using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL using Integrals, Cubature -import ModelingToolkit: Interval, infimum, supremum +using ModelingToolkit: Interval, infimum, supremum # the example is taken from this article https://arxiv.org/abs/1910.10503 @parameters x @variables p(..) @@ -63,7 +63,7 @@ function norm_loss_function(phi, θ, p) end discretization = PhysicsInformedNN(chain, - GridTraining(0.01), + QuadratureTraining(), additional_loss = norm_loss_function) @named pdesystem = PDESystem(eq, bcs, domains, [x], [p(x)]) @@ -84,9 +84,7 @@ cb_ = function (p, l) return false end -res = Optimization.solve(prob, LBFGS(), callback = cb_, maxiters = 400) -prob = remake(prob, u0 = res.u) -res = Optimization.solve(prob, BFGS(), callback = cb_, maxiters = 500) +res = Optimization.solve(prob, BFGS(), callback = cb_, maxiters = 600) ``` And some analysis: diff --git a/docs/src/tutorials/dae.md b/docs/src/tutorials/dae.md index 389df7ec2f..da17f4f46d 100644 --- a/docs/src/tutorials/dae.md +++ b/docs/src/tutorials/dae.md @@ -15,14 +15,14 @@ Let's solve a simple DAE system: ```@example dae using NeuralPDE -using Random, Flux +using Random using OrdinaryDiffEq, Statistics import Lux, OptimizationOptimisers example = (du, u, p, t) -> [cos(2pi * t) - du[1], u[2] + cos(2pi * t) - du[2]] u₀ = [1.0, -1.0] du₀ = [0.0, 0.0] -tspan = (0.0f0, 1.0f0) +tspan = (0.0, 1.0) ``` Differential_vars is a logical array which declares which variables are the differential (non-algebraic) vars @@ -33,13 +33,10 @@ differential_vars = [true, false] ```@example dae prob = DAEProblem(example, du₀, u₀, tspan; differential_vars = differential_vars) -chain = Flux.Chain(Dense(1, 15, cos), Dense(15, 15, sin), Dense(15, 2)) +chain = Lux.Chain(Lux.Dense(1, 15, cos), Lux.Dense(15, 15, sin), Lux.Dense(15, 2)) opt = OptimizationOptimisers.Adam(0.1) alg = NNDAE(chain, opt; autodiff = false) - -sol = solve(prob, - alg, verbose = false, dt = 1 / 100.0f0, - maxiters = 3000, abstol = 1.0f-10) +sol = solve(prob, alg, verbose = true, dt = 1 / 100.0, maxiters = 3000, abstol = 1e-10) ``` Now lets compare the predictions from the learned network with the ground truth which we can obtain by numerically solving the DAE. @@ -50,8 +47,7 @@ function example1(du, u, p, t) du[2] = u[2] + cos(2pi * t) nothing end -M = [1.0 0 - 0 0] +M = [1.0 0.0; 0.0 0.0] f = ODEFunction(example1, mass_matrix = M) prob_mm = ODEProblem(f, u₀, tspan) ground_sol = solve(prob_mm, Rodas5(), reltol = 1e-8, abstol = 1e-8) diff --git a/docs/src/tutorials/derivative_neural_network.md b/docs/src/tutorials/derivative_neural_network.md index 51792bdc6b..c803cdbf8e 100644 --- a/docs/src/tutorials/derivative_neural_network.md +++ b/docs/src/tutorials/derivative_neural_network.md @@ -53,9 +53,9 @@ using the second numeric derivative `Dt(Dtu1(t,x))`. 
```@example derivativenn using NeuralPDE, Lux, ModelingToolkit -using Optimization, OptimizationOptimisers, OptimizationOptimJL +using Optimization, OptimizationOptimisers, OptimizationOptimJL, LineSearches using Plots -import ModelingToolkit: Interval, infimum, supremum +using ModelingToolkit: Interval, infimum, supremum @parameters t, x Dt = Differential(t) @@ -71,8 +71,6 @@ bcs_ = [u1(0.0, x) ~ sin(pi * x), u2(0.0, x) ~ cos(pi * x), Dt(u1(0, x)) ~ -sin(pi * x), Dt(u2(0, x)) ~ -cos(pi * x), - #Dtu1(0,x) ~ -sin(pi*x), - # Dtu2(0,x) ~ -cos(pi*x), u1(t, 0.0) ~ 0.0, u2(t, 0.0) ~ exp(-t), u1(t, 1.0) ~ 0.0, @@ -93,9 +91,8 @@ input_ = length(domains) n = 15 chain = [Lux.Chain(Dense(input_, n, Lux.σ), Dense(n, n, Lux.σ), Dense(n, 1)) for _ in 1:7] -training_strategy = NeuralPDE.QuadratureTraining() -discretization = NeuralPDE.PhysicsInformedNN(chain, - training_strategy) +training_strategy = NeuralPDE.QuadratureTraining(; batch = 200, reltol = 1e-6, abstol = 1e-6) +discretization = NeuralPDE.PhysicsInformedNN(chain, training_strategy) vars = [u1(t, x), u2(t, x), u3(t, x), Dxu1(t, x), Dtu1(t, x), Dxu2(t, x), Dtu2(t, x)] @named pdesystem = PDESystem(eqs_, bcs__, domains, [t, x], vars) @@ -116,7 +113,7 @@ end res = Optimization.solve(prob, OptimizationOptimisers.Adam(0.01); callback = callback, maxiters = 2000) prob = remake(prob, u0 = res.u) -res = Optimization.solve(prob, BFGS(); callback = callback, maxiters = 10000) +res = Optimization.solve(prob, LBFGS(linesearch = BackTracking()); callback = callback, maxiters = 200) phi = discretization.phi ``` @@ -136,6 +133,7 @@ Dxu1_real(t, x) = pi * exp(-t) * cos(pi * x) Dtu1_real(t, x) = -exp(-t) * sin(pi * x) Dxu2_real(t, x) = -pi * exp(-t) * sin(pi * x) Dtu2_real(t, x) = -exp(-t) * cos(pi * x) + function analytic_sol_func_all(t, x) [u1_real(t, x), u2_real(t, x), u3_real(t, x), Dxu1_real(t, x), Dtu1_real(t, x), Dxu2_real(t, x), Dtu2_real(t, x)] @@ -151,18 +149,5 @@ for i in 1:7 p2 = plot(ts, xs, u_predict[i], linetype = :contourf, title = "predict") p3 = plot(ts, xs, diff_u[i], linetype = :contourf, title = "error") plot(p1, p2, p3) - savefig("3sol_ub$i") end ``` - -![aprNN_sol_u1](https://user-images.githubusercontent.com/12683885/122998551-de79d600-d3b5-11eb-8f5d-59d00178c2ab.png) - -![aprNN_sol_u2](https://user-images.githubusercontent.com/12683885/122998567-e3d72080-d3b5-11eb-9024-4072f4b66cda.png) - -![aprNN_sol_u3](https://user-images.githubusercontent.com/12683885/122998578-e6d21100-d3b5-11eb-96a5-f64e5593b35e.png) - -## Comparison of the second numerical derivative and numerical + neural network derivative - -![DDu1](https://user-images.githubusercontent.com/12683885/123113394-3280cb00-d447-11eb-88e3-a8541bbf089f.png) - -![DDu2](https://user-images.githubusercontent.com/12683885/123113413-36ace880-d447-11eb-8f6a-4c3caa86e359.png) diff --git a/docs/src/tutorials/gpu.md b/docs/src/tutorials/gpu.md index 5a95043450..4f47df8475 100644 --- a/docs/src/tutorials/gpu.md +++ b/docs/src/tutorials/gpu.md @@ -29,24 +29,28 @@ we must ensure that our initial parameters for the neural network are on the GPU is done, then the internal computations will all take place on the GPU. 
This is done by using the `gpu` function on the initial parameters, like: -```julia -using Lux, ComponentArrays +```@example gpu +using Lux, LuxCUDA, ComponentArrays, Random +const gpud = gpu_device() +inner = 25 chain = Chain(Dense(3, inner, Lux.σ), Dense(inner, inner, Lux.σ), Dense(inner, inner, Lux.σ), Dense(inner, inner, Lux.σ), Dense(inner, 1)) ps = Lux.setup(Random.default_rng(), chain)[1] -ps = ps |> ComponentArray |> gpu .|> Float64 +ps = ps |> ComponentArray |> gpud .|> Float64 ``` In total, this looks like: -```julia -using NeuralPDE, Lux, CUDA, Random, ComponentArrays +```@example gpu +using NeuralPDE, Lux, LuxCUDA, Random, ComponentArrays using Optimization using OptimizationOptimisers import ModelingToolkit: Interval +using Plots +using Printf @parameters t x y @variables u(..) @@ -84,9 +88,9 @@ chain = Chain(Dense(3, inner, Lux.σ), Dense(inner, inner, Lux.σ), Dense(inner, 1)) -strategy = QuadratureTraining() +strategy = QuasiRandomTraining(100) ps = Lux.setup(Random.default_rng(), chain)[1] -ps = ps |> ComponentArray |> gpu .|> Float64 +ps = ps |> ComponentArray |> gpud .|> Float64 discretization = PhysicsInformedNN(chain, strategy, init_params = ps) @@ -100,27 +104,23 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, Adam(0.01); callback = callback, maxiters = 2500) +res = Optimization.solve(prob, OptimizationOptimisers.Adam(1e-2); callback = callback, maxiters = 2500) ``` -We then use the `remake` function to rebuild the PDE problem to start a new -optimization at the optimized parameters, and continue with a lower learning rate: +We then use the `remake` function to rebuild the PDE problem to start a new optimization at the optimized parameters, and continue with a lower learning rate: -```julia +```@example gpu prob = remake(prob, u0 = res.u) -res = Optimization.solve(prob, Adam(0.001); callback = callback, maxiters = 2500) +res = Optimization.solve(prob, OptimizationOptimisers.Adam(1e-3); callback = callback, maxiters = 2500) ``` Finally, we inspect the solution: -```julia +```@example gpu phi = discretization.phi ts, xs, ys = [infimum(d.domain):0.1:supremum(d.domain) for d in domains] u_real = [analytic_sol_func(t, x, y) for t in ts for x in xs for y in ys] -u_predict = [first(Array(phi(gpu([t, x, y]), res.u))) for t in ts for x in xs for y in ys] - -using Plots -using Printf +u_predict = [first(Array(phi([t, x, y], res.u))) for t in ts for x in xs for y in ys] function plot_(res) # Animate @@ -128,7 +128,7 @@ function plot_(res) @info "Animating frame $i..." u_real = reshape([analytic_sol_func(t, x, y) for x in xs for y in ys], (length(xs), length(ys))) - u_predict = reshape([Array(phi(gpu([t, x, y]), res.u))[1] for x in xs for y in ys], + u_predict = reshape([Array(phi([t, x, y], res.u))[1] for x in xs for y in ys], length(xs), length(ys)) u_error = abs.(u_predict .- u_real) title = @sprintf("predict, t = %.3f", t) @@ -145,8 +145,6 @@ end plot_(res) ``` -![3pde](https://user-images.githubusercontent.com/12683885/129949743-9471d230-c14f-4105-945f-6bc52677d40e.gif) - ## Performance benchmarks Here are some performance benchmarks for 2d-pde with various number of input points and the @@ -155,7 +153,6 @@ runtime with GPU and CPU. 
```julia julia> CUDA.device() - ``` ![image](https://user-images.githubusercontent.com/12683885/110297207-49202500-8004-11eb-9e45-d4cb28045d87.png) diff --git a/docs/src/tutorials/integro_diff.md b/docs/src/tutorials/integro_diff.md index 3eb2323ba0..88e7683171 100644 --- a/docs/src/tutorials/integro_diff.md +++ b/docs/src/tutorials/integro_diff.md @@ -45,8 +45,9 @@ u(0) = 0 ``` ```@example integro -using NeuralPDE, Flux, ModelingToolkit, Optimization, OptimizationOptimJL, DomainSets -import ModelingToolkit: Interval, infimum, supremum +using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL, DomainSets +using ModelingToolkit: Interval, infimum, supremum +using Plots @parameters t @variables i(..) @@ -55,7 +56,7 @@ Ii = Integral(t in DomainSets.ClosedInterval(0, t)) eq = Di(i(t)) + 2 * i(t) + 5 * Ii(i(t)) ~ 1 bcs = [i(0.0) ~ 0.0] domains = [t ∈ Interval(0.0, 2.0)] -chain = Chain(Dense(1, 15, Flux.σ), Dense(15, 1)) |> f64 +chain = Lux.Chain(Lux.Dense(1, 15, Lux.σ), Lux.Dense(15, 1)) strategy_ = QuadratureTraining() discretization = PhysicsInformedNN(chain, @@ -78,9 +79,6 @@ u_predict = [first(phi([t], res.u)) for t in ts] analytic_sol_func(t) = 1 / 2 * (exp(-t)) * (sin(2 * t)) u_real = [analytic_sol_func(t) for t in ts] -using Plots plot(ts, u_real, label = "Analytical Solution") plot!(ts, u_predict, label = "PINN Solution") ``` - -![IDE](https://user-images.githubusercontent.com/12683885/129749371-18b44bbc-18c8-49c5-bf30-0cd97ecdd977.png) diff --git a/docs/src/tutorials/low_level.md b/docs/src/tutorials/low_level.md index 6935da90d4..d625bbcf0c 100644 --- a/docs/src/tutorials/low_level.md +++ b/docs/src/tutorials/low_level.md @@ -14,7 +14,7 @@ with Physics-Informed Neural Networks. Here is an example of using the low-level ```@example low_level using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL -import ModelingToolkit: Interval, infimum, supremum +using ModelingToolkit: Interval, infimum, supremum @parameters t, x @variables u(..) @@ -87,7 +87,3 @@ p2 = plot(xs, u_predict[11], title = "t = 0.5"); p3 = plot(xs, u_predict[end], title = "t = 1"); plot(p1, p2, p3) ``` - -![burgers](https://user-images.githubusercontent.com/12683885/90984874-a0870800-e580-11ea-9fd4-af8a4e3c523e.png) - -![burgers2](https://user-images.githubusercontent.com/12683885/90984856-8c430b00-e580-11ea-9206-1a88ebd24ca0.png) diff --git a/docs/src/tutorials/neural_adapter.md b/docs/src/tutorials/neural_adapter.md index 762a5108f6..6f50157448 100644 --- a/docs/src/tutorials/neural_adapter.md +++ b/docs/src/tutorials/neural_adapter.md @@ -1,9 +1,5 @@ # Transfer Learning with Neural Adapter -!!! warning - - This documentation page is out of date. - Transfer learning is a machine learning technique where a model trained on one task is re-purposed on a second related task. `neural_adapter` is the method that trains a neural network using the results from an already obtained prediction. @@ -17,9 +13,10 @@ Using the example of 2D Poisson equation, it is shown how, using the method neur ![image](https://user-images.githubusercontent.com/12683885/127149639-c2a8066f-9a25-4889-b313-5d4403567300.png) -```julia -using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL, DiffEqBase -import ModelingToolkit: Interval, infimum, supremum +```@example neural_adapter +using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimisers +using ModelingToolkit: Interval, infimum, supremum +using Random, ComponentArrays @parameters x y @variables u(..) 
@@ -33,57 +30,58 @@ eq = Dxx(u(x, y)) + Dyy(u(x, y)) ~ -sin(pi * x) * sin(pi * y) bcs = [u(0, y) ~ 0.0, u(1, y) ~ -sin(pi * 1) * sin(pi * y), u(x, 0) ~ 0.0, u(x, 1) ~ -sin(pi * x) * sin(pi * 1)] # Space and time domains -domains = [x ∈ Interval(0.0, 1.0), - y ∈ Interval(0.0, 1.0)] -quadrature_strategy = NeuralPDE.QuadratureTraining(reltol = 1e-2, abstol = 1e-2, +domains = [x ∈ Interval(0.0, 1.0), y ∈ Interval(0.0, 1.0)] +quadrature_strategy = NeuralPDE.QuadratureTraining(reltol = 1e-3, abstol = 1e-6, maxiters = 50, batch = 100) inner = 8 af = Lux.tanh -chain1 = Chain(Dense(2, inner, af), - Dense(inner, inner, af), - Dense(inner, 1)) +chain1 = Lux.Chain(Lux.Dense(2, inner, af), + Lux.Dense(inner, inner, af), + Lux.Dense(inner, 1)) -discretization = NeuralPDE.PhysicsInformedNN(chain1, - quadrature_strategy) +discretization = NeuralPDE.PhysicsInformedNN(chain1, quadrature_strategy) @named pde_system = PDESystem(eq, bcs, domains, [x, y], [u(x, y)]) prob = NeuralPDE.discretize(pde_system, discretization) sym_prob = NeuralPDE.symbolic_discretize(pde_system, discretization) -res = Optimization.solve(prob, BFGS(); maxiters = 2000) +callback = function (p, l) + println("Current loss is: $l") + return false +end + +res = Optimization.solve(prob, OptimizationOptimisers.Adam(5e-3); callback, maxiters = 10000) phi = discretization.phi -inner_ = 12 +inner_ = 8 af = Lux.tanh chain2 = Lux.Chain(Dense(2, inner_, af), Dense(inner_, inner_, af), Dense(inner_, inner_, af), Dense(inner_, 1)) - -init_params2 = Float64.(ComponentArray(Lux.setup(Random.default_rng(), chain)[1])) +initp, st = Lux.setup(Random.default_rng(), chain2) +init_params2 = Float64.(ComponentArrays.ComponentArray(initp)) # the rule by which the training will take place is described here in loss function function loss(cord, θ) - chain2(cord, θ) .- phi(cord, res.u) + global st, chain2 + ch2, st = chain2(cord, θ, st) + ch2 .- phi(cord, res.u) end strategy = NeuralPDE.QuadratureTraining() prob_ = NeuralPDE.neural_adapter(loss, init_params2, pde_system, strategy) -callback = function (p, l) - println("Current loss is: $l") - return false -end -res_ = Optimization.solve(prob_, BFGS(); callback = callback, maxiters = 1000) +res_ = Optimization.solve(prob_, OptimizationOptimisers.Adam(5e-3); callback, maxiters = 10000) -phi_ = NeuralPDE.get_phi(chain2) +phi_ = PhysicsInformedNN(chain2, strategy; init_params = res_.u).phi xs, ys = [infimum(d.domain):0.01:supremum(d.domain) for d in domains] analytic_sol_func(x, y) = (sin(pi * x) * sin(pi * y)) / (2pi^2) u_predict = reshape([first(phi([x, y], res.u)) for x in xs for y in ys], (length(xs), length(ys))) -u_predict_ = reshape([first(phi_([x, y], res_.minimizer)) for x in xs for y in ys], +u_predict_ = reshape([first(phi_([x, y], res_.u)) for x in xs for y in ys], (length(xs), length(ys))) u_real = reshape([analytic_sol_func(x, y) for x in xs for y in ys], (length(xs), length(ys))) @@ -91,16 +89,14 @@ diff_u = u_predict .- u_real diff_u_ = u_predict_ .- u_real using Plots -p1 = plot(xs, ys, u_predict, linetype = :contourf, title = "first predict"); -p2 = plot(xs, ys, u_predict_, linetype = :contourf, title = "second predict"); -p3 = plot(xs, ys, u_real, linetype = :contourf, title = "analytic"); -p4 = plot(xs, ys, diff_u, linetype = :contourf, title = "error 1"); -p5 = plot(xs, ys, diff_u_, linetype = :contourf, title = "error 2"); +p1 = plot(xs, ys, u_predict, linetype = :contourf, title = "first predict") +p2 = plot(xs, ys, u_predict_, linetype = :contourf, title = "second predict") +p3 = plot(xs, ys, 
u_real, linetype = :contourf, title = "analytic") +p4 = plot(xs, ys, diff_u, linetype = :contourf, title = "error 1") +p5 = plot(xs, ys, diff_u_, linetype = :contourf, title = "error 2") plot(p1, p2, p3, p4, p5) ``` -![neural_adapter](https://user-images.githubusercontent.com/12683885/127645487-24226cc0-fff6-4bb9-8509-77cf3e120563.png) - ## Domain decomposition In this example, we first obtain a prediction of 2D Poisson equation on subdomains. We split up full domain into 10 sub problems by x, and create separate neural networks for each sub interval. If x domain ∈ [x_0, x_end] so, it is decomposed on 10 part: sub x domains = {[x_0, x_1], ... [x_i,x_i+1], ..., x_9,x_end]}. @@ -108,9 +104,9 @@ And then using the method neural_adapter, we retrain the batch of 10 predictions ![domain_decomposition](https://user-images.githubusercontent.com/12683885/127149752-a4ecea50-2984-45d8-b0d4-d2eadecf58e7.png) -```julia -using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL, DiffEqBase -import ModelingToolkit: Interval, infimum, supremum +```@example neural_adapter +using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimisers +using ModelingToolkit: Interval, infimum, supremum @parameters x y @variables u(..) @@ -127,8 +123,7 @@ x_0 = 0.0 x_end = 1.0 x_domain = Interval(x_0, x_end) y_domain = Interval(0.0, 1.0) -domains = [x ∈ x_domain, - y ∈ y_domain] +domains = [x ∈ x_domain, y ∈ y_domain] count_decomp = 10 @@ -137,8 +132,7 @@ af = Lux.tanh inner = 10 chains = [Lux.Chain(Dense(2, inner, af), Dense(inner, inner, af), Dense(inner, 1)) for _ in 1:count_decomp] -init_params = map(c -> Float64.(ComponentArray(Lux.setup(Random.default_rng(), c)[1])), - chains) +init_params = map(c -> Float64.(ComponentArray(Lux.setup(Random.default_rng(), c)[1])), chains) xs_ = infimum(x_domain):(1 / count_decomp):supremum(x_domain) xs_domain = [(xs_[i], xs_[i + 1]) for i in 1:(length(xs_) - 1)] @@ -172,9 +166,9 @@ pde_system_map = [] for i in 1:count_decomp println("decomposition $i") domains_ = domains_map[i] - phi_in(cord) = phis[i - 1](cord, reses[i - 1].minimizer) + phi_in(cord) = phis[i - 1](cord, reses[i - 1].u) phi_bound(x, y) = phi_in(vcat(x, y)) - @register phi_bound(x, y) + @register_symbolic phi_bound(x, y) Base.Broadcast.broadcasted(::typeof(phi_bound), x, y) = phi_bound(x, y) bcs_ = create_bcs(domains_[1].domain, phi_bound) @named pde_system_ = PDESystem(eq, bcs_, domains_, [x, y], [u(x, y)]) @@ -186,7 +180,7 @@ for i in 1:count_decomp prob = NeuralPDE.discretize(pde_system_, discretization) symprob = NeuralPDE.symbolic_discretize(pde_system_, discretization) - res_ = Optimization.solve(prob, BFGS(), maxiters = 1000) + res_ = Optimization.solve(prob, OptimizationOptimisers.Adam(5e-3); maxiters = 10000, callback) phi = discretization.phi push!(reses, res_) push!(phis, phi) @@ -207,7 +201,7 @@ function compose_result(dx) end for x_ in xs i = index_of_interval(x_) - u_predict_sub = [first(phis[i]([x_, y], reses[i].minimizer)) for y in ys] + u_predict_sub = [first(phis[i]([x_, y], reses[i].u)) for y in ys] u_real_sub = [analytic_sol_func(x_, y) for y in ys] diff_u_sub = abs.(u_predict_sub .- u_real_sub) append!(u_predict_array, u_predict_sub) @@ -218,6 +212,7 @@ function compose_result(dx) diff_u = reshape(diff_u_array, (length(xs), length(ys))) u_predict, diff_u end + dx = 0.01 u_predict, diff_u = compose_result(dx) @@ -229,12 +224,17 @@ chain2 = Lux.Chain(Dense(2, inner_, af), Dense(inner_, inner_, af), Dense(inner_, 1)) -init_params2 = 
Float64.(ComponentArray(Lux.setup(Random.default_rng(), chain2)[1])) +initp, st = Lux.setup(Random.default_rng(), chain2) +init_params2 = Float64.(ComponentArrays.ComponentArray(initp)) @named pde_system = PDESystem(eq, bcs, domains, [x, y], [u(x, y)]) losses = map(1:count_decomp) do i - loss(cord, θ) = chain2(cord, θ) .- phis[i](cord, reses[i].minimizer) + function loss(cord, θ) + global st, chain2, phis, reses + ch2, st = chain2(cord, θ, st) + ch2 .- phis[i](cord, reses[i].u) + end end callback = function (p, l) @@ -244,15 +244,15 @@ end prob_ = NeuralPDE.neural_adapter(losses, init_params2, pde_system_map, NeuralPDE.QuadratureTraining()) -res_ = Optimization.solve(prob_, BFGS(); callback = callback, maxiters = 2000) -prob_ = NeuralPDE.neural_adapter(losses, res_.minimizer, pde_system_map, +res_ = Optimization.solve(prob_, OptimizationOptimisers.Adam(5e-3); callback, maxiters = 5000) +prob_ = NeuralPDE.neural_adapter(losses, res_.u, pde_system_map, NeuralPDE.QuadratureTraining()) -res_ = Optimization.solve(prob_, BFGS(); callback = callback, maxiters = 1000) +res_ = Optimization.solve(prob_, OptimizationOptimisers.Adam(5e-3); callback, maxiters = 5000) -phi_ = NeuralPDE.get_phi(chain2) +phi_ = PhysicsInformedNN(chain2, strategy; init_params = res_.u).phi xs, ys = [infimum(d.domain):dx:supremum(d.domain) for d in domains] -u_predict_ = reshape([first(phi_([x, y], res_.minimizer)) for x in xs for y in ys], +u_predict_ = reshape([first(phi_([x, y], res_.u)) for x in xs for y in ys], (length(xs), length(ys))) u_real = reshape([analytic_sol_func(x, y) for x in xs for y in ys], (length(xs), length(ys))) @@ -267,5 +267,3 @@ p4 = plot(xs, ys, diff_u, linetype = :contourf, title = "error 1"); p5 = plot(xs, ys, diff_u_, linetype = :contourf, title = "error 2"); plot(p1, p2, p3, p4, p5) ``` - -![decomp](https://user-images.githubusercontent.com/12683885/127651866-87ba95ad-10a8-471e-b4a5-093210a86c0e.png) diff --git a/docs/src/tutorials/ode_parameter_estimation.md b/docs/src/tutorials/ode_parameter_estimation.md index ab95cf8216..39bd4ef229 100644 --- a/docs/src/tutorials/ode_parameter_estimation.md +++ b/docs/src/tutorials/ode_parameter_estimation.md @@ -41,7 +41,7 @@ Now, lets define a neural network for the PINN using [Lux.jl](https://lux.csail. ```@example param_estim_lv rng = Random.default_rng() Random.seed!(rng, 0) -n = 12 +n = 15 chain = Lux.Chain( Lux.Dense(1, n, Lux.σ), Lux.Dense(n, n, Lux.σ), @@ -63,7 +63,7 @@ Next we define the optimizer and [`NNODE`](@ref) which is then plugged into the ```@example param_estim_lv opt = LBFGS(linesearch = BackTracking()) -alg = NNODE(chain, opt, ps; strategy = GridTraining(0.01), param_estim = true, additional_loss = additional_loss) +alg = NNODE(chain, opt, ps; strategy = WeightedIntervalTraining([0.7, 0.2, 0.1], 500), param_estim = true, additional_loss = additional_loss) ``` Now we have all the pieces to solve the optimization problem. diff --git a/docs/src/tutorials/param_estim.md b/docs/src/tutorials/param_estim.md index 2f2ebc782e..149b1e7200 100644 --- a/docs/src/tutorials/param_estim.md +++ b/docs/src/tutorials/param_estim.md @@ -17,7 +17,7 @@ We start by defining the problem, ```@example param_estim using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL, OrdinaryDiffEq, Plots -import ModelingToolkit: Interval, infimum, supremum +using ModelingToolkit: Interval, infimum, supremum @parameters t, σ_, β, ρ @variables x(..), y(..), z(..) 
Dt = Differential(t) @@ -90,25 +90,6 @@ function additional_loss(phi, θ, p) end ``` -#### Note about Flux - -If Flux neural network is used, then the subsetting must be computed manually as `θ` -is simply a vector. This looks like: - -```julia -init_params = [Flux.destructure(c)[1] for c in [chain1, chain2, chain3]] -acum = [0; accumulate(+, length.(init_params))] -sep = [(acum[i] + 1):acum[i + 1] for i in 1:(length(acum) - 1)] -(u_, t_) = data -len = length(data[2]) - -function additional_loss(phi, θ, p) - return sum(sum(abs2, phi[i](t_, θ[sep[i]]) .- u_[[i], :]) / len for i in 1:1:3) -end -``` - -#### Back to our originally scheduled programming - Then finally defining and optimizing using the `PhysicsInformedNN` interface. ```@example param_estim @@ -122,7 +103,7 @@ callback = function (p, l) println("Current loss is: $l") return false end -res = Optimization.solve(prob, BFGS(); callback = callback, maxiters = 5000) +res = Optimization.solve(prob, BFGS(); callback = callback, maxiters = 1000) p_ = res.u[(end - 2):end] # p_ = [9.93, 28.002, 2.667] ``` @@ -135,5 +116,3 @@ u_predict = [[discretization.phi[i]([t], minimizers[i])[1] for t in ts] for i in plot(sol) plot!(ts, u_predict, label = ["x(t)" "y(t)" "z(t)"]) ``` - -![Plot_Lorenz](https://user-images.githubusercontent.com/12683885/110944192-2ae05f00-834d-11eb-910b-f5c06d22ec8a.png) diff --git a/docs/src/tutorials/pdesystem.md b/docs/src/tutorials/pdesystem.md index 4ace039bd4..d5962f8d1e 100644 --- a/docs/src/tutorials/pdesystem.md +++ b/docs/src/tutorials/pdesystem.md @@ -27,9 +27,11 @@ Using physics-informed neural networks. ## Copy-Pasteable Code -```@example +```@example poisson using NeuralPDE, Lux, Optimization, OptimizationOptimJL -import ModelingToolkit: Interval +using LineSearches +using ModelingToolkit: Interval +using Plots @parameters x y @variables u(..) @@ -51,26 +53,23 @@ dim = 2 # number of dimensions chain = Lux.Chain(Dense(dim, 16, Lux.σ), Dense(16, 16, Lux.σ), Dense(16, 1)) # Discretization -dx = 0.05 -discretization = PhysicsInformedNN(chain, QuadratureTraining()) +discretization = PhysicsInformedNN(chain, QuadratureTraining(; batch = 200, abstol = 1e-6, reltol = 1e-6)) @named pde_system = PDESystem(eq, bcs, domains, [x, y], [u(x, y)]) prob = discretize(pde_system, discretization) -#Optimizer -opt = OptimizationOptimJL.BFGS() - #Callback function callback = function (p, l) println("Current loss is: $l") return false end -res = Optimization.solve(prob, opt, callback = callback, maxiters = 1000) +# Optimizer +opt = OptimizationOptimJL.LBFGS(linesearch = BackTracking()) +res = solve(prob, opt, callback = callback, maxiters = 2000) phi = discretization.phi -using Plots - +dx = 0.05 xs, ys = [infimum(d.domain):(dx / 10):supremum(d.domain) for d in domains] analytic_sol_func(x, y) = (sin(pi * x) * sin(pi * y)) / (2pi^2) @@ -92,7 +91,8 @@ The ModelingToolkit PDE interface for this example looks like this: ```@example poisson using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL -import ModelingToolkit: Interval +using ModelingToolkit: Interval +using Plots @parameters x y @variables u(..) @@ -122,8 +122,7 @@ Here, we build PhysicsInformedNN algorithm where `dx` is the step of discretizat `strategy` stores information for choosing a training strategy. 
```@example poisson -dx = 0.05 -discretization = PhysicsInformedNN(chain, QuadratureTraining()) +discretization = PhysicsInformedNN(chain, QuadratureTraining(; batch = 200, abstol = 1e-6, reltol = 1e-6)) ``` As described in the API docs, we now need to define the `PDESystem` and create PINNs @@ -139,7 +138,7 @@ Here, we define the callback function and the optimizer. And now we can solve th ```@example poisson #Optimizer -opt = OptimizationOptimJL.BFGS() +opt = OptimizationOptimJL.LBFGS(linesearch = BackTracking()) callback = function (p, l) println("Current loss is: $l") @@ -153,6 +152,7 @@ phi = discretization.phi We can plot the predicted solution of the PDE and compare it with the analytical solution to plot the relative error. ```@example poisson +dx = 0.05 xs, ys = [infimum(d.domain):(dx / 10):supremum(d.domain) for d in domains] analytic_sol_func(x, y) = (sin(pi * x) * sin(pi * y)) / (2pi^2) @@ -162,12 +162,8 @@ u_real = reshape([analytic_sol_func(x, y) for x in xs for y in ys], (length(xs), length(ys))) diff_u = abs.(u_predict .- u_real) -using Plots - p1 = plot(xs, ys, u_real, linetype = :contourf, title = "analytic"); p2 = plot(xs, ys, u_predict, linetype = :contourf, title = "predict"); p3 = plot(xs, ys, diff_u, linetype = :contourf, title = "error"); plot(p1, p2, p3) ``` - -![poissonplot](https://user-images.githubusercontent.com/12683885/90962648-2db35980-e4ba-11ea-8e58-f4f07c77bcb9.png) diff --git a/docs/src/tutorials/systems.md b/docs/src/tutorials/systems.md index a2422a1b2a..d4b277b1f3 100644 --- a/docs/src/tutorials/systems.md +++ b/docs/src/tutorials/systems.md @@ -35,8 +35,8 @@ with physics-informed neural networks. ## Solution ```@example system -using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL -import ModelingToolkit: Interval, infimum, supremum +using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL, LineSearches +using ModelingToolkit: Interval, infimum, supremum @parameters t, x @variables u1(..), u2(..), u3(..) @@ -67,7 +67,7 @@ input_ = length(domains) n = 15 chain = [Lux.Chain(Dense(input_, n, Lux.σ), Dense(n, n, Lux.σ), Dense(n, 1)) for _ in 1:3] -strategy = QuadratureTraining() +strategy = QuadratureTraining(; batch = 200, abstol = 1e-6, reltol = 1e-6) discretization = PhysicsInformedNN(chain, strategy) @named pdesystem = PDESystem(eqs, bcs, domains, [t, x], [u1(t, x), u2(t, x), u3(t, x)]) @@ -84,8 +84,7 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, BFGS(); callback = callback, maxiters = 5000) - +res = solve(prob, LBFGS(linesearch = BackTracking()); callback = callback, maxiters = 1000) phi = discretization.phi ``` @@ -96,7 +95,7 @@ interface. 
Here is an example using the components from `symbolic_discretize` to reproduce the `discretize` optimization: ```@example system -using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL +using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL, LineSearches import ModelingToolkit: Interval, infimum, supremum @parameters t, x @@ -152,8 +151,7 @@ end f_ = OptimizationFunction(loss_function, Optimization.AutoZygote()) prob = Optimization.OptimizationProblem(f_, sym_prob.flat_init_params) -res = Optimization.solve(prob, OptimizationOptimJL.BFGS(); callback = callback, - maxiters = 5000) +res = Optimization.solve(prob, OptimizationOptimJL.LBFGS(linesearch = BackTracking()); callback = callback, maxiters = 1000) ``` ## Solution Representation @@ -179,7 +177,6 @@ for i in 1:3 p2 = plot(ts, xs, u_predict[i], linetype = :contourf, title = "predict") p3 = plot(ts, xs, diff_u[i], linetype = :contourf, title = "error") plot(p1, p2, p3) - savefig("sol_u$i") end ``` @@ -194,25 +191,16 @@ Subsetting the array also works, but is inelegant. (If `param_estim == true`, then `res.u.p` are the fit parameters) -If Flux.jl is used, then subsetting the array is required. This looks like: - -```julia -init_params = [Flux.destructure(c)[1] for c in chain] -acum = [0; accumulate(+, length.(init_params))] -sep = [(acum[i] + 1):acum[i + 1] for i in 1:(length(acum) - 1)] -minimizers_ = [res.minimizer[s] for s in sep] -``` - #### Note: Solving Matrices of PDEs Also, in addition to vector systems, we can use the matrix form of PDEs: -```@example +```julia using ModelingToolkit, NeuralPDE @parameters x y -@variables u[1:2, 1:2](..) -@derivatives Dxx'' ~ x -@derivatives Dyy'' ~ y +@variables (u(..))[1:2, 1:2] +Dxx = Differential(x)^2 +Dyy = Differential(y)^2 # Initial and boundary conditions bcs = [u[1](x, 0) ~ x, u[2](x, 0) ~ 2, u[3](x, 0) ~ 3, u[4](x, 0) ~ 4] From cb00f40f49162818ef461ca2147bf19a9a954f6b Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Sun, 10 Mar 2024 03:59:47 +0000 Subject: [PATCH 06/28] ci: increase timeout of docs build --- .buildkite/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index ff287e15bd..7a4e4387ae 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -55,7 +55,7 @@ steps: queue: "juliagpu" cuda: "*" if: build.message !~ /\[skip docs\]/ && !build.pull_request.draft - timeout_in_minutes: 1000 + timeout_in_minutes: 1440 env: SECRET_DOCUMENTER_KEY: 
"yBJHrf5zUPu3VFotb0W4TRdauMTGgNUei3ax0xrVTmcrrP3EX8zSGaj9MNZji9H6JqyNEhCqZMGcHhxR5XK0f97YjhAp5rDNpwV6FbIXY8FXgFyIOLAenYUklta1W6fNM7KTz3Dq3UnKBAprKhQBwUw3SldTuvtm+IhiVT2rFmADqxMSQcv+5LivYEgAFrrd6LX+PHhonj38VN46z5Bi3TOIGAnByVlZssX7cOwaRg/8TloLPsWAUlQiPr89Vdlow9k6SQV8W9mf00/Rq4LFd1Eq1EYTCSmReVawMxVpXh1mj7MSabf9ppVWYOmjP5Rzatk8YFWlJf80HVvzY7tXRQ==;U2FsdGVkX1/UmKrEQkyqoZY6GckLGmpV5SsPa2byZhZbomiJKIrLntTIHK9GclgfCJ1MDCKhuo3hMVxq+NKVm0QmnMZk0Hlzszx8Up5pbcuSb2BA0Wm7IJzZ5/2uXfdBOXIejwk+GUabJtSpiDlYJUW/8bOWYZUwl9rmrOM44tPesEcs5yMZPShIowZnJqrvQUcWXZ/+aZjftX+Pe7nP0KddzYRYzhIJVzYmU394V9MWqZDqaNU19TJqnL8dNQxFLfyDibXuCqSD9QjV2I/iCfwr1xI81h11vmaatpZCDUUGYyxLRh1w5BuT1hHvLqktoAGliBD2lSOEB5KbwJu0nQCbEXPsM6CpMb4JygFloViQJ1L0kb91JEZIrnP6ruM1rrgrRu8Ufb+hob+BsCiEhSxwLy6ynOll2ZggMU/BVjQjkVQ3TmxBLvJ4T3QwzCSPpyN6lyHgSWNodN2Fn+PaWd0Sc0gSs8BY9PmOOc9m5ErVCrsscTr7aqcIbLZBnNA3y+mtLN5Vpuk3bmv9Bb9SAllXMLFFd604wtlxJi2Ecrb+6b4rc+QUmr5gnYqOYLCCOruFJfvMY63eFuuHWHKT+qroiGeuUhprUJDZUzhKUKnFFhhUZKtI11RAhVikZMPyMAsW4+gshQkEjXTRZqBpadpMh+c0uO8V2tRZAPIn7GXSdsCaGWbzL6yVZx79mM0W4sDKAp0Xs2wc04FgPbDu2sHGA+VxokrGctRovGVhSELx65aAj7x/ByhYdIByPCxHa7TNBUHb7n/4XLw8KIzVKr6jX2ep5m3KlYdjI7uxq8Hlpeu0hCRG3tdCqwOZsEOm3yhC3B/jsrKLzOsDP/x3ByAp8RvSVPP9WsWP55CxZUvc+q5LiVXBc7PhUC4nRB5/FMykjm6PboB92Y0DkP8Wql09FDr3vs8B3TkVLzOvzcz888oZTQpTaoixaAlVBs51al4N7UXhp5BInUCUIkknIyAEzXgK/5SpzixVoEkeNPkrMqg4hDaSHlKu0VDuqcP0Uv/9IKf/qs0+XK+2v9QBgqV16upbHK2EptgII3QJpXf2sq/IQTPXq3aly3NnpPUcIZJZtfG/Wk7qOFg/NhBVMvXWkrTRwQhJ5VXFTP0kXVpbgBml4Eq/zw+tAn5mmtieVaeFonZgqCIa+gi+tWiMy2V3aTubEYUGWTb3WOtxMt9i3bu9KhvOIr+fwCgpYUG9Vb/6v7H84zt5HT59sNlo9J8Acih8EfVZseC5JVF6ugxfnHc8BBOtvuUUFtOjIWwOgcfCiPXvsZdMQh0Ow3u9pYyJUZ3s+enHkBwtsu3+kXBkeL77eggslREVUz2g9eK8G5sKwSCsltgv8HQbHYARkXqq14Unb3NNakvab7BrQ2znWy7zU4P04Thtqn2fOqiAOUxuGRF5iNnkSnIZavtataT6rezB1efn4V2HYANcR4JoGj1JBXWa/dOJAYVnRLB7pbpS94yjbEMjRMB5aWSnjf6FzaDnXwfUAjAHvHNZsLxVhtIejyPQhg2kbSftwWWqw9zVvc3dz2a18V+MyNakcRiRS0CEqSl/L8vIhTppGultqzJZCKJKMAIzUE5+Q1EeDYL1kEImtSZ3hVXq4YaSB4q/bNKbDLG4pWs7IO6DFR88j5KIPfMPy15jgP6v+w0QZh8JNPQLveXvmYU4T9vxalT8i1P/4Nc2tI1A8HUv0hVoNu6V0ugTxukMLJLLFrZ80Ic7xCxNzqlzzcMhsYOHhcR4fZCdDtmoNfZm066hAkRcQJ0yNbiv7GUxrybGzer+N+s7QtS7/AGxuVW1MNQlsqdwTL2KTOvkZWHYB5cHpfqeS6zSPczeEiOIgT1fRXv3kYRBaJ4Dk7aWwXuCBbN2vAhRCEjcJC6QXE4BUukycNacytP1+HhCBeouxdPli9jrPIz2YH0iy7kc47XPiJr7zR1KAza3M3boau6xeb/why5zV7gHzB08qAxhm+pBLm4ERdKSwe/KAdGX5M0hiqMLHceUwJgzWEgukJfntxeZnP1rFJnTEAbkBy/CKtEmdr4IJYIFZU59IE9WOeYgrzl677JoGblkJ2B1sj1U8DbsRHIN+muVdAmYu+PBft0Zxih4JNe/rgmC/hNpDClMEPIEk4euRLf3xl1OHCOcWfEKuhwV/wIwJ0dtQzjN97g6a9IbSlupLAwPDZM925hC7qYicuzrF0Nj64GeOSMb4WIiEGpgH8TWOYkgxea+RoNLu0MEErcz26jqnV1QS8YGEFtT8k0lnhivg+SXIcgdVMoyavFVjqP4FeNm0aL0vE5+odOzEIzKKVNvHqofO4HbrRmlbAorS9OfzRlHbzJznIWO+8yyQc6oKyeT92THh4+wFYXQzkw0bvHsJZ08OymCAnIP+dZCzOSJ/MzcI3BjmcMZcHasPS6CfgSRtm7o8GJvyZljfW/z4zdy6+HzGLxZcgiwDc4qODYBEMdSRdDrxzpENZ4/IK3JTTcafsrRgbqi1nYyadQQx5A5xFhyYZq04iaDxx8GmDV6v4l4Hp/iPHoG0mp3yjxrt5hUjEyLW/+5lZXCnBxIfEiV/vdZBXGJxZs3oiATTOjMQxQYbbcyEs02WnFqRsMxDwlTsUnhbnEPUb9vkQhJjkHAsOt2V7csH7ojwlJPCp9baWV6iqSbvtQu5sSwWItOJflxiw2NCVvvMhGjOQOb8sFf6yeAKHOi+2vk0T7Kkr5JziPZ1L2PDmiZHwMhjVwd2uIZP6pBuQtoyxxn6040nUX5QwHjVO7RamVmqOLoKJFQHYWlRs1FHSxK6BQWH/1MeSDvCBWfiFnm6wWUMWr9NKlSOMFPWEnVdQ+eu83yoSRVT0U7WNoSw/tyK1KB64DL6Z7VffWB1AvnMZ1uvuSFtkEjHOTrcmDkGzSDQs61qO8kmDiTqlpMDVbv7DmhiXpBAC2agwLw/xh1m3xoRNetTNxowMVRjokZCrevoPLCcxrRDl0tZz9g/fF2q9rMRIAEhwWTYC+WgX4KQ4Xj4BpFbx1d3G2oHEAIJItyIFHHoKzwKJl+InNJEdXZUCEpdE3IiI2agnbP9W/1sSRWKE1Ub0KukhK7GYBIwdnwx0GgqqLYTRNrA8PnKLkSQp4ri9XJRSxI52jqbMP/S3x2ogIbvyYeQXbCwS7jloEMSgDSAQcTPZpPEzR5tHZG/XMMYHZX/h+aARdsaQJq7UNoIAJ8zrwkW
njNKSqrpF3Wfn/uOCYHXsyHepa/4f9cf0mtklGa4nSZPV8nVa0jvXzt2lzmg7uur0/ysa8Wl2LAZpkcTLlZ1tbFrbdBbcibPGi4r0QYJ6BM60yjfarwq/WnFHY2BLIpJKOxxA/7ko6kZ05t+fe/IqZnkxHqH9PsdpCN63H+py0S3tMOijrULzRMNjalF1ywPNm2Ugl/EiBbx7wv2G30wMdk1pdKbjgiGdq2wF1nPb4fOGBSoFtk8USmDSno1/FnYRKCH4gcyV3x/vHflFUHlSbh5Aw43YT1wxASJt7lvPq/uSTVw8wVe0bavIW4Gzyk7Fds5fjEi0eyZRtCfAbPlsCQ6aybuZQD870vdT8bxc1sRTdjDRbtFy8PGQqRwR91MhqITLOT7FpptcwltMV9jsGAEXBS6EX754sT3hYLB9OK6INME0rAbHUmq9tvCknIAiH3LIwuJJHHBLEFVeYveTk/00iBHjvn4Yb9MYEPaiTgMcQwRz8khf1aWj/Vz16c2aOjdiZOBEDpxH5h5tJLAMmc8bHfVAhWqdC6hDaulAgHEuo5gcKbkIbWEX/jvuOJq0fO9EYz6eDwwPy6hGGB5pzasJIKUDvXRCh40f5Iy8DOcLJRh+KCxjD9zDqhQQZ8WC+9l1/brckKPw59n0F3Sh3c+nHfyYBBlmsONTmjUZTgRwqg92z+2YQWyUf7g5jmNtBEjLyXtNvn9HkZGXd9YVp7Ps10GklGQiKYZmWUl73KGtsfsRF+/SQ4kRumd4YnlC7b04w6tFRk3HMog+38OVZDwMj40unK4XJWYABJX0t2XySGlXrL8ZNW8xR/iCVsW6+4glxFvTeH5ujPUjQKFb/0bvbZeExeYnRECdDz6u3z/gQYZiUMA8NUNKJuQTzWV9nXyubOWEG9NuJZ4X7oaGE/DtWO3j57r8bcE9KdtV+DpGvKyS+lBrdxL5vlOJ3rX+PqWeIOkxKc85lKT/us8H054bVubQebl0Tc+rvyqZVM7MToHgDj9VwlEbfV6o02Em/J5JUh0GMCoJw6CX5rfHgAIPlhlh/wXRVj8FQcUiTSzDb8lpwXxGO9rNWNfgE9ZRduiXT5LnUYhf6BC5eomyvZ6DcqDABJyWIV7kejbT0TlspicebTzP/kMyTPrGM9TchZjMdv6bPp8jRf4lUGWBjD4i1Diris5aM=" From b05a35dadf9cd9a74e553c96ca55538a1c4e2d7e Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Mon, 11 Mar 2024 09:50:40 +0000 Subject: [PATCH 07/28] docs: use BFGS with BackTracking for param estimation with PDEs tutorial --- docs/src/tutorials/param_estim.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/tutorials/param_estim.md b/docs/src/tutorials/param_estim.md index 149b1e7200..79aac22b31 100644 --- a/docs/src/tutorials/param_estim.md +++ b/docs/src/tutorials/param_estim.md @@ -16,7 +16,7 @@ We start by defining the problem, ```@example param_estim using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL, OrdinaryDiffEq, - Plots + Plots, LineSearches using ModelingToolkit: Interval, infimum, supremum @parameters t, σ_, β, ρ @variables x(..), y(..), z(..) @@ -94,7 +94,7 @@ Then finally defining and optimizing using the `PhysicsInformedNN` interface. ```@example param_estim discretization = NeuralPDE.PhysicsInformedNN([chain1, chain2, chain3], - NeuralPDE.QuadratureTraining(), param_estim = true, + NeuralPDE.QuadratureTraining(; abstol = 1e-6, reltol = 1e-6, batch = 200), param_estim = true, additional_loss = additional_loss) @named pde_system = PDESystem(eqs, bcs, domains, [t], [x(t), y(t), z(t)], [σ_, ρ, β], defaults = Dict([p .=> 1.0 for p in [σ_, ρ, β]])) @@ -103,7 +103,7 @@ callback = function (p, l) println("Current loss is: $l") return false end -res = Optimization.solve(prob, BFGS(); callback = callback, maxiters = 1000) +res = Optimization.solve(prob, BFGS(linesearch = BackTracking()); callback = callback, maxiters = 1000) p_ = res.u[(end - 2):end] # p_ = [9.93, 28.002, 2.667] ``` From 66db48ff0393b17e88e1887e67c6d17e4a855aad Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Tue, 12 Mar 2024 04:46:48 +0000 Subject: [PATCH 08/28] docs: use BFGS with back tracking for low level system tutorial --- docs/src/tutorials/low_level.md | 7 +++---- docs/src/tutorials/pdesystem.md | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/src/tutorials/low_level.md b/docs/src/tutorials/low_level.md index d625bbcf0c..63fa0d476c 100644 --- a/docs/src/tutorials/low_level.md +++ b/docs/src/tutorials/low_level.md @@ -13,7 +13,7 @@ u(t, -1) = u(t, 1) = 0 \, , with Physics-Informed Neural Networks. 
Here is an example of using the low-level API: ```@example low_level -using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL +using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL, LineSearches using ModelingToolkit: Interval, infimum, supremum @parameters t, x @@ -37,7 +37,7 @@ domains = [t ∈ Interval(0.0, 1.0), # Neural network chain = Lux.Chain(Dense(2, 16, Lux.σ), Dense(16, 16, Lux.σ), Dense(16, 1)) -strategy = NeuralPDE.QuadratureTraining() +strategy = NeuralPDE.QuadratureTraining(; abstol = 1e-6, reltol = 1e-6, batch = 200) indvars = [t, x] depvars = [u(t, x)] @@ -67,8 +67,7 @@ end f_ = OptimizationFunction(loss_function, Optimization.AutoZygote()) prob = Optimization.OptimizationProblem(f_, sym_prob.flat_init_params) -res = Optimization.solve(prob, OptimizationOptimJL.BFGS(); callback = callback, - maxiters = 2000) +res = Optimization.solve(prob, BFGS(linesearch = BackTracking()); callback = callback, maxiters = 3000) ``` And some analysis: diff --git a/docs/src/tutorials/pdesystem.md b/docs/src/tutorials/pdesystem.md index d5962f8d1e..4076bce5cf 100644 --- a/docs/src/tutorials/pdesystem.md +++ b/docs/src/tutorials/pdesystem.md @@ -66,7 +66,7 @@ end # Optimizer opt = OptimizationOptimJL.LBFGS(linesearch = BackTracking()) -res = solve(prob, opt, callback = callback, maxiters = 2000) +res = solve(prob, opt, callback = callback, maxiters = 1000) phi = discretization.phi dx = 0.05 From b13f7c0c5bf2576ce87caa04282915582d279808 Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Tue, 12 Mar 2024 06:33:08 +0000 Subject: [PATCH 09/28] docs: update examples --- docs/src/examples/linear_parabolic.md | 8 ++++---- docs/src/examples/nonlinear_hyperbolic.md | 8 ++------ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/docs/src/examples/linear_parabolic.md b/docs/src/examples/linear_parabolic.md index 6a584d5f3e..2e7936f5c0 100644 --- a/docs/src/examples/linear_parabolic.md +++ b/docs/src/examples/linear_parabolic.md @@ -24,9 +24,9 @@ w(t, 1) = \frac{e^{\lambda_1} cos(\frac{x}{a})-e^{\lambda_2}cos(\frac{x}{a})}{\l with a physics-informed neural network. ```@example -using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL, LineSearches +using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimisers, OptimizationOptimJL, LineSearches using Plots -import ModelingToolkit: Interval, infimum, supremum +using ModelingToolkit: Interval, infimum, supremum @parameters t, x @variables u(..), w(..) 
@@ -71,7 +71,7 @@ input_ = length(domains) n = 15 chain = [Lux.Chain(Dense(input_, n, Lux.σ), Dense(n, n, Lux.σ), Dense(n, 1)) for _ in 1:2] -strategy = GridTraining(0.01) +strategy = StochasticTraining(500) discretization = PhysicsInformedNN(chain, strategy) @named pdesystem = PDESystem(eqs, bcs, domains, [t, x], [u(t, x), w(t, x)]) @@ -92,7 +92,7 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, LBFGS(linesearch = BackTracking()); callback = callback, maxiters = 500) +res = Optimization.solve(prob, OptimizationOptimisers.Adam(1e-2); callback = callback, maxiters = 10000) phi = discretization.phi diff --git a/docs/src/examples/nonlinear_hyperbolic.md b/docs/src/examples/nonlinear_hyperbolic.md index 6ac3401dbb..6666912dd4 100644 --- a/docs/src/examples/nonlinear_hyperbolic.md +++ b/docs/src/examples/nonlinear_hyperbolic.md @@ -33,7 +33,7 @@ where k is a root of the algebraic (transcendental) equation f(k) = g(k), j0 and We solve this with Neural: ```@example -using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL, Roots +using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL, Roots, LineSearches using SpecialFunctions using Plots import ModelingToolkit: Interval, infimum, supremum @@ -99,7 +99,7 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, BFGS(); callback = callback, maxiters = 1000) +res = Optimization.solve(prob, BFGS(linesearch = BackTracking()); callback = callback, maxiters = 200) phi = discretization.phi @@ -117,9 +117,5 @@ for i in 1:2 p2 = plot(ts, xs, u_predict[i], linetype = :contourf, title = "predict") p3 = plot(ts, xs, diff_u[i], linetype = :contourf, title = "error") plot(p1, p2, p3) - savefig("nonlinear_hyperbolic_sol_u$i") end ``` - -![nonlinear_hyperbolic_sol_u1](https://user-images.githubusercontent.com/26853713/126457614-d19e7a4d-f9e3-4e78-b8ae-1e58114a744e.png) -![nonlinear_hyperbolic_sol_u2](https://user-images.githubusercontent.com/26853713/126457617-ee26c587-a97f-4a2e-b6b7-b326b1f117af.png) From 01e3111823c583050ac3991b5904725d0dabfaa8 Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Wed, 13 Mar 2024 04:42:48 +0000 Subject: [PATCH 10/28] docs: remove callback from solve as the output becomes long --- docs/src/examples/3rd.md | 2 +- docs/src/examples/heterogeneous.md | 2 +- docs/src/examples/ks.md | 4 +-- docs/src/examples/linear_parabolic.md | 15 ++++++-- docs/src/examples/nonlinear_elliptic.md | 17 ++++++--- docs/src/examples/nonlinear_hyperbolic.md | 18 +++++++--- docs/src/examples/wave.md | 16 +++------ docs/src/tutorials/constraints.md | 2 +- docs/src/tutorials/dae.md | 2 +- .../tutorials/derivative_neural_network.md | 36 ++++++++++++++++--- docs/src/tutorials/dgm.md | 4 +-- docs/src/tutorials/gpu.md | 2 +- docs/src/tutorials/integro_diff.md | 2 +- docs/src/tutorials/low_level.md | 2 +- docs/src/tutorials/neural_adapter.md | 10 +++--- docs/src/tutorials/param_estim.md | 2 +- docs/src/tutorials/pdesystem.md | 5 +-- docs/src/tutorials/systems.md | 19 ++++++++-- 18 files changed, 110 insertions(+), 50 deletions(-) diff --git a/docs/src/examples/3rd.md b/docs/src/examples/3rd.md index 829cd3f7e2..e64358e177 100644 --- a/docs/src/examples/3rd.md +++ b/docs/src/examples/3rd.md @@ -47,7 +47,7 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, OptimizationOptimisers.Adam(0.01); callback = callback, maxiters = 2000) +res = Optimization.solve(prob, OptimizationOptimisers.Adam(0.01); maxiters = 2000) phi = discretization.phi ``` diff --git 
a/docs/src/examples/heterogeneous.md b/docs/src/examples/heterogeneous.md index ad4b72b2ec..286e085be3 100644 --- a/docs/src/examples/heterogeneous.md +++ b/docs/src/examples/heterogeneous.md @@ -45,5 +45,5 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, BFGS(); callback = callback, maxiters = 100) +res = Optimization.solve(prob, BFGS(); maxiters = 100) ``` diff --git a/docs/src/examples/ks.md b/docs/src/examples/ks.md index bb2b1064f5..55f75f825d 100644 --- a/docs/src/examples/ks.md +++ b/docs/src/examples/ks.md @@ -72,7 +72,7 @@ callback = function (p, l) end opt = OptimizationOptimJL.BFGS() -res = Optimization.solve(prob, opt; callback = callback, maxiters = 2000) +res = Optimization.solve(prob, opt; maxiters = 2000) phi = discretization.phi ``` @@ -93,5 +93,3 @@ p2 = plot(xs, u_real, title = "analytic") p3 = plot(xs, diff_u, title = "error") plot(p1, p2, p3) ``` - -![plotks](https://user-images.githubusercontent.com/12683885/91025889-a6253200-e602-11ea-8f61-8e6e2488e025.png) diff --git a/docs/src/examples/linear_parabolic.md b/docs/src/examples/linear_parabolic.md index 2e7936f5c0..60494a5c9a 100644 --- a/docs/src/examples/linear_parabolic.md +++ b/docs/src/examples/linear_parabolic.md @@ -23,7 +23,7 @@ w(t, 1) = \frac{e^{\lambda_1} cos(\frac{x}{a})-e^{\lambda_2}cos(\frac{x}{a})}{\l with a physics-informed neural network. -```@example +```@example linear_parabolic using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimisers, OptimizationOptimJL, LineSearches using Plots using ModelingToolkit: Interval, infimum, supremum @@ -92,7 +92,7 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, OptimizationOptimisers.Adam(1e-2); callback = callback, maxiters = 10000) +res = Optimization.solve(prob, OptimizationOptimisers.Adam(1e-2); maxiters = 10000) phi = discretization.phi @@ -105,10 +105,19 @@ analytic_sol_func(t, x) = [u_analytic(t, x), w_analytic(t, x)] u_real = [[analytic_sol_func(t, x)[i] for t in ts for x in xs] for i in 1:2] u_predict = [[phi[i]([t, x], minimizers_[i])[1] for t in ts for x in xs] for i in 1:2] diff_u = [abs.(u_real[i] .- u_predict[i]) for i in 1:2] +ps = [] for i in 1:2 p1 = plot(ts, xs, u_real[i], linetype = :contourf, title = "u$i, analytic") p2 = plot(ts, xs, u_predict[i], linetype = :contourf, title = "predict") p3 = plot(ts, xs, diff_u[i], linetype = :contourf, title = "error") - plot(p1, p2, p3) + push!(ps, plot(p1, p2, p3)) end ``` + +```@example linear_parabolic +ps[1] +``` + +```@example linear_parabolic +ps[2] +``` diff --git a/docs/src/examples/nonlinear_elliptic.md b/docs/src/examples/nonlinear_elliptic.md index 8ac44080cd..155330b2bc 100644 --- a/docs/src/examples/nonlinear_elliptic.md +++ b/docs/src/examples/nonlinear_elliptic.md @@ -26,10 +26,10 @@ where k is a root of the algebraic (transcendental) equation f(k) = g(k). This is done using a derivative neural network approximation. 
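In outline, the derivative-neural-network approach gives each required derivative its own dependent variable (and hence its own network) and couples it to the primary unknown through an extra equation. A minimal sketch of that coupling, with illustrative variable names not taken from the example below:

```julia
using ModelingToolkit

@parameters x y
@variables u(..) Dxu(..)   # Dxu is approximated by a separate network
Dx = Differential(x)

# Compatibility equation tying the auxiliary variable to the derivative of u;
# in the full examples it is imposed alongside the PDE and boundary conditions.
compat_eq = Dxu(x, y) ~ Dx(u(x, y))
```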
-```@example +```@example nonlinear_elliptic using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL, Roots using Plots -import ModelingToolkit: Interval, infimum, supremum +using ModelingToolkit: Interval, infimum, supremum @parameters x, y Dx = Differential(x) @@ -103,7 +103,7 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, BFGS(); callback = callback, maxiters = 100) +res = Optimization.solve(prob, BFGS(); maxiters = 100) phi = discretization.phi @@ -116,10 +116,19 @@ analytic_sol_func(x, y) = [u_analytic(x, y), w_analytic(x, y)] u_real = [[analytic_sol_func(x, y)[i] for x in xs for y in ys] for i in 1:2] u_predict = [[phi[i]([x, y], minimizers_[i])[1] for x in xs for y in ys] for i in 1:2] diff_u = [abs.(u_real[i] .- u_predict[i]) for i in 1:2] +ps = [] for i in 1:2 p1 = plot(xs, ys, u_real[i], linetype = :contourf, title = "u$i, analytic") p2 = plot(xs, ys, u_predict[i], linetype = :contourf, title = "predict") p3 = plot(xs, ys, diff_u[i], linetype = :contourf, title = "error") - plot(p1, p2, p3) + push!(ps, plot(p1, p2, p3)) end ``` + +```@example nonlinear_elliptic +ps[1] +``` + +```@example nonlinear_elliptic +ps[2] +``` diff --git a/docs/src/examples/nonlinear_hyperbolic.md b/docs/src/examples/nonlinear_hyperbolic.md index 6666912dd4..60203b1610 100644 --- a/docs/src/examples/nonlinear_hyperbolic.md +++ b/docs/src/examples/nonlinear_hyperbolic.md @@ -32,11 +32,11 @@ where k is a root of the algebraic (transcendental) equation f(k) = g(k), j0 and We solve this with Neural: -```@example +```@example nonlinear_hyperbolic using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL, Roots, LineSearches using SpecialFunctions using Plots -import ModelingToolkit: Interval, infimum, supremum +using ModelingToolkit: Interval, infimum, supremum @parameters t, x @variables u(..), w(..) @@ -99,7 +99,7 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, BFGS(linesearch = BackTracking()); callback = callback, maxiters = 200) +res = Optimization.solve(prob, BFGS(linesearch = BackTracking()); maxiters = 200) phi = discretization.phi @@ -112,10 +112,20 @@ analytic_sol_func(t, x) = [u_analytic(t, x), w_analytic(t, x)] u_real = [[analytic_sol_func(t, x)[i] for t in ts for x in xs] for i in 1:2] u_predict = [[phi[i]([t, x], minimizers_[i])[1] for t in ts for x in xs] for i in 1:2] diff_u = [abs.(u_real[i] .- u_predict[i]) for i in 1:2] +ps = [] for i in 1:2 p1 = plot(ts, xs, u_real[i], linetype = :contourf, title = "u$i, analytic") p2 = plot(ts, xs, u_predict[i], linetype = :contourf, title = "predict") p3 = plot(ts, xs, diff_u[i], linetype = :contourf, title = "error") - plot(p1, p2, p3) + push!(ps, plot(p1, p2, p3)) end ``` + + +```@example nonlinear_hyperbolic +ps[1] +``` + +```@example nonlinear_hyperbolic +ps[2] +``` diff --git a/docs/src/examples/wave.md b/docs/src/examples/wave.md index d65a7b7225..954b15cc9a 100644 --- a/docs/src/examples/wave.md +++ b/docs/src/examples/wave.md @@ -17,7 +17,7 @@ Further, the solution of this equation with the given boundary conditions is pre ```@example wave using NeuralPDE, Lux, Optimization, OptimizationOptimJL -import ModelingToolkit: Interval +using ModelingToolkit: Interval @parameters t, x @variables u(..) @@ -99,7 +99,7 @@ with grid discretization `dx = 0.05` and physics-informed neural networks. 
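As a standalone aside, the two simplest ways these examples sample the residual can be constructed directly; the step size and point count below mirror values that appear elsewhere in this series but are otherwise arbitrary.

```julia
using NeuralPDE

# Evaluate the residual on a fixed grid with step 0.05 in every dimension.
grid_strategy = NeuralPDE.GridTraining(0.05)

# Or draw 500 uniformly random collocation points at each optimizer iteration.
stochastic_strategy = NeuralPDE.StochasticTraining(500)
```

The damped-wave example itself continues below.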
Here, ```@example wave2 using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL using Plots, Printf -import ModelingToolkit: Interval, infimum, supremum +using ModelingToolkit: Interval, infimum, supremum @parameters t, x @variables u(..) Dxu(..) Dtu(..) O1(..) O2(..) @@ -162,9 +162,9 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, BFGS(); callback = callback, maxiters = 2000) +res = Optimization.solve(prob, BFGS(); maxiters = 2000) prob = remake(prob, u0 = res.u) -res = Optimization.solve(prob, BFGS(); callback = callback, maxiters = 2000) +res = Optimization.solve(prob, BFGS(); maxiters = 2000) phi = discretization.phi[1] @@ -212,11 +212,3 @@ p2 = plot(ts, xs, u_predict, linetype = :contourf, title = "predict"); p3 = plot(ts, xs, diff_u, linetype = :contourf, title = "error"); plot(p1, p2, p3) ``` - -We can see the results here: - -![Damped_wave_sol_adaptive_u](https://user-images.githubusercontent.com/12683885/149665332-d4daf7d0-682e-4933-a2b4-34f403881afb.png) - -Plotted as a line, one can see the analytical solution and the prediction here: - -![1Dwave_damped_adaptive](https://user-images.githubusercontent.com/12683885/149665327-69d04c01-2240-45ea-981e-a7b9412a3b58.gif) diff --git a/docs/src/tutorials/constraints.md b/docs/src/tutorials/constraints.md index 5d96bff3f1..491267fac2 100644 --- a/docs/src/tutorials/constraints.md +++ b/docs/src/tutorials/constraints.md @@ -84,7 +84,7 @@ cb_ = function (p, l) return false end -res = Optimization.solve(prob, BFGS(), callback = cb_, maxiters = 600) +res = Optimization.solve(prob, BFGS(), maxiters = 600) ``` And some analysis: diff --git a/docs/src/tutorials/dae.md b/docs/src/tutorials/dae.md index da17f4f46d..de83542c63 100644 --- a/docs/src/tutorials/dae.md +++ b/docs/src/tutorials/dae.md @@ -17,7 +17,7 @@ Let's solve a simple DAE system: using NeuralPDE using Random using OrdinaryDiffEq, Statistics -import Lux, OptimizationOptimisers +using Lux, OptimizationOptimisers example = (du, u, p, t) -> [cos(2pi * t) - du[1], u[2] + cos(2pi * t) - du[2]] u₀ = [1.0, -1.0] diff --git a/docs/src/tutorials/derivative_neural_network.md b/docs/src/tutorials/derivative_neural_network.md index c803cdbf8e..82ad99b986 100644 --- a/docs/src/tutorials/derivative_neural_network.md +++ b/docs/src/tutorials/derivative_neural_network.md @@ -111,9 +111,9 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, OptimizationOptimisers.Adam(0.01); callback = callback, maxiters = 2000) +res = Optimization.solve(prob, OptimizationOptimisers.Adam(0.01); maxiters = 2000) prob = remake(prob, u0 = res.u) -res = Optimization.solve(prob, LBFGS(linesearch = BackTracking()); callback = callback, maxiters = 200) +res = Optimization.solve(prob, LBFGS(linesearch = BackTracking()); maxiters = 200) phi = discretization.phi ``` @@ -142,12 +142,40 @@ end u_real = [[analytic_sol_func_all(t, x)[i] for t in ts for x in xs] for i in 1:7] u_predict = [[phi[i]([t, x], minimizers_[i])[1] for t in ts for x in xs] for i in 1:7] diff_u = [abs.(u_real[i] .- u_predict[i]) for i in 1:7] - +ps = [] titles = ["u1", "u2", "u3", "Dtu1", "Dtu2", "Dxu1", "Dxu2"] for i in 1:7 p1 = plot(ts, xs, u_real[i], linetype = :contourf, title = "$(titles[i]), analytic") p2 = plot(ts, xs, u_predict[i], linetype = :contourf, title = "predict") p3 = plot(ts, xs, diff_u[i], linetype = :contourf, title = "error") - plot(p1, p2, p3) + push!(ps, plot(p1, p2, p3)) end ``` + +```@example derivativenn +ps[1] +``` + +```@example derivativenn +ps[2] +``` + 
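The `ps[i]` blocks above and below follow a common Documenter.jl pattern: an `@example` block renders only the value of its final expression, so plots created in a loop are collected into a vector and each figure is then shown from its own block. A rough sketch of the idea (the data here is a placeholder):

```julia
using Plots

# First block: build the figures; nothing is rendered from inside the loop.
ps = []
for i in 1:3
    push!(ps, plot(rand(10), title = "figure $i"))
end

# A later block ends with a single expression such as `ps[1]`,
# so that figure becomes the block's rendered output.
ps[1]
```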
+```@example derivativenn +ps[3] +``` + +```@example derivativenn +ps[4] +``` + +```@example derivativenn +ps[5] +``` + +```@example derivativenn +ps[6] +``` + +```@example derivativenn +ps[7] +``` diff --git a/docs/src/tutorials/dgm.md b/docs/src/tutorials/dgm.md index b20eb30bb9..61d32cbe33 100644 --- a/docs/src/tutorials/dgm.md +++ b/docs/src/tutorials/dgm.md @@ -100,9 +100,9 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, Adam(0.1); callback = callback, maxiters = 100) +res = Optimization.solve(prob, Adam(0.1); maxiters = 100) prob = remake(prob, u0 = res.u) -res = Optimization.solve(prob, Adam(0.01); callback = callback, maxiters = 500) +res = Optimization.solve(prob, Adam(0.01); maxiters = 500) phi = discretization.phi u_predict= [first(phi([t, x], res.minimizer)) for t in ts, x in xs] diff --git a/docs/src/tutorials/gpu.md b/docs/src/tutorials/gpu.md index 4f47df8475..feea05c9e1 100644 --- a/docs/src/tutorials/gpu.md +++ b/docs/src/tutorials/gpu.md @@ -104,7 +104,7 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, OptimizationOptimisers.Adam(1e-2); callback = callback, maxiters = 2500) +res = Optimization.solve(prob, OptimizationOptimisers.Adam(1e-2); maxiters = 2500) ``` We then use the `remake` function to rebuild the PDE problem to start a new optimization at the optimized parameters, and continue with a lower learning rate: diff --git a/docs/src/tutorials/integro_diff.md b/docs/src/tutorials/integro_diff.md index 88e7683171..e977b87af4 100644 --- a/docs/src/tutorials/integro_diff.md +++ b/docs/src/tutorials/integro_diff.md @@ -67,7 +67,7 @@ callback = function (p, l) println("Current loss is: $l") return false end -res = Optimization.solve(prob, BFGS(); callback = callback, maxiters = 100) +res = Optimization.solve(prob, BFGS(); maxiters = 100) ``` Plotting the final solution and analytical solution diff --git a/docs/src/tutorials/low_level.md b/docs/src/tutorials/low_level.md index 63fa0d476c..33b605e77d 100644 --- a/docs/src/tutorials/low_level.md +++ b/docs/src/tutorials/low_level.md @@ -67,7 +67,7 @@ end f_ = OptimizationFunction(loss_function, Optimization.AutoZygote()) prob = Optimization.OptimizationProblem(f_, sym_prob.flat_init_params) -res = Optimization.solve(prob, BFGS(linesearch = BackTracking()); callback = callback, maxiters = 3000) +res = Optimization.solve(prob, BFGS(linesearch = BackTracking()); maxiters = 3000) ``` And some analysis: diff --git a/docs/src/tutorials/neural_adapter.md b/docs/src/tutorials/neural_adapter.md index 6f50157448..a56e30a269 100644 --- a/docs/src/tutorials/neural_adapter.md +++ b/docs/src/tutorials/neural_adapter.md @@ -50,7 +50,7 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, OptimizationOptimisers.Adam(5e-3); callback, maxiters = 10000) +res = Optimization.solve(prob, OptimizationOptimisers.Adam(5e-3); maxiters = 10000) phi = discretization.phi inner_ = 8 @@ -72,7 +72,7 @@ end strategy = NeuralPDE.QuadratureTraining() prob_ = NeuralPDE.neural_adapter(loss, init_params2, pde_system, strategy) -res_ = Optimization.solve(prob_, OptimizationOptimisers.Adam(5e-3); callback, maxiters = 10000) +res_ = Optimization.solve(prob_, OptimizationOptimisers.Adam(5e-3); maxiters = 10000) phi_ = PhysicsInformedNN(chain2, strategy; init_params = res_.u).phi @@ -180,7 +180,7 @@ for i in 1:count_decomp prob = NeuralPDE.discretize(pde_system_, discretization) symprob = NeuralPDE.symbolic_discretize(pde_system_, discretization) - res_ = Optimization.solve(prob, 
OptimizationOptimisers.Adam(5e-3); maxiters = 10000, callback) + res_ = Optimization.solve(prob, OptimizationOptimisers.Adam(5e-3); maxiters = 10000) phi = discretization.phi push!(reses, res_) push!(phis, phi) @@ -244,10 +244,10 @@ end prob_ = NeuralPDE.neural_adapter(losses, init_params2, pde_system_map, NeuralPDE.QuadratureTraining()) -res_ = Optimization.solve(prob_, OptimizationOptimisers.Adam(5e-3); callback, maxiters = 5000) +res_ = Optimization.solve(prob_, OptimizationOptimisers.Adam(5e-3); maxiters = 5000) prob_ = NeuralPDE.neural_adapter(losses, res_.u, pde_system_map, NeuralPDE.QuadratureTraining()) -res_ = Optimization.solve(prob_, OptimizationOptimisers.Adam(5e-3); callback, maxiters = 5000) +res_ = Optimization.solve(prob_, OptimizationOptimisers.Adam(5e-3); maxiters = 5000) phi_ = PhysicsInformedNN(chain2, strategy; init_params = res_.u).phi diff --git a/docs/src/tutorials/param_estim.md b/docs/src/tutorials/param_estim.md index 79aac22b31..e696c76702 100644 --- a/docs/src/tutorials/param_estim.md +++ b/docs/src/tutorials/param_estim.md @@ -103,7 +103,7 @@ callback = function (p, l) println("Current loss is: $l") return false end -res = Optimization.solve(prob, BFGS(linesearch = BackTracking()); callback = callback, maxiters = 1000) +res = Optimization.solve(prob, BFGS(linesearch = BackTracking()); maxiters = 1000) p_ = res.u[(end - 2):end] # p_ = [9.93, 28.002, 2.667] ``` diff --git a/docs/src/tutorials/pdesystem.md b/docs/src/tutorials/pdesystem.md index 4076bce5cf..d1c438b30c 100644 --- a/docs/src/tutorials/pdesystem.md +++ b/docs/src/tutorials/pdesystem.md @@ -66,7 +66,7 @@ end # Optimizer opt = OptimizationOptimJL.LBFGS(linesearch = BackTracking()) -res = solve(prob, opt, callback = callback, maxiters = 1000) +res = solve(prob, opt, maxiters = 1000) phi = discretization.phi dx = 0.05 @@ -145,7 +145,8 @@ callback = function (p, l) return false end -res = Optimization.solve(prob, opt, callback = callback, maxiters = 1000) +# We can pass the callback function in the solve. Not doing here as the output would be very long. 
+res = Optimization.solve(prob, opt, maxiters = 1000) phi = discretization.phi ``` diff --git a/docs/src/tutorials/systems.md b/docs/src/tutorials/systems.md index d4b277b1f3..80e3fbb62a 100644 --- a/docs/src/tutorials/systems.md +++ b/docs/src/tutorials/systems.md @@ -84,7 +84,7 @@ callback = function (p, l) return false end -res = solve(prob, LBFGS(linesearch = BackTracking()); callback = callback, maxiters = 1000) +res = solve(prob, LBFGS(linesearch = BackTracking()); maxiters = 1000) phi = discretization.phi ``` @@ -151,7 +151,7 @@ end f_ = OptimizationFunction(loss_function, Optimization.AutoZygote()) prob = Optimization.OptimizationProblem(f_, sym_prob.flat_init_params) -res = Optimization.solve(prob, OptimizationOptimJL.LBFGS(linesearch = BackTracking()); callback = callback, maxiters = 1000) +res = Optimization.solve(prob, OptimizationOptimJL.LBFGS(linesearch = BackTracking()); maxiters = 1000) ``` ## Solution Representation @@ -172,14 +172,27 @@ end u_real = [[analytic_sol_func(t, x)[i] for t in ts for x in xs] for i in 1:3] u_predict = [[phi[i]([t, x], minimizers_[i])[1] for t in ts for x in xs] for i in 1:3] diff_u = [abs.(u_real[i] .- u_predict[i]) for i in 1:3] +ps = [] for i in 1:3 p1 = plot(ts, xs, u_real[i], linetype = :contourf, title = "u$i, analytic") p2 = plot(ts, xs, u_predict[i], linetype = :contourf, title = "predict") p3 = plot(ts, xs, diff_u[i], linetype = :contourf, title = "error") - plot(p1, p2, p3) + push!(ps, plot(p1, p2, p3)) end ``` +```@example system +ps[1] +``` + +```@example system +ps[2] +``` + +```@example system +ps[3] +``` + Notice here that the solution is represented in the `OptimizationSolution` with `u` as the parameters for the trained neural network. But, for the case where the neural network is from Lux.jl, it's given as a `ComponentArray` where `res.u.depvar.x` corresponds to the result From ba9784a52cb8ed4b3e69da10e2bbfdd59f08d4f9 Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Wed, 13 Mar 2024 09:48:06 +0000 Subject: [PATCH 11/28] docs: use BFGS with backtracking for constraints tutorial --- docs/src/tutorials/constraints.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/tutorials/constraints.md b/docs/src/tutorials/constraints.md index 491267fac2..59f389b172 100644 --- a/docs/src/tutorials/constraints.md +++ b/docs/src/tutorials/constraints.md @@ -21,7 +21,7 @@ p(-2.2) = p(2.2) = 0 with Physics-Informed Neural Networks. 
```@example fokkerplank -using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL +using NeuralPDE, Lux, ModelingToolkit, Optimization, OptimizationOptimJL, LineSearches using Integrals, Cubature using ModelingToolkit: Interval, infimum, supremum # the example is taken from this article https://arxiv.org/abs/1910.10503 @@ -84,7 +84,7 @@ cb_ = function (p, l) return false end -res = Optimization.solve(prob, BFGS(), maxiters = 600) +res = Optimization.solve(prob, BFGS(linesearch = BackTracking()), callback = cb_, maxiters = 600) ``` And some analysis: From c89c95179b6a59e81aedddea5c4ebaeac252a531 Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Wed, 13 Mar 2024 10:16:08 +0000 Subject: [PATCH 12/28] ci: remove neuraladpater from GHA tests --- .github/workflows/CI.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 59f046cc9f..97b6f7f8f6 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -25,7 +25,6 @@ jobs: - AdaptiveLoss - Logging - Forward - - NeuralAdapter - DGM version: - "1" From b3c454f8e39d721969cd3e2a0607ce64ef104e0a Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Wed, 13 Mar 2024 17:16:44 +0000 Subject: [PATCH 13/28] build(docs): add MOL --- docs/Project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/Project.toml b/docs/Project.toml index bb7441e6b5..481298e001 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -12,6 +12,7 @@ LineSearches = "d3d80556-e9d4-5f37-9878-2ab0fcc64255" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Lux = "b2108857-7c20-44ae-9111-449ecde12c47" LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda" +MethodOfLines = "94925ecb-adb7-4558-8ed8-f975c56a0bf4" ModelingToolkit = "961ee093-0014-501f-94e3-6117800e7a78" MonteCarloMeasurements = "0987c9cc-fe09-11e8-30f0-b96dd679fdca" NeuralPDE = "315f7962-48a3-4962-8226-d0f33b1235f0" @@ -39,6 +40,7 @@ Integrals = "4" LineSearches = "7.2" Lux = "0.5" LuxCUDA = "0.3" +MethodOfLines = "0.10" ModelingToolkit = "8.33" MonteCarloMeasurements = "1" NeuralPDE = "5.3" From 9d3f9f0d02d677660824b5e291e64a022387bc42 Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Thu, 14 Mar 2024 04:47:30 +0000 Subject: [PATCH 14/28] docs: fix dgm example --- docs/src/tutorials/dgm.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/src/tutorials/dgm.md b/docs/src/tutorials/dgm.md index 61d32cbe33..1917df8f43 100644 --- a/docs/src/tutorials/dgm.md +++ b/docs/src/tutorials/dgm.md @@ -7,6 +7,7 @@ Deep Galerkin Method is a meshless deep learning algorithm to solve high dimensi In the following example, we demonstrate computing the loss function using Quasi-Random Sampling, a sampling technique that uses quasi-Monte Carlo sampling to generate low discrepancy random sequences in high dimensional spaces. ### Algorithm + The authors of DGM suggest a network composed of LSTM-type layers that works well for most of the parabolic and quasi-parabolic PDEs. ```math @@ -47,13 +48,15 @@ u(t, 1) & = 0 ``` ### Copy- Pasteable code + ```@example dgm using NeuralPDE using ModelingToolkit, Optimization, OptimizationOptimisers -import Lux: tanh, identity +using Lux: tanh, identity using Distributions -import ModelingToolkit: Interval, infimum, supremum +using ModelingToolkit: Interval, infimum, supremum using MethodOfLines, OrdinaryDiffEq +using Plots @parameters x t @variables u(..) 
@@ -108,8 +111,9 @@ phi = discretization.phi u_predict= [first(phi([t, x], res.minimizer)) for t in ts, x in xs] diff_u = abs.(u_predict .- u_MOL) +tgrid = 0.0:0.01:1.0 +xgrid = -1.0:0.01:1.0 -using Plots p1 = plot(tgrid, xgrid, u_MOL', linetype = :contourf, title = "FD"); p2 = plot(tgrid, xgrid, u_predict', linetype = :contourf, title = "predict"); p3 = plot(tgrid, xgrid, diff_u', linetype = :contourf, title = "error"); From 2daf59b4ee0c767f6e2bd9c4caf6743b2c824718 Mon Sep 17 00:00:00 2001 From: Christopher Rackauckas Date: Thu, 14 Mar 2024 11:41:51 -0400 Subject: [PATCH 15/28] Update Project.toml --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index b3da28942b..6545930693 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "NeuralPDE" uuid = "315f7962-48a3-4962-8226-d0f33b1235f0" authors = ["Chris Rackauckas "] -version = "5.13.0" +version = "5.14.0" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" From 808008df43c4775bd54c6b4fa1bf8601b3e50e08 Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Thu, 14 Mar 2024 06:39:29 +0000 Subject: [PATCH 16/28] ci: add Downgrade CI --- .github/workflows/Downgrade.yml | 60 +++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 .github/workflows/Downgrade.yml diff --git a/.github/workflows/Downgrade.yml b/.github/workflows/Downgrade.yml new file mode 100644 index 0000000000..190d11e1f6 --- /dev/null +++ b/.github/workflows/Downgrade.yml @@ -0,0 +1,60 @@ +name: Downgrade +on: + pull_request: + branches: + - master + paths-ignore: + - 'docs/**' + push: + branches: + - master + paths-ignore: + - 'docs/**' +jobs: + test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + group: + - QA + - ODEBPINN + - PDEBPINN + - NNPDE1 + - NNPDE2 + - AdaptiveLoss + - Logging + - Forward + - DGM + version: + - "1" + steps: + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.version }} + - uses: actions/cache@v4 + env: + cache-name: cache-artifacts + with: + path: ~/.julia/artifacts + key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }} + restore-keys: | + ${{ runner.os }}-test-${{ env.cache-name }}- + ${{ runner.os }}-test- + ${{ runner.os }}- + - uses: julia-actions/julia-downgrade-compat@v1 + with: + skip: LinearAlgebra, Pkg, Random, Test + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 + env: + GROUP: ${{ matrix.group }} + - uses: julia-actions/julia-processcoverage@v1 + with: + directories: src,lib/NeuralPDELogging/src + - uses: codecov/codecov-action@v4 + with: + files: lcov.info + token: ${{ secrets.CODECOV_TOKEN }} + fail_ci_if_error: true From 55017b45b375b79219de950aaf9c105fc12556d6 Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Thu, 14 Mar 2024 06:45:21 +0000 Subject: [PATCH 17/28] build: update compats of packages --- Project.toml | 56 ++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/Project.toml b/Project.toml index 6545930693..d76565ff25 100644 --- a/Project.toml +++ b/Project.toml @@ -40,50 +40,50 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] Adapt = "4" -AdvancedHMC = "0.6" +AdvancedHMC = "0.6.1" Aqua = "0.8" -ArrayInterface = "7" -CUDA = "5.1" -ChainRulesCore = "1" -ComponentArrays = "0.15" +ArrayInterface = "7.7" +CUDA = "5.2" +ChainRulesCore = "1.18" +ComponentArrays = "0.15.8" Cubature = "1.5" -DiffEqBase = "6" 
-DiffEqNoiseProcess = "5.1" -Distributions = "0.25" +DiffEqBase = "6.144" +DiffEqNoiseProcess = "5.20" +Distributions = "0.25.107" DocStringExtensions = "0.9" DomainSets = "0.6, 0.7" -Flux = "0.14" -ForwardDiff = "0.10" -Functors = "0.4" +Flux = "0.14.11" +ForwardDiff = "0.10.36" +Functors = "0.4.4" Integrals = "4" LineSearches = "7.2" LinearAlgebra = "1" LogDensityProblems = "2" -Lux = "0.5" -LuxCUDA = "0.3" +Lux = "0.5.14" +LuxCUDA = "0.3.2" MCMCChains = "6" -ModelingToolkit = "8" -MonteCarloMeasurements = "1" +MethodOfLines = "0.10.7" +ModelingToolkit = "8.75" +MonteCarloMeasurements = "1.1" Optim = "1.7.8" -Optimization = "3" -OptimizationOptimJL = "0.2" -OptimizationOptimisers = "0.2" -OrdinaryDiffEq = "6" +Optimization = "3.22" +OptimizationOptimJL = "0.2.1" +OptimizationOptimisers = "0.2.1" +OrdinaryDiffEq = "6.70" Pkg = "1" QuasiMonteCarlo = "0.3.2" Random = "1" -Reexport = "1.0" -RuntimeGeneratedFunctions = "0.5" +Reexport = "1.2" +RuntimeGeneratedFunctions = "0.5.11" SafeTestsets = "0.1" -SciMLBase = "2" -Statistics = "1" -SymbolicUtils = "1" -Symbolics = "5" +SciMLBase = "2.24" +Statistics = "1.10" +SymbolicUtils = "1.4" +Symbolics = "5.17" Test = "1" UnPack = "1" -Zygote = "0.6" -MethodOfLines = "0.10.7" -julia = "1.6" +Zygote = "0.6.68" +julia = "1.10" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" From a89afeb52ca843da64af64f4d981019b5c73b1de Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Fri, 15 Mar 2024 06:11:00 +0000 Subject: [PATCH 18/28] docs: fix rendering of dgm example --- docs/src/tutorials/dgm.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/src/tutorials/dgm.md b/docs/src/tutorials/dgm.md index 1917df8f43..cb853d0099 100644 --- a/docs/src/tutorials/dgm.md +++ b/docs/src/tutorials/dgm.md @@ -28,15 +28,15 @@ where $\vec{x}$ is the concatenated vector of $(t, x)$ and $L$ is the number of Let's try to solve the following Burger's equation using Deep Galerkin Method for $\alpha = 0.05$ and compare our solution with the finite difference method: -$$ +```math \partial_t u(t, x) + u(t, x) \partial_x u(t, x) - \alpha \partial_{xx} u(t, x) = 0 -$$ +``` defined over -$$ +```math t \in [0, 1], x \in [-1, 1] -$$ +``` with boundary conditions ```math From fe39bcc0ce754721cbb343935b2e2f02860de4e3 Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Fri, 15 Mar 2024 06:11:21 +0000 Subject: [PATCH 19/28] docs: remove example block from warnonly --- docs/make.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index 3b1340c277..dfa8087f3f 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -13,7 +13,7 @@ makedocs(sitename = "NeuralPDE.jl", authors = "#", modules = [NeuralPDE], clean = true, doctest = false, linkcheck = true, - warnonly = [:missing_docs, :example_block], + warnonly = [:missing_docs], format = Documenter.HTML(assets = ["assets/favicon.ico"], canonical = "https://docs.sciml.ai/NeuralPDE/stable/"), pages = pages) From bfb159c19077536324ca720e6a4b84eb7af1ad2c Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Fri, 15 Mar 2024 06:26:14 +0000 Subject: [PATCH 20/28] docs: fix rendering of Bayesian PINN docstrings --- src/BPINN_ode.jl | 34 ++++++++++++++++++---------------- src/advancedHMC_MCMC.jl | 4 ++-- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/BPINN_ode.jl b/src/BPINN_ode.jl index 4493a07326..3bbf1afea8 100644 --- a/src/BPINN_ode.jl +++ b/src/BPINN_ode.jl @@ -123,18 +123,19 @@ function BNNODE(chain, Kernel = HMC; strategy = nothing, draw_samples = 2000, 
end """ -Contains ahmc_bayesian_pinn_ode() function output: -1> a MCMCChains.jl chain object for sampled parameters -2> The set of all sampled parameters -3> statistics like: - > n_steps - > acceptance_rate - > log_density - > hamiltonian_energy - > hamiltonian_energy_error - > numerical_error - > step_size - > nom_step_size +Contains `ahmc_bayesian_pinn_ode()` function output: + +1. A MCMCChains.jl chain object for sampled parameters. +2. The set of all sampled parameters. +3. Statistics like: + - n_steps + - acceptance_rate + - log_density + - hamiltonian_energy + - hamiltonian_energy_error + - numerical_error + - step_size + - nom_step_size """ struct BPINNstats{MC, S, ST} mcmc_chain::MC @@ -143,10 +144,11 @@ struct BPINNstats{MC, S, ST} end """ -BPINN Solution contains the original solution from AdvancedHMC.jl sampling(BPINNstats contains fields related to that) -> ensemblesol is the Probabilistic Estimate(MonteCarloMeasurements.jl Particles type) of Ensemble solution from All Neural Network's(made using all sampled parameters) output's. -> estimated_nn_params - Probabilistic Estimate of NN params from sampled weights,biases -> estimated_de_params - Probabilistic Estimate of DE params from sampled unknown DE parameters +BPINN Solution contains the original solution from AdvancedHMC.jl sampling (BPINNstats contains fields related to that). + +1. `ensemblesol` is the Probabilistic Estimate (MonteCarloMeasurements.jl Particles type) of Ensemble solution from All Neural Network's (made using all sampled parameters) output's. +2. `estimated_nn_params` - Probabilistic Estimate of NN params from sampled weights, biases. +3. `estimated_de_params` - Probabilistic Estimate of DE params from sampled unknown DE parameters. """ struct BPINNsolution{O <: BPINNstats, E, NP, OP, P} original::O diff --git a/src/advancedHMC_MCMC.jl b/src/advancedHMC_MCMC.jl index c051d208ca..6f30149257 100644 --- a/src/advancedHMC_MCMC.jl +++ b/src/advancedHMC_MCMC.jl @@ -344,8 +344,8 @@ end !!! warn - Note that ahmc_bayesian_pinn_ode() only supports ODEs which are written in the out-of-place form, i.e. - `du = f(u,p,t)`, and not `f(du,u,p,t)`. If not declared out-of-place, then the ahmc_bayesian_pinn_ode() + Note that `ahmc_bayesian_pinn_ode()` only supports ODEs which are written in the out-of-place form, i.e. + `du = f(u,p,t)`, and not `f(du,u,p,t)`. If not declared out-of-place, then the `ahmc_bayesian_pinn_ode()` will exit with an error. ## Example From 2a72fa8c58ac8736559cfcda739738bd357b0df1 Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Thu, 21 Mar 2024 06:56:28 +0000 Subject: [PATCH 21/28] refactor: correctly lower quadrature training strategy in NNODE --- src/ode_solve.jl | 35 +++++++++++------------------------ src/training_strategies.jl | 6 +----- 2 files changed, 12 insertions(+), 29 deletions(-) diff --git a/src/ode_solve.jl b/src/ode_solve.jl index f93183d76f..b9c46d3463 100644 --- a/src/ode_solve.jl +++ b/src/ode_solve.jl @@ -27,11 +27,12 @@ of the physics-informed neural network which is used as a solver for a standard the PDE operators. The reverse mode of the loss function is always automatic differentiation (via Zygote), this is only for the derivative in the loss function (the derivative with respect to time). -* `batch`: The batch size to use for the internal quadrature. Defaults to `0`, which +* `batch`: The batch size for the loss computation. Defaults to `false`, which means the application of the neural network is done at individual time points one - at a time. 
`batch>0` means the neural network is applied at a row vector of values + at a time. `true` means the neural network is applied at a row vector of values `t` simultaneously, i.e. it's the batch size for the neural network evaluations. This requires a neural network compatible with batched data. + This is not applicable to `QuadratureTraining` where `batch` is passed in the `strategy` which is the number of points it can parallelly compute the integrand. * `param_estim`: Boolean to indicate whether parameters of the differential equations are learnt along with parameters of the neural network. * `strategy`: The training strategy used to choose the points for the evaluations. Default of `nothing` means that `QuadratureTraining` with QuadGK is used if no @@ -88,7 +89,7 @@ struct NNODE{C, O, P, B, PE, K, AL <: Union{Nothing, Function}, end function NNODE(chain, opt, init_params = nothing; strategy = nothing, - autodiff = false, batch = nothing, param_estim = false, additional_loss = nothing, kwargs...) + autodiff = false, batch = false, param_estim = false, additional_loss = nothing, kwargs...) !(chain isa Lux.AbstractExplicitLayer) && (chain = Lux.transform(chain)) NNODE(chain, opt, init_params, autodiff, batch, strategy, param_estim, additional_loss, kwargs) end @@ -111,11 +112,7 @@ end function generate_phi_θ(chain::Lux.AbstractExplicitLayer, t, u0, init_params) θ, st = Lux.setup(Random.default_rng(), chain) - if init_params === nothing - init_params = ComponentArrays.ComponentArray(θ) - else - init_params = ComponentArrays.ComponentArray(init_params) - end + isnothing(init_params) && (init_params = θ) ODEPhi(chain, t, u0, st), init_params end @@ -182,7 +179,7 @@ function ode_dfdx(phi::ODEPhi, t::AbstractVector, θ, autodiff::Bool) end """ - inner_loss(phi, f, autodiff, t, θ, p) + inner_loss(phi, f, autodiff, t, θ, p, param_estim) Simple L2 inner loss at a time `t` with parameters `θ` of the neural network. """ @@ -220,7 +217,7 @@ function inner_loss(phi::ODEPhi{C, T, U}, f, autodiff::Bool, t::AbstractVector, end """ - generate_loss(strategy, phi, f, autodiff, tspan, p, batch) + generate_loss(strategy, phi, f, autodiff, tspan, p, batch, param_estim) Representation of the loss function, parametric on the training strategy `strategy`. 
""" @@ -229,14 +226,13 @@ function generate_loss(strategy::QuadratureTraining, phi, f, autodiff::Bool, tsp integrand(t::Number, θ) = abs2(inner_loss(phi, f, autodiff, t, θ, p, param_estim)) integrand(ts, θ) = [abs2(inner_loss(phi, f, autodiff, t, θ, p, param_estim)) for t in ts] - @assert batch == 0 # not implemented function loss(θ, _) - intprob = IntegralProblem(integrand, (tspan[1], tspan[2]), θ) - sol = solve(intprob, QuadGKJL(); abstol = strategy.abstol, reltol = strategy.reltol) + intf = BatchIntegralFunction(integrand, max_batch = strategy.batch) + intprob = IntegralProblem(intf, (tspan[1], tspan[2]), θ) + sol = solve(intprob, strategy.quadrature_alg; abstol = strategy.abstol, reltol = strategy.reltol, maxiters = strategy.maxiters) sol.u end - return loss end @@ -395,16 +391,7 @@ function DiffEqBase.__solve(prob::DiffEqBase.AbstractODEProblem, alg.strategy end - batch = if alg.batch === nothing - if strategy isa QuadratureTraining - strategy.batch - else - true - end - else - alg.batch - end - + batch = alg.batch inner_f = generate_loss(strategy, phi, f, autodiff, tspan, p, batch, param_estim) additional_loss = alg.additional_loss (param_estim && isnothing(additional_loss)) && throw(ArgumentError("Please provide `additional_loss` in `NNODE` for parameter estimation (`param_estim` is true).")) diff --git a/src/training_strategies.jl b/src/training_strategies.jl index 1db8780941..c997e6c4cc 100644 --- a/src/training_strategies.jl +++ b/src/training_strategies.jl @@ -272,7 +272,7 @@ struct QuadratureTraining{Q <: SciMLBase.AbstractIntegralAlgorithm, T} <: batch::Int64 end -function QuadratureTraining(; quadrature_alg = CubatureJLh(), reltol = 1e-6, abstol = 1e-3, +function QuadratureTraining(; quadrature_alg = CubatureJLh(), reltol = 1e-3, abstol = 1e-6, maxiters = 1_000, batch = 100) QuadratureTraining(quadrature_alg, reltol, abstol, maxiters, batch) end @@ -306,11 +306,7 @@ function get_loss_function(loss_function, lb, ub, eltypeθ, strategy::Quadrature end area = eltypeθ(prod(abs.(ub .- lb))) f_ = (lb, ub, loss_, θ) -> begin - # last_x = 1 function integrand(x, θ) - # last_x = x - # mean(abs2,loss_(x,θ), dims=2) - # size_x = fill(size(x)[2],(1,1)) x = adapt(parameterless_type(ComponentArrays.getdata(θ)), x) sum(abs2, view(loss_(x, θ), 1, :), dims = 2) #./ size_x end From a80cd034e68e3482ff525c82cf1303023cacda59 Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Thu, 21 Mar 2024 08:36:15 +0000 Subject: [PATCH 22/28] test: remove tests for assertion error with batch to be true for QuadratureTraining --- test/NNODE_tests.jl | 7 ------- 1 file changed, 7 deletions(-) diff --git a/test/NNODE_tests.jl b/test/NNODE_tests.jl index 122adcceb3..b3731f059d 100644 --- a/test/NNODE_tests.jl +++ b/test/NNODE_tests.jl @@ -69,9 +69,6 @@ end sol = solve(prob, NNODE(luxchain, opt), verbose = true, maxiters = 400) @test sol.errors[:l2] < 0.5 - @test_throws AssertionError solve(prob, NNODE(luxchain, opt; batch = true), verbose = true, - maxiters = 400) - sol = solve(prob, NNODE(luxchain, opt; batch = false, strategy = StochasticTraining(100)), @@ -105,10 +102,6 @@ end abstol = 1.0f-8) @test sol.errors[:l2] < 0.5 - @test_throws AssertionError solve(prob, NNODE(luxchain, opt; batch = true), verbose = true, - maxiters = 400, - abstol = 1.0f-8) - sol = solve(prob, NNODE(luxchain, opt; batch = false, strategy = StochasticTraining(100)), From 118102989f25117c6aa919480578526d5422db91 Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Thu, 21 Mar 2024 12:19:04 +0000 Subject: [PATCH 23/28] test: make 
verbose = false for NNODE tests --- test/NNODE_tests.jl | 61 +++++++++++++++++++++++---------------- test/NNODE_tstops_test.jl | 21 ++++++++++---- 2 files changed, 51 insertions(+), 31 deletions(-) diff --git a/test/NNODE_tests.jl b/test/NNODE_tests.jl index b3731f059d..8475737e05 100644 --- a/test/NNODE_tests.jl +++ b/test/NNODE_tests.jl @@ -9,6 +9,7 @@ Random.seed!(100) @testset "Scalar" begin # Run a solve on scalars + println("Scalar") linear = (u, p, t) -> cos(2pi * t) tspan = (0.0f0, 1.0f0) u0 = 0.0f0 @@ -16,26 +17,27 @@ Random.seed!(100) luxchain = Lux.Chain(Lux.Dense(1, 5, Lux.σ), Lux.Dense(5, 1)) opt = OptimizationOptimisers.Adam(0.1, (0.9, 0.95)) - sol = solve(prob, NNODE(luxchain, opt), dt = 1 / 20.0f0, verbose = true, + sol = solve(prob, NNODE(luxchain, opt), dt = 1 / 20.0f0, verbose = false, abstol = 1.0f-10, maxiters = 200) @test_throws ArgumentError solve(prob, NNODE(luxchain, opt; autodiff = true), dt = 1 / 20.0f0, - verbose = true, abstol = 1.0f-10, maxiters = 200) + verbose = false, abstol = 1.0f-10, maxiters = 200) - sol = solve(prob, NNODE(luxchain, opt), verbose = true, + sol = solve(prob, NNODE(luxchain, opt), verbose = false, abstol = 1.0f-6, maxiters = 200) opt = OptimizationOptimJL.BFGS() - sol = solve(prob, NNODE(luxchain, opt), dt = 1 / 20.0f0, verbose = true, + sol = solve(prob, NNODE(luxchain, opt), dt = 1 / 20.0f0, verbose = false, abstol = 1.0f-10, maxiters = 200) - sol = solve(prob, NNODE(luxchain, opt), verbose = true, + sol = solve(prob, NNODE(luxchain, opt), verbose = false, abstol = 1.0f-6, maxiters = 200) end @testset "Vector" begin # Run a solve on vectors + println("Vector") linear = (u, p, t) -> [cos(2pi * t)] tspan = (0.0f0, 1.0f0) u0 = [0.0f0] @@ -44,14 +46,14 @@ end opt = OptimizationOptimJL.BFGS() sol = solve(prob, NNODE(luxchain, opt), dt = 1 / 20.0f0, abstol = 1e-10, - verbose = true, maxiters = 200) + verbose = false, maxiters = 200) @test_throws ArgumentError solve(prob, NNODE(luxchain, opt; autodiff = true), dt = 1 / 20.0f0, - abstol = 1e-10, verbose = true, maxiters = 200) + abstol = 1e-10, verbose = false, maxiters = 200) sol = solve(prob, NNODE(luxchain, opt), abstol = 1.0f-6, - verbose = true, maxiters = 200) + verbose = false, maxiters = 200) @test sol(0.5) isa Vector @test sol(0.5; idxs = 1) isa Number @@ -59,6 +61,7 @@ end end @testset "Example 1" begin + println("Example 1") linear = (u, p, t) -> @. 
t^3 + 2 * t + (t^2) * ((1 + 3 * (t^2)) / (1 + t + (t^3))) - u * (t + ((1 + 3 * (t^2)) / (1 + t + t^3))) linear_analytic = (u0, p, t) -> [exp(-(t^2) / 2) / (1 + t + t^3) + t^2] @@ -66,68 +69,70 @@ end luxchain = Lux.Chain(Lux.Dense(1, 128, Lux.σ), Lux.Dense(128, 1)) opt = OptimizationOptimisers.Adam(0.01) - sol = solve(prob, NNODE(luxchain, opt), verbose = true, maxiters = 400) + sol = solve(prob, NNODE(luxchain, opt), verbose = false, maxiters = 400) @test sol.errors[:l2] < 0.5 sol = solve(prob, NNODE(luxchain, opt; batch = false, strategy = StochasticTraining(100)), - verbose = true, maxiters = 400) + verbose = false, maxiters = 400) @test sol.errors[:l2] < 0.5 sol = solve(prob, NNODE(luxchain, opt; batch = true, strategy = StochasticTraining(100)), - verbose = true, maxiters = 400) + verbose = false, maxiters = 400) @test sol.errors[:l2] < 0.5 - sol = solve(prob, NNODE(luxchain, opt; batch = false), verbose = true, + sol = solve(prob, NNODE(luxchain, opt; batch = false), verbose = false, maxiters = 400, dt = 1 / 5.0f0) @test sol.errors[:l2] < 0.5 - sol = solve(prob, NNODE(luxchain, opt; batch = true), verbose = true, + sol = solve(prob, NNODE(luxchain, opt; batch = true), verbose = false, maxiters = 400, dt = 1 / 5.0f0) @test sol.errors[:l2] < 0.5 end @testset "Example 2" begin + println("Example 2") linear = (u, p, t) -> -u / 5 + exp(-t / 5) .* cos(t) linear_analytic = (u0, p, t) -> exp(-t / 5) * (u0 + sin(t)) prob = ODEProblem(ODEFunction(linear, analytic = linear_analytic), 0.0f0, (0.0f0, 1.0f0)) luxchain = Lux.Chain(Lux.Dense(1, 5, Lux.σ), Lux.Dense(5, 1)) opt = OptimizationOptimisers.Adam(0.1) - sol = solve(prob, NNODE(luxchain, opt), verbose = true, maxiters = 400, + sol = solve(prob, NNODE(luxchain, opt), verbose = false, maxiters = 400, abstol = 1.0f-8) @test sol.errors[:l2] < 0.5 sol = solve(prob, NNODE(luxchain, opt; batch = false, strategy = StochasticTraining(100)), - verbose = true, maxiters = 400, + verbose = false, maxiters = 400, abstol = 1.0f-8) @test sol.errors[:l2] < 0.5 sol = solve(prob, NNODE(luxchain, opt; batch = true, strategy = StochasticTraining(100)), - verbose = true, maxiters = 400, + verbose = false, maxiters = 400, abstol = 1.0f-8) @test sol.errors[:l2] < 0.5 - sol = solve(prob, NNODE(luxchain, opt; batch = false), verbose = true, + sol = solve(prob, NNODE(luxchain, opt; batch = false), verbose = false, maxiters = 400, abstol = 1.0f-8, dt = 1 / 5.0f0) @test sol.errors[:l2] < 0.5 - sol = solve(prob, NNODE(luxchain, opt; batch = true), verbose = true, + sol = solve(prob, NNODE(luxchain, opt; batch = true), verbose = false, maxiters = 400, abstol = 1.0f-8, dt = 1 / 5.0f0) @test sol.errors[:l2] < 0.5 end @testset "Example 3" begin + println("Example 3") linear = (u, p, t) -> [cos(2pi * t), sin(2pi * t)] tspan = (0.0f0, 1.0f0) u0 = [0.0f0, -1.0f0 / 2pi] @@ -139,13 +144,14 @@ end alg = NNODE(luxchain, opt; autodiff = false) sol = solve(prob, - alg, verbose = true, dt = 1 / 40.0f0, + alg, verbose = false, dt = 1 / 40.0f0, maxiters = 2000, abstol = 1.0f-7) @test sol.errors[:l2] < 0.5 end @testset "Training Strategies" begin @testset "WeightedIntervalTraining" begin + println("WeightedIntervalTraining") function f(u, p, t) [p[1] * u[1] - p[2] * u[1] * u[2], -p[3] * u[2] + p[4] * u[1] * u[2]] end @@ -162,7 +168,7 @@ end points = 200 alg = NNODE(chain, opt, autodiff = false, strategy = NeuralPDE.WeightedIntervalTraining(weights, points)) - sol = solve(prob_oop, alg, verbose = true, maxiters = 100000, saveat = 0.01) + sol = solve(prob_oop, alg, verbose = false, 
maxiters = 5000, saveat = 0.01) @test abs(mean(sol) - mean(true_sol)) < 0.2 end @@ -176,6 +182,7 @@ end u_analytical(x) = (1 / (2pi)) .* sin.(2pi .* x) @testset "GridTraining" begin + println("GridTraining") luxchain = Lux.Chain(Lux.Dense(1, 5, Lux.σ), Lux.Dense(5, 1)) (u_, t_) = (u_analytical(ts), ts) function additional_loss(phi, θ) @@ -183,22 +190,24 @@ end end alg1 = NNODE(luxchain, opt, strategy = GridTraining(0.01), additional_loss = additional_loss) - sol1 = solve(prob, alg1, verbose = true, abstol = 1e-8, maxiters = 500) + sol1 = solve(prob, alg1, verbose = false, abstol = 1e-8, maxiters = 500) @test sol1.errors[:l2] < 0.5 end @testset "QuadratureTraining" begin + println("QuadratureTraining") luxchain = Lux.Chain(Lux.Dense(1, 5, Lux.σ), Lux.Dense(5, 1)) (u_, t_) = (u_analytical(ts), ts) function additional_loss(phi, θ) return sum(sum(abs2, [phi(t, θ) for t in t_] .- u_)) / length(u_) end alg1 = NNODE(luxchain, opt, additional_loss = additional_loss) - sol1 = solve(prob, alg1, verbose = true, abstol = 1e-10, maxiters = 200) + sol1 = solve(prob, alg1, verbose = false, abstol = 1e-10, maxiters = 200) @test sol1.errors[:l2] < 0.5 end @testset "StochasticTraining" begin + println("StochasticTraining") luxchain = Lux.Chain(Lux.Dense(1, 5, Lux.σ), Lux.Dense(5, 1)) (u_, t_) = (u_analytical(ts), ts) function additional_loss(phi, θ) @@ -206,12 +215,13 @@ end end alg1 = NNODE(luxchain, opt, strategy = StochasticTraining(1000), additional_loss = additional_loss) - sol1 = solve(prob, alg1, verbose = true, abstol = 1e-8, maxiters = 500) + sol1 = solve(prob, alg1, verbose = false, abstol = 1e-8, maxiters = 500) @test sol1.errors[:l2] < 0.5 end end @testset "Parameter Estimation" begin + println("Parameter Estimation") function lorenz(u, p, t) return [p[1]*(u[2]-u[1]), u[1]*(p[2]-u[3])-u[2], @@ -235,12 +245,13 @@ end ) opt = OptimizationOptimJL.LBFGS(linesearch = BackTracking()) alg = NNODE(luxchain, opt, strategy = GridTraining(0.01), param_estim = true, additional_loss = additional_loss) - sol = solve(prob, alg, verbose = true, abstol = 1e-8, maxiters = 5000, saveat = t_) + sol = solve(prob, alg, verbose = false, abstol = 1e-8, maxiters = 5000, saveat = t_) @test sol.k.u.p≈true_p atol=1e-2 @test reduce(hcat, sol.u)≈u_ atol=1e-2 end @testset "Translating from Flux" begin + println("Translating from Flux") linear = (u, p, t) -> cos(2pi * t) linear_analytic = (u, p, t) -> (1 / (2pi)) * sin(2pi * t) tspan = (0.0, 1.0) @@ -252,6 +263,6 @@ end fluxchain = Flux.Chain(Flux.Dense(1, 5, Flux.σ), Flux.Dense(5, 1)) alg1 = NNODE(fluxchain, opt) @test alg1.chain isa Lux.AbstractExplicitLayer - sol1 = solve(prob, alg1, verbose = true, abstol = 1e-10, maxiters = 200) + sol1 = solve(prob, alg1, verbose = false, abstol = 1e-10, maxiters = 200) @test sol1.errors[:l2] < 0.5 end diff --git a/test/NNODE_tstops_test.jl b/test/NNODE_tstops_test.jl index c0f8422a09..bc4a4b08d6 100644 --- a/test/NNODE_tstops_test.jl +++ b/test/NNODE_tstops_test.jl @@ -31,46 +31,55 @@ points = 3 dx = 1.0 @testset "GridTraining" begin + println("GridTraining") @testset "Without added points" begin + println("Without added points") # (difference between solutions should be high) alg = NNODE(chain, opt, autodiff = false, strategy = GridTraining(dx)) - sol = solve(prob_oop, alg, verbose=true, maxiters = maxiters, saveat = saveat) + sol = solve(prob_oop, alg, verbose = false, maxiters = maxiters, saveat = saveat) @test abs(mean(sol) - mean(true_sol)) > threshold end @testset "With added points" begin + println("With added points") # 
(difference between solutions should be low) alg = NNODE(chain, opt, autodiff = false, strategy = GridTraining(dx)) - sol = solve(prob_oop, alg, verbose=true, maxiters = maxiters, saveat = saveat, tstops = addedPoints) + sol = solve(prob_oop, alg, verbose = false, maxiters = maxiters, saveat = saveat, tstops = addedPoints) @test abs(mean(sol) - mean(true_sol)) < threshold end end @testset "WeightedIntervalTraining" begin + println("WeightedIntervalTraining") @testset "Without added points" begin + println("Without added points") # (difference between solutions should be high) alg = NNODE(chain, opt, autodiff = false, strategy = WeightedIntervalTraining(weights, points)) - sol = solve(prob_oop, alg, verbose=true, maxiters = maxiters, saveat = saveat) + sol = solve(prob_oop, alg, verbose = false, maxiters = maxiters, saveat = saveat) @test abs(mean(sol) - mean(true_sol)) > threshold end @testset "With added points" begin + println("With added points") # (difference between solutions should be low) alg = NNODE(chain, opt, autodiff = false, strategy = WeightedIntervalTraining(weights, points)) - sol = solve(prob_oop, alg, verbose=true, maxiters = maxiters, saveat = saveat, tstops = addedPoints) + sol = solve(prob_oop, alg, verbose = false, maxiters = maxiters, saveat = saveat, tstops = addedPoints) @test abs(mean(sol) - mean(true_sol)) < threshold end end @testset "StochasticTraining" begin + println("StochasticTraining") @testset "Without added points" begin + println("Without added points") # (difference between solutions should be high) alg = NNODE(chain, opt, autodiff = false, strategy = StochasticTraining(points)) - sol = solve(prob_oop, alg, verbose=true, maxiters = maxiters, saveat = saveat) + sol = solve(prob_oop, alg, verbose = false, maxiters = maxiters, saveat = saveat) @test abs(mean(sol) - mean(true_sol)) > threshold end @testset "With added points" begin + println("With added points") # (difference between solutions should be low) alg = NNODE(chain, opt, autodiff = false, strategy = StochasticTraining(points)) - sol = solve(prob_oop, alg, verbose=true, maxiters = maxiters, saveat = saveat, tstops = addedPoints) + sol = solve(prob_oop, alg, verbose = false, maxiters = maxiters, saveat = saveat, tstops = addedPoints) @test abs(mean(sol) - mean(true_sol)) < threshold end end From c74384caa5fb7a5e360d5c3a98b85a6fa469328c Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Fri, 22 Mar 2024 07:11:24 +0000 Subject: [PATCH 24/28] test: use BFGS instead of LBFGS for parameter estimation in NNODE --- test/NNODE_tests.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/NNODE_tests.jl b/test/NNODE_tests.jl index 8475737e05..1e2ba3c055 100644 --- a/test/NNODE_tests.jl +++ b/test/NNODE_tests.jl @@ -243,9 +243,9 @@ end Lux.Dense(n, n, Lux.σ), Lux.Dense(n, 3) ) - opt = OptimizationOptimJL.LBFGS(linesearch = BackTracking()) + opt = OptimizationOptimJL.BFGS(linesearch = BackTracking()) alg = NNODE(luxchain, opt, strategy = GridTraining(0.01), param_estim = true, additional_loss = additional_loss) - sol = solve(prob, alg, verbose = false, abstol = 1e-8, maxiters = 5000, saveat = t_) + sol = solve(prob, alg, verbose = false, abstol = 1e-8, maxiters = 1000, saveat = t_) @test sol.k.u.p≈true_p atol=1e-2 @test reduce(hcat, sol.u)≈u_ atol=1e-2 end From 4ab36749592e9b32c79d89940ff1457f0880b94f Mon Sep 17 00:00:00 2001 From: Sathvik Bhagavan Date: Fri, 22 Mar 2024 09:10:30 +0000 Subject: [PATCH 25/28] refactor: use FromFluxAdaptor for converting Flux to Lux as Lux.transform is 
 deprecated

---
 src/BPINN_ode.jl        | 2 +-
 src/NeuralPDE.jl        | 1 +
 src/advancedHMC_MCMC.jl | 2 +-
 src/dae_solve.jl        | 2 +-
 src/ode_solve.jl        | 4 ++--
 src/pinn_types.jl       | 6 +++---
 6 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/BPINN_ode.jl b/src/BPINN_ode.jl
index 3bbf1afea8..087b9d41cc 100644
--- a/src/BPINN_ode.jl
+++ b/src/BPINN_ode.jl
@@ -113,7 +113,7 @@ function BNNODE(chain, Kernel = HMC; strategy = nothing, draw_samples = 2000,
             targetacceptancerate = 0.8),
         Integratorkwargs = (Integrator = Leapfrog,),
         autodiff = false, progress = false, verbose = false)
-    !(chain isa Lux.AbstractExplicitLayer) && (chain = Lux.transform(chain))
+    !(chain isa Lux.AbstractExplicitLayer) && (chain = adapt(FromFluxAdaptor(false, false), chain))
     BNNODE(chain, Kernel, strategy,
         draw_samples, priorsNNw, param, l2std,
         phystd, dataset, physdt, MCMCkwargs,
diff --git a/src/NeuralPDE.jl b/src/NeuralPDE.jl
index d367bf8b6c..2ba1de25b2 100644
--- a/src/NeuralPDE.jl
+++ b/src/NeuralPDE.jl
@@ -30,6 +30,7 @@ using DomainSets: Domain, ClosedInterval, AbstractInterval, leftendpoint, righte
 using SciMLBase: @add_kwonly, parameterless_type
 using UnPack: @unpack
 import ChainRulesCore, Lux, ComponentArrays
+using Lux: FromFluxAdaptor
 using ChainRulesCore: @non_differentiable

 RuntimeGeneratedFunctions.init(@__MODULE__)
diff --git a/src/advancedHMC_MCMC.jl b/src/advancedHMC_MCMC.jl
index 6f30149257..c86c87599c 100644
--- a/src/advancedHMC_MCMC.jl
+++ b/src/advancedHMC_MCMC.jl
@@ -439,7 +439,7 @@ function ahmc_bayesian_pinn_ode(prob::DiffEqBase.ODEProblem, chain;
         MCMCkwargs = (n_leapfrog = 30,),
         progress = false, verbose = false)
-    !(chain isa Lux.AbstractExplicitLayer) && (chain = Lux.transform(chain))
+    !(chain isa Lux.AbstractExplicitLayer) && (chain = adapt(FromFluxAdaptor(false, false), chain))
     # NN parameter prior mean and variance(PriorsNN must be a tuple)
     if isinplace(prob)
         throw(error("The BPINN ODE solver only supports out-of-place ODE definitions, i.e. du=f(u,p,t)."))
diff --git a/src/dae_solve.jl b/src/dae_solve.jl
index 3f6bf8f0fb..0c9d1323de 100644
--- a/src/dae_solve.jl
+++ b/src/dae_solve.jl
@@ -42,7 +42,7 @@ end
 function NNDAE(chain, opt, init_params = nothing; strategy = nothing,
         autodiff = false, kwargs...)
-    !(chain isa Lux.AbstractExplicitLayer) && (chain = Lux.transform(chain))
+    !(chain isa Lux.AbstractExplicitLayer) && (chain = adapt(FromFluxAdaptor(false, false), chain))
     NNDAE(chain, opt, init_params, autodiff, strategy, kwargs)
 end
diff --git a/src/ode_solve.jl b/src/ode_solve.jl
index b9c46d3463..ef57beabe8 100644
--- a/src/ode_solve.jl
+++ b/src/ode_solve.jl
@@ -15,7 +15,7 @@ of the physics-informed neural network which is used as a solver for a standard
 ## Positional Arguments

 * `chain`: A neural network architecture, defined as a `Lux.AbstractExplicitLayer` or `Flux.Chain`.
-  `Flux.Chain` will be converted to `Lux` using `Lux.transform`.
+  `Flux.Chain` will be converted to `Lux` using `adapt(FromFluxAdaptor(false, false), chain)`.
 * `opt`: The optimizer to train the neural network.
 * `init_params`: The initial parameter of the neural network. By default, this is `nothing`
   which thus uses the random initialization provided by the neural network library.
@@ -90,7 +90,7 @@ end
 function NNODE(chain, opt, init_params = nothing; strategy = nothing,
     autodiff = false, batch = false, param_estim = false, additional_loss = nothing, kwargs...)
-    !(chain isa Lux.AbstractExplicitLayer) && (chain = Lux.transform(chain))
+    !(chain isa Lux.AbstractExplicitLayer) && (chain = adapt(FromFluxAdaptor(false, false), chain))
     NNODE(chain, opt, init_params, autodiff, batch, strategy, param_estim, additional_loss, kwargs)
 end
diff --git a/src/pinn_types.jl b/src/pinn_types.jl
index 69116c4da3..50f7649dc6 100644
--- a/src/pinn_types.jl
+++ b/src/pinn_types.jl
@@ -48,7 +48,7 @@ methodology.
 * `chain`: a vector of Lux/Flux chains with a d-dimensional input and a 1-dimensional output
   corresponding to each of the dependent variables. Note that this specification respects the
   order of the dependent variables as specified in the PDESystem.
-  Flux chains will be converted to Lux internally using `Lux.transform`.
+  Flux chains will be converted to Lux internally using `adapt(FromFluxAdaptor(false, false), chain)`.
 * `strategy`: determines which training strategy will be used. See the Training Strategy
   documentation for more details.
@@ -107,7 +107,7 @@ struct PhysicsInformedNN{T, P, PH, DER, PE, AL, ADA, LOG, K} <: AbstractPINN
         if multioutput
             !all(i -> i isa Lux.AbstractExplicitLayer, chain) && (chain = Lux.transform.(chain))
         else
-            !(chain isa Lux.AbstractExplicitLayer) && (chain = Lux.transform(chain))
+            !(chain isa Lux.AbstractExplicitLayer) && (chain = adapt(FromFluxAdaptor(false, false), chain))
         end
         if phi === nothing
             if multioutput
@@ -243,7 +243,7 @@ struct BayesianPINN{T, P, PH, DER, PE, AL, ADA, LOG, D, K} <: AbstractPINN
         if multioutput
             !all(i -> i isa Lux.AbstractExplicitLayer, chain) && (chain = Lux.transform.(chain))
         else
-            !(chain isa Lux.AbstractExplicitLayer) && (chain = Lux.transform(chain))
+            !(chain isa Lux.AbstractExplicitLayer) && (chain = adapt(FromFluxAdaptor(false, false), chain))
         end
         if phi === nothing
             if multioutput

From 6c94adff6203504f8d7d5c63724ae31645cda660 Mon Sep 17 00:00:00 2001
From: Sathvik Bhagavan
Date: Fri, 22 Mar 2024 09:28:12 +0000
Subject: [PATCH 26/28] docs: adjust tolerance in QuadratureTraining in neural adapter example

---
 docs/src/tutorials/neural_adapter.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/src/tutorials/neural_adapter.md b/docs/src/tutorials/neural_adapter.md
index a56e30a269..93f0dd036f 100644
--- a/docs/src/tutorials/neural_adapter.md
+++ b/docs/src/tutorials/neural_adapter.md
@@ -69,7 +69,7 @@ function loss(cord, θ)
     ch2 .- phi(cord, res.u)
 end

-strategy = NeuralPDE.QuadratureTraining()
+strategy = NeuralPDE.QuadratureTraining(; reltol = 1e-6)
 prob_ = NeuralPDE.neural_adapter(loss, init_params2, pde_system, strategy)
 res_ = Optimization.solve(prob_, OptimizationOptimisers.Adam(5e-3); maxiters = 10000)
@@ -173,7 +173,7 @@ for i in 1:count_decomp
     bcs_ = create_bcs(domains_[1].domain, phi_bound)
     @named pde_system_ = PDESystem(eq, bcs_, domains_, [x, y], [u(x, y)])
     push!(pde_system_map, pde_system_)
-    strategy = NeuralPDE.QuadratureTraining()
+    strategy = NeuralPDE.QuadratureTraining(; reltol = 1e-6)
     discretization = NeuralPDE.PhysicsInformedNN(chains[i], strategy;
         init_params = init_params[i])
@@ -243,10 +243,10 @@ callback = function (p, l)
 end

 prob_ = NeuralPDE.neural_adapter(losses, init_params2, pde_system_map,
-    NeuralPDE.QuadratureTraining())
+    NeuralPDE.QuadratureTraining(; reltol = 1e-6))
 res_ = Optimization.solve(prob_, OptimizationOptimisers.Adam(5e-3); maxiters = 5000)
 prob_ = NeuralPDE.neural_adapter(losses, res_.u, pde_system_map,
-    NeuralPDE.QuadratureTraining())
+    NeuralPDE.QuadratureTraining(; reltol = 1e-6))
 res_ = Optimization.solve(prob_,
     OptimizationOptimisers.Adam(5e-3); maxiters = 5000)
 phi_ = PhysicsInformedNN(chain2, strategy; init_params = res_.u).phi

From 7f3cc74adfa811f1ff9a1d30bd816d92b9ab732c Mon Sep 17 00:00:00 2001
From: Sathvik Bhagavan
Date: Fri, 22 Mar 2024 10:30:19 +0000
Subject: [PATCH 27/28] build: bump lower bound of Lux

---
 Project.toml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Project.toml b/Project.toml
index d76565ff25..45468985c5 100644
--- a/Project.toml
+++ b/Project.toml
@@ -44,7 +44,7 @@ AdvancedHMC = "0.6.1"
 Aqua = "0.8"
 ArrayInterface = "7.7"
 CUDA = "5.2"
-ChainRulesCore = "1.18"
+ChainRulesCore = "1.21"
 ComponentArrays = "0.15.8"
 Cubature = "1.5"
 DiffEqBase = "6.144"
@@ -59,7 +59,7 @@ Integrals = "4"
 LineSearches = "7.2"
 LinearAlgebra = "1"
 LogDensityProblems = "2"
-Lux = "0.5.14"
+Lux = "0.5.22"
 LuxCUDA = "0.3.2"
 MCMCChains = "6"
 MethodOfLines = "0.10.7"
@@ -82,7 +82,7 @@ SymbolicUtils = "1.4"
 Symbolics = "5.17"
 Test = "1"
 UnPack = "1"
-Zygote = "0.6.68"
+Zygote = "0.6.69"
 julia = "1.10"

 [extras]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 LineSearches = "d3d80556-e9d4-5f37-9878-2ab0fcc64255"
 LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
 MethodOfLines = "94925ecb-adb7-4558-8ed8-f975c56a0bf4"
 OptimizationOptimJL = "36348300-93cb-4f02-beb5-3c3902f8871e"
 OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-MethodOfLines = "94925ecb-adb7-4558-8ed8-f975c56a0bf4"

 [targets]
 test = ["Aqua", "Test", "CUDA", "SafeTestsets", "OptimizationOptimJL", "Pkg", "OrdinaryDiffEq", "LineSearches", "LuxCUDA", "Flux", "MethodOfLines"]

From 7e3de9879de54292ac4a0ec8f41375104303120b Mon Sep 17 00:00:00 2001
From: Sathvik Bhagavan
Date: Fri, 22 Mar 2024 11:15:10 +0000
Subject: [PATCH 28/28] refactor: make batch to be true by default for NNODE

---
 src/ode_solve.jl | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/ode_solve.jl b/src/ode_solve.jl
index ef57beabe8..8af6a708d9 100644
--- a/src/ode_solve.jl
+++ b/src/ode_solve.jl
@@ -27,11 +27,9 @@ of the physics-informed neural network which is used as a solver for a standard
   the PDE operators. The reverse mode of the loss function is always
   automatic differentiation (via Zygote), this is only for the derivative
   in the loss function (the derivative with respect to time).
-* `batch`: The batch size for the loss computation. Defaults to `false`, which
-  means the application of the neural network is done at individual time points one
-  at a time. `true` means the neural network is applied at a row vector of values
-  `t` simultaneously, i.e. it's the batch size for the neural network evaluations.
-  This requires a neural network compatible with batched data.
+* `batch`: The batch size for the loss computation. Defaults to `true`, which means the neural network is applied at a row vector of values
+  `t` simultaneously, i.e. it's the batch size for the neural network evaluations. This requires a neural network compatible with batched data.
+  `false` means the application of the neural network is done at individual time points, one at a time. This is not applicable to `QuadratureTraining`, where `batch` is passed inside the `strategy` and sets the number of points at which the integrand is evaluated in parallel.
 * `param_estim`: Boolean to indicate whether parameters of the differential equations are learnt along with parameters of the neural network.
 * `strategy`: The training strategy used to choose the points for the evaluations.
@@ -89,7 +87,7 @@ struct NNODE{C, O, P, B, PE, K, AL <: Union{Nothing, Function},
 end

 function NNODE(chain, opt, init_params = nothing; strategy = nothing,
-    autodiff = false, batch = false, param_estim = false, additional_loss = nothing, kwargs...)
+    autodiff = false, batch = true, param_estim = false, additional_loss = nothing, kwargs...)
     !(chain isa Lux.AbstractExplicitLayer) && (chain = adapt(FromFluxAdaptor(false, false), chain))
     NNODE(chain, opt, init_params, autodiff, batch, strategy, param_estim, additional_loss, kwargs)
 end
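A minimal usage sketch (not part of the diffs above) ties the last two patches together: after PATCH 25/28 a `Flux.Chain` handed to `NNODE` is converted to Lux internally via `FromFluxAdaptor`, and after PATCH 28/28 `batch` defaults to `true`. The ODE, network width, optimizer, and iteration budget below are illustrative assumptions, not values taken from the patches.

```julia
using NeuralPDE, Flux, OptimizationOptimisers, OrdinaryDiffEq

# Illustrative scalar ODE: du/dt = cos(2πt), u(0) = 0, defined out-of-place.
f = (u, p, t) -> cos(2pi * t)
prob = ODEProblem(f, 0.0f0, (0.0f0, 1.0f0))

# A Flux chain; NNODE converts it to a Lux layer internally with
# adapt(FromFluxAdaptor(false, false), chain) now that Lux.transform is deprecated.
chain = Flux.Chain(Flux.Dense(1, 16, Flux.σ), Flux.Dense(16, 1))

# batch is now true by default, so the network is evaluated on a row vector of
# time points at once rather than one time point at a time.
alg = NNODE(chain, OptimizationOptimisers.Adam(0.01))
sol = solve(prob, alg, verbose = false, maxiters = 2000, saveat = 0.01)
```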