From 9770ceb04f67706a69a45d857677470e364a3609 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Tue, 19 Mar 2024 00:12:52 -0400 Subject: [PATCH 01/88] adding gmm --- .../benchmarks/ReverseMode/gmmrs/Cargo.lock | 16 ++ .../benchmarks/ReverseMode/gmmrs/Cargo.toml | 19 +++ .../benchmarks/ReverseMode/gmmrs/src/lib.rs | 126 +++++++++++++++ .../benchmarks/ReverseMode/gmmrs/src/main.rs | 147 ++++++++++++++++++ 4 files changed, 308 insertions(+) create mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/Cargo.lock create mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml create mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs create mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.lock b/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.lock new file mode 100644 index 000000000000..cfdab95b3d9c --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.lock @@ -0,0 +1,16 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "gmmrs" +version = "0.1.0" +dependencies = [ + "libm", +] + +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml b/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml new file mode 100644 index 000000000000..9ff65cd97178 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "gmmrs" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[lib] +crate-type = ["cdylib"] + + +[profile.release] +lto = "fat" + +[profile.dev] +lto = "fat" + +[dependencies] +libm = "0.2.8" diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs new file mode 100644 index 000000000000..46bd57c99dd1 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs @@ -0,0 +1,126 @@ +#![feature(autodiff)] +#![crate_type = "dylib"] +use libm::lgamma; + +#[no_mangle] +pub extern "C" fn dgmm_objective_C(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { + dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); +} + +#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; + let means = unsafe { std::slice::from_raw_parts(means, k * d) }; + let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 
2) }; + let x = unsafe { std::slice::from_raw_parts(x, n * d) }; + let wishart: Wishart = unsafe { *wishart }; + let mut err = unsafe { *err }; + gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); +} + +fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { + let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); + let icf_sz = d * (d + 1) / 2; + let mut qdiags = vec![0.; d * k]; + let mut sum_qs = vec![0.; k]; + let mut xcentered = vec![0.; d]; + let mut qxcentered = vec![0.; d]; + let mut main_term = vec![0.; k]; + + preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); + + let mut slse = 0.; + for ix in 0..n { + for ik in 0..k { + subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); + Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); + main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(d, &qxcentered); + } + + slse = slse + log_sum_exp(k, &main_term); + } + + let lse_alphas = log_sum_exp(k, alphas); + + *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); +} + +fn arr_max(n: usize, x: &[f64]) -> f64 { + let mut max = f64::NEG_INFINITY; + for i in 0..n { + if max < x[i] { + max = x[i]; + } + } + max +} + +fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { + let icf_sz = d * (d + 1) / 2; + for ik in 0..k { + sum_qs[ik as usize] = 0.; + for id in 0..d { + let q = icf[ik as usize * icf_sz as usize + id as usize]; + sum_qs[ik as usize] = sum_qs[ik as usize] + q; + qdiags[ik as usize * d as usize + id as usize] = q.exp(); + } + } +} +fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { + assert!(x.len() >= d); + assert!(y.len() >= d); + assert!(out.len() >= d); + for i in 0..d { + out[i] = x[i] - y[i]; + } +} + +fn 
Qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { + assert!(out.len() >= d); + assert!(q_diag.len() >= d); + assert!(x.len() >= d); + for i in 0..d { + out[i] = q_diag[i] * x[i]; + } + + for i in 0..d { + let mut lparamsidx = i*(2*d-i-1)/2; + for j in i + 1..d { + out[j] = out[j] + ltri[lparamsidx] * x[i]; + lparamsidx += 1; + } + } +} + +fn log_sum_exp(n: usize, x: &[f64]) -> f64 { + let mx = arr_max(n, x); + let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); + semx.ln() + mx +} +fn log_gamma_distrib(a: f64, p: f64) -> f64 { + 0.25 * p * (p - 1.) * std::f64::consts::PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() +} + +#[derive(Clone, Copy)] +#[repr(C)] +pub struct Wishart { + gamma: f64, + m: usize, +} +fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { + let n = p + wishart.m + 1; + let icf_sz = p * (p + 1) / 2; + + let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); + + let out = (0..k).map(|ik| { + let frobenius = sqnorm(p, &qdiags[ik * p as usize..]) + sqnorm(icf_sz - p, &icf[ik * icf_sz as usize + p as usize..]); + 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] + }).sum::(); + + out - k as f64 * c +} + +fn sqnorm(n: usize, x: &[f64]) -> f64 { + x.iter().map(|x| x * x).sum() +} diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs b/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs new file mode 100644 index 000000000000..784dce85143a --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs @@ -0,0 +1,147 @@ +#![feature(autodiff)] +#![crate_type = "dylib"] +use libm::lgamma; + +fn main() { + let d = 2; + let k = 2; + let n = 2; + let alphas = vec![0.5, 0.5]; + let means = vec![0., 0., 1., 1.]; + let icf = vec![1., 0., 1.]; + let x = vec![0., 0., 1., 1.]; + let wishart = Wishart { gamma: 1., m: 1 }; + let mut err 
= 0.; + let mut d_alphas = vec![0.; alphas.len()]; + let mut d_means = vec![0.; means.len()]; + let mut d_icf = vec![0.; icf.len()]; + let mut d_x = vec![0.; x.len()]; + let mut d_err = 0.; + let mut err2 = &mut err; + let mut d_err2 = &mut d_err; + let wishart2 = &wishart; + // pass as raw ptr: + dgmm_objective(d, k, n, alphas.as_ptr(), d_alphas.as_mut_ptr(), means.as_ptr(), d_means.as_mut_ptr(), icf.as_ptr(), d_icf.as_mut_ptr(), x.as_ptr(), wishart2 as *const Wishart, err2 as *mut f64, d_err2 as *mut f64); +} +fn arr_max(n: usize, x: &[f64]) -> f64 { + let mut max = f64::NEG_INFINITY; + for i in 0..n { + if max < x[i] { + max = x[i]; + } + } + max +} + +#[no_mangle] +pub extern "C" fn dgmm_objective_C(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { + dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); +} + +#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; + let means = unsafe { std::slice::from_raw_parts(means, k * d) }; + let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; + let x = unsafe { std::slice::from_raw_parts(x, n * d) }; + let wishart: Wishart = unsafe { *wishart }; + let mut err = unsafe { *err }; + gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); +} + +fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { + let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); + let icf_sz = d * (d + 1) / 2; + let mut qdiags = vec![0.; d * k]; + let 
mut sum_qs = vec![0.; k]; + let mut xcentered = vec![0.; d]; + let mut qxcentered = vec![0.; d]; + let mut main_term = vec![0.; k]; + + preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); + + let mut slse = 0.; + for ix in 0..n { + for ik in 0..k { + subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); + Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); + main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(d, &qxcentered); + } + + slse = slse + log_sum_exp(k, &main_term); + } + + let lse_alphas = log_sum_exp(k, alphas); + + *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); +} + +fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { + let icf_sz = d * (d + 1) / 2; + for ik in 0..k { + sum_qs[ik as usize] = 0.; + for id in 0..d { + let q = icf[ik as usize * icf_sz as usize + id as usize]; + sum_qs[ik as usize] = sum_qs[ik as usize] + q; + qdiags[ik as usize * d as usize + id as usize] = q.exp(); + } + } +} +fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { + assert!(x.len() >= d); + assert!(y.len() >= d); + assert!(out.len() >= d); + for i in 0..d { + out[i] = x[i] - y[i]; + } +} + +fn Qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { + assert!(out.len() >= d); + assert!(q_diag.len() >= d); + assert!(x.len() >= d); + for i in 0..d { + out[i] = q_diag[i] * x[i]; + } + + for i in 0..d { + let mut lparamsidx = i*(2*d-i-1)/2; + for j in i + 1..d { + out[j] = out[j] + ltri[lparamsidx] * x[i]; + lparamsidx += 1; + } + } +} + +fn log_sum_exp(n: usize, x: &[f64]) -> f64 { + let mx = arr_max(n, x); + let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); + semx.ln() + mx +} +fn log_gamma_distrib(a: f64, p: f64) -> f64 { + 0.25 * p * (p - 1.) 
* std::f64::consts::PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() +} + +#[derive(Clone, Copy)] +#[repr(C)] +pub struct Wishart { + gamma: f64, + m: usize, +} +fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { + let n = p + wishart.m + 1; + let icf_sz = p * (p + 1) / 2; + + let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); + + let out = (0..k).map(|ik| { + let frobenius = sqnorm(p, &qdiags[ik * p as usize..]) + sqnorm(icf_sz - p, &icf[ik * icf_sz as usize + p as usize..]); + 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] + }).sum::(); + + out - k as f64 * c +} + +fn sqnorm(n: usize, x: &[f64]) -> f64 { + x.iter().map(|x| x * x).sum() +} From 9f05ff836db1fa5a483e27e52d735d76dbab7e5c Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 25 Mar 2024 18:11:00 -0400 Subject: [PATCH 02/88] working C too --- .../benchmarks/ReverseMode/gmmrs/Cargo.toml | 3 +- .../benchmarks/ReverseMode/gmmrs/src/lib.rs | 11 +- .../benchmarks/ReverseMode/gmmrs/src/main.rs | 125 +-------------- .../benchmarks/ReverseMode/gmmrs/src/main.rs2 | 147 ++++++++++++++++++ 4 files changed, 154 insertions(+), 132 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml b/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml index 9ff65cd97178..6271be06da5d 100644 --- a/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml @@ -6,8 +6,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] -crate-type = ["cdylib"] - +crate-type = ["lib"] [profile.release] lto = "fat" diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs index 46bd57c99dd1..2b565072a505 
100644 --- a/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs @@ -1,14 +1,13 @@ #![feature(autodiff)] -#![crate_type = "dylib"] use libm::lgamma; #[no_mangle] -pub extern "C" fn dgmm_objective_C(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { +pub extern "C" fn dgmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); } #[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] -fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { +pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; let means = unsafe { std::slice::from_raw_parts(means, k * d) }; let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; @@ -18,7 +17,7 @@ fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *con gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); } -fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { +pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); let icf_sz = d * (d + 1) / 2; let mut 
qdiags = vec![0.; d * k]; @@ -104,8 +103,8 @@ fn log_gamma_distrib(a: f64, p: f64) -> f64 { #[derive(Clone, Copy)] #[repr(C)] pub struct Wishart { - gamma: f64, - m: usize, + pub gamma: f64, + pub m: usize, } fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { let n = p + wishart.m + 1; diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs b/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs index 784dce85143a..8f4357588ab8 100644 --- a/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs +++ b/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs @@ -1,6 +1,5 @@ #![feature(autodiff)] -#![crate_type = "dylib"] -use libm::lgamma; +use gmmrs::{Wishart, dgmm_objective}; fn main() { let d = 2; @@ -23,125 +22,3 @@ fn main() { // pass as raw ptr: dgmm_objective(d, k, n, alphas.as_ptr(), d_alphas.as_mut_ptr(), means.as_ptr(), d_means.as_mut_ptr(), icf.as_ptr(), d_icf.as_mut_ptr(), x.as_ptr(), wishart2 as *const Wishart, err2 as *mut f64, d_err2 as *mut f64); } -fn arr_max(n: usize, x: &[f64]) -> f64 { - let mut max = f64::NEG_INFINITY; - for i in 0..n { - if max < x[i] { - max = x[i]; - } - } - max -} - -#[no_mangle] -pub extern "C" fn dgmm_objective_C(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { - dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); -} - -#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] -fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { - let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; - let means = unsafe { std::slice::from_raw_parts(means, k * d) }; - let icf = unsafe { std::slice::from_raw_parts(icf, k 
* d * (d + 1) / 2) }; - let x = unsafe { std::slice::from_raw_parts(x, n * d) }; - let wishart: Wishart = unsafe { *wishart }; - let mut err = unsafe { *err }; - gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); -} - -fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { - let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); - let icf_sz = d * (d + 1) / 2; - let mut qdiags = vec![0.; d * k]; - let mut sum_qs = vec![0.; k]; - let mut xcentered = vec![0.; d]; - let mut qxcentered = vec![0.; d]; - let mut main_term = vec![0.; k]; - - preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); - - let mut slse = 0.; - for ix in 0..n { - for ik in 0..k { - subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); - Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); - main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(d, &qxcentered); - } - - slse = slse + log_sum_exp(k, &main_term); - } - - let lse_alphas = log_sum_exp(k, alphas); - - *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); -} - -fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { - let icf_sz = d * (d + 1) / 2; - for ik in 0..k { - sum_qs[ik as usize] = 0.; - for id in 0..d { - let q = icf[ik as usize * icf_sz as usize + id as usize]; - sum_qs[ik as usize] = sum_qs[ik as usize] + q; - qdiags[ik as usize * d as usize + id as usize] = q.exp(); - } - } -} -fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { - assert!(x.len() >= d); - assert!(y.len() >= d); - assert!(out.len() >= d); - for i in 0..d { - out[i] = x[i] - y[i]; - } -} - -fn Qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { - assert!(out.len() >= d); - assert!(q_diag.len() >= 
d); - assert!(x.len() >= d); - for i in 0..d { - out[i] = q_diag[i] * x[i]; - } - - for i in 0..d { - let mut lparamsidx = i*(2*d-i-1)/2; - for j in i + 1..d { - out[j] = out[j] + ltri[lparamsidx] * x[i]; - lparamsidx += 1; - } - } -} - -fn log_sum_exp(n: usize, x: &[f64]) -> f64 { - let mx = arr_max(n, x); - let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); - semx.ln() + mx -} -fn log_gamma_distrib(a: f64, p: f64) -> f64 { - 0.25 * p * (p - 1.) * std::f64::consts::PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() -} - -#[derive(Clone, Copy)] -#[repr(C)] -pub struct Wishart { - gamma: f64, - m: usize, -} -fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { - let n = p + wishart.m + 1; - let icf_sz = p * (p + 1) / 2; - - let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); - - let out = (0..k).map(|ik| { - let frobenius = sqnorm(p, &qdiags[ik * p as usize..]) + sqnorm(icf_sz - p, &icf[ik * icf_sz as usize + p as usize..]); - 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] - }).sum::(); - - out - k as f64 * c -} - -fn sqnorm(n: usize, x: &[f64]) -> f64 { - x.iter().map(|x| x * x).sum() -} diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 b/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 new file mode 100644 index 000000000000..784dce85143a --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 @@ -0,0 +1,147 @@ +#![feature(autodiff)] +#![crate_type = "dylib"] +use libm::lgamma; + +fn main() { + let d = 2; + let k = 2; + let n = 2; + let alphas = vec![0.5, 0.5]; + let means = vec![0., 0., 1., 1.]; + let icf = vec![1., 0., 1.]; + let x = vec![0., 0., 1., 1.]; + let wishart = Wishart { gamma: 1., m: 1 }; + let mut err = 0.; + let mut d_alphas = vec![0.; alphas.len()]; + let mut d_means = vec![0.; means.len()]; + let mut d_icf = vec![0.; 
icf.len()]; + let mut d_x = vec![0.; x.len()]; + let mut d_err = 0.; + let mut err2 = &mut err; + let mut d_err2 = &mut d_err; + let wishart2 = &wishart; + // pass as raw ptr: + dgmm_objective(d, k, n, alphas.as_ptr(), d_alphas.as_mut_ptr(), means.as_ptr(), d_means.as_mut_ptr(), icf.as_ptr(), d_icf.as_mut_ptr(), x.as_ptr(), wishart2 as *const Wishart, err2 as *mut f64, d_err2 as *mut f64); +} +fn arr_max(n: usize, x: &[f64]) -> f64 { + let mut max = f64::NEG_INFINITY; + for i in 0..n { + if max < x[i] { + max = x[i]; + } + } + max +} + +#[no_mangle] +pub extern "C" fn dgmm_objective_C(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { + dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); +} + +#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; + let means = unsafe { std::slice::from_raw_parts(means, k * d) }; + let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; + let x = unsafe { std::slice::from_raw_parts(x, n * d) }; + let wishart: Wishart = unsafe { *wishart }; + let mut err = unsafe { *err }; + gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); +} + +fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { + let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); + let icf_sz = d * (d + 1) / 2; + let mut qdiags = vec![0.; d * k]; + let mut sum_qs = vec![0.; k]; + let mut xcentered = vec![0.; d]; + let mut qxcentered = vec![0.; d]; + let mut main_term = 
vec![0.; k]; + + preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); + + let mut slse = 0.; + for ix in 0..n { + for ik in 0..k { + subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); + Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); + main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(d, &qxcentered); + } + + slse = slse + log_sum_exp(k, &main_term); + } + + let lse_alphas = log_sum_exp(k, alphas); + + *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); +} + +fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { + let icf_sz = d * (d + 1) / 2; + for ik in 0..k { + sum_qs[ik as usize] = 0.; + for id in 0..d { + let q = icf[ik as usize * icf_sz as usize + id as usize]; + sum_qs[ik as usize] = sum_qs[ik as usize] + q; + qdiags[ik as usize * d as usize + id as usize] = q.exp(); + } + } +} +fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { + assert!(x.len() >= d); + assert!(y.len() >= d); + assert!(out.len() >= d); + for i in 0..d { + out[i] = x[i] - y[i]; + } +} + +fn Qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { + assert!(out.len() >= d); + assert!(q_diag.len() >= d); + assert!(x.len() >= d); + for i in 0..d { + out[i] = q_diag[i] * x[i]; + } + + for i in 0..d { + let mut lparamsidx = i*(2*d-i-1)/2; + for j in i + 1..d { + out[j] = out[j] + ltri[lparamsidx] * x[i]; + lparamsidx += 1; + } + } +} + +fn log_sum_exp(n: usize, x: &[f64]) -> f64 { + let mx = arr_max(n, x); + let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); + semx.ln() + mx +} +fn log_gamma_distrib(a: f64, p: f64) -> f64 { + 0.25 * p * (p - 1.) * std::f64::consts::PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. 
- j as f64))).sum::() +} + +#[derive(Clone, Copy)] +#[repr(C)] +pub struct Wishart { + gamma: f64, + m: usize, +} +fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { + let n = p + wishart.m + 1; + let icf_sz = p * (p + 1) / 2; + + let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); + + let out = (0..k).map(|ik| { + let frobenius = sqnorm(p, &qdiags[ik * p as usize..]) + sqnorm(icf_sz - p, &icf[ik * icf_sz as usize + p as usize..]); + 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] + }).sum::(); + + out - k as f64 * c +} + +fn sqnorm(n: usize, x: &[f64]) -> f64 { + x.iter().map(|x| x * x).sum() +} From 4da9910986f12470743130ebaadeb4fa99653f94 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 25 Mar 2024 20:17:27 -0400 Subject: [PATCH 03/88] Delete enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 --- .../benchmarks/ReverseMode/gmmrs/src/main.rs2 | 147 ------------------ 1 file changed, 147 deletions(-) delete mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 b/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 deleted file mode 100644 index 784dce85143a..000000000000 --- a/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 +++ /dev/null @@ -1,147 +0,0 @@ -#![feature(autodiff)] -#![crate_type = "dylib"] -use libm::lgamma; - -fn main() { - let d = 2; - let k = 2; - let n = 2; - let alphas = vec![0.5, 0.5]; - let means = vec![0., 0., 1., 1.]; - let icf = vec![1., 0., 1.]; - let x = vec![0., 0., 1., 1.]; - let wishart = Wishart { gamma: 1., m: 1 }; - let mut err = 0.; - let mut d_alphas = vec![0.; alphas.len()]; - let mut d_means = vec![0.; means.len()]; - let mut d_icf = vec![0.; icf.len()]; - let mut d_x = vec![0.; x.len()]; - let mut d_err = 0.; - let mut err2 = &mut err; - let mut d_err2 = &mut d_err; - let wishart2 = 
&wishart; - // pass as raw ptr: - dgmm_objective(d, k, n, alphas.as_ptr(), d_alphas.as_mut_ptr(), means.as_ptr(), d_means.as_mut_ptr(), icf.as_ptr(), d_icf.as_mut_ptr(), x.as_ptr(), wishart2 as *const Wishart, err2 as *mut f64, d_err2 as *mut f64); -} -fn arr_max(n: usize, x: &[f64]) -> f64 { - let mut max = f64::NEG_INFINITY; - for i in 0..n { - if max < x[i] { - max = x[i]; - } - } - max -} - -#[no_mangle] -pub extern "C" fn dgmm_objective_C(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { - dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); -} - -#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] -fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { - let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; - let means = unsafe { std::slice::from_raw_parts(means, k * d) }; - let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; - let x = unsafe { std::slice::from_raw_parts(x, n * d) }; - let wishart: Wishart = unsafe { *wishart }; - let mut err = unsafe { *err }; - gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); -} - -fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { - let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); - let icf_sz = d * (d + 1) / 2; - let mut qdiags = vec![0.; d * k]; - let mut sum_qs = vec![0.; k]; - let mut xcentered = vec![0.; d]; - let mut qxcentered = vec![0.; d]; - let mut main_term = vec![0.; k]; - - preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); - - let mut slse = 0.; - for ix in 0..n { - for ik in 0..k { - subtract(d, 
&x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); - Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); - main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(d, &qxcentered); - } - - slse = slse + log_sum_exp(k, &main_term); - } - - let lse_alphas = log_sum_exp(k, alphas); - - *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); -} - -fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { - let icf_sz = d * (d + 1) / 2; - for ik in 0..k { - sum_qs[ik as usize] = 0.; - for id in 0..d { - let q = icf[ik as usize * icf_sz as usize + id as usize]; - sum_qs[ik as usize] = sum_qs[ik as usize] + q; - qdiags[ik as usize * d as usize + id as usize] = q.exp(); - } - } -} -fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { - assert!(x.len() >= d); - assert!(y.len() >= d); - assert!(out.len() >= d); - for i in 0..d { - out[i] = x[i] - y[i]; - } -} - -fn Qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { - assert!(out.len() >= d); - assert!(q_diag.len() >= d); - assert!(x.len() >= d); - for i in 0..d { - out[i] = q_diag[i] * x[i]; - } - - for i in 0..d { - let mut lparamsidx = i*(2*d-i-1)/2; - for j in i + 1..d { - out[j] = out[j] + ltri[lparamsidx] * x[i]; - lparamsidx += 1; - } - } -} - -fn log_sum_exp(n: usize, x: &[f64]) -> f64 { - let mx = arr_max(n, x); - let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); - semx.ln() + mx -} -fn log_gamma_distrib(a: f64, p: f64) -> f64 { - 0.25 * p * (p - 1.) * std::f64::consts::PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. 
- j as f64))).sum::() -} - -#[derive(Clone, Copy)] -#[repr(C)] -pub struct Wishart { - gamma: f64, - m: usize, -} -fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { - let n = p + wishart.m + 1; - let icf_sz = p * (p + 1) / 2; - - let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); - - let out = (0..k).map(|ik| { - let frobenius = sqnorm(p, &qdiags[ik * p as usize..]) + sqnorm(icf_sz - p, &icf[ik * icf_sz as usize + p as usize..]); - 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] - }).sum::(); - - out - k as f64 * c -} - -fn sqnorm(n: usize, x: &[f64]) -> f64 { - x.iter().map(|x| x * x).sum() -} From 0f68ce8848374e78502d9213baf065d05d812817 Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Thu, 28 Mar 2024 14:00:34 -0400 Subject: [PATCH 04/88] rust setup --- enzyme/benchmarks/ReverseMode/adbench/gmm.h | 34 +++++ .../benchmarks/ReverseMode/gmm/Makefile.make | 15 ++- .../benchmarks/ReverseMode/gmmrs/Cargo.lock | 16 --- .../benchmarks/ReverseMode/gmmrs/Cargo.toml | 18 --- .../benchmarks/ReverseMode/gmmrs/src/lib.rs | 125 ------------------ .../benchmarks/ReverseMode/gmmrs/src/main.rs | 24 ---- enzyme/benchmarks/lit.site.cfg.py.in | 67 ++++++++-- 7 files changed, 101 insertions(+), 198 deletions(-) delete mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/Cargo.lock delete mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml delete mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs delete mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs diff --git a/enzyme/benchmarks/ReverseMode/adbench/gmm.h b/enzyme/benchmarks/ReverseMode/adbench/gmm.h index 00f4302b9f99..e47dfaa62a23 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/gmm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/gmm.h @@ -47,6 +47,11 @@ extern "C" { alphasb, const double *means, double *meansb, const double *icf, 
double *icfb, const double *x, Wishart wishart, double *err, double * errb); + + void rust_dgmm_objective(int d, int k, int n, const double *alphas, double * + alphasb, const double *means, double *meansb, const double *icf, + double *icfb, const double *x, Wishart wishart, double *err, double * + errb); } void read_gmm_instance(const string& fn, @@ -269,6 +274,35 @@ int main(const int argc, const char* argv[]) { test_suite["tools"].push_back(enzyme); } + } + + { + + struct GMMInput input; + read_gmm_instance("data/" + path, &input.d, &input.k, &input.n, + input.alphas, input.means, input.icf, input.x, input.wishart, params.replicate_point); + + int Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; + + struct GMMOutput result = { 0, std::vector(Jcols) }; + + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + json enzyme; + enzyme["name"] = "Rust Enzyme combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; + i < result.gradient.size(); i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } test_suite["llvm-version"] = __clang_version__; test_suite["mode"] = "ReverseMode"; diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index 5072679eeb0e..1e1a36f72c2f 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -1,23 +1,28 @@ -# RUN: if [ %llvmver -ge 12 ] || [ %llvmver -le 9 ]; then cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%loadEnzyme" make -B gmm-unopt.ll gmm-raw.ll results.json -f %s; fi +# RUN: if [ %llvmver -ge 12 ] || [ %llvmver -le 9 ]; then cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadEnzyme" make -B 
gmm-unopt.ll gmm-raw.ll results.json -f %s; fi .PHONY: clean clean: rm -f *.ll *.o results.txt results.json -%-unopt.ll: %.cpp - clang++ $(BENCH) $^ -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm +%-unopt.ll: %.cpp src/lib.rs + ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib + clang++ $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o gmm-unopt.ll -S -emit-llvm #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm %-raw.ll: %-unopt.ll - opt $^ $(LOAD) -enzyme -o $@ -S + echo opt $^ $(LOAD) -passes="enzyme" -o $@ -S + opt $^ $(LOAD) -passes="enzyme" -o $@ -S %-opt.ll: %-raw.ll + echo opt $^ -o $@ -S opt $^ -o $@ -S #opt $^ -O2 -o $@ -S gmm.o: gmm-opt.ll - clang++ -O2 $^ -o $@ $(BENCHLINK) -lm + pwd + echo clang++ -O2 $^ -o $@ $(BENCHLINK) -lm /home/wmoses/git/Enzyme/enzyme/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ -v -O2 $^ -o $@ $(BENCHLINK) -lm /home/wmoses/git/Enzyme/enzyme/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: gmm.o ./$^ diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.lock b/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.lock deleted file mode 100644 index cfdab95b3d9c..000000000000 --- a/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.lock +++ /dev/null @@ -1,16 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. 
-version = 3 - -[[package]] -name = "gmmrs" -version = "0.1.0" -dependencies = [ - "libm", -] - -[[package]] -name = "libm" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml b/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml deleted file mode 100644 index 6271be06da5d..000000000000 --- a/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "gmmrs" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[lib] -crate-type = ["lib"] - -[profile.release] -lto = "fat" - -[profile.dev] -lto = "fat" - -[dependencies] -libm = "0.2.8" diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs deleted file mode 100644 index 2b565072a505..000000000000 --- a/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs +++ /dev/null @@ -1,125 +0,0 @@ -#![feature(autodiff)] -use libm::lgamma; - -#[no_mangle] -pub extern "C" fn dgmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { - dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); -} - -#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] -pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { - let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; - let means = unsafe { std::slice::from_raw_parts(means, k * d) }; - let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; - let x = 
unsafe { std::slice::from_raw_parts(x, n * d) }; - let wishart: Wishart = unsafe { *wishart }; - let mut err = unsafe { *err }; - gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); -} - -pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { - let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); - let icf_sz = d * (d + 1) / 2; - let mut qdiags = vec![0.; d * k]; - let mut sum_qs = vec![0.; k]; - let mut xcentered = vec![0.; d]; - let mut qxcentered = vec![0.; d]; - let mut main_term = vec![0.; k]; - - preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); - - let mut slse = 0.; - for ix in 0..n { - for ik in 0..k { - subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); - Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); - main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(d, &qxcentered); - } - - slse = slse + log_sum_exp(k, &main_term); - } - - let lse_alphas = log_sum_exp(k, alphas); - - *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); -} - -fn arr_max(n: usize, x: &[f64]) -> f64 { - let mut max = f64::NEG_INFINITY; - for i in 0..n { - if max < x[i] { - max = x[i]; - } - } - max -} - -fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { - let icf_sz = d * (d + 1) / 2; - for ik in 0..k { - sum_qs[ik as usize] = 0.; - for id in 0..d { - let q = icf[ik as usize * icf_sz as usize + id as usize]; - sum_qs[ik as usize] = sum_qs[ik as usize] + q; - qdiags[ik as usize * d as usize + id as usize] = q.exp(); - } - } -} -fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { - assert!(x.len() >= d); - assert!(y.len() >= d); - assert!(out.len() >= d); - for i in 0..d { - out[i] = x[i] - y[i]; - } -} - -fn Qtimesx(d: 
usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { - assert!(out.len() >= d); - assert!(q_diag.len() >= d); - assert!(x.len() >= d); - for i in 0..d { - out[i] = q_diag[i] * x[i]; - } - - for i in 0..d { - let mut lparamsidx = i*(2*d-i-1)/2; - for j in i + 1..d { - out[j] = out[j] + ltri[lparamsidx] * x[i]; - lparamsidx += 1; - } - } -} - -fn log_sum_exp(n: usize, x: &[f64]) -> f64 { - let mx = arr_max(n, x); - let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); - semx.ln() + mx -} -fn log_gamma_distrib(a: f64, p: f64) -> f64 { - 0.25 * p * (p - 1.) * std::f64::consts::PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() -} - -#[derive(Clone, Copy)] -#[repr(C)] -pub struct Wishart { - pub gamma: f64, - pub m: usize, -} -fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { - let n = p + wishart.m + 1; - let icf_sz = p * (p + 1) / 2; - - let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); - - let out = (0..k).map(|ik| { - let frobenius = sqnorm(p, &qdiags[ik * p as usize..]) + sqnorm(icf_sz - p, &icf[ik * icf_sz as usize + p as usize..]); - 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] - }).sum::(); - - out - k as f64 * c -} - -fn sqnorm(n: usize, x: &[f64]) -> f64 { - x.iter().map(|x| x * x).sum() -} diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs b/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs deleted file mode 100644 index 8f4357588ab8..000000000000 --- a/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs +++ /dev/null @@ -1,24 +0,0 @@ -#![feature(autodiff)] -use gmmrs::{Wishart, dgmm_objective}; - -fn main() { - let d = 2; - let k = 2; - let n = 2; - let alphas = vec![0.5, 0.5]; - let means = vec![0., 0., 1., 1.]; - let icf = vec![1., 0., 1.]; - let x = vec![0., 0., 1., 1.]; - let wishart = Wishart { gamma: 1., m: 1 }; - let mut err = 0.; 
- let mut d_alphas = vec![0.; alphas.len()]; - let mut d_means = vec![0.; means.len()]; - let mut d_icf = vec![0.; icf.len()]; - let mut d_x = vec![0.; x.len()]; - let mut d_err = 0.; - let mut err2 = &mut err; - let mut d_err2 = &mut d_err; - let wishart2 = &wishart; - // pass as raw ptr: - dgmm_objective(d, k, n, alphas.as_ptr(), d_alphas.as_mut_ptr(), means.as_ptr(), d_means.as_mut_ptr(), icf.as_ptr(), d_icf.as_mut_ptr(), x.as_ptr(), wishart2 as *const Wishart, err2 as *mut f64, d_err2 as *mut f64); -} diff --git a/enzyme/benchmarks/lit.site.cfg.py.in b/enzyme/benchmarks/lit.site.cfg.py.in index 93937f9c62d3..2ef3c28b0ca9 100644 --- a/enzyme/benchmarks/lit.site.cfg.py.in +++ b/enzyme/benchmarks/lit.site.cfg.py.in @@ -49,21 +49,68 @@ config.substitutions.append(('%lli', config.llvm_tools_dir + "/lli" + (" --jit-k config.substitutions.append(('%opt', config.llvm_tools_dir + "/opt")) config.substitutions.append(('%llvmver', config.llvm_ver)) config.substitutions.append(('%FileCheck', config.llvm_tools_dir + "/FileCheck")) -config.substitutions.append(('%clang', config.llvm_tools_dir + "/clang")) -config.substitutions.append(('%loadEnzyme', '' - + (" --enable-new-pm=0" if int(config.llvm_ver) >= 13 else "") + +emopt = config.enzyme_obj_root + "/Enzyme/MLIR/enzymemlir-opt" +if len("@ENZYME_BINARY_DIR@") == 0: + emopt = os.path.dirname(os.path.abspath(__file__)) + "/../enzymemlir-opt" + +eclang = config.llvm_tools_dir + "/clang" +if len("@ENZYME_BINARY_DIR@") == 0: + eclang = os.path.dirname(os.path.abspath(__file__)) + "/../enzyme-clang" + resource = config.llvm_tools_dir + "/../clang/staging" + eclang += " -resource-dir " + resource + " " + eclang += "-I " + os.path.dirname(os.path.abspath(__file__)) + "/Integration" + +config.substitutions.append(('%eopt', emopt)) +config.substitutions.append(('%llvmver', config.llvm_ver)) +config.substitutions.append(('%FileCheck', config.llvm_tools_dir + "/FileCheck")) +config.substitutions.append(('%clang', eclang)) 
+config.substitutions.append(('%O0TBAA', "-O1 -Xclang -disable-llvm-passes")) + +oldPM = ((" --enable-new-pm=0" if int(config.llvm_ver) >= 13 else "") + ' -load=@ENZYME_BINARY_DIR@/Enzyme/LLVMEnzyme-' + config.llvm_ver + config.llvm_shlib_ext - + (" --enzyme-attributor=0" if int(config.llvm_ver) >= 13 else "") - + ' -enzyme-preopt=0' - )) + + (" --enzyme-attributor=0" if int(config.llvm_ver) >= 13 else "")) +newPM = ((" --enable-new-pm=1" if int(config.llvm_ver) in (12,13) else "") + + ' -load-pass-plugin=@ENZYME_BINARY_DIR@/Enzyme/LLVMEnzyme-' + config.llvm_ver + config.llvm_shlib_ext + + ' -load=@ENZYME_BINARY_DIR@/Enzyme/LLVMEnzyme-' + config.llvm_ver + config.llvm_shlib_ext + + (" --enzyme-attributor=0" if int(config.llvm_ver) >= 13 else "")) +if len("@ENZYME_BINARY_DIR@") == 0: + oldPM = ((" --enable-new-pm=0" if int(config.llvm_ver) >= 13 else "") + + (" --enzyme-attributor=0" if int(config.llvm_ver) >= 13 else "")) + newPM = ((" --enable-new-pm=1" if int(config.llvm_ver) in (12,13) else "") + + (" --enzyme-attributor=0" if int(config.llvm_ver) >= 13 else "")) + +oldPMOP = oldPM +newPMOP = newPM +if int(config.llvm_ver) == 16: + newPM += " -opaque-pointers=0" + oldPM += " -opaque-pointers=0" + +config.substitutions.append(('%loadEnzyme', oldPM if int(config.llvm_ver) < 16 else newPM)) +config.substitutions.append(('%newLoadEnzyme', newPM)) +config.substitutions.append(('%OPloadEnzyme', oldPMOP if int(config.llvm_ver) < 16 else newPMOP)) +config.substitutions.append(('%OPnewLoadEnzyme', newPMOP)) +config.substitutions.append(('%enzyme', ('-enzyme' if int(config.llvm_ver) < 16 else '-passes="enzyme"'))) +config.substitutions.append(('%simplifycfg', ("simplify-cfg" if int(config.llvm_ver) < 11 else "simplifycfg"))) +config.substitutions.append(('%loopmssa', ("loop" if int(config.llvm_ver) < 11 else "loop-mssa"))) + config.substitutions.append(('%loadBC', '' + ' @ENZYME_BINARY_DIR@/BCLoad/BCPass-' + config.llvm_ver + config.llvm_shlib_ext )) 
config.substitutions.append(('%BClibdir', '@ENZYME_SOURCE_DIR@/bclib/')) -config.substitutions.append(('%loadClangEnzyme', '' - + (" -fno-experimental-new-pass-manager" if int(config.llvm_ver) >= 13 else "") - + ' -Xclang -load -Xclang @ENZYME_BINARY_DIR@/Enzyme/ClangEnzyme-' + config.llvm_ver + config.llvm_shlib_ext - )) + +oldPM = (((" -fno-experimental-new-pass-manager" if int(config.llvm_ver) < 14 else "-flegacy-pass-manager") if int(config.llvm_ver) >= 13 else "") + + ' -Xclang -load -Xclang @ENZYME_BINARY_DIR@/Enzyme/ClangEnzyme-' + config.llvm_ver + config.llvm_shlib_ext) +newPM = ((" -fexperimental-new-pass-manager" if int(config.llvm_ver) < 13 else "") + + ' -fpass-plugin=@ENZYME_BINARY_DIR@/Enzyme/ClangEnzyme-' + config.llvm_ver + config.llvm_shlib_ext + + ' -Xclang -load -Xclang @ENZYME_BINARY_DIR@/Enzyme/ClangEnzyme-' + config.llvm_ver + config.llvm_shlib_ext) + +if len("@ENZYME_BINARY_DIR@") == 0: + oldPM = ((" -fno-experimental-new-pass-manager" if int(config.llvm_ver) < 14 else "-flegacy-pass-manager") if int(config.llvm_ver) >= 13 else "") + newPM = (" -fexperimental-new-pass-manager" if int(config.llvm_ver) < 13 else "") + +config.substitutions.append(('%loadClangEnzyme', oldPM if int(config.llvm_ver) < 15 else newPM)) +config.substitutions.append(('%newLoadClangEnzyme', newPM)) # Let the main config do the real work. lit_config.load_config(config, "@ENZYME_SOURCE_DIR@/benchmarks/lit.cfg.py") From 7995eb3be465a3b9852c4f9af05972befa268186 Mon Sep 17 00:00:00 2001 From: "William S. 
Moses" Date: Thu, 28 Mar 2024 18:46:10 -0400 Subject: [PATCH 05/88] add files --- enzyme/benchmarks/ReverseMode/gmm/Cargo.lock | 16 +++ enzyme/benchmarks/ReverseMode/gmm/Cargo.toml | 18 +++ enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 125 ++++++++++++++++++ enzyme/benchmarks/ReverseMode/gmm/src/main.rs | 24 ++++ 4 files changed, 183 insertions(+) create mode 100644 enzyme/benchmarks/ReverseMode/gmm/Cargo.lock create mode 100644 enzyme/benchmarks/ReverseMode/gmm/Cargo.toml create mode 100644 enzyme/benchmarks/ReverseMode/gmm/src/lib.rs create mode 100644 enzyme/benchmarks/ReverseMode/gmm/src/main.rs diff --git a/enzyme/benchmarks/ReverseMode/gmm/Cargo.lock b/enzyme/benchmarks/ReverseMode/gmm/Cargo.lock new file mode 100644 index 000000000000..cfdab95b3d9c --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmm/Cargo.lock @@ -0,0 +1,16 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "gmmrs" +version = "0.1.0" +dependencies = [ + "libm", +] + +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" diff --git a/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml new file mode 100644 index 000000000000..6271be06da5d --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "gmmrs" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[lib] +crate-type = ["lib"] + +[profile.release] +lto = "fat" + +[profile.dev] +lto = "fat" + +[dependencies] +libm = "0.2.8" diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs new file mode 100644 index 000000000000..c6bc0c737dd4 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -0,0 
+1,125 @@ +#![feature(autodiff)] +use libm::lgamma; + +#[no_mangle] +pub extern "C" fn rust_dgmm_objective(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { + dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); +} + +#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; + let means = unsafe { std::slice::from_raw_parts(means, k * d) }; + let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; + let x = unsafe { std::slice::from_raw_parts(x, n * d) }; + let wishart: Wishart = unsafe { *wishart }; + let mut err = unsafe { *err }; + gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); +} + +pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { + let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); + let icf_sz = d * (d + 1) / 2; + let mut qdiags = vec![0.; d * k]; + let mut sum_qs = vec![0.; k]; + let mut xcentered = vec![0.; d]; + let mut qxcentered = vec![0.; d]; + let mut main_term = vec![0.; k]; + + preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); + + let mut slse = 0.; + for ix in 0..n { + for ik in 0..k { + subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); + Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); + main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(d, &qxcentered); + } + + slse = 
slse + log_sum_exp(k, &main_term); + } + + let lse_alphas = log_sum_exp(k, alphas); + + *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); +} + +fn arr_max(n: usize, x: &[f64]) -> f64 { + let mut max = f64::NEG_INFINITY; + for i in 0..n { + if max < x[i] { + max = x[i]; + } + } + max +} + +fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { + let icf_sz = d * (d + 1) / 2; + for ik in 0..k { + sum_qs[ik as usize] = 0.; + for id in 0..d { + let q = icf[ik as usize * icf_sz as usize + id as usize]; + sum_qs[ik as usize] = sum_qs[ik as usize] + q; + qdiags[ik as usize * d as usize + id as usize] = q.exp(); + } + } +} +fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { + assert!(x.len() >= d); + assert!(y.len() >= d); + assert!(out.len() >= d); + for i in 0..d { + out[i] = x[i] - y[i]; + } +} + +fn Qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { + assert!(out.len() >= d); + assert!(q_diag.len() >= d); + assert!(x.len() >= d); + for i in 0..d { + out[i] = q_diag[i] * x[i]; + } + + for i in 0..d { + let mut lparamsidx = i*(2*d-i-1)/2; + for j in i + 1..d { + out[j] = out[j] + ltri[lparamsidx] * x[i]; + lparamsidx += 1; + } + } +} + +fn log_sum_exp(n: usize, x: &[f64]) -> f64 { + let mx = arr_max(n, x); + let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); + semx.ln() + mx +} +fn log_gamma_distrib(a: f64, p: f64) -> f64 { + 0.25 * p * (p - 1.) * std::f64::consts::PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. 
- j as f64))).sum::() +} + +#[derive(Clone, Copy)] +#[repr(C)] +pub struct Wishart { + pub gamma: f64, + pub m: usize, +} +fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { + let n = p + wishart.m + 1; + let icf_sz = p * (p + 1) / 2; + + let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); + + let out = (0..k).map(|ik| { + let frobenius = sqnorm(p, &qdiags[ik * p as usize..]) + sqnorm(icf_sz - p, &icf[ik * icf_sz as usize + p as usize..]); + 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] + }).sum::(); + + out - k as f64 * c +} + +fn sqnorm(n: usize, x: &[f64]) -> f64 { + x.iter().map(|x| x * x).sum() +} diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/main.rs b/enzyme/benchmarks/ReverseMode/gmm/src/main.rs new file mode 100644 index 000000000000..8f4357588ab8 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmm/src/main.rs @@ -0,0 +1,24 @@ +#![feature(autodiff)] +use gmmrs::{Wishart, dgmm_objective}; + +fn main() { + let d = 2; + let k = 2; + let n = 2; + let alphas = vec![0.5, 0.5]; + let means = vec![0., 0., 1., 1.]; + let icf = vec![1., 0., 1.]; + let x = vec![0., 0., 1., 1.]; + let wishart = Wishart { gamma: 1., m: 1 }; + let mut err = 0.; + let mut d_alphas = vec![0.; alphas.len()]; + let mut d_means = vec![0.; means.len()]; + let mut d_icf = vec![0.; icf.len()]; + let mut d_x = vec![0.; x.len()]; + let mut d_err = 0.; + let mut err2 = &mut err; + let mut d_err2 = &mut d_err; + let wishart2 = &wishart; + // pass as raw ptr: + dgmm_objective(d, k, n, alphas.as_ptr(), d_alphas.as_mut_ptr(), means.as_ptr(), d_means.as_mut_ptr(), icf.as_ptr(), d_icf.as_mut_ptr(), x.as_ptr(), wishart2 as *const Wishart, err2 as *mut f64, d_err2 as *mut f64); +} From 069e3cb85c92ce8f405fc905930d77ad7d2d0916 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Fri, 29 Mar 2024 00:10:33 -0400 Subject: [PATCH 
06/88] improve makefile and fix c ffi --- enzyme/benchmarks/ReverseMode/gmm/Makefile.make | 8 +++++--- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 8 ++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index 1e1a36f72c2f..5d8b4b70d7f4 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -2,12 +2,14 @@ .PHONY: clean +dir=/h/344/drehwald/prog/Enzyme/enzyme + clean: rm -f *.ll *.o results.txt results.json %-unopt.ll: %.cpp src/lib.rs ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib - clang++ $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o gmm-unopt.ll -S -emit-llvm + clang++ -pthread $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o gmm-unopt.ll -S -emit-llvm #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm %-raw.ll: %-unopt.ll @@ -21,8 +23,8 @@ clean: gmm.o: gmm-opt.ll pwd - echo clang++ -O2 $^ -o $@ $(BENCHLINK) -lm /home/wmoses/git/Enzyme/enzyme/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 - clang++ -v -O2 $^ -o $@ $(BENCHLINK) -lm /home/wmoses/git/Enzyme/enzyme/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 + echo clang++ -pthread -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ -pthread -v -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: gmm.o ./$^ diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs 
b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index c6bc0c737dd4..ecbb0cb2545b 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -2,8 +2,8 @@ use libm::lgamma; #[no_mangle] -pub extern "C" fn rust_dgmm_objective(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { - dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); +pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { + //dgmm_objective(d as usize, k as usize, n as usize, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); } #[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] @@ -104,10 +104,10 @@ fn log_gamma_distrib(a: f64, p: f64) -> f64 { #[repr(C)] pub struct Wishart { pub gamma: f64, - pub m: usize, + pub m: i32, } fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { - let n = p + wishart.m + 1; + let n = p + wishart.m as usize + 1; let icf_sz = p * (p + 1) / 2; let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); From be54358ec1b213e5fcc3382c9284e4eff9483386 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sat, 30 Mar 2024 15:59:35 -0400 Subject: [PATCH 07/88] maybe needed? 
pthread for cmake --- enzyme/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/enzyme/CMakeLists.txt b/enzyme/CMakeLists.txt index 82c7887cde3e..f2e480f181ce 100644 --- a/enzyme/CMakeLists.txt +++ b/enzyme/CMakeLists.txt @@ -21,6 +21,10 @@ SET(CMAKE_CXX_FLAGS "-Wall -fno-rtti ${CMAKE_CXX_FLAGS} -Werror=unused-variable SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -ggdb") SET(CMAKE_CXX_FLAGS_RELEASE "-O2") + + +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") + SET(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -ggdb -fno-omit-frame-pointer") #SET(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fno-omit-frame-pointer -fsanitize=address") From 4423222a6501de34fe92399667a41e19265c47f1 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Sat, 30 Mar 2024 17:45:46 -0600 Subject: [PATCH 08/88] bench gmm: use path relative to Makefile --- enzyme/benchmarks/ReverseMode/gmm/Makefile.make | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index 5d8b4b70d7f4..5a403c789c80 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -2,7 +2,7 @@ .PHONY: clean -dir=/h/344/drehwald/prog/Enzyme/enzyme +dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..) clean: rm -f *.ll *.o results.txt results.json From 7b5a24c7711cbcbb829a1ae0c0fcb84451d00da4 Mon Sep 17 00:00:00 2001 From: "William S. 
Moses" Date: Sat, 30 Mar 2024 20:36:34 -0400 Subject: [PATCH 09/88] Fix byref issue for rust abi --- enzyme/benchmarks/ReverseMode/adbench/gmm.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/gmm.h b/enzyme/benchmarks/ReverseMode/adbench/gmm.h index e47dfaa62a23..934d26b829a8 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/gmm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/gmm.h @@ -50,7 +50,7 @@ extern "C" { void rust_dgmm_objective(int d, int k, int n, const double *alphas, double * alphasb, const double *means, double *meansb, const double *icf, - double *icfb, const double *x, Wishart wishart, double *err, double * + double *icfb, const double *x, Wishart &wishart, double *err, double * errb); } @@ -128,10 +128,7 @@ void read_gmm_instance(const string& fn, fclose(fid); } -typedef void(*deriv_t)(int d, int k, int n, const double *alphas, double *alphasb, const double *means, double *meansb, const double *icf, - double *icfb, const double *x, Wishart wishart, double *err, double *errb); - -template +template void calculate_jacobian(struct GMMInput &input, struct GMMOutput &result) { double* alphas_gradient_part = result.gradient.data(); @@ -262,6 +259,7 @@ int main(const int argc, const char* argv[]) { gettimeofday(&start, NULL); calculate_jacobian(input, result); gettimeofday(&end, NULL); + printf("Enzyme c++ combined %0.6f\n", tdiff(&start, &end)); json enzyme; enzyme["name"] = "Enzyme combined"; enzyme["runtime"] = tdiff(&start, &end); @@ -291,6 +289,7 @@ int main(const int argc, const char* argv[]) { gettimeofday(&start, NULL); calculate_jacobian(input, result); gettimeofday(&end, NULL); + printf("Enzyme rust combined %0.6f\n", tdiff(&start, &end)); json enzyme; enzyme["name"] = "Rust Enzyme combined"; enzyme["runtime"] = tdiff(&start, &end); From ea03750e1bd193721bc01ad069e0750c5f79c293 Mon Sep 17 00:00:00 2001 From: "William S. 
Moses" Date: Sat, 30 Mar 2024 20:49:48 -0400 Subject: [PATCH 10/88] Add primal bench/test --- enzyme/benchmarks/ReverseMode/adbench/gmm.h | 48 +++++++++++++++++++ enzyme/benchmarks/ReverseMode/gmm/Cargo.toml | 3 ++ .../benchmarks/ReverseMode/gmm/Makefile.make | 6 +-- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 7 ++- 4 files changed, 59 insertions(+), 5 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/gmm.h b/enzyme/benchmarks/ReverseMode/adbench/gmm.h index 934d26b829a8..acd18a1f1319 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/gmm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/gmm.h @@ -33,6 +33,17 @@ struct GMMParameters { }; extern "C" { +void gmm_objective( + int d, + int k, + int n, + double const* alphas, + double const* means, + double const* icf, + double const* x, + Wishart wishart, + double* err +); void dgmm_objective(int d, int k, int n, const double *alphas, double * alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart wishart, double *err, double * @@ -52,6 +63,10 @@ extern "C" { alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart &wishart, double *err, double * errb); + + void rust_gmm_objective(int d, int k, int n, const double *alphas, + const double *means, const double *icf, + const double *x, Wishart &wishart, double *err); } void read_gmm_instance(const string& fn, @@ -161,6 +176,25 @@ void calculate_jacobian(struct GMMInput &input, struct GMMOutput &result) ); } +template +double primal(struct GMMInput &input) +{ + double tmp = 0.0; // stores fictive result + // (Tapenade doesn't calculate an original function in reverse mode) + deriv( + input.d, + input.k, + input.n, + input.alphas.data(), + input.means.data(), + input.icf.data(), + input.x.data(), + input.wishart, + &tmp + ); + return tmp; +} + int main(const int argc, const char* argv[]) { printf("starting main\n"); @@ -284,6 +318,20 @@ int main(const int argc, const 
char* argv[]) { struct GMMOutput result = { 0, std::vector(Jcols) }; + { + struct timeval start, end; + gettimeofday(&start, NULL); + auto res = primal(input); + gettimeofday(&end, NULL); + printf("c++ primal combined t=%0.6f, err=%f\n", tdiff(&start, &end), res); + } + { + struct timeval start, end; + gettimeofday(&start, NULL); + auto res = primal(input); + gettimeofday(&end, NULL); + printf("rust primal combined t=%0.6f, err=%f\n", tdiff(&start, &end), res); + } { struct timeval start, end; gettimeofday(&start, NULL); diff --git a/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml index 6271be06da5d..5916af111e25 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml @@ -10,6 +10,9 @@ crate-type = ["lib"] [profile.release] lto = "fat" +debug = true +strip = "none" +opt-level = 1 [profile.dev] lto = "fat" diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index 5a403c789c80..1fd871d8963e 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -9,7 +9,7 @@ clean: %-unopt.ll: %.cpp src/lib.rs ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib - clang++ -pthread $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o gmm-unopt.ll -S -emit-llvm + clang++ -g $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O1 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o gmm-unopt.ll -S -emit-llvm #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm %-raw.ll: %-unopt.ll @@ -22,9 +22,7 @@ clean: #opt $^ -O2 -o $@ -S gmm.o: gmm-opt.ll - pwd - echo clang++ -pthread -O2 $^ -o $@ $(BENCHLINK) -lm 
$(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 - clang++ -pthread -v -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ -g -lpthread -v -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: gmm.o ./$^ diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index ecbb0cb2545b..7d4dd714a63d 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -3,7 +3,12 @@ use libm::lgamma; #[no_mangle] pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { - //dgmm_objective(d as usize, k as usize, n as usize, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); + dgmm_objective(d as usize, k as usize, n as usize, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); +} + +#[no_mangle] +pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + gmm_objective_c(d as usize, k as usize, n as usize, alphas, means, icf, x, wishart, err); } #[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] From 77a130f6983287e40874f4751f8c935adf2d62c7 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sat, 30 Mar 2024 20:57:02 -0400 Subject: [PATCH 11/88] fix math --- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index 
7d4dd714a63d..b057a2443a87 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -1,5 +1,6 @@ #![feature(autodiff)] use libm::lgamma; +use std::f64::consts::PI; #[no_mangle] pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { @@ -23,7 +24,7 @@ pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: } pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { - let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); + let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); let icf_sz = d * (d + 1) / 2; let mut qdiags = vec![0.; d * k]; let mut sum_qs = vec![0.; k]; From 4abf2bf787a494925b27dbc897d86039d99a6e99 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sat, 30 Mar 2024 21:01:50 -0400 Subject: [PATCH 12/88] write into return var --- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index b057a2443a87..3d869c072a5d 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -19,8 +19,9 @@ pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; let x = unsafe { std::slice::from_raw_parts(x, n * d) }; let wishart: Wishart = unsafe { *wishart }; - let mut err = unsafe { *err }; - gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); + let mut my_err = unsafe { *err }; + gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); + unsafe { *err = my_err }; } pub fn 
gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { From 5cde3d7efdb1b3617997d2559d0b5329d61cd87d Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Sat, 30 Mar 2024 21:47:48 -0400 Subject: [PATCH 13/88] Cleanup gmm config --- enzyme/benchmarks/ReverseMode/adbench/gmm.h | 11 +++++++++++ enzyme/benchmarks/ReverseMode/gmm/Cargo.toml | 4 +--- .../benchmarks/ReverseMode/gmm/Makefile.make | 19 +++---------------- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/gmm.h b/enzyme/benchmarks/ReverseMode/adbench/gmm.h index acd18a1f1319..a25b4d0ded54 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/gmm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/gmm.h @@ -324,6 +324,12 @@ int main(const int argc, const char* argv[]) { auto res = primal(input); gettimeofday(&end, NULL); printf("c++ primal combined t=%0.6f, err=%f\n", tdiff(&start, &end), res); + + json primal; + primal["name"] = "C++ primal"; + primal["runtime"] = tdiff(&start, &end); + primal["result"].push_back(res); + test_suite["tools"].push_back(primal); } { struct timeval start, end; @@ -331,6 +337,11 @@ int main(const int argc, const char* argv[]) { auto res = primal(input); gettimeofday(&end, NULL); printf("rust primal combined t=%0.6f, err=%f\n", tdiff(&start, &end), res); + json primal; + primal["name"] = "Rust primal"; + primal["runtime"] = tdiff(&start, &end); + primal["result"].push_back(res); + test_suite["tools"].push_back(primal); } { struct timeval start, end; diff --git a/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml index 5916af111e25..655d1a1f3117 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml @@ -10,9 +10,7 @@ crate-type = ["lib"] [profile.release] lto = "fat" -debug = true -strip = "none" -opt-level = 1 +opt-level = 3 [profile.dev] lto = "fat" diff --git 
a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index 1fd871d8963e..77cc84e2832e 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -1,4 +1,4 @@ -# RUN: if [ %llvmver -ge 12 ] || [ %llvmver -le 9 ]; then cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadEnzyme" make -B gmm-unopt.ll gmm-raw.ll results.json -f %s; fi +# RUN: if [ %llvmver -ge 12 ] || [ %llvmver -le 9 ]; then cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B gmm.o results.json -f %s; fi .PHONY: clean @@ -7,22 +7,9 @@ dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..) clean: rm -f *.ll *.o results.txt results.json -%-unopt.ll: %.cpp src/lib.rs +gmm.o: gmm.cpp src/lib.rs ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib - clang++ -g $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O1 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o gmm-unopt.ll -S -emit-llvm - #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm - -%-raw.ll: %-unopt.ll - echo opt $^ $(LOAD) -passes="enzyme" -o $@ -S - opt $^ $(LOAD) -passes="enzyme" -o $@ -S - -%-opt.ll: %-raw.ll - echo opt $^ -o $@ -S - opt $^ -o $@ -S - #opt $^ -O2 -o $@ -S - -gmm.o: gmm-opt.ll - clang++ -g -lpthread -v -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ $(LOAD) $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o gmm.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: gmm.o ./$^ From af6583dfa2481e120a9a103484438c9123e1384a Mon Sep 17 00:00:00 2001 From: 
Jed Brown Date: Sat, 30 Mar 2024 20:48:31 -0600 Subject: [PATCH 14/88] bench gmm: make cmath::lgamma with libm as an optional feature --- enzyme/benchmarks/ReverseMode/gmm/Cargo.toml | 7 ++++++- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 16 +++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml index 655d1a1f3117..85dfa6310c34 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml @@ -8,12 +8,17 @@ edition = "2021" [lib] crate-type = ["lib"] +[features] +libm = ["dep:libm"] + [profile.release] lto = "fat" opt-level = 3 +#debug = true +#strip = "none" [profile.dev] lto = "fat" [dependencies] -libm = "0.2.8" +libm = { version = "0.2.8", optional = true } diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index 3d869c072a5d..7cf23525d026 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -1,7 +1,21 @@ #![feature(autodiff)] -use libm::lgamma; use std::f64::consts::PI; +#[cfg(feature = "libm")] +use libm::lgamma; + +#[cfg(not(feature = "libm"))] +mod cmath { + extern "C" { + pub fn lgamma(x: f64) -> f64; + } +} +#[cfg(not(feature = "libm"))] +#[inline] +fn lgamma(x: f64) -> f64 { + unsafe { cmath::lgamma(x) } +} + #[no_mangle] pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { dgmm_objective(d as usize, k as usize, n as usize, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); From 48c2e1fdd7ecd155fe54dd39d5d8049f16da5101 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sat, 30 Mar 2024 23:06:51 -0400 Subject: [PATCH 15/88] oxidize - more noalias --- 
enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 41 +++++++++++++++----- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index 7cf23525d026..80d8b3789d29 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -18,27 +18,50 @@ fn lgamma(x: f64) -> f64 { #[no_mangle] pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { - dgmm_objective(d as usize, k as usize, n as usize, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); + let k = k as usize; + let n = n as usize; + let d = d as usize; + let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; + let means = unsafe { std::slice::from_raw_parts(means, k * d) }; + let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; + let x = unsafe { std::slice::from_raw_parts(x, n * d) }; + //let wishart: Wishart = unsafe { *wishart }; + let mut my_err = unsafe { *err }; + + let mut d_alphas = unsafe { std::slice::from_raw_parts_mut(dalphas, k) }; + let mut d_means = unsafe { std::slice::from_raw_parts_mut(dmeans, k * d) }; + let mut d_icf = unsafe { std::slice::from_raw_parts_mut(dicf, k * d * (d + 1) / 2) }; + let mut my_derr = unsafe { *derr }; + + dgmm_objective(d, k, n, alphas, d_alphas, means, d_means, icf, d_icf, x, wishart, &mut my_err, &mut my_derr); + + unsafe { *err = my_err }; + unsafe { *derr = my_derr }; } #[no_mangle] pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { - gmm_objective_c(d as usize, k as usize, n as usize, alphas, means, icf, x, wishart, err); -} - -#[autodiff(dgmm_objective, Reverse, Const, 
Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] -pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + let k = k as usize; + let n = n as usize; + let d = d as usize; let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; let means = unsafe { std::slice::from_raw_parts(means, k * d) }; let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; let x = unsafe { std::slice::from_raw_parts(x, n * d) }; - let wishart: Wishart = unsafe { *wishart }; + //let wishart: Wishart = unsafe { *wishart }; let mut my_err = unsafe { *err }; gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); unsafe { *err = my_err }; } -pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { +//#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +//pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { +// gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); +//} + +#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: *const Wishart, err: &mut f64) { + let wishart: Wishart = unsafe { *wishart }; let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); let icf_sz = d * (d + 1) / 2; let mut qdiags = vec![0.; d * k]; @@ -118,7 +141,7 @@ fn log_sum_exp(n: usize, x: &[f64]) -> f64 { semx.ln() + mx } fn log_gamma_distrib(a: f64, p: f64) -> f64 { - 0.25 * p * (p - 1.) * std::f64::consts::PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. 
- j as f64))).sum::() + 0.25 * p * (p - 1.) * PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() } #[derive(Clone, Copy)] From ba75484b4afed02330dda8ea16edf8fcf2fd6fbf Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 31 Mar 2024 00:13:53 -0400 Subject: [PATCH 16/88] reduce caching --- enzyme/benchmarks/ReverseMode/gmm/Makefile.make | 2 +- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index 77cc84e2832e..1a9c3cd3b826 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -8,7 +8,7 @@ clean: rm -f *.ll *.o results.txt results.json gmm.o: gmm.cpp src/lib.rs - ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib + ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm clang++ $(LOAD) $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o gmm.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: gmm.o diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index 80d8b3789d29..914c41e156d4 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -25,7 +25,7 @@ pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64 let means = unsafe { std::slice::from_raw_parts(means, k * d) }; let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; let x = unsafe { std::slice::from_raw_parts(x, n * d) }; - //let wishart: Wishart = unsafe { *wishart }; + let wishart: Wishart = unsafe { *wishart }; let mut my_err = unsafe { *err }; let mut d_alphas = unsafe { 
std::slice::from_raw_parts_mut(dalphas, k) }; @@ -33,7 +33,7 @@ pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64 let mut d_icf = unsafe { std::slice::from_raw_parts_mut(dicf, k * d * (d + 1) / 2) }; let mut my_derr = unsafe { *derr }; - dgmm_objective(d, k, n, alphas, d_alphas, means, d_means, icf, d_icf, x, wishart, &mut my_err, &mut my_derr); + dgmm_objective(d, k, n, alphas, d_alphas, means, d_means, icf, d_icf, x, wishart.gamma, wishart.m, &mut my_err, &mut my_derr); unsafe { *err = my_err }; unsafe { *derr = my_derr }; @@ -48,9 +48,9 @@ pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, let means = unsafe { std::slice::from_raw_parts(means, k * d) }; let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; let x = unsafe { std::slice::from_raw_parts(x, n * d) }; - //let wishart: Wishart = unsafe { *wishart }; + let wishart: Wishart = unsafe { *wishart }; let mut my_err = unsafe { *err }; - gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); + gmm_objective(d, k, n, alphas, means, icf, x, wishart.gamma, wishart.m, &mut my_err); unsafe { *err = my_err }; } @@ -59,9 +59,10 @@ pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, // gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); //} -#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] -pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: *const Wishart, err: &mut f64) { - let wishart: Wishart = unsafe { *wishart }; +#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Const, Duplicated)] +pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], gamma: f64, m: i32, err: &mut f64) { + let wishart: Wishart = Wishart { gamma, m }; + //let 
wishart: Wishart = unsafe { *wishart }; let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); let icf_sz = d * (d + 1) / 2; let mut qdiags = vec![0.; d * k]; From 9394028d1cbe382203c64a1c26b487f23598d101 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Sat, 30 Mar 2024 22:07:01 -0600 Subject: [PATCH 17/88] bench gmm: makefile dep on Cargo.toml, split targets --- enzyme/benchmarks/ReverseMode/gmm/Makefile.make | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index 1a9c3cd3b826..e3c15f4dcc11 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -7,8 +7,10 @@ dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..) clean: rm -f *.ll *.o results.txt results.json -gmm.o: gmm.cpp src/lib.rs +$(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a: src/lib.rs Cargo.toml ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm + +gmm.o: gmm.cpp $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a clang++ $(LOAD) $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o gmm.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: gmm.o From 814eb62e8c59292c86af1f99e44f7a85fadc157a Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 31 Mar 2024 00:50:56 -0400 Subject: [PATCH 18/88] revert cmake pthread since only needed for Rust --- enzyme/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/enzyme/CMakeLists.txt b/enzyme/CMakeLists.txt index f2e480f181ce..82c7887cde3e 100644 --- a/enzyme/CMakeLists.txt +++ b/enzyme/CMakeLists.txt @@ -21,10 +21,6 @@ SET(CMAKE_CXX_FLAGS "-Wall -fno-rtti ${CMAKE_CXX_FLAGS} -Werror=unused-variable SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -ggdb") SET(CMAKE_CXX_FLAGS_RELEASE 
"-O2") - - -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") - SET(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -ggdb -fno-omit-frame-pointer") #SET(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fno-omit-frame-pointer -fsanitize=address") From 41157fafd71d20cc7707269cec28f2fc1b28cd50 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Sat, 30 Mar 2024 23:44:38 -0600 Subject: [PATCH 19/88] bench gmm: fix primal (sqnorm length matters) --- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index 914c41e156d4..16aecf8de96f 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -78,7 +78,7 @@ pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64] for ik in 0..k { subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); - main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(d, &qxcentered); + main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(&qxcentered); } slse = slse + log_sum_exp(k, &main_term); @@ -158,13 +158,13 @@ fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiag let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); let out = (0..k).map(|ik| { - let frobenius = sqnorm(p, &qdiags[ik * p as usize..]) + sqnorm(icf_sz - p, &icf[ik * icf_sz as usize + p as usize..]); + let frobenius = sqnorm(&qdiags[ik * p as usize..][..p]) + sqnorm(&icf[ik * icf_sz as usize + p as usize..][..icf_sz -p]); 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] }).sum::(); out - k as f64 * c } -fn sqnorm(n: usize, x: &[f64]) -> 
f64 { +fn sqnorm(x: &[f64]) -> f64 { x.iter().map(|x| x * x).sum() } From 114f2369dbca5cdf572e5dc599c88f94eb017fb6 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Sat, 30 Mar 2024 23:48:16 -0600 Subject: [PATCH 20/88] bench gmm: quash rust warnings --- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index 16aecf8de96f..a2ba1d041689 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -28,9 +28,9 @@ pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64 let wishart: Wishart = unsafe { *wishart }; let mut my_err = unsafe { *err }; - let mut d_alphas = unsafe { std::slice::from_raw_parts_mut(dalphas, k) }; - let mut d_means = unsafe { std::slice::from_raw_parts_mut(dmeans, k * d) }; - let mut d_icf = unsafe { std::slice::from_raw_parts_mut(dicf, k * d * (d + 1) / 2) }; + let d_alphas = unsafe { std::slice::from_raw_parts_mut(dalphas, k) }; + let d_means = unsafe { std::slice::from_raw_parts_mut(dmeans, k * d) }; + let d_icf = unsafe { std::slice::from_raw_parts_mut(dicf, k * d * (d + 1) / 2) }; let mut my_derr = unsafe { *derr }; dgmm_objective(d, k, n, alphas, d_alphas, means, d_means, icf, d_icf, x, wishart.gamma, wishart.m, &mut my_err, &mut my_derr); @@ -77,7 +77,7 @@ pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64] for ix in 0..n { for ik in 0..k { subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); - Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); + qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * 
sqnorm(&qxcentered); } @@ -119,7 +119,7 @@ fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { } } -fn Qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { +fn qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { assert!(out.len() >= d); assert!(q_diag.len() >= d); assert!(x.len() >= d); From 012cf4ccc62c0a3cfd27f074ef70977899b72bfe Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 31 Mar 2024 11:41:43 -0400 Subject: [PATCH 21/88] adding ba benchmark --- enzyme/benchmarks/ReverseMode/ba/Cargo.lock | 7 + enzyme/benchmarks/ReverseMode/ba/Cargo.toml | 18 ++ enzyme/benchmarks/ReverseMode/ba/src/lib.rs | 221 +++++++++++++++++++ enzyme/benchmarks/ReverseMode/ba/src/main.rs | 26 +++ 4 files changed, 272 insertions(+) create mode 100644 enzyme/benchmarks/ReverseMode/ba/Cargo.lock create mode 100644 enzyme/benchmarks/ReverseMode/ba/Cargo.toml create mode 100644 enzyme/benchmarks/ReverseMode/ba/src/lib.rs create mode 100644 enzyme/benchmarks/ReverseMode/ba/src/main.rs diff --git a/enzyme/benchmarks/ReverseMode/ba/Cargo.lock b/enzyme/benchmarks/ReverseMode/ba/Cargo.lock new file mode 100644 index 000000000000..7e322bed2b9a --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ba/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "bars" +version = "0.1.0" diff --git a/enzyme/benchmarks/ReverseMode/ba/Cargo.toml b/enzyme/benchmarks/ReverseMode/ba/Cargo.toml new file mode 100644 index 000000000000..1abfe3da5163 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ba/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "bars" +version = "0.1.0" +edition = "2021" + + +[lib] +crate-type = ["cdylib"] + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[profile.release] +lto = "fat" + +[profile.dev] +lto = "fat" + +[dependencies] diff --git a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs new file mode 100644 index 000000000000..412a6a477109 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs @@ -0,0 +1,221 @@ +#![feature(autodiff)] +#![feature(slice_first_last_chunk)] +#![allow(non_snake_case)] + +//#define BA_NCAMPARAMS 11 +static BA_NCAMPARAMS: usize = 11; + +fn sqsum(x: &[f64]) -> f64 { + x.iter().map(|&v| v * v).sum() +} + +#[inline] +fn cross(a: &[f64; 3], b: &[f64; 3]) -> [f64; 3] { + [ + a[1] * b[2] - a[2] * b[1], + a[2] * b[0] - a[0] * b[2], + a[0] * b[1] - a[1] * b[0], + ] +} + +fn radial_distort(rad_params: &[f64], proj: &mut [f64]) { + let rsq = sqsum(proj); + let l = 1. + rad_params[0] * rsq + rad_params[1] * rsq * rsq; + proj[0] = proj[0] * l; + proj[1] = proj[1] * l; +} + +fn rodrigues_rotate_point(rot: &[f64; 3], pt: &[f64; 3], rotated_pt: &mut [f64; 3]) { + let sqtheta = sqsum(rot); + if sqtheta != 0. { + let theta = sqtheta.sqrt(); + let costheta = theta.cos(); + let sintheta = theta.sin(); + let theta_inverse = 1. / theta; + let w = rot.map(|v| v * theta_inverse); + let w_cross_pt = cross(&w, &pt); + let tmp = (w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * (1. 
- costheta); + for i in 0..3 { + rotated_pt[i] = pt[i] * costheta + w_cross_pt[i] * sintheta + w[i] * tmp; + } + } else { + let rot_cross_pt = cross(&rot, &pt); + for i in 0..3 { + rotated_pt[i] = pt[i] + rot_cross_pt[i]; + } + } +} + +fn project(cam: &[f64; 11], X: &[f64; 3], proj: &mut [f64; 2]) { + let C = &cam[3..6]; + let mut Xo = [0.; 3]; + let mut Xcam = [0.; 3]; + + Xo[0] = X[0] - C[0]; + Xo[1] = X[1] - C[1]; + Xo[2] = X[2] - C[2]; + + rodrigues_rotate_point(cam.first_chunk::<3>().unwrap(), &Xo, &mut Xcam); + + proj[0] = Xcam[0] / Xcam[2]; + proj[1] = Xcam[1] / Xcam[2]; + + radial_distort(&cam[9..], proj); + + proj[0] = proj[0] * cam[6] + cam[7]; + proj[1] = proj[1] * cam[6] + cam[8]; +} + +#[no_mangle] +pub extern "C" fn dcompute_reproj_error( + cam: *const [f64; 11], + dcam: *mut [f64; 11], + x: *const [f64; 3], + dx: *mut [f64; 3], + w: *const [f64; 1], + wb: *mut [f64; 1], + feat: *const [f64; 2], + err: *mut [f64; 2], + derr: *mut [f64; 2], +) { + rust_dcompute_reproj_error(cam, dcam, x, dx, w, wb, feat, err, derr); +} + +#[no_mangle] +pub extern "C" fn dcompute_zach_weight_error( + w: *const f64, + dw: *mut f64, + err: *mut f64, + derr: *mut f64, +) { + rust_dcompute_zach_weight_error(w, dw, err, derr); +} + +#[autodiff( + rust_dcompute_reproj_error, + Reverse, + Duplicated, + Duplicated, + Duplicated, + Const, + Duplicated +)] +pub fn compute_reproj_error( + cam: *const [f64; 11], + x: *const [f64; 3], + w: *const [f64; 1], + feat: *const [f64; 2], + err: *mut [f64; 2], +) { + let cam = unsafe { &*cam }; + let w = unsafe { *(*w).get_unchecked(0) }; + let x = unsafe { &*x }; + let feat = unsafe { &*feat }; + let mut err = unsafe { &mut *err }; + let mut proj = [0.; 2]; + project(cam, x, &mut proj); + err[0] = w * (proj[0] - feat[0]); + err[1] = w * (proj[1] - feat[1]); +} + +#[autodiff(rust_dcompute_zach_weight_error, Reverse, Duplicated, Duplicated)] +pub fn compute_zach_weight_error(w: *const f64, err: *mut f64) { + let w = unsafe { *w }; + let mut 
err = unsafe { *err }; + err = 1. - w * w; +} + +// n number of cameras +// m number of points +// p number of observations +// cams: 11*n cameras in format [r1 r2 r3 C1 C2 C3 f u0 v0 k1 k2] +// r1, r2, r3 are angle - axis rotation parameters(Rodrigues) +// [C1 C2 C3]' is the camera center +// f is the focal length in pixels +// [u0 v0]' is the principal point +// k1, k2 are radial distortion parameters +// X: 3*m points +// obs: 2*p observations (pairs cameraIdx, pointIdx) +// feats: 2*p features (x,y coordinates corresponding to observations) +// reproj_err: 2*p errors of observations +// w_err: p weight "error" terms +fn rust_ba_objective( + n: usize, + m: usize, + p: usize, + cams: &[f64], + x: &[f64], + w: &[f64], + obs: &[i32], + feats: &[f64], + reproj_err: &mut [f64], + w_err: &mut [f64], +) { + assert_eq!(cams.len(), n * 11); + assert_eq!(x.len(), m * 3); + assert_eq!(w.len(), p); + assert_eq!(obs.len(), p * 2); + assert_eq!(feats.len(), p * 2); + assert_eq!(reproj_err.len(), p * 2); + assert_eq!(w_err.len(), p); + + for i in 0..p { + let cam_idx = obs[i * 2 + 0] as usize; + let pt_idx = obs[i * 2 + 1] as usize; + let start = cam_idx * BA_NCAMPARAMS; + let cam: &[f64; 11] = unsafe { + cams[start..] + .get_unchecked(..11) + .try_into() + .unwrap_unchecked() + }; + let x: &[f64; 3] = unsafe { + x[pt_idx * 3..] + .get_unchecked(..3) + .try_into() + .unwrap_unchecked() + }; + let w: &[f64; 1] = unsafe { w[i..].get_unchecked(..1).try_into().unwrap_unchecked() }; + let feat: &[f64; 2] = unsafe { + feats[i * 2..] + .get_unchecked(..2) + .try_into() + .unwrap_unchecked() + }; + let reproj_err: &mut [f64; 2] = unsafe { + reproj_err[i * 2..] 
+ .get_unchecked_mut(..2) + .try_into() + .unwrap_unchecked() + }; + compute_reproj_error(cam, x, w, feat, reproj_err); + } + + for i in 0..p { + let w_err: &mut f64 = unsafe { w_err.get_unchecked_mut(i) }; + compute_zach_weight_error(w[i..].as_ptr(), w_err as *mut f64); + } +} + +#[no_mangle] +extern "C" fn ba_objective( + n: usize, + m: usize, + p: usize, + cams: *const f64, + x: *const f64, + w: *const f64, + obs: *const i32, + feats: *const f64, + reproj_err: *mut f64, + w_err: *mut f64, +) { + let cams = unsafe { std::slice::from_raw_parts(cams, n * 11) }; + let x = unsafe { std::slice::from_raw_parts(x, m * 3) }; + let w = unsafe { std::slice::from_raw_parts(w, p) }; + let obs = unsafe { std::slice::from_raw_parts(obs, p * 2) }; + let feats = unsafe { std::slice::from_raw_parts(feats, p * 2) }; + let reproj_err = unsafe { std::slice::from_raw_parts_mut(reproj_err, p * 2) }; + let w_err = unsafe { std::slice::from_raw_parts_mut(w_err, p) }; + rust_ba_objective(n, m, p, cams, x, w, obs, feats, reproj_err, w_err); +} diff --git a/enzyme/benchmarks/ReverseMode/ba/src/main.rs b/enzyme/benchmarks/ReverseMode/ba/src/main.rs new file mode 100644 index 000000000000..13f221be69c1 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ba/src/main.rs @@ -0,0 +1,26 @@ +use bars::{dcompute_reproj_error, dcompute_zach_weight_error}; +fn main() { + let cam = [0.0; 11]; + let mut dcam = [0.0; 11]; + let x = [0.0; 3]; + let mut dx = [0.0; 3]; + let w = [0.0; 1]; + let mut dw = [0.0; 1]; + let feat = [0.0; 2]; + let mut err = [0.0; 2]; + let mut derr = [0.0; 2]; + dcompute_reproj_error( + &cam as *const [f64;11], + &mut dcam as *mut [f64;11], + &x as *const [f64;3], + &mut dx as *mut [f64;3], + &w as *const [f64;1], + &mut dw as *mut [f64;1], + &feat as *const [f64;2], + &mut err as *mut [f64;2], + &mut derr as *mut [f64;2], + ); + + let mut wb = 0.0; + dcompute_zach_weight_error(&w as *const f64, &mut dw as *mut f64, &mut err as *mut f64, &mut derr as *mut f64); +} From 
0430e4489817d5ca9201ccfcdd167107ae6caf9a Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Mon, 1 Apr 2024 13:37:05 -0400 Subject: [PATCH 22/88] Benchmark ba --- enzyme/benchmarks/ReverseMode/adbench/ba.h | 150 +++++++++++++++++- enzyme/benchmarks/ReverseMode/ba/Cargo.lock | 9 ++ enzyme/benchmarks/ReverseMode/ba/Cargo.toml | 1 + .../benchmarks/ReverseMode/ba/Makefile.make | 20 +-- enzyme/benchmarks/ReverseMode/ba/src/lib.rs | 14 +- 5 files changed, 172 insertions(+), 22 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/ba.h b/enzyme/benchmarks/ReverseMode/adbench/ba.h index 3ade86a0b7b2..5d9178120e76 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/ba.h +++ b/enzyme/benchmarks/ReverseMode/adbench/ba.h @@ -127,6 +127,19 @@ extern "C" { double* reproj_err, double* w_err ); + + void rust2_ba_objective( + int n, + int m, + int p, + double const* cams, + double const* X, + double const* w, + int const* obs, + double const* feats, + double* reproj_err, + double* w_err + ); void dcompute_reproj_error( double const* cam, @@ -169,6 +182,20 @@ extern "C" { ); void adept_compute_zach_weight_error(double const* w, double* dw, double* err, double* derr); + + void rust_dcompute_reproj_error( + double const* cam, + double * dcam, + double const* X, + double * dX, + double const* w, + double * wb, + double const* feat, + double *err, + double *derr + ); + + void rust_dcompute_zach_weight_error(double const* w, double* dw, double* err, double* derr); } void read_ba_instance(const string& fn, @@ -486,9 +513,9 @@ int main(const int argc, const char* argv[]) { gettimeofday(&start, NULL); calculate_jacobian(input, result); gettimeofday(&end, NULL); - printf("Enzyme combined %0.6f\n", tdiff(&start, &end)); + printf("Enzyme c++ combined %0.6f\n", tdiff(&start, &end)); json enzyme; - enzyme["name"] = "Enzyme combined"; + enzyme["name"] = "Enzyme c++ combined"; enzyme["runtime"] = tdiff(&start, &end); for(unsigned i=0; i<5; i++) { printf("%f ", result.J.vals[i]); @@ 
-499,6 +526,125 @@ int main(const int argc, const char* argv[]) { } } + + { + struct BAInput input; + read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, input.X, input.w, input.obs, input.feats); + + struct BAOutput result = { + std::vector(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p) + }; + + + { + struct timeval start, end; + gettimeofday(&start, NULL); + ba_objective( + input.n, + input.m, + input.p, + input.cams.data(), + input.X.data(), + input.w.data(), + input.obs.data(), + input.feats.data(), + result.reproj_err.data(), + result.w_err.data() + ); + gettimeofday(&end, NULL); + printf("primal c++ t=%0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "primal c++"; + enzyme["runtime"] = tdiff(&start, &end); + for(unsigned i=0; i<5; i++) { + printf("%f ", result.reproj_err[i]); + enzyme["result"].push_back(result.reproj_err[i]); + } + for(unsigned i=0; i<5; i++) { + printf("%f ", result.w_err[i]); + enzyme["result"].push_back(result.w_err[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } + + + { + struct BAInput input; + read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, input.X, input.w, input.obs, input.feats); + + struct BAOutput result = { + std::vector(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p) + }; + { + + struct timeval start, end; + gettimeofday(&start, NULL); + rust2_ba_objective( + input.n, + input.m, + input.p, + input.cams.data(), + input.X.data(), + input.w.data(), + input.obs.data(), + input.feats.data(), + result.reproj_err.data(), + result.w_err.data() + ); + gettimeofday(&end, NULL); + printf("primal rust t=%0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "primal rust"; + enzyme["runtime"] = tdiff(&start, &end); + for(unsigned i=0; i<5; i++) { + printf("%f ", result.reproj_err[i]); + enzyme["result"].push_back(result.reproj_err[i]); + } + for(unsigned i=0; i<5; i++) { + printf("%f 
", result.w_err[i]); + enzyme["result"].push_back(result.w_err[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } + + { + + struct BAInput input; + read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, input.X, input.w, input.obs, input.feats); + + struct BAOutput result = { + std::vector(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p) + }; + + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme rust combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme rust combined"; + enzyme["runtime"] = tdiff(&start, &end); + for(unsigned i=0; i<5; i++) { + printf("%f ", result.J.vals[i]); + enzyme["result"].push_back(result.J.vals[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + + } + test_suite["llvm-version"] = __clang_version__; test_suite["mode"] = "ReverseMode"; test_suite["batch-size"] = 1; diff --git a/enzyme/benchmarks/ReverseMode/ba/Cargo.lock b/enzyme/benchmarks/ReverseMode/ba/Cargo.lock index 7e322bed2b9a..74e2768e7cd4 100644 --- a/enzyme/benchmarks/ReverseMode/ba/Cargo.lock +++ b/enzyme/benchmarks/ReverseMode/ba/Cargo.lock @@ -5,3 +5,12 @@ version = 3 [[package]] name = "bars" version = "0.1.0" +dependencies = [ + "libm", +] + +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" diff --git a/enzyme/benchmarks/ReverseMode/ba/Cargo.toml b/enzyme/benchmarks/ReverseMode/ba/Cargo.toml index 1abfe3da5163..160c7716f3d8 100644 --- a/enzyme/benchmarks/ReverseMode/ba/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/ba/Cargo.toml @@ -16,3 +16,4 @@ lto = "fat" lto = "fat" [dependencies] +libm = { version = "0.2.8", optional = true } diff --git a/enzyme/benchmarks/ReverseMode/ba/Makefile.make 
b/enzyme/benchmarks/ReverseMode/ba/Makefile.make index 6f0f2cc18242..8a13a0e524fb 100644 --- a/enzyme/benchmarks/ReverseMode/ba/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/ba/Makefile.make @@ -1,23 +1,17 @@ -# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%loadEnzyme" make -B ba-unopt.ll ba-raw.ll results.json -f %s +# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B ba.o results.json -f %s .PHONY: clean +dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..) + clean: rm -f *.ll *.o results.txt results.json -%-unopt.ll: %.cpp - clang++ $(BENCH) $^ -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -Xclang -new-struct-path-tbaa -o $@ -S -emit-llvm - #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -Xclang -new-struct-path-tbaa -o $@ -S -emit-llvm - -%-raw.ll: %-unopt.ll - opt $^ $(LOAD) -enzyme -o $@ -S - -%-opt.ll: %-raw.ll - opt $^ -o $@ -S - #opt $^ -O2 -o $@ -S +$(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a: src/lib.rs Cargo.toml + ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm -ba.o: ba-opt.ll - clang++ -O2 $^ -o $@ $(BENCHLINK) +ba.o: ba.cpp $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a + clang++ $(LOAD) $(BENCH) ba.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o ba.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: ba.o ./$^ diff --git a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs index 412a6a477109..82318144f63f 100644 --- a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs @@ -67,7 +67,7 @@ fn project(cam: &[f64; 11], X: &[f64; 3], proj: &mut [f64; 2]) { } 
#[no_mangle] -pub extern "C" fn dcompute_reproj_error( +pub extern "C" fn rust_dcompute_reproj_error( cam: *const [f64; 11], dcam: *mut [f64; 11], x: *const [f64; 3], @@ -78,21 +78,21 @@ pub extern "C" fn dcompute_reproj_error( err: *mut [f64; 2], derr: *mut [f64; 2], ) { - rust_dcompute_reproj_error(cam, dcam, x, dx, w, wb, feat, err, derr); + dcompute_reproj_error(cam, dcam, x, dx, w, wb, feat, err, derr); } #[no_mangle] -pub extern "C" fn dcompute_zach_weight_error( +pub extern "C" fn rust_dcompute_zach_weight_error( w: *const f64, dw: *mut f64, err: *mut f64, derr: *mut f64, ) { - rust_dcompute_zach_weight_error(w, dw, err, derr); + dcompute_zach_weight_error(w, dw, err, derr); } #[autodiff( - rust_dcompute_reproj_error, + dcompute_reproj_error, Reverse, Duplicated, Duplicated, @@ -118,7 +118,7 @@ pub fn compute_reproj_error( err[1] = w * (proj[1] - feat[1]); } -#[autodiff(rust_dcompute_zach_weight_error, Reverse, Duplicated, Duplicated)] +#[autodiff(dcompute_zach_weight_error, Reverse, Duplicated, Duplicated)] pub fn compute_zach_weight_error(w: *const f64, err: *mut f64) { let w = unsafe { *w }; let mut err = unsafe { *err }; @@ -198,7 +198,7 @@ fn rust_ba_objective( } #[no_mangle] -extern "C" fn ba_objective( +extern "C" fn rust2_ba_objective( n: usize, m: usize, p: usize, From 4b0062bd396ac04a089ff718913e4aa6a1ee1471 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 1 Apr 2024 21:49:23 -0400 Subject: [PATCH 23/88] fix ba primal --- enzyme/benchmarks/ReverseMode/ba/src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs index 82318144f63f..768f3fec8e38 100644 --- a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs @@ -121,8 +121,7 @@ pub fn compute_reproj_error( #[autodiff(dcompute_zach_weight_error, Reverse, Duplicated, Duplicated)] pub fn compute_zach_weight_error(w: *const f64, err: *mut f64) 
{ let w = unsafe { *w }; - let mut err = unsafe { *err }; - err = 1. - w * w; + unsafe { *err = 1. - w * w; } } // n number of cameras From 1f27479dbecd8de0a9e1eb10f95b7e562d4e9863 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 4 Apr 2024 01:36:17 -0400 Subject: [PATCH 24/88] adding unsafe gmm version --- enzyme/benchmarks/ReverseMode/adbench/gmm.h | 104 ++++++++--- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 162 +---------------- enzyme/benchmarks/ReverseMode/gmm/src/main.rs | 4 +- enzyme/benchmarks/ReverseMode/gmm/src/safe.rs | 165 ++++++++++++++++++ .../benchmarks/ReverseMode/gmm/src/unsafe.rs | 147 ++++++++++++++++ 5 files changed, 397 insertions(+), 185 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/gmm/src/safe.rs create mode 100644 enzyme/benchmarks/ReverseMode/gmm/src/unsafe.rs diff --git a/enzyme/benchmarks/ReverseMode/adbench/gmm.h b/enzyme/benchmarks/ReverseMode/adbench/gmm.h index a25b4d0ded54..45d589c7ae75 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/gmm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/gmm.h @@ -58,7 +58,19 @@ void gmm_objective( alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart wishart, double *err, double * errb); - + + void rust_unsafe_dgmm_objective(int d, int k, int n, const double *alphas, + double *alphasb, const double *means, + double *meansb, const double *icf, + double *icfb, const double *x, + Wishart &wishart, double *err, + double *errb); + + void rust_unsafe_gmm_objective(int d, int k, int n, const double *alphas, + const double *means, const double *icf, + const double *x, Wishart &wishart, + double *err); + void rust_dgmm_objective(int d, int k, int n, const double *alphas, double * alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart &wishart, double *err, double * @@ -203,10 +215,11 @@ int main(const int argc, const char* argv[]) { std::vector paths;// = { "1k/gmm_d10_K100.txt" }; - 
getTests(paths, "data/1k", "1k/"); - getTests(paths, "data/2.5k", "2.5k/"); - getTests(paths, "data/10k", "10k/"); - + // getTests(paths, "data/1k", "1k/"); + // getTests(paths, "data/2.5k", "2.5k/"); + // getTests(paths, "data/10k", "10k/"); + paths.push_back("1k/gmm_d2_K5.txt"); + std::ofstream jsonfile("results.json", std::ofstream::trunc); json test_results; @@ -256,26 +269,27 @@ int main(const int argc, const char* argv[]) { struct GMMOutput result = { 0, std::vector(Jcols) }; - try { - struct timeval start, end; - gettimeofday(&start, NULL); - calculate_jacobian(input, result); - gettimeofday(&end, NULL); - printf("Adept combined %0.6f\n", tdiff(&start, &end)); - json adept; - adept["name"] = "Adept combined"; - adept["runtime"] = tdiff(&start, &end); - for (unsigned i = result.gradient.size() - 5; - i < result.gradient.size(); i++) { - printf("%f ", result.gradient[i]); - adept["result"].push_back(result.gradient[i]); + if (0) { + try { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Adept combined %0.6f\n", tdiff(&start, &end)); + json adept; + adept["name"] = "Adept combined"; + adept["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; + i < result.gradient.size(); i++) { + printf("%f ", result.gradient[i]); + adept["result"].push_back(result.gradient[i]); + } + printf("\n"); + test_suite["tools"].push_back(adept); + } catch (std::bad_alloc) { + printf("Adept combined 88888888 ooms\n"); } - printf("\n"); - test_suite["tools"].push_back(adept); - } catch(std::bad_alloc) { - printf("Adept combined 88888888 ooms\n"); } - } { @@ -331,6 +345,49 @@ int main(const int argc, const char* argv[]) { primal["result"].push_back(res); test_suite["tools"].push_back(primal); } + { + struct timeval start, end; + gettimeofday(&start, NULL); + auto res = primal(input); + gettimeofday(&end, NULL); + printf("rust unsafe primal combined t=%0.6f, err=%f\n", + 
tdiff(&start, &end), res); + json primal; + primal["name"] = "Rust unsafe primal"; + primal["runtime"] = tdiff(&start, &end); + primal["result"].push_back(res); + test_suite["tools"].push_back(primal); + } + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme unsafe rust combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Rust unsafe Enzyme combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } + + { + + struct GMMInput input; + read_gmm_instance("data/" + path, &input.d, &input.k, &input.n, + input.alphas, input.means, input.icf, input.x, + input.wishart, params.replicate_point); + + int Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; + + struct GMMOutput result = {0, std::vector(Jcols)}; + { struct timeval start, end; gettimeofday(&start, NULL); @@ -360,7 +417,6 @@ int main(const int argc, const char* argv[]) { printf("\n"); test_suite["tools"].push_back(enzyme); } - } test_suite["llvm-version"] = __clang_version__; test_suite["mode"] = "ReverseMode"; diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index a2ba1d041689..8fcb11ffed10 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -1,149 +1,9 @@ #![feature(autodiff)] -use std::f64::consts::PI; +pub mod r#unsafe; +pub mod safe; -#[cfg(feature = "libm")] -use libm::lgamma; +use r#unsafe::dgmm_objective as dgmm_objective; -#[cfg(not(feature = "libm"))] -mod cmath { - extern "C" { - pub fn lgamma(x: f64) -> f64; - } -} -#[cfg(not(feature = "libm"))] -#[inline] -fn lgamma(x: f64) -> f64 { - unsafe { cmath::lgamma(x) } -} - -#[no_mangle] -pub 
extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { - let k = k as usize; - let n = n as usize; - let d = d as usize; - let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; - let means = unsafe { std::slice::from_raw_parts(means, k * d) }; - let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; - let x = unsafe { std::slice::from_raw_parts(x, n * d) }; - let wishart: Wishart = unsafe { *wishart }; - let mut my_err = unsafe { *err }; - - let d_alphas = unsafe { std::slice::from_raw_parts_mut(dalphas, k) }; - let d_means = unsafe { std::slice::from_raw_parts_mut(dmeans, k * d) }; - let d_icf = unsafe { std::slice::from_raw_parts_mut(dicf, k * d * (d + 1) / 2) }; - let mut my_derr = unsafe { *derr }; - - dgmm_objective(d, k, n, alphas, d_alphas, means, d_means, icf, d_icf, x, wishart.gamma, wishart.m, &mut my_err, &mut my_derr); - - unsafe { *err = my_err }; - unsafe { *derr = my_derr }; -} - -#[no_mangle] -pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { - let k = k as usize; - let n = n as usize; - let d = d as usize; - let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; - let means = unsafe { std::slice::from_raw_parts(means, k * d) }; - let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; - let x = unsafe { std::slice::from_raw_parts(x, n * d) }; - let wishart: Wishart = unsafe { *wishart }; - let mut my_err = unsafe { *err }; - gmm_objective(d, k, n, alphas, means, icf, x, wishart.gamma, wishart.m, &mut my_err); - unsafe { *err = my_err }; -} - -//#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] -//pub fn gmm_objective_c(d: 
usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { -// gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); -//} - -#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Const, Duplicated)] -pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], gamma: f64, m: i32, err: &mut f64) { - let wishart: Wishart = Wishart { gamma, m }; - //let wishart: Wishart = unsafe { *wishart }; - let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); - let icf_sz = d * (d + 1) / 2; - let mut qdiags = vec![0.; d * k]; - let mut sum_qs = vec![0.; k]; - let mut xcentered = vec![0.; d]; - let mut qxcentered = vec![0.; d]; - let mut main_term = vec![0.; k]; - - preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); - - let mut slse = 0.; - for ix in 0..n { - for ik in 0..k { - subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); - qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); - main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(&qxcentered); - } - - slse = slse + log_sum_exp(k, &main_term); - } - - let lse_alphas = log_sum_exp(k, alphas); - - *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); -} - -fn arr_max(n: usize, x: &[f64]) -> f64 { - let mut max = f64::NEG_INFINITY; - for i in 0..n { - if max < x[i] { - max = x[i]; - } - } - max -} - -fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { - let icf_sz = d * (d + 1) / 2; - for ik in 0..k { - sum_qs[ik as usize] = 0.; - for id in 0..d { - let q = icf[ik as usize * icf_sz as usize + id as usize]; - sum_qs[ik as usize] = sum_qs[ik as usize] + q; - qdiags[ik as usize * d as usize + 
id as usize] = q.exp(); - } - } -} -fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { - assert!(x.len() >= d); - assert!(y.len() >= d); - assert!(out.len() >= d); - for i in 0..d { - out[i] = x[i] - y[i]; - } -} - -fn qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { - assert!(out.len() >= d); - assert!(q_diag.len() >= d); - assert!(x.len() >= d); - for i in 0..d { - out[i] = q_diag[i] * x[i]; - } - - for i in 0..d { - let mut lparamsidx = i*(2*d-i-1)/2; - for j in i + 1..d { - out[j] = out[j] + ltri[lparamsidx] * x[i]; - lparamsidx += 1; - } - } -} - -fn log_sum_exp(n: usize, x: &[f64]) -> f64 { - let mx = arr_max(n, x); - let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); - semx.ln() + mx -} -fn log_gamma_distrib(a: f64, p: f64) -> f64 { - 0.25 * p * (p - 1.) * PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() -} #[derive(Clone, Copy)] #[repr(C)] @@ -151,20 +11,4 @@ pub struct Wishart { pub gamma: f64, pub m: i32, } -fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { - let n = p + wishart.m as usize + 1; - let icf_sz = p * (p + 1) / 2; - - let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); - - let out = (0..k).map(|ik| { - let frobenius = sqnorm(&qdiags[ik * p as usize..][..p]) + sqnorm(&icf[ik * icf_sz as usize + p as usize..][..icf_sz -p]); - 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] - }).sum::(); - - out - k as f64 * c -} -fn sqnorm(x: &[f64]) -> f64 { - x.iter().map(|x| x * x).sum() -} diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/main.rs b/enzyme/benchmarks/ReverseMode/gmm/src/main.rs index 8f4357588ab8..e7ebf74d0aa2 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/main.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/main.rs @@ -1,5 +1,5 @@ #![feature(autodiff)] -use gmmrs::{Wishart, dgmm_objective}; +use 
gmmrs::{Wishart, r#unsafe::dgmm_objective}; fn main() { let d = 2; @@ -20,5 +20,5 @@ fn main() { let mut d_err2 = &mut d_err; let wishart2 = &wishart; // pass as raw ptr: - dgmm_objective(d, k, n, alphas.as_ptr(), d_alphas.as_mut_ptr(), means.as_ptr(), d_means.as_mut_ptr(), icf.as_ptr(), d_icf.as_mut_ptr(), x.as_ptr(), wishart2 as *const Wishart, err2 as *mut f64, d_err2 as *mut f64); + unsafe {dgmm_objective(d, k, n, alphas.as_ptr(), d_alphas.as_mut_ptr(), means.as_ptr(), d_means.as_mut_ptr(), icf.as_ptr(), d_icf.as_mut_ptr(), x.as_ptr(), wishart2 as *const Wishart, err2 as *mut f64, d_err2 as *mut f64);} } diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs new file mode 100644 index 000000000000..5f954347f1d7 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs @@ -0,0 +1,165 @@ +//#![feature(autodiff)] +use std::f64::consts::PI; +use crate::Wishart; + +#[cfg(feature = "libm")] +use libm::lgamma; + +#[cfg(not(feature = "libm"))] +mod cmath { + extern "C" { + pub fn lgamma(x: f64) -> f64; + } +} +#[cfg(not(feature = "libm"))] +#[inline] +fn lgamma(x: f64) -> f64 { + unsafe { cmath::lgamma(x) } +} + +#[no_mangle] +pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { + let k = k as usize; + let n = n as usize; + let d = d as usize; + let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; + let means = unsafe { std::slice::from_raw_parts(means, k * d) }; + let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; + let x = unsafe { std::slice::from_raw_parts(x, n * d) }; + let wishart: Wishart = unsafe { *wishart }; + let mut my_err = unsafe { *err }; + + let d_alphas = unsafe { std::slice::from_raw_parts_mut(dalphas, k) }; + let d_means = unsafe { std::slice::from_raw_parts_mut(dmeans, 
k * d) }; + let d_icf = unsafe { std::slice::from_raw_parts_mut(dicf, k * d * (d + 1) / 2) }; + let mut my_derr = unsafe { *derr }; + + dgmm_objective(d, k, n, alphas, d_alphas, means, d_means, icf, d_icf, x, wishart.gamma, wishart.m, &mut my_err, &mut my_derr); + + unsafe { *err = my_err }; + unsafe { *derr = my_derr }; +} + +#[no_mangle] +pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + let k = k as usize; + let n = n as usize; + let d = d as usize; + let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; + let means = unsafe { std::slice::from_raw_parts(means, k * d) }; + let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; + let x = unsafe { std::slice::from_raw_parts(x, n * d) }; + let wishart: Wishart = unsafe { *wishart }; + let mut my_err = unsafe { *err }; + gmm_objective(d, k, n, alphas, means, icf, x, wishart.gamma, wishart.m, &mut my_err); + unsafe { *err = my_err }; +} + +//#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +//pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { +// gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); +//} + +#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Const, Duplicated)] +pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], gamma: f64, m: i32, err: &mut f64) { + let wishart: Wishart = Wishart { gamma, m }; + //let wishart: Wishart = unsafe { *wishart }; + let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); + let icf_sz = d * (d + 1) / 2; + let mut qdiags = vec![0.; d * k]; + let mut sum_qs = vec![0.; k]; + let mut xcentered = vec![0.; d]; + let 
mut qxcentered = vec![0.; d]; + let mut main_term = vec![0.; k]; + + preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); + + let mut slse = 0.; + for ix in 0..n { + for ik in 0..k { + subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); + qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); + main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(&qxcentered); + } + + slse = slse + log_sum_exp(k, &main_term); + } + + let lse_alphas = log_sum_exp(k, alphas); + + *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); +} + +fn arr_max(n: usize, x: &[f64]) -> f64 { + let mut max = f64::NEG_INFINITY; + for i in 0..n { + if max < x[i] { + max = x[i]; + } + } + max +} + +fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { + let icf_sz = d * (d + 1) / 2; + for ik in 0..k { + sum_qs[ik as usize] = 0.; + for id in 0..d { + let q = icf[ik as usize * icf_sz as usize + id as usize]; + sum_qs[ik as usize] = sum_qs[ik as usize] + q; + qdiags[ik as usize * d as usize + id as usize] = q.exp(); + } + } +} +fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { + assert!(x.len() >= d); + assert!(y.len() >= d); + assert!(out.len() >= d); + for i in 0..d { + out[i] = x[i] - y[i]; + } +} + +fn qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { + assert!(out.len() >= d); + assert!(q_diag.len() >= d); + assert!(x.len() >= d); + for i in 0..d { + out[i] = q_diag[i] * x[i]; + } + + for i in 0..d { + let mut lparamsidx = i*(2*d-i-1)/2; + for j in i + 1..d { + out[j] = out[j] + ltri[lparamsidx] * x[i]; + lparamsidx += 1; + } + } +} + +fn log_sum_exp(n: usize, x: &[f64]) -> f64 { + let mx = arr_max(n, x); + let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); + semx.ln() + mx +} +fn log_gamma_distrib(a: f64, p: f64) -> 
f64 { + 0.25 * p * (p - 1.) * PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() +} + +fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { + let n = p + wishart.m as usize + 1; + let icf_sz = p * (p + 1) / 2; + + let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); + + let out = (0..k).map(|ik| { + let frobenius = sqnorm(&qdiags[ik * p as usize..][..p]) + sqnorm(&icf[ik * icf_sz as usize + p as usize..][..icf_sz -p]); + 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] + }).sum::(); + + out - k as f64 * c +} + +fn sqnorm(x: &[f64]) -> f64 { + x.iter().map(|x| x * x).sum() +} diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/unsafe.rs b/enzyme/benchmarks/ReverseMode/gmm/src/unsafe.rs new file mode 100644 index 000000000000..b2730538c88e --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmm/src/unsafe.rs @@ -0,0 +1,147 @@ +use std::f64::consts::PI; +use crate::Wishart; + +#[cfg(feature = "libm")] +use libm::lgamma; + +#[cfg(not(feature = "libm"))] +mod cmath { + extern "C" { + pub fn lgamma(x: f64) -> f64; + } +} +#[cfg(not(feature = "libm"))] +#[inline] +fn lgamma(x: f64) -> f64 { + unsafe { cmath::lgamma(x) } +} + +#[no_mangle] +pub extern "C" fn rust_unsafe_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { + let k = k as usize; + let n = n as usize; + let d = d as usize; + unsafe { dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); } +} + +#[no_mangle] +pub extern "C" fn rust_unsafe_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + let k = k as usize; + let n = n 
as usize; + let d = d as usize; + unsafe {gmm_objective(d, k, n, alphas, means, icf, x, wishart, err); } +} + +//#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +//pub unsafe fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], gamma: f64, m: i32, err: &mut f64) { +// gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); +//} + +#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +pub unsafe fn gmm_objective(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); + let icf_sz = d * (d + 1) / 2; + let mut qdiags = vec![0.; d * k]; + let mut sum_qs = vec![0.; k]; + let mut xcentered = vec![0.; d]; + let mut qxcentered = vec![0.; d]; + let mut main_term = vec![0.; k]; + + preprocess_qs(d, k, icf, sum_qs.as_mut_ptr(), qdiags.as_mut_ptr()); + + let mut slse = 0.; + for ix in 0..n { + for ik in 0..k { + subtract(d, x.add(ix * d), means.add(ik * d), xcentered.as_mut_ptr()); + qtimesx(d, qdiags.as_mut_ptr().add(ik * d), icf.add(ik * icf_sz + d), xcentered.as_ptr(), qxcentered.as_mut_ptr()); + main_term[ik] = *alphas.add(ik) + sum_qs[ik] - 0.5 * sqnorm(d, qxcentered.as_ptr()); + //main_term[ik] = alphas[ik] + sum_qs[ik] - 0.5 * sqnorm(d, &Qxcentered[0]); + } + + slse = slse + log_sum_exp(k, main_term.as_ptr()); + } + + let lse_alphas = log_sum_exp(k, alphas); + + *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, *wishart, sum_qs.as_ptr(), qdiags.as_ptr(), icf); +} + +unsafe fn arr_max(n: usize, x: *const f64) -> f64 { + let mut max = f64::NEG_INFINITY; + for i in 0..n { + if max < *x.add(i) { + max = *x.add(i); + } + } + max +} + +unsafe fn preprocess_qs(d: usize, k: usize, icf: *const f64, sum_qs: 
*mut f64, qdiags: *mut f64) { + let icf_sz = d * (d + 1) / 2; + for ik in 0..k { + *sum_qs.add(ik) = 0.; + for id in 0..d { + let q = *icf.add(ik * icf_sz + id); + *sum_qs.add(ik) = *sum_qs.add(ik) + q; + *qdiags.add(ik * d + id) = q.exp(); + } + } +} + +unsafe fn subtract(d: usize, x: *const f64, y: *const f64, out: *mut f64) { + for i in 0..d { + *out.add(i) = *x.add(i) - *y.add(i); + } +} + +unsafe fn qtimesx(d: usize, q_diag: *const f64, ltri: *const f64, x: *const f64, out: *mut f64) { + for i in 0..d { + *out.add(i) = *q_diag.add(i) * *x.add(i); + } + + for i in 0..d { + let mut lparamsidx = i*(2*d-i-1)/2; + for j in i + 1..d { + *out.add(j) = *out.add(j) + *ltri.add(lparamsidx) * *x.add(i); + lparamsidx += 1; + } + } +} + +unsafe fn log_sum_exp(n: usize, x: *const f64) -> f64 { + let mx = arr_max(n, x); + let mut semx: f64 = 0.0; + + for i in 0..n { + semx = semx + (*x.add(i) - mx).exp(); + } + semx.ln() + mx +} + +fn log_gamma_distrib(a: f64, p: f64) -> f64 { + 0.25 * p * (p - 1.) * PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. 
- j as f64))).sum::<f64>() +} + +unsafe fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: *const f64, qdiags: *const f64, icf: *const f64) -> f64 { + let n = p + wishart.m as usize + 1; + let icf_sz = p * (p + 1) / 2; + + let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); + + let mut out = 0.; + + for ik in 0..k { + let frobenius = sqnorm(p, qdiags.add(ik * p)) + sqnorm(icf_sz - p, icf.add(ik * icf_sz + p)); + out = out + 0.5 * wishart.gamma * wishart.gamma * (frobenius) - wishart.m as f64 * *sum_qs.add(ik); + } + + out - k as f64 * c +} + +unsafe fn sqnorm(n: usize, x: *const f64) -> f64 { + let mut sum = 0.; + for i in 0..n { + sum += *x.add(i) * *x.add(i); + } + sum +} From af3e07891217adc18548d961fdf4a22be2985516 Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Thu, 4 Apr 2024 02:30:12 -0400 Subject: [PATCH 25/88] Add FFT and LSTM benchmark for Rust Enzyme --- enzyme/benchmarks/ReverseMode/adbench/lstm.h | 49 +++++ enzyme/benchmarks/ReverseMode/fft/Cargo.lock | 7 + enzyme/benchmarks/ReverseMode/fft/Cargo.toml | 18 ++ .../benchmarks/ReverseMode/fft/Makefile.make | 20 +-- enzyme/benchmarks/ReverseMode/fft/fft.cpp | 56 ++++++ enzyme/benchmarks/ReverseMode/fft/src/lib.rs | 106 +++++++++++ enzyme/benchmarks/ReverseMode/fft/src/main.rs | 14 ++ enzyme/benchmarks/ReverseMode/lstm/Cargo.lock | 7 + enzyme/benchmarks/ReverseMode/lstm/Cargo.toml | 18 ++ .../benchmarks/ReverseMode/lstm/Makefile.make | 19 +- enzyme/benchmarks/ReverseMode/lstm/src/lib.rs | 169 ++++++++++++++++++ .../benchmarks/ReverseMode/lstm/src/main.rs | 3 + 12 files changed, 466 insertions(+), 20 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/fft/Cargo.lock create mode 100644 enzyme/benchmarks/ReverseMode/fft/Cargo.toml create mode 100644 enzyme/benchmarks/ReverseMode/fft/src/lib.rs create mode 100644 enzyme/benchmarks/ReverseMode/fft/src/main.rs create mode 100644 enzyme/benchmarks/ReverseMode/lstm/Cargo.lock
create mode 100644 enzyme/benchmarks/ReverseMode/lstm/Cargo.toml create mode 100644 enzyme/benchmarks/ReverseMode/lstm/src/lib.rs create mode 100644 enzyme/benchmarks/ReverseMode/lstm/src/main.rs diff --git a/enzyme/benchmarks/ReverseMode/adbench/lstm.h b/enzyme/benchmarks/ReverseMode/adbench/lstm.h index e6d13303d1f8..bd765ad1dbd1 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/lstm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/lstm.h @@ -34,6 +34,20 @@ struct LSTMOutput { }; extern "C" { + void rust_dlstm_objective( + int l, + int c, + int b, + double const* main_params, + double* dmain_params, + double const* extra_params, + double* dextra_params, + double* state, + double const* sequence, + double* loss, + double* dloss + ); + void dlstm_objective( int l, int c, @@ -291,6 +305,41 @@ int main(const int argc, const char* argv[]) { } } + + { + + struct LSTMInput input = {}; + + // Read instance + read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, input.main_params, input.extra_params, input.state, + input.sequence); + + std::vector state = std::vector(input.state.size()); + + int Jcols = 8 * input.l * input.b + 3 * input.b; + struct LSTMOutput result = { 0, std::vector(Jcols) }; + + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme (Rust) combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme (Rust) combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; + i < result.gradient.size(); i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + test_suite["tools"].push_back(enzyme); + + printf("\n"); + } + + } + test_suite["llvm-version"] = __clang_version__; test_suite["mode"] = "ReverseMode"; test_suite["batch-size"] = 1; diff --git a/enzyme/benchmarks/ReverseMode/fft/Cargo.lock b/enzyme/benchmarks/ReverseMode/fft/Cargo.lock new file mode 
100644 index 000000000000..44847eca60f6 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/fft/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "fft" +version = "0.1.0" diff --git a/enzyme/benchmarks/ReverseMode/fft/Cargo.toml b/enzyme/benchmarks/ReverseMode/fft/Cargo.toml new file mode 100644 index 000000000000..5366aefa719e --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/fft/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "fft" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] + +[lib] +crate-type = ["lib"] + +[profile.release] +lto = "fat" +opt-level = 3 + +[profile.dev] +lto = "fat" diff --git a/enzyme/benchmarks/ReverseMode/fft/Makefile.make b/enzyme/benchmarks/ReverseMode/fft/Makefile.make index ffeddd5507df..a2de0fdbcc62 100644 --- a/enzyme/benchmarks/ReverseMode/fft/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/fft/Makefile.make @@ -1,23 +1,17 @@ -# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%loadEnzyme" make -B fft-unopt.ll fft-raw.ll fft-opt.ll results.txt VERBOSE=1 -f %s +# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B fft.o results.txt VERBOSE=1 -f %s .PHONY: clean +dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..) 
+ clean: rm -f *.ll *.o results.txt -%-unopt.ll: %.cpp - clang++ $(BENCH) $^ -O2 -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm - #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm - -%-raw.ll: %-unopt.ll - opt $^ $(LOAD) -enzyme -o $@ -S - -%-opt.ll: %-raw.ll - opt $^ -o $@ -S - #opt $^ -O2 -o $@ -S +$(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a: src/lib.rs Cargo.toml + cargo +enzyme rustc --release --lib --crate-type=staticlib -fft.o: fft-opt.ll - clang++ -O2 $^ -o $@ $(BENCHLINK) -lm +fft.o: fft.cpp $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a + clang++ $(LOAD) $(BENCH) fft.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o fft.o -lpthread $(BENCHLINK) -lm -lfft -L $(dir)/benchmarks/ReverseMode/fft/target/release/ -L /usr/lib/gcc/x86_64-linux-gnu/11 results.txt: fft.o ./$^ 1048576 | tee $@ diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.cpp b/enzyme/benchmarks/ReverseMode/fft/fft.cpp index cf9459b9597a..5c67b3be1678 100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.cpp +++ b/enzyme/benchmarks/ReverseMode/fft/fft.cpp @@ -33,6 +33,21 @@ extern "C" { int enzyme_dupnoneed; } +extern "C" void rust_dfoobar(int n, double* data, double* ddata); +extern "C" void rust_foobar(int n, double* data); + +static double rust_foobar_and_gradient(unsigned len) { + double *inp = new double[2*len]; + for(int i=0; i<2*len; i++) inp[i] = 2.0; + double *dinp = new double[2*len]; + for(int i=0; i<2*len; i++) dinp[i] = 1.0; + rust_dfoobar(len*2, inp, dinp); + double res = dinp[0]; + delete[] dinp; + delete[] inp; + return res; +} + static double foobar_and_gradient(unsigned len) { double *inp = new double[2*len]; for(int i=0; i<2*len; i++) inp[i] = 2.0; @@ -202,6 +217,46 @@ static void enzyme_sincos(double inp, unsigned len) { } } +static void enzyme_rust_sincos(double inp, 
unsigned len) { + + { + struct timeval start, end; + gettimeofday(&start, NULL); + + double *x = new double[2*len]; + for(int i=0; i<2*len; i++) x[i] = 2.0; + rust_foobar(len, x); + double res = x[0]; + + gettimeofday(&end, NULL); + printf("Enzyme (Rust) real %0.6f res=%f\n", tdiff(&start, &end), res); + delete[] x; + } + + { + struct timeval start, end; + gettimeofday(&start, NULL); + + double *x = new double[2*len]; + for(int i=0; i<2*len; i++) x[i] = 2.0; + rust_foobar(len, x); + double res = x[0]; + + gettimeofday(&end, NULL); + printf("Enzyme (Rust) forward %0.6f res=%f\n", tdiff(&start, &end), res); + delete[] x; + } + + { + struct timeval start, end; + gettimeofday(&start, NULL); + + double res2 = rust_foobar_and_gradient(len); + + gettimeofday(&end, NULL); + printf("Enzyme (Rust) combined %0.6f res'=%f\n", tdiff(&start, &end), res2); + } +} /* Function to check if x is power of 2*/ bool isPowerOfTwo (int x) @@ -233,5 +288,6 @@ int main(int argc, char** argv) { adept_sincos(inp, iters); tapenade_sincos(inp, iters); enzyme_sincos(inp, iters); + enzyme_rust_sincos(inp, iters); } } diff --git a/enzyme/benchmarks/ReverseMode/fft/src/lib.rs b/enzyme/benchmarks/ReverseMode/fft/src/lib.rs new file mode 100644 index 000000000000..e2df837805e0 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/fft/src/lib.rs @@ -0,0 +1,106 @@ +#![feature(autodiff)] + +use std::slice; +use std::f64::consts::PI; + +fn bitreversal_perm<T>(data: &mut [T]) { + let len = data.len() / 2; + let mut j = 1; + + let mut i = 1; + while i < 2*len { + if j > i { + //dbg!(&i, &j); + data.swap(j-1, i-1); + data.swap(j, i); + } + + let mut m = len; + while m >= 2 && j > m { + j -= m; + m >>= 1; + } + + j += m; + i += 2; + } +} + +fn radix2(data: &mut [f64], i_sign: f64, n: usize) { + if n == 1 { + return; + } + + let (a,b) = data.split_at_mut(n); + radix2(a, i_sign, n/2); + radix2(b, i_sign, n/2); + + let wtemp = i_sign * (PI / n as f64).sin(); + let wpi = -i_sign * (2.0 * PI / n as f64).sin(); + let wpr
= -2.0 * wtemp * wtemp; + let mut wr = 1.0; + let mut wi = 0.0; + + let mut i = 0; + while i < n { + let in_n = i + n; + + let tempr = data[in_n] * wr - data[in_n + 1] * wi; + let tempi = data[in_n] * wi + data[in_n + 1] * wr; + + data[in_n] = data[i] - tempr; + data[in_n + 1] = data[i + 1] - tempi; + data[i] += tempr; + data[i + 1] += tempi; + + let wtemp_new = wr; + wr += wr * wpr - wi * wpi; + wi += wi * wpr + wtemp_new * wpi; + + i += 2; + } +} + +fn rescale(data: &mut [f64], scale: f64) { + let scale = 1. / scale; + for elm in data { + *elm *= scale; + } +} + +fn fft(data: &mut [f64]) { + bitreversal_perm(data); + radix2(data, 1.0, data.len() / 2); +} + +fn ifft(data: &mut [f64]) { + bitreversal_perm(data); + radix2(data, -1.0, data.len() / 2); + rescale(data, data.len() as f64 / 2.); +} + +#[autodiff(dfoobar, Reverse, Duplicated)] +pub fn foobar(data: &mut [f64]) { + fft(data); + ifft(data); +} + +#[no_mangle] +pub extern "C" fn rust_dfoobar(n: usize, data: *mut f64, ddata: *mut f64) { + + let (data, ddata) = unsafe { + ( + slice::from_raw_parts_mut(data, n), + slice::from_raw_parts_mut(ddata, n) + ) + }; + + dfoobar(data, ddata); +} + +#[no_mangle] +pub extern "C" fn rust_foobar(n: usize, data: *mut f64) { + let data = unsafe { slice::from_raw_parts_mut(data, n) }; + + foobar(data); +} diff --git a/enzyme/benchmarks/ReverseMode/fft/src/main.rs b/enzyme/benchmarks/ReverseMode/fft/src/main.rs new file mode 100644 index 000000000000..f2a857806eb2 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/fft/src/main.rs @@ -0,0 +1,14 @@ +use fft::dfoobar; + +fn main() { + let mut data = vec![1.0; 32]; + for i in 0..16 { + data[i] = 2.0; + } + let mut data_d = vec![1.0; data.len()]; + + dfoobar(&mut data, &mut data_d); + + dbg!(&data_d); + dbg!(&data); +} diff --git a/enzyme/benchmarks/ReverseMode/lstm/Cargo.lock b/enzyme/benchmarks/ReverseMode/lstm/Cargo.lock new file mode 100644 index 000000000000..270bf4367433 --- /dev/null +++ 
b/enzyme/benchmarks/ReverseMode/lstm/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "lstm" +version = "0.1.0" diff --git a/enzyme/benchmarks/ReverseMode/lstm/Cargo.toml b/enzyme/benchmarks/ReverseMode/lstm/Cargo.toml new file mode 100644 index 000000000000..6e659faf3a3b --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/lstm/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "lstm" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] + +[lib] +crate-type = ["lib"] + +[profile.release] +lto = "fat" +opt-level = 3 + +[profile.dev] +lto = "fat" diff --git a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make index 4323ac694a08..f3cdb818b742 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make @@ -1,23 +1,28 @@ -# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%loadEnzyme" make -B lstm-raw.ll results.json -f %s +# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadEnzyme %enzyme" make -B lstm-raw.ll results.json -f %s .PHONY: clean +dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..) 
+ clean: - rm -f *.ll *.o results.txt + rm -f *.ll *.o results.json + +$(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a: src/lib.rs Cargo.toml + cargo +enzyme rustc --release --lib --crate-type=staticlib %-unopt.ll: %.cpp - clang++ $(BENCH) $^ -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm - #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm + clang++ $(BENCH) $^ -O2 --gcc-install-dir=/usr/lib/gcc/x86_64-linux-gnu/11 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm %-raw.ll: %-unopt.ll - opt $^ $(LOAD) -enzyme -o $@ -S + @echo $(LOAD) + opt $^ $(LOAD) -o $@ -S %-opt.ll: %-raw.ll opt $^ -o $@ -S #opt $^ -O2 -o $@ -S -lstm.o: lstm-opt.ll - clang++ -O2 $^ -o $@ $(BENCHLINK) -lm +lstm.o: lstm-opt.ll $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a + clang++ --gcc-install-dir=/usr/lib/gcc/x86_64-linux-gnu/11 -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a results.json: lstm.o ./$^ diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs b/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs new file mode 100644 index 000000000000..aba88ac76617 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs @@ -0,0 +1,169 @@ +#![feature(autodiff)] + +use std::slice; + +// Sigmoid on scalar +fn sigmoid(x: f64) -> f64 { + 1.0 / (1.0 + (-x).exp()) +} + +// log(sum(exp(x), 2)) +fn logsumexp(vect: &[f64]) -> f64 { + let mut sum = 0.0; + for &val in vect { + sum += val.exp(); + } + sum += 2.0; // Adding 2 to sum + sum.ln() +} + +// LSTM OBJECTIVE +// The LSTM model +fn lstm_model( + hsize: usize, + weight: &[f64], + bias: &[f64], + hidden: &mut [f64], + cell: &mut [f64], + input: &[f64], +) { + let mut gates = vec![0.0; 4 * hsize]; + let (a,b) = gates.split_at_mut(2*hsize); + let ((forget, ingate), (outgate, change)) = ( + a.split_at_mut(hsize), 
b.split_at_mut(hsize)); + + // caching input + for i in 0..hsize { + forget[i] = sigmoid(input[i] * weight[i] + bias[i]); + ingate[i] = sigmoid(hidden[i] * weight[hsize + i] + bias[hsize + i]); + outgate[i] = sigmoid(input[i] * weight[2 * hsize + i] + bias[2 * hsize + i]); + change[i] = (hidden[i] * weight[3 * hsize + i] + bias[3 * hsize + i]).tanh(); + } + + // caching cell + for i in 0..hsize { + cell[i] = cell[i] * forget[i] + ingate[i] * change[i]; + } + + for i in 0..hsize { + hidden[i] = outgate[i] * cell[i].tanh(); + } +} + +// Predict LSTM output given an input +fn lstm_predict( + l: usize, + b: usize, + w: &[f64], + w2: &[f64], + s: &mut [f64], + x: &[f64], + x2: &mut [f64], +) { + for i in 0..b { + x2[i] = x[i] * w2[i]; + } + + let mut i = 0; + while i <= 2*l*b - 1 { + // make borrow-checker happy with non-overlapping mutable references + let (xp, s1, s2) = if i == 0 { + let (s1, s2) = s.split_at_mut(b); + (x2.as_mut(), s1, s2) + } else { + let tmp = &mut s[i-2*b..]; + let (a, d) = tmp.split_at_mut(2*b); + let (d, c) = d.split_at_mut(b); + + (a,d,c) + }; + + lstm_model( + b, + &w[i * 4..], + &w[(i + b) * 4..], + s1, + s2, + xp, + ); + + i += 2 * b; + } + + let xp = &s[i-2*b..]; + + for i in 0..b { + x2[i] = xp[i] * w2[b + i] + w2[2 * b + i]; + } +} + +// LSTM objective (loss function) +#[autodiff(d_lstm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Const, Const, Duplicated)] +pub fn lstm_objective( + l: usize, + c: usize, + b: usize, + main_params: &[f64], + extra_params: &[f64], + state: &mut [f64], + sequence: &[f64], + loss: &mut f64, +) { + let mut total = 0.0; + let mut count = 0; + + let mut input = &sequence[..b]; + let mut ypred = vec![0.0; b]; + let mut ynorm = vec![0.0; b]; + let mut lse; + + assert!(b > 0); + + for t in (0..=(c - 1) * b - 1).step_by(b) { + lstm_predict(l, b, main_params, extra_params, state, input, &mut ypred); + lse = logsumexp(&ypred); + for i in 0..b { + ynorm[i] = ypred[i] - lse; + } + + let ygold = 
&sequence[t + b..]; + for i in 0..b { + total += ygold[i] * ynorm[i]; + } + + count += b; + input = ygold; + } + + *loss = -total / count as f64; +} + +#[no_mangle] +pub extern "C" fn rust_lstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { + let (main_params, extra_params, state, sequence) = unsafe {( + slice::from_raw_parts(main_params, 2*l*4*b), + slice::from_raw_parts(extra_params, 3*b), + slice::from_raw_parts_mut(state, 2*l*b), + slice::from_raw_parts(sequence, c*b) + )}; + + unsafe { + lstm_objective(l,c,b,main_params,extra_params,state,sequence, &mut *loss); + } +} + +#[no_mangle] +pub extern "C" fn rust_dlstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { + let (main_params, d_main_params, extra_params, d_extra_params, state, sequence) = unsafe {( + slice::from_raw_parts(main_params, 2*l*4*b), + slice::from_raw_parts_mut(d_main_params, 2*l*4*b), + slice::from_raw_parts(extra_params, 3*b), + slice::from_raw_parts_mut(d_extra_params, 3*b), + slice::from_raw_parts_mut(state, 2*l*b), + slice::from_raw_parts(sequence, c*b) + )}; + + unsafe { + d_lstm_objective(l,c,b,main_params,d_main_params, extra_params,d_extra_params, state,sequence, &mut *res, &mut *d_res); + } +} diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/main.rs b/enzyme/benchmarks/ReverseMode/lstm/src/main.rs new file mode 100644 index 000000000000..e7a11a969c03 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/lstm/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + println!("Hello, world!"); +} From c8bcfe9aa02c5b8037ae38f94621822198b422e7 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 4 Apr 2024 03:24:08 -0400 Subject: [PATCH 26/88] adding unsafe Rust fft version (how to run?) 
--- enzyme/benchmarks/ReverseMode/fft/fft.cpp | 63 ++++++++++ enzyme/benchmarks/ReverseMode/fft/src/lib.rs | 104 +-------------- enzyme/benchmarks/ReverseMode/fft/src/safe.rs | 104 +++++++++++++++ enzyme/benchmarks/ReverseMode/fft/src/unsf.rs | 119 ++++++++++++++++++ 4 files changed, 288 insertions(+), 102 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/fft/src/safe.rs create mode 100644 enzyme/benchmarks/ReverseMode/fft/src/unsf.rs diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.cpp b/enzyme/benchmarks/ReverseMode/fft/fft.cpp index 5c67b3be1678..3c566c33b31a 100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.cpp +++ b/enzyme/benchmarks/ReverseMode/fft/fft.cpp @@ -33,9 +33,25 @@ extern "C" { int enzyme_dupnoneed; } +extern "C" void rust_unsafe_dfoobar(int n, double *data, double *ddata); +extern "C" void rust_unsafe_foobar(int n, double *data); extern "C" void rust_dfoobar(int n, double* data, double* ddata); extern "C" void rust_foobar(int n, double* data); +static double rust_unsafe_foobar_and_gradient(unsigned len) { + double *inp = new double[2 * len]; + for (int i = 0; i < 2 * len; i++) + inp[i] = 2.0; + double *dinp = new double[2 * len]; + for (int i = 0; i < 2 * len; i++) + dinp[i] = 1.0; + rust_unsafe_dfoobar(len * 2, inp, dinp); + double res = dinp[0]; + delete[] dinp; + delete[] inp; + return res; +} + static double rust_foobar_and_gradient(unsigned len) { double *inp = new double[2*len]; for(int i=0; i<2*len; i++) inp[i] = 2.0; @@ -217,6 +233,51 @@ static void enzyme_sincos(double inp, unsigned len) { } } +static void enzyme_unsafe_rust_sincos(double inp, unsigned len) { + + { + struct timeval start, end; + gettimeofday(&start, NULL); + + double *x = new double[2 * len]; + for (int i = 0; i < 2 * len; i++) + x[i] = 2.0; + rust_unsafe_foobar(len, x); + double res = x[0]; + + gettimeofday(&end, NULL); + printf("Enzyme (unsafe Rust) real %0.6f res=%f\n", tdiff(&start, &end), res); + delete[] x; + } + + { + struct timeval start, end; + 
gettimeofday(&start, NULL); + + double *x = new double[2 * len]; + for (int i = 0; i < 2 * len; i++) + x[i] = 2.0; + rust_unsafe_foobar(len, x); + double res = x[0]; + + gettimeofday(&end, NULL); + printf("Enzyme (unsafe Rust) forward %0.6f res=%f\n", tdiff(&start, &end), + res); + delete[] x; + } + + { + struct timeval start, end; + gettimeofday(&start, NULL); + + double res2 = rust_unsafe_foobar_and_gradient(len); + + gettimeofday(&end, NULL); + printf("Enzyme (unsafe Rust) combined %0.6f res'=%f\n", tdiff(&start, &end), + res2); + } +} + static void enzyme_rust_sincos(double inp, unsigned len) { { @@ -281,6 +342,7 @@ int main(int argc, char** argv) { printf("usage %s n [must be power of 2]\n", argv[0]); return 1; } + N = 2; double inp = -2.1; for(unsigned iters=max(1, N>>5); iters <= N; iters*=2) { @@ -289,5 +351,6 @@ int main(int argc, char** argv) { tapenade_sincos(inp, iters); enzyme_sincos(inp, iters); enzyme_rust_sincos(inp, iters); + // enzyme_unsafe_rust_sincos(inp, iters); } } diff --git a/enzyme/benchmarks/ReverseMode/fft/src/lib.rs b/enzyme/benchmarks/ReverseMode/fft/src/lib.rs index e2df837805e0..47b0aa1e97fd 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/lib.rs @@ -1,106 +1,6 @@ #![feature(autodiff)] -use std::slice; -use std::f64::consts::PI; +pub mod safe; +pub mod unsf; -fn bitreversal_perm<T>(data: &mut [T]) { - let len = data.len() / 2; - let mut j = 1; - let mut i = 1; - while i < 2*len { - if j > i { - //dbg!(&i, &j); - data.swap(j-1, i-1); - data.swap(j, i); - } - - let mut m = len; - while m >= 2 && j > m { - j -= m; - m >>= 1; - } - - j += m; - i += 2; - } -} - -fn radix2(data: &mut [f64], i_sign: f64, n: usize) { - if n == 1 { - return; - } - - let (a,b) = data.split_at_mut(n); - radix2(a, i_sign, n/2); - radix2(b, i_sign, n/2); - - let wtemp = i_sign * (PI / n as f64).sin(); - let wpi = -i_sign * (2.0 * PI / n as f64).sin(); - let wpr = -2.0 * wtemp * wtemp; - let mut wr = 1.0; - let mut
wi = 0.0; - - let mut i = 0; - while i < n { - let in_n = i + n; - - let tempr = data[in_n] * wr - data[in_n + 1] * wi; - let tempi = data[in_n] * wi + data[in_n + 1] * wr; - - data[in_n] = data[i] - tempr; - data[in_n + 1] = data[i + 1] - tempi; - data[i] += tempr; - data[i + 1] += tempi; - - let wtemp_new = wr; - wr += wr * wpr - wi * wpi; - wi += wi * wpr + wtemp_new * wpi; - - i += 2; - } -} - -fn rescale(data: &mut [f64], scale: f64) { - let scale = 1. / scale; - for elm in data { - *elm *= scale; - } -} - -fn fft(data: &mut [f64]) { - bitreversal_perm(data); - radix2(data, 1.0, data.len() / 2); -} - -fn ifft(data: &mut [f64]) { - bitreversal_perm(data); - radix2(data, -1.0, data.len() / 2); - rescale(data, data.len() as f64 / 2.); -} - -#[autodiff(dfoobar, Reverse, Duplicated)] -pub fn foobar(data: &mut [f64]) { - fft(data); - ifft(data); -} - -#[no_mangle] -pub extern "C" fn rust_dfoobar(n: usize, data: *mut f64, ddata: *mut f64) { - - let (data, ddata) = unsafe { - ( - slice::from_raw_parts_mut(data, n), - slice::from_raw_parts_mut(ddata, n) - ) - }; - - dfoobar(data, ddata); -} - -#[no_mangle] -pub extern "C" fn rust_foobar(n: usize, data: *mut f64) { - let data = unsafe { slice::from_raw_parts_mut(data, n) }; - - foobar(data); -} diff --git a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs new file mode 100644 index 000000000000..e17599b12683 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs @@ -0,0 +1,104 @@ +use std::slice; +use std::f64::consts::PI; + +fn bitreversal_perm<T>(data: &mut [T]) { + let len = data.len() / 2; + let mut j = 1; + + let mut i = 1; + while i < 2*len { + if j > i { + //dbg!(&i, &j); + data.swap(j-1, i-1); + data.swap(j, i); + } + + let mut m = len; + while m >= 2 && j > m { + j -= m; + m >>= 1; + } + + j += m; + i += 2; + } +} + +fn radix2(data: &mut [f64], i_sign: f64, n: usize) { + if n == 1 { + return; + } + + let (a,b) = data.split_at_mut(n); + radix2(a, i_sign, n/2);
+ radix2(b, i_sign, n/2); + + let wtemp = i_sign * (PI / n as f64).sin(); + let wpi = -i_sign * (2.0 * PI / n as f64).sin(); + let wpr = -2.0 * wtemp * wtemp; + let mut wr = 1.0; + let mut wi = 0.0; + + let mut i = 0; + while i < n { + let in_n = i + n; + + let tempr = data[in_n] * wr - data[in_n + 1] * wi; + let tempi = data[in_n] * wi + data[in_n + 1] * wr; + + data[in_n] = data[i] - tempr; + data[in_n + 1] = data[i + 1] - tempi; + data[i] += tempr; + data[i + 1] += tempi; + + let wtemp_new = wr; + wr += wr * wpr - wi * wpi; + wi += wi * wpr + wtemp_new * wpi; + + i += 2; + } +} + +fn rescale(data: &mut [f64], scale: f64) { + let scale = 1. / scale; + for elm in data { + *elm *= scale; + } +} + +fn fft(data: &mut [f64]) { + bitreversal_perm(data); + radix2(data, 1.0, data.len() / 2); +} + +fn ifft(data: &mut [f64]) { + bitreversal_perm(data); + radix2(data, -1.0, data.len() / 2); + rescale(data, data.len() as f64 / 2.); +} + +#[autodiff(dfoobar, Reverse, Duplicated)] +pub fn foobar(data: &mut [f64]) { + fft(data); + ifft(data); +} + +#[no_mangle] +pub extern "C" fn rust_dfoobar(n: usize, data: *mut f64, ddata: *mut f64) { + + let (data, ddata) = unsafe { + ( + slice::from_raw_parts_mut(data, n), + slice::from_raw_parts_mut(ddata, n) + ) + }; + + dfoobar(data, ddata); +} + +#[no_mangle] +pub extern "C" fn rust_foobar(n: usize, data: *mut f64) { + let data = unsafe { slice::from_raw_parts_mut(data, n) }; + + foobar(data); +} diff --git a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs new file mode 100644 index 000000000000..6c5d086ffdf1 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs @@ -0,0 +1,119 @@ +use std::f64::consts::PI; + +//static void scramble(double* data, unsigned N) { +// int j=1; +// for (int i=1; i<2*N; i+=2) { +// if (j>i) { +// swap(&data[j-1], &data[i-1]); +// swap(&data[j], &data[i]); +// } +// int m = N; +// while (m>=2 && j>m) { +// j -= m; +// m >>= 1; +// } +// j += m; +// } 
+//} +unsafe fn bitreversal_perm(data: *mut f64, n: usize) { + //let len = data.len() / 2; + let mut j = 1; + + for i in (1..2*n).step_by(2) { + //let mut i = 1; + //while i < 2*len { + if j > i { + std::ptr::swap(data.add(j-1), data.add(i-1)); + std::ptr::swap(data.add(j), data.add(i)); + //data.swap(j-1, i-1); + //data.swap(j, i); + } + + let mut m = n; + while m >= 2 && j > m { + j -= m; + m >>= 1; + } + + j += m; + //i += 2; + } +} + +unsafe fn radix2(data: *mut f64, i_sign: f64, n: usize) { + if n == 1 { + return; + } + + let b = data.add(n); + let a = data; + //let (a,b) = data.split_at_mut(n); + radix2(a, i_sign, n/2); + radix2(b, i_sign, n/2); + + let wtemp = i_sign * (PI / n as f64).sin(); + let wpi = -i_sign * (2.0 * PI / n as f64).sin(); + let wpr = -2.0 * wtemp * wtemp; + let mut wr = 1.0; + let mut wi = 0.0; + + let mut i = 0; + while i < n { + let in_n = i + n; + + let tempr = *data.add(in_n) * wr - *data.add(in_n + 1) * wi; + let tempi = *data.add(in_n) * wi + *data.add(in_n + 1) * wr; + + *data.add(in_n) = *data.add(i) - tempr; + *data.add(in_n + 1) = *data.add(i + 1) - tempi; + *data.add(i) += tempr; + *data.add(i + 1) += tempi; + + let wtemp_new = wr; + wr += wr * wpr - wi * wpi; + wi += wi * wpr + wtemp_new * wpi; + + i += 2; + } +} + +//static void rescale(double* data, unsigned N) { +// double scale = ((double)1)/N; +// for (unsigned i=0; i<2*N; i++) { +// data[i] *= scale; +// } +//} + +unsafe fn rescale(data: *mut f64, n: usize) { + let scale = 1. 
/ n as f64; + for i in 0..2*n { + *data.add(i) = *data.add(i) * scale; + } +} + +unsafe fn fft(data: *mut f64, n: usize) { + bitreversal_perm(data, n); + radix2(data, 1.0, n); +} + +unsafe fn ifft(data: *mut f64, n: usize) { + bitreversal_perm(data, n); + radix2(data, -1.0, n); + rescale(data, n); +} + +#[autodiff(unsafe_dfoobar, Reverse, Const, Duplicated)] +pub unsafe fn unsafe_foobar(n: usize, data: *mut f64) { + fft(data, n); + ifft(data, n); +} + +#[no_mangle] +pub extern "C" fn rust_unsafe_dfoobar(n: usize, data: *mut f64, ddata: *mut f64) { + unsafe {unsafe_dfoobar(n, data, ddata); } +} + +#[no_mangle] +pub extern "C" fn rust_unsafe_foobar(n: usize, data: *mut f64) { + unsafe {unsafe_foobar(n, data); } +} From cad42213596cb5e3a6b22a0ab9b907197691663f Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 4 Apr 2024 17:31:27 -0400 Subject: [PATCH 27/88] imprv safe rus tto work like c++ --- enzyme/benchmarks/ReverseMode/fft/fft.cpp | 6 +++--- enzyme/benchmarks/ReverseMode/fft/src/main.rs | 18 +++++++++++++----- enzyme/benchmarks/ReverseMode/fft/src/safe.rs | 13 ++++++------- enzyme/benchmarks/ReverseMode/fft/src/unsf.rs | 4 ++-- 4 files changed, 24 insertions(+), 17 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.cpp b/enzyme/benchmarks/ReverseMode/fft/fft.cpp index 3c566c33b31a..6d16839a56ff 100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.cpp +++ b/enzyme/benchmarks/ReverseMode/fft/fft.cpp @@ -45,7 +45,7 @@ static double rust_unsafe_foobar_and_gradient(unsigned len) { double *dinp = new double[2 * len]; for (int i = 0; i < 2 * len; i++) dinp[i] = 1.0; - rust_unsafe_dfoobar(len * 2, inp, dinp); + rust_unsafe_dfoobar(len, inp, dinp); double res = dinp[0]; delete[] dinp; delete[] inp; @@ -57,7 +57,7 @@ static double rust_foobar_and_gradient(unsigned len) { for(int i=0; i<2*len; i++) inp[i] = 2.0; double *dinp = new double[2*len]; for(int i=0; i<2*len; i++) dinp[i] = 1.0; - rust_dfoobar(len*2, inp, dinp); + rust_dfoobar(len, inp, dinp); 
double res = dinp[0]; delete[] dinp; delete[] inp; @@ -351,6 +351,6 @@ int main(int argc, char** argv) { tapenade_sincos(inp, iters); enzyme_sincos(inp, iters); enzyme_rust_sincos(inp, iters); - // enzyme_unsafe_rust_sincos(inp, iters); + enzyme_unsafe_rust_sincos(inp, iters); } } diff --git a/enzyme/benchmarks/ReverseMode/fft/src/main.rs b/enzyme/benchmarks/ReverseMode/fft/src/main.rs index f2a857806eb2..5f76ad96243e 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/main.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/main.rs @@ -1,14 +1,22 @@ -use fft::dfoobar; +use core::mem; +use fft::safe;//::dfoobar; +use fft::unsf;//::dfoobar; fn main() { - let mut data = vec![1.0; 32]; - for i in 0..16 { + let len = 16; + let mut data = vec![1.0; 2*len]; + for i in 0..len { data[i] = 2.0; } - let mut data_d = vec![1.0; data.len()]; + let mut data_d = vec![1.0; 2*len]; - dfoobar(&mut data, &mut data_d); + //unsafe {safe::rust_dfoobar(len, data.as_mut_ptr(), data_d.as_mut_ptr());} + //unsafe {safe::rust_foobar(len, data.as_mut_ptr());} + unsafe {unsf::unsafe_dfoobar(len, data.as_mut_ptr(), data_d.as_mut_ptr());} + unsafe {unsf::unsafe_foobar(len, data.as_mut_ptr());} dbg!(&data_d); dbg!(&data); + //mem::forget(data); + //mem::forget(data_d); } diff --git a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs index e17599b12683..7332dcb91356 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs @@ -59,8 +59,8 @@ fn radix2(data: &mut [f64], i_sign: f64, n: usize) { } } -fn rescale(data: &mut [f64], scale: f64) { - let scale = 1. / scale; +fn rescale(data: &mut [f64], scale: usize) { + let scale = 1. 
/ scale as f64; for elm in data { *elm *= scale; } @@ -74,7 +74,7 @@ fn fft(data: &mut [f64]) { fn ifft(data: &mut [f64]) { bitreversal_perm(data); radix2(data, -1.0, data.len() / 2); - rescale(data, data.len() as f64 / 2.); + rescale(data, data.len() / 2); } #[autodiff(dfoobar, Reverse, Duplicated)] @@ -88,8 +88,8 @@ pub extern "C" fn rust_dfoobar(n: usize, data: *mut f64, ddata: *mut f64) { let (data, ddata) = unsafe { ( - slice::from_raw_parts_mut(data, n), - slice::from_raw_parts_mut(ddata, n) + slice::from_raw_parts_mut(data, n * 2), + slice::from_raw_parts_mut(ddata, n * 2) ) }; @@ -98,7 +98,6 @@ pub extern "C" fn rust_dfoobar(n: usize, data: *mut f64, ddata: *mut f64) { #[no_mangle] pub extern "C" fn rust_foobar(n: usize, data: *mut f64) { - let data = unsafe { slice::from_raw_parts_mut(data, n) }; - + let data = unsafe { slice::from_raw_parts_mut(data, n * 2) }; foobar(data); } diff --git a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs index 6c5d086ffdf1..653e495bb5f1 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs @@ -104,8 +104,8 @@ unsafe fn ifft(data: *mut f64, n: usize) { #[autodiff(unsafe_dfoobar, Reverse, Const, Duplicated)] pub unsafe fn unsafe_foobar(n: usize, data: *mut f64) { - fft(data, n); - ifft(data, n); + fft(data, n / 2); + ifft(data, n / 2); } #[no_mangle] From f0bf16b64e063d94c97270fd1a8a6103fd6a8a81 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 4 Apr 2024 17:52:28 -0400 Subject: [PATCH 28/88] unsafe version not crashing --- enzyme/benchmarks/ReverseMode/fft/src/unsf.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs index 653e495bb5f1..5391d035095f 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs @@ -15,11 +15,11 @@ use 
std::f64::consts::PI; // j += m; // } //} -unsafe fn bitreversal_perm(data: *mut f64, n: usize) { +unsafe fn bitreversal_perm(data: *mut f64, len: usize) { //let len = data.len() / 2; let mut j = 1; - for i in (1..2*n).step_by(2) { + for i in (1..2*len).step_by(2) { //let mut i = 1; //while i < 2*len { if j > i { @@ -29,7 +29,7 @@ unsafe fn bitreversal_perm(data: *mut f64, n: usize) { //data.swap(j, i); } - let mut m = n; + let mut m = len; while m >= 2 && j > m { j -= m; m >>= 1; @@ -104,8 +104,8 @@ unsafe fn ifft(data: *mut f64, n: usize) { #[autodiff(unsafe_dfoobar, Reverse, Const, Duplicated)] pub unsafe fn unsafe_foobar(n: usize, data: *mut f64) { - fft(data, n / 2); - ifft(data, n / 2); + fft(data, n ); + ifft(data, n ); } #[no_mangle] From 12e9a4a826091b4f228284361aa30d323bde63a0 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 4 Apr 2024 18:01:45 -0400 Subject: [PATCH 29/88] fix lstm makefile --- enzyme/benchmarks/ReverseMode/lstm/Makefile.make | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make index f3cdb818b742..51305ac7db2d 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make @@ -11,7 +11,7 @@ $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a: src/lib.rs Cargo.to cargo +enzyme rustc --release --lib --crate-type=staticlib %-unopt.ll: %.cpp - clang++ $(BENCH) $^ -O2 --gcc-install-dir=/usr/lib/gcc/x86_64-linux-gnu/11 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm + clang++ $(BENCH) $^ -pthread -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm %-raw.ll: %-unopt.ll @echo $(LOAD) @@ -22,7 +22,7 @@ $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a: src/lib.rs Cargo.to #opt $^ -O2 -o $@ -S lstm.o: lstm-opt.ll $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a - clang++ 
--gcc-install-dir=/usr/lib/gcc/x86_64-linux-gnu/11 -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a + clang++ -pthread -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a results.json: lstm.o ./$^ From 688721dfd3d8fbacde48e951b3c6eef6225e942d Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 4 Apr 2024 19:35:44 -0400 Subject: [PATCH 30/88] adding unsafe rust lstm version --- enzyme/benchmarks/ReverseMode/adbench/lstm.h | 118 +++++++------ .../benchmarks/ReverseMode/lstm/Makefile.make | 2 +- enzyme/benchmarks/ReverseMode/lstm/src/lib.rs | 151 ++-------------- .../benchmarks/ReverseMode/lstm/src/safe.rs | 167 ++++++++++++++++++ .../benchmarks/ReverseMode/lstm/src/unsf.rs | 114 ++++++++++++ 5 files changed, 362 insertions(+), 190 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/lstm/src/safe.rs create mode 100644 enzyme/benchmarks/ReverseMode/lstm/src/unsf.rs diff --git a/enzyme/benchmarks/ReverseMode/adbench/lstm.h b/enzyme/benchmarks/ReverseMode/adbench/lstm.h index bd765ad1dbd1..03107c62ec41 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/lstm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/lstm.h @@ -34,51 +34,33 @@ struct LSTMOutput { }; extern "C" { - void rust_dlstm_objective( - int l, - int c, - int b, - double const* main_params, - double* dmain_params, - double const* extra_params, - double* dextra_params, - double* state, - double const* sequence, - double* loss, - double* dloss - ); - - void dlstm_objective( - int l, - int c, - int b, - double const* main_params, - double* dmain_params, - double const* extra_params, - double* dextra_params, - double* state, - double const* sequence, - double* loss, - double* dloss - ); - - void lstm_objective_b(int l, int c, int b, const double *main_params, double * - main_paramsb, const double *extra_params, double *extra_paramsb, - double *state, const double *sequence, double *loss, double *lossb); - - void 
adept_dlstm_objective( - int l, - int c, - int b, - double const* main_params, - double* dmain_params, - double const* extra_params, - double* dextra_params, - double* state, - double const* sequence, - double* loss, - double* dloss - ); +void rust_unsafe_dlstm_objective(int l, int c, int b, double const *main_params, + double *dmain_params, + double const *extra_params, + double *dextra_params, double *state, + double const *sequence, double *loss, + double *dloss); + +void rust_safe_dlstm_objective(int l, int c, int b, double const *main_params, + double *dmain_params, double const *extra_params, + double *dextra_params, double *state, + double const *sequence, double *loss, + double *dloss); + +void dlstm_objective(int l, int c, int b, double const *main_params, + double *dmain_params, double const *extra_params, + double *dextra_params, double *state, + double const *sequence, double *loss, double *dloss); + +void lstm_objective_b(int l, int c, int b, const double *main_params, + double *main_paramsb, const double *extra_params, + double *extra_paramsb, double *state, + const double *sequence, double *loss, double *lossb); + +void adept_dlstm_objective(int l, int c, int b, double const *main_params, + double *dmain_params, double const *extra_params, + double *dextra_params, double *state, + double const *sequence, double *loss, double *dloss); } void read_lstm_instance(const string& fn, @@ -322,14 +304,14 @@ int main(const int argc, const char* argv[]) { { struct timeval start, end; gettimeofday(&start, NULL); - calculate_jacobian(input, result); + calculate_jacobian(input, result); gettimeofday(&end, NULL); - printf("Enzyme (Rust) combined %0.6f\n", tdiff(&start, &end)); + printf("Enzyme (safe Rust) combined %0.6f\n", tdiff(&start, &end)); json enzyme; - enzyme["name"] = "Enzyme (Rust) combined"; - enzyme["runtime"] = tdiff(&start, &end); - for (unsigned i = result.gradient.size() - 5; - i < result.gradient.size(); i++) { + enzyme["name"] = "Enzyme (safe 
Rust) combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { printf("%f ", result.gradient[i]); enzyme["result"].push_back(result.gradient[i]); } @@ -340,6 +322,40 @@ int main(const int argc, const char* argv[]) { } + { + + struct LSTMInput input = {}; + + // Read instance + read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, + input.main_params, input.extra_params, input.state, + input.sequence); + + std::vector state = std::vector(input.state.size()); + + int Jcols = 8 * input.l * input.b + 3 * input.b; + struct LSTMOutput result = {0, std::vector(Jcols)}; + + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme (unsafe Rust) combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme (unsafe Rust) combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + test_suite["tools"].push_back(enzyme); + + printf("\n"); + } + } + test_suite["llvm-version"] = __clang_version__; test_suite["mode"] = "ReverseMode"; test_suite["batch-size"] = 1; diff --git a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make index 51305ac7db2d..23ba9a51ceff 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make @@ -8,7 +8,7 @@ clean: rm -f *.ll *.o results.json $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a: src/lib.rs Cargo.toml - cargo +enzyme rustc --release --lib --crate-type=staticlib + ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib %-unopt.ll: %.cpp clang++ $(BENCH) $^ -pthread -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S 
-emit-llvm diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs b/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs index aba88ac76617..b6b0e3e33225 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs @@ -1,145 +1,16 @@ #![feature(autodiff)] +pub (crate) mod unsf; +pub (crate) mod safe; use std::slice; -// Sigmoid on scalar -fn sigmoid(x: f64) -> f64 { - 1.0 / (1.0 + (-x).exp()) -} - -// log(sum(exp(x), 2)) -fn logsumexp(vect: &[f64]) -> f64 { - let mut sum = 0.0; - for &val in vect { - sum += val.exp(); - } - sum += 2.0; // Adding 2 to sum - sum.ln() -} - -// LSTM OBJECTIVE -// The LSTM model -fn lstm_model( - hsize: usize, - weight: &[f64], - bias: &[f64], - hidden: &mut [f64], - cell: &mut [f64], - input: &[f64], -) { - let mut gates = vec![0.0; 4 * hsize]; - let (a,b) = gates.split_at_mut(2*hsize); - let ((forget, ingate), (outgate, change)) = ( - a.split_at_mut(hsize), b.split_at_mut(hsize)); - - // caching input - for i in 0..hsize { - forget[i] = sigmoid(input[i] * weight[i] + bias[i]); - ingate[i] = sigmoid(hidden[i] * weight[hsize + i] + bias[hsize + i]); - outgate[i] = sigmoid(input[i] * weight[2 * hsize + i] + bias[2 * hsize + i]); - change[i] = (hidden[i] * weight[3 * hsize + i] + bias[3 * hsize + i]).tanh(); - } - - // caching cell - for i in 0..hsize { - cell[i] = cell[i] * forget[i] + ingate[i] * change[i]; - } - - for i in 0..hsize { - hidden[i] = outgate[i] * cell[i].tanh(); - } -} - -// Predict LSTM output given an input -fn lstm_predict( - l: usize, - b: usize, - w: &[f64], - w2: &[f64], - s: &mut [f64], - x: &[f64], - x2: &mut [f64], -) { - for i in 0..b { - x2[i] = x[i] * w2[i]; - } - - let mut i = 0; - while i <= 2*l*b - 1 { - // make borrow-checker happy with non-overlapping mutable references - let (xp, s1, s2) = if i == 0 { - let (s1, s2) = s.split_at_mut(b); - (x2.as_mut(), s1, s2) - } else { - let tmp = &mut s[i-2*b..]; - let (a, d) = tmp.split_at_mut(2*b); - let (d, c) = 
d.split_at_mut(b); - - (a,d,c) - }; - - lstm_model( - b, - &w[i * 4..], - &w[(i + b) * 4..], - s1, - s2, - xp, - ); - i += 2 * b; - } - - let xp = &s[i-2*b..]; - - for i in 0..b { - x2[i] = xp[i] * w2[b + i] + w2[2 * b + i]; - } -} - -// LSTM objective (loss function) -#[autodiff(d_lstm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Const, Const, Duplicated)] -pub fn lstm_objective( - l: usize, - c: usize, - b: usize, - main_params: &[f64], - extra_params: &[f64], - state: &mut [f64], - sequence: &[f64], - loss: &mut f64, -) { - let mut total = 0.0; - let mut count = 0; - - let mut input = &sequence[..b]; - let mut ypred = vec![0.0; b]; - let mut ynorm = vec![0.0; b]; - let mut lse; - - assert!(b > 0); - - for t in (0..=(c - 1) * b - 1).step_by(b) { - lstm_predict(l, b, main_params, extra_params, state, input, &mut ypred); - lse = logsumexp(&ypred); - for i in 0..b { - ynorm[i] = ypred[i] - lse; - } - - let ygold = &sequence[t + b..]; - for i in 0..b { - total += ygold[i] * ynorm[i]; - } - - count += b; - input = ygold; - } - - *loss = -total / count as f64; +#[no_mangle] +pub extern "C" fn rust_unsafe_lstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { + unsafe {unsf::lstm_unsafe_objective(l,c,b,main_params,extra_params,state,sequence, loss);} } - #[no_mangle] -pub extern "C" fn rust_lstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { +pub extern "C" fn rust_safe_lstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { let (main_params, extra_params, state, sequence) = unsafe {( slice::from_raw_parts(main_params, 2*l*4*b), slice::from_raw_parts(extra_params, 3*b), @@ -148,12 +19,16 @@ pub extern "C" fn rust_lstm_objective(l: usize, c: usize, b: usize, 
main_params: )}; unsafe { - lstm_objective(l,c,b,main_params,extra_params,state,sequence, &mut *loss); + safe::lstm_objective(l,c,b,main_params,extra_params,state,sequence, &mut *loss); } } #[no_mangle] -pub extern "C" fn rust_dlstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { +pub extern "C" fn rust_unsafe_dlstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { + unsafe {unsf::d_lstm_unsafe_objective(l,c,b,main_params,d_main_params, extra_params,d_extra_params, state,sequence, res, d_res);} +} +#[no_mangle] +pub extern "C" fn rust_safe_dlstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { let (main_params, d_main_params, extra_params, d_extra_params, state, sequence) = unsafe {( slice::from_raw_parts(main_params, 2*l*4*b), slice::from_raw_parts_mut(d_main_params, 2*l*4*b), @@ -164,6 +39,6 @@ pub extern "C" fn rust_dlstm_objective(l: usize, c: usize, b: usize, main_params )}; unsafe { - d_lstm_objective(l,c,b,main_params,d_main_params, extra_params,d_extra_params, state,sequence, &mut *res, &mut *d_res); + safe::d_lstm_objective(l,c,b,main_params,d_main_params, extra_params,d_extra_params, state,sequence, &mut *res, &mut *d_res); } } diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs new file mode 100644 index 000000000000..8734998acfb7 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs @@ -0,0 +1,167 @@ +use std::slice; + +// Sigmoid on scalar +fn sigmoid(x: f64) -> f64 { + 1.0 / (1.0 + (-x).exp()) 
+} + +// log(sum(exp(x), 2)) +fn logsumexp(vect: &[f64]) -> f64 { + let mut sum = 0.0; + for &val in vect { + sum += val.exp(); + } + sum += 2.0; // Adding 2 to sum + sum.ln() +} + +// LSTM OBJECTIVE +// The LSTM model +fn lstm_model( + hsize: usize, + weight: &[f64], + bias: &[f64], + hidden: &mut [f64], + cell: &mut [f64], + input: &[f64], +) { + let mut gates = vec![0.0; 4 * hsize]; + let (a,b) = gates.split_at_mut(2*hsize); + let ((forget, ingate), (outgate, change)) = ( + a.split_at_mut(hsize), b.split_at_mut(hsize)); + + // caching input + for i in 0..hsize { + forget[i] = sigmoid(input[i] * weight[i] + bias[i]); + ingate[i] = sigmoid(hidden[i] * weight[hsize + i] + bias[hsize + i]); + outgate[i] = sigmoid(input[i] * weight[2 * hsize + i] + bias[2 * hsize + i]); + change[i] = (hidden[i] * weight[3 * hsize + i] + bias[3 * hsize + i]).tanh(); + } + + // caching cell + for i in 0..hsize { + cell[i] = cell[i] * forget[i] + ingate[i] * change[i]; + } + + for i in 0..hsize { + hidden[i] = outgate[i] * cell[i].tanh(); + } +} + +// Predict LSTM output given an input +fn lstm_predict( + l: usize, + b: usize, + w: &[f64], + w2: &[f64], + s: &mut [f64], + x: &[f64], + x2: &mut [f64], +) { + for i in 0..b { + x2[i] = x[i] * w2[i]; + } + + let mut i = 0; + while i <= 2*l*b - 1 { + // make borrow-checker happy with non-overlapping mutable references + let (xp, s1, s2) = if i == 0 { + let (s1, s2) = s.split_at_mut(b); + (x2.as_mut(), s1, s2) + } else { + let tmp = &mut s[i-2*b..]; + let (a, d) = tmp.split_at_mut(2*b); + let (d, c) = d.split_at_mut(b); + + (a,d,c) + }; + + lstm_model( + b, + &w[i * 4..], + &w[(i + b) * 4..], + s1, + s2, + xp, + ); + + i += 2 * b; + } + + let xp = &s[i-2*b..]; + + for i in 0..b { + x2[i] = xp[i] * w2[b + i] + w2[2 * b + i]; + } +} + +// LSTM objective (loss function) +#[autodiff(d_lstm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Const, Const, Duplicated)] +pub (crate) fn lstm_objective( + l: usize, + c: usize, + b: usize, 
+ main_params: &[f64], + extra_params: &[f64], + state: &mut [f64], + sequence: &[f64], + loss: &mut f64, +) { + let mut total = 0.0; + let mut count = 0; + + let mut input = &sequence[..b]; + let mut ypred = vec![0.0; b]; + let mut ynorm = vec![0.0; b]; + let mut lse; + + assert!(b > 0); + + for t in (0..=(c - 1) * b - 1).step_by(b) { + lstm_predict(l, b, main_params, extra_params, state, input, &mut ypred); + lse = logsumexp(&ypred); + for i in 0..b { + ynorm[i] = ypred[i] - lse; + } + + let ygold = &sequence[t + b..]; + for i in 0..b { + total += ygold[i] * ynorm[i]; + } + + count += b; + input = ygold; + } + + *loss = -total / count as f64; +} + +#[no_mangle] +pub extern "C" fn rust_lstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { + let (main_params, extra_params, state, sequence) = unsafe {( + slice::from_raw_parts(main_params, 2*l*4*b), + slice::from_raw_parts(extra_params, 3*b), + slice::from_raw_parts_mut(state, 2*l*b), + slice::from_raw_parts(sequence, c*b) + )}; + + unsafe { + lstm_objective(l,c,b,main_params,extra_params,state,sequence, &mut *loss); + } +} + +#[no_mangle] +pub extern "C" fn rust_dlstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { + let (main_params, d_main_params, extra_params, d_extra_params, state, sequence) = unsafe {( + slice::from_raw_parts(main_params, 2*l*4*b), + slice::from_raw_parts_mut(d_main_params, 2*l*4*b), + slice::from_raw_parts(extra_params, 3*b), + slice::from_raw_parts_mut(d_extra_params, 3*b), + slice::from_raw_parts_mut(state, 2*l*b), + slice::from_raw_parts(sequence, c*b) + )}; + + unsafe { + d_lstm_objective(l,c,b,main_params,d_main_params, extra_params,d_extra_params, state,sequence, &mut *res, &mut *d_res); + } +} diff --git 
a/enzyme/benchmarks/ReverseMode/lstm/src/unsf.rs b/enzyme/benchmarks/ReverseMode/lstm/src/unsf.rs new file mode 100644 index 000000000000..3758c8e1e97a --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/lstm/src/unsf.rs @@ -0,0 +1,114 @@ +// Sigmoid on scalar +fn sigmoid(x: f64) -> f64 { + 1.0 / (1.0 + (-x).exp()) +} + +// log(sum(exp(x), 2)) +unsafe fn logsumexp(vect: *const f64, sz: usize) -> f64 { + let mut sum: f64 = 0.0; + for i in 0..sz { + sum += (*vect.add(i)).exp(); + } + sum += 2.0; // Adding 2 to sum + sum.ln() +} + +// LSTM OBJECTIVE +// The LSTM model +unsafe fn lstm_model( + hsize: usize, + weight: *const f64, + bias: *const f64, + hidden: *mut f64, + cell: *mut f64, + input: *const f64, +) { +// // TODO NOTE THIS +// //__builtin_assume(hsize > 0); + let mut gates = vec![0.0; 4 * hsize]; + let forget: *mut f64 = gates.as_mut_ptr(); + let ingate: *mut f64 = gates[hsize..].as_mut_ptr(); + let outgate: *mut f64 = gates[2 * hsize..].as_mut_ptr(); + let change: *mut f64 = gates[3 * hsize..].as_mut_ptr(); + //let (a,b) = gates.split_at_mut(2*hsize); + //let ((forget, ingate), (outgate, change)) = ( + // a.split_at_mut(hsize), b.split_at_mut(hsize)); + + // caching input + for i in 0..hsize { + *forget.add(i) = sigmoid(*input.add(i) * *weight.add(i) + *bias.add(i)); + *ingate.add(i) = sigmoid(*hidden.add(i) * *weight.add(hsize + i) + *bias.add(hsize + i)); + *outgate.add(i) = sigmoid(*input.add(i) * *weight.add(2 * hsize + i) + *bias.add(2 * hsize + i)); + *change.add(i) = (*hidden.add(i) * *weight.add(3 * hsize + i) + *bias.add(3 * hsize + i)).tanh(); + } + + // caching cell + for i in 0..hsize { + *cell.add(i) = *cell.add(i) * *forget.add(i) + *ingate.add(i) * *change.add(i); + } + + for i in 0..hsize { + *hidden.add(i) = *outgate.add(i) * (*cell.add(i)).tanh(); + } +} + +// Predict LSTM output given an input +unsafe fn lstm_predict( + l: usize, + b: usize, + w: *const f64, + w2: *const f64, + s: *mut f64, + x: *const f64, + x2: *mut f64, +) { + for i in 
0..b { + *x2.add(i) = *x.add(i) * *w2.add(i); + } + + let mut xp = x2; + let stop = 2 * l * b; + for i in (0..=stop - 1).step_by(2 * b) { + lstm_model(b, w.add(i * 4), w.add((i + b) * 4), s.add(i), s.add(i + b), xp); + xp = s.add(i); + } + + for i in 0..b { + *x2.add(i) = *xp.add(i) * *w2.add(b + i) + *w2.add(2 * b + i); + } +} + +// LSTM objective (loss function) +#[autodiff(d_lstm_unsafe_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Const, Const, Duplicated)] +pub (crate) unsafe fn lstm_unsafe_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { + let mut total = 0.0; + let mut count = 0; + + //const double* input = &(sequence[0]); + let mut input = sequence; + let mut ypred = vec![0.0; b]; + let mut ynorm = vec![0.0; b]; + let mut lse; + + assert!(b > 0); + + let stop = (c - 1) * b; + for t in (0..=stop - 1).step_by(b) { + lstm_predict(l, b, main_params, extra_params, state, input, ypred.as_mut_ptr()); + lse = logsumexp(ypred.as_mut_ptr(), b); + for i in 0..b { + ynorm[i] = ypred[i] - lse; + } + + //let ygold = &sequence[t + b..]; + let ygold = sequence.add(t + b); + for i in 0..b { + total += *ygold.add(i) * ynorm[i]; + } + + count += b; + input = ygold; + } + + *loss = -total / count as f64; +} From 7ca8092e0ebbd0e1f433c736befcd6e4955afc9b Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 4 Apr 2024 19:36:27 -0400 Subject: [PATCH 31/88] run full fft tests --- enzyme/benchmarks/ReverseMode/fft/fft.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.cpp b/enzyme/benchmarks/ReverseMode/fft/fft.cpp index 6d16839a56ff..b6c5fb7b5eaa 100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.cpp +++ b/enzyme/benchmarks/ReverseMode/fft/fft.cpp @@ -342,7 +342,6 @@ int main(int argc, char** argv) { printf("usage %s n [must be power of 2]\n", argv[0]); return 1; } - N = 2; double inp = -2.1; for(unsigned iters=max(1, 
N>>5); iters <= N; iters*=2) { From 73f807567c232d50c44c4b0d7a14f86112456aa5 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 4 Apr 2024 20:00:38 -0400 Subject: [PATCH 32/88] Delete enzyme/benchmarks/ReverseMode/lstm/src/main.rs --- enzyme/benchmarks/ReverseMode/lstm/src/main.rs | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 enzyme/benchmarks/ReverseMode/lstm/src/main.rs diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/main.rs b/enzyme/benchmarks/ReverseMode/lstm/src/main.rs deleted file mode 100644 index e7a11a969c03..000000000000 --- a/enzyme/benchmarks/ReverseMode/lstm/src/main.rs +++ /dev/null @@ -1,3 +0,0 @@ -fn main() { - println!("Hello, world!"); -} From ba3aa5d7b32ab9af3159083b131d69e062296772 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sat, 6 Apr 2024 04:13:22 -0400 Subject: [PATCH 33/88] cleanup and correctness --- enzyme/benchmarks/ReverseMode/adbench/lstm.h | 96 +++++++++++++++++++ enzyme/benchmarks/ReverseMode/gmm/src/safe.rs | 8 +- enzyme/benchmarks/ReverseMode/lstm/src/lib.rs | 20 +++- 3 files changed, 113 insertions(+), 11 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/lstm.h b/enzyme/benchmarks/ReverseMode/adbench/lstm.h index 03107c62ec41..7472bf37beb2 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/lstm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/lstm.h @@ -41,6 +41,14 @@ void rust_unsafe_dlstm_objective(int l, int c, int b, double const *main_params, double const *sequence, double *loss, double *dloss); +void rust_unsafe_lstm_objective(int l, int c, int b, double const *main_params, + double const *extra_params, double *state, + double const *sequence, double *loss); + +void rust_safe_lstm_objective(int l, int c, int b, double const *main_params, + double const *extra_params, double *state, + double const *sequence, double *loss); + void rust_safe_dlstm_objective(int l, int c, int b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double 
*state, @@ -173,6 +181,28 @@ void calculate_jacobian(struct LSTMInput &input, struct LSTMOutput &result) } } +double calculate_unsafe_primal(struct LSTMInput &input) { + double loss = 0.0; + for (int i = 0; i < 100; i++) { + rust_unsafe_lstm_objective( + input.l, input.c, input.b, input.main_params.data(), + input.extra_params.data(), input.state.data(), + input.sequence.data(), &loss); + } + return loss; +} + +double calculate_safe_primal(struct LSTMInput &input) { + double loss = 0.0; + for (int i = 0; i < 100; i++) { + rust_safe_lstm_objective(input.l, input.c, input.b, + input.main_params.data(), + input.extra_params.data(), input.state.data(), + input.sequence.data(), &loss); + } + return loss; +} + int main(const int argc, const char* argv[]) { printf("starting main\n"); @@ -355,6 +385,72 @@ int main(const int argc, const char* argv[]) { printf("\n"); } } + { + + struct LSTMInput input = {}; + + // Read instance + read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, + input.main_params, input.extra_params, input.state, + input.sequence); + + std::vector state = std::vector(input.state.size()); + + int Jcols = 8 * input.l * input.b + 3 * input.b; + struct LSTMOutput result = {0, std::vector(Jcols)}; + + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_unsafe_primal(input); + gettimeofday(&end, NULL); + printf("Enzyme (unsafe Rust) primal %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme (unsafe Rust) primal"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + test_suite["tools"].push_back(enzyme); + + printf("\n"); + } + } + { + + struct LSTMInput input = {}; + + // Read instance + read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, + input.main_params, input.extra_params, input.state, + input.sequence); + + std::vector 
state = std::vector(input.state.size()); + + int Jcols = 8 * input.l * input.b + 3 * input.b; + struct LSTMOutput result = {0, std::vector(Jcols)}; + + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_safe_primal(input); + gettimeofday(&end, NULL); + printf("Enzyme (safe Rust) primal %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme (safe Rust) primal"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + test_suite["tools"].push_back(enzyme); + + printf("\n"); + } + } test_suite["llvm-version"] = __clang_version__; test_suite["mode"] = "ReverseMode"; diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs index 5f954347f1d7..e56b9d5609e5 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs @@ -1,4 +1,3 @@ -//#![feature(autodiff)] use std::f64::consts::PI; use crate::Wishart; @@ -55,17 +54,12 @@ pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, unsafe { *err = my_err }; } -//#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] -//pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { -// gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); -//} - #[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Const, Duplicated)] pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], gamma: f64, m: i32, err: &mut f64) { let wishart: Wishart = Wishart { gamma, m }; - //let wishart: Wishart = unsafe { *wishart }; let constant = -(n as 
f64) * d as f64 * 0.5 * (2.0 * PI).ln(); let icf_sz = d * (d + 1) / 2; + let mut qdiags = vec![0.; d * k]; let mut sum_qs = vec![0.; k]; let mut xcentered = vec![0.; d]; diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs b/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs index b6b0e3e33225..937460f3cee3 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs @@ -6,11 +6,17 @@ use std::slice; #[no_mangle] -pub extern "C" fn rust_unsafe_lstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { +pub extern "C" fn rust_unsafe_lstm_objective(l: i32, c: i32, b: i32, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { + let l = l as usize; + let c = c as usize; + let b = b as usize; unsafe {unsf::lstm_unsafe_objective(l,c,b,main_params,extra_params,state,sequence, loss);} } #[no_mangle] -pub extern "C" fn rust_safe_lstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { +pub extern "C" fn rust_safe_lstm_objective(l: i32, c: i32, b: i32, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { + let l = l as usize; + let c = c as usize; + let b = b as usize; let (main_params, extra_params, state, sequence) = unsafe {( slice::from_raw_parts(main_params, 2*l*4*b), slice::from_raw_parts(extra_params, 3*b), @@ -24,11 +30,17 @@ pub extern "C" fn rust_safe_lstm_objective(l: usize, c: usize, b: usize, main_pa } #[no_mangle] -pub extern "C" fn rust_unsafe_dlstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { +pub extern "C" fn rust_unsafe_dlstm_objective(l: i32, c: 
i32, b: i32, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { + let l = l as usize; + let c = c as usize; + let b = b as usize; unsafe {unsf::d_lstm_unsafe_objective(l,c,b,main_params,d_main_params, extra_params,d_extra_params, state,sequence, res, d_res);} } #[no_mangle] -pub extern "C" fn rust_safe_dlstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { +pub extern "C" fn rust_safe_dlstm_objective(l: i32, c: i32, b: i32, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { + let l = l as usize; + let c = c as usize; + let b = b as usize; let (main_params, d_main_params, extra_params, d_extra_params, state, sequence) = unsafe {( slice::from_raw_parts(main_params, 2*l*4*b), slice::from_raw_parts_mut(d_main_params, 2*l*4*b), From 273773439e0578b26fbf985a4d5d343df0449fb2 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 6 May 2024 03:52:02 -0400 Subject: [PATCH 34/88] initial (compiling) rust ode version --- .../ReverseMode/ode-real/ode/Cargo.lock | 7 + .../ReverseMode/ode-real/ode/Cargo.toml | 21 ++ .../ReverseMode/ode-real/ode/src/lib.rs | 200 ++++++++++++++++++ 3 files changed, 228 insertions(+) create mode 100644 enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.lock create mode 100644 enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.toml create mode 100644 enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.lock b/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.lock new file mode 100644 index 000000000000..93dcf6a53b60 --- /dev/null +++ 
b/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "ode" +version = "0.1.0" diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.toml b/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.toml new file mode 100644 index 000000000000..3013b597df4e --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "ode" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[lib] +crate-type = ["lib"] + + +[profile.release] +lto = "fat" +opt-level = 3 +#debug = true +#strip = "none" + +[profile.dev] +lto = "fat" + +[dependencies] diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs b/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs new file mode 100644 index 000000000000..15a4dc606cde --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs @@ -0,0 +1,200 @@ +#![feature(autodiff)] +#![feature(iter_next_chunk)] +#![allow(non_snake_case)] +#![allow(non_camel_case_types)] +#![allow(non_upper_case_globals)] + +//#define BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS +//#define BOOST_NO_EXCEPTIONS + +const N: usize = 32; +const xmin: f64 = 0.; +const xmax: f64 = 1.; +const ymin: f64 = 0.; +const ymax: f64 = 1.; + +#[inline(always)] +fn range(min: f64, max: f64, i: usize, N_var: usize) -> f64 { + (max - min) / (N_var as f64 - 1.) 
* i as f64 + min +} +#[inline(always)] +fn get(x: &[f64], i: usize, j: usize) -> f64 { + assert!(i > 0); + assert!(j < N); + x[N * i + j] +} + +//#define RANGE(min, max, i, N) ((max-min)/(N-1)*i + min) +//#define GETnb(x, i, j) (x)[N*i+j] +//#define GET(x, i, j) GETnb(x, i, j) +// #define GET(x, i, j) ({ assert(i >=0); assert( j>=0); assert(j f64 { + let eq1 = (x - 0.3) * (x - 0.3) + (y - 0.6) * (y - 0.6) <= 0.1 * 0.1; + let eq2 = t >= 1.1; + if eq1 && eq2 { + 5.0 + } else { + 0.0 + } +} + +fn init_brusselator(u: &mut [f64], v: &mut [f64]) { + for i in 0..N { + for j in 0..N { + let x = range(xmin, xmax, i, N); + let y = range(ymin, ymax, j, N); + u[N * i + j] = 22.0 * y * (1.0 - y) * (y * (1.0 - y)).sqrt(); + v[N * i + j] = 27.0 * x * (1.0 - x) * (x * (1.0 - x)).sqrt(); + } + } +} +// __enzyme_autodiff(brusselator_2d_loop, +// enzyme_dupnoneed, nullptr, dadjoint_inp.data(), +// enzyme_dupnoneed, nullptr, dadjoint_inp.data() + N * N, +// enzyme_dup, x.data(), dx.data(), +// enzyme_dup, x.data() + N * N, dx.data() + N * N, +// enzyme_dup, p, dp, +// enzyme_const, t); + + +#[autodiff(dbrusselator_2d_loop, Reverse, Duplicated, Duplicated, Duplicated, Duplicated, Duplicated, Const)] +fn brusselator_2d_loop(d_u: &mut [f64], d_v: &mut [f64], u: &[f64], v: &[f64], p: &[f64;3], t: f64) { + let A = p[0]; + let B = p[1]; + let alpha = p[2]; + let dx = 1. / (N - 1) as f64; + let alpha = alpha / (dx * dx); + for i in 0..N { + for j in 0..N { + let x = range(xmin, xmax, i, N); + let y = range(ymin, ymax, j, N); + let ip1 = if i == N - 1 { i } else { i + 1 }; + let im1 = if i == 0 { i } else { i - 1 }; + let jp1 = if j == N - 1 { j } else { j + 1 }; + let jm1 = if j == 0 { j } else { j - 1 }; + let u2v = u[N * i + j] * u[N * i + j] * v[N * i + j]; + d_u[N * i + j] = alpha * (u[N * im1 + j] + u[N * ip1 + j] + u[N * i + jp1] + u[N * i + jm1] - 4. * u[N * i + j]) + + B + u2v - (A + 1.) 
* u[N * i + j] + brusselator_f(x, y, t); + d_v[N * i + j] = alpha * (v[N * im1 + j] + v[N * ip1 + j] + v[N * i + jp1] + v[N * i + jm1] - 4. * v[N * i + j]) + + A * u[N * i + j] - u2v; + } + } +} + +//__attribute__((noinline)) +//void brusselator_2d_loop(double* __restrict du, double* __restrict dv, const double* __restrict u, const double* __restrict v, const double* __restrict p, double t) { +// double A = p[0]; +// double B = p[1]; +// double alpha = p[2]; +// double dx = (double)1/(N-1); +// +// alpha = alpha/(dx*dx); +// +// for(int i=0; i f64 { + let x = unsafe { *x }; + let mut adjoint = unsafe { *adjoint }; + let p: [f64;3] = unsafe { *p.cast::<[f64;3]>().as_ref().unwrap() }; + let mut dp = [0.; 3]; + let mut dx1 = [0.; N * N]; + let mut dx2 = [0.; N * N]; + let (mut dadj1, mut dadj2) = adjoint.split_at_mut(N * N); + + let (tmp1, tmp2) = x.split_at(N * N); + let x1: [f64; N * N] = tmp1.try_into().unwrap(); + let x2: [f64; N * N] = tmp2.try_into().unwrap(); + + let mut null1 = [0.; 2 * N * N]; + let mut null2 = [0.; 2 * N * N]; + dbrusselator_2d_loop(&mut null1, &mut dadj1, + &mut null2, &mut dadj2, + &x1, &mut dx1, + &x2, &mut dx2, + &p, &mut dp, t); + dx1[0] +} + + +fn foobar(p: &[f64;3], x: state_type, mut adjoint: state_type, t: f64) -> f64 { + let mut dp = [0.; 3]; + let mut dx1 = [0.; N * N]; + let mut dx2 = [0.; N * N]; + let (mut dadj1, mut dadj2) = adjoint.split_at_mut(N * N); + let mut null1 = [0.; 2 * N * N]; + let mut null2 = [0.; 2 * N * N]; + let (tmp1, tmp2) = x.split_at(N * N); + let x1: [f64; N * N] = tmp1.try_into().unwrap(); + let x2: [f64; N * N] = tmp2.try_into().unwrap(); + dbrusselator_2d_loop(&mut null1, &mut dadj1, + &mut null2, &mut dadj2, + &x1, &mut dx1, + &x2, &mut dx2, + &p, &mut dp, t); + dx1[0] +} + +//double foobar(const double* p, const state_type x, const state_type adjoint, double t) { +// double dp[3] = { 0. }; +// +// state_type dx = { 0. 
}; +// +// state_type dadjoint_inp = adjoint; +// +// state_type dxdu; +// +// __enzyme_autodiff(brusselator_2d_loop, +// enzyme_dupnoneed, nullptr, dadjoint_inp.data(), +// enzyme_dupnoneed, nullptr, dadjoint_inp.data() + N * N, +// enzyme_dup, x.data(), dx.data(), +// enzyme_dup, x.data() + N * N, dx.data() + N * N, +// enzyme_dup, p, dp, +// enzyme_const, t); +// +// return dx[0]; +//} + +fn main() { + let p = [3.4, 1., 10.]; + let mut x = [0.; 2 * N * N]; + let mut adjoint = [0.; 2 * N * N]; + init_brusselator(&mut x, &mut adjoint); + let t = 2.1; + let mut res = 0.; + let time = std::time::Instant::now(); + for _ in 0..10000 { + res = foobar(&p, x, adjoint, t); + } + println!("Enzyme combined {} res={}", time.elapsed().as_secs_f64(), res); +} From 1ffbaaa618999c31521bfa4cec805df19a40be4f Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 6 May 2024 04:09:05 -0400 Subject: [PATCH 35/88] cleanups --- .../ReverseMode/ode-real/ode/src/lib.rs | 49 ++++--------------- 1 file changed, 10 insertions(+), 39 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs b/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs index 15a4dc606cde..83c6d0586790 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs @@ -1,4 +1,6 @@ #![feature(autodiff)] +#![feature(slice_first_last_chunk)] +#![feature(slice_as_chunks)] #![feature(iter_next_chunk)] #![allow(non_snake_case)] #![allow(non_camel_case_types)] @@ -83,35 +85,6 @@ fn brusselator_2d_loop(d_u: &mut [f64], d_v: &mut [f64], u: &[f64], v: &[f64], p } } -//__attribute__((noinline)) -//void brusselator_2d_loop(double* __restrict du, double* __restrict dv, const double* __restrict u, const double* __restrict v, const double* __restrict p, double t) { -// double A = p[0]; -// double B = p[1]; -// double alpha = p[2]; -// double dx = (double)1/(N-1); -// -// alpha = alpha/(dx*dx); -// -// for(int i=0; i f64 { let (mut dadj1, mut 
dadj2) = adjoint.split_at_mut(N * N); let mut null1 = [0.; 2 * N * N]; let mut null2 = [0.; 2 * N * N]; - let (tmp1, tmp2) = x.split_at(N * N); - let x1: [f64; N * N] = tmp1.try_into().unwrap(); - let x2: [f64; N * N] = tmp2.try_into().unwrap(); + // https://discord.com/channels/273534239310479360/273541522815713281/1236945105601040446 + let ([x1, x2], []): (&[[f64; N*N]], &[f64])= x.as_chunks() else { unreachable!() }; dbrusselator_2d_loop(&mut null1, &mut dadj1, &mut null2, &mut dadj2, - &x1, &mut dx1, - &x2, &mut dx2, + x1, &mut dx1, + x2, &mut dx2, &p, &mut dp, t); dx1[0] } From 8bd316aa04fcfd4c77b38f2b57cccb8141d4b87e Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 22 May 2024 03:19:42 -0400 Subject: [PATCH 36/88] fix ba bench --- enzyme/benchmarks/ReverseMode/ba/src/lib.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs index 768f3fec8e38..72a9aece7737 100644 --- a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs @@ -32,7 +32,10 @@ fn rodrigues_rotate_point(rot: &[f64; 3], pt: &[f64; 3], rotated_pt: &mut [f64; let costheta = theta.cos(); let sintheta = theta.sin(); let theta_inverse = 1. / theta; - let w = rot.map(|v| v * theta_inverse); + let mut w = [0.; 3]; + for i in 0..3 { + w[i] = rot[i] * theta_inverse; + } let w_cross_pt = cross(&w, &pt); let tmp = (w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * (1. 
- costheta); for i in 0..3 { From c769dacb20fa89fc23220f10123f7f5911d0d51b Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Wed, 22 May 2024 23:16:23 -0600 Subject: [PATCH 37/88] bench gmm: move allocation of scratch space outside AD'd function --- enzyme/benchmarks/ReverseMode/gmm/src/safe.rs | 181 +++++++++++++++--- 1 file changed, 155 insertions(+), 26 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs index e56b9d5609e5..ec847941a58f 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs @@ -1,5 +1,5 @@ -use std::f64::consts::PI; use crate::Wishart; +use std::f64::consts::PI; #[cfg(feature = "libm")] use libm::lgamma; @@ -17,7 +17,21 @@ fn lgamma(x: f64) -> f64 { } #[no_mangle] -pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { +pub extern "C" fn rust_dgmm_objective( + d: i32, + k: i32, + n: i32, + alphas: *const f64, + dalphas: *mut f64, + means: *const f64, + dmeans: *mut f64, + icf: *const f64, + dicf: *mut f64, + x: *const f64, + wishart: *const Wishart, + err: *mut f64, + derr: *mut f64, +) { let k = k as usize; let n = n as usize; let d = d as usize; @@ -32,15 +46,47 @@ pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64 let d_means = unsafe { std::slice::from_raw_parts_mut(dmeans, k * d) }; let d_icf = unsafe { std::slice::from_raw_parts_mut(dicf, k * d * (d + 1) / 2) }; let mut my_derr = unsafe { *derr }; + let (mut qdiags, mut sum_qs, mut xcentered, mut qxcentered, mut main_term) = + get_workspace(d, k); - dgmm_objective(d, k, n, alphas, d_alphas, means, d_means, icf, d_icf, x, wishart.gamma, wishart.m, &mut my_err, &mut my_derr); + dgmm_objective( + d, + k, + n, + alphas, + d_alphas, + means, + d_means, + icf, + 
d_icf, + x, + wishart.gamma, + wishart.m, + &mut my_err, + &mut my_derr, + &mut qdiags, + &mut sum_qs, + &mut xcentered, + &mut qxcentered, + &mut main_term, + ); unsafe { *err = my_err }; unsafe { *derr = my_derr }; } #[no_mangle] -pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { +pub extern "C" fn rust_gmm_objective( + d: i32, + k: i32, + n: i32, + alphas: *const f64, + means: *const f64, + icf: *const f64, + x: *const f64, + wishart: *const Wishart, + err: *mut f64, +) { let k = k as usize; let n = n as usize; let d = d as usize; @@ -50,30 +96,97 @@ pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, let x = unsafe { std::slice::from_raw_parts(x, n * d) }; let wishart: Wishart = unsafe { *wishart }; let mut my_err = unsafe { *err }; - gmm_objective(d, k, n, alphas, means, icf, x, wishart.gamma, wishart.m, &mut my_err); + let (mut qdiags, mut sum_qs, mut xcentered, mut qxcentered, mut main_term) = + get_workspace(d, k); + gmm_objective( + d, + k, + n, + alphas, + means, + icf, + x, + wishart.gamma, + wishart.m, + &mut my_err, + &mut qdiags, + &mut sum_qs, + &mut xcentered, + &mut qxcentered, + &mut main_term, + ); unsafe { *err = my_err }; } -#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Const, Duplicated)] -pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], gamma: f64, m: i32, err: &mut f64) { +fn get_workspace(d: usize, k: usize) -> (Vec, Vec, Vec, Vec, Vec) { + let qdiags = vec![0.; d * k]; + let sum_qs = vec![0.; k]; + let xcentered = vec![0.; d]; + let qxcentered = vec![0.; d]; + let main_term = vec![0.; k]; + (qdiags, sum_qs, xcentered, qxcentered, main_term) +} + +#[autodiff( + dgmm_objective, + Reverse, + Const, + Const, + Const, + Duplicated, + Duplicated, + Duplicated, + Const, + Const, + 
Const, + Duplicated, + Const, + Const, + Const, + Const, + Const +)] +pub fn gmm_objective( + d: usize, + k: usize, + n: usize, + alphas: &[f64], + means: &[f64], + icf: &[f64], + x: &[f64], + gamma: f64, + m: i32, + err: &mut f64, + qdiags: &mut [f64], + sum_qs: &mut [f64], + xcentered: &mut [f64], + qxcentered: &mut [f64], + main_term: &mut [f64], +) { let wishart: Wishart = Wishart { gamma, m }; let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); let icf_sz = d * (d + 1) / 2; - let mut qdiags = vec![0.; d * k]; - let mut sum_qs = vec![0.; k]; - let mut xcentered = vec![0.; d]; - let mut qxcentered = vec![0.; d]; - let mut main_term = vec![0.; k]; - - preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); + preprocess_qs(d, k, icf, sum_qs, qdiags); let mut slse = 0.; for ix in 0..n { for ik in 0..k { - subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); - qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); - main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(&qxcentered); + subtract( + d, + &x[ix as usize * d as usize..], + &means[ik as usize * d as usize..], + xcentered, + ); + qtimesx( + d, + &qdiags[ik as usize * d as usize..], + &icf[ik as usize * icf_sz as usize + d as usize..], + &*xcentered, + qxcentered, + ); + main_term[ik as usize] = + alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(&*qxcentered); } slse = slse + log_sum_exp(k, &main_term); @@ -81,7 +194,8 @@ pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64] let lse_alphas = log_sum_exp(k, alphas); - *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); + *err = constant + slse - n as f64 * lse_alphas + + log_wishart_prior(d, k, wishart, &sum_qs, &*qdiags, icf); } fn arr_max(n: usize, x: &[f64]) -> f64 { @@ -123,7 +237,7 @@ fn qtimesx(d: usize, q_diag: 
&[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { } for i in 0..d { - let mut lparamsidx = i*(2*d-i-1)/2; + let mut lparamsidx = i * (2 * d - i - 1) / 2; for j in i + 1..d { out[j] = out[j] + ltri[lparamsidx] * x[i]; lparamsidx += 1; @@ -137,19 +251,34 @@ fn log_sum_exp(n: usize, x: &[f64]) -> f64 { semx.ln() + mx } fn log_gamma_distrib(a: f64, p: f64) -> f64 { - 0.25 * p * (p - 1.) * PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() + 0.25 * p * (p - 1.) * PI.ln() + + (1..=p as usize) + .map(|j| lgamma(a + 0.5 * (1. - j as f64))) + .sum::() } -fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { +fn log_wishart_prior( + p: usize, + k: usize, + wishart: Wishart, + sum_qs: &[f64], + qdiags: &[f64], + icf: &[f64], +) -> f64 { let n = p + wishart.m as usize + 1; let icf_sz = p * (p + 1) / 2; - let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); + let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) + - log_gamma_distrib(0.5 * n as f64, p as f64); - let out = (0..k).map(|ik| { - let frobenius = sqnorm(&qdiags[ik * p as usize..][..p]) + sqnorm(&icf[ik * icf_sz as usize + p as usize..][..icf_sz -p]); - 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] - }).sum::(); + let out = (0..k) + .map(|ik| { + let frobenius = sqnorm(&qdiags[ik * p as usize..][..p]) + + sqnorm(&icf[ik * icf_sz as usize + p as usize..][..icf_sz - p]); + 0.5 * wishart.gamma * wishart.gamma * (frobenius) + - (wishart.m as f64) * sum_qs[ik as usize] + }) + .sum::(); out - k as f64 * c } From c5e1f19d48b85f85779034d28ed321f0f3e85f3f Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Wed, 22 May 2024 23:36:20 -0600 Subject: [PATCH 38/88] bench gmm: switch scratch from Const to Duplicated This makes the reverse mode correct, and a bit faster than the old version (with allocations inside the AD'd 
code). --- enzyme/benchmarks/ReverseMode/gmm/src/safe.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs index ec847941a58f..d7f3b78ac75f 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs @@ -48,6 +48,8 @@ pub extern "C" fn rust_dgmm_objective( let mut my_derr = unsafe { *derr }; let (mut qdiags, mut sum_qs, mut xcentered, mut qxcentered, mut main_term) = get_workspace(d, k); + let (mut bqdiags, mut bsum_qs, mut bxcentered, mut bqxcentered, mut bmain_term) = + get_workspace(d, k); dgmm_objective( d, @@ -65,10 +67,15 @@ pub extern "C" fn rust_dgmm_objective( &mut my_err, &mut my_derr, &mut qdiags, + &mut bqdiags, &mut sum_qs, + &mut bsum_qs, &mut xcentered, + &mut bxcentered, &mut qxcentered, + &mut bqxcentered, &mut main_term, + &mut bmain_term, ); unsafe { *err = my_err }; @@ -140,11 +147,11 @@ fn get_workspace(d: usize, k: usize) -> (Vec, Vec, Vec, Vec, Const, Const, Duplicated, - Const, - Const, - Const, - Const, - Const + Duplicated, + Duplicated, + Duplicated, + Duplicated, + Duplicated )] pub fn gmm_objective( d: usize, From 947852c60542c416f9c167a9a07e707dc42d542c Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Thu, 23 May 2024 12:52:23 -0600 Subject: [PATCH 39/88] bench gmm: match C++ performance by asserting sizes of work slices --- enzyme/benchmarks/ReverseMode/gmm/src/safe.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs index d7f3b78ac75f..c809c34e5454 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs @@ -174,6 +174,13 @@ pub fn gmm_objective( let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); let icf_sz = d * (d + 1) / 2; + // Let the compiler know sizes so it can eliminate bounds checks + 
assert_eq!(qdiags.len(), d * k); + assert_eq!(sum_qs.len(), k); + assert_eq!(xcentered.len(), d); + assert_eq!(qxcentered.len(), d); + assert_eq!(main_term.len(), k); + preprocess_qs(d, k, icf, sum_qs, qdiags); let mut slse = 0.; From cb9d4030fe2a92d0e25092399fbe039b133e5fa3 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Thu, 23 May 2024 13:29:23 -0600 Subject: [PATCH 40/88] bench gmm: shed unused import (warning) --- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index 8fcb11ffed10..4f9fc5336e8e 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -1,9 +1,6 @@ #![feature(autodiff)] -pub mod r#unsafe; pub mod safe; - -use r#unsafe::dgmm_objective as dgmm_objective; - +pub mod r#unsafe; #[derive(Clone, Copy)] #[repr(C)] @@ -11,4 +8,3 @@ pub struct Wishart { pub gamma: f64, pub m: i32, } - From dbffef699335804558770e3529b09d6b18bbf1a6 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Thu, 23 May 2024 13:48:46 -0600 Subject: [PATCH 41/88] bench lstm: optimize using length assertions --- .../benchmarks/ReverseMode/lstm/src/safe.rs | 131 +++++++++++++----- 1 file changed, 97 insertions(+), 34 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs index 8734998acfb7..6a43419af5bf 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs @@ -26,10 +26,15 @@ fn lstm_model( input: &[f64], ) { let mut gates = vec![0.0; 4 * hsize]; - let (a,b) = gates.split_at_mut(2*hsize); - let ((forget, ingate), (outgate, change)) = ( - a.split_at_mut(hsize), b.split_at_mut(hsize)); - + let gates = &mut gates[..4 * hsize]; + let (a, b) = gates.split_at_mut(2 * hsize); + let ((forget, ingate), (outgate, change)) = (a.split_at_mut(hsize), 
b.split_at_mut(hsize)); + + assert_eq!(weight.len(), 4 * hsize); + assert_eq!(bias.len(), 4 * hsize); + assert_eq!(hidden.len(), hsize); + assert!(cell.len() >= hsize); + assert!(input.len() >= hsize); // caching input for i in 0..hsize { forget[i] = sigmoid(input[i] * weight[i] + bias[i]); @@ -63,23 +68,23 @@ fn lstm_predict( } let mut i = 0; - while i <= 2*l*b - 1 { + while i <= 2 * l * b - 1 { // make borrow-checker happy with non-overlapping mutable references let (xp, s1, s2) = if i == 0 { let (s1, s2) = s.split_at_mut(b); (x2.as_mut(), s1, s2) } else { - let tmp = &mut s[i-2*b..]; - let (a, d) = tmp.split_at_mut(2*b); + let tmp = &mut s[i - 2 * b..]; + let (a, d) = tmp.split_at_mut(2 * b); let (d, c) = d.split_at_mut(b); - (a,d,c) + (a, d, c) }; lstm_model( b, - &w[i * 4..], - &w[(i + b) * 4..], + &w[i * 4..(i + b) * 4], + &w[(i + b) * 4..(i + 2 * b) * 4], s1, s2, xp, @@ -88,7 +93,7 @@ fn lstm_predict( i += 2 * b; } - let xp = &s[i-2*b..]; + let xp = &s[i - 2 * b..]; for i in 0..b { x2[i] = xp[i] * w2[b + i] + w2[2 * b + i]; @@ -96,8 +101,19 @@ fn lstm_predict( } // LSTM objective (loss function) -#[autodiff(d_lstm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Const, Const, Duplicated)] -pub (crate) fn lstm_objective( +#[autodiff( + d_lstm_objective, + Reverse, + Const, + Const, + Const, + Duplicated, + Duplicated, + Const, + Const, + Duplicated +)] +pub(crate) fn lstm_objective( l: usize, c: usize, b: usize, @@ -112,14 +128,15 @@ pub (crate) fn lstm_objective( let mut input = &sequence[..b]; let mut ypred = vec![0.0; b]; + let ypred = &mut ypred[..b]; let mut ynorm = vec![0.0; b]; - let mut lse; + let ynorm = &mut ynorm[..b]; assert!(b > 0); for t in (0..=(c - 1) * b - 1).step_by(b) { - lstm_predict(l, b, main_params, extra_params, state, input, &mut ypred); - lse = logsumexp(&ypred); + lstm_predict(l, b, main_params, extra_params, state, input, ypred); + let lse = logsumexp(&ypred); for i in 0..b { ynorm[i] = ypred[i] - lse; } @@ -137,31 
+154,77 @@ pub (crate) fn lstm_objective( } #[no_mangle] -pub extern "C" fn rust_lstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { - let (main_params, extra_params, state, sequence) = unsafe {( - slice::from_raw_parts(main_params, 2*l*4*b), - slice::from_raw_parts(extra_params, 3*b), - slice::from_raw_parts_mut(state, 2*l*b), - slice::from_raw_parts(sequence, c*b) - )}; +pub extern "C" fn rust_lstm_objective( + l: usize, + c: usize, + b: usize, + main_params: *const f64, + extra_params: *const f64, + state: *mut f64, + sequence: *const f64, + loss: *mut f64, +) { + let (main_params, extra_params, state, sequence) = unsafe { + ( + slice::from_raw_parts(main_params, 2 * l * 4 * b), + slice::from_raw_parts(extra_params, 3 * b), + slice::from_raw_parts_mut(state, 2 * l * b), + slice::from_raw_parts(sequence, c * b), + ) + }; unsafe { - lstm_objective(l,c,b,main_params,extra_params,state,sequence, &mut *loss); + lstm_objective( + l, + c, + b, + main_params, + extra_params, + state, + sequence, + &mut *loss, + ); } } #[no_mangle] -pub extern "C" fn rust_dlstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { - let (main_params, d_main_params, extra_params, d_extra_params, state, sequence) = unsafe {( - slice::from_raw_parts(main_params, 2*l*4*b), - slice::from_raw_parts_mut(d_main_params, 2*l*4*b), - slice::from_raw_parts(extra_params, 3*b), - slice::from_raw_parts_mut(d_extra_params, 3*b), - slice::from_raw_parts_mut(state, 2*l*b), - slice::from_raw_parts(sequence, c*b) - )}; +pub extern "C" fn rust_dlstm_objective( + l: usize, + c: usize, + b: usize, + main_params: *const f64, + d_main_params: *mut f64, + extra_params: *const f64, + d_extra_params: *mut f64, + state: *mut f64, + sequence: *const f64, + 
res: *mut f64, + d_res: *mut f64, +) { + let (main_params, d_main_params, extra_params, d_extra_params, state, sequence) = unsafe { + ( + slice::from_raw_parts(main_params, 2 * l * 4 * b), + slice::from_raw_parts_mut(d_main_params, 2 * l * 4 * b), + slice::from_raw_parts(extra_params, 3 * b), + slice::from_raw_parts_mut(d_extra_params, 3 * b), + slice::from_raw_parts_mut(state, 2 * l * b), + slice::from_raw_parts(sequence, c * b), + ) + }; unsafe { - d_lstm_objective(l,c,b,main_params,d_main_params, extra_params,d_extra_params, state,sequence, &mut *res, &mut *d_res); + d_lstm_objective( + l, + c, + b, + main_params, + d_main_params, + extra_params, + d_extra_params, + state, + sequence, + &mut *res, + &mut *d_res, + ); } } From 0174227cb5b3fbf76312112b40a50c05fb6a66e3 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Fri, 24 May 2024 03:21:55 -0400 Subject: [PATCH 42/88] adding unsafe ba version --- enzyme/benchmarks/ReverseMode/adbench/ba.h | 130 ++++++++--- enzyme/benchmarks/ReverseMode/ba/src/lib.rs | 204 +----------------- enzyme/benchmarks/ReverseMode/ba/src/safe.rs | 203 +++++++++++++++++ .../benchmarks/ReverseMode/ba/src/unsafe.rs | 139 ++++++++++++ 4 files changed, 445 insertions(+), 231 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/ba/src/safe.rs create mode 100644 enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs diff --git a/enzyme/benchmarks/ReverseMode/adbench/ba.h b/enzyme/benchmarks/ReverseMode/adbench/ba.h index 5d9178120e76..aa62cf2a165f 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/ba.h +++ b/enzyme/benchmarks/ReverseMode/adbench/ba.h @@ -127,19 +127,16 @@ extern "C" { double* reproj_err, double* w_err ); - - void rust2_ba_objective( - int n, - int m, - int p, - double const* cams, - double const* X, - double const* w, - int const* obs, - double const* feats, - double* reproj_err, - double* w_err - ); + + void rust2_unsafe_ba_objective(int n, int m, int p, double const *cams, + double const *X, double const *w, + int const 
*obs, double const *feats, + double *reproj_err, double *w_err); + + void rust2_ba_objective(int n, int m, int p, double const *cams, + double const *X, double const *w, int const *obs, + double const *feats, double *reproj_err, + double *w_err); void dcompute_reproj_error( double const* cam, @@ -183,17 +180,17 @@ extern "C" { void adept_compute_zach_weight_error(double const* w, double* dw, double* err, double* derr); - void rust_dcompute_reproj_error( - double const* cam, - double * dcam, - double const* X, - double * dX, - double const* w, - double * wb, - double const* feat, - double *err, - double *derr - ); + void rust_unsafe_dcompute_reproj_error(double const *cam, double *dcam, + double const *X, double *dX, + double const *w, double *wb, + double const *feat, double *err, + double *derr); + + void rust_dcompute_reproj_error(double const *cam, double *dcam, + double const *X, double *dX, + double const *w, double *wb, + double const *feat, double *err, + double *derr); void rust_dcompute_zach_weight_error(double const* w, double* dw, double* err, double* derr); } @@ -362,10 +359,22 @@ int main(const int argc, const char* argv[]) { std::string path = "/mnt/Data/git/Enzyme/apps/ADBench/data/ba/ba1_n49_m7776_p31843.txt"; std::vector paths = { - "ba10_n1197_m126327_p563734.txt", "ba14_n356_m226730_p1255268.txt", "ba18_n1936_m649673_p5213733.txt", "ba2_n21_m11315_p36455.txt", "ba6_n539_m65220_p277273.txt", "test.txt", - "ba11_n1723_m156502_p678718.txt", "ba15_n1102_m780462_p4052340.txt", "ba19_n4585_m1324582_p9125125.txt", "ba3_n161_m48126_p182072.txt", "ba7_n93_m61203_p287451.txt", - "ba12_n253_m163691_p899155.txt", "ba16_n1544_m942409_p4750193.txt", "ba1_n49_m7776_p31843.txt", "ba4_n372_m47423_p204472.txt", "ba8_n88_m64298_p383937.txt", - "ba13_n245_m198739_p1091386.txt", "ba17_n1778_m993923_p5001946.txt", "ba20_n13682_m4456117_p2987644.txt", "ba5_n257_m65132_p225911.txt", "ba9_n810_m88814_p393775.txt", + "ba10_n1197_m126327_p563734.txt", + 
"ba14_n356_m226730_p1255268.txt", // "ba18_n1936_m649673_p5213733.txt", + // "ba2_n21_m11315_p36455.txt", + // "ba6_n539_m65220_p277273.txt", + // "test.txt", + // "ba11_n1723_m156502_p678718.txt", + // "ba15_n1102_m780462_p4052340.txt", + // "ba19_n4585_m1324582_p9125125.txt", + // "ba3_n161_m48126_p182072.txt", "ba7_n93_m61203_p287451.txt", + // "ba12_n253_m163691_p899155.txt", + // "ba16_n1544_m942409_p4750193.txt", "ba1_n49_m7776_p31843.txt", + // "ba4_n372_m47423_p204472.txt", "ba8_n88_m64298_p383937.txt", + // "ba13_n245_m198739_p1091386.txt", + // "ba17_n1778_m993923_p5001946.txt", + // "ba20_n13682_m4456117_p2987644.txt", + // "ba5_n257_m65132_p225911.txt", "ba9_n810_m88814_p393775.txt", }; std::ofstream jsonfile("results.json", std::ofstream::trunc); @@ -571,7 +580,40 @@ int main(const int argc, const char* argv[]) { } } - + { + struct BAInput input; + read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, + input.X, input.w, input.obs, input.feats); + + struct BAOutput result = {std::vector(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p)}; + { + + struct timeval start, end; + gettimeofday(&start, NULL); + rust2_unsafe_ba_objective(input.n, input.m, input.p, input.cams.data(), + input.X.data(), input.w.data(), + input.obs.data(), input.feats.data(), + result.reproj_err.data(), result.w_err.data()); + gettimeofday(&end, NULL); + printf("primal unsafe rust t=%0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "primal unsafe rust"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = 0; i < 5; i++) { + printf("%f ", result.reproj_err[i]); + enzyme["result"].push_back(result.reproj_err[i]); + } + for (unsigned i = 0; i < 5; i++) { + printf("%f ", result.w_err[i]); + enzyme["result"].push_back(result.w_err[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } + { struct BAInput input; read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, input.X, 
input.w, input.obs, input.feats); @@ -626,6 +668,35 @@ int main(const int argc, const char* argv[]) { BASparseMat(input.n, input.m, input.p) }; + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme unsafe rust combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme unsafe rust combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = 0; i < 5; i++) { + printf("%f ", result.J.vals[i]); + enzyme["result"].push_back(result.J.vals[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } + + { + + struct BAInput input; + read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, + input.X, input.w, input.obs, input.feats); + + struct BAOutput result = {std::vector(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p)}; + { struct timeval start, end; gettimeofday(&start, NULL); @@ -642,7 +713,6 @@ int main(const int argc, const char* argv[]) { printf("\n"); test_suite["tools"].push_back(enzyme); } - } test_suite["llvm-version"] = __clang_version__; diff --git a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs index 72a9aece7737..1f665012c07a 100644 --- a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs @@ -2,87 +2,10 @@ #![feature(slice_first_last_chunk)] #![allow(non_snake_case)] -//#define BA_NCAMPARAMS 11 -static BA_NCAMPARAMS: usize = 11; - -fn sqsum(x: &[f64]) -> f64 { - x.iter().map(|&v| v * v).sum() -} - -#[inline] -fn cross(a: &[f64; 3], b: &[f64; 3]) -> [f64; 3] { - [ - a[1] * b[2] - a[2] * b[1], - a[2] * b[0] - a[0] * b[2], - a[0] * b[1] - a[1] * b[0], - ] -} - -fn radial_distort(rad_params: &[f64], proj: &mut [f64]) { - let rsq = sqsum(proj); - let l = 1. 
+ rad_params[0] * rsq + rad_params[1] * rsq * rsq; - proj[0] = proj[0] * l; - proj[1] = proj[1] * l; -} - -fn rodrigues_rotate_point(rot: &[f64; 3], pt: &[f64; 3], rotated_pt: &mut [f64; 3]) { - let sqtheta = sqsum(rot); - if sqtheta != 0. { - let theta = sqtheta.sqrt(); - let costheta = theta.cos(); - let sintheta = theta.sin(); - let theta_inverse = 1. / theta; - let mut w = [0.; 3]; - for i in 0..3 { - w[i] = rot[i] * theta_inverse; - } - let w_cross_pt = cross(&w, &pt); - let tmp = (w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * (1. - costheta); - for i in 0..3 { - rotated_pt[i] = pt[i] * costheta + w_cross_pt[i] * sintheta + w[i] * tmp; - } - } else { - let rot_cross_pt = cross(&rot, &pt); - for i in 0..3 { - rotated_pt[i] = pt[i] + rot_cross_pt[i]; - } - } -} - -fn project(cam: &[f64; 11], X: &[f64; 3], proj: &mut [f64; 2]) { - let C = &cam[3..6]; - let mut Xo = [0.; 3]; - let mut Xcam = [0.; 3]; - - Xo[0] = X[0] - C[0]; - Xo[1] = X[1] - C[1]; - Xo[2] = X[2] - C[2]; - - rodrigues_rotate_point(cam.first_chunk::<3>().unwrap(), &Xo, &mut Xcam); - - proj[0] = Xcam[0] / Xcam[2]; - proj[1] = Xcam[1] / Xcam[2]; - - radial_distort(&cam[9..], proj); - - proj[0] = proj[0] * cam[6] + cam[7]; - proj[1] = proj[1] * cam[6] + cam[8]; -} +pub mod safe; +pub mod r#unsafe; -#[no_mangle] -pub extern "C" fn rust_dcompute_reproj_error( - cam: *const [f64; 11], - dcam: *mut [f64; 11], - x: *const [f64; 3], - dx: *mut [f64; 3], - w: *const [f64; 1], - wb: *mut [f64; 1], - feat: *const [f64; 2], - err: *mut [f64; 2], - derr: *mut [f64; 2], -) { - dcompute_reproj_error(cam, dcam, x, dx, w, wb, feat, err, derr); -} +static BA_NCAMPARAMS: usize = 11; #[no_mangle] pub extern "C" fn rust_dcompute_zach_weight_error( @@ -94,130 +17,9 @@ pub extern "C" fn rust_dcompute_zach_weight_error( dcompute_zach_weight_error(w, dw, err, derr); } -#[autodiff( - dcompute_reproj_error, - Reverse, - Duplicated, - Duplicated, - Duplicated, - Const, - Duplicated -)] -pub fn compute_reproj_error( - cam: 
*const [f64; 11], - x: *const [f64; 3], - w: *const [f64; 1], - feat: *const [f64; 2], - err: *mut [f64; 2], -) { - let cam = unsafe { &*cam }; - let w = unsafe { *(*w).get_unchecked(0) }; - let x = unsafe { &*x }; - let feat = unsafe { &*feat }; - let mut err = unsafe { &mut *err }; - let mut proj = [0.; 2]; - project(cam, x, &mut proj); - err[0] = w * (proj[0] - feat[0]); - err[1] = w * (proj[1] - feat[1]); -} - #[autodiff(dcompute_zach_weight_error, Reverse, Duplicated, Duplicated)] pub fn compute_zach_weight_error(w: *const f64, err: *mut f64) { let w = unsafe { *w }; unsafe { *err = 1. - w * w; } } -// n number of cameras -// m number of points -// p number of observations -// cams: 11*n cameras in format [r1 r2 r3 C1 C2 C3 f u0 v0 k1 k2] -// r1, r2, r3 are angle - axis rotation parameters(Rodrigues) -// [C1 C2 C3]' is the camera center -// f is the focal length in pixels -// [u0 v0]' is the principal point -// k1, k2 are radial distortion parameters -// X: 3*m points -// obs: 2*p observations (pairs cameraIdx, pointIdx) -// feats: 2*p features (x,y coordinates corresponding to observations) -// reproj_err: 2*p errors of observations -// w_err: p weight "error" terms -fn rust_ba_objective( - n: usize, - m: usize, - p: usize, - cams: &[f64], - x: &[f64], - w: &[f64], - obs: &[i32], - feats: &[f64], - reproj_err: &mut [f64], - w_err: &mut [f64], -) { - assert_eq!(cams.len(), n * 11); - assert_eq!(x.len(), m * 3); - assert_eq!(w.len(), p); - assert_eq!(obs.len(), p * 2); - assert_eq!(feats.len(), p * 2); - assert_eq!(reproj_err.len(), p * 2); - assert_eq!(w_err.len(), p); - - for i in 0..p { - let cam_idx = obs[i * 2 + 0] as usize; - let pt_idx = obs[i * 2 + 1] as usize; - let start = cam_idx * BA_NCAMPARAMS; - let cam: &[f64; 11] = unsafe { - cams[start..] - .get_unchecked(..11) - .try_into() - .unwrap_unchecked() - }; - let x: &[f64; 3] = unsafe { - x[pt_idx * 3..] 
- .get_unchecked(..3) - .try_into() - .unwrap_unchecked() - }; - let w: &[f64; 1] = unsafe { w[i..].get_unchecked(..1).try_into().unwrap_unchecked() }; - let feat: &[f64; 2] = unsafe { - feats[i * 2..] - .get_unchecked(..2) - .try_into() - .unwrap_unchecked() - }; - let reproj_err: &mut [f64; 2] = unsafe { - reproj_err[i * 2..] - .get_unchecked_mut(..2) - .try_into() - .unwrap_unchecked() - }; - compute_reproj_error(cam, x, w, feat, reproj_err); - } - - for i in 0..p { - let w_err: &mut f64 = unsafe { w_err.get_unchecked_mut(i) }; - compute_zach_weight_error(w[i..].as_ptr(), w_err as *mut f64); - } -} - -#[no_mangle] -extern "C" fn rust2_ba_objective( - n: usize, - m: usize, - p: usize, - cams: *const f64, - x: *const f64, - w: *const f64, - obs: *const i32, - feats: *const f64, - reproj_err: *mut f64, - w_err: *mut f64, -) { - let cams = unsafe { std::slice::from_raw_parts(cams, n * 11) }; - let x = unsafe { std::slice::from_raw_parts(x, m * 3) }; - let w = unsafe { std::slice::from_raw_parts(w, p) }; - let obs = unsafe { std::slice::from_raw_parts(obs, p * 2) }; - let feats = unsafe { std::slice::from_raw_parts(feats, p * 2) }; - let reproj_err = unsafe { std::slice::from_raw_parts_mut(reproj_err, p * 2) }; - let w_err = unsafe { std::slice::from_raw_parts_mut(w_err, p) }; - rust_ba_objective(n, m, p, cams, x, w, obs, feats, reproj_err, w_err); -} diff --git a/enzyme/benchmarks/ReverseMode/ba/src/safe.rs b/enzyme/benchmarks/ReverseMode/ba/src/safe.rs new file mode 100644 index 000000000000..c38f5359cc30 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ba/src/safe.rs @@ -0,0 +1,203 @@ +use crate::BA_NCAMPARAMS; +use crate::compute_zach_weight_error; + +fn sqsum(x: &[f64]) -> f64 { + x.iter().map(|&v| v * v).sum() +} + +#[inline] +fn cross(a: &[f64; 3], b: &[f64; 3]) -> [f64; 3] { + [ + a[1] * b[2] - a[2] * b[1], + a[2] * b[0] - a[0] * b[2], + a[0] * b[1] - a[1] * b[0], + ] +} + +fn radial_distort(rad_params: &[f64], proj: &mut [f64]) { + let rsq = sqsum(proj); + 
let l = 1. + rad_params[0] * rsq + rad_params[1] * rsq * rsq; + proj[0] = proj[0] * l; + proj[1] = proj[1] * l; +} + +fn rodrigues_rotate_point(rot: &[f64; 3], pt: &[f64; 3], rotated_pt: &mut [f64; 3]) { + let sqtheta = sqsum(rot); + if sqtheta != 0. { + let theta = sqtheta.sqrt(); + let costheta = theta.cos(); + let sintheta = theta.sin(); + let theta_inverse = 1. / theta; + let mut w = [0.; 3]; + for i in 0..3 { + w[i] = rot[i] * theta_inverse; + } + let w_cross_pt = cross(&w, &pt); + let tmp = (w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * (1. - costheta); + for i in 0..3 { + rotated_pt[i] = pt[i] * costheta + w_cross_pt[i] * sintheta + w[i] * tmp; + } + } else { + let rot_cross_pt = cross(&rot, &pt); + for i in 0..3 { + rotated_pt[i] = pt[i] + rot_cross_pt[i]; + } + } +} + +fn project(cam: &[f64; 11], X: &[f64; 3], proj: &mut [f64; 2]) { + let C = &cam[3..6]; + let mut Xo = [0.; 3]; + let mut Xcam = [0.; 3]; + + Xo[0] = X[0] - C[0]; + Xo[1] = X[1] - C[1]; + Xo[2] = X[2] - C[2]; + + rodrigues_rotate_point(cam.first_chunk::<3>().unwrap(), &Xo, &mut Xcam); + + proj[0] = Xcam[0] / Xcam[2]; + proj[1] = Xcam[1] / Xcam[2]; + + radial_distort(&cam[9..], proj); + + proj[0] = proj[0] * cam[6] + cam[7]; + proj[1] = proj[1] * cam[6] + cam[8]; +} + +#[no_mangle] +pub extern "C" fn rust_dcompute_reproj_error( + cam: *const [f64; 11], + dcam: *mut [f64; 11], + x: *const [f64; 3], + dx: *mut [f64; 3], + w: *const [f64; 1], + wb: *mut [f64; 1], + feat: *const [f64; 2], + err: *mut [f64; 2], + derr: *mut [f64; 2], +) { + dcompute_reproj_error(cam, dcam, x, dx, w, wb, feat, err, derr); +} + +#[autodiff( + dcompute_reproj_error, + Reverse, + Duplicated, + Duplicated, + Duplicated, + Const, + Duplicated +)] +pub fn compute_reproj_error( + cam: *const [f64; 11], + x: *const [f64; 3], + w: *const [f64; 1], + feat: *const [f64; 2], + err: *mut [f64; 2], +) { + let cam = unsafe { &*cam }; + let w = unsafe { *(*w).get_unchecked(0) }; + let x = unsafe { &*x }; + let feat = unsafe { 
&*feat }; + let err = unsafe { &mut *err }; + let mut proj = [0.; 2]; + project(cam, x, &mut proj); + err[0] = w * (proj[0] - feat[0]); + err[1] = w * (proj[1] - feat[1]); +} + +// n number of cameras +// m number of points +// p number of observations +// cams: 11*n cameras in format [r1 r2 r3 C1 C2 C3 f u0 v0 k1 k2] +// r1, r2, r3 are angle - axis rotation parameters(Rodrigues) +// [C1 C2 C3]' is the camera center +// f is the focal length in pixels +// [u0 v0]' is the principal point +// k1, k2 are radial distortion parameters +// X: 3*m points +// obs: 2*p observations (pairs cameraIdx, pointIdx) +// feats: 2*p features (x,y coordinates corresponding to observations) +// reproj_err: 2*p errors of observations +// w_err: p weight "error" terms +fn rust_ba_objective( + n: usize, + m: usize, + p: usize, + cams: &[f64], + x: &[f64], + w: &[f64], + obs: &[i32], + feats: &[f64], + reproj_err: &mut [f64], + w_err: &mut [f64], +) { + assert_eq!(cams.len(), n * 11); + assert_eq!(x.len(), m * 3); + assert_eq!(w.len(), p); + assert_eq!(obs.len(), p * 2); + assert_eq!(feats.len(), p * 2); + assert_eq!(reproj_err.len(), p * 2); + assert_eq!(w_err.len(), p); + + for i in 0..p { + let cam_idx = obs[i * 2 + 0] as usize; + let pt_idx = obs[i * 2 + 1] as usize; + let start = cam_idx * BA_NCAMPARAMS; + let cam: &[f64; 11] = unsafe { + cams[start..] + .get_unchecked(..11) + .try_into() + .unwrap_unchecked() + }; + let x: &[f64; 3] = unsafe { + x[pt_idx * 3..] + .get_unchecked(..3) + .try_into() + .unwrap_unchecked() + }; + let w: &[f64; 1] = unsafe { w[i..].get_unchecked(..1).try_into().unwrap_unchecked() }; + let feat: &[f64; 2] = unsafe { + feats[i * 2..] + .get_unchecked(..2) + .try_into() + .unwrap_unchecked() + }; + let reproj_err: &mut [f64; 2] = unsafe { + reproj_err[i * 2..] 
+ .get_unchecked_mut(..2) + .try_into() + .unwrap_unchecked() + }; + compute_reproj_error(cam, x, w, feat, reproj_err); + } + + for i in 0..p { + let w_err: &mut f64 = unsafe { w_err.get_unchecked_mut(i) }; + compute_zach_weight_error(w[i..].as_ptr(), w_err as *mut f64); + } +} + +#[no_mangle] +extern "C" fn rust2_ba_objective( + n: usize, + m: usize, + p: usize, + cams: *const f64, + x: *const f64, + w: *const f64, + obs: *const i32, + feats: *const f64, + reproj_err: *mut f64, + w_err: *mut f64, +) { + let cams = unsafe { std::slice::from_raw_parts(cams, n * 11) }; + let x = unsafe { std::slice::from_raw_parts(x, m * 3) }; + let w = unsafe { std::slice::from_raw_parts(w, p) }; + let obs = unsafe { std::slice::from_raw_parts(obs, p * 2) }; + let feats = unsafe { std::slice::from_raw_parts(feats, p * 2) }; + let reproj_err = unsafe { std::slice::from_raw_parts_mut(reproj_err, p * 2) }; + let w_err = unsafe { std::slice::from_raw_parts_mut(w_err, p) }; + rust_ba_objective(n, m, p, cams, x, w, obs, feats, reproj_err, w_err); +} diff --git a/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs b/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs new file mode 100644 index 000000000000..477d900c3310 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs @@ -0,0 +1,139 @@ +use crate::BA_NCAMPARAMS; +use crate::compute_zach_weight_error; + +unsafe fn sqsum(x: *const f64, n: usize) -> f64 { + let mut sum = 0.; + for i in 0..n { + let v = unsafe { *x.add(i) }; + sum += v * v; + } + sum +} + +#[inline] +unsafe fn cross(a: *const f64, b: *const f64, out: *mut f64) { + *out.add(0) = *a.add(1) * *b.add(2) - *a.add(2) * *b.add(1); + *out.add(1) = *a.add(2) * *b.add(0) - *a.add(0) * *b.add(2); + *out.add(2) = *a.add(0) * *b.add(1) - *a.add(1) * *b.add(0); +} + +unsafe fn radial_distort(rad_params: *const f64, proj: *mut f64) { + let rsq = sqsum(proj, 2); + let l = 1. 
+ *rad_params.add(0) * rsq + *rad_params.add(1) * rsq * rsq; + *proj.add(0) = *proj.add(0) * l; + *proj.add(1) = *proj.add(1) * l; +} + +unsafe fn rodrigues_rotate_point(rot: *const f64, pt: *const f64, rotated_pt: *mut f64) { + let sqtheta = sqsum(rot, 3); + if sqtheta != 0. { + let theta = sqtheta.sqrt(); + let costheta = theta.cos(); + let sintheta = theta.sin(); + let theta_inverse = 1. / theta; + let mut w = [0.; 3]; + for i in 0..3 { + w[i] = *rot.add(i) * theta_inverse; + } + let mut w_cross_pt = [0.; 3]; + cross(w.as_ptr(), pt, w_cross_pt.as_mut_ptr()); + let tmp = (w[0] * *pt.add(0) + w[1] * *pt.add(1) + w[2] * *pt.add(2)) * (1. - costheta); + for i in 0..3 { + *rotated_pt.add(i) = *pt.add(i) * costheta + w_cross_pt[i] * sintheta + w[i] * tmp; + } + } else { + let mut rot_cross_pt = [0.; 3]; + cross(rot, pt, rot_cross_pt.as_mut_ptr()); + for i in 0..3 { + *rotated_pt.add(i) = *pt.add(i) + rot_cross_pt[i]; + } + } +} + +unsafe fn project(cam: *const f64, X: *const f64, proj: *mut f64) { + let C = cam.add(3); + let mut Xo = [0.; 3]; + let mut Xcam = [0.; 3]; + + Xo[0] = *X.add(0) - *C.add(0); + Xo[1] = *X.add(1) - *C.add(1); + Xo[2] = *X.add(2) - *C.add(2); + + rodrigues_rotate_point(cam, Xo.as_ptr(), Xcam.as_mut_ptr()); + + *proj.add(0) = Xcam[0] / Xcam[2]; + *proj.add(1) = Xcam[1] / Xcam[2]; + + radial_distort(cam.add(9), proj); + *proj.add(0) = *proj.add(0) * *cam.add(6) + *cam.add(7); + *proj.add(1) = *proj.add(1) * *cam.add(6) + *cam.add(8); +} + +#[no_mangle] +pub unsafe extern "C" fn rust_unsafe_dcompute_reproj_error( + cam: *const f64, + dcam: *mut f64, + x: *const f64, + dx: *mut f64, + w: *const f64, + wb: *mut f64, + feat: *const f64, + err: *mut f64, + derr: *mut f64, +) { + dcompute_reproj_error(cam, dcam, x, dx, w, wb, feat, err, derr); +} + + +#[autodiff( + dcompute_reproj_error, + Reverse, + Duplicated, + Duplicated, + Duplicated, + Const, + Duplicated +)] +pub unsafe fn compute_reproj_error( + cam: *const f64, + x: *const f64, + w: *const 
f64, + feat: *const f64, + err: *mut f64, +) { + let mut proj = [0.; 2]; + project(cam, x, proj.as_mut_ptr()); + *err.add(0) = *w * (proj[0] - *feat.add(0)); + *err.add(1) = *w * (proj[1] - *feat.add(1)); +} + +#[no_mangle] +unsafe extern "C" fn rust2_unsafe_ba_objective( + n: usize, + m: usize, + p: usize, + cams: *const f64, + x: *const f64, + w: *const f64, + obs: *const i32, + feats: *const f64, + reproj_err: *mut f64, + w_err: *mut f64, +) { + for i in 0..p { + let cam_idx = *obs.add(i * 2 + 0) as usize; + let pt_idx = *obs.add(i * 2 + 1) as usize; + let start = cam_idx * BA_NCAMPARAMS; + + let cam: *const f64 = cams.add(start); + let x: *const f64 = x.add(pt_idx * 3); + let w: *const f64 = w.add(i); + let feat: *const f64 = feats.add(i * 2); + let reproj_err: *mut f64 = reproj_err.add(i * 2); + compute_reproj_error(cam, x, w, feat, reproj_err); + } + + for i in 0..p { + compute_zach_weight_error(w.add(i), w_err.add(i)); + } +} From 629f87c4b0a184d52390e830c27965eaa4e25544 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 2 Jun 2024 00:40:38 -0400 Subject: [PATCH 43/88] smaller perf improvements --- .../benchmarks/ReverseMode/lstm/src/safe.rs | 38 ++++++++++++------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs index 6a43419af5bf..76c4316fab51 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs @@ -28,14 +28,17 @@ fn lstm_model( let mut gates = vec![0.0; 4 * hsize]; let gates = &mut gates[..4 * hsize]; let (a, b) = gates.split_at_mut(2 * hsize); - let ((forget, ingate), (outgate, change)) = (a.split_at_mut(hsize), b.split_at_mut(hsize)); + let (forget, ingate) = a.split_at_mut(hsize); + let (outgate, change) = b.split_at_mut(hsize); assert_eq!(weight.len(), 4 * hsize); assert_eq!(bias.len(), 4 * hsize); assert_eq!(hidden.len(), hsize); + assert_eq!(ingate.len(), hsize); + 
assert_eq!(change.len(), hsize); assert!(cell.len() >= hsize); assert!(input.len() >= hsize); - // caching input + // Using unchecked indexing here was slightly slower for some reason for i in 0..hsize { forget[i] = sigmoid(input[i] * weight[i] + bias[i]); ingate[i] = sigmoid(hidden[i] * weight[hsize + i] + bias[hsize + i]); @@ -66,33 +69,42 @@ fn lstm_predict( for i in 0..b { x2[i] = x[i] * w2[i]; } - - let mut i = 0; - while i <= 2 * l * b - 1 { - // make borrow-checker happy with non-overlapping mutable references - let (xp, s1, s2) = if i == 0 { - let (s1, s2) = s.split_at_mut(b); - (x2.as_mut(), s1, s2) - } else { + + let (s1, s2) = s.split_at_mut(b); + lstm_model( + b, + &w[0..b * 4], + &w[b * 4..2 * b * 4], + s1, + s2, + x2.as_mut(), + ); + + assert_eq!(s.len(), 2 * b * l); + assert_eq!(w.len(), 4 * b * l); + for i in 1..l { + let i = i * 2 * b; + let (xp, s1, s2) = { let tmp = &mut s[i - 2 * b..]; let (a, d) = tmp.split_at_mut(2 * b); let (d, c) = d.split_at_mut(b); - (a, d, c) }; + let (w1, w2) = w.split_at((i + b) * 4); lstm_model( b, + //&w1[i * 4..], + //&w2[0..(i + 2 * b) * 4], &w[i * 4..(i + b) * 4], &w[(i + b) * 4..(i + 2 * b) * 4], s1, s2, xp, ); - - i += 2 * b; } + let i = 2 * l * b; let xp = &s[i - 2 * b..]; for i in 0..b { From c6f44b33f91f3440f7084741974f7df23e4ec7d2 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 2 Jun 2024 01:16:23 -0400 Subject: [PATCH 44/88] small improvment --- enzyme/benchmarks/ReverseMode/lstm/src/safe.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs index 76c4316fab51..3cb5ca449747 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs @@ -6,6 +6,7 @@ fn sigmoid(x: f64) -> f64 { } // log(sum(exp(x), 2)) +#[inline] fn logsumexp(vect: &[f64]) -> f64 { let mut sum = 0.0; for &val in vect { @@ -136,18 +137,17 @@ pub(crate) fn 
lstm_objective( loss: &mut f64, ) { let mut total = 0.0; - let mut count = 0; let mut input = &sequence[..b]; let mut ypred = vec![0.0; b]; - let ypred = &mut ypred[..b]; let mut ynorm = vec![0.0; b]; - let ynorm = &mut ynorm[..b]; assert!(b > 0); - for t in (0..=(c - 1) * b - 1).step_by(b) { - lstm_predict(l, b, main_params, extra_params, state, input, ypred); + let limit = (c - 1) * b; + for j in 0..(c - 1) { + let t = j * b; + lstm_predict(l, b, main_params, extra_params, state, input, &mut ypred); let lse = logsumexp(&ypred); for i in 0..b { ynorm[i] = ypred[i] - lse; @@ -158,9 +158,9 @@ pub(crate) fn lstm_objective( total += ygold[i] * ynorm[i]; } - count += b; input = ygold; } + let count = (c - 1) * b; *loss = -total / count as f64; } From fb6df5b3ca6fc8ee7da96e2b8956193146c8c159 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 2 Jun 2024 21:25:24 -0400 Subject: [PATCH 45/88] clean up ode-real example (not building) --- .../ReverseMode/ode-real/{ode => }/Cargo.lock | 0 .../ReverseMode/ode-real/{ode => }/Cargo.toml | 5 ++--- .../ReverseMode/ode-real/Makefile.make | 18 ++++++++++++++---- .../ReverseMode/ode-real/{ode => }/src/lib.rs | 17 +++++++++++++---- 4 files changed, 29 insertions(+), 11 deletions(-) rename enzyme/benchmarks/ReverseMode/ode-real/{ode => }/Cargo.lock (100%) rename enzyme/benchmarks/ReverseMode/ode-real/{ode => }/Cargo.toml (99%) rename enzyme/benchmarks/ReverseMode/ode-real/{ode => }/src/lib.rs (89%) diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.lock b/enzyme/benchmarks/ReverseMode/ode-real/Cargo.lock similarity index 100% rename from enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.lock rename to enzyme/benchmarks/ReverseMode/ode-real/Cargo.lock diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.toml b/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml similarity index 99% rename from enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.toml rename to enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml index 
3013b597df4e..27a031a49570 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml @@ -5,10 +5,11 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[dependencies] + [lib] crate-type = ["lib"] - [profile.release] lto = "fat" opt-level = 3 @@ -17,5 +18,3 @@ opt-level = 3 [profile.dev] lto = "fat" - -[dependencies] diff --git a/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make b/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make index 3dd680e5a1c4..16033d158a3a 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make @@ -1,24 +1,34 @@ -# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%loadEnzyme" make -B ode-raw.ll ode-opt.ll results.txt VERBOSE=1 -f %s +# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadEnzyme %enzyme" make -B ode-raw.ll ode-opt.ll results.txt VERBOSE=1 -f %s .PHONY: clean +dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..) 
+ clean: rm -f *.ll *.o results.txt +$(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a: src/lib.rs Cargo.toml + cargo +enzyme rustc --release --lib --crate-type=staticlib + %-unopt.ll: %.cpp - clang++ $(BENCH) $^ -O2 -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm + clang++ $(BENCH) $^ -O2 -I /u/drehwald/prog/boost_1_81_0 -DBOOST_DIR=/u/drehwald/prog/boost_1_81_0 -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm %-raw.ll: %-unopt.ll - opt $^ $(LOAD) -enzyme -o $@ -S + @echo $(LOAD) + opt $^ $(LOAD) -o $@ -S %-opt.ll: %-raw.ll opt $^ -o $@ -S #opt $^ -O2 -o $@ -S -ode.o: ode-opt.ll +ode.o: ode-opt.ll $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a clang++ -O2 $^ -o $@ $(BENCHLINK) +#ode.o: ode-opt.ll $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a +# clang++ $(LOAD) $(BENCH) ode.cpp -I /u/drehwald/prog/boost_1_81_0 -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o ode.o -lpthread $(BENCHLINK) -lm -lode -L $(dir)/benchmarks/ReverseMode/ode/target/release/ -L /usr/lib/gcc/x86_64-linux-gnu/11 + + results.txt: ode.o ./$^ 1000 | tee $@ ./$^ 1000 >> $@ diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs b/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs similarity index 89% rename from enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs rename to enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs index 83c6d0586790..23995eaa5626 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs @@ -96,13 +96,16 @@ fn lorenz(x: &state_type, dxdt: &mut state_type, t: f64) { } #[no_mangle] -pub extern "C" fn rust_dbrusselator_2d_loop(p: *const f64, x: *const state_type, adjoint: 
*mut state_type, t: f64) -> f64 { +pub extern "C" fn rust_dbrusselator_2d_loop(p: *const f64, dp: *mut f64, x: *const state_type, dx: *mut state_type, adjoint: *mut state_type, t: f64) -> f64 { let x = unsafe { *x }; let mut adjoint = unsafe { *adjoint }; let p: [f64;3] = unsafe { *p.cast::<[f64;3]>().as_ref().unwrap() }; - let mut dp = [0.; 3]; - let mut dx1 = [0.; N * N]; - let mut dx2 = [0.; N * N]; + let mut dp: [f64;3] = unsafe { dp.cast::<[f64;3]>().as_mut().unwrap() }; + + let (mut dx1, mut dx2) = dx.split_at_mut(N * N); + //let mut dp = [0.; 3]; + //let mut dx1 = [0.; N * N]; + //let mut dx2 = [0.; N * N]; let (mut dadj1, mut dadj2) = adjoint.split_at_mut(N * N); // https://discord.com/channels/273534239310479360/273541522815713281/1236945105601040446 @@ -116,6 +119,12 @@ pub extern "C" fn rust_dbrusselator_2d_loop(p: *const f64, x: *const state_type, x2, &mut dx2, &p, &mut dp, t); dx1[0] + //brusselator_2d_loop_b(nullptr, dadjoint_inp.data(), + // nullptr, dadjoint_inp.data() + N * N, + // x.data(), dx.data(), + // x.data() + N * N, dx.data() + N * N, + // p, dp, + // t); } From a6d4a7c7d55f6da1e057ec1eb6e26cc20bd992ef Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Fri, 19 Jul 2024 20:49:14 -0400 Subject: [PATCH 46/88] add ba.cpp version without restrict --- enzyme/benchmarks/ReverseMode/adbench/ba.h | 296 ++++++++---------- enzyme/benchmarks/ReverseMode/ba/ba.cpp | 118 +++---- .../benchmarks/ReverseMode/ba/ba_mayalias.h | 198 ++++++++++++ 3 files changed, 372 insertions(+), 240 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/ba/ba_mayalias.h diff --git a/enzyme/benchmarks/ReverseMode/adbench/ba.h b/enzyme/benchmarks/ReverseMode/adbench/ba.h index aa62cf2a165f..6a3f97737985 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/ba.h +++ b/enzyme/benchmarks/ReverseMode/adbench/ba.h @@ -115,84 +115,68 @@ struct BAOutput { }; extern "C" { - void ba_objective( - int n, - int m, - int p, - double const* cams, - double const* X, - double const* 
w, - int const* obs, - double const* feats, - double* reproj_err, - double* w_err - ); - - void rust2_unsafe_ba_objective(int n, int m, int p, double const *cams, - double const *X, double const *w, - int const *obs, double const *feats, - double *reproj_err, double *w_err); - - void rust2_ba_objective(int n, int m, int p, double const *cams, - double const *X, double const *w, int const *obs, - double const *feats, double *reproj_err, - double *w_err); - - void dcompute_reproj_error( - double const* cam, - double * dcam, - double const* X, - double * dX, - double const* w, - double * wb, - double const* feat, - double *err, - double *derr - ); - - void dcompute_zach_weight_error(double const* w, double* dw, double* err, double* derr); - - void compute_reproj_error_b( - double const* cam, - double * dcam, - double const* X, - double * dX, - double const* w, - double * wb, - double const* feat, - double *err, - double *derr - ); +void ba_objective_restrict(int n, int m, int p, double const *cams, + double const *X, double const *w, int const *obs, + double const *feats, double *reproj_err, + double *w_err); - void compute_zach_weight_error_b(double const* w, double* dw, double* err, double* derr); - - void adept_compute_reproj_error( - double const* cam, - double * dcam, - double const* X, - double * dX, - double const* w, - double * wb, - double const* feat, - double *err, - double *derr - ); +void ba_objective(int n, int m, int p, double const *cams, double const *X, + double const *w, int const *obs, double const *feats, + double *reproj_err, double *w_err); - void adept_compute_zach_weight_error(double const* w, double* dw, double* err, double* derr); +void rust2_unsafe_ba_objective(int n, int m, int p, double const *cams, + double const *X, double const *w, int const *obs, + double const *feats, double *reproj_err, + double *w_err); - void rust_unsafe_dcompute_reproj_error(double const *cam, double *dcam, - double const *X, double *dX, - double const *w, double 
*wb, - double const *feat, double *err, - double *derr); +void rust2_ba_objective(int n, int m, int p, double const *cams, + double const *X, double const *w, int const *obs, + double const *feats, double *reproj_err, double *w_err); - void rust_dcompute_reproj_error(double const *cam, double *dcam, +void dcompute_reproj_error_restrict(double const *cam, double *dcam, double const *X, double *dX, double const *w, double *wb, double const *feat, double *err, double *derr); - void rust_dcompute_zach_weight_error(double const* w, double* dw, double* err, double* derr); +void dcompute_zach_weight_error_restrict(double const *w, double *dw, + double *err, double *derr); + +void dcompute_reproj_error(double const *cam, double *dcam, double const *X, + double *dX, double const *w, double *wb, + double const *feat, double *err, double *derr); + +void dcompute_zach_weight_error(double const *w, double *dw, double *err, + double *derr); + +void compute_reproj_error_b(double const *cam, double *dcam, double const *X, + double *dX, double const *w, double *wb, + double const *feat, double *err, double *derr); + +void compute_zach_weight_error_b(double const *w, double *dw, double *err, + double *derr); + +void adept_compute_reproj_error(double const *cam, double *dcam, + double const *X, double *dX, double const *w, + double *wb, double const *feat, double *err, + double *derr); + +void adept_compute_zach_weight_error(double const *w, double *dw, double *err, + double *derr); + +void rust_unsafe_dcompute_reproj_error(double const *cam, double *dcam, + double const *X, double *dX, + double const *w, double *wb, + double const *feat, double *err, + double *derr); + +void rust_dcompute_reproj_error(double const *cam, double *dcam, + double const *X, double *dX, double const *w, + double *wb, double const *feat, double *err, + double *derr); + +void rust_dcompute_zach_weight_error(double const *w, double *dw, double *err, + double *derr); } void read_ba_instance(const string& fn, 
@@ -394,27 +378,6 @@ int main(const int argc, const char* argv[]) { BASparseMat(input.n, input.m, input.p) }; - //BASparseMat(this->input.n, this->input.m, this->input.p) - - /* - ba_objective( - input.n, - input.m, - input.p, - input.cams.data(), - input.X.data(), - input.w.data(), - input.obs.data(), - input.feats.data(), - result.reproj_err.data(), - result.w_err.data() - ); - - for(unsigned i=0; iinput.n, this->input.m, this->input.p) - - /* - ba_objective( - input.n, - input.m, - input.p, - input.cams.data(), - input.X.data(), - input.w.data(), - input.obs.data(), - input.feats.data(), - result.reproj_err.data(), - result.w_err.data() - ); - - for(unsigned i=0; i(input, result); + calculate_jacobian(input, result); gettimeofday(&end, NULL); printf("Adept combined %0.6f\n", tdiff(&start, &end)); json adept; adept["name"] = "Adept combined"; adept["runtime"] = tdiff(&start, &end); - for(unsigned i=0; i<5; i++) { + for (unsigned i = 0; i < 5; i++) { printf("%f ", result.J.vals[i]); adept["result"].push_back(result.J.vals[i]); } printf("\n"); test_suite["tools"].push_back(adept); } - } { struct BAInput input; - read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, input.X, input.w, input.obs, input.feats); + read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, + input.X, input.w, input.obs, input.feats); - struct BAOutput result = { - std::vector(2 * input.p), - std::vector(input.p), - BASparseMat(input.n, input.m, input.p) - }; + struct BAOutput result = {std::vector(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p)}; - //BASparseMat(this->input.n, this->input.m, this->input.p) + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme restrict c++ combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme restrict c++ combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i 
= 0; i < 5; i++) { + printf("%f ", result.J.vals[i]); + enzyme["result"].push_back(result.J.vals[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } - /* - ba_objective( - input.n, - input.m, - input.p, - input.cams.data(), - input.X.data(), - input.w.data(), - input.obs.data(), - input.feats.data(), - result.reproj_err.data(), - result.w_err.data() - ); + { - for(unsigned i=0; i(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p)}; { struct timeval start, end; gettimeofday(&start, NULL); - calculate_jacobian(input, result); + calculate_jacobian( + input, result); gettimeofday(&end, NULL); - printf("Enzyme c++ combined %0.6f\n", tdiff(&start, &end)); + printf("Enzyme aliasing c++ combined %0.6f\n", tdiff(&start, &end)); json enzyme; enzyme["name"] = "Enzyme c++ combined"; enzyme["runtime"] = tdiff(&start, &end); - for(unsigned i=0; i<5; i++) { + for (unsigned i = 0; i < 5; i++) { printf("%f ", result.J.vals[i]); enzyme["result"].push_back(result.J.vals[i]); } printf("\n"); test_suite["tools"].push_back(enzyme); } - } { struct BAInput input; - read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, input.X, input.w, input.obs, input.feats); + read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, + input.X, input.w, input.obs, input.feats); - struct BAOutput result = { - std::vector(2 * input.p), - std::vector(input.p), - BASparseMat(input.n, input.m, input.p) - }; + struct BAOutput result = {std::vector(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p)}; + + { + struct timeval start, end; + gettimeofday(&start, NULL); + ba_objective_restrict(input.n, input.m, input.p, input.cams.data(), + input.X.data(), input.w.data(), input.obs.data(), + input.feats.data(), result.reproj_err.data(), + result.w_err.data()); + gettimeofday(&end, NULL); + printf("primal restrict c++ t=%0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "primal restrict c++"; + 
enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = 0; i < 5; i++) { + printf("%f ", result.reproj_err[i]); + enzyme["result"].push_back(result.reproj_err[i]); + } + for (unsigned i = 0; i < 5; i++) { + printf("%f ", result.w_err[i]); + enzyme["result"].push_back(result.w_err[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } + + { + struct BAInput input; + read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, + input.X, input.w, input.obs, input.feats); + struct BAOutput result = {std::vector(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p)}; { struct timeval start, end; gettimeofday(&start, NULL); - ba_objective( - input.n, - input.m, - input.p, - input.cams.data(), - input.X.data(), - input.w.data(), - input.obs.data(), - input.feats.data(), - result.reproj_err.data(), - result.w_err.data() - ); + ba_objective(input.n, input.m, input.p, input.cams.data(), input.X.data(), + input.w.data(), input.obs.data(), input.feats.data(), + result.reproj_err.data(), result.w_err.data()); gettimeofday(&end, NULL); - printf("primal c++ t=%0.6f\n", tdiff(&start, &end)); + printf("primal aliasing c++ t=%0.6f\n", tdiff(&start, &end)); json enzyme; - enzyme["name"] = "primal c++"; + enzyme["name"] = "primal aliasing c++"; enzyme["runtime"] = tdiff(&start, &end); for(unsigned i=0; i<5; i++) { printf("%f ", result.reproj_err[i]); diff --git a/enzyme/benchmarks/ReverseMode/ba/ba.cpp b/enzyme/benchmarks/ReverseMode/ba/ba.cpp index b71e05a0a011..602af73d8d5f 100644 --- a/enzyme/benchmarks/ReverseMode/ba/ba.cpp +++ b/enzyme/benchmarks/ReverseMode/ba/ba.cpp @@ -43,17 +43,13 @@ double sqsum(int n, double const* x) return res; } - - -void cross(double const* a, double const* b, double* out) -{ +void cross_restrict(double const *__restrict a, double const *__restrict b, + double *__restrict out) { out[0] = a[1] * b[2] - a[2] * b[1]; out[1] = a[2] * b[0] - a[0] * b[2]; out[2] = a[0] * b[1] - a[1] * b[0]; } - - /* 
===================================================================== */ /* MAIN LOGIC */ /* ===================================================================== */ @@ -68,8 +64,9 @@ void cross(double const* a, double const* b, double* out) // n = w / theta; // n_x = au_cross_matrix(n); // R = eye(3) + n_x*sin(theta) + n_x*n_x*(1 - cos(theta)); -void rodrigues_rotate_point(double const* __restrict rot, double const* __restrict pt, double *__restrict rotatedPt) -{ +void rodrigues_rotate_point_restrict(double const *__restrict rot, + double const *__restrict pt, + double *__restrict rotatedPt) { int i; double sqtheta = sqsum(3, rot); if (sqtheta != 0) @@ -87,7 +84,7 @@ void rodrigues_rotate_point(double const* __restrict rot, double const* __restri w[i] = rot[i] * theta_inverse; } - cross(w, pt, w_cross_pt); + cross_restrict(w, pt, w_cross_pt); tmp = (w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * (1. - costheta); @@ -100,7 +97,7 @@ void rodrigues_rotate_point(double const* __restrict rot, double const* __restri else { double rot_cross_pt[3]; - cross(rot, pt, rot_cross_pt); + cross_restrict(rot, pt, rot_cross_pt); for (i = 0; i < 3; i++) { @@ -109,8 +106,6 @@ void rodrigues_rotate_point(double const* __restrict rot, double const* __restri } } - - void radial_distort(double const* rad_params, double *proj) { double rsq, L; @@ -120,10 +115,8 @@ void radial_distort(double const* rad_params, double *proj) proj[1] = proj[1] * L; } - - -void project(double const* __restrict cam, double const* __restrict X, double* __restrict proj) -{ +void project_restrict(double const *__restrict cam, double const *__restrict X, + double *__restrict proj) { double const* C = &cam[3]; double Xo[3], Xcam[3]; @@ -131,7 +124,7 @@ void project(double const* __restrict cam, double const* __restrict X, double* _ Xo[1] = X[1] - C[1]; Xo[2] = X[2] - C[2]; - rodrigues_rotate_point(&cam[0], Xo, Xcam); + rodrigues_rotate_point_restrict(&cam[0], Xo, Xcam); proj[0] = Xcam[0] / Xcam[2]; proj[1] = 
Xcam[1] / Xcam[2]; @@ -142,8 +135,6 @@ void project(double const* __restrict cam, double const* __restrict X, double* _ proj[1] = proj[1] * cam[6] + cam[8]; } - - // cam: 11 camera in format [r1 r2 r3 C1 C2 C3 f u0 v0 k1 k2] // r1, r2, r3 are angle - axis rotation parameters(Rodrigues) // [C1 C2 C3]' is the camera center @@ -158,30 +149,23 @@ void project(double const* __restrict cam, double const* __restrict X, double* _ // distorted = radial_distort(projective2euclidean(Xcam), radial_parameters) // proj = distorted * f + principal_point // err = sqsum(proj - measurement) -void compute_reproj_error( - double const* __restrict cam, - double const* __restrict X, - double const* __restrict w, - double const* __restrict feat, - double * __restrict err -) -{ +void compute_reproj_error_restrict(double const *__restrict cam, + double const *__restrict X, + double const *__restrict w, + double const *__restrict feat, + double *__restrict err) { double proj[2]; - project(cam, X, proj); + project_restrict(cam, X, proj); err[0] = (*w)*(proj[0] - feat[0]); err[1] = (*w)*(proj[1] - feat[1]); } - - -void compute_zach_weight_error(double const* w, double* err) -{ +void compute_zach_weight_error_restrict(double const *__restrict w, + double *__restrict err) { *err = 1 - (*w)*(*w); } - - // n number of cameras // m number of points // p number of observations @@ -196,36 +180,23 @@ void compute_zach_weight_error(double const* w, double* err) // feats: 2*p features (x,y coordinates corresponding to observations) // reproj_err: 2*p errors of observations // w_err: p weight "error" terms -void ba_objective( - int n, - int m, - int p, - double const* cams, - double const* X, - double const* w, - int const* obs, - double const* feats, - double* reproj_err, - double* w_err -) -{ +void ba_objective_restrict(int n, int m, int p, double const *cams, + double const *X, double const *w, int const *obs, + double const *feats, double *reproj_err, + double *w_err) { int i; for (i = 0; i < p; 
i++) { int camIdx = obs[i * 2 + 0]; int ptIdx = obs[i * 2 + 1]; - compute_reproj_error( - &cams[camIdx * BA_NCAMPARAMS], - &X[ptIdx * 3], - &w[i], - &feats[i * 2], - &reproj_err[2 * i] - ); + compute_reproj_error_restrict(&cams[camIdx * BA_NCAMPARAMS], + &X[ptIdx * 3], &w[i], &feats[i * 2], + &reproj_err[2 * i]); } for (i = 0; i < p; i++) { - compute_zach_weight_error(&w[i], &w_err[i]); + compute_zach_weight_error_restrict(&w[i], &w_err[i]); } } @@ -234,32 +205,21 @@ extern int enzyme_dup; extern int enzyme_dupnoneed; void __enzyme_autodiff(...) noexcept; -void dcompute_reproj_error( - double const* cam, - double * dcam, - double const* X, - double * dX, - double const* w, - double * wb, - double const* feat, - double *err, - double *derr -) -{ - __enzyme_autodiff(compute_reproj_error, - enzyme_dup, cam, dcam, - enzyme_dup, X, dX, - enzyme_dup, w, wb, - enzyme_const, feat, - enzyme_dupnoneed, err, derr); +void dcompute_reproj_error_restrict(double const *cam, double *dcam, + double const *X, double *dX, + double const *w, double *wb, + double const *feat, double *err, + double *derr) { + __enzyme_autodiff(compute_reproj_error_restrict, enzyme_dup, cam, dcam, + enzyme_dup, X, dX, enzyme_dup, w, wb, enzyme_const, feat, + enzyme_dupnoneed, err, derr); } -void dcompute_zach_weight_error(double const* w, double* dw, double* err, double* derr) { - __enzyme_autodiff(compute_zach_weight_error, - enzyme_dup, w, dw, - enzyme_dupnoneed, err, derr); +void dcompute_zach_weight_error_restrict(double const *w, double *dw, + double *err, double *derr) { + __enzyme_autodiff(compute_zach_weight_error_restrict, enzyme_dup, w, dw, + enzyme_dupnoneed, err, derr); } - } @@ -911,3 +871,5 @@ void adept_compute_zach_weight_error(double const* w, double* dw, double* err, d *dw = aw.get_gradient(); } + +#include "ba_mayalias.h" diff --git a/enzyme/benchmarks/ReverseMode/ba/ba_mayalias.h b/enzyme/benchmarks/ReverseMode/ba/ba_mayalias.h new file mode 100644 index 000000000000..25197b52d7b2 --- 
/dev/null +++ b/enzyme/benchmarks/ReverseMode/ba/ba_mayalias.h @@ -0,0 +1,198 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +extern "C" { + +/* ===================================================================== */ +/* UTILS */ +/* ===================================================================== */ + +void cross(double const *a, double const *b, double *out) { + out[0] = a[1] * b[2] - a[2] * b[1]; + out[1] = a[2] * b[0] - a[0] * b[2]; + out[2] = a[0] * b[1] - a[1] * b[0]; +} + +/* ===================================================================== */ +/* MAIN LOGIC */ +/* ===================================================================== */ + +void compute_zach_weight_error(double const *w, double *err) { + *err = 1 - (*w) * (*w); +} + +// rot: 3 rotation parameters +// pt: 3 point to be rotated +// rotatedPt: 3 rotated point +// this is an efficient evaluation (part of +// the Ceres implementation) +// easy to understand calculation in matlab: +// theta = sqrt(sum(w. ^ 2)); +// n = w / theta; +// n_x = au_cross_matrix(n); +// R = eye(3) + n_x*sin(theta) + n_x*n_x*(1 - cos(theta)); +void rodrigues_rotate_point(double const *rot, double const *pt, + double *rotatedPt) { + int i; + double sqtheta = sqsum(3, rot); + if (sqtheta != 0) + { + double theta, costheta, sintheta, theta_inverse; + double w[3], w_cross_pt[3], tmp; + + theta = sqrt(sqtheta); + costheta = cos(theta); + sintheta = sin(theta); + theta_inverse = 1.0 / theta; + + for (i = 0; i < 3; i++) + { + w[i] = rot[i] * theta_inverse; + } + + cross(w, pt, w_cross_pt); + + tmp = (w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * + (1. 
- costheta); + + for (i = 0; i < 3; i++) + { + rotatedPt[i] = pt[i] * costheta + w_cross_pt[i] * sintheta + w[i] * tmp; + } + } + else + { + double rot_cross_pt[3]; + cross(rot, pt, rot_cross_pt); + + for (i = 0; i < 3; i++) + { + rotatedPt[i] = pt[i] + rot_cross_pt[i]; + } + } +} + +void project(double const *cam, double const *X, double *proj) { + double const* C = &cam[3]; + double Xo[3], Xcam[3]; + + Xo[0] = X[0] - C[0]; + Xo[1] = X[1] - C[1]; + Xo[2] = X[2] - C[2]; + + rodrigues_rotate_point(&cam[0], Xo, Xcam); + + proj[0] = Xcam[0] / Xcam[2]; + proj[1] = Xcam[1] / Xcam[2]; + + radial_distort(&cam[9], proj); + + proj[0] = proj[0] * cam[6] + cam[7]; + proj[1] = proj[1] * cam[6] + cam[8]; +} + +// cam: 11 camera in format [r1 r2 r3 C1 C2 C3 f u0 v0 k1 k2] +// r1, r2, r3 are angle - axis rotation parameters(Rodrigues) +// [C1 C2 C3]' is the camera center +// f is the focal length in pixels +// [u0 v0]' is the principal point +// k1, k2 are radial distortion parameters +// X: 3 point +// feats: 2 feature (x,y coordinates) +// reproj_err: 2 +// projection function: +// Xcam = R * (X - C) +// distorted = radial_distort(projective2euclidean(Xcam), radial_parameters) +// proj = distorted * f + principal_point +// err = sqsum(proj - measurement) +void compute_reproj_error(double const *cam, double const *X, double const *w, + double const *feat, double *err) { + double proj[2]; + project(cam, X, proj); + + err[0] = (*w)*(proj[0] - feat[0]); + err[1] = (*w)*(proj[1] - feat[1]); +} + + + + +// n number of cameras +// m number of points +// p number of observations +// cams: 11*n cameras in format [r1 r2 r3 C1 C2 C3 f u0 v0 k1 k2] +// r1, r2, r3 are angle - axis rotation parameters(Rodrigues) +// [C1 C2 C3]' is the camera center +// f is the focal length in pixels +// [u0 v0]' is the principal point +// k1, k2 are radial distortion parameters +// X: 3*m points +// obs: 2*p observations (pairs cameraIdx, pointIdx) +// feats: 2*p features (x,y coordinates corresponding to 
observations) +// reproj_err: 2*p errors of observations +// w_err: p weight "error" terms +void ba_objective( + int n, + int m, + int p, + double const* cams, + double const* X, + double const* w, + int const* obs, + double const* feats, + double* reproj_err, + double* w_err +) +{ + int i; + for (i = 0; i < p; i++) + { + int camIdx = obs[i * 2 + 0]; + int ptIdx = obs[i * 2 + 1]; + compute_reproj_error( + &cams[camIdx * BA_NCAMPARAMS], + &X[ptIdx * 3], + &w[i], + &feats[i * 2], + &reproj_err[2 * i] + ); + } + + for (i = 0; i < p; i++) + { + compute_zach_weight_error(&w[i], &w_err[i]); + } +} + +extern int enzyme_const; +extern int enzyme_dup; +extern int enzyme_dupnoneed; +void __enzyme_autodiff(...) noexcept; + +void dcompute_reproj_error( + double const* cam, + double * dcam, + double const* X, + double * dX, + double const* w, + double * wb, + double const* feat, + double *err, + double *derr +) +{ + __enzyme_autodiff(compute_reproj_error, + enzyme_dup, cam, dcam, + enzyme_dup, X, dX, + enzyme_dup, w, wb, + enzyme_const, feat, + enzyme_dupnoneed, err, derr); +} + +void dcompute_zach_weight_error(double const* w, double* dw, double* err, double* derr) { + __enzyme_autodiff(compute_zach_weight_error, + enzyme_dup, w, dw, + enzyme_dupnoneed, err, derr); +} + +} From d08142b3f4a68bcfd942c58fe059e85c55884fff Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Fri, 19 Jul 2024 21:00:13 -0400 Subject: [PATCH 47/88] add gmm version without restrict --- enzyme/benchmarks/ReverseMode/adbench/gmm.h | 148 +++++++++++------- enzyme/benchmarks/ReverseMode/gmm/gmm.cpp | 49 +++--- .../benchmarks/ReverseMode/gmm/gmm_mayalias.h | 64 ++++++++ 3 files changed, 177 insertions(+), 84 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/gmm/gmm_mayalias.h diff --git a/enzyme/benchmarks/ReverseMode/adbench/gmm.h b/enzyme/benchmarks/ReverseMode/adbench/gmm.h index 45d589c7ae75..91b2220bf950 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/gmm.h +++ 
b/enzyme/benchmarks/ReverseMode/adbench/gmm.h @@ -33,52 +33,50 @@ struct GMMParameters { }; extern "C" { -void gmm_objective( - int d, - int k, - int n, - double const* alphas, - double const* means, - double const* icf, - double const* x, - Wishart wishart, - double* err -); - void dgmm_objective(int d, int k, int n, const double *alphas, double * - alphasb, const double *means, double *meansb, const double *icf, - double *icfb, const double *x, Wishart wishart, double *err, double * - errb); - - void gmm_objective_b(int d, int k, int n, const double *alphas, double * - alphasb, const double *means, double *meansb, const double *icf, - double *icfb, const double *x, Wishart wishart, double *err, double * - errb); - - void adept_dgmm_objective(int d, int k, int n, const double *alphas, double * - alphasb, const double *means, double *meansb, const double *icf, - double *icfb, const double *x, Wishart wishart, double *err, double * - errb); - - void rust_unsafe_dgmm_objective(int d, int k, int n, const double *alphas, - double *alphasb, const double *means, - double *meansb, const double *icf, - double *icfb, const double *x, - Wishart &wishart, double *err, - double *errb); - - void rust_unsafe_gmm_objective(int d, int k, int n, const double *alphas, - const double *means, const double *icf, - const double *x, Wishart &wishart, - double *err); - - void rust_dgmm_objective(int d, int k, int n, const double *alphas, double * - alphasb, const double *means, double *meansb, const double *icf, - double *icfb, const double *x, Wishart &wishart, double *err, double * - errb); - - void rust_gmm_objective(int d, int k, int n, const double *alphas, - const double *means, const double *icf, - const double *x, Wishart &wishart, double *err); +void gmm_objective(int d, int k, int n, double const *alphas, + double const *means, double const *icf, double const *x, + Wishart wishart, double *err); +void gmm_objective_restrict(int d, int k, int n, double const *alphas, + double 
const *means, double const *icf, + double const *x, Wishart wishart, double *err); +void dgmm_objective_restrict(int d, int k, int n, const double *alphas, + double *alphasb, const double *means, + double *meansb, const double *icf, double *icfb, + const double *x, Wishart wishart, double *err, + double *errb); +void dgmm_objective(int d, int k, int n, const double *alphas, double *alphasb, + const double *means, double *meansb, const double *icf, + double *icfb, const double *x, Wishart wishart, double *err, + double *errb); + +void gmm_objective_b(int d, int k, int n, const double *alphas, double *alphasb, + const double *means, double *meansb, const double *icf, + double *icfb, const double *x, Wishart wishart, + double *err, double *errb); + +void adept_dgmm_objective(int d, int k, int n, const double *alphas, + double *alphasb, const double *means, double *meansb, + const double *icf, double *icfb, const double *x, + Wishart wishart, double *err, double *errb); + +void rust_unsafe_dgmm_objective(int d, int k, int n, const double *alphas, + double *alphasb, const double *means, + double *meansb, const double *icf, double *icfb, + const double *x, Wishart &wishart, double *err, + double *errb); + +void rust_unsafe_gmm_objective(int d, int k, int n, const double *alphas, + const double *means, const double *icf, + const double *x, Wishart &wishart, double *err); + +void rust_dgmm_objective(int d, int k, int n, const double *alphas, + double *alphasb, const double *means, double *meansb, + const double *icf, double *icfb, const double *x, + Wishart &wishart, double *err, double *errb); + +void rust_gmm_objective(int d, int k, int n, const double *alphas, + const double *means, const double *icf, const double *x, + Wishart &wishart, double *err); } void read_gmm_instance(const string& fn, @@ -302,14 +300,44 @@ int main(const int argc, const char* argv[]) { struct GMMOutput result = { 0, std::vector(Jcols) }; + { + struct timeval start, end; + gettimeofday(&start, 
NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme c++ restrict combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme restrict combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } + + { + + struct GMMInput input; + read_gmm_instance("data/" + path, &input.d, &input.k, &input.n, + input.alphas, input.means, input.icf, input.x, + input.wishart, params.replicate_point); + + int Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; + + struct GMMOutput result = {0, std::vector(Jcols)}; + { struct timeval start, end; gettimeofday(&start, NULL); calculate_jacobian(input, result); gettimeofday(&end, NULL); - printf("Enzyme c++ combined %0.6f\n", tdiff(&start, &end)); + printf("Enzyme c++ mayalias combined %0.6f\n", tdiff(&start, &end)); json enzyme; - enzyme["name"] = "Enzyme combined"; + enzyme["name"] = "Enzyme mayalias combined"; enzyme["runtime"] = tdiff(&start, &end); for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); i++) { @@ -319,9 +347,8 @@ int main(const int argc, const char* argv[]) { printf("\n"); test_suite["tools"].push_back(enzyme); } - } - + { struct GMMInput input; @@ -337,10 +364,25 @@ int main(const int argc, const char* argv[]) { gettimeofday(&start, NULL); auto res = primal(input); gettimeofday(&end, NULL); - printf("c++ primal combined t=%0.6f, err=%f\n", tdiff(&start, &end), res); - + printf("c++ primal mayalias combined t=%0.6f, err=%f\n", + tdiff(&start, &end), res); + + json primal; + primal["name"] = "C++ primal mayalias"; + primal["runtime"] = tdiff(&start, &end); + primal["result"].push_back(res); + test_suite["tools"].push_back(primal); + } + { + struct timeval start, end; + gettimeofday(&start, NULL); + auto res = 
primal(input); + gettimeofday(&end, NULL); + printf("c++ primal restrict combined t=%0.6f, err=%f\n", + tdiff(&start, &end), res); + json primal; - primal["name"] = "C++ primal"; + primal["name"] = "C++ primal restrict"; primal["runtime"] = tdiff(&start, &end); primal["result"].push_back(res); test_suite["tools"].push_back(primal); diff --git a/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp b/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp index 866059217b96..37fa90574157 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp +++ b/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp @@ -202,21 +202,13 @@ void Qtimesx( } } - - -void gmm_objective( - int d, - int k, - int n, - double const* __restrict alphas, - double const* __restrict means, - double const* __restrict icf, - double const* __restrict x, - Wishart wishart, - double* __restrict err -) -{ - #define int int64_t +void gmm_objective_restrict(int d, int k, int n, + double const *__restrict alphas, + double const *__restrict means, + double const *__restrict icf, + double const *__restrict x, Wishart wishart, + double *__restrict err) { +#define int int64_t int ix, ik; const double CONSTANT = -n * d * 0.5 * log(2 * PI); int icf_sz = d * (d + 1) / 2; @@ -265,23 +257,16 @@ extern int enzyme_dupnoneed; void __enzyme_autodiff(...) 
noexcept; // * tapenade -b -o gmm_tapenade -head "gmm_objective(err)/(alphas means icf)" gmm.c -void dgmm_objective(int d, int k, int n, const double *alphas, double * - alphasb, const double *means, double *meansb, const double *icf, - double *icfb, const double *x, Wishart wishart, double *err, double * - errb) { - __enzyme_autodiff( - gmm_objective, - enzyme_const, d, - enzyme_const, k, - enzyme_const, n, - enzyme_dup, alphas, alphasb, - enzyme_dup, means, meansb, - enzyme_dup, icf, icfb, - enzyme_const, x, - enzyme_const, wishart, - enzyme_dupnoneed, err, errb); +void dgmm_objective_restrict(int d, int k, int n, const double *alphas, + double *alphasb, const double *means, + double *meansb, const double *icf, double *icfb, + const double *x, Wishart wishart, double *err, + double *errb) { + __enzyme_autodiff(gmm_objective_restrict, enzyme_const, d, enzyme_const, k, + enzyme_const, n, enzyme_dup, alphas, alphasb, enzyme_dup, + means, meansb, enzyme_dup, icf, icfb, enzyme_const, x, + enzyme_const, wishart, enzyme_dupnoneed, err, errb); } - } @@ -1050,3 +1035,5 @@ void adept_dgmm_objective(int d, int k, int n, const double *alphas, double * delete[] ameans; delete[] aicf; } + +#include "gmm_mayalias.h" diff --git a/enzyme/benchmarks/ReverseMode/gmm/gmm_mayalias.h b/enzyme/benchmarks/ReverseMode/gmm/gmm_mayalias.h new file mode 100644 index 000000000000..91e207fbcceb --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmm/gmm_mayalias.h @@ -0,0 +1,64 @@ +void gmm_objective(int d, int k, int n, double const *alphas, + double const *means, double const *icf, double const *x, + Wishart wishart, double *err) { +#define int int64_t + int ix, ik; + const double CONSTANT = -n * d * 0.5 * log(2 * PI); + int icf_sz = d * (d + 1) / 2; + + double *Qdiags = (double *)malloc(d * k * sizeof(double)); + double *sum_qs = (double *)malloc(k * sizeof(double)); + double *xcentered = (double *)malloc(d * sizeof(double)); + double *Qxcentered = (double *)malloc(d * sizeof(double)); + 
double *main_term = (double *)malloc(k * sizeof(double)); + + preprocess_qs(d, k, icf, &sum_qs[0], &Qdiags[0]); + + double slse = 0.; + for (ix = 0; ix < n; ix++) { + for (ik = 0; ik < k; ik++) { + subtract(d, &x[ix * d], &means[ik * d], &xcentered[0]); + Qtimesx(d, &Qdiags[ik * d], &icf[ik * icf_sz + d], &xcentered[0], + &Qxcentered[0]); + // two caches for qxcentered at idx 0 and at arbitrary index + main_term[ik] = alphas[ik] + sum_qs[ik] - 0.5 * sqnorm(d, &Qxcentered[0]); + } + + // storing cmp for max of main_term + // 2 x (0 and arbitrary) storing sub to exp + // storing sum for use in log + slse = slse + log_sum_exp(k, &main_term[0]); + } + + // storing cmp of alphas + double lse_alphas = log_sum_exp(k, alphas); + + *err = CONSTANT + slse - n * lse_alphas + + log_wishart_prior(d, k, wishart, &sum_qs[0], &Qdiags[0], icf); + + free(Qdiags); + free(sum_qs); + free(xcentered); + free(Qxcentered); + free(main_term); +#undef int +} + +// * tapenade -b -o gmm_tapenade -head "gmm_objective(err)/(alphas means icf)" gmm.c +void dgmm_objective(int d, int k, int n, const double *alphas, double * + alphasb, const double *means, double *meansb, const double *icf, + double *icfb, const double *x, Wishart wishart, double *err, double * + errb) { + __enzyme_autodiff( + gmm_objective, + enzyme_const, d, + enzyme_const, k, + enzyme_const, n, + enzyme_dup, alphas, alphasb, + enzyme_dup, means, meansb, + enzyme_dup, icf, icfb, + enzyme_const, x, + enzyme_const, wishart, + enzyme_dupnoneed, err, errb); +} + From d6b35b34b68d952ef0766ce2c7ec9b2ce3e78f9a Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Fri, 19 Jul 2024 21:06:44 -0400 Subject: [PATCH 48/88] add lstm version without restrict --- enzyme/benchmarks/ReverseMode/adbench/lstm.h | 55 +++++- enzyme/benchmarks/ReverseMode/lstm/lstm.cpp | 86 +++------ .../ReverseMode/lstm/lstm_mayalias.h | 175 ++++++++++++++++++ 3 files changed, 251 insertions(+), 65 deletions(-) create mode 100644 
enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h diff --git a/enzyme/benchmarks/ReverseMode/adbench/lstm.h b/enzyme/benchmarks/ReverseMode/adbench/lstm.h index 7472bf37beb2..6318f5077edc 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/lstm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/lstm.h @@ -60,6 +60,12 @@ void dlstm_objective(int l, int c, int b, double const *main_params, double *dextra_params, double *state, double const *sequence, double *loss, double *dloss); +void dlstm_objective_restrict(int l, int c, int b, double const *main_params, + double *dmain_params, double const *extra_params, + double *dextra_params, double *state, + double const *sequence, double *loss, + double *dloss); + void lstm_objective_b(int l, int c, int b, const double *main_params, double *main_paramsb, const double *extra_params, double *extra_paramsb, double *state, @@ -297,25 +303,58 @@ int main(const int argc, const char* argv[]) { int Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = { 0, std::vector(Jcols) }; + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme restrict combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme restrict combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + test_suite["tools"].push_back(enzyme); + + printf("\n"); + } + } + + { + + struct LSTMInput input = {}; + + // Read instance + read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, + input.main_params, input.extra_params, input.state, + input.sequence); + + std::vector state = std::vector(input.state.size()); + + int Jcols = 8 * input.l * input.b + 3 * input.b; + struct LSTMOutput result = {0, std::vector(Jcols)}; + { struct timeval start, end; gettimeofday(&start, 
NULL); calculate_jacobian(input, result); gettimeofday(&end, NULL); - printf("Enzyme combined %0.6f\n", tdiff(&start, &end)); + printf("Enzyme mayalias combined %0.6f\n", tdiff(&start, &end)); json enzyme; - enzyme["name"] = "Enzyme combined"; - enzyme["runtime"] = tdiff(&start, &end); - for (unsigned i = result.gradient.size() - 5; - i < result.gradient.size(); i++) { - printf("%f ", result.gradient[i]); - enzyme["result"].push_back(result.gradient[i]); + enzyme["name"] = "Enzyme mayalias combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); } test_suite["tools"].push_back(enzyme); printf("\n"); } - } { diff --git a/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp b/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp index dbbc9929a7cc..e643efd738a3 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp +++ b/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp @@ -50,15 +50,10 @@ double logsumexp(double const* vect, int sz) // LSTM OBJECTIVE // The LSTM model -void lstm_model( - int hsize, - double const* __restrict weight, - double const* __restrict bias, - double* __restrict hidden, - double* __restrict cell, - double const* __restrict input -) -{ +void lstm_model_restrict(int hsize, double const *__restrict weight, + double const *__restrict bias, + double *__restrict hidden, double *__restrict cell, + double const *__restrict input) { // TODO NOTE THIS //__builtin_assume(hsize > 0); @@ -94,16 +89,9 @@ void lstm_model( } // Predict LSTM output given an input -void lstm_predict( - int l, - int b, - double const* __restrict w, - double const* __restrict w2, - double* __restrict s, - double const* __restrict x, - double* __restrict x2 -) -{ +void lstm_predict_restrict(int l, int b, double const *__restrict w, + double const *__restrict w2, double *__restrict s, + double const *__restrict x, double *__restrict x2) 
{ int i; for (i = 0; i < b; i++) { @@ -113,7 +101,8 @@ void lstm_predict( double* xp = x2; for (i = 0; i <= 2 * l * b - 1; i += 2 * b) { - lstm_model(b, &(w[i * 4]), &(w[(i + b) * 4]), &(s[i]), &(s[i + b]), xp); + lstm_model_restrict(b, &(w[i * 4]), &(w[(i + b) * 4]), &(s[i]), + &(s[i + b]), xp); xp = &(s[i]); } @@ -124,17 +113,12 @@ void lstm_predict( } // LSTM objective (loss function) -void lstm_objective( - int l, - int c, - int b, - double const* __restrict main_params, - double const* __restrict extra_params, - double* __restrict state, - double const* __restrict sequence, - double* __restrict loss -) -{ +void lstm_objective_restrict(int l, int c, int b, + double const *__restrict main_params, + double const *__restrict extra_params, + double *__restrict state, + double const *__restrict sequence, + double *__restrict loss) { int i, t; double total = 0.0; int count = 0; @@ -147,7 +131,8 @@ void lstm_objective( __builtin_assume(b>0); for (t = 0; t <= (c - 1) * b - 1; t += b) { - lstm_predict(l, b, main_params, extra_params, state, input, ypred); + lstm_predict_restrict(l, b, main_params, extra_params, state, input, + ypred); lse = logsumexp(ypred, b); for (i = 0; i < b; i++) { @@ -177,32 +162,17 @@ void __enzyme_autodiff(...) 
noexcept; // * tapenade -b -o lstm_tapenade -head "lstm_objective(loss)/(main_params extra_params)" lstm.c -void dlstm_objective( - int l, - int c, - int b, - double const* main_params, - double* dmain_params, - double const* extra_params, - double* dextra_params, - double* state, - double const* sequence, - double* loss, - double* dloss -) -{ - __enzyme_autodiff(lstm_objective, - enzyme_const, l, - enzyme_const, c, - enzyme_const, b, - enzyme_dup, main_params, dmain_params, - enzyme_dup, extra_params, dextra_params, - enzyme_const, state, - enzyme_const, sequence, - enzyme_dupnoneed, loss, dloss - ); +void dlstm_objective_restrict(int l, int c, int b, double const *main_params, + double *dmain_params, double const *extra_params, + double *dextra_params, double *state, + double const *sequence, double *loss, + double *dloss) { + __enzyme_autodiff(lstm_objective_restrict, enzyme_const, l, enzyme_const, c, + enzyme_const, b, enzyme_dup, main_params, dmain_params, + enzyme_dup, extra_params, dextra_params, enzyme_const, + state, enzyme_const, sequence, enzyme_dupnoneed, loss, + dloss); } - } @@ -728,3 +698,5 @@ void adept_dlstm_objective(int l, int c, int b, const double *main_params, doubl } #endif + +#include "lstm_mayalias.h" diff --git a/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h b/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h new file mode 100644 index 000000000000..d2bbdb631224 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h @@ -0,0 +1,175 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +/* + * File "lstm_b_tapenade_generated.c" is generated by Tapenade 3.14 (r7259) from this file. 
+ * To reproduce such a generation you can use Tapenade CLI + * (can be downloaded from http://www-sop.inria.fr/tropics/tapenade/downloading.html) + * + * After installing use the next command to generate a file: + * + * tapenade -b -o lstm_tapenade -head "lstm_objective(loss)/(main_params extra_params)" lstm.c + * + * This will produce a file "lstm_tapenade_b.c" which content will be the same as the content of the file "lstm_b_tapenade_generated.c", + * except one-line header. Moreover a log-file "lstm_tapenade_b.msg" will be produced. + * + * NOTE: the code in "lstm_b_tapenade_generated.c" is wrong and won't work. + * REPAIRED SOURCE IS STORED IN THE FILE "lstm_b.c". + * You can either use diff tool or read "lstm_b.c" header to figure out what changes was performed to fix the code. + * + * NOTE: you can also use Tapenade web server (http://tapenade.inria.fr:8080/tapenade/index.jsp) + * for generating but the result can be slightly different. + */ + +// #include "../adbench/lstm.h" + +extern "C" { +// #include "lstm.h" + +// UTILS +// Sigmoid on scalar +// double sigmoid(double x) +//{ +// return 1.0 / (1.0 + exp(-x)); +//} +// +//// log(sum(exp(x), 2)) +// double logsumexp(double const* vect, int sz) +//{ +// double sum = 0.0; +// int i; +// +// for (i = 0; i < sz; i++) +// { +// sum += exp(vect[i]); +// } +// +// sum += 2; +// return log(sum); +// } + +// LSTM OBJECTIVE +// The LSTM model +void lstm_model(int hsize, double const *weight, double const *bias, + double *hidden, double *cell, double const *input) { + // TODO NOTE THIS + //__builtin_assume(hsize > 0); + + double *gates = (double *)malloc(4 * hsize * sizeof(double)); + double *forget = &(gates[0]); + double *ingate = &(gates[hsize]); + double *outgate = &(gates[2 * hsize]); + double *change = &(gates[3 * hsize]); + + int i; + // caching input + // hidden (needed) + for (i = 0; i < hsize; i++) { + forget[i] = sigmoid(input[i] * weight[i] + bias[i]); + ingate[i] = sigmoid(hidden[i] * weight[hsize + i] + 
bias[hsize + i]); + outgate[i] = + sigmoid(input[i] * weight[2 * hsize + i] + bias[2 * hsize + i]); + change[i] = tanh(hidden[i] * weight[3 * hsize + i] + bias[3 * hsize + i]); + } + + // caching cell (needed) + for (i = 0; i < hsize; i++) { + cell[i] = cell[i] * forget[i] + ingate[i] * change[i]; + } + + for (i = 0; i < hsize; i++) { + hidden[i] = outgate[i] * tanh(cell[i]); + } + + free(gates); +} + +// Predict LSTM output given an input +void lstm_predict(int l, int b, double const *w, double const *w2, double *s, + double const *x, double *x2) { + int i; + for (i = 0; i < b; i++) { + x2[i] = x[i] * w2[i]; + } + + double *xp = x2; + for (i = 0; i <= 2 * l * b - 1; i += 2 * b) { + lstm_model(b, &(w[i * 4]), &(w[(i + b) * 4]), &(s[i]), &(s[i + b]), xp); + xp = &(s[i]); + } + + for (i = 0; i < b; i++) { + x2[i] = xp[i] * w2[b + i] + w2[2 * b + i]; + } +} + +// LSTM objective (loss function) +void lstm_objective(int l, int c, int b, double const *main_params, + double const *extra_params, double *state, + double const *sequence, double *loss) { + int i, t; + double total = 0.0; + int count = 0; + const double *input = &(sequence[0]); + double *ypred = (double *)malloc(b * sizeof(double)); + double *ynorm = (double *)malloc(b * sizeof(double)); + const double *ygold; + double lse; + + __builtin_assume(b > 0); + for (t = 0; t <= (c - 1) * b - 1; t += b) { + lstm_predict(l, b, main_params, extra_params, state, input, ypred); + lse = logsumexp(ypred, b); + for (i = 0; i < b; i++) { + ynorm[i] = ypred[i] - lse; + } + + ygold = &(sequence[t + b]); + for (i = 0; i < b; i++) { + total += ygold[i] * ynorm[i]; + } + + count += b; + input = ygold; + } + + *loss = -total / count; + + free(ypred); + free(ynorm); +} + +extern int enzyme_const; +extern int enzyme_dup; +extern int enzyme_dupnoneed; +void __enzyme_autodiff(...) 
noexcept; + +// * tapenade -b -o lstm_tapenade -head "lstm_objective(loss)/(main_params extra_params)" lstm.c + +void dlstm_objective( + int l, + int c, + int b, + double const* main_params, + double* dmain_params, + double const* extra_params, + double* dextra_params, + double* state, + double const* sequence, + double* loss, + double* dloss +) +{ + __enzyme_autodiff(lstm_objective, + enzyme_const, l, + enzyme_const, c, + enzyme_const, b, + enzyme_dup, main_params, dmain_params, + enzyme_dup, extra_params, dextra_params, + enzyme_const, state, + enzyme_const, sequence, + enzyme_dupnoneed, loss, dloss + ); +} + +} From 55a76b9ecaafb1aa7e66e3dc757376118af1b610 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 22 Jul 2024 17:16:43 -0400 Subject: [PATCH 49/88] Revert "smaller perf improvements" This reverts commit 629f87c4b0a184d52390e830c27965eaa4e25544. --- .../benchmarks/ReverseMode/lstm/src/safe.rs | 38 +++++++------------ 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs index 3cb5ca449747..ea9e71a67560 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs @@ -29,17 +29,14 @@ fn lstm_model( let mut gates = vec![0.0; 4 * hsize]; let gates = &mut gates[..4 * hsize]; let (a, b) = gates.split_at_mut(2 * hsize); - let (forget, ingate) = a.split_at_mut(hsize); - let (outgate, change) = b.split_at_mut(hsize); + let ((forget, ingate), (outgate, change)) = (a.split_at_mut(hsize), b.split_at_mut(hsize)); assert_eq!(weight.len(), 4 * hsize); assert_eq!(bias.len(), 4 * hsize); assert_eq!(hidden.len(), hsize); - assert_eq!(ingate.len(), hsize); - assert_eq!(change.len(), hsize); assert!(cell.len() >= hsize); assert!(input.len() >= hsize); - // Using unchecked indexing here was slightly slower for some reason + // caching input for i in 0..hsize { forget[i] = sigmoid(input[i] * weight[i] + bias[i]); 
ingate[i] = sigmoid(hidden[i] * weight[hsize + i] + bias[hsize + i]); @@ -70,42 +67,33 @@ fn lstm_predict( for i in 0..b { x2[i] = x[i] * w2[i]; } - - let (s1, s2) = s.split_at_mut(b); - lstm_model( - b, - &w[0..b * 4], - &w[b * 4..2 * b * 4], - s1, - s2, - x2.as_mut(), - ); - - assert_eq!(s.len(), 2 * b * l); - assert_eq!(w.len(), 4 * b * l); - for i in 1..l { - let i = i * 2 * b; - let (xp, s1, s2) = { + + let mut i = 0; + while i <= 2 * l * b - 1 { + // make borrow-checker happy with non-overlapping mutable references + let (xp, s1, s2) = if i == 0 { + let (s1, s2) = s.split_at_mut(b); + (x2.as_mut(), s1, s2) + } else { let tmp = &mut s[i - 2 * b..]; let (a, d) = tmp.split_at_mut(2 * b); let (d, c) = d.split_at_mut(b); + (a, d, c) }; - let (w1, w2) = w.split_at((i + b) * 4); lstm_model( b, - //&w1[i * 4..], - //&w2[0..(i + 2 * b) * 4], &w[i * 4..(i + b) * 4], &w[(i + b) * 4..(i + 2 * b) * 4], s1, s2, xp, ); + + i += 2 * b; } - let i = 2 * l * b; let xp = &s[i - 2 * b..]; for i in 0..b { From de87081b328c07bd09a94b5185d52e1c7d49eeef Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 22 Jul 2024 18:26:53 -0400 Subject: [PATCH 50/88] adding lstm primal cxx overhead benchmark --- enzyme/benchmarks/ReverseMode/adbench/lstm.h | 160 ++++++++++++++---- enzyme/benchmarks/ReverseMode/lstm/lstm.cpp | 22 +-- .../ReverseMode/lstm/lstm_mayalias.h | 41 ++--- 3 files changed, 153 insertions(+), 70 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/lstm.h b/enzyme/benchmarks/ReverseMode/adbench/lstm.h index 6318f5077edc..a24c39132215 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/lstm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/lstm.h @@ -49,16 +49,25 @@ void rust_safe_lstm_objective(int l, int c, int b, double const *main_params, double const *extra_params, double *state, double const *sequence, double *loss); +void cxx_restrict_lstm_objective(int l, int c, int b, double const *main_params, + double const *extra_params, double *state, + double const 
*sequence, double *loss); + +void cxx_mayalias_lstm_objective(int l, int c, int b, double const *main_params, + double const *extra_params, double *state, + double const *sequence, double *loss); + void rust_safe_dlstm_objective(int l, int c, int b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double *state, double const *sequence, double *loss, double *dloss); -void dlstm_objective(int l, int c, int b, double const *main_params, - double *dmain_params, double const *extra_params, - double *dextra_params, double *state, - double const *sequence, double *loss, double *dloss); +void dlstm_objective_mayalias(int l, int c, int b, double const *main_params, + double *dmain_params, double const *extra_params, + double *dextra_params, double *state, + double const *sequence, double *loss, + double *dloss); void dlstm_objective_restrict(int l, int c, int b, double const *main_params, double *dmain_params, double const *extra_params, @@ -187,6 +196,28 @@ void calculate_jacobian(struct LSTMInput &input, struct LSTMOutput &result) } } +double calculate_mayalias_primal(struct LSTMInput &input) { + double loss = 0.0; + for (int i = 0; i < 100; i++) { + cxx_mayalias_lstm_objective( + input.l, input.c, input.b, input.main_params.data(), + input.extra_params.data(), input.state.data(), + input.sequence.data(), &loss); + } + return loss; +} + +double calculate_restrict_primal(struct LSTMInput &input) { + double loss = 0.0; + for (int i = 0; i < 100; i++) { + cxx_restrict_lstm_objective( + input.l, input.c, input.b, input.main_params.data(), + input.extra_params.data(), input.state.data(), + input.sequence.data(), &loss); + } + return loss; +} + double calculate_unsafe_primal(struct LSTMInput &input) { double loss = 0.0; for (int i = 0; i < 100; i++) { @@ -257,38 +288,39 @@ int main(const int argc, const char* argv[]) { } - { + //{ - struct LSTMInput input = {}; + // struct LSTMInput input = {}; - // Read instance - 
read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, input.main_params, input.extra_params, input.state, - input.sequence); + //// Read instance + // read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, + // input.main_params, input.extra_params, input.state, + // input.sequence); - std::vector state = std::vector(input.state.size()); + // std::vector state = std::vector(input.state.size()); - int Jcols = 8 * input.l * input.b + 3 * input.b; - struct LSTMOutput result = { 0, std::vector(Jcols) }; + // int Jcols = 8 * input.l * input.b + 3 * input.b; + // struct LSTMOutput result = { 0, std::vector(Jcols) }; - { - struct timeval start, end; - gettimeofday(&start, NULL); - calculate_jacobian(input, result); - gettimeofday(&end, NULL); - printf("Adept combined %0.6f\n", tdiff(&start, &end)); - json adept; - adept["name"] = "Adept combined"; - adept["runtime"] = tdiff(&start, &end); - for (unsigned i = result.gradient.size() - 5; - i < result.gradient.size(); i++) { - printf("%f ", result.gradient[i]); - adept["result"].push_back(result.gradient[i]); - } - test_suite["tools"].push_back(adept); - printf("\n"); - } + //{ + // struct timeval start, end; + // gettimeofday(&start, NULL); + // calculate_jacobian(input, result); + // gettimeofday(&end, NULL); + // printf("Adept combined %0.6f\n", tdiff(&start, &end)); + // json adept; + // adept["name"] = "Adept combined"; + // adept["runtime"] = tdiff(&start, &end); + // for (unsigned i = result.gradient.size() - 5; + // i < result.gradient.size(); i++) { + // printf("%f ", result.gradient[i]); + // adept["result"].push_back(result.gradient[i]); + // } + // test_suite["tools"].push_back(adept); + // printf("\n"); + //} - } + //} { @@ -340,7 +372,7 @@ int main(const int argc, const char* argv[]) { { struct timeval start, end; gettimeofday(&start, NULL); - calculate_jacobian(input, result); + calculate_jacobian(input, result); gettimeofday(&end, NULL); printf("Enzyme mayalias combined %0.6f\n", 
tdiff(&start, &end)); json enzyme; @@ -438,6 +470,72 @@ int main(const int argc, const char* argv[]) { int Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = {0, std::vector(Jcols)}; + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_mayalias_primal(input); + gettimeofday(&end, NULL); + printf("C++ mayalias primal %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "C++ mayalias primal"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + test_suite["tools"].push_back(enzyme); + + printf("\n"); + } + } + { + + struct LSTMInput input = {}; + + // Read instance + read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, + input.main_params, input.extra_params, input.state, + input.sequence); + + std::vector state = std::vector(input.state.size()); + + int Jcols = 8 * input.l * input.b + 3 * input.b; + struct LSTMOutput result = {0, std::vector(Jcols)}; + + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_restrict_primal(input); + gettimeofday(&end, NULL); + printf("C++ restrict primal %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "C++ restrict primal"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + test_suite["tools"].push_back(enzyme); + + printf("\n"); + } + } + { + + struct LSTMInput input = {}; + + // Read instance + read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, + input.main_params, input.extra_params, input.state, + input.sequence); + + std::vector state = std::vector(input.state.size()); + + int Jcols = 8 * input.l * input.b + 3 * input.b; + struct LSTMOutput result = {0, std::vector(Jcols)}; + { 
struct timeval start, end; gettimeofday(&start, NULL); diff --git a/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp b/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp index e643efd738a3..ade0b2237510 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp +++ b/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp @@ -113,12 +113,12 @@ void lstm_predict_restrict(int l, int b, double const *__restrict w, } // LSTM objective (loss function) -void lstm_objective_restrict(int l, int c, int b, - double const *__restrict main_params, - double const *__restrict extra_params, - double *__restrict state, - double const *__restrict sequence, - double *__restrict loss) { +void cxx_restrict_lstm_objective(int l, int c, int b, + double const *__restrict main_params, + double const *__restrict extra_params, + double *__restrict state, + double const *__restrict sequence, + double *__restrict loss) { int i, t; double total = 0.0; int count = 0; @@ -167,11 +167,11 @@ void dlstm_objective_restrict(int l, int c, int b, double const *main_params, double *dextra_params, double *state, double const *sequence, double *loss, double *dloss) { - __enzyme_autodiff(lstm_objective_restrict, enzyme_const, l, enzyme_const, c, - enzyme_const, b, enzyme_dup, main_params, dmain_params, - enzyme_dup, extra_params, dextra_params, enzyme_const, - state, enzyme_const, sequence, enzyme_dupnoneed, loss, - dloss); + __enzyme_autodiff(cxx_restrict_lstm_objective, enzyme_const, l, + enzyme_const, c, enzyme_const, b, enzyme_dup, main_params, + dmain_params, enzyme_dup, extra_params, dextra_params, + enzyme_const, state, enzyme_const, sequence, + enzyme_dupnoneed, loss, dloss); } } diff --git a/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h b/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h index d2bbdb631224..06401ff35a66 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h +++ b/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h @@ -104,9 +104,9 @@ void lstm_predict(int l, int b, double const *w, double 
const *w2, double *s, } // LSTM objective (loss function) -void lstm_objective(int l, int c, int b, double const *main_params, - double const *extra_params, double *state, - double const *sequence, double *loss) { +void cxx_mayalias_lstm_objective(int l, int c, int b, double const *main_params, + double const *extra_params, double *state, + double const *sequence, double *loss) { int i, t; double total = 0.0; int count = 0; @@ -146,30 +146,15 @@ void __enzyme_autodiff(...) noexcept; // * tapenade -b -o lstm_tapenade -head "lstm_objective(loss)/(main_params extra_params)" lstm.c -void dlstm_objective( - int l, - int c, - int b, - double const* main_params, - double* dmain_params, - double const* extra_params, - double* dextra_params, - double* state, - double const* sequence, - double* loss, - double* dloss -) -{ - __enzyme_autodiff(lstm_objective, - enzyme_const, l, - enzyme_const, c, - enzyme_const, b, - enzyme_dup, main_params, dmain_params, - enzyme_dup, extra_params, dextra_params, - enzyme_const, state, - enzyme_const, sequence, - enzyme_dupnoneed, loss, dloss - ); +void dlstm_objective_mayalias(int l, int c, int b, double const *main_params, + double *dmain_params, double const *extra_params, + double *dextra_params, double *state, + double const *sequence, double *loss, + double *dloss) { + __enzyme_autodiff(cxx_mayalias_lstm_objective, enzyme_const, l, enzyme_const, + c, enzyme_const, b, enzyme_dup, main_params, dmain_params, + enzyme_dup, extra_params, dextra_params, enzyme_const, + state, enzyme_const, sequence, enzyme_dupnoneed, loss, + dloss); } - } From 7d70dc555a44c024736fd3ed7dd17aabf2261cb8 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 18 Sep 2024 22:21:16 -0400 Subject: [PATCH 51/88] fix ode-real example, correct results, faster than c++, without dupnoneed --- .../ReverseMode/ode-real/Cargo.toml | 1 + .../ReverseMode/ode-real/Makefile.make | 36 ++-- .../benchmarks/ReverseMode/ode-real/ode.cpp | 199 ++++-------------- 
.../ReverseMode/ode-real/src/lib.rs | 158 +++++++------- 4 files changed, 139 insertions(+), 255 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml b/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml index 27a031a49570..880d7aca1567 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml @@ -13,6 +13,7 @@ crate-type = ["lib"] [profile.release] lto = "fat" opt-level = 3 +panic = 'abort' #debug = true #strip = "none" diff --git a/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make b/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make index 16033d158a3a..083ef1176feb 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make @@ -1,4 +1,4 @@ -# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadEnzyme %enzyme" make -B ode-raw.ll ode-opt.ll results.txt VERBOSE=1 -f %s +# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B ode-raw.ll ode-opt.ll results.txt VERBOSE=1 -f %s .PHONY: clean @@ -10,23 +10,31 @@ clean: $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a: src/lib.rs Cargo.toml cargo +enzyme rustc --release --lib --crate-type=staticlib -%-unopt.ll: %.cpp - clang++ $(BENCH) $^ -O2 -I /u/drehwald/prog/boost_1_81_0 -DBOOST_DIR=/u/drehwald/prog/boost_1_81_0 -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm - #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm +# %-unopt.ll: %.cpp +# clang++ $(BENCH) $^ -O2 -I /u/drehwald/prog/boost_1_81_0 -DBOOST_DIR=/u/drehwald/prog/boost_1_81_0 -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm +# #clang++ $(BENCH) $^ -O1 -Xclang 
-disable-llvm-passes -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm +# +# %-raw.ll: %-unopt.ll +# @echo $(LOAD) +# opt $^ $(LOAD) -o $@ -S +# +# %-opt.ll: %-raw.ll +# opt $^ -o $@ -S +# #opt $^ -O2 -o $@ -S -%-raw.ll: %-unopt.ll - @echo $(LOAD) - opt $^ $(LOAD) -o $@ -S +#ode.o: ode-opt.ll $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a +# clang++ -O2 $^ -o $@ $(BENCHLINK) -%-opt.ll: %-raw.ll - opt $^ -o $@ -S - #opt $^ -O2 -o $@ -S -ode.o: ode-opt.ll $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a - clang++ -O2 $^ -o $@ $(BENCHLINK) +ode.o: ode.cpp $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a + #/home/manuel/prog/llvm18/build/bin/clang++ $(LOAD) $(BENCH) ode.cpp -I /u/drehwald/prog/boost_1_81_0 -DBOOST_DIR=/u/drehwald/prog/boost_1_81_0 -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o ode.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a -L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ $(LOAD) $(BENCH) ode.cpp -I /u/drehwald/prog/boost_1_81_0 -DBOOST_DIR=/u/drehwald/prog/boost_1_81_0 -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o ode.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a -L /usr/lib/gcc/x86_64-linux-gnu/11 -#ode.o: ode-opt.ll $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a -# clang++ $(LOAD) $(BENCH) ode.cpp -I /u/drehwald/prog/boost_1_81_0 -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o ode.o -lpthread $(BENCHLINK) -lm -lode -L $(dir)/benchmarks/ReverseMode/ode/target/release/ -L /usr/lib/gcc/x86_64-linux-gnu/11 +#fft.o: fft.cpp $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a +# clang++ $(LOAD) $(BENCH) fft.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o fft.o -lpthread $(BENCHLINK) -lm -lfft -L $(dir)/benchmarks/ReverseMode/fft/target/release/ -L 
/usr/lib/gcc/x86_64-linux-gnu/11 + +#gmm.o: gmm.cpp $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a +# clang++ $(LOAD) $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o gmm.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.txt: ode.o diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp b/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp index 7c7113df9641..17e677abc65d 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp +++ b/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp @@ -109,14 +109,27 @@ void brusselator_2d_loop(double* __restrict du, double* __restrict dv, const dou typedef boost::array< double , 2 * N * N > state_type; -void lorenz( const state_type &x , state_type &dxdt , double t ) + +void lorenz( const state_type &x, state_type &dxdt, double t ) { // Extract the parameters double p[3] = { /*A*/ 3.4, /*B*/ 1, /*alpha*/10. }; brusselator_2d_loop(dxdt.c_array(), dxdt.c_array() + N * N, x.data(), x.data() + N * N, p, t); } -// init_brusselator(x.c_array(), x.c_array() + N*N) +extern "C" void rust_lorenz(const double* x, double* dxdt, double t); +extern "C" void rust_dbrusselator_2d_loop(const double* x, double* dx, double* adjoint, const double* p, double* dp, double t); + +double rustfoobar(const double *p, /*const*/ state_type x, const state_type adjoint, double t) { + double dp[3] = { 0. }; + + state_type dx = { 0. }; + + state_type dadjoint_inp = adjoint; + + rust_dbrusselator_2d_loop(dadjoint_inp.c_array(), x.c_array(), dx.c_array(), p, dp, t); + return dx[0]; +} double foobar(const double* p, const state_type x, const state_type adjoint, double t) { double dp[3] = { 0. 
}; @@ -128,10 +141,10 @@ double foobar(const double* p, const state_type x, const state_type adjoint, dou state_type dxdu; __enzyme_autodiff(brusselator_2d_loop, -// enzyme_dup, dxdu.c_array(), dadjoint_inp.c_array(), -// enzyme_dup, dxdu.c_array() + N * N, dadjoint_inp.c_array() + N * N, - enzyme_dupnoneed, nullptr, dadjoint_inp.data(), - enzyme_dupnoneed, nullptr, dadjoint_inp.data() + N * N, + enzyme_dup, dxdu.c_array(), dadjoint_inp.c_array(), + enzyme_dup, dxdu.c_array() + N * N, dadjoint_inp.c_array() + N * N, + // enzyme_dupnoneed, nullptr, dadjoint_inp.data(), + // enzyme_dupnoneed, nullptr, dadjoint_inp.data() + N * N, enzyme_dup, x.data(), dx.data(), enzyme_dup, x.data() + N * N, dx.data() + N * N, enzyme_dup, p, dp, @@ -545,171 +558,47 @@ int main(int argc, char** argv) { res = foobar(p, x, adjoint, t); gettimeofday(&end, NULL); - printf("Enzyme combined %0.6f res=%f\n", tdiff(&start, &end), res); - } - //printf("res=%f\n", foobar(1000)); -} - - -#if 0 - -typedef boost::array< double , 6 > state_type; - -void lorenz( const state_type &x , state_type &dxdt , double t ) -{ - // Extract the parameters - double k1 = x[3]; - double k2 = x[4]; - double k3 = x[5]; - - dxdt[0] = -k1 * x[0] + k3 * x[1] * x[2]; - dxdt[1] = k1 * x[0] - k2 * x[1] * x[1] - k3 * x[1] * x[2]; - dxdt[2] = k2 * x[1] * x[1]; - - // Don't change the parameters p - dxdt[3] = 0; - dxdt[4] = 0; - dxdt[5] = 0; -} - -double foobar(double* p, uint64_t iters) { - state_type x = { 1.0, 0, 0, p[0], p[1], p[2] }; // initial conditions - double t = 1e5; - typedef controlled_runge_kutta< runge_kutta_dopri5< state_type , typename state_type::value_type , state_type , double > > stepper_type; - //typedef euler< state_type , typename state_type::value_type , state_type , double > stepper_type; - integrate_const( stepper_type(), lorenz , x , 0.0 , t, t/iters ); - - return x[0]; -} - -typedef boost::array< adouble , 6 > astate_type; - -void alorenz( const astate_type &x , astate_type &dxdt , adouble t ) -{ 
- // Extract the parameters - adouble k1 = x[3]; - adouble k2 = x[4]; - adouble k3 = x[5]; - - dxdt[0] = -k1 * x[0] + k3 * x[1] * x[2]; - dxdt[1] = k1 * x[0] - k2 * x[1] * x[1] - k3 * x[1] * x[2]; - dxdt[2] = k2 * x[1] * x[1]; - - // Don't change the parameters p - dxdt[3] = 0; - dxdt[4] = 0; - dxdt[5] = 0; -} - -adouble afoobar(adouble* p, uint64_t iters) { - astate_type x = { 1.0, 0, 0, p[0], p[1], p[2] }; // initial conditions - double t = 1e5; - typedef controlled_runge_kutta< runge_kutta_dopri5< astate_type , typename astate_type::value_type , astate_type , adouble > > stepper_type; - //typedef euler< astate_type , typename astate_type::value_type , astate_type , adouble > stepper_type; - integrate_const( stepper_type(), alorenz , x , 0.0 , t, t/iters ); - - return x[0]; -} - -static -double afoobar_and_gradient(double* p_in, double* dp_out, uint64_t iters) { - adept::Stack stack; - adouble x[3] = { p_in[0], p_in[1], p_in[2] }; - stack.new_recording(); - adouble y = afoobar(x, iters); - y.set_gradient(1.0); - stack.compute_adjoint(); - for(int i=0; i<3; i++) - dp_out[i] = x[i].get_gradient(); - return y.value(); -} - -static void adept_sincos(uint64_t iters) { - { - struct timeval start, end; - gettimeofday(&start, NULL); - - double p[3] = { 0.04,3e7,1e4 }; - double res = foobar(p, iters); - - gettimeofday(&end, NULL); - printf("Adept real %0.6f res=%f\n", tdiff(&start, &end), res); - } - - { - struct timeval start, end; - gettimeofday(&start, NULL); - - adept::Stack stack; - adouble p[3] = { 0.04,3e7,1e4 }; - // stack.new_recording(); - adouble resa = afoobar(p, iters); - double res = resa.value(); - - gettimeofday(&end, NULL); - printf("Adept forward %0.6f res=%f\n", tdiff(&start, &end), res); - } - - { - struct timeval start, end; - gettimeofday(&start, NULL); - - double p[3] = { 0.04,3e7,1e4 }; - double dp[3] = { 0 }; - afoobar_and_gradient(p, dp, iters); - - gettimeofday(&end, NULL); - printf("Adept combined %0.6f res'=%f\n", tdiff(&start, &end), dp[0]); 
+ printf("C++ Enzyme combined %0.6f res=%f\n", tdiff(&start, &end), res); } -} - -static void enzyme_sincos(double inp, uint64_t iters) { - + { - struct timeval start, end; - gettimeofday(&start, NULL); + struct timeval start, end; + gettimeofday(&start, NULL); - double p[3] = { 0.04,3e7,1e4 }; - double res = foobar(p, iters); + double res; + for(int i=0; i<10000; i++) + res = rustfoobar(p, x, adjoint, t); - gettimeofday(&end, NULL); - printf("Enzyme real %0.6f res=%f\n", tdiff(&start, &end), res); + gettimeofday(&end, NULL); + printf("Rust Enzyme combined %0.6f res=%f\n", tdiff(&start, &end), res); } { - struct timeval start, end; - gettimeofday(&start, NULL); + struct timeval start, end; + gettimeofday(&start, NULL); + state_type x2; - double p[3] = { 0.04,3e7,1e4 }; - double res = foobar(p, iters); + for(int i=0; i<10000; i++) { + lorenz(x, x2, t); + } - gettimeofday(&end, NULL); - printf("Enzyme forward %0.6f res=%f\n", tdiff(&start, &end), res); + gettimeofday(&end, NULL); + printf("C++ fwd %0.6f res=%f\n", tdiff(&start, &end), x2[0]); } { - struct timeval start, end; - gettimeofday(&start, NULL); + struct timeval start, end; + gettimeofday(&start, NULL); + state_type x2; - double p[3] = { 0.04,3e7,1e4 }; - double dp[3] = { 0 }; - __enzyme_autodiff(foobar, p, dp, iters); + for(int i=0; i<10000; i++) + rust_lorenz(x.c_array(), x2.c_array(), t); - gettimeofday(&end, NULL); - printf("Enzyme combined %0.6f res'=%f\n", tdiff(&start, &end), dp[0]); + gettimeofday(&end, NULL); + printf("Rust fwd %0.6f res=%f\n\n", tdiff(&start, &end), x2[0]); } -} -int main(int argc, char** argv) { - int max_iters = atoi(argv[1]) ; - double inp = 2.1; - //for(int iters=max_iters/20; iters<=max_iters; iters+=max_iters/20) { - auto iters = max_iters; - printf("iters=%d\n", iters); - adept_sincos(inp, iters); - enzyme_sincos(inp, iters); - //} + //printf("res=%f\n", foobar(1000)); } -#endif diff --git a/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs 
b/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs index 23995eaa5626..bd27f7930d9c 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs @@ -1,5 +1,4 @@ #![feature(autodiff)] -#![feature(slice_first_last_chunk)] #![feature(slice_as_chunks)] #![feature(iter_next_chunk)] #![allow(non_snake_case)] @@ -26,12 +25,6 @@ fn get(x: &[f64], i: usize, j: usize) -> f64 { x[N * i + j] } -//#define RANGE(min, max, i, N) ((max-min)/(N-1)*i + min) -//#define GETnb(x, i, j) (x)[N*i+j] -//#define GET(x, i, j) GETnb(x, i, j) -// #define GET(x, i, j) ({ assert(i >=0); assert( j>=0); assert(j f64 { let eq1 = (x - 0.3) * (x - 0.3) + (y - 0.6) * (y - 0.6) <= 0.1 * 0.1; let eq2 = t >= 1.1; @@ -43,26 +36,21 @@ fn brusselator_f(x: f64, y: f64, t: f64) -> f64 { } fn init_brusselator(u: &mut [f64], v: &mut [f64]) { + assert!(u.len() == N * N); + assert!(v.len() == N * N); for i in 0..N { for j in 0..N { let x = range(xmin, xmax, i, N); let y = range(ymin, ymax, j, N); - u[N * i + j] = 22.0 * y * (1.0 - y) * (y * (1.0 - y)).sqrt(); - v[N * i + j] = 27.0 * x * (1.0 - x) * (x * (1.0 - x)).sqrt(); + u[N * i + j] = 22.0 * (y * (1.0 - y)) * (y * (1.0 - y)).sqrt(); + v[N * i + j] = 27.0 * (x * (1.0 - x)) * (x * (1.0 - x)).sqrt(); } } } -// __enzyme_autodiff(brusselator_2d_loop, -// enzyme_dupnoneed, nullptr, dadjoint_inp.data(), -// enzyme_dupnoneed, nullptr, dadjoint_inp.data() + N * N, -// enzyme_dup, x.data(), dx.data(), -// enzyme_dup, x.data() + N * N, dx.data() + N * N, -// enzyme_dup, p, dp, -// enzyme_const, t); - +#[no_mangle] #[autodiff(dbrusselator_2d_loop, Reverse, Duplicated, Duplicated, Duplicated, Duplicated, Duplicated, Const)] -fn brusselator_2d_loop(d_u: &mut [f64], d_v: &mut [f64], u: &[f64], v: &[f64], p: &[f64;3], t: f64) { +fn brusselator_2d_loop(d_u: &mut [f64;N*N], d_v: &mut [f64;N*N], u: &[f64;N*N], v: &[f64;N*N], p: &[f64;3], t: f64) { let A = p[0]; let B = p[1]; let alpha = p[2]; @@ -85,96 +73,94 @@ fn 
brusselator_2d_loop(d_u: &mut [f64], d_v: &mut [f64], u: &[f64], v: &[f64], p } } -type state_type = [f64; 2 * N * N]; +type StateType = [f64; 2 * N * N]; + +#[no_mangle] +pub extern "C" fn rust_lorenz(x: *const StateType, dxdt: *mut StateType, t: f64) { + let x: &StateType = unsafe { &*x }; + let dxdt: &mut StateType = unsafe { &mut *dxdt }; + lorenz(x, dxdt, t); +} -fn lorenz(x: &state_type, dxdt: &mut state_type, t: f64) { +fn lorenz(x: &StateType, dxdt: &mut StateType, t: f64) { let p = [3.4, 1., 10.]; let (tmp1, tmp2) = dxdt.split_at_mut(N * N); let mut dxdt1: [f64; N * N] = tmp1.try_into().unwrap(); let mut dxdt2: [f64; N * N] = tmp2.try_into().unwrap(); - brusselator_2d_loop(&mut dxdt1, &mut dxdt2, &x[..], &x[N * N..], &p, t); + let (tmp1, tmp2) = x.split_at(N * N); + let u: [f64; N * N] = tmp1.try_into().unwrap(); + let v: [f64; N * N] = tmp2.try_into().unwrap(); + brusselator_2d_loop(&mut dxdt1, &mut dxdt2, &u, &v, &p, t); } #[no_mangle] -pub extern "C" fn rust_dbrusselator_2d_loop(p: *const f64, dp: *mut f64, x: *const state_type, dx: *mut state_type, adjoint: *mut state_type, t: f64) -> f64 { - let x = unsafe { *x }; - let mut adjoint = unsafe { *adjoint }; - let p: [f64;3] = unsafe { *p.cast::<[f64;3]>().as_ref().unwrap() }; - let mut dp: [f64;3] = unsafe { dp.cast::<[f64;3]>().as_mut().unwrap() }; - - let (mut dx1, mut dx2) = dx.split_at_mut(N * N); - //let mut dp = [0.; 3]; - //let mut dx1 = [0.; N * N]; - //let mut dx2 = [0.; N * N]; - let (mut dadj1, mut dadj2) = adjoint.split_at_mut(N * N); +pub extern "C" fn rust_dbrusselator_2d_loop(adjoint: *mut StateType, x: *const StateType, dx: *mut StateType, p: *const [f64;3], dp: *mut [f64;3], t: f64) { + let x: &StateType = unsafe { &*x }; + let dx: &mut StateType = unsafe { &mut *dx }; + let adjoint: &mut StateType = unsafe { &mut *adjoint }; + + let p: &[f64;3] = unsafe { &*p }; + let dp: &mut [f64;3] = unsafe { &mut *dp }; + + //assert!(p[0] == 3.4); + //assert!(p[1] == 1.); + //assert!(p[2] == 10.); + 
//assert!(t == 2.1); + + //let mut x1 = [0.; 2 * N * N]; + //let mut dx1 = [0.; 2 *N * N]; + //let (tmp1, tmp2) = x1.split_at_mut(N * N); + //let mut x1: [f64; N * N] = tmp1.try_into().unwrap(); + //let mut x2: [f64; N * N] = tmp2.try_into().unwrap(); + //init_brusselator(&mut x1, &mut x2); + //for i in 0..N*N { + // let tmp = (x1[i] - x[i]).abs(); + // if (tmp / x[i] > 1e-5) { + // dbg!(tmp); + // dbg!(tmp / x[i]); + // dbg!(i); + // dbg!(x1[i]); + // dbg!(x[i]); + // println!("x1[{}] = {} != x[{}] = {}", i, x1[i], i, x[i]); + // panic!(); + // } + //} + + // Alternative ways to split the inputs + //let [ mut dx1, mut dx2]: [[f64; N*N]; 2] = unsafe { *std::mem::transmute::<*mut StateType, &mut [[f64; N*N]; 2]>(dx) }; + //let [dx1, dx2]: &mut [[f64; N*N];2] = unsafe { dx.cast::<[[f64; N*N]; 2]>().as_mut().unwrap() }; // https://discord.com/channels/273534239310479360/273541522815713281/1236945105601040446 + let ([dx1, dx2], []): (&mut [[f64; N*N]], &mut [f64]) = dx.as_chunks_mut() else { unreachable!() }; + let ([dadj1, dadj2], []): (&mut [[f64; N*N]], &mut [f64])= adjoint.as_chunks_mut() else { unreachable!() }; let ([x1, x2], []): (&[[f64; N*N]], &[f64])= x.as_chunks() else { unreachable!() }; - let mut null1 = [0.; 2 * N * N]; - let mut null2 = [0.; 2 * N * N]; - dbrusselator_2d_loop(&mut null1, &mut dadj1, - &mut null2, &mut dadj2, - x1, &mut dx1, - x2, &mut dx2, - &p, &mut dp, t); - dx1[0] - //brusselator_2d_loop_b(nullptr, dadjoint_inp.data(), - // nullptr, dadjoint_inp.data() + N * N, - // x.data(), dx.data(), - // x.data() + N * N, dx.data() + N * N, - // p, dp, - // t); + let mut null1 = [0.; 1 * N * N]; + let mut null2 = [0.; 1 * N * N]; + dbrusselator_2d_loop(&mut null1, dadj1, + &mut null2, dadj2, + x1, dx1, + x2, dx2, + p, dp, t); + return; } -fn foobar(p: &[f64;3], x: state_type, mut adjoint: state_type, t: f64) -> f64 { +fn foobar(p: &[f64;3], x: StateType, mut adjoint: StateType, t: f64) -> f64 { let mut dp = [0.; 3]; let mut dx1 = [0.; N * N]; let 
mut dx2 = [0.; N * N]; - let (mut dadj1, mut dadj2) = adjoint.split_at_mut(N * N); - let mut null1 = [0.; 2 * N * N]; - let mut null2 = [0.; 2 * N * N]; + // https://discord.com/channels/273534239310479360/273541522815713281/1236945105601040446 + let ([dadj1, dadj2], []): (&mut [[f64; N*N]], &mut [f64])= adjoint.as_chunks_mut() else { unreachable!() }; + //let (mut dadj1, mut dadj2) = adjoint.split_at_mut(N * N); + let mut null1 = [0.; 1 * N * N]; + let mut null2 = [0.; 1 * N * N]; // https://discord.com/channels/273534239310479360/273541522815713281/1236945105601040446 let ([x1, x2], []): (&[[f64; N*N]], &[f64])= x.as_chunks() else { unreachable!() }; - dbrusselator_2d_loop(&mut null1, &mut dadj1, - &mut null2, &mut dadj2, + dbrusselator_2d_loop(&mut null1, dadj1, + &mut null2, dadj2, x1, &mut dx1, x2, &mut dx2, &p, &mut dp, t); dx1[0] } - -//double foobar(const double* p, const state_type x, const state_type adjoint, double t) { -// double dp[3] = { 0. }; -// -// state_type dx = { 0. }; -// -// state_type dadjoint_inp = adjoint; -// -// state_type dxdu; -// -// __enzyme_autodiff(brusselator_2d_loop, -// enzyme_dupnoneed, nullptr, dadjoint_inp.data(), -// enzyme_dupnoneed, nullptr, dadjoint_inp.data() + N * N, -// enzyme_dup, x.data(), dx.data(), -// enzyme_dup, x.data() + N * N, dx.data() + N * N, -// enzyme_dup, p, dp, -// enzyme_const, t); -// -// return dx[0]; -//} - -fn main() { - let p = [3.4, 1., 10.]; - let mut x = [0.; 2 * N * N]; - let mut adjoint = [0.; 2 * N * N]; - init_brusselator(&mut x, &mut adjoint); - let t = 2.1; - let mut res = 0.; - let time = std::time::Instant::now(); - for _ in 0..10000 { - res = foobar(&p, x, adjoint, t); - } - println!("Enzyme combined {} res={}", time.elapsed().as_secs_f64(), res); -} From fdb0d0977dc2c32b0467e87bfdd6e3d9a825d874 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 19 Sep 2024 00:43:19 -0400 Subject: [PATCH 52/88] remove boost leftovers --- .../benchmarks/ReverseMode/ode-real/ode.cpp | 66 
+++++++++---------- .../ReverseMode/ode-real/src/lib.rs | 35 ++-------- 2 files changed, 35 insertions(+), 66 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp b/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp index 17e677abc65d..c0c5064b833d 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp +++ b/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp @@ -24,20 +24,8 @@ float tdiff(struct timeval *start, struct timeval *end) { return (end->tv_sec-start->tv_sec) + 1e-6*(end->tv_usec-start->tv_usec); } -#define BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS -#define BOOST_NO_EXCEPTIONS #include -#include - -#include - -#include -void boost::throw_exception(std::exception const & e){ - //do nothing -} - using namespace std; -using namespace boost::numeric::odeint; #define N 32 #define xmin 0. @@ -107,27 +95,29 @@ void brusselator_2d_loop(double* __restrict du, double* __restrict dv, const dou } } -typedef boost::array< double , 2 * N * N > state_type; - +typedef double state_type[2*N*N]; void lorenz( const state_type &x, state_type &dxdt, double t ) { // Extract the parameters double p[3] = { /*A*/ 3.4, /*B*/ 1, /*alpha*/10. }; - brusselator_2d_loop(dxdt.c_array(), dxdt.c_array() + N * N, x.data(), x.data() + N * N, p, t); + brusselator_2d_loop(dxdt, dxdt + N * N, x, x + N * N, p, t); } extern "C" void rust_lorenz(const double* x, double* dxdt, double t); -extern "C" void rust_dbrusselator_2d_loop(const double* x, double* dx, double* adjoint, const double* p, double* dp, double t); +extern "C" void rust_dbrusselator_2d_loop(double* adjoint, const double* x, double* dx, const double* p, double* dp, double t); -double rustfoobar(const double *p, /*const*/ state_type x, const state_type adjoint, double t) { +double rustfoobar(const double *p, const state_type x, const state_type adjoint, double t) { double dp[3] = { 0. }; state_type dx = { 0. 
}; - state_type dadjoint_inp = adjoint; + state_type dadjoint_inp;// = adjoint + for (int i = 0; i < N * N; i++) { + dadjoint_inp[i] = adjoint[i]; + } - rust_dbrusselator_2d_loop(dadjoint_inp.c_array(), x.c_array(), dx.c_array(), p, dp, t); + rust_dbrusselator_2d_loop(dadjoint_inp, x, dx, p, dp, t); return dx[0]; } @@ -136,17 +126,20 @@ double foobar(const double* p, const state_type x, const state_type adjoint, dou state_type dx = { 0. }; - state_type dadjoint_inp = adjoint; + state_type dadjoint_inp;// = adjoint + for (int i = 0; i < N * N; i++) { + dadjoint_inp[i] = adjoint[i]; + } state_type dxdu; __enzyme_autodiff(brusselator_2d_loop, - enzyme_dup, dxdu.c_array(), dadjoint_inp.c_array(), - enzyme_dup, dxdu.c_array() + N * N, dadjoint_inp.c_array() + N * N, - // enzyme_dupnoneed, nullptr, dadjoint_inp.data(), - // enzyme_dupnoneed, nullptr, dadjoint_inp.data() + N * N, - enzyme_dup, x.data(), dx.data(), - enzyme_dup, x.data() + N * N, dx.data() + N * N, + enzyme_dup, dxdu, dadjoint_inp, + enzyme_dup, dxdu + N * N, dadjoint_inp + N * N, + // enzyme_dupnoneed, nullptr, dadjoint_inp, + // enzyme_dupnoneed, nullptr, dadjoint_inp + N * N, + enzyme_dup, x, dx, + enzyme_dup, x + N * N, dx + N * N, enzyme_dup, p, dp, enzyme_const, t); @@ -499,14 +492,17 @@ double tfoobar(const double* p, const state_type x, const state_type adjoint, do state_type dx = { 0. }; - state_type dadjoint_inp = adjoint; + state_type dadjoint_inp;// = adjoint + for (int i = 0; i < N * N; i++) { + dadjoint_inp[i] = adjoint[i]; + } state_type dxdu; - brusselator_2d_loop_b(nullptr, dadjoint_inp.data(), - nullptr, dadjoint_inp.data() + N * N, - x.data(), dx.data(), - x.data() + N * N, dx.data() + N * N, + brusselator_2d_loop_b(nullptr, dadjoint_inp, + nullptr, dadjoint_inp + N * N, + x, dx, + x + N * N, dx + N * N, p, dp, t); @@ -518,10 +514,10 @@ int main(int argc, char** argv) { const double p[3] = { /*A*/ 3.4, /*B*/ 1, /*alpha*/10. 
}; state_type x; - init_brusselator(x.data(), x.data() + N * N); + init_brusselator(x, x + N * N); state_type adjoint; - init_brusselator(adjoint.data(), adjoint.data() + N * N); + init_brusselator(adjoint, adjoint + N * N); double t = 2.1; @@ -592,13 +588,11 @@ int main(int argc, char** argv) { state_type x2; for(int i=0; i<10000; i++) - rust_lorenz(x.c_array(), x2.c_array(), t); + rust_lorenz(x, x2, t); gettimeofday(&end, NULL); printf("Rust fwd %0.6f res=%f\n\n", tdiff(&start, &end), x2[0]); } - - //printf("res=%f\n", foobar(1000)); } diff --git a/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs b/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs index bd27f7930d9c..2347ca8e0f80 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs @@ -18,12 +18,6 @@ const ymax: f64 = 1.; fn range(min: f64, max: f64, i: usize, N_var: usize) -> f64 { (max - min) / (N_var as f64 - 1.) * i as f64 + min } -#[inline(always)] -fn get(x: &[f64], i: usize, j: usize) -> f64 { - assert!(i > 0); - assert!(j < N); - x[N * i + j] -} fn brusselator_f(x: f64, y: f64, t: f64) -> f64 { let eq1 = (x - 0.3) * (x - 0.3) + (y - 0.6) * (y - 0.6) <= 0.1 * 0.1; @@ -35,6 +29,7 @@ fn brusselator_f(x: f64, y: f64, t: f64) -> f64 { } } +#[expect(unused)] fn init_brusselator(u: &mut [f64], v: &mut [f64]) { assert!(u.len() == N * N); assert!(v.len() == N * N); @@ -102,10 +97,10 @@ pub extern "C" fn rust_dbrusselator_2d_loop(adjoint: *mut StateType, x: *const S let p: &[f64;3] = unsafe { &*p }; let dp: &mut [f64;3] = unsafe { &mut *dp }; - //assert!(p[0] == 3.4); - //assert!(p[1] == 1.); - //assert!(p[2] == 10.); - //assert!(t == 2.1); + assert!(p[0] == 3.4); + assert!(p[1] == 1.); + assert!(p[2] == 10.); + assert!(t == 2.1); //let mut x1 = [0.; 2 * N * N]; //let mut dx1 = [0.; 2 *N * N]; @@ -144,23 +139,3 @@ pub extern "C" fn rust_dbrusselator_2d_loop(adjoint: *mut StateType, x: *const S p, dp, t); return; } - - -fn foobar(p: &[f64;3], x: 
StateType, mut adjoint: StateType, t: f64) -> f64 { - let mut dp = [0.; 3]; - let mut dx1 = [0.; N * N]; - let mut dx2 = [0.; N * N]; - // https://discord.com/channels/273534239310479360/273541522815713281/1236945105601040446 - let ([dadj1, dadj2], []): (&mut [[f64; N*N]], &mut [f64])= adjoint.as_chunks_mut() else { unreachable!() }; - //let (mut dadj1, mut dadj2) = adjoint.split_at_mut(N * N); - let mut null1 = [0.; 1 * N * N]; - let mut null2 = [0.; 1 * N * N]; - // https://discord.com/channels/273534239310479360/273541522815713281/1236945105601040446 - let ([x1, x2], []): (&[[f64; N*N]], &[f64])= x.as_chunks() else { unreachable!() }; - dbrusselator_2d_loop(&mut null1, dadj1, - &mut null2, dadj2, - x1, &mut dx1, - x2, &mut dx2, - &p, &mut dp, t); - dx1[0] -} From 747e4f9b2459c5ad646ff342b386986daf136e40 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Tue, 1 Oct 2024 17:21:52 -0400 Subject: [PATCH 53/88] fix makefiles to use new rustflags --- enzyme/benchmarks/ReverseMode/ba/Makefile.make | 2 +- enzyme/benchmarks/ReverseMode/gmm/Makefile.make | 4 ++-- enzyme/benchmarks/ReverseMode/lstm/Makefile.make | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/ba/Makefile.make b/enzyme/benchmarks/ReverseMode/ba/Makefile.make index 8a13a0e524fb..1e4ed61859a7 100644 --- a/enzyme/benchmarks/ReverseMode/ba/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/ba/Makefile.make @@ -8,7 +8,7 @@ clean: rm -f *.ll *.o results.txt results.json $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a: src/lib.rs Cargo.toml - ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm + RUSTFLAGS="-Z autodiff=LooseTypes" cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm ba.o: ba.cpp $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a clang++ $(LOAD) $(BENCH) ba.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o ba.o -lpthread $(BENCHLINK) -lm 
$(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a -L /usr/lib/gcc/x86_64-linux-gnu/11 diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index e3c15f4dcc11..7eff2a950c86 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -1,4 +1,4 @@ -# RUN: if [ %llvmver -ge 12 ] || [ %llvmver -le 9 ]; then cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B gmm.o results.json -f %s; fi +# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B gmm.o results.json VERBOSE=1 -f %s .PHONY: clean @@ -8,7 +8,7 @@ clean: rm -f *.ll *.o results.txt results.json $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a: src/lib.rs Cargo.toml - ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm + RUSTFLAGS="-Z autodiff=LooseTypes" cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm gmm.o: gmm.cpp $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a clang++ $(LOAD) $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o gmm.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 diff --git a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make index 23ba9a51ceff..0d21100d7567 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make @@ -8,7 +8,7 @@ clean: rm -f *.ll *.o results.json $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a: src/lib.rs Cargo.toml - ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib + RUSTFLAGS="-Z autodiff=LooseTypes" cargo +enzyme rustc --release --lib --crate-type=staticlib 
%-unopt.ll: %.cpp clang++ $(BENCH) $^ -pthread -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm From 05d3e8776487c3d6fa0948d789de361d3384da8f Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Tue, 1 Oct 2024 22:18:43 -0400 Subject: [PATCH 54/88] add tanh support for llvm19+ --- enzyme/Enzyme/FunctionUtils.cpp | 5 ++++ enzyme/Enzyme/GradientUtils.cpp | 5 ++++ enzyme/Enzyme/InstructionDerivatives.td | 12 ++++----- enzyme/Enzyme/TypeAnalysis/TypeAnalysis.cpp | 11 ++++++++ enzyme/Enzyme/Utils.h | 5 ++++ enzyme/test/Enzyme/ReverseMode/tanh19.ll | 29 +++++++++++++++++++++ 6 files changed, 61 insertions(+), 6 deletions(-) create mode 100644 enzyme/test/Enzyme/ReverseMode/tanh19.ll diff --git a/enzyme/Enzyme/FunctionUtils.cpp b/enzyme/Enzyme/FunctionUtils.cpp index 384180656e10..033ccc56c226 100644 --- a/enzyme/Enzyme/FunctionUtils.cpp +++ b/enzyme/Enzyme/FunctionUtils.cpp @@ -2799,6 +2799,11 @@ bool guaranteedDataDependent(Value *z) { case Intrinsic::sqrt: case Intrinsic::sin: case Intrinsic::cos: +#if LLVM_VERSION_MAJOR >= 19 + case Intrinsic::sinh: + case Intrinsic::cosh: + case Intrinsic::tanh: +#endif return guaranteedDataDependent(II->getArgOperand(0)); default: break; diff --git a/enzyme/Enzyme/GradientUtils.cpp b/enzyme/Enzyme/GradientUtils.cpp index ac1a79491f3c..0e0cf5fe5e52 100644 --- a/enzyme/Enzyme/GradientUtils.cpp +++ b/enzyme/Enzyme/GradientUtils.cpp @@ -4177,6 +4177,11 @@ bool GradientUtils::shouldRecompute(const Value *val, case Intrinsic::sin: case Intrinsic::cos: case Intrinsic::exp: +#if LLVM_VERSION_MAJOR >= 19 + case Intrinsic::tanh: + case Intrinsic::cosh: + case Intrinsic::sinh: +#endif case Intrinsic::log: case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_p: diff --git a/enzyme/Enzyme/InstructionDerivatives.td b/enzyme/Enzyme/InstructionDerivatives.td index 9f765f88826f..fe1a97a67fd8 100644 --- a/enzyme/Enzyme/InstructionDerivatives.td +++ b/enzyme/Enzyme/InstructionDerivatives.td @@ 
-335,12 +335,6 @@ def : CallPattern<(Op $x, $y), [ReadNone, NoUnwind] >; -def : CallPattern<(Op $x), - ["tanh"], - [(FDiv (DiffeRet), (FMul(Call<(SameTypesFunc<"cosh">), [ReadNone,NoUnwind]> $x):$c, $c))], - (ForwardFromSummedReverse), - [ReadNone, NoUnwind] - >; def : CallPattern<(Op $x), ["tanhf"], [(FDiv (DiffeRet), (FMul(Call<(SameTypesFunc<"coshf">), [ReadNone,NoUnwind]> $x):$c, $c))], @@ -872,6 +866,12 @@ def : CallPattern<(Op (Op $x, $y):$z), [ReadNone, NoUnwind] >; +def : IntrPattern<(Op $x), + [["tanh"]], + [(FDiv (DiffeRet), (FMul(Intrinsic<"cosh"> $x):$c, $c))], + (ForwardFromSummedReverse) + >; + def : IntrPattern<(Op $x), [["sin"]], [(FMul (DiffeRet), (Intrinsic<"cos"> $x))] , diff --git a/enzyme/Enzyme/TypeAnalysis/TypeAnalysis.cpp b/enzyme/Enzyme/TypeAnalysis/TypeAnalysis.cpp index aed7767652f0..c59f712b0121 100644 --- a/enzyme/Enzyme/TypeAnalysis/TypeAnalysis.cpp +++ b/enzyme/Enzyme/TypeAnalysis/TypeAnalysis.cpp @@ -118,9 +118,15 @@ const llvm::StringMap LIBM_FUNCTIONS = { {"atan", Intrinsic::not_intrinsic}, {"atan2", Intrinsic::not_intrinsic}, {"__nv_atan2", Intrinsic::not_intrinsic}, +#if LLVM_VERSION_MAJOR >= 19 + {"cosh", Intrinsic::cosh}, + {"sinh", Intrinsic::sinh}, + {"tanh", Intrinsic::tanh}, +#else {"cosh", Intrinsic::not_intrinsic}, {"sinh", Intrinsic::not_intrinsic}, {"tanh", Intrinsic::not_intrinsic}, +#endif {"acosh", Intrinsic::not_intrinsic}, {"asinh", Intrinsic::not_intrinsic}, {"atanh", Intrinsic::not_intrinsic}, @@ -3849,6 +3855,11 @@ void TypeAnalyzer::visitIntrinsicInst(llvm::IntrinsicInst &I) { case Intrinsic::exp2: case Intrinsic::sin: case Intrinsic::cos: +#if LLVM_VERSION_MAJOR >= 19 + case Intrinsic::sinh: + case Intrinsic::cosh: + case Intrinsic::tanh: +#endif case Intrinsic::floor: case Intrinsic::ceil: case Intrinsic::trunc: diff --git a/enzyme/Enzyme/Utils.h b/enzyme/Enzyme/Utils.h index 9b66730d14d5..a8ce244caadc 100644 --- a/enzyme/Enzyme/Utils.h +++ b/enzyme/Enzyme/Utils.h @@ -1693,6 +1693,11 @@ static inline bool 
isNoEscapingAllocation(const llvm::Function *F) { case Intrinsic::exp: case Intrinsic::cos: case Intrinsic::sin: +#if LLVM_VERSION_MAJOR >= 19 + case Intrinsic::tanh: + case Intrinsic::cosh: + case Intrinsic::sinh: +#endif case Intrinsic::copysign: case Intrinsic::fabs: return true; diff --git a/enzyme/test/Enzyme/ReverseMode/tanh19.ll b/enzyme/test/Enzyme/ReverseMode/tanh19.ll new file mode 100644 index 000000000000..2d22ab6b6328 --- /dev/null +++ b/enzyme/test/Enzyme/ReverseMode/tanh19.ll @@ -0,0 +1,29 @@ +; RUN: if [ %llvmver -ge 19 ]; then %opt < %s %newLoadEnzyme -enzyme-preopt=false -passes="enzyme,function(mem2reg,instsimplify,%simplifycfg)" -S | FileCheck %s; fi + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare double @llvm.tanh.f64(double) #14 + +; Function Attrs: nounwind readnone uwtable +define double @tester(double %x) { +entry: + %0 = call double @llvm.tanh.f64(double %x) + ret double %0 +} + +define double @test_derivative(double %x) { +entry: + %0 = tail call double (double (double)*, ...) @__enzyme_autodiff(double (double)* nonnull @tester, double %x) + ret double %0 +} + +; Function Attrs: nounwind +declare double @__enzyme_autodiff(double (double)*, ...) 
+ +; CHECK: define internal { double } @diffetester(double %x, double %differeturn) +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call fast double @llvm.cosh.f64(double %x) +; CHECK-NEXT: %1 = fmul fast double %0, %0 +; CHECK-NEXT: %2 = fdiv fast double %differeturn, %1 +; CHECK-NEXT: %3 = insertvalue { double } undef, double %2, 0 +; CHECK-NEXT: ret { double } %3 +; CHECK-NEXT: } From 1fe64c0add8df4d9074e7150c803243a00662432 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 2 Oct 2024 02:21:12 -0400 Subject: [PATCH 55/88] fix safe fft performance --- enzyme/benchmarks/ReverseMode/fft/src/lib.rs | 1 + enzyme/benchmarks/ReverseMode/fft/src/safe.rs | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/src/lib.rs b/enzyme/benchmarks/ReverseMode/fft/src/lib.rs index 47b0aa1e97fd..84b16d077ac7 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/lib.rs @@ -1,3 +1,4 @@ +#![feature(slice_swap_unchecked)] #![feature(autodiff)] pub mod safe; diff --git a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs index 7332dcb91356..d44633a3813a 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs @@ -9,8 +9,8 @@ fn bitreversal_perm(data: &mut [T]) { while i < 2*len { if j > i { //dbg!(&i, &j); - data.swap(j-1, i-1); - data.swap(j, i); + unsafe {data.swap_unchecked(j-1, i-1);} + unsafe {data.swap_unchecked(j, i);} } let mut m = len; From 7f26f432d7467362250ac7bf68040de912b9ea6b Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 2 Oct 2024 02:55:05 -0400 Subject: [PATCH 56/88] fix (mostly) safe lstm perf --- enzyme/benchmarks/ReverseMode/lstm/src/safe.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs index ea9e71a67560..ad6481790a19 100644 --- 
a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs @@ -31,11 +31,11 @@ fn lstm_model( let (a, b) = gates.split_at_mut(2 * hsize); let ((forget, ingate), (outgate, change)) = (a.split_at_mut(hsize), b.split_at_mut(hsize)); - assert_eq!(weight.len(), 4 * hsize); - assert_eq!(bias.len(), 4 * hsize); - assert_eq!(hidden.len(), hsize); - assert!(cell.len() >= hsize); - assert!(input.len() >= hsize); + debug_assert_eq!(weight.len(), 4 * hsize); + debug_assert_eq!(bias.len(), 4 * hsize); + debug_assert_eq!(hidden.len(), hsize); + debug_assert!(cell.len() >= hsize); + debug_assert!(input.len() >= hsize); // caching input for i in 0..hsize { forget[i] = sigmoid(input[i] * weight[i] + bias[i]); @@ -130,7 +130,7 @@ pub(crate) fn lstm_objective( let mut ypred = vec![0.0; b]; let mut ynorm = vec![0.0; b]; - assert!(b > 0); + debug_assert!(b > 0); let limit = (c - 1) * b; for j in 0..(c - 1) { From 7ecd27b8252e24abddcb5df77960a1d00a1f58a6 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 23 Oct 2024 17:08:27 -0700 Subject: [PATCH 57/88] make lstm benchmark fair by running c++ with O3 --- enzyme/benchmarks/ReverseMode/lstm/Makefile.make | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make index 0d21100d7567..d8524225f6c1 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make @@ -14,15 +14,16 @@ $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a: src/lib.rs Cargo.to clang++ $(BENCH) $^ -pthread -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm %-raw.ll: %-unopt.ll - @echo $(LOAD) + @echo opt $(LOAD) -o $@ -S opt $^ $(LOAD) -o $@ -S %-opt.ll: %-raw.ll + @echo opt $^ -o $@ -S opt $^ -o $@ -S #opt $^ -O2 -o $@ -S lstm.o: lstm-opt.ll $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a - clang++ 
-pthread -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a + clang++ -pthread -O3 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a results.json: lstm.o ./$^ From aaef8b3513c4dbbd0678256218b796387c1d6f06 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 23 Oct 2024 17:09:10 -0700 Subject: [PATCH 58/88] make ba bench fair by running c++ with O3 --- enzyme/benchmarks/ReverseMode/ba/Makefile.make | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/enzyme/benchmarks/ReverseMode/ba/Makefile.make b/enzyme/benchmarks/ReverseMode/ba/Makefile.make index 1e4ed61859a7..91bf50a386d7 100644 --- a/enzyme/benchmarks/ReverseMode/ba/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/ba/Makefile.make @@ -11,7 +11,7 @@ $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a: src/lib.rs Cargo.toml RUSTFLAGS="-Z autodiff=LooseTypes" cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm ba.o: ba.cpp $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a - clang++ $(LOAD) $(BENCH) ba.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o ba.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a -L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ $(LOAD) $(BENCH) ba.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O3 -o ba.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: ba.o ./$^ From 51ef17a6fa16756e2bc00ed3f9d34edb423f5563 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 23 Oct 2024 17:10:04 -0700 Subject: [PATCH 59/88] make ode-real/bruss fair by adjusting both --- enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml | 3 ++- enzyme/benchmarks/ReverseMode/ode-real/Makefile.make | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml 
b/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml index 880d7aca1567..48b6af750067 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml @@ -14,8 +14,9 @@ crate-type = ["lib"] lto = "fat" opt-level = 3 panic = 'abort' +strip = true +codegen-units = 1 #debug = true -#strip = "none" [profile.dev] lto = "fat" diff --git a/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make b/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make index 083ef1176feb..eeafb349c16f 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make @@ -1,4 +1,5 @@ -# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B ode-raw.ll ode-opt.ll results.txt VERBOSE=1 -f %s +# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B results.txt VERBOSE=1 -f %s +# RUNN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B ode-raw.ll ode-opt.ll results.txt VERBOSE=1 -f %s .PHONY: clean @@ -28,7 +29,7 @@ $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a: src/lib.rs Cargo ode.o: ode.cpp $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a #/home/manuel/prog/llvm18/build/bin/clang++ $(LOAD) $(BENCH) ode.cpp -I /u/drehwald/prog/boost_1_81_0 -DBOOST_DIR=/u/drehwald/prog/boost_1_81_0 -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o ode.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a -L /usr/lib/gcc/x86_64-linux-gnu/11 - clang++ $(LOAD) $(BENCH) ode.cpp -I /u/drehwald/prog/boost_1_81_0 -DBOOST_DIR=/u/drehwald/prog/boost_1_81_0 -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o ode.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a 
-L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ $(LOAD) $(BENCH) ode.cpp -I /u/drehwald/prog/boost_1_81_0 -DBOOST_DIR=/u/drehwald/prog/boost_1_81_0 -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O3 -o ode.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a -L /usr/lib/gcc/x86_64-linux-gnu/11 #fft.o: fft.cpp $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a # clang++ $(LOAD) $(BENCH) fft.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o fft.o -lpthread $(BENCHLINK) -lm -lfft -L $(dir)/benchmarks/ReverseMode/fft/target/release/ -L /usr/lib/gcc/x86_64-linux-gnu/11 From 9c9f276cdc293b2a5a1f0f5bea9234e5c0457401 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sat, 26 Oct 2024 23:16:43 -0400 Subject: [PATCH 60/88] equalize benchmark configs and move over to new use std::autodiff::autodiff --- .gitignore | 2 + enzyme/benchmarks/ReverseMode/adbench/gmm.h | 12 ++-- enzyme/benchmarks/ReverseMode/adbench/lstm.h | 66 +++++++++---------- enzyme/benchmarks/ReverseMode/ba/Cargo.toml | 6 +- enzyme/benchmarks/ReverseMode/ba/src/lib.rs | 2 +- enzyme/benchmarks/ReverseMode/ba/src/safe.rs | 1 + .../benchmarks/ReverseMode/ba/src/unsafe.rs | 1 + enzyme/benchmarks/ReverseMode/fft/Cargo.toml | 4 ++ .../benchmarks/ReverseMode/fft/Makefile.make | 2 +- enzyme/benchmarks/ReverseMode/fft/src/safe.rs | 1 + enzyme/benchmarks/ReverseMode/fft/src/unsf.rs | 1 + enzyme/benchmarks/ReverseMode/gmm/Cargo.toml | 6 +- .../benchmarks/ReverseMode/gmm/Makefile.make | 2 +- enzyme/benchmarks/ReverseMode/gmm/src/safe.rs | 1 + .../benchmarks/ReverseMode/gmm/src/unsafe.rs | 1 + enzyme/benchmarks/ReverseMode/lstm/Cargo.toml | 4 ++ .../benchmarks/ReverseMode/lstm/Makefile.make | 35 +++++----- .../benchmarks/ReverseMode/lstm/src/safe.rs | 7 ++ .../benchmarks/ReverseMode/lstm/src/unsf.rs | 2 + .../ReverseMode/ode-real/Cargo.toml | 6 +- .../ReverseMode/ode-real/src/lib.rs | 2 + 21 files changed, 99 insertions(+), 65 deletions(-) 
diff --git a/.gitignore b/.gitignore index a3c2751a0f5c..39cffb5ce16c 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,8 @@ enzyme/benchmarks/ReverseMode/*/*.ll enzyme/benchmarks/ReverseMode/*/*.bc enzyme/benchmarks/ReverseMode/*/*.o enzyme/benchmarks/ReverseMode/*/*.exe +enzyme/benchmarks/ReverseMode/*/target/ enzyme/benchmarks/ReverseMode/*/results.txt enzyme/benchmarks/ReverseMode/*/results.json .cache + diff --git a/enzyme/benchmarks/ReverseMode/adbench/gmm.h b/enzyme/benchmarks/ReverseMode/adbench/gmm.h index 91b2220bf950..e9551ba7820e 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/gmm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/gmm.h @@ -213,10 +213,10 @@ int main(const int argc, const char* argv[]) { std::vector paths;// = { "1k/gmm_d10_K100.txt" }; - // getTests(paths, "data/1k", "1k/"); - // getTests(paths, "data/2.5k", "2.5k/"); - // getTests(paths, "data/10k", "10k/"); - paths.push_back("1k/gmm_d2_K5.txt"); + //getTests(paths, "data/1k", "1k/"); + //getTests(paths, "data/2.5k", "2.5k/"); + getTests(paths, "data/10k", "10k/"); + //paths.push_back("1k/gmm_d2_K5.txt"); std::ofstream jsonfile("results.json", std::ofstream::trunc); json test_results; @@ -267,7 +267,7 @@ int main(const int argc, const char* argv[]) { struct GMMOutput result = { 0, std::vector(Jcols) }; - if (0) { + //if (0) { try { struct timeval start, end; gettimeofday(&start, NULL); @@ -287,7 +287,7 @@ int main(const int argc, const char* argv[]) { } catch (std::bad_alloc) { printf("Adept combined 88888888 ooms\n"); } - } + //} } { diff --git a/enzyme/benchmarks/ReverseMode/adbench/lstm.h b/enzyme/benchmarks/ReverseMode/adbench/lstm.h index a24c39132215..fda5f8e3a0f2 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/lstm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/lstm.h @@ -288,39 +288,39 @@ int main(const int argc, const char* argv[]) { } - //{ - - // struct LSTMInput input = {}; - - //// Read instance - // read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, - // 
input.main_params, input.extra_params, input.state, - // input.sequence); - - // std::vector state = std::vector(input.state.size()); - - // int Jcols = 8 * input.l * input.b + 3 * input.b; - // struct LSTMOutput result = { 0, std::vector(Jcols) }; - - //{ - // struct timeval start, end; - // gettimeofday(&start, NULL); - // calculate_jacobian(input, result); - // gettimeofday(&end, NULL); - // printf("Adept combined %0.6f\n", tdiff(&start, &end)); - // json adept; - // adept["name"] = "Adept combined"; - // adept["runtime"] = tdiff(&start, &end); - // for (unsigned i = result.gradient.size() - 5; - // i < result.gradient.size(); i++) { - // printf("%f ", result.gradient[i]); - // adept["result"].push_back(result.gradient[i]); - // } - // test_suite["tools"].push_back(adept); - // printf("\n"); - //} - - //} + { + + struct LSTMInput input = {}; + + // Read instance + read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, + input.main_params, input.extra_params, input.state, + input.sequence); + + std::vector state = std::vector(input.state.size()); + + int Jcols = 8 * input.l * input.b + 3 * input.b; + struct LSTMOutput result = { 0, std::vector(Jcols) }; + + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Adept combined %0.6f\n", tdiff(&start, &end)); + json adept; + adept["name"] = "Adept combined"; + adept["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; + i < result.gradient.size(); i++) { + printf("%f ", result.gradient[i]); + adept["result"].push_back(result.gradient[i]); + } + test_suite["tools"].push_back(adept); + printf("\n"); + } + + } { diff --git a/enzyme/benchmarks/ReverseMode/ba/Cargo.toml b/enzyme/benchmarks/ReverseMode/ba/Cargo.toml index 160c7716f3d8..4bc9c2195925 100644 --- a/enzyme/benchmarks/ReverseMode/ba/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/ba/Cargo.toml @@ -3,7 +3,6 @@ name = "bars" version = "0.1.0" 
edition = "2021" - [lib] crate-type = ["cdylib"] @@ -11,6 +10,11 @@ crate-type = ["cdylib"] [profile.release] lto = "fat" +opt-level = 3 +codegen-units = 1 +unwind = "abort" +strip = true +#overflow-checks = false [profile.dev] lto = "fat" diff --git a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs index 1f665012c07a..7efd43ff2b28 100644 --- a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs @@ -1,7 +1,7 @@ #![feature(autodiff)] -#![feature(slice_first_last_chunk)] #![allow(non_snake_case)] +use std::autodiff::autodiff; pub mod safe; pub mod r#unsafe; diff --git a/enzyme/benchmarks/ReverseMode/ba/src/safe.rs b/enzyme/benchmarks/ReverseMode/ba/src/safe.rs index c38f5359cc30..c15822fda650 100644 --- a/enzyme/benchmarks/ReverseMode/ba/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/ba/src/safe.rs @@ -1,5 +1,6 @@ use crate::BA_NCAMPARAMS; use crate::compute_zach_weight_error; +use std::autodiff::autodiff; fn sqsum(x: &[f64]) -> f64 { x.iter().map(|&v| v * v).sum() diff --git a/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs b/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs index 477d900c3310..874b9ba9e9d1 100644 --- a/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs +++ b/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs @@ -1,5 +1,6 @@ use crate::BA_NCAMPARAMS; use crate::compute_zach_weight_error; +use std::autodiff::autodiff; unsafe fn sqsum(x: *const f64, n: usize) -> f64 { let mut sum = 0.; diff --git a/enzyme/benchmarks/ReverseMode/fft/Cargo.toml b/enzyme/benchmarks/ReverseMode/fft/Cargo.toml index 5366aefa719e..cf8862df4f7a 100644 --- a/enzyme/benchmarks/ReverseMode/fft/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/fft/Cargo.toml @@ -13,6 +13,10 @@ crate-type = ["lib"] [profile.release] lto = "fat" opt-level = 3 +codegen-units = 1 +unwind = "abort" +strip = true +#overflow-checks = false [profile.dev] lto = "fat" diff --git a/enzyme/benchmarks/ReverseMode/fft/Makefile.make 
b/enzyme/benchmarks/ReverseMode/fft/Makefile.make index a2de0fdbcc62..288168048035 100644 --- a/enzyme/benchmarks/ReverseMode/fft/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/fft/Makefile.make @@ -11,7 +11,7 @@ $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a: src/lib.rs Cargo.toml cargo +enzyme rustc --release --lib --crate-type=staticlib fft.o: fft.cpp $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a - clang++ $(LOAD) $(BENCH) fft.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o fft.o -lpthread $(BENCHLINK) -lm -lfft -L $(dir)/benchmarks/ReverseMode/fft/target/release/ -L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ $(LOAD) $(BENCH) fft.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O3 -o fft.o -lpthread $(BENCHLINK) -lm -lfft -L $(dir)/benchmarks/ReverseMode/fft/target/release/ -L /usr/lib/gcc/x86_64-linux-gnu/11 results.txt: fft.o ./$^ 1048576 | tee $@ diff --git a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs index d44633a3813a..e6c088cda996 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs @@ -1,5 +1,6 @@ use std::slice; use std::f64::consts::PI; +use std::autodiff::autodiff; fn bitreversal_perm(data: &mut [T]) { let len = data.len() / 2; diff --git a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs index 5391d035095f..efd242594ffd 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs @@ -1,4 +1,5 @@ use std::f64::consts::PI; +use std::autodiff::autodiff; //static void scramble(double* data, unsigned N) { // int j=1; diff --git a/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml index 85dfa6310c34..1ae02738efc6 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml @@ -14,8 +14,10 @@ libm = 
["dep:libm"] [profile.release] lto = "fat" opt-level = 3 -#debug = true -#strip = "none" +codegen-units = 1 +panic = "abort" +strip = true +#overflow-checks = false [profile.dev] lto = "fat" diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index 7eff2a950c86..52ed73a6fc07 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -11,7 +11,7 @@ $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a: src/lib.rs Cargo.to RUSTFLAGS="-Z autodiff=LooseTypes" cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm gmm.o: gmm.cpp $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a - clang++ $(LOAD) $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o gmm.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ $(LOAD) $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O3 -o gmm.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: gmm.o ./$^ diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs index c809c34e5454..8f3f255d2d4c 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs @@ -1,5 +1,6 @@ use crate::Wishart; use std::f64::consts::PI; +use std::autodiff::autodiff; #[cfg(feature = "libm")] use libm::lgamma; diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/unsafe.rs b/enzyme/benchmarks/ReverseMode/gmm/src/unsafe.rs index b2730538c88e..1b82cebf7514 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/unsafe.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/unsafe.rs @@ -1,5 +1,6 @@ use std::f64::consts::PI; use crate::Wishart; +use std::autodiff::autodiff; #[cfg(feature = "libm")] use 
libm::lgamma; diff --git a/enzyme/benchmarks/ReverseMode/lstm/Cargo.toml b/enzyme/benchmarks/ReverseMode/lstm/Cargo.toml index 6e659faf3a3b..d28f845bba29 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/lstm/Cargo.toml @@ -13,6 +13,10 @@ crate-type = ["lib"] [profile.release] lto = "fat" opt-level = 3 +codegen-units = 1 +unwind = "abort" +strip = true +#overflow-checks = false [profile.dev] lto = "fat" diff --git a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make index d8524225f6c1..a87e2e036292 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make @@ -1,4 +1,4 @@ -# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadEnzyme %enzyme" make -B lstm-raw.ll results.json -f %s +# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B lstm-raw.ll results.json -f %s .PHONY: clean @@ -8,22 +8,23 @@ clean: rm -f *.ll *.o results.json $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a: src/lib.rs Cargo.toml - RUSTFLAGS="-Z autodiff=LooseTypes" cargo +enzyme rustc --release --lib --crate-type=staticlib - -%-unopt.ll: %.cpp - clang++ $(BENCH) $^ -pthread -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm - -%-raw.ll: %-unopt.ll - @echo opt $(LOAD) -o $@ -S - opt $^ $(LOAD) -o $@ -S - -%-opt.ll: %-raw.ll - @echo opt $^ -o $@ -S - opt $^ -o $@ -S - #opt $^ -O2 -o $@ -S - -lstm.o: lstm-opt.ll $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a - clang++ -pthread -O3 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a + cargo +enzyme rustc --release --lib --crate-type=staticlib + #RUSTFLAGS="-Z autodiff=LooseTypes" cargo +enzyme rustc --release --lib --crate-type=staticlib + +# %-unopt.ll: %.cpp +# clang++ 
$(BENCH) $^ -pthread -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm +# +# %-raw.ll: %-unopt.ll +# @echo opt $(LOAD) -o $@ -S +# opt $^ $(LOAD) -o $@ -S +# +# %-opt.ll: %-raw.ll +# @echo opt $^ -o $@ -S +# opt $^ -o $@ -S +# #opt $^ -O2 -o $@ -S + +lstm.o: lstm.cpp $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a + clang++ $(LOAD) $(BENCH) -pthread -O3 lstm.cpp -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a results.json: lstm.o ./$^ diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs index ad6481790a19..f3225db5c7eb 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs @@ -1,4 +1,5 @@ use std::slice; +use std::autodiff::autodiff; // Sigmoid on scalar fn sigmoid(x: f64) -> f64 { @@ -31,10 +32,15 @@ fn lstm_model( let (a, b) = gates.split_at_mut(2 * hsize); let ((forget, ingate), (outgate, change)) = (a.split_at_mut(hsize), b.split_at_mut(hsize)); + #[cfg(debug_assertions)] debug_assert_eq!(weight.len(), 4 * hsize); + #[cfg(debug_assertions)] debug_assert_eq!(bias.len(), 4 * hsize); + #[cfg(debug_assertions)] debug_assert_eq!(hidden.len(), hsize); + #[cfg(debug_assertions)] debug_assert!(cell.len() >= hsize); + #[cfg(debug_assertions)] debug_assert!(input.len() >= hsize); // caching input for i in 0..hsize { @@ -130,6 +136,7 @@ pub(crate) fn lstm_objective( let mut ypred = vec![0.0; b]; let mut ynorm = vec![0.0; b]; + #[cfg(debug_assertions)] debug_assert!(b > 0); let limit = (c - 1) * b; diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/unsf.rs b/enzyme/benchmarks/ReverseMode/lstm/src/unsf.rs index 3758c8e1e97a..a93613ea2dda 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/unsf.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/unsf.rs @@ -1,3 +1,5 @@ +use std::autodiff::autodiff; + // Sigmoid on scalar fn sigmoid(x: f64) -> f64 { 1.0 / (1.0 + (-x).exp()) diff 
--git a/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml b/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml index 48b6af750067..b7386a4ba87c 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml @@ -13,10 +13,10 @@ crate-type = ["lib"] [profile.release] lto = "fat" opt-level = 3 -panic = 'abort' -strip = true codegen-units = 1 -#debug = true +unwind = "abort" +strip = true +#overflow-checks = false [profile.dev] lto = "fat" diff --git a/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs b/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs index 2347ca8e0f80..4c1d8bfdca1c 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs @@ -8,6 +8,8 @@ //#define BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS //#define BOOST_NO_EXCEPTIONS +use std::autodiff::autodiff; + const N: usize = 32; const xmin: f64 = 0.; const xmax: f64 = 1.; From 5ebcd1e35cd7da44aa4a5099ba1ab129760421e6 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 27 Oct 2024 01:27:13 -0400 Subject: [PATCH 61/88] move Rust to DuplicatedOnly --- enzyme/benchmarks/ReverseMode/ba/src/safe.rs | 4 ++-- .../benchmarks/ReverseMode/ba/src/unsafe.rs | 4 ++-- enzyme/benchmarks/ReverseMode/fft/src/safe.rs | 4 ++-- enzyme/benchmarks/ReverseMode/fft/src/unsf.rs | 2 +- enzyme/benchmarks/ReverseMode/gmm/src/safe.rs | 6 +++--- .../benchmarks/ReverseMode/gmm/src/unsafe.rs | 2 +- .../benchmarks/ReverseMode/lstm/src/safe.rs | 20 +++++++------------ .../benchmarks/ReverseMode/lstm/src/unsf.rs | 2 +- 8 files changed, 19 insertions(+), 25 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/ba/src/safe.rs b/enzyme/benchmarks/ReverseMode/ba/src/safe.rs index c15822fda650..3530c79e5a8e 100644 --- a/enzyme/benchmarks/ReverseMode/ba/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/ba/src/safe.rs @@ -78,7 +78,7 @@ pub extern "C" fn rust_dcompute_reproj_error( err: *mut [f64; 2], derr: *mut [f64; 2], ) { - 
dcompute_reproj_error(cam, dcam, x, dx, w, wb, feat, err, derr); + unsafe {dcompute_reproj_error(cam, dcam, x, dx, w, wb, feat, err, derr)}; } #[autodiff( @@ -88,7 +88,7 @@ pub extern "C" fn rust_dcompute_reproj_error( Duplicated, Duplicated, Const, - Duplicated + DuplicatedOnly )] pub fn compute_reproj_error( cam: *const [f64; 11], diff --git a/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs b/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs index 874b9ba9e9d1..09f74be9b6f8 100644 --- a/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs +++ b/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs @@ -82,7 +82,7 @@ pub unsafe extern "C" fn rust_unsafe_dcompute_reproj_error( err: *mut f64, derr: *mut f64, ) { - dcompute_reproj_error(cam, dcam, x, dx, w, wb, feat, err, derr); + unsafe {dcompute_reproj_error(cam, dcam, x, dx, w, wb, feat, err, derr)}; } @@ -93,7 +93,7 @@ pub unsafe extern "C" fn rust_unsafe_dcompute_reproj_error( Duplicated, Duplicated, Const, - Duplicated + DuplicatedOnly )] pub unsafe fn compute_reproj_error( cam: *const f64, diff --git a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs index e6c088cda996..af9169c4227e 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs @@ -78,7 +78,7 @@ fn ifft(data: &mut [f64]) { rescale(data, data.len() / 2); } -#[autodiff(dfoobar, Reverse, Duplicated)] +#[autodiff(dfoobar, Reverse, DuplicatedOnly)] pub fn foobar(data: &mut [f64]) { fft(data); ifft(data); @@ -94,7 +94,7 @@ pub extern "C" fn rust_dfoobar(n: usize, data: *mut f64, ddata: *mut f64) { ) }; - dfoobar(data, ddata); + unsafe {dfoobar(data, ddata)}; } #[no_mangle] diff --git a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs index efd242594ffd..a0af8bf35302 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs @@ -103,7 +103,7 @@ unsafe fn ifft(data: *mut f64, n: 
usize) { rescale(data, n); } -#[autodiff(unsafe_dfoobar, Reverse, Const, Duplicated)] +#[autodiff(unsafe_dfoobar, Reverse, Const, DuplicatedOnly)] pub unsafe fn unsafe_foobar(n: usize, data: *mut f64) { fft(data, n ); ifft(data, n ); diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs index 8f3f255d2d4c..9356b1178afb 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs @@ -52,7 +52,7 @@ pub extern "C" fn rust_dgmm_objective( let (mut bqdiags, mut bsum_qs, mut bxcentered, mut bqxcentered, mut bmain_term) = get_workspace(d, k); - dgmm_objective( + unsafe { dgmm_objective( d, k, n, @@ -77,7 +77,7 @@ pub extern "C" fn rust_dgmm_objective( &mut bqxcentered, &mut main_term, &mut bmain_term, - ); + )}; unsafe { *err = my_err }; unsafe { *derr = my_derr }; @@ -147,7 +147,7 @@ fn get_workspace(d: usize, k: usize) -> (Vec, Vec, Vec, Vec, Const, Const, Const, - Duplicated, + DuplicatedOnly, Duplicated, Duplicated, Duplicated, diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/unsafe.rs b/enzyme/benchmarks/ReverseMode/gmm/src/unsafe.rs index 1b82cebf7514..aa91938565ab 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/unsafe.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/unsafe.rs @@ -38,7 +38,7 @@ pub extern "C" fn rust_unsafe_gmm_objective(d: i32, k: i32, n: i32, alphas: *con // gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); //} -#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, DuplicatedOnly)] pub unsafe fn gmm_objective(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); let icf_sz = d * (d + 1) / 2; diff --git 
a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs index f3225db5c7eb..d6847a4d5d72 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs @@ -32,16 +32,11 @@ fn lstm_model( let (a, b) = gates.split_at_mut(2 * hsize); let ((forget, ingate), (outgate, change)) = (a.split_at_mut(hsize), b.split_at_mut(hsize)); - #[cfg(debug_assertions)] - debug_assert_eq!(weight.len(), 4 * hsize); - #[cfg(debug_assertions)] - debug_assert_eq!(bias.len(), 4 * hsize); - #[cfg(debug_assertions)] - debug_assert_eq!(hidden.len(), hsize); - #[cfg(debug_assertions)] - debug_assert!(cell.len() >= hsize); - #[cfg(debug_assertions)] - debug_assert!(input.len() >= hsize); + //debug_assert_eq!(weight.len(), 4 * hsize); + //debug_assert_eq!(bias.len(), 4 * hsize); + //debug_assert_eq!(hidden.len(), hsize); + //debug_assert!(cell.len() >= hsize); + //debug_assert!(input.len() >= hsize); // caching input for i in 0..hsize { forget[i] = sigmoid(input[i] * weight[i] + bias[i]); @@ -118,7 +113,7 @@ fn lstm_predict( Duplicated, Const, Const, - Duplicated + DuplicatedOnly )] pub(crate) fn lstm_objective( l: usize, @@ -136,8 +131,7 @@ pub(crate) fn lstm_objective( let mut ypred = vec![0.0; b]; let mut ynorm = vec![0.0; b]; - #[cfg(debug_assertions)] - debug_assert!(b > 0); + //debug_assert!(b > 0); let limit = (c - 1) * b; for j in 0..(c - 1) { diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/unsf.rs b/enzyme/benchmarks/ReverseMode/lstm/src/unsf.rs index a93613ea2dda..498bf96a9983 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/unsf.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/unsf.rs @@ -81,7 +81,7 @@ unsafe fn lstm_predict( } // LSTM objective (loss function) -#[autodiff(d_lstm_unsafe_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Const, Const, Duplicated)] +#[autodiff(d_lstm_unsafe_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Const, Const, 
DuplicatedOnly)] pub (crate) unsafe fn lstm_unsafe_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { let mut total = 0.0; let mut count = 0; From aeed32d58d82841303a8a6da601070b3196bcd2b Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 28 Oct 2024 14:41:50 -0400 Subject: [PATCH 62/88] cleanup makefiles --- enzyme/benchmarks/ReverseMode/ba/Makefile.make | 2 +- enzyme/benchmarks/ReverseMode/lstm/Makefile.make | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/ba/Makefile.make b/enzyme/benchmarks/ReverseMode/ba/Makefile.make index 91bf50a386d7..f995bc33eaa3 100644 --- a/enzyme/benchmarks/ReverseMode/ba/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/ba/Makefile.make @@ -1,4 +1,4 @@ -# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B ba.o results.json -f %s +# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B ba.o results.json VERBOSE=1 -f %s .PHONY: clean diff --git a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make index a87e2e036292..eb9d531b0cd8 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make @@ -1,4 +1,4 @@ -# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B lstm-raw.ll results.json -f %s +# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B lstm.o results.json VERBOSE=1 -f %s .PHONY: clean @@ -9,7 +9,6 @@ clean: $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a: src/lib.rs Cargo.toml cargo +enzyme rustc --release --lib --crate-type=staticlib - #RUSTFLAGS="-Z 
autodiff=LooseTypes" cargo +enzyme rustc --release --lib --crate-type=staticlib # %-unopt.ll: %.cpp # clang++ $(BENCH) $^ -pthread -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm From 1542a3867e6138f77a2593abd968c3a13a0b07bd Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 28 Oct 2024 18:57:25 -0400 Subject: [PATCH 63/88] update fft-tapenade to be correct but segfault --- enzyme/benchmarks/ReverseMode/fft/fft.cpp | 74 +++---- enzyme/benchmarks/ReverseMode/fft/fft.h | 245 +++++++++++++--------- 2 files changed, 183 insertions(+), 136 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.cpp b/enzyme/benchmarks/ReverseMode/fft/fft.cpp index b6c5fb7b5eaa..0613934397a6 100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.cpp +++ b/enzyme/benchmarks/ReverseMode/fft/fft.cpp @@ -153,43 +153,43 @@ static void adept_sincos(double inp, unsigned len) { static void tapenade_sincos(double inp, unsigned len) { - { - struct timeval start, end; - gettimeofday(&start, NULL); - - double *x = new double[2*len]; - for(int i=0; i<2*len; i++) x[i] = 2.0; - foobar(x, len); - double res = x[0]; - - gettimeofday(&end, NULL); - printf("Tapenade real %0.6f res=%f\n", tdiff(&start, &end), res); - delete[] x; - } - - { - struct timeval start, end; - gettimeofday(&start, NULL); - - double* x = new double[2*len]; - for(int i=0; i<2*len; i++) x[i] = 2.0; - foobar(x, len); - double res = x[0]; - - gettimeofday(&end, NULL); - printf("Tapenade forward %0.6f res=%f\n", tdiff(&start, &end), res); - delete[] x; - } - - { - struct timeval start, end; - gettimeofday(&start, NULL); - - double res2 = tfoobar_and_gradient(len); - - gettimeofday(&end, NULL); - printf("Tapenade combined %0.6f res'=%f\n", tdiff(&start, &end), res2); - } + // { + // struct timeval start, end; + // gettimeofday(&start, NULL); + + // double *x = new double[2*len]; + // for(int i=0; i<2*len; i++) x[i] = 2.0; + // foobar(x, len); + // double res = x[0]; + + // 
gettimeofday(&end, NULL); + // printf("Tapenade real %0.6f res=%f\n", tdiff(&start, &end), res); + // delete[] x; + // } + + // { + // struct timeval start, end; + // gettimeofday(&start, NULL); + + // double* x = new double[2*len]; + // for(int i=0; i<2*len; i++) x[i] = 2.0; + // foobar(x, len); + // double res = x[0]; + + // gettimeofday(&end, NULL); + // printf("Tapenade forward %0.6f res=%f\n", tdiff(&start, &end), res); + // delete[] x; + // } + + // { + // struct timeval start, end; + // gettimeofday(&start, NULL); + + // double res2 = tfoobar_and_gradient(len); + + // gettimeofday(&end, NULL); + // printf("Tapenade combined %0.6f res'=%f\n", tdiff(&start, &end), res2); + // } } static void enzyme_sincos(double inp, unsigned len) { diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.h b/enzyme/benchmarks/ReverseMode/fft/fft.h index 809196b76cc3..79d8647d5300 100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.h +++ b/enzyme/benchmarks/ReverseMode/fft/fft.h @@ -164,39 +164,113 @@ static void ifft(aVector data, unsigned N) { //! 
Tapenade extern "C" { + /* Generated by TAPENADE (INRIA, Ecuador team) - Tapenade 3.15 (master) - 15 Apr 2020 11:54 + Tapenade 3.16 (bugfix_servletAD) - 4 Jan 2024 17:44 */ #include +#include +#include + +/* + Differentiation of swap in reverse (adjoint) mode: + gradient of useful results: *a *b + with respect to varying inputs: *a *b + Plus diff mem management of: a:in b:in +*/ +inline void swap_b(double *a, double *ab, double *b, double *bb) { + double temp = *a; + double tempb = 0.0; + *a = *b; + *b = temp; + tempb = *bb; + *bb = *ab; + *ab = tempb; +} + +inline void swap_c(double *a, double *b) { + double temp = *a; + *a = *b; + *b = temp; +} + +static void recursiveApply_c(double *data, int iSign, unsigned int N) { + unsigned int arg1; + double *arg10; + unsigned int arg2; + if (N == 1) + return; + else { + arg1 = N/2; + recursiveApply_c(data, iSign, arg1); + arg10 = data + N; + arg2 = N/2; + recursiveApply_c(arg10, iSign, arg2); + double wtemp = iSign*sin(3.14/N); + double wpi = -iSign*sin(2*3.14/N); + double wpr = -2.0*wtemp*wtemp; + double wr = 1.0; + double wi = 0.0; + for (unsigned int i = 0; i <= N-1; i += 2) { + int iN = i + N; + double tempr = data[iN]*wr - data[iN+1]*wi; + double tempi = data[iN]*wi + data[iN+1]*wr; + data[iN] = data[i] - tempr; + data[iN + 1] = data[i + 1] - tempi; + data[i] += tempr; + data[i + 1] += tempi; + wtemp = wr; + wr += wr*wpr - wi*wpi; + wi += wi*wpr + wtemp*wpi; + } + } +} /* - Differentiation of recursiveApply in reverse (adjoint) mode (with options context): + Differentiation of recursiveApply in reverse (adjoint) mode: gradient of useful results: *data with respect to varying inputs: *data Plus diff mem management of: data:in */ -static void recursiveApply_b(double *data, double *datab, int iSign, unsigned +static void recursiveApply_b(double *data, double *datab, int iSign, unsigned int N) { - int arg1; + unsigned int arg1; double *arg10; double *arg10b; - int arg2; + unsigned int arg2; + int branch; if (N != 1) { 
arg1 = N/2; + pushReal8(*data); + recursiveApply_c(data, iSign, arg1); arg10b = datab + N; arg10 = data + N; arg2 = N/2; - double wtemp = iSign*sin(3.1415926536/N); - double wpi = -iSign*sin(2*3.1415926536/N); + if (arg10) { + pushReal8(*arg10); + pushControl1b(1); + } else + pushControl1b(0); + recursiveApply_c(arg10, iSign, arg2); + double wtemp = iSign*sin(3.14/N); + double wpi = -iSign*sin(2*3.14/N); double wpr = -2.0*wtemp*wtemp; double wr = 1.0; double wi = 0.0; - for (int i = 0; i <= N-1; i += 2) { + for (unsigned int i = 0; i <= N-1; i += 2) { int iN = i + N; double tempr = data[iN]*wr - data[iN+1]*wi; double tempi = data[iN]*wi + data[iN+1]*wr; + double temprb; + double tempib; double tmp; double tmp0; + tmp = data[i] - tempr; + data[iN] = tmp; + tmp0 = data[i + 1] - tempi; + data[iN + 1] = tmp0; + data[i] = data[i] + tempr; + data[i + 1] = data[i + 1] + tempi; wtemp = wr; pushReal8(wr); wr = wr + (wr*wpr - wi*wpi); @@ -204,13 +278,7 @@ static void recursiveApply_b(double *data, double *datab, int iSign, unsigned wi = wi + (wi*wpr + wtemp*wpi); pushInteger4(iN); } - pushPointer8(arg10b); - pushInteger4(arg2); - pushInteger4(arg1); - popInteger4(&arg1); - popInteger4(&arg2); - popPointer8((void **)&arg10b); - for (int i = N-(N-1)%2-1; i >= 0; i -= 2) { + for (unsigned int i = N-(N-1)%2-1; i >= 0; i -= 2) { int iN; double tempr; double temprb = 0.0; @@ -233,65 +301,17 @@ static void recursiveApply_b(double *data, double *datab, int iSign, unsigned datab[iN + 1] = datab[iN + 1] + wr*tempib - wi*temprb; datab[iN] = datab[iN] + wi*tempib + wr*temprb; } + popControl1b(&branch); + if (branch == 1) + popReal8(arg10); recursiveApply_b(arg10, arg10b, iSign, arg2); + popReal8(data); recursiveApply_b(data, datab, iSign, arg1); } } -static void recursiveApply_nodiff(double *data, int iSign, unsigned int N) { - int arg1; - double *arg10; - int arg2; - if (N == 1) - return; - else { - arg1 = N/2; - recursiveApply_nodiff(data, iSign, arg1); - arg10 = data + N; - arg2 = 
N/2; - recursiveApply_nodiff(arg10, iSign, arg2); - double wtemp = iSign*sin(3.1415926536/N); - double wpi = -iSign*sin(2*3.1415926536/N); - double wpr = -2.0*wtemp*wtemp; - double wr = 1.0; - double wi = 0.0; - for (int i = 0; i <= N-1; i += 2) { - int iN = i + N; - double tempr = data[iN]*wr - data[iN+1]*wi; - double tempi = data[iN]*wi + data[iN+1]*wr; - data[iN] = data[i] - tempr; - data[iN + 1] = data[i + 1] - tempi; - data[i] += tempr; - data[i + 1] += tempi; - wtemp = wr; - wr += wr*wpr - wi*wpi; - wi += wi*wpr + wtemp*wpi; - } - } -} - /* - Differentiation of swap in reverse (adjoint) mode (with options context): - gradient of useful results: *a *b - with respect to varying inputs: *a *b - Plus diff mem management of: a:in b:in -*/ -static void swap_b(double *a, double *ab, double *b, double *bb) { - double temp = *a; - double tempb = 0.0; - tempb = *bb; - *bb = *ab; - *ab = tempb; -} - -static void swap_nodiff(double *a, double *b) { - double temp = *a; - *a = *b; - *b = temp; -} - -/* - Differentiation of scramble in reverse (adjoint) mode (with options context): + Differentiation of scramble in reverse (adjoint) mode: gradient of useful results: *data with respect to varying inputs: *data Plus diff mem management of: data:in @@ -301,9 +321,15 @@ static void scramble_b(double *data, double *datab, unsigned int N) { int branch; for (int i = 1; i <= 2*N-1; i += 2) { int adCount; - if (j > i) + if (j > i) { + pushReal8(data[i - 1]); + pushReal8(data[j - 1]); + swap_c(&(data[j - 1]), &(data[i - 1])); + pushReal8(data[i]); + pushReal8(data[j]); + swap_c(&(data[j]), &(data[i])); pushControl1b(0); - else + } else pushControl1b(1); int m = N; adCount = 0; @@ -327,19 +353,23 @@ static void scramble_b(double *data, double *datab, unsigned int N) { popInteger4(&j); popControl1b(&branch); if (branch == 0) { + popReal8(&(data[j])); + popReal8(&(data[i])); swap_b(&(data[j]), &(datab[j]), &(data[i]), &(datab[i])); - swap_b(&(data[j - 1]), &(datab[j - 1]), &(data[i - 
1]), &(datab[i + popReal8(&(data[j - 1])); + popReal8(&(data[i - 1])); + swap_b(&(data[j - 1]), &(datab[j - 1]), &(data[i - 1]), &(datab[i - 1])); } } } -static void scramble_nodiff(double *data, unsigned int N) { +static void scramble_c(double *data, unsigned int N) { int j = 1; for (int i = 1; i <= 2*N-1; i += 2) { if (j > i) { - swap_nodiff(&(data[j - 1]), &(data[i - 1])); - swap_nodiff(&(data[j]), &(data[i])); + swap_c(&(data[j - 1]), &(data[i - 1])); + swap_c(&(data[j]), &(data[i])); } int m = N; while(m >= 2 && j > m) { @@ -351,74 +381,91 @@ static void scramble_nodiff(double *data, unsigned int N) { } /* - Differentiation of rescale in reverse (adjoint) mode (with options context): + Differentiation of rescale in reverse (adjoint) mode: gradient of useful results: *data with respect to varying inputs: *data Plus diff mem management of: data:in */ static void rescale_b(double *data, double *datab, unsigned int N) { double scale = (double)1/N; - pushReal8(scale); - popReal8(&scale); - for (int i = 2*N-1; i > -1; --i) + for (unsigned int i = 0; i < 2*N; ++i) + data[i] = data[i]*scale; + for (unsigned int i = 2*N-1; i > -1; --i) datab[i] = scale*datab[i]; } -static void rescale_nodiff(double *data, unsigned int N) { +static void rescale_c(double *data, unsigned int N) { double scale = (double)1/N; - for (int i = 0; i < 2*N; ++i) + for (unsigned int i = 0; i < 2*N; ++i) data[i] *= scale; } /* - Differentiation of fft in reverse (adjoint) mode (with options context): + Differentiation of fiveft in reverse (adjoint) mode: gradient of useful results: *data with respect to varying inputs: *data Plus diff mem management of: data:in */ -static void fft_b(double *data, double *datab, unsigned int N) { +void fiveft_b(double *data, double *datab, unsigned int N) { + pushReal8(*data); + scramble_c(data, N); + pushReal8(*data); + recursiveApply_c(data, 1, N); + popReal8(data); recursiveApply_b(data, datab, 1, N); + popReal8(data); scramble_b(data, datab, N); } -static void 
fft_nodiff(double *data, unsigned int N) { - scramble_nodiff(data, N); - recursiveApply_nodiff(data, 1, N); +void fiveft_c(double *data, unsigned int N) { + scramble_c(data, N); + recursiveApply_c(data, 1, N); } /* - Differentiation of ifft in reverse (adjoint) mode (with options context): + Differentiation of ifiveft in reverse (adjoint) mode: gradient of useful results: *data with respect to varying inputs: *data Plus diff mem management of: data:in */ -static void ifft_b(double *data, double *datab, unsigned int N) { +void ifiveft_b(double *data, double *datab, unsigned int N) { + pushReal8(*data); + scramble_c(data, N); + pushReal8(*data); + recursiveApply_c(data, -1, N); + pushReal8(*data); + rescale_c(data, N); + popReal8(data); rescale_b(data, datab, N); + popReal8(data); recursiveApply_b(data, datab, -1, N); + popReal8(data); scramble_b(data, datab, N); } -static void ifft_nodiff(double *data, unsigned int N) { - scramble_nodiff(data, N); - recursiveApply_nodiff(data, -1, N); - rescale_nodiff(data, N); +void ifiveft_c(double *data, unsigned int N) { + scramble_c(data, N); + recursiveApply_c(data, -1, N); + rescale_c(data, N); } /* - Differentiation of foobar in reverse (adjoint) mode (with options context): + Differentiation of foobar in reverse (adjoint) mode: gradient of useful results: *data with respect to varying inputs: *data - RW status of diff variables: *data:in-out + RW status of diff variables: data:(loc) *data:in-out Plus diff mem management of: data:in */ void foobar_b(double *data, double *datab, unsigned int len) { - double chksum = 0.0; - int i; - ifft_b(data, datab, len); - fft_b(data, datab, len); + pushReal8(*data); + fiveft_c(data, len); + pushReal8(*data); + ifiveft_c(data, len); + popReal8(data); + ifiveft_b(data, datab, len); + popReal8(data); + fiveft_b(data, datab, len); } - } - #endif /* _fft_h_ */ From 047f121bf46e864198d40cd6eb9684ad45b14a19 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 30 Oct 2024 03:14:19 -0400 
Subject: [PATCH 64/88] add mayalias/unsafe ode-real/BRUSS benchmark versions --- .../benchmarks/ReverseMode/ode-real/ode.cpp | 163 ++++++++++++++++-- .../ReverseMode/ode-real/src/lib.rs | 99 +++-------- .../ReverseMode/ode-real/src/safe.rs | 75 ++++++++ .../ReverseMode/ode-real/src/unsf.rs | 79 +++++++++ 4 files changed, 328 insertions(+), 88 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/ode-real/src/safe.rs create mode 100644 enzyme/benchmarks/ReverseMode/ode-real/src/unsf.rs diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp b/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp index c0c5064b833d..17007c8de727 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp +++ b/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp @@ -64,7 +64,39 @@ void init_brusselator(double* __restrict u, double* __restrict v) { } __attribute__((noinline)) -void brusselator_2d_loop(double* __restrict du, double* __restrict dv, const double* __restrict u, const double* __restrict v, const double* __restrict p, double t) { +void brusselator_2d_loop_restrict(double* __restrict du, double* __restrict dv, const double* __restrict u, const double* __restrict v, const double* __restrict p, double t) { + double A = p[0]; + double B = p[1]; + double alpha = p[2]; + double dx = (double)1/(N-1); + + alpha = alpha/(dx*dx); + + for(int i=0; i(brusselator_2d_loop_restrict, + enzyme_dup, dxdu, dadjoint_inp, + enzyme_dup, dxdu + N * N, dadjoint_inp + N * N, + // enzyme_dupnoneed, nullptr, dadjoint_inp, + // enzyme_dupnoneed, nullptr, dadjoint_inp + N * N, + enzyme_dup, x, dx, + enzyme_dup, x + N * N, dx + N * N, + enzyme_dup, p, dp, + enzyme_const, t); + + return dx[0]; +} + +double foobar_norestrict(const double* p, const state_type x, const state_type adjoint, double t) { double dp[3] = { 0. }; state_type dx = { 0. 
}; @@ -133,7 +213,7 @@ double foobar(const double* p, const state_type x, const state_type adjoint, dou state_type dxdu; - __enzyme_autodiff(brusselator_2d_loop, + __enzyme_autodiff(brusselator_2d_loop_norestrict, enzyme_dup, dxdu, dadjoint_inp, enzyme_dup, dxdu + N * N, dadjoint_inp + N * N, // enzyme_dupnoneed, nullptr, dadjoint_inp, @@ -551,10 +631,22 @@ int main(int argc, char** argv) { double res; for(int i=0; i<10000; i++) - res = foobar(p, x, adjoint, t); + res = foobar_norestrict(p, x, adjoint, t); + + gettimeofday(&end, NULL); + printf("C++ Enzyme combined mayalias %0.6f res=%f\n", tdiff(&start, &end), res); + } + + { + struct timeval start, end; + gettimeofday(&start, NULL); + + double res; + for(int i=0; i<10000; i++) + res = foobar_restrict(p, x, adjoint, t); gettimeofday(&end, NULL); - printf("C++ Enzyme combined %0.6f res=%f\n", tdiff(&start, &end), res); + printf("C++ Enzyme combined restrict %0.6f res=%f\n", tdiff(&start, &end), res); } { @@ -563,10 +655,35 @@ int main(int argc, char** argv) { double res; for(int i=0; i<10000; i++) - res = rustfoobar(p, x, adjoint, t); + res = rustfoobar_safe(p, x, adjoint, t); + + gettimeofday(&end, NULL); + printf("Rust Enzyme combined safe %0.6f res=%f\n", tdiff(&start, &end), res); + } + + { + struct timeval start, end; + gettimeofday(&start, NULL); + + double res; + for(int i=0; i<10000; i++) + res = rustfoobar_unsf(p, x, adjoint, t); + + gettimeofday(&end, NULL); + printf("Rust Enzyme combined unsf %0.6f res=%f\n", tdiff(&start, &end), res); + } + + { + struct timeval start, end; + gettimeofday(&start, NULL); + state_type x2; + + for(int i=0; i<10000; i++) { + lorenz_norestrict(x, x2, t); + } gettimeofday(&end, NULL); - printf("Rust Enzyme combined %0.6f res=%f\n", tdiff(&start, &end), res); + printf("C++ fwd mayalias %0.6f res=%f\n", tdiff(&start, &end), x2[0]); } { @@ -575,11 +692,23 @@ int main(int argc, char** argv) { state_type x2; for(int i=0; i<10000; i++) { - lorenz(x, x2, t); + lorenz_restrict(x, x2, 
t); } gettimeofday(&end, NULL); - printf("C++ fwd %0.6f res=%f\n", tdiff(&start, &end), x2[0]); + printf("C++ fwd restrict %0.6f res=%f\n", tdiff(&start, &end), x2[0]); + } + + { + struct timeval start, end; + gettimeofday(&start, NULL); + state_type x2; + + for(int i=0; i<10000; i++) + rust_lorenz_safe(x, x2, t); + + gettimeofday(&end, NULL); + printf("Rust fwd safe %0.6f res=%f\n\n", tdiff(&start, &end), x2[0]); } { @@ -588,10 +717,10 @@ int main(int argc, char** argv) { state_type x2; for(int i=0; i<10000; i++) - rust_lorenz(x, x2, t); + rust_lorenz_unsf(x, x2, t); gettimeofday(&end, NULL); - printf("Rust fwd %0.6f res=%f\n\n", tdiff(&start, &end), x2[0]); + printf("Rust fwd unsf %0.6f res=%f\n\n", tdiff(&start, &end), x2[0]); } //printf("res=%f\n", foobar(1000)); diff --git a/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs b/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs index 4c1d8bfdca1c..4fbc7e75f054 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs @@ -1,97 +1,54 @@ #![feature(autodiff)] #![feature(slice_as_chunks)] #![feature(iter_next_chunk)] +#![feature(array_ptr_get)] #![allow(non_snake_case)] #![allow(non_camel_case_types)] #![allow(non_upper_case_globals)] -//#define BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS -//#define BOOST_NO_EXCEPTIONS +pub mod safe; +pub mod unsf; -use std::autodiff::autodiff; +type StateType = [f64; 2 * N * N]; const N: usize = 32; -const xmin: f64 = 0.; -const xmax: f64 = 1.; -const ymin: f64 = 0.; -const ymax: f64 = 1.; - -#[inline(always)] -fn range(min: f64, max: f64, i: usize, N_var: usize) -> f64 { - (max - min) / (N_var as f64 - 1.) 
* i as f64 + min -} - -fn brusselator_f(x: f64, y: f64, t: f64) -> f64 { - let eq1 = (x - 0.3) * (x - 0.3) + (y - 0.6) * (y - 0.6) <= 0.1 * 0.1; - let eq2 = t >= 1.1; - if eq1 && eq2 { - 5.0 - } else { - 0.0 - } -} -#[expect(unused)] -fn init_brusselator(u: &mut [f64], v: &mut [f64]) { - assert!(u.len() == N * N); - assert!(v.len() == N * N); - for i in 0..N { - for j in 0..N { - let x = range(xmin, xmax, i, N); - let y = range(ymin, ymax, j, N); - u[N * i + j] = 22.0 * (y * (1.0 - y)) * (y * (1.0 - y)).sqrt(); - v[N * i + j] = 27.0 * (x * (1.0 - x)) * (x * (1.0 - x)).sqrt(); - } - } -} #[no_mangle] -#[autodiff(dbrusselator_2d_loop, Reverse, Duplicated, Duplicated, Duplicated, Duplicated, Duplicated, Const)] -fn brusselator_2d_loop(d_u: &mut [f64;N*N], d_v: &mut [f64;N*N], u: &[f64;N*N], v: &[f64;N*N], p: &[f64;3], t: f64) { - let A = p[0]; - let B = p[1]; - let alpha = p[2]; - let dx = 1. / (N - 1) as f64; - let alpha = alpha / (dx * dx); - for i in 0..N { - for j in 0..N { - let x = range(xmin, xmax, i, N); - let y = range(ymin, ymax, j, N); - let ip1 = if i == N - 1 { i } else { i + 1 }; - let im1 = if i == 0 { i } else { i - 1 }; - let jp1 = if j == N - 1 { j } else { j + 1 }; - let jm1 = if j == 0 { j } else { j - 1 }; - let u2v = u[N * i + j] * u[N * i + j] * v[N * i + j]; - d_u[N * i + j] = alpha * (u[N * im1 + j] + u[N * ip1 + j] + u[N * i + jp1] + u[N * i + jm1] - 4. * u[N * i + j]) - + B + u2v - (A + 1.) * u[N * i + j] + brusselator_f(x, y, t); - d_v[N * i + j] = alpha * (v[N * im1 + j] + v[N * ip1 + j] + v[N * i + jp1] + v[N * i + jm1] - 4. 
* v[N * i + j]) - + A * u[N * i + j] - u2v; - } - } +pub extern "C" fn rust_lorenz_unsf(x: *const StateType, dxdt: *mut StateType, t: f64) { + let x: &StateType = unsafe { &*x }; + let dxdt: &mut StateType = unsafe { &mut *dxdt }; + unsafe {unsf::lorenz(x, dxdt, t)}; } -type StateType = [f64; 2 * N * N]; #[no_mangle] -pub extern "C" fn rust_lorenz(x: *const StateType, dxdt: *mut StateType, t: f64) { +pub extern "C" fn rust_lorenz_safe(x: *const StateType, dxdt: *mut StateType, t: f64) { let x: &StateType = unsafe { &*x }; let dxdt: &mut StateType = unsafe { &mut *dxdt }; - lorenz(x, dxdt, t); + safe::lorenz(x, dxdt, t); } -fn lorenz(x: &StateType, dxdt: &mut StateType, t: f64) { - let p = [3.4, 1., 10.]; - let (tmp1, tmp2) = dxdt.split_at_mut(N * N); - let mut dxdt1: [f64; N * N] = tmp1.try_into().unwrap(); - let mut dxdt2: [f64; N * N] = tmp2.try_into().unwrap(); - let (tmp1, tmp2) = x.split_at(N * N); - let u: [f64; N * N] = tmp1.try_into().unwrap(); - let v: [f64; N * N] = tmp2.try_into().unwrap(); - brusselator_2d_loop(&mut dxdt1, &mut dxdt2, &u, &v, &p, t); +#[no_mangle] +pub extern "C" fn rust_dbrusselator_2d_loop_unsf(adjoint: *mut StateType, x: *const StateType, dx: *mut StateType, p: *const [f64;3], dp: *mut [f64;3], t: f64) { + let mut null1 = [0.; 1 * N * N]; + let mut null2 = [0.; 1 * N * N]; + let dx1: *mut f64 = dx.as_mut_ptr(); + let dx2: *mut f64 = unsafe { dx.as_mut_ptr().add(N*N) }; + let dadj1: *mut f64 = adjoint.as_mut_ptr(); + let dadj2: *mut f64 = unsafe { adjoint.as_mut_ptr().add(N*N) }; + let x1: *const f64 = x.as_ptr(); + let x2: *const f64 = unsafe { x.as_ptr().add(N*N) }; + + unsafe {unsf::dbrusselator_2d_loop_unsf(null1.as_mut_ptr(), dadj1, + null2.as_mut_ptr(), dadj2, + x1, dx1, + x2, dx2, + p as *mut f64, dp as *mut f64, t)}; } #[no_mangle] -pub extern "C" fn rust_dbrusselator_2d_loop(adjoint: *mut StateType, x: *const StateType, dx: *mut StateType, p: *const [f64;3], dp: *mut [f64;3], t: f64) { +pub extern "C" fn 
rust_dbrusselator_2d_loop_safe(adjoint: *mut StateType, x: *const StateType, dx: *mut StateType, p: *const [f64;3], dp: *mut [f64;3], t: f64) { let x: &StateType = unsafe { &*x }; let dx: &mut StateType = unsafe { &mut *dx }; let adjoint: &mut StateType = unsafe { &mut *adjoint }; @@ -134,7 +91,7 @@ pub extern "C" fn rust_dbrusselator_2d_loop(adjoint: *mut StateType, x: *const S let mut null1 = [0.; 1 * N * N]; let mut null2 = [0.; 1 * N * N]; - dbrusselator_2d_loop(&mut null1, dadj1, + safe::dbrusselator_2d_loop(&mut null1, dadj1, &mut null2, dadj2, x1, dx1, x2, dx2, diff --git a/enzyme/benchmarks/ReverseMode/ode-real/src/safe.rs b/enzyme/benchmarks/ReverseMode/ode-real/src/safe.rs new file mode 100644 index 000000000000..ddf36851b09c --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ode-real/src/safe.rs @@ -0,0 +1,75 @@ +use std::autodiff::autodiff; + +const N: usize = 32; +const xmin: f64 = 0.; +const xmax: f64 = 1.; +const ymin: f64 = 0.; +const ymax: f64 = 1.; + +#[inline(always)] +fn range(min: f64, max: f64, i: usize, N_var: usize) -> f64 { + (max - min) / (N_var as f64 - 1.) 
* i as f64 + min +} + +fn brusselator_f(x: f64, y: f64, t: f64) -> f64 { + let eq1 = (x - 0.3) * (x - 0.3) + (y - 0.6) * (y - 0.6) <= 0.1 * 0.1; + let eq2 = t >= 1.1; + if eq1 && eq2 { + 5.0 + } else { + 0.0 + } +} + +#[expect(unused)] +fn init_brusselator(u: &mut [f64], v: &mut [f64]) { + assert!(u.len() == N * N); + assert!(v.len() == N * N); + for i in 0..N { + for j in 0..N { + let x = range(xmin, xmax, i, N); + let y = range(ymin, ymax, j, N); + u[N * i + j] = 22.0 * (y * (1.0 - y)) * (y * (1.0 - y)).sqrt(); + v[N * i + j] = 27.0 * (x * (1.0 - x)) * (x * (1.0 - x)).sqrt(); + } + } +} + +#[no_mangle] +#[autodiff(dbrusselator_2d_loop, Reverse, Duplicated, Duplicated, Duplicated, Duplicated, Duplicated, Const)] +pub fn brusselator_2d_loop(d_u: &mut [f64;N*N], d_v: &mut [f64;N*N], u: &[f64;N*N], v: &[f64;N*N], p: &[f64;3], t: f64) { + let A = p[0]; + let B = p[1]; + let alpha = p[2]; + let dx = 1. / (N - 1) as f64; + let alpha = alpha / (dx * dx); + for i in 0..N { + for j in 0..N { + let x = range(xmin, xmax, i, N); + let y = range(ymin, ymax, j, N); + let ip1 = if i == N - 1 { i } else { i + 1 }; + let im1 = if i == 0 { i } else { i - 1 }; + let jp1 = if j == N - 1 { j } else { j + 1 }; + let jm1 = if j == 0 { j } else { j - 1 }; + let u2v = u[N * i + j] * u[N * i + j] * v[N * i + j]; + d_u[N * i + j] = alpha * (u[N * im1 + j] + u[N * ip1 + j] + u[N * i + jp1] + u[N * i + jm1] - 4. * u[N * i + j]) + + B + u2v - (A + 1.) * u[N * i + j] + brusselator_f(x, y, t); + d_v[N * i + j] = alpha * (v[N * im1 + j] + v[N * ip1 + j] + v[N * i + jp1] + v[N * i + jm1] - 4. 
* v[N * i + j]) + + A * u[N * i + j] - u2v; + } + } +} + +pub type StateType = [f64; 2 * N * N]; + +pub fn lorenz(x: &StateType, dxdt: &mut StateType, t: f64) { + let p = [3.4, 1., 10.]; + let (tmp1, tmp2) = dxdt.split_at_mut(N * N); + let mut dxdt1: [f64; N * N] = tmp1.try_into().unwrap(); + let mut dxdt2: [f64; N * N] = tmp2.try_into().unwrap(); + let (tmp1, tmp2) = x.split_at(N * N); + let u: [f64; N * N] = tmp1.try_into().unwrap(); + let v: [f64; N * N] = tmp2.try_into().unwrap(); + brusselator_2d_loop(&mut dxdt1, &mut dxdt2, &u, &v, &p, t); +} + diff --git a/enzyme/benchmarks/ReverseMode/ode-real/src/unsf.rs b/enzyme/benchmarks/ReverseMode/ode-real/src/unsf.rs new file mode 100644 index 000000000000..9f1e4006b80e --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ode-real/src/unsf.rs @@ -0,0 +1,79 @@ +use std::autodiff::autodiff; + +const N: usize = 32; +const xmin: f64 = 0.; +const xmax: f64 = 1.; +const ymin: f64 = 0.; +const ymax: f64 = 1.; + +#[inline(always)] +fn range(min: f64, max: f64, i: usize, N_var: usize) -> f64 { + (max - min) / (N_var as f64 - 1.) * i as f64 + min +} + +fn brusselator_f(x: f64, y: f64, t: f64) -> f64 { + let eq1 = (x - 0.3) * (x - 0.3) + (y - 0.6) * (y - 0.6) <= 0.1 * 0.1; + let eq2 = t >= 1.1; + if eq1 && eq2 { + 5.0 + } else { + 0.0 + } +} + +#[expect(unused)] +unsafe fn init_brusselator(u: *mut f64, v: *mut f64) { + for i in 0..N { + for j in 0..N { + let x = range(xmin, xmax, i, N); + let y = range(ymin, ymax, j, N); + *u.add(N * i + j) = 22.0 * (y * (1.0 - y)) * (y * (1.0 - y)).sqrt(); + *v.add(N * i + j) = 27.0 * (x * (1.0 - x)) * (x * (1.0 - x)).sqrt(); + } + } +} + +#[no_mangle] +#[autodiff(dbrusselator_2d_loop_unsf, Reverse, Duplicated, Duplicated, Duplicated, Duplicated, Duplicated, Const)] +pub unsafe fn brusselator_2d_loop_unsf(d_u: *mut f64, d_v: *mut f64, u: *const f64, v: *const f64, p: *const f64, t: f64) { + let A = *p.add(0); + let B = *p.add(1); + let alpha = *p.add(2); + let dx = 1. 
/ (N - 1) as f64; + let alpha = alpha / (dx * dx); + for i in 0..N { + for j in 0..N { + let x = range(xmin, xmax, i, N); + let y = range(ymin, ymax, j, N); + let ip1 = if i == N - 1 { i } else { i + 1 }; + let im1 = if i == 0 { i } else { i - 1 }; + let jp1 = if j == N - 1 { j } else { j + 1 }; + let jm1 = if j == 0 { j } else { j - 1 }; + let u2v = *u.add(N * i + j) * *u.add(N * i + j) * *v.add(N * i + j); + *d_u.add(N * i + j) = alpha * (*u.add(N * im1 + j) + *u.add(N * ip1 + j) + *u.add(N * i + jp1) + *u.add(N * i + jm1) - 4. * *u.add(N * i + j)) + + B + u2v - (A + 1.) * *u.add(N * i + j) + brusselator_f(x, y, t); + *d_v.add(N * i + j) = alpha * (*v.add(N * im1 + j) + *v.add(N * ip1 + j) + *v.add(N * i + jp1) + *v.add(N * i + jm1) - 4. * *v.add(N * i + j)) + + A * *u.add(N * i + j) - u2v; + } + } +} + +type StateType = [f64; 2 * N * N]; + +pub unsafe fn lorenz(x: *const StateType, dxdt: *mut StateType, t: f64) { + let p = [3.4, 1., 10.]; + let x = x as *const f64; + let dxdt = dxdt as *mut f64; + let dxdt1: *mut f64 = dxdt as *mut f64; + let dxdt2: *mut f64 = unsafe {dxdt.add(N * N)} as *mut f64; + //let (tmp1, tmp2) = dxdt.split_at_mut(N * N); + //let mut dxdt1: [f64; N * N] = tmp1.try_into().unwrap(); + //let mut dxdt2: [f64; N * N] = tmp2.try_into().unwrap(); + let u: *const f64 = x as *const f64; + let v: *const f64 = unsafe{x.add(N * N)} as *const f64; + //let (tmp1, tmp2) = x.split_at(N * N); + //let u: [f64; N * N] = tmp1.try_into().unwrap(); + //let v: [f64; N * N] = tmp2.try_into().unwrap(); + unsafe {brusselator_2d_loop_unsf(dxdt1 as *mut f64, dxdt2 as *mut f64, u as *const f64, v as *const f64, p.as_ptr(), t)}; +} + From 3d2585873a1880d6f2fdf4b3a40975ccee6c93b0 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 30 Oct 2024 03:40:38 -0400 Subject: [PATCH 65/88] fix safe fft performance --- enzyme/benchmarks/ReverseMode/fft/src/safe.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs index af9169c4227e..ff42c3ce8748 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs @@ -10,6 +10,8 @@ fn bitreversal_perm(data: &mut [T]) { while i < 2*len { if j > i { //dbg!(&i, &j); + //data.swap(j-1, i-1); + //data.swap(j, i); unsafe {data.swap_unchecked(j-1, i-1);} unsafe {data.swap_unchecked(j, i);} } @@ -40,10 +42,9 @@ fn radix2(data: &mut [f64], i_sign: f64, n: usize) { let mut wr = 1.0; let mut wi = 0.0; - let mut i = 0; - while i < n { + assert_eq!(data.len(), 2*n); + for i in (0..n).step_by(2) { let in_n = i + n; - let tempr = data[in_n] * wr - data[in_n + 1] * wi; let tempi = data[in_n] * wi + data[in_n + 1] * wr; @@ -55,9 +56,8 @@ fn radix2(data: &mut [f64], i_sign: f64, n: usize) { let wtemp_new = wr; wr += wr * wpr - wi * wpi; wi += wi * wpr + wtemp_new * wpi; - - i += 2; } + } fn rescale(data: &mut [f64], scale: usize) { From 2d04892447617df4149ba0b32b78cabaf4f80026 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 30 Oct 2024 03:55:06 -0400 Subject: [PATCH 66/88] cleanup fft --- enzyme/benchmarks/ReverseMode/fft/src/unsf.rs | 45 ++----------------- 1 file changed, 4 insertions(+), 41 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs index a0af8bf35302..5e09a2f26fac 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs @@ -1,33 +1,13 @@ use std::f64::consts::PI; use std::autodiff::autodiff; -//static void scramble(double* data, unsigned N) { -// int j=1; -// for (int i=1; i<2*N; i+=2) { -// if (j>i) { -// swap(&data[j-1], &data[i-1]); -// swap(&data[j], &data[i]); -// } -// int m = N; -// while (m>=2 && j>m) { -// j -= m; -// m >>= 1; -// } -// j += m; -// } -//} unsafe fn bitreversal_perm(data: *mut f64, len: usize) { - //let len = data.len() / 2; let mut j = 1; 
for i in (1..2*len).step_by(2) { - //let mut i = 1; - //while i < 2*len { if j > i { std::ptr::swap(data.add(j-1), data.add(i-1)); std::ptr::swap(data.add(j), data.add(i)); - //data.swap(j-1, i-1); - //data.swap(j, i); } let mut m = len; @@ -37,20 +17,13 @@ unsafe fn bitreversal_perm(data: *mut f64, len: usize) { } j += m; - //i += 2; } } unsafe fn radix2(data: *mut f64, i_sign: f64, n: usize) { - if n == 1 { - return; - } - - let b = data.add(n); - let a = data; - //let (a,b) = data.split_at_mut(n); - radix2(a, i_sign, n/2); - radix2(b, i_sign, n/2); + if n == 1 { return; } + radix2(data, i_sign, n/2); + radix2(data.add(n), i_sign, n/2); let wtemp = i_sign * (PI / n as f64).sin(); let wpi = -i_sign * (2.0 * PI / n as f64).sin(); @@ -58,8 +31,7 @@ unsafe fn radix2(data: *mut f64, i_sign: f64, n: usize) { let mut wr = 1.0; let mut wi = 0.0; - let mut i = 0; - while i < n { + for i in (0..n).step_by(2) { let in_n = i + n; let tempr = *data.add(in_n) * wr - *data.add(in_n + 1) * wi; @@ -73,18 +45,9 @@ unsafe fn radix2(data: *mut f64, i_sign: f64, n: usize) { let wtemp_new = wr; wr += wr * wpr - wi * wpi; wi += wi * wpr + wtemp_new * wpi; - - i += 2; } } -//static void rescale(double* data, unsigned N) { -// double scale = ((double)1)/N; -// for (unsigned i=0; i<2*N; i++) { -// data[i] *= scale; -// } -//} - unsafe fn rescale(data: *mut f64, n: usize) { let scale = 1. 
/ n as f64; for i in 0..2*n { From a273bdf66e9c01f1f8ed773796eea7867edddc4c Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Thu, 31 Oct 2024 12:33:20 -0600 Subject: [PATCH 67/88] bench fft: more idiomatic safe Rust --- enzyme/benchmarks/ReverseMode/fft/src/safe.rs | 49 ++++++++++--------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs index ff42c3ce8748..62caf8645c68 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs @@ -1,19 +1,23 @@ -use std::slice; -use std::f64::consts::PI; use std::autodiff::autodiff; +use std::f64::consts::PI; +use std::slice; fn bitreversal_perm(data: &mut [T]) { let len = data.len() / 2; let mut j = 1; let mut i = 1; - while i < 2*len { + while i < 2 * len { if j > i { //dbg!(&i, &j); //data.swap(j-1, i-1); //data.swap(j, i); - unsafe {data.swap_unchecked(j-1, i-1);} - unsafe {data.swap_unchecked(j, i);} + unsafe { + data.swap_unchecked(j - 1, i - 1); + } + unsafe { + data.swap_unchecked(j, i); + } } let mut m = len; @@ -27,14 +31,15 @@ fn bitreversal_perm(data: &mut [T]) { } } -fn radix2(data: &mut [f64], i_sign: f64, n: usize) { +fn radix2(data: &mut [f64], i_sign: f64) { + let n = data.len() / 2; if n == 1 { return; } - let (a,b) = data.split_at_mut(n); - radix2(a, i_sign, n/2); - radix2(b, i_sign, n/2); + let (a, b) = data.split_at_mut(n); + radix2(a, i_sign); + radix2(b, i_sign); let wtemp = i_sign * (PI / n as f64).sin(); let wpi = -i_sign * (2.0 * PI / n as f64).sin(); @@ -42,22 +47,19 @@ fn radix2(data: &mut [f64], i_sign: f64, n: usize) { let mut wr = 1.0; let mut wi = 0.0; - assert_eq!(data.len(), 2*n); - for i in (0..n).step_by(2) { - let in_n = i + n; - let tempr = data[in_n] * wr - data[in_n + 1] * wi; - let tempi = data[in_n] * wi + data[in_n + 1] * wr; + for i in (0..n).step_by(2) { + let tempr = b[i] * wr - b[i + 1] * wi; + let tempi = b[i] * wi + b[i + 1] * 
wr; - data[in_n] = data[i] - tempr; - data[in_n + 1] = data[i + 1] - tempi; - data[i] += tempr; - data[i + 1] += tempi; + b[i] = a[i] - tempr; + b[i + 1] = a[i + 1] - tempi; + a[i] += tempr; + a[i + 1] += tempi; let wtemp_new = wr; wr += wr * wpr - wi * wpi; wi += wi * wpr + wtemp_new * wpi; } - } fn rescale(data: &mut [f64], scale: usize) { @@ -69,12 +71,12 @@ fn rescale(data: &mut [f64], scale: usize) { fn fft(data: &mut [f64]) { bitreversal_perm(data); - radix2(data, 1.0, data.len() / 2); + radix2(data, 1.0); } fn ifft(data: &mut [f64]) { bitreversal_perm(data); - radix2(data, -1.0, data.len() / 2); + radix2(data, -1.0); rescale(data, data.len() / 2); } @@ -86,15 +88,14 @@ pub fn foobar(data: &mut [f64]) { #[no_mangle] pub extern "C" fn rust_dfoobar(n: usize, data: *mut f64, ddata: *mut f64) { - let (data, ddata) = unsafe { ( slice::from_raw_parts_mut(data, n * 2), - slice::from_raw_parts_mut(ddata, n * 2) + slice::from_raw_parts_mut(ddata, n * 2), ) }; - unsafe {dfoobar(data, ddata)}; + unsafe { dfoobar(data, ddata) }; } #[no_mangle] From 65e1eafc7584474089af4f6212460ec6f6f74893 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Thu, 31 Oct 2024 13:41:11 -0600 Subject: [PATCH 68/88] bench fft: use size_t in C++ for parity with Rust usize --- enzyme/benchmarks/ReverseMode/fft/fft.cpp | 399 +++++++-------- enzyme/benchmarks/ReverseMode/fft/fft.h | 574 +++++++++++----------- 2 files changed, 489 insertions(+), 484 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.cpp b/enzyme/benchmarks/ReverseMode/fft/fft.cpp index 0613934397a6..45efb4110d91 100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.cpp +++ b/enzyme/benchmarks/ReverseMode/fft/fft.cpp @@ -1,49 +1,46 @@ +#include +#include +#include #include #include #include -#include -#include -#include -#include #include -#include -#include +#include using adept::adouble; -template -Return __enzyme_autodiff(T...); +template Return __enzyme_autodiff(T...); float tdiff(struct timeval *start, struct timeval *end) 
{ - return (end->tv_sec-start->tv_sec) + 1e-6*(end->tv_usec-start->tv_usec); + return (end->tv_sec - start->tv_sec) + 1e-6 * (end->tv_usec - start->tv_usec); } #include "fft.h" -void foobar(double* data, unsigned len) { +void foobar(double *data, size_t len) { fft(data, len); ifft(data, len); } -void afoobar(aVector& data, unsigned len) { +void afoobar(aVector &data, size_t len) { fft(data, len); ifft(data, len); } extern "C" { - int enzyme_dupnoneed; +int enzyme_dupnoneed; } -extern "C" void rust_unsafe_dfoobar(int n, double *data, double *ddata); -extern "C" void rust_unsafe_foobar(int n, double *data); -extern "C" void rust_dfoobar(int n, double* data, double* ddata); -extern "C" void rust_foobar(int n, double* data); +extern "C" void rust_unsafe_dfoobar(size_t n, double *data, double *ddata); +extern "C" void rust_unsafe_foobar(size_t n, double *data); +extern "C" void rust_dfoobar(size_t n, double *data, double *ddata); +extern "C" void rust_foobar(size_t n, double *data); -static double rust_unsafe_foobar_and_gradient(unsigned len) { +static double rust_unsafe_foobar_and_gradient(size_t len) { double *inp = new double[2 * len]; - for (int i = 0; i < 2 * len; i++) + for (size_t i = 0; i < 2 * len; i++) inp[i] = 2.0; double *dinp = new double[2 * len]; - for (int i = 0; i < 2 * len; i++) + for (size_t i = 0; i < 2 * len; i++) dinp[i] = 1.0; rust_unsafe_dfoobar(len, inp, dinp); double res = dinp[0]; @@ -52,113 +49,120 @@ static double rust_unsafe_foobar_and_gradient(unsigned len) { return res; } -static double rust_foobar_and_gradient(unsigned len) { - double *inp = new double[2*len]; - for(int i=0; i<2*len; i++) inp[i] = 2.0; - double *dinp = new double[2*len]; - for(int i=0; i<2*len; i++) dinp[i] = 1.0; - rust_dfoobar(len, inp, dinp); - double res = dinp[0]; - delete[] dinp; - delete[] inp; - return res; +static double rust_foobar_and_gradient(size_t len) { + double *inp = new double[2 * len]; + for (size_t i = 0; i < 2 * len; i++) + inp[i] = 2.0; + double 
*dinp = new double[2 * len]; + for (size_t i = 0; i < 2 * len; i++) + dinp[i] = 1.0; + rust_dfoobar(len, inp, dinp); + double res = dinp[0]; + delete[] dinp; + delete[] inp; + return res; } -static double foobar_and_gradient(unsigned len) { - double *inp = new double[2*len]; - for(int i=0; i<2*len; i++) inp[i] = 2.0; - double *dinp = new double[2*len]; - for(int i=0; i<2*len; i++) dinp[i] = 1.0; - __enzyme_autodiff(foobar, enzyme_dupnoneed, inp, dinp, len); - double res = dinp[0]; - delete[] dinp; - delete[] inp; - return res; +__attribute__((noinline)) static double foobar_and_gradient(size_t len) { + double *inp = new double[2 * len]; + for (size_t i = 0; i < 2 * len; i++) + inp[i] = 2.0; + double *dinp = new double[2 * len]; + for (size_t i = 0; i < 2 * len; i++) + dinp[i] = 1.0; + __enzyme_autodiff(foobar, enzyme_dupnoneed, inp, dinp, len); + double res = dinp[0]; + delete[] dinp; + delete[] inp; + return res; } -static double afoobar_and_gradient(unsigned len) { - adept::Stack stack; +static double afoobar_and_gradient(size_t len) { + adept::Stack stack; - aVector x(2*len); - for(int i=0; i<2*len; i++) x(i) = 2.0; - stack.new_recording(); - afoobar(x, len); - for(int i=0; i<2*len; i++) - x(i).set_gradient(1.0); - stack.compute_adjoint(); - - double *dinp = new double[2*len]; - for(int i=0; i<2*len; i++) - dinp[i] = x(i).get_gradient(); - double res = dinp[0]; - delete[] dinp; - return res; -} + aVector x(2 * len); + for (size_t i = 0; i < 2 * len; i++) + x(i) = 2.0; + stack.new_recording(); + afoobar(x, len); + for (size_t i = 0; i < 2 * len; i++) + x(i).set_gradient(1.0); + stack.compute_adjoint(); + double *dinp = new double[2 * len]; + for (size_t i = 0; i < 2 * len; i++) + dinp[i] = x(i).get_gradient(); + double res = dinp[0]; + delete[] dinp; + return res; +} -static double tfoobar_and_gradient(unsigned len) { - double *inp = new double[2*len]; - for(int i=0; i<2*len; i++) inp[i] = 2.0; - double *dinp = new double[2*len]; - for(int i=0; i<2*len; i++) 
dinp[i] = 1.0; - foobar_b(inp, dinp, len); - double res = dinp[0]; - delete[] dinp; - delete[] inp; - return res; +static double tfoobar_and_gradient(size_t len) { + double *inp = new double[2 * len]; + for (size_t i = 0; i < 2 * len; i++) + inp[i] = 2.0; + double *dinp = new double[2 * len]; + for (size_t i = 0; i < 2 * len; i++) + dinp[i] = 1.0; + foobar_b(inp, dinp, len); + double res = dinp[0]; + delete[] dinp; + delete[] inp; + return res; } -static void adept_sincos(double inp, unsigned len) { +static void adept_sincos(double inp, size_t len) { { - struct timeval start, end; - gettimeofday(&start, NULL); - - double *x = new double[2*len]; - for(int i=0; i<2*len; i++) x[i] = 2.0; - foobar(x, len); - double res = x[0]; - - gettimeofday(&end, NULL); - printf("Adept real %0.6f res=%f\n", tdiff(&start, &end), res); - delete[] x; + struct timeval start, end; + gettimeofday(&start, NULL); + + double *x = new double[2 * len]; + for (size_t i = 0; i < 2 * len; i++) + x[i] = 2.0; + foobar(x, len); + double res = x[0]; + + gettimeofday(&end, NULL); + printf("Adept real %0.6f res=%f\n", tdiff(&start, &end), res); + delete[] x; } { - struct timeval start, end; - gettimeofday(&start, NULL); + struct timeval start, end; + gettimeofday(&start, NULL); - adept::Stack stack; + adept::Stack stack; - aVector x(2*len); - for(int i=0; i<2*len; i++) x[i] = 2.0; - // stack.new_recording(); - afoobar(x, len); - double res = x(0).value(); + aVector x(2 * len); + for (size_t i = 0; i < 2 * len; i++) + x[i] = 2.0; + // stack.new_recording(); + afoobar(x, len); + double res = x(0).value(); - gettimeofday(&end, NULL); - printf("Adept forward %0.6f res=%f\n", tdiff(&start, &end), res); + gettimeofday(&end, NULL); + printf("Adept forward %0.6f res=%f\n", tdiff(&start, &end), res); } { - struct timeval start, end; - gettimeofday(&start, NULL); + struct timeval start, end; + gettimeofday(&start, NULL); - double res2 = afoobar_and_gradient(len); + double res2 = afoobar_and_gradient(len); - 
gettimeofday(&end, NULL); - printf("Adept combined %0.6f res'=%f\n", tdiff(&start, &end), res2); + gettimeofday(&end, NULL); + printf("Adept combined %0.6f res'=%f\n", tdiff(&start, &end), res2); } } - -static void tapenade_sincos(double inp, unsigned len) { +static void tapenade_sincos(double inp, size_t len) { // { // struct timeval start, end; // gettimeofday(&start, NULL); // double *x = new double[2*len]; - // for(int i=0; i<2*len; i++) x[i] = 2.0; + // for(size_t i=0; i<2*len; i++) x[i] = 2.0; // foobar(x, len); // double res = x[0]; @@ -172,7 +176,7 @@ static void tapenade_sincos(double inp, unsigned len) { // gettimeofday(&start, NULL); // double* x = new double[2*len]; - // for(int i=0; i<2*len; i++) x[i] = 2.0; + // for(size_t i=0; i<2*len; i++) x[i] = 2.0; // foobar(x, len); // double res = x[0]; @@ -192,160 +196,165 @@ static void tapenade_sincos(double inp, unsigned len) { // } } -static void enzyme_sincos(double inp, unsigned len) { +static void enzyme_sincos(double inp, size_t len) { { - struct timeval start, end; - gettimeofday(&start, NULL); - - double *x = new double[2*len]; - for(int i=0; i<2*len; i++) x[i] = 2.0; - foobar(x, len); - double res = x[0]; - - gettimeofday(&end, NULL); - printf("Enzyme real %0.6f res=%f\n", tdiff(&start, &end), res); - delete[] x; + struct timeval start, end; + gettimeofday(&start, NULL); + + double *x = new double[2 * len]; + for (size_t i = 0; i < 2 * len; i++) + x[i] = 2.0; + foobar(x, len); + double res = x[0]; + + gettimeofday(&end, NULL); + printf("Enzyme real %0.6f res=%f\n", tdiff(&start, &end), res); + delete[] x; } { - struct timeval start, end; - gettimeofday(&start, NULL); - - double *x = new double[2*len]; - for(int i=0; i<2*len; i++) x[i] = 2.0; - foobar(x, len); - double res = x[0]; - - gettimeofday(&end, NULL); - printf("Enzyme forward %0.6f res=%f\n", tdiff(&start, &end), res); - delete[] x; + struct timeval start, end; + gettimeofday(&start, NULL); + + double *x = new double[2 * len]; + for (size_t 
i = 0; i < 2 * len; i++) + x[i] = 2.0; + foobar(x, len); + double res = x[0]; + + gettimeofday(&end, NULL); + printf("Enzyme forward %0.6f res=%f\n", tdiff(&start, &end), res); + delete[] x; } { - struct timeval start, end; - gettimeofday(&start, NULL); + struct timeval start, end; + gettimeofday(&start, NULL); - double res2 = foobar_and_gradient(len); + double res2 = foobar_and_gradient(len); - gettimeofday(&end, NULL); - printf("Enzyme combined %0.6f res'=%f\n", tdiff(&start, &end), res2); + gettimeofday(&end, NULL); + printf("Enzyme combined %0.6f res'=%f\n", tdiff(&start, &end), res2); } } -static void enzyme_unsafe_rust_sincos(double inp, unsigned len) { +static void enzyme_unsafe_rust_sincos(double inp, size_t len) { { - struct timeval start, end; - gettimeofday(&start, NULL); - - double *x = new double[2 * len]; - for (int i = 0; i < 2 * len; i++) - x[i] = 2.0; - rust_unsafe_foobar(len, x); - double res = x[0]; - - gettimeofday(&end, NULL); - printf("Enzyme (unsafe Rust) real %0.6f res=%f\n", tdiff(&start, &end), res); - delete[] x; + struct timeval start, end; + gettimeofday(&start, NULL); + + double *x = new double[2 * len]; + for (size_t i = 0; i < 2 * len; i++) + x[i] = 2.0; + rust_unsafe_foobar(len, x); + double res = x[0]; + + gettimeofday(&end, NULL); + printf("Enzyme (unsafe Rust) real %0.6f res=%f\n", tdiff(&start, &end), + res); + delete[] x; } { - struct timeval start, end; - gettimeofday(&start, NULL); - - double *x = new double[2 * len]; - for (int i = 0; i < 2 * len; i++) - x[i] = 2.0; - rust_unsafe_foobar(len, x); - double res = x[0]; - - gettimeofday(&end, NULL); - printf("Enzyme (unsafe Rust) forward %0.6f res=%f\n", tdiff(&start, &end), - res); - delete[] x; + struct timeval start, end; + gettimeofday(&start, NULL); + + double *x = new double[2 * len]; + for (size_t i = 0; i < 2 * len; i++) + x[i] = 2.0; + rust_unsafe_foobar(len, x); + double res = x[0]; + + gettimeofday(&end, NULL); + printf("Enzyme (unsafe Rust) forward %0.6f res=%f\n", 
tdiff(&start, &end), + res); + delete[] x; } { - struct timeval start, end; - gettimeofday(&start, NULL); + struct timeval start, end; + gettimeofday(&start, NULL); - double res2 = rust_unsafe_foobar_and_gradient(len); + double res2 = rust_unsafe_foobar_and_gradient(len); - gettimeofday(&end, NULL); - printf("Enzyme (unsafe Rust) combined %0.6f res'=%f\n", tdiff(&start, &end), - res2); + gettimeofday(&end, NULL); + printf("Enzyme (unsafe Rust) combined %0.6f res'=%f\n", tdiff(&start, &end), + res2); } } -static void enzyme_rust_sincos(double inp, unsigned len) { +static void enzyme_rust_sincos(double inp, size_t len) { { - struct timeval start, end; - gettimeofday(&start, NULL); - - double *x = new double[2*len]; - for(int i=0; i<2*len; i++) x[i] = 2.0; - rust_foobar(len, x); - double res = x[0]; - - gettimeofday(&end, NULL); - printf("Enzyme (Rust) real %0.6f res=%f\n", tdiff(&start, &end), res); - delete[] x; + struct timeval start, end; + gettimeofday(&start, NULL); + + double *x = new double[2 * len]; + for (size_t i = 0; i < 2 * len; i++) + x[i] = 2.0; + rust_foobar(len, x); + double res = x[0]; + + gettimeofday(&end, NULL); + printf("Enzyme (Rust) real %0.6f res=%f\n", tdiff(&start, &end), res); + delete[] x; } { - struct timeval start, end; - gettimeofday(&start, NULL); - - double *x = new double[2*len]; - for(int i=0; i<2*len; i++) x[i] = 2.0; - rust_foobar(len, x); - double res = x[0]; - - gettimeofday(&end, NULL); - printf("Enzyme (Rust) forward %0.6f res=%f\n", tdiff(&start, &end), res); - delete[] x; + struct timeval start, end; + gettimeofday(&start, NULL); + + double *x = new double[2 * len]; + for (size_t i = 0; i < 2 * len; i++) + x[i] = 2.0; + rust_foobar(len, x); + double res = x[0]; + + gettimeofday(&end, NULL); + printf("Enzyme (Rust) forward %0.6f res=%f\n", tdiff(&start, &end), res); + delete[] x; } { - struct timeval start, end; - gettimeofday(&start, NULL); + struct timeval start, end; + gettimeofday(&start, NULL); - double res2 = 
rust_foobar_and_gradient(len); + double res2 = rust_foobar_and_gradient(len); - gettimeofday(&end, NULL); - printf("Enzyme (Rust) combined %0.6f res'=%f\n", tdiff(&start, &end), res2); + gettimeofday(&end, NULL); + printf("Enzyme (Rust) combined %0.6f res'=%f\n", tdiff(&start, &end), res2); } } /* Function to check if x is power of 2*/ -bool isPowerOfTwo (int x) -{ - /* First x in the below expression is for the case when x is 0 */ - return x && (!(x&(x-1))); +bool isPowerOfTwo(size_t x) { + /* First x in the below expression is for the case when x is 0 */ + return x && (!(x & (x - 1))); } -unsigned max(unsigned A, unsigned B){ - if (A>B) return A; +size_t max(size_t A, size_t B) { + if (A > B) + return A; return B; } -int main(int argc, char** argv) { +int main(int argc, char **argv) { if (argc < 2) { printf("usage %s n [must be power of 2]\n", argv[0]); return 1; } - unsigned N = atoi(argv[1]); + size_t N = atol(argv[1]); if (!isPowerOfTwo(N)) { printf("usage %s n [must be power of 2]\n", argv[0]); return 1; } double inp = -2.1; - for(unsigned iters=max(1, N>>5); iters <= N; iters*=2) { - printf("iters=%d\n", iters); + for (size_t iters = max(1, N >> 5); iters <= N; iters *= 2) { + printf("iters=%zu\n", iters); adept_sincos(inp, iters); tapenade_sincos(inp, iters); enzyme_sincos(inp, iters); diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.h b/enzyme/benchmarks/ReverseMode/fft/fft.h index 79d8647d5300..ba8b0152a4e4 100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.h +++ b/enzyme/benchmarks/ReverseMode/fft/fft.h @@ -1,71 +1,70 @@ #ifndef _fft_h_ #define _fft_h_ -#include #include #include +#include using adept::adouble; using adept::aVector; - /* A classy FFT and Inverse FFT C++ class library Author: Tim Molteno, tim@physics.otago.ac.nz - Based on the article "A Simple and Efficient FFT Implementation in C++" by Volodymyr Myrnyy - with just a simple Inverse FFT modification. 
+ Based on the article "A Simple and Efficient FFT Implementation in C++" by + Volodymyr Myrnyy with just a simple Inverse FFT modification. Licensed under the GPL v3. */ - #include -inline void swap(double* a, double* b) { - double temp=*a; +inline void swap(double *a, double *b) { + double temp = *a; *a = *b; *b = temp; } -static void recursiveApply(double* data, int iSign, unsigned N) { - if (N == 1) return; - recursiveApply(data, iSign, N/2); - recursiveApply(data+N, iSign, N/2); +static void recursiveApply(double *data, int iSign, size_t N) { + if (N == 1) + return; + recursiveApply(data, iSign, N / 2); + recursiveApply(data + N, iSign, N / 2); - double wtemp = iSign*sin(M_PI/N); - double wpi = -iSign*sin(2*M_PI/N); - double wpr = -2.0*wtemp*wtemp; + double wtemp = iSign * __builtin_sin(M_PI / N); + double wpi = -iSign * __builtin_sin(2 * M_PI / N); + double wpr = -2.0 * wtemp * wtemp; double wr = 1.0; double wi = 0.0; - for (unsigned i=0; ii) { - swap(&data[j-1], &data[i-1]); +static void scramble(double *data, size_t N) { + int j = 1; + for (int i = 1; i < 2 * N; i += 2) { + if (j > i) { + swap(&data[j - 1], &data[i - 1]); swap(&data[j], &data[i]); } int m = N; - while (m>=2 && j>m) { + while (m >= 2 && j > m) { j -= m; m >>= 1; } @@ -73,69 +72,69 @@ static void scramble(double* data, unsigned N) { } } -static void rescale(double* data, unsigned N) { - double scale = ((double)1)/N; - for (unsigned i=0; i<2*N; i++) { +static void rescale(double *data, size_t N) { + double scale = ((double)1) / N; + for (size_t i = 0; i < 2 * N; i++) { data[i] *= scale; } } -static void fft(double* data, unsigned N) { +static void fft(double *data, size_t N) { scramble(data, N); - recursiveApply(data,1, N); + recursiveApply(data, 1, N); } -static void ifft(double* data, unsigned N) { +static void ifft(double *data, size_t N) { scramble(data, N); - recursiveApply(data,-1, N); + recursiveApply(data, -1, N); rescale(data, N); } - - -inline void swapad(adept::ActiveReference a, 
adept::ActiveReference b) { - adouble temp=a; +inline void swapad(adept::ActiveReference a, + adept::ActiveReference b) { + adouble temp = a; a = b; b = temp; } -static void recursiveApply(aVector data, int iSign, unsigned N) { - if (N == 1) return; - recursiveApply(data, iSign, N/2); - recursiveApply(data(adept::range(N,adept::end)), iSign, N/2); +static void recursiveApply(aVector data, int iSign, size_t N) { + if (N == 1) + return; + recursiveApply(data, iSign, N / 2); + recursiveApply(data(adept::range(N, adept::end)), iSign, N / 2); - adouble wtemp = iSign*std::sin(M_PI/N); - adouble wpi = -iSign*std::sin(2*M_PI/N); - adouble wpr = -2.0*wtemp*wtemp; + adouble wtemp = iSign * std::sin(M_PI / N); + adouble wpi = -iSign * std::sin(2 * M_PI / N); + adouble wpr = -2.0 * wtemp * wtemp; adouble wr = 1.0; adouble wi = 0.0; - for (unsigned i=0; ii) { - swapad(data(j-1), data(i-1)); +static void scramble(aVector data, size_t N) { + int j = 1; + for (int i = 1; i < 2 * N; i += 2) { + if (j > i) { + swapad(data(j - 1), data(i - 1)); swapad(data(j), data(i)); } int m = N; - while (m>=2 && j>m) { + while (m >= 2 && j > m) { j -= m; m >>= 1; } @@ -143,28 +142,27 @@ static void scramble(aVector data, unsigned N) { } } -static void rescale(aVector data, unsigned N) { - adouble scale = ((double)1)/N; - for (unsigned i=0; i<2*N; i++) { +static void rescale(aVector data, size_t N) { + adouble scale = ((double)1) / N; + for (size_t i = 0; i < 2 * N; i++) { data[i] *= scale; } } -static void fft(aVector data, unsigned N) { +static void fft(aVector data, size_t N) { scramble(data, N); - recursiveApply(data,1, N); + recursiveApply(data, 1, N); } -static void ifft(aVector data, unsigned N) { +static void ifft(aVector data, size_t N) { scramble(data, N); - recursiveApply(data,-1, N); + recursiveApply(data, -1, N); rescale(data, N); } //! 
Tapenade extern "C" { - /* Generated by TAPENADE (INRIA, Ecuador team) Tapenade 3.16 (bugfix_servletAD) - 4 Jan 2024 17:44 */ @@ -179,51 +177,51 @@ extern "C" { Plus diff mem management of: a:in b:in */ inline void swap_b(double *a, double *ab, double *b, double *bb) { - double temp = *a; - double tempb = 0.0; - *a = *b; - *b = temp; - tempb = *bb; - *bb = *ab; - *ab = tempb; + double temp = *a; + double tempb = 0.0; + *a = *b; + *b = temp; + tempb = *bb; + *bb = *ab; + *ab = tempb; } inline void swap_c(double *a, double *b) { - double temp = *a; - *a = *b; - *b = temp; + double temp = *a; + *a = *b; + *b = temp; } -static void recursiveApply_c(double *data, int iSign, unsigned int N) { - unsigned int arg1; - double *arg10; - unsigned int arg2; - if (N == 1) - return; - else { - arg1 = N/2; - recursiveApply_c(data, iSign, arg1); - arg10 = data + N; - arg2 = N/2; - recursiveApply_c(arg10, iSign, arg2); - double wtemp = iSign*sin(3.14/N); - double wpi = -iSign*sin(2*3.14/N); - double wpr = -2.0*wtemp*wtemp; - double wr = 1.0; - double wi = 0.0; - for (unsigned int i = 0; i <= N-1; i += 2) { - int iN = i + N; - double tempr = data[iN]*wr - data[iN+1]*wi; - double tempi = data[iN]*wi + data[iN+1]*wr; - data[iN] = data[i] - tempr; - data[iN + 1] = data[i + 1] - tempi; - data[i] += tempr; - data[i + 1] += tempi; - wtemp = wr; - wr += wr*wpr - wi*wpi; - wi += wi*wpr + wtemp*wpi; - } +static void recursiveApply_c(double *data, int iSign, size_t N) { + size_t arg1; + double *arg10; + size_t arg2; + if (N == 1) + return; + else { + arg1 = N / 2; + recursiveApply_c(data, iSign, arg1); + arg10 = data + N; + arg2 = N / 2; + recursiveApply_c(arg10, iSign, arg2); + double wtemp = iSign * sin(3.14 / N); + double wpi = -iSign * sin(2 * 3.14 / N); + double wpr = -2.0 * wtemp * wtemp; + double wr = 1.0; + double wi = 0.0; + for (size_t i = 0; i <= N - 1; i += 2) { + int iN = i + N; + double tempr = data[iN] * wr - data[iN + 1] * wi; + double tempi = data[iN] * wi + data[iN + 1] * wr; 
+ data[iN] = data[i] - tempr; + data[iN + 1] = data[i + 1] - tempi; + data[i] += tempr; + data[i + 1] += tempi; + wtemp = wr; + wr += wr * wpr - wi * wpi; + wi += wi * wpr + wtemp * wpi; } + } } /* @@ -232,82 +230,81 @@ static void recursiveApply_c(double *data, int iSign, unsigned int N) { with respect to varying inputs: *data Plus diff mem management of: data:in */ -static void recursiveApply_b(double *data, double *datab, int iSign, unsigned - int N) { - unsigned int arg1; - double *arg10; - double *arg10b; - unsigned int arg2; - int branch; - if (N != 1) { - arg1 = N/2; - pushReal8(*data); - recursiveApply_c(data, iSign, arg1); - arg10b = datab + N; - arg10 = data + N; - arg2 = N/2; - if (arg10) { - pushReal8(*arg10); - pushControl1b(1); - } else - pushControl1b(0); - recursiveApply_c(arg10, iSign, arg2); - double wtemp = iSign*sin(3.14/N); - double wpi = -iSign*sin(2*3.14/N); - double wpr = -2.0*wtemp*wtemp; - double wr = 1.0; - double wi = 0.0; - for (unsigned int i = 0; i <= N-1; i += 2) { - int iN = i + N; - double tempr = data[iN]*wr - data[iN+1]*wi; - double tempi = data[iN]*wi + data[iN+1]*wr; - double temprb; - double tempib; - double tmp; - double tmp0; - tmp = data[i] - tempr; - data[iN] = tmp; - tmp0 = data[i + 1] - tempi; - data[iN + 1] = tmp0; - data[i] = data[i] + tempr; - data[i + 1] = data[i + 1] + tempi; - wtemp = wr; - pushReal8(wr); - wr = wr + (wr*wpr - wi*wpi); - pushReal8(wi); - wi = wi + (wi*wpr + wtemp*wpi); - pushInteger4(iN); - } - for (unsigned int i = N-(N-1)%2-1; i >= 0; i -= 2) { - int iN; - double tempr; - double temprb = 0.0; - double tempi; - double tempib = 0.0; - double tmpb; - double tmpb0; - popInteger4(&iN); - tmpb0 = datab[iN + 1]; - popReal8(&wi); - popReal8(&wr); - tempib = datab[i + 1] - tmpb0; - temprb = datab[i]; - datab[iN + 1] = 0.0; - datab[i + 1] = datab[i + 1] + tmpb0; - tmpb = datab[iN]; - datab[iN] = 0.0; - datab[i] = datab[i] + tmpb; - temprb = temprb - tmpb; - datab[iN + 1] = datab[iN + 1] + wr*tempib - 
wi*temprb; - datab[iN] = datab[iN] + wi*tempib + wr*temprb; - } - popControl1b(&branch); - if (branch == 1) - popReal8(arg10); - recursiveApply_b(arg10, arg10b, iSign, arg2); - popReal8(data); - recursiveApply_b(data, datab, iSign, arg1); +static void recursiveApply_b(double *data, double *datab, int iSign, size_t N) { + size_t arg1; + double *arg10; + double *arg10b; + size_t arg2; + int branch; + if (N != 1) { + arg1 = N / 2; + pushReal8(*data); + recursiveApply_c(data, iSign, arg1); + arg10b = datab + N; + arg10 = data + N; + arg2 = N / 2; + if (arg10) { + pushReal8(*arg10); + pushControl1b(1); + } else + pushControl1b(0); + recursiveApply_c(arg10, iSign, arg2); + double wtemp = iSign * sin(3.14 / N); + double wpi = -iSign * sin(2 * 3.14 / N); + double wpr = -2.0 * wtemp * wtemp; + double wr = 1.0; + double wi = 0.0; + for (size_t i = 0; i <= N - 1; i += 2) { + int iN = i + N; + double tempr = data[iN] * wr - data[iN + 1] * wi; + double tempi = data[iN] * wi + data[iN + 1] * wr; + double temprb; + double tempib; + double tmp; + double tmp0; + tmp = data[i] - tempr; + data[iN] = tmp; + tmp0 = data[i + 1] - tempi; + data[iN + 1] = tmp0; + data[i] = data[i] + tempr; + data[i + 1] = data[i + 1] + tempi; + wtemp = wr; + pushReal8(wr); + wr = wr + (wr * wpr - wi * wpi); + pushReal8(wi); + wi = wi + (wi * wpr + wtemp * wpi); + pushInteger4(iN); } + for (size_t i = N - (N - 1) % 2 - 1; i >= 0; i -= 2) { + int iN; + double tempr; + double temprb = 0.0; + double tempi; + double tempib = 0.0; + double tmpb; + double tmpb0; + popInteger4(&iN); + tmpb0 = datab[iN + 1]; + popReal8(&wi); + popReal8(&wr); + tempib = datab[i + 1] - tmpb0; + temprb = datab[i]; + datab[iN + 1] = 0.0; + datab[i + 1] = datab[i + 1] + tmpb0; + tmpb = datab[iN]; + datab[iN] = 0.0; + datab[i] = datab[i] + tmpb; + temprb = temprb - tmpb; + datab[iN + 1] = datab[iN + 1] + wr * tempib - wi * temprb; + datab[iN] = datab[iN] + wi * tempib + wr * temprb; + } + popControl1b(&branch); + if (branch == 1) + 
popReal8(arg10); + recursiveApply_b(arg10, arg10b, iSign, arg2); + popReal8(data); + recursiveApply_b(data, datab, iSign, arg1); + } } /* @@ -316,68 +313,67 @@ static void recursiveApply_b(double *data, double *datab, int iSign, unsigned with respect to varying inputs: *data Plus diff mem management of: data:in */ -static void scramble_b(double *data, double *datab, unsigned int N) { - int j = 1; - int branch; - for (int i = 1; i <= 2*N-1; i += 2) { - int adCount; - if (j > i) { - pushReal8(data[i - 1]); - pushReal8(data[j - 1]); - swap_c(&(data[j - 1]), &(data[i - 1])); - pushReal8(data[i]); - pushReal8(data[j]); - swap_c(&(data[j]), &(data[i])); - pushControl1b(0); - } else - pushControl1b(1); - int m = N; - adCount = 0; - while(m >= 2 && j > m) { - pushInteger4(j); - j = j - m; - m = m >> 1; - adCount = adCount + 1; - } - pushInteger4(adCount); - pushInteger4(j); - j = j + m; +static void scramble_b(double *data, double *datab, size_t N) { + int j = 1; + int branch; + for (int i = 1; i <= 2 * N - 1; i += 2) { + int adCount; + if (j > i) { + pushReal8(data[i - 1]); + pushReal8(data[j - 1]); + swap_c(&(data[j - 1]), &(data[i - 1])); + pushReal8(data[i]); + pushReal8(data[j]); + swap_c(&(data[j]), &(data[i])); + pushControl1b(0); + } else + pushControl1b(1); + int m = N; + adCount = 0; + while (m >= 2 && j > m) { + pushInteger4(j); + j = j - m; + m = m >> 1; + adCount = adCount + 1; } - for (int i = 2*N-(2*N-2)%2-1; i >= 1; i -= 2) { - int m; - int adCount; - int i0; - popInteger4(&j); - popInteger4(&adCount); - for (i0 = 1; i0 < adCount+1; ++i0) - popInteger4(&j); - popControl1b(&branch); - if (branch == 0) { - popReal8(&(data[j])); - popReal8(&(data[i])); - swap_b(&(data[j]), &(datab[j]), &(data[i]), &(datab[i])); - popReal8(&(data[j - 1])); - popReal8(&(data[i - 1])); - swap_b(&(data[j - 1]), &(datab[j - 1]), &(data[i - 1]), &(datab[i - - 1])); - } + pushInteger4(adCount); + pushInteger4(j); + j = j + m; + } + for (int i = 2 * N - (2 * N - 2) % 2 - 1; i >= 1; i 
-= 2) { + int m; + int adCount; + int i0; + popInteger4(&j); + popInteger4(&adCount); + for (i0 = 1; i0 < adCount + 1; ++i0) + popInteger4(&j); + popControl1b(&branch); + if (branch == 0) { + popReal8(&(data[j])); + popReal8(&(data[i])); + swap_b(&(data[j]), &(datab[j]), &(data[i]), &(datab[i])); + popReal8(&(data[j - 1])); + popReal8(&(data[i - 1])); + swap_b(&(data[j - 1]), &(datab[j - 1]), &(data[i - 1]), &(datab[i - 1])); } + } } -static void scramble_c(double *data, unsigned int N) { - int j = 1; - for (int i = 1; i <= 2*N-1; i += 2) { - if (j > i) { - swap_c(&(data[j - 1]), &(data[i - 1])); - swap_c(&(data[j]), &(data[i])); - } - int m = N; - while(m >= 2 && j > m) { - j -= m; - m >>= 1; - } - j += m; +static void scramble_c(double *data, size_t N) { + int j = 1; + for (int i = 1; i <= 2 * N - 1; i += 2) { + if (j > i) { + swap_c(&(data[j - 1]), &(data[i - 1])); + swap_c(&(data[j]), &(data[i])); + } + int m = N; + while (m >= 2 && j > m) { + j -= m; + m >>= 1; } + j += m; + } } /* @@ -386,18 +382,18 @@ static void scramble_c(double *data, unsigned int N) { with respect to varying inputs: *data Plus diff mem management of: data:in */ -static void rescale_b(double *data, double *datab, unsigned int N) { - double scale = (double)1/N; - for (unsigned int i = 0; i < 2*N; ++i) - data[i] = data[i]*scale; - for (unsigned int i = 2*N-1; i > -1; --i) - datab[i] = scale*datab[i]; +static void rescale_b(double *data, double *datab, size_t N) { + double scale = (double)1 / N; + for (size_t i = 0; i < 2 * N; ++i) + data[i] = data[i] * scale; + for (size_t i = 2 * N - 1; i > -1; --i) + datab[i] = scale * datab[i]; } -static void rescale_c(double *data, unsigned int N) { - double scale = (double)1/N; - for (unsigned int i = 0; i < 2*N; ++i) - data[i] *= scale; +static void rescale_c(double *data, size_t N) { + double scale = (double)1 / N; + for (size_t i = 0; i < 2 * N; ++i) + data[i] *= scale; } /* @@ -406,20 +402,20 @@ static void rescale_c(double *data, unsigned int N) { 
with respect to varying inputs: *data Plus diff mem management of: data:in */ -void fiveft_b(double *data, double *datab, unsigned int N) { - pushReal8(*data); - scramble_c(data, N); - pushReal8(*data); - recursiveApply_c(data, 1, N); - popReal8(data); - recursiveApply_b(data, datab, 1, N); - popReal8(data); - scramble_b(data, datab, N); +void fiveft_b(double *data, double *datab, size_t N) { + pushReal8(*data); + scramble_c(data, N); + pushReal8(*data); + recursiveApply_c(data, 1, N); + popReal8(data); + recursiveApply_b(data, datab, 1, N); + popReal8(data); + scramble_b(data, datab, N); } -void fiveft_c(double *data, unsigned int N) { - scramble_c(data, N); - recursiveApply_c(data, 1, N); +void fiveft_c(double *data, size_t N) { + scramble_c(data, N); + recursiveApply_c(data, 1, N); } /* @@ -428,25 +424,25 @@ void fiveft_c(double *data, unsigned int N) { with respect to varying inputs: *data Plus diff mem management of: data:in */ -void ifiveft_b(double *data, double *datab, unsigned int N) { - pushReal8(*data); - scramble_c(data, N); - pushReal8(*data); - recursiveApply_c(data, -1, N); - pushReal8(*data); - rescale_c(data, N); - popReal8(data); - rescale_b(data, datab, N); - popReal8(data); - recursiveApply_b(data, datab, -1, N); - popReal8(data); - scramble_b(data, datab, N); +void ifiveft_b(double *data, double *datab, size_t N) { + pushReal8(*data); + scramble_c(data, N); + pushReal8(*data); + recursiveApply_c(data, -1, N); + pushReal8(*data); + rescale_c(data, N); + popReal8(data); + rescale_b(data, datab, N); + popReal8(data); + recursiveApply_b(data, datab, -1, N); + popReal8(data); + scramble_b(data, datab, N); } -void ifiveft_c(double *data, unsigned int N) { - scramble_c(data, N); - recursiveApply_c(data, -1, N); - rescale_c(data, N); +void ifiveft_c(double *data, size_t N) { + scramble_c(data, N); + recursiveApply_c(data, -1, N); + rescale_c(data, N); } /* @@ -456,15 +452,15 @@ void ifiveft_c(double *data, unsigned int N) { RW status of diff variables: 
data:(loc) *data:in-out Plus diff mem management of: data:in */ -void foobar_b(double *data, double *datab, unsigned int len) { - pushReal8(*data); - fiveft_c(data, len); - pushReal8(*data); - ifiveft_c(data, len); - popReal8(data); - ifiveft_b(data, datab, len); - popReal8(data); - fiveft_b(data, datab, len); +void foobar_b(double *data, double *datab, size_t len) { + pushReal8(*data); + fiveft_c(data, len); + pushReal8(*data); + ifiveft_c(data, len); + popReal8(data); + ifiveft_b(data, datab, len); + popReal8(data); + fiveft_b(data, datab, len); } } From 609bf4deb74d806ca3619bb7a5c9af0949b38ed6 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Thu, 31 Oct 2024 14:11:42 -0600 Subject: [PATCH 69/88] bench fft: move from i+=2 to ii++ in loop increments --- enzyme/benchmarks/ReverseMode/fft/fft.h | 50 ++++++++++++++----------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.h b/enzyme/benchmarks/ReverseMode/fft/fft.h index ba8b0152a4e4..8bdfa6351112 100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.h +++ b/enzyme/benchmarks/ReverseMode/fft/fft.h @@ -39,8 +39,9 @@ static void recursiveApply(double *data, int iSign, size_t N) { double wr = 1.0; double wi = 0.0; - for (size_t i = 0; i < N; i += 2) { - int iN = i + N; + for (size_t ii = 0; ii < N / 2; ii++) { + size_t i = 2 * ii; + size_t iN = i + N; double tempr = data[iN] * wr - data[iN + 1] * wi; double tempi = data[iN] * wi + data[iN + 1] * wr; @@ -57,13 +58,14 @@ static void recursiveApply(double *data, int iSign, size_t N) { } static void scramble(double *data, size_t N) { - int j = 1; - for (int i = 1; i < 2 * N; i += 2) { + size_t j = 1; + for (size_t ii = 0; ii < N; ii++) { + size_t i = 2 * ii + 1; if (j > i) { swap(&data[j - 1], &data[i - 1]); swap(&data[j], &data[i]); } - int m = N; + size_t m = N; while (m >= 2 && j > m) { j -= m; m >>= 1; @@ -109,8 +111,9 @@ static void recursiveApply(aVector data, int iSign, size_t N) { adouble wr = 1.0; adouble wi = 0.0; - 
for (size_t i = 0; i < N; i += 2) { - int iN = i + N; + for (size_t ii = 0; ii < N / 2; ii++) { + size_t i = 2 * ii; + size_t iN = i + N; adouble tempr = data(iN) * wr - data(iN + 1) * wi; adouble tempi = data(iN) * wi + data(iN + 1) * wr; @@ -127,13 +130,14 @@ static void recursiveApply(aVector data, int iSign, size_t N) { } static void scramble(aVector data, size_t N) { - int j = 1; - for (int i = 1; i < 2 * N; i += 2) { + size_t j = 1; + for (size_t ii = 0; ii < N; ii++) { + size_t i = 2 * ii + 1; if (j > i) { swapad(data(j - 1), data(i - 1)); swapad(data(j), data(i)); } - int m = N; + size_t m = N; while (m >= 2 && j > m) { j -= m; m >>= 1; @@ -209,8 +213,9 @@ static void recursiveApply_c(double *data, int iSign, size_t N) { double wpr = -2.0 * wtemp * wtemp; double wr = 1.0; double wi = 0.0; - for (size_t i = 0; i <= N - 1; i += 2) { - int iN = i + N; + for (size_t ii = 0; ii < N / 2; ii++) { + size_t i = 2 * ii; + size_t iN = i + N; double tempr = data[iN] * wr - data[iN + 1] * wi; double tempi = data[iN] * wi + data[iN + 1] * wr; data[iN] = data[i] - tempr; @@ -254,7 +259,8 @@ static void recursiveApply_b(double *data, double *datab, int iSign, size_t N) { double wpr = -2.0 * wtemp * wtemp; double wr = 1.0; double wi = 0.0; - for (size_t i = 0; i <= N - 1; i += 2) { + for (size_t ii = 0; ii < N / 2; ii++) { + size_t i = 2 * ii; int iN = i + N; double tempr = data[iN] * wr - data[iN + 1] * wi; double tempi = data[iN] * wi + data[iN + 1] * wr; @@ -316,7 +322,8 @@ static void recursiveApply_b(double *data, double *datab, int iSign, size_t N) { static void scramble_b(double *data, double *datab, size_t N) { int j = 1; int branch; - for (int i = 1; i <= 2 * N - 1; i += 2) { + for (size_t ii = 0; ii < N; ii++) { + size_t i = 2 * ii + 1; int adCount; if (j > i) { pushReal8(data[i - 1]); @@ -328,7 +335,7 @@ static void scramble_b(double *data, double *datab, size_t N) { pushControl1b(0); } else pushControl1b(1); - int m = N; + size_t m = N; adCount = 0; while (m >= 
2 && j > m) { pushInteger4(j); @@ -340,10 +347,10 @@ static void scramble_b(double *data, double *datab, size_t N) { pushInteger4(j); j = j + m; } - for (int i = 2 * N - (2 * N - 2) % 2 - 1; i >= 1; i -= 2) { - int m; + for (size_t i = 2 * N - (2 * N - 2) % 2 - 1; i >= 1; i -= 2) { + size_t m; int adCount; - int i0; + size_t i0; popInteger4(&j); popInteger4(&adCount); for (i0 = 1; i0 < adCount + 1; ++i0) @@ -361,13 +368,14 @@ static void scramble_b(double *data, double *datab, size_t N) { } static void scramble_c(double *data, size_t N) { - int j = 1; - for (int i = 1; i <= 2 * N - 1; i += 2) { + size_t j = 1; + for (size_t ii = 0; ii < N; ii++) { + size_t i = 2 * ii + 1; if (j > i) { swap_c(&(data[j - 1]), &(data[i - 1])); swap_c(&(data[j]), &(data[i])); } - int m = N; + size_t m = N; while (m >= 2 && j > m) { j -= m; m >>= 1; From b722c8715532f8a48239b88b9b16dc75753ea9d9 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 31 Oct 2024 19:28:02 -0400 Subject: [PATCH 70/88] move rust fft to use i32 for isign --- enzyme/benchmarks/ReverseMode/fft/src/safe.rs | 10 +++++----- enzyme/benchmarks/ReverseMode/fft/src/unsf.rs | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs index 62caf8645c68..d147f9cfa4d7 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs @@ -31,7 +31,7 @@ fn bitreversal_perm(data: &mut [T]) { } } -fn radix2(data: &mut [f64], i_sign: f64) { +fn radix2(data: &mut [f64], i_sign: i32) { let n = data.len() / 2; if n == 1 { return; @@ -41,8 +41,8 @@ fn radix2(data: &mut [f64], i_sign: f64) { radix2(a, i_sign); radix2(b, i_sign); - let wtemp = i_sign * (PI / n as f64).sin(); - let wpi = -i_sign * (2.0 * PI / n as f64).sin(); + let wtemp = i_sign as f64 * (PI / n as f64).sin(); + let wpi = -i_sign as f64 * (2.0 * PI / n as f64).sin(); let wpr = -2.0 * wtemp * wtemp; let mut 
wr = 1.0; let mut wi = 0.0; @@ -71,12 +71,12 @@ fn rescale(data: &mut [f64], scale: usize) { fn fft(data: &mut [f64]) { bitreversal_perm(data); - radix2(data, 1.0); + radix2(data, 1); } fn ifft(data: &mut [f64]) { bitreversal_perm(data); - radix2(data, -1.0); + radix2(data, -1); rescale(data, data.len() / 2); } diff --git a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs index 5e09a2f26fac..dd3665860ff9 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs @@ -20,13 +20,13 @@ unsafe fn bitreversal_perm(data: *mut f64, len: usize) { } } -unsafe fn radix2(data: *mut f64, i_sign: f64, n: usize) { +unsafe fn radix2(data: *mut f64, i_sign: i32, n: usize) { if n == 1 { return; } radix2(data, i_sign, n/2); radix2(data.add(n), i_sign, n/2); - let wtemp = i_sign * (PI / n as f64).sin(); - let wpi = -i_sign * (2.0 * PI / n as f64).sin(); + let wtemp = i_sign as f64 * (PI / n as f64).sin(); + let wpi = -i_sign as f64 * (2.0 * PI / n as f64).sin(); let wpr = -2.0 * wtemp * wtemp; let mut wr = 1.0; let mut wi = 0.0; @@ -57,12 +57,12 @@ unsafe fn rescale(data: *mut f64, n: usize) { unsafe fn fft(data: *mut f64, n: usize) { bitreversal_perm(data, n); - radix2(data, 1.0, n); + radix2(data, 1, n); } unsafe fn ifft(data: *mut f64, n: usize) { bitreversal_perm(data, n); - radix2(data, -1.0, n); + radix2(data, -1, n); rescale(data, n); } From a7943f055936921283be96d0707c7e4059feb845 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Fri, 1 Nov 2024 12:44:23 -0400 Subject: [PATCH 71/88] safe rust fft without bounds checking --- enzyme/benchmarks/ReverseMode/fft/src/safe.rs | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs index d147f9cfa4d7..a8b25fa7e443 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs +++ 
b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs @@ -48,13 +48,23 @@ fn radix2(data: &mut [f64], i_sign: i32) { let mut wi = 0.0; for i in (0..n).step_by(2) { - let tempr = b[i] * wr - b[i + 1] * wi; - let tempi = b[i] * wi + b[i + 1] * wr; + unsafe { + let tempr = b.get_unchecked(i) * wr - b.get_unchecked(i + 1) * wi; + let tempi = b.get_unchecked(i) * wi + b.get_unchecked(i + 1) * wr; + + *b.get_unchecked_mut(i) = a.get_unchecked(i) - tempr; + *b.get_unchecked_mut(i + 1) = a.get_unchecked(i + 1) - tempi; + *a.get_unchecked_mut(i) += tempr; + *a.get_unchecked_mut(i + 1) += tempi; + } + + //let tempr = b[i] * wr - b[i + 1] * wi; + //let tempi = b[i] * wi + b[i + 1] * wr; - b[i] = a[i] - tempr; - b[i + 1] = a[i + 1] - tempi; - a[i] += tempr; - a[i + 1] += tempi; + //b[i] = a[i] - tempr; + //b[i + 1] = a[i + 1] - tempi; + //a[i] += tempr; + //a[i + 1] += tempi; let wtemp_new = wr; wr += wr * wpr - wi * wpi; From 59f866b19eb3608065c40505444bc6748603678e Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Fri, 1 Nov 2024 13:15:51 -0400 Subject: [PATCH 72/88] Add -fno-math-errno to each makefile for fairness --- enzyme/benchmarks/ReverseMode/ba/Makefile.make | 2 +- enzyme/benchmarks/ReverseMode/fft/Makefile.make | 2 +- enzyme/benchmarks/ReverseMode/gmm/Makefile.make | 2 +- enzyme/benchmarks/ReverseMode/lstm/Makefile.make | 2 +- enzyme/benchmarks/ReverseMode/ode-real/Makefile.make | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/ba/Makefile.make b/enzyme/benchmarks/ReverseMode/ba/Makefile.make index f995bc33eaa3..158e9c413829 100644 --- a/enzyme/benchmarks/ReverseMode/ba/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/ba/Makefile.make @@ -11,7 +11,7 @@ $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a: src/lib.rs Cargo.toml RUSTFLAGS="-Z autodiff=LooseTypes" cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm ba.o: ba.cpp $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a - clang++ 
$(LOAD) $(BENCH) ba.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O3 -o ba.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a -L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ $(LOAD) $(BENCH) ba.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -fno-math-errno -O3 -o ba.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: ba.o ./$^ diff --git a/enzyme/benchmarks/ReverseMode/fft/Makefile.make b/enzyme/benchmarks/ReverseMode/fft/Makefile.make index 288168048035..99c5c734fbd9 100644 --- a/enzyme/benchmarks/ReverseMode/fft/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/fft/Makefile.make @@ -11,7 +11,7 @@ $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a: src/lib.rs Cargo.toml cargo +enzyme rustc --release --lib --crate-type=staticlib fft.o: fft.cpp $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a - clang++ $(LOAD) $(BENCH) fft.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O3 -o fft.o -lpthread $(BENCHLINK) -lm -lfft -L $(dir)/benchmarks/ReverseMode/fft/target/release/ -L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ $(LOAD) $(BENCH) fft.cpp -fno-math-errno -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O3 -o fft.o -lpthread $(BENCHLINK) -lm -lfft -L $(dir)/benchmarks/ReverseMode/fft/target/release/ -L /usr/lib/gcc/x86_64-linux-gnu/11 results.txt: fft.o ./$^ 1048576 | tee $@ diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index 52ed73a6fc07..47158bd37aa4 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -11,7 +11,7 @@ $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a: src/lib.rs Cargo.to RUSTFLAGS="-Z autodiff=LooseTypes" cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm gmm.o: gmm.cpp 
$(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a - clang++ $(LOAD) $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O3 -o gmm.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ $(LOAD) $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -fno-math-errno -O3 -o gmm.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: gmm.o ./$^ diff --git a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make index eb9d531b0cd8..270f7dfdf96f 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make @@ -23,7 +23,7 @@ $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a: src/lib.rs Cargo.to # #opt $^ -O2 -o $@ -S lstm.o: lstm.cpp $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a - clang++ $(LOAD) $(BENCH) -pthread -O3 lstm.cpp -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a + clang++ $(LOAD) $(BENCH) -pthread -fno-math-errno -O3 lstm.cpp -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a results.json: lstm.o ./$^ diff --git a/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make b/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make index eeafb349c16f..9da0e2f8d397 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make @@ -29,7 +29,7 @@ $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a: src/lib.rs Cargo ode.o: ode.cpp $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a #/home/manuel/prog/llvm18/build/bin/clang++ $(LOAD) $(BENCH) ode.cpp -I /u/drehwald/prog/boost_1_81_0 -DBOOST_DIR=/u/drehwald/prog/boost_1_81_0 -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 
-O2 -o ode.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a -L /usr/lib/gcc/x86_64-linux-gnu/11 - clang++ $(LOAD) $(BENCH) ode.cpp -I /u/drehwald/prog/boost_1_81_0 -DBOOST_DIR=/u/drehwald/prog/boost_1_81_0 -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O3 -o ode.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a -L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ $(LOAD) $(BENCH) ode.cpp -I /u/drehwald/prog/boost_1_81_0 -DBOOST_DIR=/u/drehwald/prog/boost_1_81_0 -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -fno-math-errno -O3 -o ode.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a -L /usr/lib/gcc/x86_64-linux-gnu/11 #fft.o: fft.cpp $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a # clang++ $(LOAD) $(BENCH) fft.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o fft.o -lpthread $(BENCHLINK) -lm -lfft -L $(dir)/benchmarks/ReverseMode/fft/target/release/ -L /usr/lib/gcc/x86_64-linux-gnu/11 From 6460848523ec52cfac36a8c171f6243b4452b6f5 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Fri, 1 Nov 2024 14:48:58 -0600 Subject: [PATCH 73/88] bench fft: __builtin_sin not necessary with -fno-math-errno --- enzyme/benchmarks/ReverseMode/fft/fft.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.h b/enzyme/benchmarks/ReverseMode/fft/fft.h index 8bdfa6351112..71597d1887c3 100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.h +++ b/enzyme/benchmarks/ReverseMode/fft/fft.h @@ -33,8 +33,8 @@ static void recursiveApply(double *data, int iSign, size_t N) { recursiveApply(data, iSign, N / 2); recursiveApply(data + N, iSign, N / 2); - double wtemp = iSign * __builtin_sin(M_PI / N); - double wpi = -iSign * __builtin_sin(2 * M_PI / N); + double wtemp = iSign * sin(M_PI / N); + double wpi = -iSign * sin(2 * M_PI / N); double wpr = -2.0 * wtemp * wtemp; 
double wr = 1.0; double wi = 0.0; From 3b174eadf936169ce9eced5879cf3055ae3d354f Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 6 Nov 2024 01:48:35 -0500 Subject: [PATCH 74/88] move c++ lstm to also use size_t, since Rust uses usize --- enzyme/benchmarks/ReverseMode/adbench/lstm.h | 92 +++++++++---------- enzyme/benchmarks/ReverseMode/lstm/lstm.cpp | 88 +++++++++--------- enzyme/benchmarks/ReverseMode/lstm/lstm.h | 6 +- .../ReverseMode/lstm/lstm_mayalias.h | 20 ++-- 4 files changed, 103 insertions(+), 103 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/lstm.h b/enzyme/benchmarks/ReverseMode/adbench/lstm.h index fda5f8e3a0f2..81cf98b68b8a 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/lstm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/lstm.h @@ -19,9 +19,9 @@ using json = nlohmann::json; struct LSTMInput { - int l; - int c; - int b; + size_t l; + size_t c; + size_t b; std::vector main_params; std::vector extra_params; std::vector state; @@ -34,60 +34,60 @@ struct LSTMOutput { }; extern "C" { -void rust_unsafe_dlstm_objective(int l, int c, int b, double const *main_params, +void rust_unsafe_dlstm_objective(size_t l, size_t c, size_t b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double *state, double const *sequence, double *loss, double *dloss); -void rust_unsafe_lstm_objective(int l, int c, int b, double const *main_params, +void rust_unsafe_lstm_objective(size_t l, size_t c, size_t b, double const *main_params, double const *extra_params, double *state, double const *sequence, double *loss); -void rust_safe_lstm_objective(int l, int c, int b, double const *main_params, +void rust_safe_lstm_objective(size_t l, size_t c, size_t b, double const *main_params, double const *extra_params, double *state, double const *sequence, double *loss); -void cxx_restrict_lstm_objective(int l, int c, int b, double const *main_params, +void cxx_restrict_lstm_objective(size_t l, size_t c, size_t b, 
double const *main_params, double const *extra_params, double *state, double const *sequence, double *loss); -void cxx_mayalias_lstm_objective(int l, int c, int b, double const *main_params, +void cxx_mayalias_lstm_objective(size_t l, size_t c, size_t b, double const *main_params, double const *extra_params, double *state, double const *sequence, double *loss); -void rust_safe_dlstm_objective(int l, int c, int b, double const *main_params, +void rust_safe_dlstm_objective(size_t l, size_t c, size_t b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double *state, double const *sequence, double *loss, double *dloss); -void dlstm_objective_mayalias(int l, int c, int b, double const *main_params, +void dlstm_objective_mayalias(size_t l, size_t c, size_t b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double *state, double const *sequence, double *loss, double *dloss); -void dlstm_objective_restrict(int l, int c, int b, double const *main_params, +void dlstm_objective_restrict(size_t l, size_t c, size_t b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double *state, double const *sequence, double *loss, double *dloss); -void lstm_objective_b(int l, int c, int b, const double *main_params, +void lstm_objective_b(size_t l, size_t c, size_t b, const double *main_params, double *main_paramsb, const double *extra_params, double *extra_paramsb, double *state, const double *sequence, double *loss, double *lossb); -void adept_dlstm_objective(int l, int c, int b, double const *main_params, +void adept_dlstm_objective(size_t l, size_t c, size_t b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double *state, double const *sequence, double *loss, double *dloss); } void read_lstm_instance(const string& fn, - int* l, int* c, int* b, + size_t* l, size_t* c, size_t* b, 
vector& main_params, vector& extra_params, vector& state, @@ -100,33 +100,33 @@ void read_lstm_instance(const string& fn, exit(1); } - fscanf(fid, "%i %i %i", l, c, b); + fscanf(fid, "%zu %zu %zu", l, c, b); - int l_ = *l, c_ = *c, b_ = *b; + size_t l_ = *l, c_ = *c, b_ = *b; - int main_sz = 2 * l_ * 4 * b_; - int extra_sz = 3 * b_; - int state_sz = 2 * l_ * b_; - int seq_sz = c_ * b_; + size_t main_sz = 2 * l_ * 4 * b_; + size_t extra_sz = 3 * b_; + size_t state_sz = 2 * l_ * b_; + size_t seq_sz = c_ * b_; main_params.resize(main_sz); extra_params.resize(extra_sz); state.resize(state_sz); sequence.resize(seq_sz); - for (int i = 0; i < main_sz; i++) { + for (size_t i = 0; i < main_sz; i++) { fscanf(fid, "%lf", &main_params[i]); } - for (int i = 0; i < extra_sz; i++) { + for (size_t i = 0; i < extra_sz; i++) { fscanf(fid, "%lf", &extra_params[i]); } - for (int i = 0; i < state_sz; i++) { + for (size_t i = 0; i < state_sz; i++) { fscanf(fid, "%lf", &state[i]); } - for (int i = 0; i < c_ * b_; i++) { + for (size_t i = 0; i < c_ * b_; i++) { fscanf(fid, "%lf", &sequence[i]); } @@ -134,12 +134,12 @@ void read_lstm_instance(const string& fn, fscanf(fid, "%c", &ch); fscanf(fid, "%c", &ch); - for (int i = 0; i < c_; i++) { + for (size_t i = 0; i < c_; i++) { unsigned char ch; fscanf(fid, "%c", &ch); - int cb = ch; - for (int j = b_ - 1; j >= 0; j--) { - int p = pow(2, j); + size_t cb = ch; + for (size_t j = b_ - 1; j >= 0; j--) { + size_t p = pow(2, j); if (cb >= p) { sequence[(i + 1) * b_ - j - 1] = 1; cb -= p; @@ -154,9 +154,9 @@ void read_lstm_instance(const string& fn, } typedef void(*deriv_t)( - int l, - int c, - int b, + size_t l, + size_t c, + size_t b, double const* main_params, double* dmain_params, double const* extra_params, @@ -170,7 +170,7 @@ typedef void(*deriv_t)( template void calculate_jacobian(struct LSTMInput &input, struct LSTMOutput &result) { - for(int i=0; i<100; i++) { + for(size_t i=0; i<100; i++) { double* main_params_gradient_part = 
result.gradient.data(); double* extra_params_gradient_part = result.gradient.data() + input.main_params.size(); @@ -198,7 +198,7 @@ void calculate_jacobian(struct LSTMInput &input, struct LSTMOutput &result) double calculate_mayalias_primal(struct LSTMInput &input) { double loss = 0.0; - for (int i = 0; i < 100; i++) { + for (size_t i = 0; i < 100; i++) { cxx_mayalias_lstm_objective( input.l, input.c, input.b, input.main_params.data(), input.extra_params.data(), input.state.data(), @@ -209,7 +209,7 @@ double calculate_mayalias_primal(struct LSTMInput &input) { double calculate_restrict_primal(struct LSTMInput &input) { double loss = 0.0; - for (int i = 0; i < 100; i++) { + for (size_t i = 0; i < 100; i++) { cxx_restrict_lstm_objective( input.l, input.c, input.b, input.main_params.data(), input.extra_params.data(), input.state.data(), @@ -220,7 +220,7 @@ double calculate_restrict_primal(struct LSTMInput &input) { double calculate_unsafe_primal(struct LSTMInput &input) { double loss = 0.0; - for (int i = 0; i < 100; i++) { + for (size_t i = 0; i < 100; i++) { rust_unsafe_lstm_objective( input.l, input.c, input.b, input.main_params.data(), input.extra_params.data(), input.state.data(), @@ -231,7 +231,7 @@ double calculate_unsafe_primal(struct LSTMInput &input) { double calculate_safe_primal(struct LSTMInput &input) { double loss = 0.0; - for (int i = 0; i < 100; i++) { + for (size_t i = 0; i < 100; i++) { rust_safe_lstm_objective(input.l, input.c, input.b, input.main_params.data(), input.extra_params.data(), input.state.data(), @@ -265,7 +265,7 @@ int main(const int argc, const char* argv[]) { } printf("\n"); - int Jcols = 8 * input.l * input.b + 3 * input.b; + size_t Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = { 0, std::vector(Jcols) }; { @@ -299,7 +299,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - int Jcols = 8 * input.l * input.b + 3 * input.b; + size_t Jcols = 8 * input.l * 
input.b + 3 * input.b; struct LSTMOutput result = { 0, std::vector(Jcols) }; { @@ -332,7 +332,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - int Jcols = 8 * input.l * input.b + 3 * input.b; + size_t Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = { 0, std::vector(Jcols) }; { @@ -366,7 +366,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - int Jcols = 8 * input.l * input.b + 3 * input.b; + size_t Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = {0, std::vector(Jcols)}; { @@ -399,7 +399,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - int Jcols = 8 * input.l * input.b + 3 * input.b; + size_t Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = { 0, std::vector(Jcols) }; { @@ -434,7 +434,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - int Jcols = 8 * input.l * input.b + 3 * input.b; + size_t Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = {0, std::vector(Jcols)}; { @@ -467,7 +467,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - int Jcols = 8 * input.l * input.b + 3 * input.b; + size_t Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = {0, std::vector(Jcols)}; { @@ -500,7 +500,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - int Jcols = 8 * input.l * input.b + 3 * input.b; + size_t Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = {0, std::vector(Jcols)}; { @@ -533,7 +533,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - int Jcols = 8 * input.l * input.b + 3 * input.b; + size_t Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput 
result = {0, std::vector(Jcols)}; { @@ -566,7 +566,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - int Jcols = 8 * input.l * input.b + 3 * input.b; + size_t Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = {0, std::vector(Jcols)}; { diff --git a/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp b/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp index ade0b2237510..3a291f6302cd 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp +++ b/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp @@ -34,10 +34,10 @@ double sigmoid(double x) } // log(sum(exp(x), 2)) -double logsumexp(double const* vect, int sz) +double logsumexp(double const* vect, size_t sz) { double sum = 0.0; - int i; + size_t i; for (i = 0; i < sz; i++) { @@ -50,7 +50,7 @@ double logsumexp(double const* vect, int sz) // LSTM OBJECTIVE // The LSTM model -void lstm_model_restrict(int hsize, double const *__restrict weight, +void lstm_model_restrict(size_t hsize, double const *__restrict weight, double const *__restrict bias, double *__restrict hidden, double *__restrict cell, double const *__restrict input) { @@ -63,7 +63,7 @@ void lstm_model_restrict(int hsize, double const *__restrict weight, double* outgate = &(gates[2 * hsize]); double* change = &(gates[3 * hsize]); - int i; + size_t i; // caching input // hidden (needed) for (i = 0; i < hsize; i++) @@ -89,10 +89,10 @@ void lstm_model_restrict(int hsize, double const *__restrict weight, } // Predict LSTM output given an input -void lstm_predict_restrict(int l, int b, double const *__restrict w, +void lstm_predict_restrict(size_t l, size_t b, double const *__restrict w, double const *__restrict w2, double *__restrict s, double const *__restrict x, double *__restrict x2) { - int i; + size_t i; for (i = 0; i < b; i++) { x2[i] = x[i] * w2[i]; @@ -113,15 +113,15 @@ void lstm_predict_restrict(int l, int b, double const *__restrict w, } // LSTM objective (loss function) -void 
cxx_restrict_lstm_objective(int l, int c, int b, +void cxx_restrict_lstm_objective(size_t l, size_t c, size_t b, double const *__restrict main_params, double const *__restrict extra_params, double *__restrict state, double const *__restrict sequence, double *__restrict loss) { - int i, t; + size_t i, t; double total = 0.0; - int count = 0; + size_t count = 0; const double* input = &(sequence[0]); double* ypred = (double*)malloc(b * sizeof(double)); double* ynorm = (double*)malloc(b * sizeof(double)); @@ -162,7 +162,7 @@ void __enzyme_autodiff(...) noexcept; // * tapenade -b -o lstm_tapenade -head "lstm_objective(loss)/(main_params extra_params)" lstm.c -void dlstm_objective_restrict(int l, int c, int b, double const *main_params, +void dlstm_objective_restrict(size_t l, size_t c, size_t b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double *state, double const *sequence, double *loss, @@ -208,11 +208,11 @@ double sigmoid_nodiff(double x) { Plus diff mem management of: vect:in */ // log(sum(exp(x), 2)) -void logsumexp_b(const double *vect, double *vectb, int sz, double logsumexpb) +void logsumexp_b(const double *vect, double *vectb, size_t sz, double logsumexpb) { double sum = 0.0; double sumb = 0.0; - int i; + size_t i; double logsumexp; for (i = 0; i < sz; ++i) sum = sum + exp(vect[i]); @@ -223,9 +223,9 @@ void logsumexp_b(const double *vect, double *vectb, int sz, double logsumexpb) } // log(sum(exp(x), 2)) -double logsumexp_nodiff(const double *vect, int sz) { +double logsumexp_nodiff(const double *vect, size_t sz) { double sum = 0.0; - int i; + size_t i; for (i = 0; i < sz; ++i) sum += exp(vect[i]); sum += 2; @@ -243,14 +243,14 @@ double logsumexp_nodiff(const double *vect, int sz) { */ // LSTM OBJECTIVE // The LSTM model -void lstm_model_b(int hsize, const double *weight, double *weightb, const +void lstm_model_b(size_t hsize, const double *weight, double *weightb, const double *bias, double *biasb, 
double *hidden, double *hiddenb, double * cell, double *cellb, const double *input, double *inputb) { double *gates; double *gatesb; double arg1; double arg1b; - int ii1; + size_t ii1; double temp; double tempb; gatesb = (double *)malloc(4*hsize*sizeof(double)); @@ -265,7 +265,7 @@ void lstm_model_b(int hsize, const double *weight, double *weightb, const double *outgateb = &(gatesb[2*hsize]); double *change = &(gates[3*hsize]); double *changeb = &(gatesb[3*hsize]); - int i; + size_t i; for (i = 0; i < hsize; ++i) { arg1 = input[i]*weight[i] + bias[i]; forget[i] = sigmoid_nodiff(arg1); @@ -325,7 +325,7 @@ void lstm_model_b(int hsize, const double *weight, double *weightb, const // LSTM OBJECTIVE // The LSTM model -void lstm_model_nodiff(int hsize, const double *weight, const double *bias, +void lstm_model_nodiff(size_t hsize, const double *weight, const double *bias, double *hidden, double *cell, const double *input) { double *gates; double arg1; @@ -334,7 +334,7 @@ void lstm_model_nodiff(int hsize, const double *weight, const double *bias, double *ingate = &(gates[hsize]); double *outgate = &(gates[2*hsize]); double *change = &(gates[3*hsize]); - int i; + size_t i; for (i = 0; i < hsize; ++i) { arg1 = input[i]*weight[i] + bias[i]; forget[i] = sigmoid_nodiff(arg1); @@ -358,10 +358,10 @@ void lstm_model_nodiff(int hsize, const double *weight, const double *bias, Plus diff mem management of: s:in w:in w2:in x2:in */ // Predict LSTM output given an input -void lstm_predict_b(int l, int b, const double *w, double *wb, const double * +void lstm_predict_b(size_t l, size_t b, const double *w, double *wb, const double * w2, double *w2b, double *s, double *sb, const double *x, double *x2, double *x2b) { - int i; + size_t i; double tmp; double tmpb; for (i = 0; i < b; ++i) { @@ -407,9 +407,9 @@ void lstm_predict_b(int l, int b, const double *w, double *wb, const double * } // Predict LSTM output given an input -void lstm_predict_nodiff(int l, int b, const double *w, const 
double *w2, +void lstm_predict_nodiff(size_t l, size_t b, const double *w, const double *w2, double *s, const double *x, double *x2) { - int i; + size_t i; for (i = 0; i < b; ++i) x2[i] = x[i]*w2[i]; double *xp = x2; @@ -432,17 +432,17 @@ void lstm_predict_nodiff(int l, int b, const double *w, const double *w2, Plus diff mem management of: extra_params:in loss:in */ // LSTM objective (loss function) -void lstm_objective_b(int l, int c, int b, const double *main_params, double * +void lstm_objective_b(size_t l, size_t c, size_t b, const double *main_params, double * main_paramsb, const double *extra_params, double *extra_paramsb, double *state, const double *sequence, double *loss, double *lossb) { - int i, t; + size_t i, t; double total = 0.0; double totalb = 0.0; - int count = 0; + size_t count = 0; const double *input = &(sequence[0]); double *ypred; double *ypredb; - int ii1; + size_t ii1; int branch; double* stateb = (double*)malloc(2 * l * b * sizeof(double)); /* TFIX */ ypredb = (double *)malloc(b*sizeof(double)); @@ -528,9 +528,9 @@ T sigmoid(T x) { // log(sum(exp(x), 2)) template -T logsumexp(const T* vect, int sz) { +T logsumexp(const T* vect, size_t sz) { T sum = 0.0; - for (int i = 0; i < sz; ++i) + for (size_t i = 0; i < sz; ++i) sum += exp(vect[i]); sum += adouble(2); return log(sum); @@ -541,7 +541,7 @@ T logsumexp(const T* vect, int sz) { // The LSTM model template void lstm_model( - int hsize, + size_t hsize, T* weight, T* bias, T* hidden, @@ -556,7 +556,7 @@ void lstm_model( T* outgate = &(gates[2 * hsize]); T* change = &(gates[3 * hsize]); - int i; + size_t i; // caching input // hidden (needed) for (i = 0; i < hsize; i++) @@ -584,8 +584,8 @@ void lstm_model( // Predict LSTM output given an input template void lstm_predict( - int l, - int b, + size_t l, + size_t b, T* w, T* w2, T* s, @@ -593,7 +593,7 @@ void lstm_predict( T* x2 ) { - int i; + size_t i; for (i = 0; i < b; i++) { x2[i] = x[i] * w2[i]; @@ -615,9 +615,9 @@ void lstm_predict( // LSTM 
objective (loss function) template void lstm_objective( - int l, - int c, - int b, + size_t l, + size_t c, + size_t b, T * __restrict main_params, T * __restrict extra_params, T* __restrict state, @@ -625,9 +625,9 @@ void lstm_objective( T* __restrict loss ) { - int i, t; + size_t i, t; T total = 0.0; - int count = 0; + size_t count = 0; T* input = &(sequence[0]); T* ypred = new T[b]; T* ynorm = new T[b]; @@ -662,14 +662,14 @@ void lstm_objective( }; // Note ADBench did not have an adept impl -void adept_dlstm_objective(int l, int c, int b, const double *main_params, double * +void adept_dlstm_objective(size_t l, size_t c, size_t b, const double *main_params, double * main_paramsb, const double *extra_params, double *extra_paramsb, double *state, const double *sequence, double *loss, double *lossb) { - int main_sz = 2 * l * 4 * b; - int extra_sz = 3 * b; - int state_sz = 2 * l * b; - int seq_sz = c* b; + size_t main_sz = 2 * l * 4 * b; + size_t extra_sz = 3 * b; + size_t state_sz = 2 * l * b; + size_t seq_sz = c* b; adept::Stack stack; diff --git a/enzyme/benchmarks/ReverseMode/lstm/lstm.h b/enzyme/benchmarks/ReverseMode/lstm/lstm.h index 61deba7de30d..b311b6ce24b4 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/lstm.h +++ b/enzyme/benchmarks/ReverseMode/lstm/lstm.h @@ -17,9 +17,9 @@ extern "C" { // state (2 * l * b) // sequence (c * b) void lstm_objective( - int l, - int c, - int b, + size_t l, + size_t c, + size_t b, double const* main_params, double const* extra_params, double* state, diff --git a/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h b/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h index 06401ff35a66..8031a46559a0 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h +++ b/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h @@ -34,10 +34,10 @@ extern "C" { //} // //// log(sum(exp(x), 2)) -// double logsumexp(double const* vect, int sz) +// double logsumexp(double const* vect, size_t sz) //{ // double sum = 0.0; -// int i; +// size_t i; 
// // for (i = 0; i < sz; i++) // { @@ -50,7 +50,7 @@ extern "C" { // LSTM OBJECTIVE // The LSTM model -void lstm_model(int hsize, double const *weight, double const *bias, +void lstm_model(size_t hsize, double const *weight, double const *bias, double *hidden, double *cell, double const *input) { // TODO NOTE THIS //__builtin_assume(hsize > 0); @@ -61,7 +61,7 @@ void lstm_model(int hsize, double const *weight, double const *bias, double *outgate = &(gates[2 * hsize]); double *change = &(gates[3 * hsize]); - int i; + size_t i; // caching input // hidden (needed) for (i = 0; i < hsize; i++) { @@ -85,9 +85,9 @@ void lstm_model(int hsize, double const *weight, double const *bias, } // Predict LSTM output given an input -void lstm_predict(int l, int b, double const *w, double const *w2, double *s, +void lstm_predict(size_t l, size_t b, double const *w, double const *w2, double *s, double const *x, double *x2) { - int i; + size_t i; for (i = 0; i < b; i++) { x2[i] = x[i] * w2[i]; } @@ -104,12 +104,12 @@ void lstm_predict(int l, int b, double const *w, double const *w2, double *s, } // LSTM objective (loss function) -void cxx_mayalias_lstm_objective(int l, int c, int b, double const *main_params, +void cxx_mayalias_lstm_objective(size_t l, size_t c, size_t b, double const *main_params, double const *extra_params, double *state, double const *sequence, double *loss) { - int i, t; + size_t i, t; double total = 0.0; - int count = 0; + size_t count = 0; const double *input = &(sequence[0]); double *ypred = (double *)malloc(b * sizeof(double)); double *ynorm = (double *)malloc(b * sizeof(double)); @@ -146,7 +146,7 @@ void __enzyme_autodiff(...) 
noexcept; // * tapenade -b -o lstm_tapenade -head "lstm_objective(loss)/(main_params extra_params)" lstm.c -void dlstm_objective_mayalias(int l, int c, int b, double const *main_params, +void dlstm_objective_mayalias(size_t l, size_t c, size_t b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double *state, double const *sequence, double *loss, From 8945897b90e54edd60b718e1108202a50868e7d4 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 7 Nov 2024 02:18:31 -0500 Subject: [PATCH 75/88] adjust gmm, but breaks tapenade --- enzyme/benchmarks/ReverseMode/adbench/gmm.h | 60 ++--- enzyme/benchmarks/ReverseMode/gmm/gmm.cpp | 238 +++++++++--------- enzyme/benchmarks/ReverseMode/gmm/gmm.h | 6 +- .../benchmarks/ReverseMode/gmm/gmm_mayalias.h | 10 +- 4 files changed, 155 insertions(+), 159 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/gmm.h b/enzyme/benchmarks/ReverseMode/adbench/gmm.h index 291b277a5c69..30e6eeaf44e0 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/gmm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/gmm.h @@ -18,7 +18,7 @@ using namespace std; using json = nlohmann::json; struct GMMInput { - int d, k, n; + size_t d, k, n; std::vector alphas, means, icf, x; Wishart wishart; }; @@ -33,54 +33,54 @@ struct GMMParameters { }; extern "C" { -void gmm_objective(int d, int k, int n, double const *alphas, +void gmm_objective(size_t d, size_t k, size_t n, double const *alphas, double const *means, double const *icf, double const *x, Wishart wishart, double *err); -void gmm_objective_restrict(int d, int k, int n, double const *alphas, +void gmm_objective_restrict(size_t d, size_t k, size_t n, double const *alphas, double const *means, double const *icf, double const *x, Wishart wishart, double *err); -void dgmm_objective_restrict(int d, int k, int n, const double *alphas, +void dgmm_objective_restrict(size_t d, size_t k, size_t n, const double *alphas, double *alphasb, const double *means, 
double *meansb, const double *icf, double *icfb, const double *x, Wishart wishart, double *err, double *errb); -void dgmm_objective(int d, int k, int n, const double *alphas, double *alphasb, +void dgmm_objective(size_t d, size_t k, size_t n, const double *alphas, double *alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart wishart, double *err, double *errb); -void gmm_objective_b(int d, int k, int n, const double *alphas, double *alphasb, +void gmm_objective_b(size_t d, size_t k, size_t n, const double *alphas, double *alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart wishart, double *err, double *errb); -void adept_dgmm_objective(int d, int k, int n, const double *alphas, +void adept_dgmm_objective(size_t d, size_t k, size_t n, const double *alphas, double *alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart wishart, double *err, double *errb); -void rust_unsafe_dgmm_objective(int d, int k, int n, const double *alphas, +void rust_unsafe_dgmm_objective(size_t d, size_t k, size_t n, const double *alphas, double *alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart &wishart, double *err, double *errb); -void rust_unsafe_gmm_objective(int d, int k, int n, const double *alphas, +void rust_unsafe_gmm_objective(size_t d, size_t k, size_t n, const double *alphas, const double *means, const double *icf, const double *x, Wishart &wishart, double *err); -void rust_dgmm_objective(int d, int k, int n, const double *alphas, +void rust_dgmm_objective(size_t d, size_t k, size_t n, const double *alphas, double *alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart &wishart, double *err, double *errb); -void rust_gmm_objective(int d, int k, int n, const double *alphas, +void rust_gmm_objective(size_t d, size_t k, size_t n, 
const double *alphas, const double *means, const double *icf, const double *x, Wishart &wishart, double *err); } void read_gmm_instance(const string& fn, - int* d, int* k, int* n, + size_t* d, size_t* k, size_t* n, vector& alphas, vector& means, vector& icf, @@ -95,32 +95,32 @@ void read_gmm_instance(const string& fn, exit(1); } - fscanf(fid, "%i %i %i", d, k, n); + fscanf(fid, "%zu %zu %zu", d, k, n); - int d_ = *d, k_ = *k, n_ = *n; + size_t d_ = *d, k_ = *k, n_ = *n; - int icf_sz = d_ * (d_ + 1) / 2; + size_t icf_sz = d_ * (d_ + 1) / 2; alphas.resize(k_); means.resize(d_ * k_); icf.resize(icf_sz * k_); x.resize(d_ * n_); - for (int i = 0; i < k_; i++) + for (size_t i = 0; i < k_; i++) { fscanf(fid, "%lf", &alphas[i]); } - for (int i = 0; i < k_; i++) + for (size_t i = 0; i < k_; i++) { - for (int j = 0; j < d_; j++) + for (size_t j = 0; j < d_; j++) { fscanf(fid, "%lf", &means[i * d_ + j]); } } - for (int i = 0; i < k_; i++) + for (size_t i = 0; i < k_; i++) { - for (int j = 0; j < icf_sz; j++) + for (size_t j = 0; j < icf_sz; j++) { fscanf(fid, "%lf", &icf[i * icf_sz + j]); } @@ -128,20 +128,20 @@ void read_gmm_instance(const string& fn, if (replicate_point) { - for (int j = 0; j < d_; j++) + for (size_t j = 0; j < d_; j++) { fscanf(fid, "%lf", &x[j]); } - for (int i = 0; i < n_; i++) + for (size_t i = 0; i < n_; i++) { memcpy(&x[i * d_], &x[0], d_ * sizeof(double)); } } else { - for (int i = 0; i < n_; i++) + for (size_t i = 0; i < n_; i++) { - for (int j = 0; j < d_; j++) + for (size_t j = 0; j < d_; j++) { fscanf(fid, "%lf", &x[i * d_ + j]); } @@ -234,7 +234,7 @@ int main(const int argc, const char* argv[]) { read_gmm_instance("data/" + path, &input.d, &input.k, &input.n, input.alphas, input.means, input.icf, input.x, input.wishart, params.replicate_point); - int Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; + size_t Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; struct GMMOutput result = { 0, std::vector(Jcols) }; @@ -264,7 +264,7 @@ int 
main(const int argc, const char* argv[]) { read_gmm_instance("data/" + path, &input.d, &input.k, &input.n, input.alphas, input.means, input.icf, input.x, input.wishart, params.replicate_point); - int Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; + size_t Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; struct GMMOutput result = { 0, std::vector(Jcols) }; @@ -297,7 +297,7 @@ int main(const int argc, const char* argv[]) { read_gmm_instance("data/" + path, &input.d, &input.k, &input.n, input.alphas, input.means, input.icf, input.x, input.wishart, params.replicate_point); - int Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; + size_t Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; struct GMMOutput result = { 0, std::vector(Jcols) }; @@ -327,7 +327,7 @@ int main(const int argc, const char* argv[]) { input.alphas, input.means, input.icf, input.x, input.wishart, params.replicate_point); - int Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; + size_t Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; struct GMMOutput result = {0, std::vector(Jcols)}; @@ -356,7 +356,7 @@ int main(const int argc, const char* argv[]) { read_gmm_instance("data/" + path, &input.d, &input.k, &input.n, input.alphas, input.means, input.icf, input.x, input.wishart, params.replicate_point); - int Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; + size_t Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; struct GMMOutput result = { 0, std::vector(Jcols) }; @@ -427,7 +427,7 @@ int main(const int argc, const char* argv[]) { input.alphas, input.means, input.icf, input.x, input.wishart, params.replicate_point); - int Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; + size_t Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; struct GMMOutput result = {0, std::vector(Jcols)}; diff --git a/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp b/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp index 37fa90574157..e0f86a9852c0 100644 --- 
a/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp +++ b/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp @@ -13,7 +13,7 @@ * typedef struct * { * double gamma; - * int m; + * size_t m; * } Wishart; * * After Tapenade CLI installing use the next command to generate a file: @@ -39,9 +39,9 @@ extern "C" { /* ==================================================================== */ // This throws error on n<1 -double arr_max(int n, double const* x) +double arr_max(size_t n, double const* x) { - int i; + size_t i; double m = x[0]; for (i = 1; i < n; i++) { @@ -57,9 +57,9 @@ double arr_max(int n, double const* x) // sum of component squares -double sqnorm(int n, double const* x) +double sqnorm(size_t n, double const* x) { - int i; + size_t i; double res = x[0] * x[0]; for (i = 1; i < n; i++) { @@ -73,13 +73,13 @@ double sqnorm(int n, double const* x) // out = a - b void subtract( - int d, + size_t d, double const* x, double const* y, double* out ) { - int id; + size_t id; for (id = 0; id < d; id++) { out[id] = x[id] - y[id]; @@ -87,9 +87,9 @@ void subtract( } -double log_sum_exp(int n, double const* x) +double log_sum_exp(size_t n, double const* x) { - int i; + size_t i; double mx = arr_max(n, x); double semx = 0.0; @@ -105,7 +105,7 @@ double log_sum_exp(int n, double const* x) __attribute__((const)) double log_gamma_distrib(double a, double p) { - int j; + size_t j; double out = 0.25 * p * (p - 1) * log(PI); for (j = 1; j <= p; j++) @@ -123,17 +123,17 @@ double log_gamma_distrib(double a, double p) /* ======================================================================== */ double log_wishart_prior( - int p, - int k, + size_t p, + size_t k, Wishart wishart, double const* sum_qs, double const* Qdiags, double const* icf ) { - int ik; - int n = p + wishart.m + 1; - int icf_sz = p * (p + 1) / 2; + size_t ik; + size_t n = p + wishart.m + 1; + size_t icf_sz = p * (p + 1) / 2; double C = n * p * (log(wishart.gamma) - 0.5 * log(2)) - log_gamma_distrib(0.5 * n, p); @@ -150,15 +150,15 @@ 
double log_wishart_prior( void preprocess_qs( - int d, - int k, + size_t d, + size_t k, double const* icf, double* sum_qs, double* Qdiags ) { - int ik, id; - int icf_sz = d * (d + 1) / 2; + size_t ik, id; + size_t icf_sz = d * (d + 1) / 2; for (ik = 0; ik < k; ik++) { sum_qs[ik] = 0.; @@ -174,14 +174,14 @@ void preprocess_qs( void Qtimesx( - int d, + size_t d, double const* Qdiag, double const* ltri, // strictly lower triangular part double const* x, double* out ) { - int i, j; + size_t i, j; for (i = 0; i < d; i++) { out[i] = Qdiag[i] * x[i]; @@ -189,10 +189,10 @@ void Qtimesx( //caching lparams as scev doesn't replicate index calculation // todo note changing to strengthened form - //int Lparamsidx = 0; + //size_t Lparamsidx = 0; for (i = 0; i < d; i++) { - int Lparamsidx = i*(2*d-i-1)/2; + size_t Lparamsidx = i*(2*d-i-1)/2; for (j = i + 1; j < d; j++) { // and this x @@ -202,16 +202,15 @@ void Qtimesx( } } -void gmm_objective_restrict(int d, int k, int n, +void gmm_objective_restrict(size_t d, size_t k, size_t n, double const *__restrict alphas, double const *__restrict means, double const *__restrict icf, double const *__restrict x, Wishart wishart, double *__restrict err) { -#define int int64_t - int ix, ik; + size_t ix, ik; const double CONSTANT = -n * d * 0.5 * log(2 * PI); - int icf_sz = d * (d + 1) / 2; + size_t icf_sz = d * (d + 1) / 2; double* Qdiags = (double*)malloc(d * k * sizeof(double)); double* sum_qs = (double*)malloc(k * sizeof(double)); @@ -248,7 +247,6 @@ void gmm_objective_restrict(int d, int k, int n, free(xcentered); free(Qxcentered); free(main_term); - #undef int } extern int enzyme_const; @@ -257,7 +255,7 @@ extern int enzyme_dupnoneed; void __enzyme_autodiff(...) 
noexcept; // * tapenade -b -o gmm_tapenade -head "gmm_objective(err)/(alphas means icf)" gmm.c -void dgmm_objective_restrict(int d, int k, int n, const double *alphas, +void dgmm_objective_restrict(size_t d, size_t k, size_t n, const double *alphas, double *alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart wishart, double *err, @@ -285,8 +283,8 @@ extern "C" { UTILS ==================================================================== */ // This throws error on n<1 -void arr_max_b(int n, const double *x, double *xb, double arr_maxb) { - int i; +void arr_max_b(size_t n, const double *x, double *xb, double arr_maxb) { + size_t i; double m = x[0]; double mb = 0.0; int branch; @@ -312,8 +310,8 @@ void arr_max_b(int n, const double *x, double *xb, double arr_maxb) { UTILS ==================================================================== */ // This throws error on n<1 -double arr_max_nodiff(int n, const double *x) { - int i; +double arr_max_nodiff(size_t n, const double *x) { + size_t i; double m = x[0]; for (i = 1; i < n; ++i) if (m < x[i]) @@ -328,8 +326,8 @@ double arr_max_nodiff(int n, const double *x) { Plus diff mem management of: x:in */ // sum of component squares -void sqnorm_b(int n, const double *x, double *xb, double sqnormb) { - int i; +void sqnorm_b(size_t n, const double *x, double *xb, double sqnormb) { + size_t i; double res = x[0]*x[0]; double resb = 0.0; double sqnorm; @@ -340,8 +338,8 @@ void sqnorm_b(int n, const double *x, double *xb, double sqnormb) { } // sum of component squares -double sqnorm_nodiff(int n, const double *x) { - int i; +double sqnorm_nodiff(size_t n, const double *x) { + size_t i; double res = x[0]*x[0]; for (i = 1; i < n; ++i) res = res + x[i]*x[i]; @@ -355,9 +353,9 @@ double sqnorm_nodiff(int n, const double *x) { Plus diff mem management of: out:in y:in */ // out = a - b -void subtract_b(int d, const double *x, const double *y, double *yb, double * +void subtract_b(size_t 
d, const double *x, const double *y, double *yb, double * out, double *outb) { - int id; + size_t id; for (id = d-1; id > -1; --id) { yb[id] = yb[id] - outb[id]; outb[id] = 0.0; @@ -365,8 +363,8 @@ void subtract_b(int d, const double *x, const double *y, double *yb, double * } // out = a - b -void subtract_nodiff(int d, const double *x, const double *y, double *out) { - int id; +void subtract_nodiff(size_t d, const double *x, const double *y, double *out) { + size_t id; for (id = 0; id < d; ++id) out[id] = x[id] - y[id]; } @@ -377,8 +375,8 @@ void subtract_nodiff(int d, const double *x, const double *y, double *out) { with respect to varying inputs: *x Plus diff mem management of: x:in */ -void log_sum_exp_b(int n, const double *x, double *xb, double log_sum_expb) { - int i; +void log_sum_exp_b(size_t n, const double *x, double *xb, double log_sum_expb) { + size_t i; double mx; double mxb; double tempb; @@ -398,8 +396,8 @@ void log_sum_exp_b(int n, const double *x, double *xb, double log_sum_expb) { arr_max_b(n, x, xb, mxb); } -double log_sum_exp_nodiff(int n, const double *x) { - int i; +double log_sum_exp_nodiff(size_t n, const double *x) { + size_t i; double mx; mx = arr_max_nodiff(n, x); double semx = 0.0; @@ -409,7 +407,7 @@ double log_sum_exp_nodiff(int n, const double *x) { } double log_gamma_distrib_nodiff(double a, double p) { - int j; + size_t j; /* TFIX */ double out = 0.25*p*(p-1)*log(PI); double arg1; @@ -431,12 +429,12 @@ double log_gamma_distrib_nodiff(double a, double p) { ======================================================================== MAIN LOGIC ======================================================================== */ -void log_wishart_prior_b(int p, int k, Wishart wishart, const double *sum_qs, +void log_wishart_prior_b(size_t p, size_t k, Wishart wishart, const double *sum_qs, double *sum_qsb, const double *Qdiags, double *Qdiagsb, const double * icf, double *icfb, double log_wishart_priorb) { - int ik; - int n = p + wishart.m + 1; - 
int icf_sz = p*(p+1)/2; + size_t ik; + size_t n = p + wishart.m + 1; + size_t icf_sz = p*(p+1)/2; double C; float arg1; double result1; @@ -446,7 +444,7 @@ void log_wishart_prior_b(int p, int k, Wishart wishart, const double *sum_qs, for (ik = 0; ik < k; ++ik) { double frobenius; double result1; - int arg1; + size_t arg1; double result2; } outb = log_wishart_priorb; @@ -461,7 +459,7 @@ void log_wishart_prior_b(int p, int k, Wishart wishart, const double *sum_qs, double frobeniusb; double result1; double result1b; - int arg1; + size_t arg1; double result2; double result2b; frobeniusb = wishart.gamma*wishart.gamma*0.5*outb; @@ -478,11 +476,11 @@ void log_wishart_prior_b(int p, int k, Wishart wishart, const double *sum_qs, /* ======================================================================== MAIN LOGIC ======================================================================== */ -double log_wishart_prior_nodiff(int p, int k, Wishart wishart, const double * +double log_wishart_prior_nodiff(size_t p, size_t k, Wishart wishart, const double * sum_qs, const double *Qdiags, const double *icf) { - int ik; - int n = p + wishart.m + 1; - int icf_sz = p*(p+1)/2; + size_t ik; + size_t n = p + wishart.m + 1; + size_t icf_sz = p*(p+1)/2; double C; float arg1; double result1; @@ -493,7 +491,7 @@ double log_wishart_prior_nodiff(int p, int k, Wishart wishart, const double * for (ik = 0; ik < k; ++ik) { double frobenius; double result1; - int arg1; + size_t arg1; double result2; result1 = sqnorm_nodiff(p, &(Qdiags[ik*p])); arg1 = icf_sz - p; @@ -511,10 +509,10 @@ double log_wishart_prior_nodiff(int p, int k, Wishart wishart, const double * with respect to varying inputs: *icf Plus diff mem management of: Qdiags:in sum_qs:in icf:in */ -void preprocess_qs_b(int d, int k, const double *icf, double *icfb, double * +void preprocess_qs_b(size_t d, size_t k, const double *icf, double *icfb, double * sum_qs, double *sum_qsb, double *Qdiags, double *Qdiagsb) { - int ik, id; - int icf_sz = 
d*(d+1)/2; + size_t ik, id; + size_t icf_sz = d*(d+1)/2; for (ik = 0; ik < k; ++ik) for (id = 0; id < d; ++id) { double q = icf[ik*icf_sz + id]; @@ -534,10 +532,10 @@ void preprocess_qs_b(int d, int k, const double *icf, double *icfb, double * } } -void preprocess_qs_nodiff(int d, int k, const double *icf, double *sum_qs, +void preprocess_qs_nodiff(size_t d, size_t k, const double *icf, double *sum_qs, double *Qdiags) { - int ik, id; - int icf_sz = d*(d+1)/2; + size_t ik, id; + size_t icf_sz = d*(d+1)/2; for (ik = 0; ik < k; ++ik) { sum_qs[ik] = 0.; for (id = 0; id < d; ++id) { @@ -554,13 +552,13 @@ void preprocess_qs_nodiff(int d, int k, const double *icf, double *sum_qs, with respect to varying inputs: *out *Qdiag *x *ltri Plus diff mem management of: out:in Qdiag:in x:in ltri:in */ -void Qtimesx_b(int d, const double *Qdiag, double *Qdiagb, const double *ltri, +void Qtimesx_b(size_t d, const double *Qdiag, double *Qdiagb, const double *ltri, double *ltrib, const double *x, double *xb, double *out, double *outb) { // strictly lower triangular part - int i, j; + size_t i, j; int adFrom; - int Lparamsidx = 0; + size_t Lparamsidx = 0; for (i = 0; i < d; ++i) { adFrom = i + 1; for (j = adFrom; j < d; ++j) @@ -582,13 +580,13 @@ void Qtimesx_b(int d, const double *Qdiag, double *Qdiagb, const double *ltri, } } -void Qtimesx_nodiff(int d, const double *Qdiag, const double *ltri, const +void Qtimesx_nodiff(size_t d, const double *Qdiag, const double *ltri, const double *x, double *out) { // strictly lower triangular part - int i, j; + size_t i, j; for (i = 0; i < d; ++i) out[i] = Qdiag[i]*x[i]; - int Lparamsidx = 0; + size_t Lparamsidx = 0; for (i = 0; i < d; ++i) for (j = i+1; j < d; ++j) { out[j] = out[j] + ltri[Lparamsidx]*x[i]; @@ -604,19 +602,19 @@ void Qtimesx_nodiff(int d, const double *Qdiag, const double *ltri, const *alphas:out Plus diff mem management of: err:in means:in icf:in alphas:in */ -void gmm_objective_b(int d, int k, int n, const double *alphas, 
double * +void gmm_objective_b(size_t d, size_t k, size_t n, const double *alphas, double * alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart wishart, double *err, double * errb) { - int ix, ik; + size_t ix, ik; /* TFIX */ const double CONSTANT = -n*d*0.5*log(2*PI); - int icf_sz = d*(d+1)/2; + size_t icf_sz = d*(d+1)/2; double *Qdiags; double *Qdiagsb; double result1; double result1b; - int ii1; + size_t ii1; Qdiagsb = (double *)malloc(d*k*sizeof(double)); for (ii1 = 0; ii1 < d*k; ++ii1) Qdiagsb[ii1] = 0.0; @@ -718,32 +716,32 @@ namespace adeptTest { // out = a - b template -void subtract(int d, +void subtract(size_t d, const T1* const x, const T2* const y, T3* out) { - for (int id = 0; id < d; id++) + for (size_t id = 0; id < d; id++) { out[id] = x[id] - y[id]; } } template -T sqnorm(int n, const T* const x) +T sqnorm(size_t n, const T* const x) { T res = x[0] * x[0]; - for (int i = 1; i < n; i++) + for (size_t i = 1; i < n; i++) res = res + x[i] * x[i]; return res; } // This throws error on n<1 template -T arr_max(int n, const T* const x) +T arr_max(size_t n, const T* const x) { T m = x[0]; - for (int i = 1; i < n; i++) + for (size_t i = 1; i < n; i++) { if (m < x[i]) m = x[i]; @@ -752,12 +750,12 @@ T arr_max(int n, const T* const x) } template -void gmm_objective(int d, int k, int n, const T* const alphas, const T* const means, +void gmm_objective(size_t d, size_t k, size_t n, const T* const alphas, const T* const means, const T* const icf, const double* const x, Wishart wishart, T* err); // split of the outer loop over points template -void gmm_objective_split_inner(int d, int k, +void gmm_objective_split_inner(size_t d, size_t k, const T* const alphas, const T* const means, const T* const icf, @@ -766,7 +764,7 @@ void gmm_objective_split_inner(int d, int k, T* err); // other terms which are outside the loop template -void gmm_objective_split_other(int d, int k, int n, +void gmm_objective_split_other(size_t 
d, size_t k, size_t n, const T* const alphas, const T* const means, const T* const icf, @@ -774,7 +772,7 @@ void gmm_objective_split_other(int d, int k, int n, T* err); template -T logsumexp(int n, const T* const x); +T logsumexp(size_t n, const T* const x); // p: dim // k: number of components @@ -783,20 +781,20 @@ T logsumexp(int n, const T* const x); // Qdiags: d*k // icf: (p*(p+1)/2)*k inverse covariance factors template -T log_wishart_prior(int p, int k, +T log_wishart_prior(size_t p, size_t k, Wishart wishart, const T* const sum_qs, const T* const Qdiags, const T* const icf); template -void preprocess_qs(int d, int k, +void preprocess_qs(size_t d, size_t k, const T* const icf, T* sum_qs, T* Qdiags); template -void Qtimesx(int d, +void Qtimesx(size_t d, const T* const Qdiag, const T* const ltri, // strictly lower triangular part const T* const x, @@ -807,11 +805,11 @@ void Qtimesx(int d, //////////////////////////////////////////////////////////// template -T logsumexp(int n, const T* const x) +T logsumexp(size_t n, const T* const x) { T mx = arr_max(n, x); T semx = 0.; - for (int i = 0; i < n; i++) + for (size_t i = 0; i < n; i++) { semx = semx + exp(x[i] - mx); } @@ -819,19 +817,19 @@ T logsumexp(int n, const T* const x) } template -T log_wishart_prior(int p, int k, +T log_wishart_prior(size_t p, size_t k, Wishart wishart, const T* const sum_qs, const T* const Qdiags, const T* const icf) { - int n = p + wishart.m + 1; - int icf_sz = p * (p + 1) / 2; + size_t n = p + wishart.m + 1; + size_t icf_sz = p * (p + 1) / 2; double C = n * p * (log(wishart.gamma) - 0.5 * log(2)) - log_gamma_distrib(0.5 * n, p); T out = 0; - for (int ik = 0; ik < k; ik++) + for (size_t ik = 0; ik < k; ik++) { T frobenius = sqnorm(p, &Qdiags[ik * p]) + sqnorm(icf_sz - p, &icf[ik * icf_sz + p]); out = out + 0.5 * wishart.gamma * wishart.gamma * (frobenius) @@ -842,16 +840,16 @@ T log_wishart_prior(int p, int k, } template -void preprocess_qs(int d, int k, +void preprocess_qs(size_t d, 
size_t k, const T* const icf, T* sum_qs, T* Qdiags) { - int icf_sz = d * (d + 1) / 2; - for (int ik = 0; ik < k; ik++) + size_t icf_sz = d * (d + 1) / 2; + for (size_t ik = 0; ik < k; ik++) { sum_qs[ik] = 0.; - for (int id = 0; id < d; id++) + for (size_t id = 0; id < d; id++) { T q = icf[ik * icf_sz + id]; sum_qs[ik] = sum_qs[ik] + q; @@ -861,19 +859,19 @@ void preprocess_qs(int d, int k, } template -void Qtimesx(int d, +void Qtimesx(size_t d, const T* const Qdiag, const T* const ltri, // strictly lower triangular part const T* const x, T* out) { - for (int id = 0; id < d; id++) + for (size_t id = 0; id < d; id++) out[id] = Qdiag[id] * x[id]; - int Lparamsidx = 0; - for (int i = 0; i < d; i++) + size_t Lparamsidx = 0; + for (size_t i = 0; i < d; i++) { - for (int j = i + 1; j < d; j++) + for (size_t j = i + 1; j < d; j++) { out[j] = out[j] + ltri[Lparamsidx] * x[i]; Lparamsidx++; @@ -882,7 +880,7 @@ void Qtimesx(int d, } template -void gmm_objective(int d, int k, int n, +void gmm_objective(size_t d, size_t k, size_t n, const T* const alphas, const T* const means, const T* const icf, @@ -891,7 +889,7 @@ void gmm_objective(int d, int k, int n, T* err) { const double CONSTANT = -n * d * 0.5 * log(2 * PI); - int icf_sz = d * (d + 1) / 2; + size_t icf_sz = d * (d + 1) / 2; vector Qdiags(d * k); vector sum_qs(k); @@ -902,9 +900,9 @@ void gmm_objective(int d, int k, int n, preprocess_qs(d, k, icf, &sum_qs[0], &Qdiags[0]); T slse = 0.; - for (int ix = 0; ix < n; ix++) + for (size_t ix = 0; ix < n; ix++) { - for (int ik = 0; ik < k; ik++) + for (size_t ik = 0; ik < k; ik++) { subtract(d, &x[ix * d], &means[ik * d], &xcentered[0]); Qtimesx(d, &Qdiags[ik * d], &icf[ik * icf_sz + d], &xcentered[0], &Qxcentered[0]); @@ -922,7 +920,7 @@ void gmm_objective(int d, int k, int n, } template -void gmm_objective_split_inner(int d, int k, +void gmm_objective_split_inner(size_t d, size_t k, const T* const alphas, const T* const means, const T* const icf, @@ -930,39 +928,39 @@ void 
gmm_objective_split_inner(int d, int k, Wishart wishart, T* err) { - int icf_sz = d * (d + 1) / 2; + size_t icf_sz = d * (d + 1) / 2; T* Ldiag = new T[d]; T* xcentered = new T[d]; T* mahal = new T[d]; T* lse = new T[k]; - for (int ik = 0; ik < k; ik++) + for (size_t ik = 0; ik < k; ik++) { - int icf_off = ik * icf_sz; + size_t icf_off = ik * icf_sz; T sumlog_Ldiag(0.); - for (int id = 0; id < d; id++) + for (size_t id = 0; id < d; id++) { sumlog_Ldiag = sumlog_Ldiag + icf[icf_off + id]; Ldiag[id] = exp(icf[icf_off + id]); } - for (int id = 0; id < d; id++) + for (size_t id = 0; id < d; id++) { xcentered[id] = x[id] - means[ik * d + id]; mahal[id] = Ldiag[id] * xcentered[id]; } - int Lparamsidx = d; - for (int i = 0; i < d; i++) + size_t Lparamsidx = d; + for (size_t i = 0; i < d; i++) { - for (int j = i + 1; j < d; j++) + for (size_t j = i + 1; j < d; j++) { mahal[j] = mahal[j] + icf[icf_off + Lparamsidx] * xcentered[i]; Lparamsidx++; } } T sqsum_mahal(0.); - for (int id = 0; id < d; id++) + for (size_t id = 0; id < d; id++) { sqsum_mahal = sqsum_mahal + mahal[id] * mahal[id]; } @@ -979,7 +977,7 @@ void gmm_objective_split_inner(int d, int k, } template -void gmm_objective_split_other(int d, int k, int n, +void gmm_objective_split_other(size_t d, size_t k, size_t n, const T* const alphas, const T* const means, const T* const icf, @@ -1000,14 +998,14 @@ void gmm_objective_split_other(int d, int k, int n, }; -void adept_dgmm_objective(int d, int k, int n, const double *alphas, double * +void adept_dgmm_objective(size_t d, size_t k, size_t n, const double *alphas, double * alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart wishart, double *err, double * errb) { - int icf_sz = d*(d + 1) / 2; - int Jrows = 1; - int Jcols = (k*(d + 1)*(d + 2)) / 2; + size_t icf_sz = d*(d + 1) / 2; + size_t Jrows = 1; + size_t Jcols = (k*(d + 1)*(d + 2)) / 2; adept::Stack stack; adouble *aalphas = new adouble[k]; diff --git 
a/enzyme/benchmarks/ReverseMode/gmm/gmm.h b/enzyme/benchmarks/ReverseMode/gmm/gmm.h index eb189afed44b..5dc5bc0b1edb 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/gmm.h +++ b/enzyme/benchmarks/ReverseMode/gmm/gmm.h @@ -28,9 +28,9 @@ extern "C" { // wishart: wishart distribution parameters // err: 1 output void gmm_objective( - int d, - int k, - int n, + size_t d, + size_t k, + size_t n, double const* alphas, double const* means, double const* icf, diff --git a/enzyme/benchmarks/ReverseMode/gmm/gmm_mayalias.h b/enzyme/benchmarks/ReverseMode/gmm/gmm_mayalias.h index 91e207fbcceb..e62794552067 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/gmm_mayalias.h +++ b/enzyme/benchmarks/ReverseMode/gmm/gmm_mayalias.h @@ -1,10 +1,9 @@ -void gmm_objective(int d, int k, int n, double const *alphas, +void gmm_objective(size_t d, size_t k, size_t n, double const *alphas, double const *means, double const *icf, double const *x, Wishart wishart, double *err) { -#define int int64_t - int ix, ik; + size_t ix, ik; const double CONSTANT = -n * d * 0.5 * log(2 * PI); - int icf_sz = d * (d + 1) / 2; + size_t icf_sz = d * (d + 1) / 2; double *Qdiags = (double *)malloc(d * k * sizeof(double)); double *sum_qs = (double *)malloc(k * sizeof(double)); @@ -41,11 +40,10 @@ void gmm_objective(int d, int k, int n, double const *alphas, free(xcentered); free(Qxcentered); free(main_term); -#undef int } // * tapenade -b -o gmm_tapenade -head "gmm_objective(err)/(alphas means icf)" gmm.c -void dgmm_objective(int d, int k, int n, const double *alphas, double * +void dgmm_objective(size_t d, size_t k, size_t n, const double *alphas, double * alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart wishart, double *err, double * errb) { From a73a1addbaf935dc11920f192f12e0b0b229d085 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Fri, 8 Nov 2024 22:07:58 -0500 Subject: [PATCH 76/88] Revert "move c++ lstm to also use size_t, since Rust uses usize" This 
reverts commit 3b174eadf936169ce9eced5879cf3055ae3d354f. --- enzyme/benchmarks/ReverseMode/adbench/lstm.h | 92 +++++++++---------- enzyme/benchmarks/ReverseMode/lstm/lstm.cpp | 88 +++++++++--------- enzyme/benchmarks/ReverseMode/lstm/lstm.h | 6 +- .../ReverseMode/lstm/lstm_mayalias.h | 20 ++-- 4 files changed, 103 insertions(+), 103 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/lstm.h b/enzyme/benchmarks/ReverseMode/adbench/lstm.h index 81cf98b68b8a..fda5f8e3a0f2 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/lstm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/lstm.h @@ -19,9 +19,9 @@ using json = nlohmann::json; struct LSTMInput { - size_t l; - size_t c; - size_t b; + int l; + int c; + int b; std::vector main_params; std::vector extra_params; std::vector state; @@ -34,60 +34,60 @@ struct LSTMOutput { }; extern "C" { -void rust_unsafe_dlstm_objective(size_t l, size_t c, size_t b, double const *main_params, +void rust_unsafe_dlstm_objective(int l, int c, int b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double *state, double const *sequence, double *loss, double *dloss); -void rust_unsafe_lstm_objective(size_t l, size_t c, size_t b, double const *main_params, +void rust_unsafe_lstm_objective(int l, int c, int b, double const *main_params, double const *extra_params, double *state, double const *sequence, double *loss); -void rust_safe_lstm_objective(size_t l, size_t c, size_t b, double const *main_params, +void rust_safe_lstm_objective(int l, int c, int b, double const *main_params, double const *extra_params, double *state, double const *sequence, double *loss); -void cxx_restrict_lstm_objective(size_t l, size_t c, size_t b, double const *main_params, +void cxx_restrict_lstm_objective(int l, int c, int b, double const *main_params, double const *extra_params, double *state, double const *sequence, double *loss); -void cxx_mayalias_lstm_objective(size_t l, size_t c, size_t b, double const 
*main_params, +void cxx_mayalias_lstm_objective(int l, int c, int b, double const *main_params, double const *extra_params, double *state, double const *sequence, double *loss); -void rust_safe_dlstm_objective(size_t l, size_t c, size_t b, double const *main_params, +void rust_safe_dlstm_objective(int l, int c, int b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double *state, double const *sequence, double *loss, double *dloss); -void dlstm_objective_mayalias(size_t l, size_t c, size_t b, double const *main_params, +void dlstm_objective_mayalias(int l, int c, int b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double *state, double const *sequence, double *loss, double *dloss); -void dlstm_objective_restrict(size_t l, size_t c, size_t b, double const *main_params, +void dlstm_objective_restrict(int l, int c, int b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double *state, double const *sequence, double *loss, double *dloss); -void lstm_objective_b(size_t l, size_t c, size_t b, const double *main_params, +void lstm_objective_b(int l, int c, int b, const double *main_params, double *main_paramsb, const double *extra_params, double *extra_paramsb, double *state, const double *sequence, double *loss, double *lossb); -void adept_dlstm_objective(size_t l, size_t c, size_t b, double const *main_params, +void adept_dlstm_objective(int l, int c, int b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double *state, double const *sequence, double *loss, double *dloss); } void read_lstm_instance(const string& fn, - size_t* l, size_t* c, size_t* b, + int* l, int* c, int* b, vector& main_params, vector& extra_params, vector& state, @@ -100,33 +100,33 @@ void read_lstm_instance(const string& fn, exit(1); } - fscanf(fid, "%zu %zu %zu", l, c, b); + fscanf(fid, "%i 
%i %i", l, c, b); - size_t l_ = *l, c_ = *c, b_ = *b; + int l_ = *l, c_ = *c, b_ = *b; - size_t main_sz = 2 * l_ * 4 * b_; - size_t extra_sz = 3 * b_; - size_t state_sz = 2 * l_ * b_; - size_t seq_sz = c_ * b_; + int main_sz = 2 * l_ * 4 * b_; + int extra_sz = 3 * b_; + int state_sz = 2 * l_ * b_; + int seq_sz = c_ * b_; main_params.resize(main_sz); extra_params.resize(extra_sz); state.resize(state_sz); sequence.resize(seq_sz); - for (size_t i = 0; i < main_sz; i++) { + for (int i = 0; i < main_sz; i++) { fscanf(fid, "%lf", &main_params[i]); } - for (size_t i = 0; i < extra_sz; i++) { + for (int i = 0; i < extra_sz; i++) { fscanf(fid, "%lf", &extra_params[i]); } - for (size_t i = 0; i < state_sz; i++) { + for (int i = 0; i < state_sz; i++) { fscanf(fid, "%lf", &state[i]); } - for (size_t i = 0; i < c_ * b_; i++) { + for (int i = 0; i < c_ * b_; i++) { fscanf(fid, "%lf", &sequence[i]); } @@ -134,12 +134,12 @@ void read_lstm_instance(const string& fn, fscanf(fid, "%c", &ch); fscanf(fid, "%c", &ch); - for (size_t i = 0; i < c_; i++) { + for (int i = 0; i < c_; i++) { unsigned char ch; fscanf(fid, "%c", &ch); - size_t cb = ch; - for (size_t j = b_ - 1; j >= 0; j--) { - size_t p = pow(2, j); + int cb = ch; + for (int j = b_ - 1; j >= 0; j--) { + int p = pow(2, j); if (cb >= p) { sequence[(i + 1) * b_ - j - 1] = 1; cb -= p; @@ -154,9 +154,9 @@ void read_lstm_instance(const string& fn, } typedef void(*deriv_t)( - size_t l, - size_t c, - size_t b, + int l, + int c, + int b, double const* main_params, double* dmain_params, double const* extra_params, @@ -170,7 +170,7 @@ typedef void(*deriv_t)( template void calculate_jacobian(struct LSTMInput &input, struct LSTMOutput &result) { - for(size_t i=0; i<100; i++) { + for(int i=0; i<100; i++) { double* main_params_gradient_part = result.gradient.data(); double* extra_params_gradient_part = result.gradient.data() + input.main_params.size(); @@ -198,7 +198,7 @@ void calculate_jacobian(struct LSTMInput &input, struct LSTMOutput 
&result) double calculate_mayalias_primal(struct LSTMInput &input) { double loss = 0.0; - for (size_t i = 0; i < 100; i++) { + for (int i = 0; i < 100; i++) { cxx_mayalias_lstm_objective( input.l, input.c, input.b, input.main_params.data(), input.extra_params.data(), input.state.data(), @@ -209,7 +209,7 @@ double calculate_mayalias_primal(struct LSTMInput &input) { double calculate_restrict_primal(struct LSTMInput &input) { double loss = 0.0; - for (size_t i = 0; i < 100; i++) { + for (int i = 0; i < 100; i++) { cxx_restrict_lstm_objective( input.l, input.c, input.b, input.main_params.data(), input.extra_params.data(), input.state.data(), @@ -220,7 +220,7 @@ double calculate_restrict_primal(struct LSTMInput &input) { double calculate_unsafe_primal(struct LSTMInput &input) { double loss = 0.0; - for (size_t i = 0; i < 100; i++) { + for (int i = 0; i < 100; i++) { rust_unsafe_lstm_objective( input.l, input.c, input.b, input.main_params.data(), input.extra_params.data(), input.state.data(), @@ -231,7 +231,7 @@ double calculate_unsafe_primal(struct LSTMInput &input) { double calculate_safe_primal(struct LSTMInput &input) { double loss = 0.0; - for (size_t i = 0; i < 100; i++) { + for (int i = 0; i < 100; i++) { rust_safe_lstm_objective(input.l, input.c, input.b, input.main_params.data(), input.extra_params.data(), input.state.data(), @@ -265,7 +265,7 @@ int main(const int argc, const char* argv[]) { } printf("\n"); - size_t Jcols = 8 * input.l * input.b + 3 * input.b; + int Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = { 0, std::vector(Jcols) }; { @@ -299,7 +299,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - size_t Jcols = 8 * input.l * input.b + 3 * input.b; + int Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = { 0, std::vector(Jcols) }; { @@ -332,7 +332,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - 
size_t Jcols = 8 * input.l * input.b + 3 * input.b; + int Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = { 0, std::vector(Jcols) }; { @@ -366,7 +366,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - size_t Jcols = 8 * input.l * input.b + 3 * input.b; + int Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = {0, std::vector(Jcols)}; { @@ -399,7 +399,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - size_t Jcols = 8 * input.l * input.b + 3 * input.b; + int Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = { 0, std::vector(Jcols) }; { @@ -434,7 +434,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - size_t Jcols = 8 * input.l * input.b + 3 * input.b; + int Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = {0, std::vector(Jcols)}; { @@ -467,7 +467,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - size_t Jcols = 8 * input.l * input.b + 3 * input.b; + int Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = {0, std::vector(Jcols)}; { @@ -500,7 +500,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - size_t Jcols = 8 * input.l * input.b + 3 * input.b; + int Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = {0, std::vector(Jcols)}; { @@ -533,7 +533,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - size_t Jcols = 8 * input.l * input.b + 3 * input.b; + int Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = {0, std::vector(Jcols)}; { @@ -566,7 +566,7 @@ int main(const int argc, const char* argv[]) { std::vector state = std::vector(input.state.size()); - size_t Jcols = 8 * input.l * input.b + 3 * 
input.b; + int Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = {0, std::vector(Jcols)}; { diff --git a/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp b/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp index 3a291f6302cd..ade0b2237510 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp +++ b/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp @@ -34,10 +34,10 @@ double sigmoid(double x) } // log(sum(exp(x), 2)) -double logsumexp(double const* vect, size_t sz) +double logsumexp(double const* vect, int sz) { double sum = 0.0; - size_t i; + int i; for (i = 0; i < sz; i++) { @@ -50,7 +50,7 @@ double logsumexp(double const* vect, size_t sz) // LSTM OBJECTIVE // The LSTM model -void lstm_model_restrict(size_t hsize, double const *__restrict weight, +void lstm_model_restrict(int hsize, double const *__restrict weight, double const *__restrict bias, double *__restrict hidden, double *__restrict cell, double const *__restrict input) { @@ -63,7 +63,7 @@ void lstm_model_restrict(size_t hsize, double const *__restrict weight, double* outgate = &(gates[2 * hsize]); double* change = &(gates[3 * hsize]); - size_t i; + int i; // caching input // hidden (needed) for (i = 0; i < hsize; i++) @@ -89,10 +89,10 @@ void lstm_model_restrict(size_t hsize, double const *__restrict weight, } // Predict LSTM output given an input -void lstm_predict_restrict(size_t l, size_t b, double const *__restrict w, +void lstm_predict_restrict(int l, int b, double const *__restrict w, double const *__restrict w2, double *__restrict s, double const *__restrict x, double *__restrict x2) { - size_t i; + int i; for (i = 0; i < b; i++) { x2[i] = x[i] * w2[i]; @@ -113,15 +113,15 @@ void lstm_predict_restrict(size_t l, size_t b, double const *__restrict w, } // LSTM objective (loss function) -void cxx_restrict_lstm_objective(size_t l, size_t c, size_t b, +void cxx_restrict_lstm_objective(int l, int c, int b, double const *__restrict main_params, double const *__restrict extra_params, double 
*__restrict state, double const *__restrict sequence, double *__restrict loss) { - size_t i, t; + int i, t; double total = 0.0; - size_t count = 0; + int count = 0; const double* input = &(sequence[0]); double* ypred = (double*)malloc(b * sizeof(double)); double* ynorm = (double*)malloc(b * sizeof(double)); @@ -162,7 +162,7 @@ void __enzyme_autodiff(...) noexcept; // * tapenade -b -o lstm_tapenade -head "lstm_objective(loss)/(main_params extra_params)" lstm.c -void dlstm_objective_restrict(size_t l, size_t c, size_t b, double const *main_params, +void dlstm_objective_restrict(int l, int c, int b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double *state, double const *sequence, double *loss, @@ -208,11 +208,11 @@ double sigmoid_nodiff(double x) { Plus diff mem management of: vect:in */ // log(sum(exp(x), 2)) -void logsumexp_b(const double *vect, double *vectb, size_t sz, double logsumexpb) +void logsumexp_b(const double *vect, double *vectb, int sz, double logsumexpb) { double sum = 0.0; double sumb = 0.0; - size_t i; + int i; double logsumexp; for (i = 0; i < sz; ++i) sum = sum + exp(vect[i]); @@ -223,9 +223,9 @@ void logsumexp_b(const double *vect, double *vectb, size_t sz, double logsumexpb } // log(sum(exp(x), 2)) -double logsumexp_nodiff(const double *vect, size_t sz) { +double logsumexp_nodiff(const double *vect, int sz) { double sum = 0.0; - size_t i; + int i; for (i = 0; i < sz; ++i) sum += exp(vect[i]); sum += 2; @@ -243,14 +243,14 @@ double logsumexp_nodiff(const double *vect, size_t sz) { */ // LSTM OBJECTIVE // The LSTM model -void lstm_model_b(size_t hsize, const double *weight, double *weightb, const +void lstm_model_b(int hsize, const double *weight, double *weightb, const double *bias, double *biasb, double *hidden, double *hiddenb, double * cell, double *cellb, const double *input, double *inputb) { double *gates; double *gatesb; double arg1; double arg1b; - size_t ii1; + int ii1; double temp; 
double tempb; gatesb = (double *)malloc(4*hsize*sizeof(double)); @@ -265,7 +265,7 @@ void lstm_model_b(size_t hsize, const double *weight, double *weightb, const double *outgateb = &(gatesb[2*hsize]); double *change = &(gates[3*hsize]); double *changeb = &(gatesb[3*hsize]); - size_t i; + int i; for (i = 0; i < hsize; ++i) { arg1 = input[i]*weight[i] + bias[i]; forget[i] = sigmoid_nodiff(arg1); @@ -325,7 +325,7 @@ void lstm_model_b(size_t hsize, const double *weight, double *weightb, const // LSTM OBJECTIVE // The LSTM model -void lstm_model_nodiff(size_t hsize, const double *weight, const double *bias, +void lstm_model_nodiff(int hsize, const double *weight, const double *bias, double *hidden, double *cell, const double *input) { double *gates; double arg1; @@ -334,7 +334,7 @@ void lstm_model_nodiff(size_t hsize, const double *weight, const double *bias, double *ingate = &(gates[hsize]); double *outgate = &(gates[2*hsize]); double *change = &(gates[3*hsize]); - size_t i; + int i; for (i = 0; i < hsize; ++i) { arg1 = input[i]*weight[i] + bias[i]; forget[i] = sigmoid_nodiff(arg1); @@ -358,10 +358,10 @@ void lstm_model_nodiff(size_t hsize, const double *weight, const double *bias, Plus diff mem management of: s:in w:in w2:in x2:in */ // Predict LSTM output given an input -void lstm_predict_b(size_t l, size_t b, const double *w, double *wb, const double * +void lstm_predict_b(int l, int b, const double *w, double *wb, const double * w2, double *w2b, double *s, double *sb, const double *x, double *x2, double *x2b) { - size_t i; + int i; double tmp; double tmpb; for (i = 0; i < b; ++i) { @@ -407,9 +407,9 @@ void lstm_predict_b(size_t l, size_t b, const double *w, double *wb, const doubl } // Predict LSTM output given an input -void lstm_predict_nodiff(size_t l, size_t b, const double *w, const double *w2, +void lstm_predict_nodiff(int l, int b, const double *w, const double *w2, double *s, const double *x, double *x2) { - size_t i; + int i; for (i = 0; i < b; ++i) x2[i] 
= x[i]*w2[i]; double *xp = x2; @@ -432,17 +432,17 @@ void lstm_predict_nodiff(size_t l, size_t b, const double *w, const double *w2, Plus diff mem management of: extra_params:in loss:in */ // LSTM objective (loss function) -void lstm_objective_b(size_t l, size_t c, size_t b, const double *main_params, double * +void lstm_objective_b(int l, int c, int b, const double *main_params, double * main_paramsb, const double *extra_params, double *extra_paramsb, double *state, const double *sequence, double *loss, double *lossb) { - size_t i, t; + int i, t; double total = 0.0; double totalb = 0.0; - size_t count = 0; + int count = 0; const double *input = &(sequence[0]); double *ypred; double *ypredb; - size_t ii1; + int ii1; int branch; double* stateb = (double*)malloc(2 * l * b * sizeof(double)); /* TFIX */ ypredb = (double *)malloc(b*sizeof(double)); @@ -528,9 +528,9 @@ T sigmoid(T x) { // log(sum(exp(x), 2)) template -T logsumexp(const T* vect, size_t sz) { +T logsumexp(const T* vect, int sz) { T sum = 0.0; - for (size_t i = 0; i < sz; ++i) + for (int i = 0; i < sz; ++i) sum += exp(vect[i]); sum += adouble(2); return log(sum); @@ -541,7 +541,7 @@ T logsumexp(const T* vect, size_t sz) { // The LSTM model template void lstm_model( - size_t hsize, + int hsize, T* weight, T* bias, T* hidden, @@ -556,7 +556,7 @@ void lstm_model( T* outgate = &(gates[2 * hsize]); T* change = &(gates[3 * hsize]); - size_t i; + int i; // caching input // hidden (needed) for (i = 0; i < hsize; i++) @@ -584,8 +584,8 @@ void lstm_model( // Predict LSTM output given an input template void lstm_predict( - size_t l, - size_t b, + int l, + int b, T* w, T* w2, T* s, @@ -593,7 +593,7 @@ void lstm_predict( T* x2 ) { - size_t i; + int i; for (i = 0; i < b; i++) { x2[i] = x[i] * w2[i]; @@ -615,9 +615,9 @@ void lstm_predict( // LSTM objective (loss function) template void lstm_objective( - size_t l, - size_t c, - size_t b, + int l, + int c, + int b, T * __restrict main_params, T * __restrict extra_params, T* 
__restrict state, @@ -625,9 +625,9 @@ void lstm_objective( T* __restrict loss ) { - size_t i, t; + int i, t; T total = 0.0; - size_t count = 0; + int count = 0; T* input = &(sequence[0]); T* ypred = new T[b]; T* ynorm = new T[b]; @@ -662,14 +662,14 @@ void lstm_objective( }; // Note ADBench did not have an adept impl -void adept_dlstm_objective(size_t l, size_t c, size_t b, const double *main_params, double * +void adept_dlstm_objective(int l, int c, int b, const double *main_params, double * main_paramsb, const double *extra_params, double *extra_paramsb, double *state, const double *sequence, double *loss, double *lossb) { - size_t main_sz = 2 * l * 4 * b; - size_t extra_sz = 3 * b; - size_t state_sz = 2 * l * b; - size_t seq_sz = c* b; + int main_sz = 2 * l * 4 * b; + int extra_sz = 3 * b; + int state_sz = 2 * l * b; + int seq_sz = c* b; adept::Stack stack; diff --git a/enzyme/benchmarks/ReverseMode/lstm/lstm.h b/enzyme/benchmarks/ReverseMode/lstm/lstm.h index b311b6ce24b4..61deba7de30d 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/lstm.h +++ b/enzyme/benchmarks/ReverseMode/lstm/lstm.h @@ -17,9 +17,9 @@ extern "C" { // state (2 * l * b) // sequence (c * b) void lstm_objective( - size_t l, - size_t c, - size_t b, + int l, + int c, + int b, double const* main_params, double const* extra_params, double* state, diff --git a/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h b/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h index 8031a46559a0..06401ff35a66 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h +++ b/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h @@ -34,10 +34,10 @@ extern "C" { //} // //// log(sum(exp(x), 2)) -// double logsumexp(double const* vect, size_t sz) +// double logsumexp(double const* vect, int sz) //{ // double sum = 0.0; -// size_t i; +// int i; // // for (i = 0; i < sz; i++) // { @@ -50,7 +50,7 @@ extern "C" { // LSTM OBJECTIVE // The LSTM model -void lstm_model(size_t hsize, double const *weight, double const *bias, 
+void lstm_model(int hsize, double const *weight, double const *bias, double *hidden, double *cell, double const *input) { // TODO NOTE THIS //__builtin_assume(hsize > 0); @@ -61,7 +61,7 @@ void lstm_model(size_t hsize, double const *weight, double const *bias, double *outgate = &(gates[2 * hsize]); double *change = &(gates[3 * hsize]); - size_t i; + int i; // caching input // hidden (needed) for (i = 0; i < hsize; i++) { @@ -85,9 +85,9 @@ void lstm_model(size_t hsize, double const *weight, double const *bias, } // Predict LSTM output given an input -void lstm_predict(size_t l, size_t b, double const *w, double const *w2, double *s, +void lstm_predict(int l, int b, double const *w, double const *w2, double *s, double const *x, double *x2) { - size_t i; + int i; for (i = 0; i < b; i++) { x2[i] = x[i] * w2[i]; } @@ -104,12 +104,12 @@ void lstm_predict(size_t l, size_t b, double const *w, double const *w2, double } // LSTM objective (loss function) -void cxx_mayalias_lstm_objective(size_t l, size_t c, size_t b, double const *main_params, +void cxx_mayalias_lstm_objective(int l, int c, int b, double const *main_params, double const *extra_params, double *state, double const *sequence, double *loss) { - size_t i, t; + int i, t; double total = 0.0; - size_t count = 0; + int count = 0; const double *input = &(sequence[0]); double *ypred = (double *)malloc(b * sizeof(double)); double *ynorm = (double *)malloc(b * sizeof(double)); @@ -146,7 +146,7 @@ void __enzyme_autodiff(...) 
noexcept; // * tapenade -b -o lstm_tapenade -head "lstm_objective(loss)/(main_params extra_params)" lstm.c -void dlstm_objective_mayalias(size_t l, size_t c, size_t b, double const *main_params, +void dlstm_objective_mayalias(int l, int c, int b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double *state, double const *sequence, double *loss, From dca82bba74d7c837a34da88d9051959c433d86fe Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sat, 9 Nov 2024 01:03:57 -0500 Subject: [PATCH 77/88] fix single gmm size for benchmarking --- enzyme/benchmarks/ReverseMode/adbench/gmm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/gmm.h b/enzyme/benchmarks/ReverseMode/adbench/gmm.h index 30e6eeaf44e0..a96d07f82869 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/gmm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/gmm.h @@ -211,9 +211,9 @@ int main(const int argc, const char* argv[]) { const auto replicate_point = (argc > 9 && string(argv[9]) == "-rep"); const GMMParameters params = { replicate_point }; - std::vector paths;// = { "1k/gmm_d10_K100.txt" }; + std::vector paths = { "10k/gmm_d10_K200.txt" }; - getTests(paths, "data/1k", "1k/"); + //getTests(paths, "data/1k", "1k/"); if (std::getenv("BENCH_LARGE")) { getTests(paths, "data/2.5k", "2.5k/"); getTests(paths, "data/10k", "10k/"); From f7c75a23acb552e3ded9175fb9ae3409681f205b Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sat, 9 Nov 2024 02:03:36 -0500 Subject: [PATCH 78/88] safe lstm without alias info --- enzyme/benchmarks/ReverseMode/adbench/lstm.h | 3 +- .../benchmarks/ReverseMode/lstm/Makefile.make | 2 +- enzyme/benchmarks/ReverseMode/lstm/src/lib.rs | 16 +++--- .../benchmarks/ReverseMode/lstm/src/safe.rs | 54 +++++++++---------- 4 files changed, 38 insertions(+), 37 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/lstm.h b/enzyme/benchmarks/ReverseMode/adbench/lstm.h index 
fda5f8e3a0f2..3d565bc3902d 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/lstm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/lstm.h @@ -243,7 +243,8 @@ double calculate_safe_primal(struct LSTMInput &input) { int main(const int argc, const char* argv[]) { printf("starting main\n"); - std::vector paths = { "lstm_l2_c1024.txt", "lstm_l4_c1024.txt", "lstm_l2_c4096.txt", "lstm_l4_c4096.txt" }; + //std::vector paths = { "lstm_l2_c1024.txt", "lstm_l4_c1024.txt", "lstm_l2_c4096.txt", "lstm_l4_c4096.txt" }; + std::vector paths = { "lstm_l4_c4096.txt" }; std::ofstream jsonfile("results.json", std::ofstream::trunc); json test_results; diff --git a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make index 129bf76a187a..1f2b20776058 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make @@ -9,7 +9,7 @@ clean: cargo +enzyme clean $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a: src/lib.rs Cargo.toml - cargo +enzyme rustc --release --lib --crate-type=staticlib + RUSTFLAGS="-Z mutable-noalias=no" cargo +enzyme rustc --release --lib --crate-type=staticlib lstm.o: lstm.cpp $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a clang++ $(LOADCLANG) $(BENCH) -O3 -fno-math-errno $^ $(BENCHLINK) -lm -o $@ diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs b/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs index 937460f3cee3..541bf5fbc357 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs @@ -13,15 +13,15 @@ pub extern "C" fn rust_unsafe_lstm_objective(l: i32, c: i32, b: i32, main_params unsafe {unsf::lstm_unsafe_objective(l,c,b,main_params,extra_params,state,sequence, loss);} } #[no_mangle] -pub extern "C" fn rust_safe_lstm_objective(l: i32, c: i32, b: i32, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { +pub extern "C" fn 
rust_safe_lstm_objective(l: i32, c: i32, b: i32, main_params: *mut f64, extra_params: *mut f64, state: *mut f64, sequence: *mut f64, loss: *mut f64) { let l = l as usize; let c = c as usize; let b = b as usize; let (main_params, extra_params, state, sequence) = unsafe {( - slice::from_raw_parts(main_params, 2*l*4*b), - slice::from_raw_parts(extra_params, 3*b), + slice::from_raw_parts_mut(main_params, 2*l*4*b), + slice::from_raw_parts_mut(extra_params, 3*b), slice::from_raw_parts_mut(state, 2*l*b), - slice::from_raw_parts(sequence, c*b) + slice::from_raw_parts_mut(sequence, c*b) )}; unsafe { @@ -37,17 +37,17 @@ pub extern "C" fn rust_unsafe_dlstm_objective(l: i32, c: i32, b: i32, main_param unsafe {unsf::d_lstm_unsafe_objective(l,c,b,main_params,d_main_params, extra_params,d_extra_params, state,sequence, res, d_res);} } #[no_mangle] -pub extern "C" fn rust_safe_dlstm_objective(l: i32, c: i32, b: i32, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { +pub extern "C" fn rust_safe_dlstm_objective(l: i32, c: i32, b: i32, main_params: *mut f64, d_main_params: *mut f64, extra_params: *mut f64, d_extra_params: *mut f64, state: *mut f64, sequence: *mut f64, res: *mut f64, d_res: *mut f64) { let l = l as usize; let c = c as usize; let b = b as usize; let (main_params, d_main_params, extra_params, d_extra_params, state, sequence) = unsafe {( - slice::from_raw_parts(main_params, 2*l*4*b), + slice::from_raw_parts_mut(main_params, 2*l*4*b), slice::from_raw_parts_mut(d_main_params, 2*l*4*b), - slice::from_raw_parts(extra_params, 3*b), + slice::from_raw_parts_mut(extra_params, 3*b), slice::from_raw_parts_mut(d_extra_params, 3*b), slice::from_raw_parts_mut(state, 2*l*b), - slice::from_raw_parts(sequence, c*b) + slice::from_raw_parts_mut(sequence, c*b) )}; unsafe { diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs 
b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs index d6847a4d5d72..7ab08905d52c 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs @@ -8,9 +8,9 @@ fn sigmoid(x: f64) -> f64 { // log(sum(exp(x), 2)) #[inline] -fn logsumexp(vect: &[f64]) -> f64 { +fn logsumexp(vect: &mut [f64]) -> f64 { let mut sum = 0.0; - for &val in vect { + for &mut val in vect { sum += val.exp(); } sum += 2.0; // Adding 2 to sum @@ -25,7 +25,7 @@ fn lstm_model( bias: &[f64], hidden: &mut [f64], cell: &mut [f64], - input: &[f64], + input: &mut [f64], ) { let mut gates = vec![0.0; 4 * hsize]; let gates = &mut gates[..4 * hsize]; @@ -59,10 +59,10 @@ fn lstm_model( fn lstm_predict( l: usize, b: usize, - w: &[f64], - w2: &[f64], + w: &mut [f64], + w2: &mut [f64], s: &mut [f64], - x: &[f64], + x: &mut [f64], x2: &mut [f64], ) { for i in 0..b { @@ -85,8 +85,8 @@ fn lstm_predict( lstm_model( b, - &w[i * 4..(i + b) * 4], - &w[(i + b) * 4..(i + 2 * b) * 4], + & w[i * 4..(i + b) * 4], + & w[(i + b) * 4..(i + 2 * b) * 4], s1, s2, xp, @@ -95,7 +95,7 @@ fn lstm_predict( i += 2 * b; } - let xp = &s[i - 2 * b..]; + let xp = &mut s[i - 2 * b..]; for i in 0..b { x2[i] = xp[i] * w2[b + i] + w2[2 * b + i]; @@ -119,15 +119,15 @@ pub(crate) fn lstm_objective( l: usize, c: usize, b: usize, - main_params: &[f64], - extra_params: &[f64], + main_params: &mut [f64], + extra_params: &mut [f64], state: &mut [f64], - sequence: &[f64], + sequence: &mut [f64], loss: &mut f64, ) { let mut total = 0.0; - let mut input = &sequence[..b]; + let mut input = &mut sequence[..b]; let mut ypred = vec![0.0; b]; let mut ynorm = vec![0.0; b]; @@ -137,12 +137,12 @@ pub(crate) fn lstm_objective( for j in 0..(c - 1) { let t = j * b; lstm_predict(l, b, main_params, extra_params, state, input, &mut ypred); - let lse = logsumexp(&ypred); + let lse = logsumexp(&mut ypred); for i in 0..b { ynorm[i] = ypred[i] - lse; } - let ygold = &sequence[t + b..]; + let ygold = &mut sequence[t + 
b..]; for i in 0..b { total += ygold[i] * ynorm[i]; } @@ -159,18 +159,18 @@ pub extern "C" fn rust_lstm_objective( l: usize, c: usize, b: usize, - main_params: *const f64, - extra_params: *const f64, + main_params: *mut f64, + extra_params: *mut f64, state: *mut f64, - sequence: *const f64, + sequence: *mut f64, loss: *mut f64, ) { let (main_params, extra_params, state, sequence) = unsafe { ( - slice::from_raw_parts(main_params, 2 * l * 4 * b), - slice::from_raw_parts(extra_params, 3 * b), + slice::from_raw_parts_mut(main_params, 2 * l * 4 * b), + slice::from_raw_parts_mut(extra_params, 3 * b), slice::from_raw_parts_mut(state, 2 * l * b), - slice::from_raw_parts(sequence, c * b), + slice::from_raw_parts_mut(sequence, c * b), ) }; @@ -193,23 +193,23 @@ pub extern "C" fn rust_dlstm_objective( l: usize, c: usize, b: usize, - main_params: *const f64, + main_params: *mut f64, d_main_params: *mut f64, - extra_params: *const f64, + extra_params: *mut f64, d_extra_params: *mut f64, state: *mut f64, - sequence: *const f64, + sequence: *mut f64, res: *mut f64, d_res: *mut f64, ) { let (main_params, d_main_params, extra_params, d_extra_params, state, sequence) = unsafe { ( - slice::from_raw_parts(main_params, 2 * l * 4 * b), + slice::from_raw_parts_mut(main_params, 2 * l * 4 * b), slice::from_raw_parts_mut(d_main_params, 2 * l * 4 * b), - slice::from_raw_parts(extra_params, 3 * b), + slice::from_raw_parts_mut(extra_params, 3 * b), slice::from_raw_parts_mut(d_extra_params, 3 * b), slice::from_raw_parts_mut(state, 2 * l * b), - slice::from_raw_parts(sequence, c * b), + slice::from_raw_parts_mut(sequence, c * b), ) }; From 06b135ffe318758b740a0a9722afc62e9dbcc71c Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sat, 9 Nov 2024 02:05:19 -0500 Subject: [PATCH 79/88] use new flag to keep const ptr/ref while still not emitting noalias --- enzyme/benchmarks/ReverseMode/lstm/src/lib.rs | 16 +++--- .../benchmarks/ReverseMode/lstm/src/safe.rs | 54 +++++++++---------- 2 files 
changed, 35 insertions(+), 35 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs b/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs index 541bf5fbc357..937460f3cee3 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs @@ -13,15 +13,15 @@ pub extern "C" fn rust_unsafe_lstm_objective(l: i32, c: i32, b: i32, main_params unsafe {unsf::lstm_unsafe_objective(l,c,b,main_params,extra_params,state,sequence, loss);} } #[no_mangle] -pub extern "C" fn rust_safe_lstm_objective(l: i32, c: i32, b: i32, main_params: *mut f64, extra_params: *mut f64, state: *mut f64, sequence: *mut f64, loss: *mut f64) { +pub extern "C" fn rust_safe_lstm_objective(l: i32, c: i32, b: i32, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { let l = l as usize; let c = c as usize; let b = b as usize; let (main_params, extra_params, state, sequence) = unsafe {( - slice::from_raw_parts_mut(main_params, 2*l*4*b), - slice::from_raw_parts_mut(extra_params, 3*b), + slice::from_raw_parts(main_params, 2*l*4*b), + slice::from_raw_parts(extra_params, 3*b), slice::from_raw_parts_mut(state, 2*l*b), - slice::from_raw_parts_mut(sequence, c*b) + slice::from_raw_parts(sequence, c*b) )}; unsafe { @@ -37,17 +37,17 @@ pub extern "C" fn rust_unsafe_dlstm_objective(l: i32, c: i32, b: i32, main_param unsafe {unsf::d_lstm_unsafe_objective(l,c,b,main_params,d_main_params, extra_params,d_extra_params, state,sequence, res, d_res);} } #[no_mangle] -pub extern "C" fn rust_safe_dlstm_objective(l: i32, c: i32, b: i32, main_params: *mut f64, d_main_params: *mut f64, extra_params: *mut f64, d_extra_params: *mut f64, state: *mut f64, sequence: *mut f64, res: *mut f64, d_res: *mut f64) { +pub extern "C" fn rust_safe_dlstm_objective(l: i32, c: i32, b: i32, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut 
f64, d_res: *mut f64) { let l = l as usize; let c = c as usize; let b = b as usize; let (main_params, d_main_params, extra_params, d_extra_params, state, sequence) = unsafe {( - slice::from_raw_parts_mut(main_params, 2*l*4*b), + slice::from_raw_parts(main_params, 2*l*4*b), slice::from_raw_parts_mut(d_main_params, 2*l*4*b), - slice::from_raw_parts_mut(extra_params, 3*b), + slice::from_raw_parts(extra_params, 3*b), slice::from_raw_parts_mut(d_extra_params, 3*b), slice::from_raw_parts_mut(state, 2*l*b), - slice::from_raw_parts_mut(sequence, c*b) + slice::from_raw_parts(sequence, c*b) )}; unsafe { diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs index 7ab08905d52c..d6847a4d5d72 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs @@ -8,9 +8,9 @@ fn sigmoid(x: f64) -> f64 { // log(sum(exp(x), 2)) #[inline] -fn logsumexp(vect: &mut [f64]) -> f64 { +fn logsumexp(vect: &[f64]) -> f64 { let mut sum = 0.0; - for &mut val in vect { + for &val in vect { sum += val.exp(); } sum += 2.0; // Adding 2 to sum @@ -25,7 +25,7 @@ fn lstm_model( bias: &[f64], hidden: &mut [f64], cell: &mut [f64], - input: &mut [f64], + input: &[f64], ) { let mut gates = vec![0.0; 4 * hsize]; let gates = &mut gates[..4 * hsize]; @@ -59,10 +59,10 @@ fn lstm_model( fn lstm_predict( l: usize, b: usize, - w: &mut [f64], - w2: &mut [f64], + w: &[f64], + w2: &[f64], s: &mut [f64], - x: &mut [f64], + x: &[f64], x2: &mut [f64], ) { for i in 0..b { @@ -85,8 +85,8 @@ fn lstm_predict( lstm_model( b, - & w[i * 4..(i + b) * 4], - & w[(i + b) * 4..(i + 2 * b) * 4], + &w[i * 4..(i + b) * 4], + &w[(i + b) * 4..(i + 2 * b) * 4], s1, s2, xp, @@ -95,7 +95,7 @@ fn lstm_predict( i += 2 * b; } - let xp = &mut s[i - 2 * b..]; + let xp = &s[i - 2 * b..]; for i in 0..b { x2[i] = xp[i] * w2[b + i] + w2[2 * b + i]; @@ -119,15 +119,15 @@ pub(crate) fn lstm_objective( l: usize, c: usize, b: usize, - main_params: 
&mut [f64], - extra_params: &mut [f64], + main_params: &[f64], + extra_params: &[f64], state: &mut [f64], - sequence: &mut [f64], + sequence: &[f64], loss: &mut f64, ) { let mut total = 0.0; - let mut input = &mut sequence[..b]; + let mut input = &sequence[..b]; let mut ypred = vec![0.0; b]; let mut ynorm = vec![0.0; b]; @@ -137,12 +137,12 @@ pub(crate) fn lstm_objective( for j in 0..(c - 1) { let t = j * b; lstm_predict(l, b, main_params, extra_params, state, input, &mut ypred); - let lse = logsumexp(&mut ypred); + let lse = logsumexp(&ypred); for i in 0..b { ynorm[i] = ypred[i] - lse; } - let ygold = &mut sequence[t + b..]; + let ygold = &sequence[t + b..]; for i in 0..b { total += ygold[i] * ynorm[i]; } @@ -159,18 +159,18 @@ pub extern "C" fn rust_lstm_objective( l: usize, c: usize, b: usize, - main_params: *mut f64, - extra_params: *mut f64, + main_params: *const f64, + extra_params: *const f64, state: *mut f64, - sequence: *mut f64, + sequence: *const f64, loss: *mut f64, ) { let (main_params, extra_params, state, sequence) = unsafe { ( - slice::from_raw_parts_mut(main_params, 2 * l * 4 * b), - slice::from_raw_parts_mut(extra_params, 3 * b), + slice::from_raw_parts(main_params, 2 * l * 4 * b), + slice::from_raw_parts(extra_params, 3 * b), slice::from_raw_parts_mut(state, 2 * l * b), - slice::from_raw_parts_mut(sequence, c * b), + slice::from_raw_parts(sequence, c * b), ) }; @@ -193,23 +193,23 @@ pub extern "C" fn rust_dlstm_objective( l: usize, c: usize, b: usize, - main_params: *mut f64, + main_params: *const f64, d_main_params: *mut f64, - extra_params: *mut f64, + extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, - sequence: *mut f64, + sequence: *const f64, res: *mut f64, d_res: *mut f64, ) { let (main_params, d_main_params, extra_params, d_extra_params, state, sequence) = unsafe { ( - slice::from_raw_parts_mut(main_params, 2 * l * 4 * b), + slice::from_raw_parts(main_params, 2 * l * 4 * b), slice::from_raw_parts_mut(d_main_params, 2 * l 
* 4 * b), - slice::from_raw_parts_mut(extra_params, 3 * b), + slice::from_raw_parts(extra_params, 3 * b), slice::from_raw_parts_mut(d_extra_params, 3 * b), slice::from_raw_parts_mut(state, 2 * l * b), - slice::from_raw_parts_mut(sequence, c * b), + slice::from_raw_parts(sequence, c * b), ) }; From 74a37849d59f267abd33b26434b43d573153d47e Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sat, 9 Nov 2024 02:49:03 -0500 Subject: [PATCH 80/88] make safe fft safe again --- enzyme/benchmarks/ReverseMode/fft/src/safe.rs | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs index a8b25fa7e443..a56f5356430a 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs @@ -14,8 +14,6 @@ fn bitreversal_perm(data: &mut [T]) { //data.swap(j, i); unsafe { data.swap_unchecked(j - 1, i - 1); - } - unsafe { data.swap_unchecked(j, i); } } @@ -48,23 +46,24 @@ fn radix2(data: &mut [f64], i_sign: i32) { let mut wi = 0.0; for i in (0..n).step_by(2) { - unsafe { - let tempr = b.get_unchecked(i) * wr - b.get_unchecked(i + 1) * wi; - let tempi = b.get_unchecked(i) * wi + b.get_unchecked(i + 1) * wr; - - *b.get_unchecked_mut(i) = a.get_unchecked(i) - tempr; - *b.get_unchecked_mut(i + 1) = a.get_unchecked(i + 1) - tempi; - *a.get_unchecked_mut(i) += tempr; - *a.get_unchecked_mut(i + 1) += tempi; - } - - //let tempr = b[i] * wr - b[i + 1] * wi; - //let tempi = b[i] * wi + b[i + 1] * wr; - - //b[i] = a[i] - tempr; - //b[i + 1] = a[i + 1] - tempi; - //a[i] += tempr; - //a[i + 1] += tempi; + + let tempr = b[i] * wr - b[i + 1] * wi; + let tempi = b[i] * wi + b[i + 1] * wr; + + b[i] = a[i] - tempr; + b[i + 1] = a[i + 1] - tempi; + a[i] += tempr; + a[i + 1] += tempi; + + //unsafe { + // let tempr = b.get_unchecked(i) * wr - b.get_unchecked(i + 1) * wi; + // let tempi = b.get_unchecked(i) * wi + b.get_unchecked(i + 1) * 
wr; + + // *b.get_unchecked_mut(i) = a.get_unchecked(i) - tempr; + // *b.get_unchecked_mut(i + 1) = a.get_unchecked(i + 1) - tempi; + // *a.get_unchecked_mut(i) += tempr; + // *a.get_unchecked_mut(i + 1) += tempi; + //} let wtemp_new = wr; wr += wr * wpr - wi * wpi; From ba7cf82d9e59e8820c5c36a96bcb8a748ffae51c Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Sat, 9 Nov 2024 08:31:03 -0700 Subject: [PATCH 81/88] bench fft: cleaner to use step_by(2) --- enzyme/benchmarks/ReverseMode/fft/src/safe.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs index a56f5356430a..9beba2faf327 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs @@ -6,8 +6,7 @@ fn bitreversal_perm(data: &mut [T]) { let len = data.len() / 2; let mut j = 1; - let mut i = 1; - while i < 2 * len { + for i in (1..data.len()).step_by(2) { if j > i { //dbg!(&i, &j); //data.swap(j-1, i-1); @@ -25,7 +24,6 @@ fn bitreversal_perm(data: &mut [T]) { } j += m; - i += 2; } } @@ -46,7 +44,6 @@ fn radix2(data: &mut [f64], i_sign: i32) { let mut wi = 0.0; for i in (0..n).step_by(2) { - let tempr = b[i] * wr - b[i + 1] * wi; let tempi = b[i] * wi + b[i + 1] * wr; From d5bd596f194e82383e91f0e0c9ecdf7c9cf555c0 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Tue, 12 Nov 2024 23:31:50 -0700 Subject: [PATCH 82/88] bench gmm: fix Tapenade --- enzyme/benchmarks/ReverseMode/gmm/gmm.cpp | 49 ++++++++++------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp b/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp index e0f86a9852c0..c5e94f3d7305 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp +++ b/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp @@ -208,9 +208,9 @@ void gmm_objective_restrict(size_t d, size_t k, size_t n, double const *__restrict icf, double const *__restrict x, Wishart wishart, double 
*__restrict err) { - size_t ix, ik; + int64_t ix, ik; const double CONSTANT = -n * d * 0.5 * log(2 * PI); - size_t icf_sz = d * (d + 1) / 2; + int64_t icf_sz = d * (d + 1) / 2; double* Qdiags = (double*)malloc(d * k * sizeof(double)); double* sum_qs = (double*)malloc(k * sizeof(double)); @@ -284,19 +284,18 @@ extern "C" { ==================================================================== */ // This throws error on n<1 void arr_max_b(size_t n, const double *x, double *xb, double arr_maxb) { - size_t i; double m = x[0]; double mb = 0.0; int branch; double arr_max; - for (i = 1; i < n; ++i) + for (int64_t i = 1; i < n; ++i) if (m < x[i]) { m = x[i]; pushControl1b(1); } else pushControl1b(0); mb = arr_maxb; - for (i = n-1; i > 0; --i) { + for (int64_t i = (int64_t)n-1; i > 0; --i) { popControl1b(&branch); if (branch != 0) { xb[i] = xb[i] + mb; @@ -327,12 +326,11 @@ double arr_max_nodiff(size_t n, const double *x) { */ // sum of component squares void sqnorm_b(size_t n, const double *x, double *xb, double sqnormb) { - size_t i; double res = x[0]*x[0]; double resb = 0.0; double sqnorm; resb = sqnormb; - for (i = n-1; i > 0; --i) + for (int64_t i = (int64_t)n-1; i > 0; --i) xb[i] = xb[i] + 2*x[i]*resb; xb[0] = xb[0] + 2*x[0]*resb; } @@ -355,8 +353,7 @@ double sqnorm_nodiff(size_t n, const double *x) { // out = a - b void subtract_b(size_t d, const double *x, const double *y, double *yb, double * out, double *outb) { - size_t id; - for (id = d-1; id > -1; --id) { + for (int64_t id = (int64_t)d-1; id > -1; --id) { yb[id] = yb[id] - outb[id]; outb[id] = 0.0; } @@ -376,7 +373,6 @@ void subtract_nodiff(size_t d, const double *x, const double *y, double *out) { Plus diff mem management of: x:in */ void log_sum_exp_b(size_t n, const double *x, double *xb, double log_sum_expb) { - size_t i; double mx; double mxb; double tempb; @@ -384,11 +380,11 @@ void log_sum_exp_b(size_t n, const double *x, double *xb, double log_sum_expb) { mx = arr_max_nodiff(n, x); double semx = 0.0; 
double semxb = 0.0; - for (i = 0; i < n; ++i) + for (int64_t i = 0; i < n; ++i) semx = semx + exp(x[i] - mx); semxb = log_sum_expb/semx; mxb = log_sum_expb; - for (i = n-1; i > -1; --i) { + for (int64_t i = (int64_t)n-1; i > -1; --i) { tempb = exp(x[i]-mx)*semxb; xb[i] = xb[i] + tempb; mxb = mxb - tempb; @@ -432,7 +428,7 @@ double log_gamma_distrib_nodiff(double a, double p) { void log_wishart_prior_b(size_t p, size_t k, Wishart wishart, const double *sum_qs, double *sum_qsb, const double *Qdiags, double *Qdiagsb, const double * icf, double *icfb, double log_wishart_priorb) { - size_t ik; + int64_t ik; size_t n = p + wishart.m + 1; size_t icf_sz = p*(p+1)/2; double C; @@ -454,7 +450,7 @@ void log_wishart_prior_b(size_t p, size_t k, Wishart wishart, const double *sum_ sum_qsb[ik] = 0.0; for (ik = 0; ik < k * icf_sz; ik++) /* TFIX */ icfb[ik] = 0.0; - for (ik = k-1; ik > -1; --ik) { + for (ik = (int64_t)k-1; ik > -1; --ik) { double frobenius; double frobeniusb; double result1; @@ -511,15 +507,15 @@ double log_wishart_prior_nodiff(size_t p, size_t k, Wishart wishart, const doubl */ void preprocess_qs_b(size_t d, size_t k, const double *icf, double *icfb, double * sum_qs, double *sum_qsb, double *Qdiags, double *Qdiagsb) { - size_t ik, id; + int64_t ik, id; size_t icf_sz = d*(d+1)/2; for (ik = 0; ik < k; ++ik) for (id = 0; id < d; ++id) { double q = icf[ik*icf_sz + id]; pushReal8(q); } - for (ik = k-1; ik > -1; --ik) { - for (id = d-1; id > -1; --id) { + for (ik = (int64_t)k-1; ik > -1; --ik) { + for (id = (int64_t)d-1; id > -1; --id) { double q; double qb = 0.0; popReal8(&q); @@ -534,11 +530,10 @@ void preprocess_qs_b(size_t d, size_t k, const double *icf, double *icfb, double void preprocess_qs_nodiff(size_t d, size_t k, const double *icf, double *sum_qs, double *Qdiags) { - size_t ik, id; size_t icf_sz = d*(d+1)/2; - for (ik = 0; ik < k; ++ik) { + for (size_t ik = 0; ik < k; ++ik) { sum_qs[ik] = 0.; - for (id = 0; id < d; ++id) { + for (size_t id = 0; id < d; ++id) 
{ double q = icf[ik*icf_sz + id]; sum_qs[ik] = sum_qs[ik] + q; Qdiags[ik*d + id] = exp(q); @@ -556,7 +551,7 @@ void Qtimesx_b(size_t d, const double *Qdiag, double *Qdiagb, const double *ltri double *ltrib, const double *x, double *xb, double *out, double *outb) { // strictly lower triangular part - size_t i, j; + int64_t i, j; int adFrom; size_t Lparamsidx = 0; for (i = 0; i < d; ++i) { @@ -565,15 +560,15 @@ void Qtimesx_b(size_t d, const double *Qdiag, double *Qdiagb, const double *ltri Lparamsidx++; pushInteger4(adFrom); } - for (i = d-1; i > -1; --i) { + for (i = (int64_t)d-1; i > -1; --i) { popInteger4(&adFrom); - for (j = d-1; j > adFrom-1; --j) { + for (j = (int64_t)d-1; j > adFrom-1; --j) { --Lparamsidx; ltrib[Lparamsidx] = ltrib[Lparamsidx] + x[i]*outb[j]; xb[i] = xb[i] + ltri[Lparamsidx]*outb[j]; } } - for (i = d-1; i > -1; --i) { + for (i = (int64_t)d-1; i > -1; --i) { Qdiagb[i] = Qdiagb[i] + x[i]*outb[i]; xb[i] = xb[i] + Qdiag[i]*outb[i]; outb[i] = 0.0; @@ -606,7 +601,7 @@ void gmm_objective_b(size_t d, size_t k, size_t n, const double *alphas, double alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart wishart, double *err, double * errb) { - size_t ix, ik; + int64_t ix, ik; /* TFIX */ const double CONSTANT = -n*d*0.5*log(2*PI); size_t icf_sz = d*(d+1)/2; @@ -670,10 +665,10 @@ void gmm_objective_b(size_t d, size_t k, size_t n, const double *alphas, double log_sum_exp_b(k, alphas, alphasb, lse_alphasb); for (ii1 = 0; ii1 < d * k; ii1++) /* TFIX */ meansb[ii1] = 0.0; - for (ix = n-1; ix > -1; --ix) { + for (ix = (int64_t)n-1; ix > -1; --ix) { result1b = slseb; log_sum_exp_b(k, &(main_term[0]), &(main_termb[0]), result1b); - for (ik = k-1; ik > -1; --ik) { + for (ik = (int64_t)k-1; ik > -1; --ik) { popReal8(&(main_term[ik])); alphasb[ik] = alphasb[ik] + main_termb[ik]; sum_qsb[ik] = sum_qsb[ik] + main_termb[ik]; From 4d78820c196481207855283d7b2604522bc4cc98 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: 
Wed, 13 Nov 2024 00:05:31 -0700 Subject: [PATCH 83/88] bench gmm: fix C++ primal (negative size_t) --- enzyme/benchmarks/ReverseMode/gmm/gmm.cpp | 10 +++++----- enzyme/benchmarks/ReverseMode/gmm/gmm_mayalias.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp b/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp index c5e94f3d7305..cb7e864eca48 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp +++ b/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp @@ -105,7 +105,7 @@ double log_sum_exp(size_t n, double const* x) __attribute__((const)) double log_gamma_distrib(double a, double p) { - size_t j; + int64_t j; double out = 0.25 * p * (p - 1) * log(PI); for (j = 1; j <= p; j++) @@ -209,7 +209,7 @@ void gmm_objective_restrict(size_t d, size_t k, size_t n, double const *__restrict x, Wishart wishart, double *__restrict err) { int64_t ix, ik; - const double CONSTANT = -n * d * 0.5 * log(2 * PI); + const double CONSTANT = -(double)n * d * 0.5 * log(2 * PI); int64_t icf_sz = d * (d + 1) / 2; double* Qdiags = (double*)malloc(d * k * sizeof(double)); @@ -603,7 +603,7 @@ void gmm_objective_b(size_t d, size_t k, size_t n, const double *alphas, double errb) { int64_t ix, ik; /* TFIX */ - const double CONSTANT = -n*d*0.5*log(2*PI); + const double CONSTANT = -(double)n*d*0.5*log(2*PI); size_t icf_sz = d*(d+1)/2; double *Qdiags; double *Qdiagsb; @@ -883,7 +883,7 @@ void gmm_objective(size_t d, size_t k, size_t n, Wishart wishart, T* err) { - const double CONSTANT = -n * d * 0.5 * log(2 * PI); + const double CONSTANT = -(double)n * d * 0.5 * log(2 * PI); size_t icf_sz = d * (d + 1) / 2; vector Qdiags(d * k); @@ -979,7 +979,7 @@ void gmm_objective_split_other(size_t d, size_t k, size_t n, Wishart wishart, T* err) { - const double CONSTANT = -n * d * 0.5 * log(2 * PI); + const double CONSTANT = -(double)n * d * 0.5 * log(2 * PI); T lse_alphas = logsumexp(k, alphas); diff --git a/enzyme/benchmarks/ReverseMode/gmm/gmm_mayalias.h 
b/enzyme/benchmarks/ReverseMode/gmm/gmm_mayalias.h index e62794552067..4bcba4fb0900 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/gmm_mayalias.h +++ b/enzyme/benchmarks/ReverseMode/gmm/gmm_mayalias.h @@ -2,7 +2,7 @@ void gmm_objective(size_t d, size_t k, size_t n, double const *alphas, double const *means, double const *icf, double const *x, Wishart wishart, double *err) { size_t ix, ik; - const double CONSTANT = -n * d * 0.5 * log(2 * PI); + const double CONSTANT = -(double)n * d * 0.5 * log(2 * PI); size_t icf_sz = d * (d + 1) / 2; double *Qdiags = (double *)malloc(d * k * sizeof(double)); From 13ef016be7a3aa650ef3598bb3bb2ca73499df55 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Fri, 1 Nov 2024 10:50:02 -0600 Subject: [PATCH 84/88] bench fft: avoid bounds checks in safe Rust This yields identical assembly apart from asserts at the top of safe::radix2. --- .../benchmarks/ReverseMode/fft/Makefile.make | 2 +- enzyme/benchmarks/ReverseMode/fft/fft.h | 32 +++++----- enzyme/benchmarks/ReverseMode/fft/src/lib.rs | 3 +- enzyme/benchmarks/ReverseMode/fft/src/safe.rs | 33 ++++------- enzyme/benchmarks/ReverseMode/fft/src/unsf.rs | 59 +++++++++++-------- 5 files changed, 65 insertions(+), 64 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/Makefile.make b/enzyme/benchmarks/ReverseMode/fft/Makefile.make index f0fb0d71f4d9..a8ff92267904 100644 --- a/enzyme/benchmarks/ReverseMode/fft/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/fft/Makefile.make @@ -9,7 +9,7 @@ clean: cargo +enzyme clean $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a: src/lib.rs Cargo.toml - cargo +enzyme rustc --release --lib --crate-type=staticlib + cargo +enzyme rustc --release --lib --crate-type=staticlib fft.o: fft.cpp $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a clang++ $(LOADCLANG) $(BENCH) -O3 -fno-math-errno $^ $(BENCHLINK) -lm -o $@ diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.h b/enzyme/benchmarks/ReverseMode/fft/fft.h index 71597d1887c3..4ee9ef0de8f0 
100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.h +++ b/enzyme/benchmarks/ReverseMode/fft/fft.h @@ -27,14 +27,14 @@ inline void swap(double *a, double *b) { *b = temp; } -static void recursiveApply(double *data, int iSign, size_t N) { +static void recursiveApply(double *data, size_t N, int iSign) { if (N == 1) return; - recursiveApply(data, iSign, N / 2); - recursiveApply(data + N, iSign, N / 2); + recursiveApply(data, N / 2, iSign); + recursiveApply(data + N, N / 2, iSign); double wtemp = iSign * sin(M_PI / N); - double wpi = -iSign * sin(2 * M_PI / N); + double wpi = -iSign * sin(2 * (M_PI / N)); double wpr = -2.0 * wtemp * wtemp; double wr = 1.0; double wi = 0.0; @@ -52,8 +52,8 @@ static void recursiveApply(double *data, int iSign, size_t N) { data[i + 1] += tempi; wtemp = wr; - wr += wr * wpr - wi * wpi; - wi += wi * wpr + wtemp * wpi; + wr = wr * (wpr + 1.) - wi * wpi; + wi = wi * (wpr + 1.) + wtemp * wpi; } } @@ -83,12 +83,12 @@ static void rescale(double *data, size_t N) { static void fft(double *data, size_t N) { scramble(data, N); - recursiveApply(data, 1, N); + recursiveApply(data, N, 1); } static void ifft(double *data, size_t N) { scramble(data, N); - recursiveApply(data, -1, N); + recursiveApply(data, N, -1); rescale(data, N); } @@ -99,14 +99,14 @@ inline void swapad(adept::ActiveReference a, b = temp; } -static void recursiveApply(aVector data, int iSign, size_t N) { +static void recursiveApply(aVector data, size_t N, int iSign) { if (N == 1) return; - recursiveApply(data, iSign, N / 2); - recursiveApply(data(adept::range(N, adept::end)), iSign, N / 2); + recursiveApply(data, N / 2, iSign); + recursiveApply(data(adept::range(N, adept::end)), N / 2, iSign); adouble wtemp = iSign * std::sin(M_PI / N); - adouble wpi = -iSign * std::sin(2 * M_PI / N); + adouble wpi = -iSign * std::sin(2 * (M_PI / N)); adouble wpr = -2.0 * wtemp * wtemp; adouble wr = 1.0; adouble wi = 0.0; @@ -124,8 +124,8 @@ static void recursiveApply(aVector data, int iSign, size_t N) 
{ data(i + 1) += tempi; wtemp = wr; - wr += wr * wpr - wi * wpi; - wi += wi * wpr + wtemp * wpi; + wr = wr * (wpr + 1.) - wi * wpi; + wi = wi * (wpr + 1.) + wtemp * wpi; } } @@ -155,12 +155,12 @@ static void rescale(aVector data, size_t N) { static void fft(aVector data, size_t N) { scramble(data, N); - recursiveApply(data, 1, N); + recursiveApply(data, N, 1); } static void ifft(aVector data, size_t N) { scramble(data, N); - recursiveApply(data, -1, N); + recursiveApply(data, N, -1); rescale(data, N); } diff --git a/enzyme/benchmarks/ReverseMode/fft/src/lib.rs b/enzyme/benchmarks/ReverseMode/fft/src/lib.rs index 84b16d077ac7..3b49cb61fd47 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/lib.rs @@ -1,7 +1,6 @@ #![feature(slice_swap_unchecked)] #![feature(autodiff)] +#![feature(slice_as_chunks)] pub mod safe; pub mod unsf; - - diff --git a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs index 9beba2faf327..8c8cb815c95a 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs @@ -34,37 +34,30 @@ fn radix2(data: &mut [f64], i_sign: i32) { } let (a, b) = data.split_at_mut(n); + // assert_eq!(a.len(), b.len()); radix2(a, i_sign); radix2(b, i_sign); let wtemp = i_sign as f64 * (PI / n as f64).sin(); - let wpi = -i_sign as f64 * (2.0 * PI / n as f64).sin(); + let wpi = -i_sign as f64 * (2.0 * (PI / n as f64)).sin(); let wpr = -2.0 * wtemp * wtemp; let mut wr = 1.0; let mut wi = 0.0; - for i in (0..n).step_by(2) { - let tempr = b[i] * wr - b[i + 1] * wi; - let tempi = b[i] * wi + b[i + 1] * wr; + let (achunks, _) = a.as_chunks_mut(); + let (bchunks, _) = b.as_chunks_mut(); + for ([ax, ay], [bx, by]) in achunks.iter_mut().zip(bchunks.iter_mut()) { + let tempr = *bx * wr - *by * wi; + let tempi = *bx * wi + *by * wr; - b[i] = a[i] - tempr; - b[i + 1] = a[i + 1] - tempi; - a[i] += tempr; - a[i + 1] += tempi; - - //unsafe { - 
// let tempr = b.get_unchecked(i) * wr - b.get_unchecked(i + 1) * wi; - // let tempi = b.get_unchecked(i) * wi + b.get_unchecked(i + 1) * wr; - - // *b.get_unchecked_mut(i) = a.get_unchecked(i) - tempr; - // *b.get_unchecked_mut(i + 1) = a.get_unchecked(i + 1) - tempi; - // *a.get_unchecked_mut(i) += tempr; - // *a.get_unchecked_mut(i + 1) += tempi; - //} + *bx = *ax - tempr; + *by = *ay - tempi; + *ax += tempr; + *ay += tempi; let wtemp_new = wr; - wr += wr * wpr - wi * wpi; - wi += wi * wpr + wtemp_new * wpi; + wr = wr * (wpr + 1.0) - wi * wpi; + wi = wi * (wpr + 1.0) + wtemp_new * wpi; } } diff --git a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs index dd3665860ff9..29c8ceb1187d 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs @@ -1,12 +1,12 @@ -use std::f64::consts::PI; use std::autodiff::autodiff; +use std::f64::consts::PI; unsafe fn bitreversal_perm(data: *mut f64, len: usize) { let mut j = 1; - for i in (1..2*len).step_by(2) { + for i in (1..2 * len).step_by(2) { if j > i { - std::ptr::swap(data.add(j-1), data.add(i-1)); + std::ptr::swap(data.add(j - 1), data.add(i - 1)); std::ptr::swap(data.add(j), data.add(i)); } @@ -20,64 +20,73 @@ unsafe fn bitreversal_perm(data: *mut f64, len: usize) { } } -unsafe fn radix2(data: *mut f64, i_sign: i32, n: usize) { - if n == 1 { return; } - radix2(data, i_sign, n/2); - radix2(data.add(n), i_sign, n/2); +unsafe fn radix2(data: *mut f64, n: usize, i_sign: i32) { + if n == 1 { + return; + } + radix2(data, n / 2, i_sign); + radix2(data.add(n), n / 2, i_sign); let wtemp = i_sign as f64 * (PI / n as f64).sin(); - let wpi = -i_sign as f64 * (2.0 * PI / n as f64).sin(); + let wpi = -i_sign as f64 * (2.0 * (PI / n as f64)).sin(); let wpr = -2.0 * wtemp * wtemp; let mut wr = 1.0; let mut wi = 0.0; for i in (0..n).step_by(2) { let in_n = i + n; - - let tempr = *data.add(in_n) * wr - *data.add(in_n + 1) * wi; - let tempi = 
*data.add(in_n) * wi + *data.add(in_n + 1) * wr; - - *data.add(in_n) = *data.add(i) - tempr; - *data.add(in_n + 1) = *data.add(i + 1) - tempi; - *data.add(i) += tempr; - *data.add(i + 1) += tempi; + let ax = &mut *data.add(i); + let ay = &mut *data.add(i + 1); + let bx = &mut *data.add(in_n); + let by = &mut *data.add(in_n + 1); + let tempr = *bx * wr - *by * wi; + let tempi = *bx * wi + *by * wr; + + *bx = *ax - tempr; + *by = *ay - tempi; + *ax += tempr; + *ay += tempi; let wtemp_new = wr; - wr += wr * wpr - wi * wpi; - wi += wi * wpr + wtemp_new * wpi; + wr = wr * (wpr + 1.0) - wi * wpi; + wi = wi * (wpr + 1.0) + wtemp_new * wpi; } } unsafe fn rescale(data: *mut f64, n: usize) { let scale = 1. / n as f64; - for i in 0..2*n { + for i in 0..2 * n { *data.add(i) = *data.add(i) * scale; } } unsafe fn fft(data: *mut f64, n: usize) { bitreversal_perm(data, n); - radix2(data, 1, n); + radix2(data, n, 1); } unsafe fn ifft(data: *mut f64, n: usize) { bitreversal_perm(data, n); - radix2(data, -1, n); + radix2(data, n, -1); rescale(data, n); } #[autodiff(unsafe_dfoobar, Reverse, Const, DuplicatedOnly)] pub unsafe fn unsafe_foobar(n: usize, data: *mut f64) { - fft(data, n ); - ifft(data, n ); + fft(data, n); + ifft(data, n); } #[no_mangle] pub extern "C" fn rust_unsafe_dfoobar(n: usize, data: *mut f64, ddata: *mut f64) { - unsafe {unsafe_dfoobar(n, data, ddata); } + unsafe { + unsafe_dfoobar(n, data, ddata); + } } #[no_mangle] pub extern "C" fn rust_unsafe_foobar(n: usize, data: *mut f64) { - unsafe {unsafe_foobar(n, data); } + unsafe { + unsafe_foobar(n, data); + } } From 9f67bd1ac08572c3c83086e28980484a006c6f09 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 14 Nov 2024 22:11:55 -0500 Subject: [PATCH 85/88] update fft to make benchmarking more reliable, make safe rust version fully safe --- enzyme/benchmarks/ReverseMode/fft/Makefile.make | 9 +++++++-- enzyme/benchmarks/ReverseMode/fft/fft.cpp | 6 +++++- enzyme/benchmarks/ReverseMode/fft/src/safe.rs | 12 
++++++------ 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/Makefile.make b/enzyme/benchmarks/ReverseMode/fft/Makefile.make index a8ff92267904..774f4565359b 100644 --- a/enzyme/benchmarks/ReverseMode/fft/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/fft/Makefile.make @@ -12,7 +12,12 @@ $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a: src/lib.rs Cargo.toml cargo +enzyme rustc --release --lib --crate-type=staticlib fft.o: fft.cpp $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a + clang++ $(LOADCLANG) $(BENCH) -DCPP=1 -O3 -fno-math-errno $^ $(BENCHLINK) -lm -o $@ + +fftr.o: fft.cpp $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a clang++ $(LOADCLANG) $(BENCH) -O3 -fno-math-errno $^ $(BENCHLINK) -lm -o $@ -results.json: fft.o - ./$^ 1048576 | tee $@ +results.json: fft.o fftr.o + numactl -C 1 ./fft.o 1048576 | tee results.json + numactl -C 1 ./fftr.o 1048576 | tee resultsr.json + diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.cpp b/enzyme/benchmarks/ReverseMode/fft/fft.cpp index 45efb4110d91..799b7b16c1b9 100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.cpp +++ b/enzyme/benchmarks/ReverseMode/fft/fft.cpp @@ -353,12 +353,16 @@ int main(int argc, char **argv) { } double inp = -2.1; - for (size_t iters = max(1, N >> 5); iters <= N; iters *= 2) { + size_t iters = max(1, N >> 0); + for (size_t i = 0; i < 5; i++) { printf("iters=%zu\n", iters); +#if CPP adept_sincos(inp, iters); tapenade_sincos(inp, iters); enzyme_sincos(inp, iters); +#else enzyme_rust_sincos(inp, iters); enzyme_unsafe_rust_sincos(inp, iters); +#endif } } diff --git a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs index 8c8cb815c95a..cbca5abb8484 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs @@ -9,12 +9,12 @@ fn bitreversal_perm(data: &mut [T]) { for i in (1..data.len()).step_by(2) { if j > i { //dbg!(&i, &j); - 
//data.swap(j-1, i-1); - //data.swap(j, i); - unsafe { - data.swap_unchecked(j - 1, i - 1); - data.swap_unchecked(j, i); - } + data.swap(j-1, i-1); + data.swap(j, i); + //unsafe { + // data.swap_unchecked(j - 1, i - 1); + // data.swap_unchecked(j, i); + //} } let mut m = len; From 88feab61d5281206c6d5c7ed09d77ab4a4d2ceae Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Fri, 15 Nov 2024 00:00:30 -0500 Subject: [PATCH 86/88] improve lstm output, makefiles --- enzyme/benchmarks/ReverseMode/adbench/gmm.h | 108 ++++++++++-------- enzyme/benchmarks/ReverseMode/adbench/lstm.h | 36 ++---- .../benchmarks/ReverseMode/gmm/Makefile.make | 2 +- .../benchmarks/ReverseMode/lstm/Makefile.make | 4 +- 4 files changed, 76 insertions(+), 74 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/gmm.h b/enzyme/benchmarks/ReverseMode/adbench/gmm.h index a96d07f82869..aa6176d04ec1 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/gmm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/gmm.h @@ -291,6 +291,7 @@ int main(const int argc, const char* argv[]) { //} } + for (size_t i = 0; i < 5; i++) { struct GMMInput input; @@ -349,6 +350,65 @@ int main(const int argc, const char* argv[]) { test_suite["tools"].push_back(enzyme); } } + + { + + struct GMMInput input; + read_gmm_instance("data/" + path, &input.d, &input.k, &input.n, + input.alphas, input.means, input.icf, input.x, + input.wishart, params.replicate_point); + + size_t Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; + + struct GMMOutput result = {0, std::vector(Jcols)}; + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme unsafe rust combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Rust unsafe Enzyme combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + 
enzyme["result"].push_back(result.gradient[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } + + for (size_t i = 0; i < 5; i++) + { + + struct GMMInput input; + read_gmm_instance("data/" + path, &input.d, &input.k, &input.n, + input.alphas, input.means, input.icf, input.x, + input.wishart, params.replicate_point); + + size_t Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; + + struct GMMOutput result = {0, std::vector(Jcols)}; + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme rust combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Rust Enzyme combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; + i < result.gradient.size(); i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } { @@ -401,36 +461,6 @@ int main(const int argc, const char* argv[]) { primal["result"].push_back(res); test_suite["tools"].push_back(primal); } - { - struct timeval start, end; - gettimeofday(&start, NULL); - calculate_jacobian(input, result); - gettimeofday(&end, NULL); - printf("Enzyme unsafe rust combined %0.6f\n", tdiff(&start, &end)); - json enzyme; - enzyme["name"] = "Rust unsafe Enzyme combined"; - enzyme["runtime"] = tdiff(&start, &end); - for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); - i++) { - printf("%f ", result.gradient[i]); - enzyme["result"].push_back(result.gradient[i]); - } - printf("\n"); - test_suite["tools"].push_back(enzyme); - } - } - - { - - struct GMMInput input; - read_gmm_instance("data/" + path, &input.d, &input.k, &input.n, - input.alphas, input.means, input.icf, input.x, - input.wishart, params.replicate_point); - - size_t Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; - - struct GMMOutput result = {0, std::vector(Jcols)}; - { 
struct timeval start, end; gettimeofday(&start, NULL); @@ -443,24 +473,8 @@ int main(const int argc, const char* argv[]) { primal["result"].push_back(res); test_suite["tools"].push_back(primal); } - { - struct timeval start, end; - gettimeofday(&start, NULL); - calculate_jacobian(input, result); - gettimeofday(&end, NULL); - printf("Enzyme rust combined %0.6f\n", tdiff(&start, &end)); - json enzyme; - enzyme["name"] = "Rust Enzyme combined"; - enzyme["runtime"] = tdiff(&start, &end); - for (unsigned i = result.gradient.size() - 5; - i < result.gradient.size(); i++) { - printf("%f ", result.gradient[i]); - enzyme["result"].push_back(result.gradient[i]); - } - printf("\n"); - test_suite["tools"].push_back(enzyme); - } } + test_suite["llvm-version"] = __clang_version__; test_suite["mode"] = "ReverseMode"; test_suite["batch-size"] = 1; diff --git a/enzyme/benchmarks/ReverseMode/adbench/lstm.h b/enzyme/benchmarks/ReverseMode/adbench/lstm.h index 3d565bc3902d..4f998418a938 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/lstm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/lstm.h @@ -474,17 +474,14 @@ int main(const int argc, const char* argv[]) { { struct timeval start, end; gettimeofday(&start, NULL); - calculate_mayalias_primal(input); + double res = calculate_mayalias_primal(input); gettimeofday(&end, NULL); printf("C++ mayalias primal %0.6f\n", tdiff(&start, &end)); json enzyme; enzyme["name"] = "C++ mayalias primal"; enzyme["runtime"] = tdiff(&start, &end); - for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); - i++) { - printf("%f ", result.gradient[i]); - enzyme["result"].push_back(result.gradient[i]); - } + printf("%f ", res); + enzyme["result"].push_back(res); test_suite["tools"].push_back(enzyme); printf("\n"); @@ -507,17 +504,14 @@ int main(const int argc, const char* argv[]) { { struct timeval start, end; gettimeofday(&start, NULL); - calculate_restrict_primal(input); + double res = calculate_restrict_primal(input); gettimeofday(&end, 
NULL); printf("C++ restrict primal %0.6f\n", tdiff(&start, &end)); json enzyme; enzyme["name"] = "C++ restrict primal"; enzyme["runtime"] = tdiff(&start, &end); - for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); - i++) { - printf("%f ", result.gradient[i]); - enzyme["result"].push_back(result.gradient[i]); - } + printf("%f ", res); + enzyme["result"].push_back(res); test_suite["tools"].push_back(enzyme); printf("\n"); @@ -540,17 +534,14 @@ int main(const int argc, const char* argv[]) { { struct timeval start, end; gettimeofday(&start, NULL); - calculate_unsafe_primal(input); + double res =calculate_unsafe_primal(input); gettimeofday(&end, NULL); printf("Enzyme (unsafe Rust) primal %0.6f\n", tdiff(&start, &end)); json enzyme; enzyme["name"] = "Enzyme (unsafe Rust) primal"; enzyme["runtime"] = tdiff(&start, &end); - for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); - i++) { - printf("%f ", result.gradient[i]); - enzyme["result"].push_back(result.gradient[i]); - } + printf("%f ", res); + enzyme["result"].push_back(res); test_suite["tools"].push_back(enzyme); printf("\n"); @@ -573,17 +564,14 @@ int main(const int argc, const char* argv[]) { { struct timeval start, end; gettimeofday(&start, NULL); - calculate_safe_primal(input); + double res = calculate_safe_primal(input); gettimeofday(&end, NULL); printf("Enzyme (safe Rust) primal %0.6f\n", tdiff(&start, &end)); json enzyme; enzyme["name"] = "Enzyme (safe Rust) primal"; enzyme["runtime"] = tdiff(&start, &end); - for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); - i++) { - printf("%f ", result.gradient[i]); - enzyme["result"].push_back(result.gradient[i]); - } + printf("%f ", res); + enzyme["result"].push_back(res); test_suite["tools"].push_back(enzyme); printf("\n"); diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index 049a9946a9a2..ddfd29798fb4 100644 --- 
a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -15,4 +15,4 @@ gmm.o: gmm.cpp $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a clang++ $(LOADCLANG) $(BENCH) -O3 -fno-math-errno $^ $(BENCHLINK) -lm -o $@ results.json: gmm.o - ./$^ + numactl -C 1 ./$^ diff --git a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make index 1f2b20776058..935bd3698948 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make @@ -9,10 +9,10 @@ clean: cargo +enzyme clean $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a: src/lib.rs Cargo.toml - RUSTFLAGS="-Z mutable-noalias=no" cargo +enzyme rustc --release --lib --crate-type=staticlib + RUSTFLAGS="-Z mutable-noalias=yes" cargo +enzyme rustc --release --lib --crate-type=staticlib lstm.o: lstm.cpp $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a clang++ $(LOADCLANG) $(BENCH) -O3 -fno-math-errno $^ $(BENCHLINK) -lm -o $@ results.json: lstm.o - ./$^ + numactl -C 1 ./$^ From 31fd712e84fe486d902e8e39f39bfdcc42392c20 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Thu, 14 Nov 2024 23:05:26 -0700 Subject: [PATCH 87/88] bench fft: closer C++ code to Rust and use -fno-plt Note that -mtune=cascadelake is only about tuning (like instruction timing) not arch/features. 
--- .../benchmarks/ReverseMode/fft/Makefile.make | 4 +-- enzyme/benchmarks/ReverseMode/fft/fft.h | 25 +++++++++++-------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/Makefile.make b/enzyme/benchmarks/ReverseMode/fft/Makefile.make index 774f4565359b..c3c17ec37797 100644 --- a/enzyme/benchmarks/ReverseMode/fft/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/fft/Makefile.make @@ -12,10 +12,10 @@ $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a: src/lib.rs Cargo.toml cargo +enzyme rustc --release --lib --crate-type=staticlib fft.o: fft.cpp $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a - clang++ $(LOADCLANG) $(BENCH) -DCPP=1 -O3 -fno-math-errno $^ $(BENCHLINK) -lm -o $@ + clang++ $(LOADCLANG) $(BENCH) -DCPP=1 -O3 -fno-math-errno -fno-plt -mtune=cascadelake -g $^ $(BENCHLINK) -lm -o $@ fftr.o: fft.cpp $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a - clang++ $(LOADCLANG) $(BENCH) -O3 -fno-math-errno $^ $(BENCHLINK) -lm -o $@ + clang++ $(LOADCLANG) $(BENCH) -O3 -fno-math-errno -fno-plt -mtune=cascadelake -g $^ $(BENCHLINK) -lm -o $@ results.json: fft.o fftr.o numactl -C 1 ./fft.o 1048576 | tee results.json diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.h b/enzyme/benchmarks/ReverseMode/fft/fft.h index 4ee9ef0de8f0..fad3c7dad145 100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.h +++ b/enzyme/benchmarks/ReverseMode/fft/fft.h @@ -27,7 +27,7 @@ inline void swap(double *a, double *b) { *b = temp; } -static void recursiveApply(double *data, size_t N, int iSign) { +static void recursiveApply(double *__restrict data, size_t N, int iSign) { if (N == 1) return; recursiveApply(data, N / 2, iSign); @@ -39,17 +39,20 @@ static void recursiveApply(double *data, size_t N, int iSign) { double wr = 1.0; double wi = 0.0; - for (size_t ii = 0; ii < N / 2; ii++) { - size_t i = 2 * ii; + for (size_t i = 0; i < N; i += 2) { size_t iN = i + N; - - double tempr = data[iN] * wr - data[iN + 1] * wi; - 
double tempi = data[iN] * wi + data[iN + 1] * wr; - - data[iN] = data[i] - tempr; - data[iN + 1] = data[i + 1] - tempi; - data[i] += tempr; - data[i + 1] += tempi; + double *__restrict ay = &data[i + 1]; + double *__restrict ax = &data[i]; + double *__restrict by = &data[iN + 1]; + double *__restrict bx = &data[iN]; + + double tempr = *bx * wr - *by * wi; + double tempi = *bx * wi + *by * wr; + + *bx = *ax - tempr; + *by = *ay - tempi; + *ax += tempr; + *ay += tempi; wtemp = wr; wr = wr * (wpr + 1.) - wi * wpi; From abf2a262e6bede0143101ae4f2f92592f145c474 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Fri, 15 Nov 2024 04:47:54 -0500 Subject: [PATCH 88/88] more numactl --- .../benchmarks/ReverseMode/ba/Makefile.make | 2 +- .../ReverseMode/ode-real/Makefile.make | 20 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/ba/Makefile.make b/enzyme/benchmarks/ReverseMode/ba/Makefile.make index c9cdbbc4c50b..cab55c1f159f 100644 --- a/enzyme/benchmarks/ReverseMode/ba/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/ba/Makefile.make @@ -15,4 +15,4 @@ ba.o: ba.cpp $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a clang++ $(LOADCLANG) $(BENCH) -O3 -fno-math-errno $^ $(BENCHLINK) -lm -o $@ results.json: ba.o - ./$^ + numactl -C 1 ./$^ diff --git a/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make b/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make index ab51a2a4671e..f12319d15210 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make @@ -15,13 +15,13 @@ ode.o: ode.cpp $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a clang++ $(LOADCLANG) $(BENCH) -O3 -fno-math-errno $^ $(BENCHLINK) -lm -o $@ results.json: ode.o - ./$^ 1000 | tee $@ - ./$^ 1000 >> $@ - ./$^ 1000 >> $@ - ./$^ 1000 >> $@ - ./$^ 1000 >> $@ - ./$^ 1000 >> $@ - ./$^ 1000 >> $@ - ./$^ 1000 >> $@ - ./$^ 1000 >> $@ - ./$^ 1000 >> $@ + numactl -C 1 ./$^ 
1000 | tee $@ + numactl -C 1 ./$^ 1000 >> $@ + numactl -C 1 ./$^ 1000 >> $@ + numactl -C 1 ./$^ 1000 >> $@ + numactl -C 1 ./$^ 1000 >> $@ + numactl -C 1 ./$^ 1000 >> $@ + numactl -C 1 ./$^ 1000 >> $@ + numactl -C 1 ./$^ 1000 >> $@ + numactl -C 1 ./$^ 1000 >> $@ + numactl -C 1 ./$^ 1000 >> $@