From 2648244626c1c965ee0b36b088504dfe57dfd4fa Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Wed, 28 Oct 2020 19:42:22 +1100 Subject: [PATCH 01/10] Add back in the old pen-and-paper multiplication implementation. --- src/field/daira.rs | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 src/field/daira.rs diff --git a/src/field/daira.rs b/src/field/daira.rs new file mode 100644 index 0000000..d021a8e --- /dev/null +++ b/src/field/daira.rs @@ -0,0 +1,45 @@ +#[inline] +#[unroll_for_loops] +fn mul_4_4(a: [u64; 4], b: [u64; 4]) -> [u64; 8] { + // Grade school multiplication. To avoid carrying at each of + // O(n^2) steps, we first add each intermediate product to a + // 128-bit accumulator, then propagate carries at the end. + let mut acc128 = [0u128; 4 + 4]; + + for i in 0..4 { + for j in 0..4 { + let a_i_b_j = a[i] as u128 * b[j] as u128; + // Add the less significant chunk to the less significant + // accumulator. + acc128[i + j] += a_i_b_j as u64 as u128; + // Add the more significant chunk to the more significant + // accumulator. + acc128[i + j + 1] += a_i_b_j >> 64; + } + } + + let mut acc = [0u64; 8]; + acc[0] = acc128[0] as u64; + let mut carry = false; + for i in 1..8 { + let last_chunk_big = (acc128[i - 1] >> 64) as u64; + let curr_chunk_small = acc128[i] as u64; + // Note that last_chunk_big won't get anywhere near 2^64, + // since it's essentially a carry from some additions in the + // previous phase, so we can add the carry bit to it without + // fear of overflow. + let result = curr_chunk_small.overflowing_add( + last_chunk_big + carry as u64); + acc[i] += result.0; + carry = result.1; + } + debug_assert!(!carry); + acc +} + +/// This modular arithmetic representation is based on Daira's +/// adaptation of the tricks for moduli of the form 2^b + c. +/// Source: https://hackmd.io/drzN-z-_So28zDLhK2tegw +pub trait DairaRepr { + +} From 0652e842bc2ce87301512a04c128bb20cc0978d0 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Wed, 28 Oct 2020 20:23:50 +1100 Subject: [PATCH 02/10] First draft of full pen-and-paper squaring. --- src/field/daira.rs | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/src/field/daira.rs b/src/field/daira.rs index d021a8e..3d1d2aa 100644 --- a/src/field/daira.rs +++ b/src/field/daira.rs @@ -1,4 +1,47 @@ +use unroll::unroll_for_loops; +use crate::{add_no_overflow, sub, cmp, mul2}; + +// FIXME: These functions are copypasta from monty.rs +#[inline] +fn mul_add_cy_in(a: u64, b: u64, c: u64, cy_in: u64) -> (u64, u64) { + let t = (a as u128) * (b as u128) + (c as u128) + (cy_in as u128); + ((t >> 64) as u64, t as u64) +} + #[inline] +fn mul_add(a: u64, b: u64, c: u64) -> (u64, u64) { + let t = (a as u128) * (b as u128) + (c as u128); + ((t >> 64) as u64, t as u64) +} + +#[unroll_for_loops] +fn sqr_4(a: [u64; 4]) -> [u64; 8] { + let mut res = [u64; 8]; + + // Calculate the off-diagonal part of the square + for i in 0 .. 4 { + let mut hi_in = 0u64; + for j in i+1 .. 4 { + let (hi_out, lo) = mul_add(a[j], a[i], hi_in); + res[i + j] = lo; + hi_in = hi_out; + } + res[i + 4] = hi_in; + } + res = mul2(res); // NB: Top and bottom words are zero + + // Calculate and add in the diagonal part + let mut hi_in = 0u64; + for i in 0 .. 4 { + let (hi_out, lo) = mul_add_cy_in(a[i], a[i], res[2*i], hi_in); + res[2*i] = lo; + let (t, cy) = res[2*i + 1].overflowing_add(hi_out); + res[2*i + 1] = t; + hi_in = cy as u64; + } + res +} + #[unroll_for_loops] fn mul_4_4(a: [u64; 4], b: [u64; 4]) -> [u64; 8] { // Grade school multiplication. To avoid carrying at each of From b1e5c15f113732875312b34712866aff1b579c54 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Mon, 18 Jan 2021 22:32:13 +1100 Subject: [PATCH 03/10] Full working version of Daira-reduction based arithmetic (needs tidying). --- src/field/daira.rs | 272 +++++++++++++++++++++++++++++++++-- src/field/field.rs | 25 +++- src/field/mod.rs | 2 + src/field/tweedledee_base.rs | 160 +++++++++------------ 4 files changed, 353 insertions(+), 106 deletions(-) diff --git a/src/field/daira.rs b/src/field/daira.rs index 3d1d2aa..c8f57f3 100644 --- a/src/field/daira.rs +++ b/src/field/daira.rs @@ -1,28 +1,39 @@ +use std::cmp::Ordering::{Less, Greater}; use unroll::unroll_for_loops; -use crate::{add_no_overflow, sub, cmp, mul2}; +use crate::{add_no_overflow, sub, cmp, nonzero_multiplicative_inverse}; // FIXME: These functions are copypasta from monty.rs -#[inline] +#[inline(always)] fn mul_add_cy_in(a: u64, b: u64, c: u64, cy_in: u64) -> (u64, u64) { let t = (a as u128) * (b as u128) + (c as u128) + (cy_in as u128); ((t >> 64) as u64, t as u64) } #[inline] -fn mul_add(a: u64, b: u64, c: u64) -> (u64, u64) { - let t = (a as u128) * (b as u128) + (c as u128); - ((t >> 64) as u64, t as u64) +#[unroll_for_loops] +fn mul2(x: [u64; 8]) -> [u64; 8] { + debug_assert_eq!(x[8-1] >> 63, 0, "Most significant bit should be clear"); + + let mut result = [0; 8]; + result[0] = x[0] << 1; + for i in 1..8 { + result[i] = x[i] << 1 | x[i - 1] >> 63; + } + result } +#[inline] #[unroll_for_loops] fn sqr_4(a: [u64; 4]) -> [u64; 8] { - let mut res = [u64; 8]; + let mut res = [0u64; 8]; // Calculate the off-diagonal part of the square + // TODO: Note that res is all zeros on the first itertion, so no + // need to add it for i in 0 .. 4 { let mut hi_in = 0u64; for j in i+1 .. 4 { - let (hi_out, lo) = mul_add(a[j], a[i], hi_in); + let (hi_out, lo) = mul_add_cy_in(a[j], a[i], res[i + j], hi_in); res[i + j] = lo; hi_in = hi_out; } @@ -39,9 +50,11 @@ fn sqr_4(a: [u64; 4]) -> [u64; 8] { res[2*i + 1] = t; hi_in = cy as u64; } + debug_assert_eq!(hi_in, 0, "Unexpected carry detected"); res } +#[inline] #[unroll_for_loops] fn mul_4_4(a: [u64; 4], b: [u64; 4]) -> [u64; 8] { // Grade school multiplication. To avoid carrying at each of @@ -80,9 +93,252 @@ fn mul_4_4(a: [u64; 4], b: [u64; 4]) -> [u64; 8] { acc } +#[inline] +#[unroll_for_loops] +fn mul_2_2(a: [u64; 2], b: [u64; 2]) -> [u64; 4] { + // Grade school multiplication. To avoid carrying at each of + // O(n^2) steps, we first add each intermediate product to a + // 128-bit accumulator, then propagate carries at the end. + let mut acc128 = [0u128; 2 + 2]; + + for i in 0..2 { + for j in 0..2 { + let a_i_b_j = a[i] as u128 * b[j] as u128; + // Add the less significant chunk to the less significant + // accumulator. + acc128[i + j] += a_i_b_j as u64 as u128; + // Add the more significant chunk to the more significant + // accumulator. + acc128[i + j + 1] += a_i_b_j >> 64; + } + } + + let mut acc = [0u64; 4]; + acc[0] = acc128[0] as u64; + let mut carry = false; + for i in 1..4 { + let last_chunk_big = (acc128[i - 1] >> 64) as u64; + let curr_chunk_small = acc128[i] as u64; + // Note that last_chunk_big won't get anywhere near 2^64, + // since it's essentially a carry from some additions in the + // previous phase, so we can add the carry bit to it without + // fear of overflow. + let result = curr_chunk_small.overflowing_add( + last_chunk_big + carry as u64); + acc[i] += result.0; + carry = result.1; + } + debug_assert!(!carry); + acc +} + +#[inline] +#[unroll_for_loops] +fn mul_2_4(a: [u64; 2], b: [u64; 4]) -> [u64; 6] { + // Grade school multiplication. To avoid carrying at each of + // O(n^2) steps, we first add each intermediate product to a + // 128-bit accumulator, then propagate carries at the end. + let mut acc128 = [0u128; 2 + 4]; + + for i in 0..2 { + for j in 0..4 { + let a_i_b_j = a[i] as u128 * b[j] as u128; + // Add the less significant chunk to the less significant + // accumulator. + acc128[i + j] += a_i_b_j as u64 as u128; + // Add the more significant chunk to the more significant + // accumulator. + acc128[i + j + 1] += a_i_b_j >> 64; + } + } + + let mut acc = [0u64; 6]; + acc[0] = acc128[0] as u64; + let mut carry = false; + for i in 1..6 { + let last_chunk_big = (acc128[i - 1] >> 64) as u64; + let curr_chunk_small = acc128[i] as u64; + // Note that last_chunk_big won't get anywhere near 2^64, + // since it's essentially a carry from some additions in the + // previous phase, so we can add the carry bit to it without + // fear of overflow. + let result = curr_chunk_small.overflowing_add( + last_chunk_big + carry as u64); + acc[i] += result.0; + carry = result.1; + } + debug_assert!(!carry); + acc +} + +#[inline] +fn add_6_4(a: [u64; 6], b: [u64; 4]) -> [u64; 6] { + // TODO: This is slightly wasteful, since we know the two high + // words are zero. + let c = [b[0], b[1], b[2], b[3], 0, 0]; + add_no_overflow(a, c) +} + +/// Given x = sum_{i=0}^7 xi (2^64)^i, with x < 2^512, return +/// y1,y2,y3,y4,z1,z2,z3,z4 such that x = y + z * 2^254 + w * (2^254)^2 +/// where y = sum_{i=0}^3 yi (2^64)^i and z = sum_{i=0}^3 zi (2^64)^i +/// are both < 2^254, and w < 16 +#[inline] +fn rebase_8(x: [u64; 8]) -> ([u64; 4], [u64; 4], u64) { + const MASK: u64 = (1u64 << 62) - 1u64; // 2^62-1 + + // bottom half is the same, except the top two bits are dropped + let y = [x[0], x[1], x[2], x[3] & MASK]; + + // shift the top half words up by two bits + let z = [((x[4] << 2) | (x[3] >> 62)), + ((x[5] << 2) | (x[4] >> 62)), + ((x[6] << 2) | (x[5] >> 62)), + ((x[7] << 2) | (x[6] >> 62)) & MASK]; + + // save the very top two bits in w + let w = x[7] >> 60; + + (y, z, w) +} + +#[inline] +fn rebase_6(x: [u64; 6]) -> ([u64; 4], [u64; 2]) { + const MASK: u64 = (1u64 << 62) - 1u64; // 2^62-1 + + debug_assert_eq!(x[5] >> 62, 0, "highest word of x is too big"); + + // bottom half is the same, except the top two bits are dropped + let y = [x[0], x[1], x[2], x[3] & MASK]; + + // shift the top words up by two bits + let z = [((x[4] << 2) | (x[3] >> 62)), + ((x[5] << 2) | (x[4] >> 62))]; + + (y, z) +} + /// This modular arithmetic representation is based on Daira's -/// adaptation of the tricks for moduli of the form 2^b + c. +/// adaptation of the tricks for moduli of the form 2^n + C. /// Source: https://hackmd.io/drzN-z-_So28zDLhK2tegw pub trait DairaRepr { + /// Order of the field (i.e. modulus for operations) + const ORDER: [u64; 4]; + + const ZERO: [u64; 4] = [0u64; 4]; + const ONE: [u64; 4] = [1u64, 0u64, 0u64, 0u64]; + + /// The C in 2^n + C. + const C: [u64; 2]; + + /// The value of x*C^2 for x in 1..16 + const C_SQR_TBL: [[u64; 4]; 15]; + + const K_M: [u64; 6]; + + // TODO: This is copypasta from monty.rs + // TODO: Daira's representation actually allows for some redundancy, + // so reducing is not always necessary. + fn daira_add(lhs: [u64; 4], rhs: [u64; 4]) -> [u64; 4] { + let sum = add_no_overflow(lhs, rhs); + let res = if cmp(sum, Self::ORDER) == Less { + sum + } else { + sub(sum, Self::ORDER) + }; + Self::daira_to_canonical(res) + } + + // TODO: This is copypasta from monty.rs + fn daira_sub(lhs: [u64; 4], rhs: [u64; 4]) -> [u64; 4] { + let res = if cmp(lhs, rhs) == Less { + // Underflow occurs, so we compute the difference as `self + (-rhs)`. + add_no_overflow(lhs, Self::daira_neg(rhs)) + } else { + // No underflow, so it's faster to subtract directly. + sub(lhs, rhs) + }; + Self::daira_to_canonical(res) + } + + // TODO: This is copypasta from monty.rs + fn daira_neg(limbs: [u64; 4]) -> [u64; 4] { + let res = if limbs == Self::ZERO { + Self::ZERO + } else { + sub(Self::ORDER, limbs) + }; + Self::daira_to_canonical(res) + } + + #[inline] + fn _reduce(x: [u64; 8]) -> [u64; 4] { + // x = (x0, x1, x2) + let (x0, x1, x2) = rebase_8(x); + // s = C * x1 + let s = mul_2_4(Self::C, x1); + // t = C^2 * x2 + x0 + let t = if x2 == 0 { + x0 + } else { + add_no_overflow(Self::C_SQR_TBL[(x2 - 1) as usize], x0) + }; + + // xp = kM - s + t + let xp = add_6_4(sub(Self::K_M, s), t); + + // xp = (xp0, xp1) + let (xp0, xp1) = rebase_6(xp); + // u = C * xp1 + let u = mul_2_2(Self::C, xp1); + // return M - u + xp0 + //Self::daira_to_canonical(add_no_overflow(sub(Self::ORDER, u), xp0)) + let res = add_no_overflow(sub(Self::ORDER, u), xp0); + let max_expected = [Self::ORDER[0] - 1, Self::ORDER[1], Self::ORDER[2], Self::ORDER[3] << 1]; + debug_assert!(cmp(res, max_expected) != Greater); + Self::daira_to_canonical(res) + } + + #[inline] + fn daira_multiply(a: [u64; 4], b: [u64; 4]) -> [u64; 4] { + Self::_reduce(mul_4_4(a, b)) + } + + #[inline] + fn daira_square(a: [u64; 4]) -> [u64; 4] { + Self::_reduce(sqr_4(a)) + } + + fn daira_inverse(a: [u64; 4]) -> [u64; 4] { + nonzero_multiplicative_inverse(a, Self::ORDER) + } + + fn daira_to_canonical(a: [u64; 4]) -> [u64; 4] { + let mut b = a; + let mut count = 0; + while cmp(b, Self::ORDER) != Less && count < 10 { + b = sub(b, Self::ORDER); + count += 1; + } + //println!("Reducing ({}) {:?} -> {:?}", count, a, b); + /* + if count > 1 { + println!("Expected at most one reduction loop but did {}", count); + } + */ + debug_assert!(count < 2, "Expected at most one reduction loop"); + b + } + + /* + fn daira_to_canonical(a: [u64; 4]) -> [u64; 4] { + let mut b = a; + if cmp(b, Self::ORDER) != Less { + b = sub(b, Self::ORDER); + debug_assert!(cmp(b, Self::ORDER) == Less, "Expected at most one reduction loop"); + } + b + } + */ } diff --git a/src/field/field.rs b/src/field/field.rs index ad7fc58..ee578c3 100644 --- a/src/field/field.rs +++ b/src/field/field.rs @@ -384,7 +384,7 @@ pub trait Field: if exp == Self::NEG_ONE { return false; } - panic!("Number theory is a lie!") + panic!("Number theory is a lie! Expected {:?}^{:?} == 1 or p-1 but got {:?}", self, power, exp) } /// The number of bits in the binary encoding of this field element. @@ -403,8 +403,10 @@ pub trait Field: /// Like `Ord::cmp`. We can't implement `Ord` directly due to Rust's trait coherence rules, so /// instead we provide this helper which implementations can use to trivially implement `Ord`. fn cmp_helper(&self, other: &Self) -> Ordering { + //println!("Before"); let self_limbs = self.to_canonical_u64_vec().into_iter(); let other_limbs = other.to_canonical_u64_vec().into_iter(); + //println!("After"); let mut result = Equal; for (self_limb, other_limb) in self_limbs.zip(other_limbs) { @@ -433,7 +435,10 @@ pub trait Field: /// roots, otherwise return `None`. /// Inspired by implementation in https://github.com/scipr-lab/zexe/blob/85bae796a411077733ddeefda042d02f4b4772e5/algebra-core/src/fields/arithmetic.rs fn square_root(&self) -> Option { - if self.is_zero() { + println!("square root input: {:?}", self); + let r = self.is_zero(); + println!(" is zero? {}", r); + if r { Some(*self) } else if self.is_quadratic_residue() { let mut z = Self::MULTIPLICATIVE_SUBGROUP_GENERATOR.exp(Self::T); @@ -558,6 +563,14 @@ pub mod field_tests { let output = inputs .iter() .map(|x| field_to_biguint(op(biguint_to_field(x.clone())))); + /* + for (x, y) in output.zip(expected) { + if x != y { + println!("got = {:?}; expected = {:?}", x, y); + assert!(false, "FAIL"); + } + } + */ // Compare expected outputs with actual outputs assert!( output.zip(expected).all(|(x, y)| x == y), @@ -597,6 +610,14 @@ pub mod field_tests { let output = inputs.iter().zip(shifted_inputs).map(|(x, y)| { field_to_biguint(op(biguint_to_field(x.clone()), biguint_to_field(y.clone()))) }); + /* + for (x, y) in output.zip(expected) { + if x != y { + println!("got = {:?}; expected = {:?}", x, y); + assert!(false, "FAIL"); + } + } + */ // Compare expected outputs with actual outputs assert!( output.zip(expected).all(|(x, y)| x == y), diff --git a/src/field/mod.rs b/src/field/mod.rs index 3ca6abe..c402272 100644 --- a/src/field/mod.rs +++ b/src/field/mod.rs @@ -4,6 +4,7 @@ pub use field::*; pub use tweedledee_base::*; pub use tweedledum_base::*; pub use monty::*; +pub use daira::*; mod bls12_377_base; mod bls12_377_scalar; @@ -12,3 +13,4 @@ mod field; mod tweedledee_base; mod tweedledum_base; mod monty; +mod daira; diff --git a/src/field/tweedledee_base.rs b/src/field/tweedledee_base.rs index d491548..2318e32 100644 --- a/src/field/tweedledee_base.rs +++ b/src/field/tweedledee_base.rs @@ -1,6 +1,6 @@ use rand::Rng; use std::cmp::Ordering; -use std::cmp::Ordering::Less; +use std::cmp::Ordering::{Less, Equal}; use std::convert::TryInto; use std::ops::{Add, Div, Mul, Neg, Sub}; use std::fmt; @@ -8,76 +8,69 @@ use std::fmt::{Debug, Display, Formatter}; use crate::{cmp, field_to_biguint, rand_range, rand_range_from_rng, - MontyRepr, Field}; + DairaRepr, Field}; /// An element of the Tweedledee group's base field. #[derive(Copy, Clone, Eq, PartialEq, Hash, Default)] +//#[derive(Copy, Clone, Hash, Default)] pub struct TweedledeeBase { - /// Montgomery representation, encoded with little-endian u64 limbs. + /// Daira representation, encoded with little-endian u64 limbs. pub limbs: [u64; 4], } -impl MontyRepr for TweedledeeBase { - /// The order of the field: 28948022309329048855892746252171976963322203655954433126947083963168578338817 +impl DairaRepr for TweedledeeBase { + /// ORDER = 2^254 + C const ORDER: [u64; 4] = [ - 9524180637049683969, - 255193519543715529, - 0, - 4611686018427387904, + 0x842cafd400000001, + 0x38aa127696286c9, + 0x0, + 0x4000000000000000 ]; - /// Twice the order of the field: 57896044618658097711785492504343953926644407311908866253894167926337156677634 - #[allow(dead_code)] - const ORDER_X2: [u64; 4] = [ - 601617200389816322, - 510387039087431059, - 0, - 9223372036854775808, + const C: [u64; 2] = [0x842cafd400000001, 0x38aa127696286c9]; + + /// Multiplication lookup table: ith element is (i+1)*C^2 + const C_SQR_TBL: [[u64; 4]; 15] = [ + [0x8595fa800000001, 0x27e16a565c689523, 0x3f4c0e6fcb03aa0a, 0xc8ad910688601], + [0x10b2bf5000000002, 0x4fc2d4acb8d12a46, 0x7e981cdf96075414, 0x1915b220d10c02], + [0x190c1ef800000003, 0x77a43f031539bf69, 0xbde42b4f610afe1e, 0x25a08b31399203], + [0x21657ea000000004, 0x9f85a95971a2548c, 0xfd3039bf2c0ea828, 0x322b6441a21804], + [0x29bede4800000005, 0xc76713afce0ae9af, 0x3c7c482ef7125232, 0x3eb63d520a9e06], + [0x32183df000000006, 0xef487e062a737ed2, 0x7bc8569ec215fc3c, 0x4b411662732407], + [0x3a719d9800000007, 0x1729e85c86dc13f5, 0xbb14650e8d19a647, 0x57cbef72dbaa08], + [0x42cafd4000000008, 0x3f0b52b2e344a918, 0xfa60737e581d5051, 0x6456c883443009], + [0x4b245ce800000009, 0x66ecbd093fad3e3b, 0x39ac81ee2320fa5b, 0x70e1a193acb60b], + [0x537dbc900000000a, 0x8ece275f9c15d35e, 0x78f8905dee24a465, 0x7d6c7aa4153c0c], + [0x5bd71c380000000b, 0xb6af91b5f87e6881, 0xb8449ecdb9284e6f, 0x89f753b47dc20d], + [0x64307be00000000c, 0xde90fc0c54e6fda4, 0xf790ad3d842bf879, 0x96822cc4e6480e], + [0x6c89db880000000d, 0x6726662b14f92c7, 0x36dcbbad4f2fa284, 0xa30d05d54ece10], + [0x74e33b300000000e, 0x2e53d0b90db827ea, 0x7628ca1d1a334c8e, 0xaf97dee5b75411], + [0x7d3c9ad80000000f, 0x56353b0f6a20bd0d, 0xb574d88ce536f698, 0xbc22b7f61fda12] ]; - /// R in the context of the Montgomery reduction, i.e. 2^256 % |F|. - const R: [u64; 4] = [ - 8320946236270051325, - 17681163515078405027, - 18446744073709551615, - 4611686018427387903, + /// K = 2*C, M = ORDER, K_M = K*M + const K_M: [u64; 6] = [ + 0x10b2bf5000000002, 0x4fc2d4acb8d12a46, 0x7e981cdf96075414, + 0x801915b220d10c02, 0xc21657ea00000000, 0x1c55093b4b14364 ]; - - /// R^2 in the context of the Montgomery reduction, i.e. 2^(256*2) % |F|. - const R2: [u64; 4] = [ - 9625875206237061136, - 9085631154807722544, - 17636350113745641634, - 56485833733595155, - ]; - - /// R^3 in the context of the Montgomery reduction, i.e. 2^(256*3) % |F|. - const R3: [u64; 4] = [ - 11971961131424865118, - 6311318431551332850, - 14638507591886519234, - 739379759776372087, - ]; - - /// In the context of Montgomery multiplication, ยต = -|F|^-1 mod 2^64. - const MU: u64 = 9524180637049683967; } impl TweedledeeBase { pub fn from_canonical(c: [u64; 4]) -> Self { - Self { limbs: Self::from_monty(c) } + debug_assert!(cmp(c, Self::ORDER) == Less, "Given non-canonical input: {:?}", c); + Self { limbs: c } } pub fn to_canonical(&self) -> [u64; 4] { - Self::to_monty(self.limbs) + Self::daira_to_canonical(self.limbs) } } impl Add for TweedledeeBase { type Output = Self; - fn add(self, rhs: TweedledeeBase) -> Self::Output { - Self { limbs: Self::monty_add(self.limbs, rhs.limbs) } + fn add(self, rhs: TweedledeeBase) -> Self { + Self { limbs: Self::daira_add(self.limbs, rhs.limbs) } } } @@ -85,7 +78,7 @@ impl Sub for TweedledeeBase { type Output = Self; fn sub(self, rhs: Self) -> Self { - Self { limbs: Self::monty_sub(self.limbs, rhs.limbs) } + Self { limbs: Self::daira_sub(self.limbs, rhs.limbs) } } } @@ -93,7 +86,7 @@ impl Mul for TweedledeeBase { type Output = Self; fn mul(self, rhs: Self) -> Self { - Self { limbs: Self::monty_multiply(self.limbs, rhs.limbs) } + Self { limbs: Self::daira_multiply(self.limbs, rhs.limbs) } } } @@ -109,49 +102,22 @@ impl Neg for TweedledeeBase { type Output = Self; fn neg(self) -> Self { - Self { limbs: Self::monty_neg(self.limbs) } + Self { limbs: Self::daira_neg(self.limbs) } } } impl Field for TweedledeeBase { const BITS: usize = 255; const BYTES: usize = 32; - const ZERO: Self = Self { limbs: ::ZERO }; - const ONE: Self = Self { limbs: ::ONE }; - const TWO: Self = Self { - limbs: [ - 7117711835490418681, - 16660389436903542909, - 18446744073709551615, - 4611686018427387903, - ], - }; - const THREE: Self = Self { - limbs: [ - 5914477434710786037, - 15639615358728680791, - 18446744073709551615, - 4611686018427387903, - ], - }; - const FOUR: Self = Self { - limbs: [ - 4711243033931153393, - 14618841280553818673, - 18446744073709551615, - 4611686018427387903, - ], - }; - const FIVE: Self = Self { - limbs: [ - 3508008633151520749, - 13598067202378956555, - 18446744073709551615, - 4611686018427387903, - ], - }; + const ZERO: Self = Self { limbs: ::ZERO }; + const ONE: Self = Self { limbs: ::ONE }; + const TWO: Self = Self { limbs: [2u64, 0, 0, 0] }; + const THREE: Self = Self { limbs: [3u64, 0, 0, 0] }; + const FOUR: Self = Self { limbs: [4u64, 0, 0, 0] }; + const FIVE: Self = Self { limbs: [5u64, 0, 0, 0] }; + const NEG_ONE: Self = Self { - limbs: [1203234400779632644, 1020774078174862118, 0, 0], + limbs: [0x842cafd400000000, 0x38aa127696286c9, 0x0, 0x4000000000000000] }; const MULTIPLICATIVE_SUBGROUP_GENERATOR: Self = Self::FIVE; @@ -162,12 +128,7 @@ impl Field for TweedledeeBase { /// 1684996666696914987166688442938726917102595538363933628829375605749 const T: Self = Self { - limbs: [ - 9524180637049683969, - 255193519543715529, - 0, - 4611686017353646080, - ], + limbs: [0xda58a1b2610b2bf5, 0xe2a849, 0x0, 0x10000000] }; fn to_canonical_u64_vec(&self) -> Vec { @@ -188,7 +149,7 @@ impl Field for TweedledeeBase { fn multiplicative_inverse_assuming_nonzero(&self) -> Self { Self { - limbs: Self::monty_inverse(self.limbs) + limbs: Self::daira_inverse(self.limbs) } } @@ -207,7 +168,7 @@ impl Field for TweedledeeBase { #[inline(always)] fn square(&self) -> Self { Self { - limbs: Self::monty_square(self.limbs), + limbs: Self::daira_square(self.limbs), } } } @@ -224,6 +185,16 @@ impl PartialOrd for TweedledeeBase { } } +/* +impl PartialEq for TweedledeeBase { + fn eq(&self, other: &Self) -> bool { + self.cmp_helper(other) == Equal + } +} + +impl Eq for TweedledeeBase { } +*/ + impl Display for TweedledeeBase { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "{}", field_to_biguint(*self)) @@ -232,7 +203,8 @@ impl Display for TweedledeeBase { impl Debug for TweedledeeBase { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - write!(f, "TweedledeeBase {}", field_to_biguint(*self)) + //write!(f, "TweedledeeBase {}", field_to_biguint(*self)) + write!(f, "TweedledeeBase{:?}", self.limbs) } } @@ -241,7 +213,6 @@ mod tests { use crate::test_arithmetic; use crate::Field; use crate::TweedledeeBase; - use crate::MontyRepr; // This is just to access ORDER_X2. #[test] fn primitive_root_order() { @@ -257,9 +228,6 @@ mod tests { let small = ::ONE.to_canonical_u64_vec(); assert!(TweedledeeBase::is_valid_canonical_u64(&small)); - let big = TweedledeeBase::ORDER_X2.to_vec(); - assert_eq!(TweedledeeBase::is_valid_canonical_u64(&big), false); - let limbs = vec![1, 2, 3, 4, 5]; assert_eq!(TweedledeeBase::is_valid_canonical_u64(&limbs), false); } From 5f9b93a1fb5b5e0c9a2c252a7a2e3dec00242e2c Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Tue, 19 Jan 2021 20:11:53 +1100 Subject: [PATCH 04/10] Fastest compilation flags with Daira (and Monty) reduction. --- Cargo.toml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 56bbf9e..db20cd2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,10 @@ harness = false name = "tweedledee_base" harness = false +[[bench]] +name = "tweedledum_base" +harness = false + [[bench]] name = "bls12_g1" harness = false @@ -66,6 +70,10 @@ harness = false [profile.release] opt-level = 3 +lto = "fat" +codegen-units = 1 [profile.bench] opt-level = 3 +lto = "fat" +codegen-units = 1 From 4a490056de61151e4adf44476f32f5fb700a8914 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Tue, 19 Jan 2021 21:09:49 +1100 Subject: [PATCH 05/10] Remove unnecessary reductions where possible. --- src/field/daira.rs | 44 +++++++++++--------------------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/src/field/daira.rs b/src/field/daira.rs index c8f57f3..4ee3418 100644 --- a/src/field/daira.rs +++ b/src/field/daira.rs @@ -241,34 +241,31 @@ pub trait DairaRepr { // so reducing is not always necessary. fn daira_add(lhs: [u64; 4], rhs: [u64; 4]) -> [u64; 4] { let sum = add_no_overflow(lhs, rhs); - let res = if cmp(sum, Self::ORDER) == Less { + if cmp(sum, Self::ORDER) == Less { sum } else { sub(sum, Self::ORDER) - }; - Self::daira_to_canonical(res) + } } // TODO: This is copypasta from monty.rs fn daira_sub(lhs: [u64; 4], rhs: [u64; 4]) -> [u64; 4] { - let res = if cmp(lhs, rhs) == Less { + if cmp(lhs, rhs) == Less { // Underflow occurs, so we compute the difference as `self + (-rhs)`. add_no_overflow(lhs, Self::daira_neg(rhs)) } else { // No underflow, so it's faster to subtract directly. sub(lhs, rhs) - }; - Self::daira_to_canonical(res) + } } // TODO: This is copypasta from monty.rs fn daira_neg(limbs: [u64; 4]) -> [u64; 4] { - let res = if limbs == Self::ZERO { + if limbs == Self::ZERO { Self::ZERO } else { sub(Self::ORDER, limbs) - }; - Self::daira_to_canonical(res) + } } #[inline] @@ -293,7 +290,6 @@ pub trait DairaRepr { let u = mul_2_2(Self::C, xp1); // return M - u + xp0 - //Self::daira_to_canonical(add_no_overflow(sub(Self::ORDER, u), xp0)) let res = add_no_overflow(sub(Self::ORDER, u), xp0); let max_expected = [Self::ORDER[0] - 1, Self::ORDER[1], Self::ORDER[2], Self::ORDER[3] << 1]; debug_assert!(cmp(res, max_expected) != Greater); @@ -315,30 +311,12 @@ pub trait DairaRepr { } fn daira_to_canonical(a: [u64; 4]) -> [u64; 4] { - let mut b = a; - let mut count = 0; - while cmp(b, Self::ORDER) != Less && count < 10 { - b = sub(b, Self::ORDER); - count += 1; - } - //println!("Reducing ({}) {:?} -> {:?}", count, a, b); - /* - if count > 1 { - println!("Expected at most one reduction loop but did {}", count); - } - */ - debug_assert!(count < 2, "Expected at most one reduction loop"); - b - } - - /* - fn daira_to_canonical(a: [u64; 4]) -> [u64; 4] { - let mut b = a; - if cmp(b, Self::ORDER) != Less { - b = sub(b, Self::ORDER); + if cmp(a, Self::ORDER) == Less { + a + } else { + let b = sub(a, Self::ORDER); debug_assert!(cmp(b, Self::ORDER) == Less, "Expected at most one reduction loop"); + b } - b } - */ } From 5ad97f60cc206ba4b402908d3326cea674228a37 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Tue, 19 Jan 2021 21:49:32 +1100 Subject: [PATCH 06/10] Remove final reduction from _reduce(); also some tidying. --- src/field/daira.rs | 14 ++++++++++---- src/field/field.rs | 2 -- src/field/tweedledee_base.rs | 8 ++++---- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/field/daira.rs b/src/field/daira.rs index 4ee3418..9894ea3 100644 --- a/src/field/daira.rs +++ b/src/field/daira.rs @@ -291,9 +291,14 @@ pub trait DairaRepr { // return M - u + xp0 let res = add_no_overflow(sub(Self::ORDER, u), xp0); - let max_expected = [Self::ORDER[0] - 1, Self::ORDER[1], Self::ORDER[2], Self::ORDER[3] << 1]; - debug_assert!(cmp(res, max_expected) != Greater); - Self::daira_to_canonical(res) + // max_expected = 2^254 + M - 1 = 2^255 + c - 1 + let max_expected = [ + Self::ORDER[0] - 1, Self::ORDER[1], + Self::ORDER[2], Self::ORDER[3] << 1 + ]; + debug_assert!(cmp(res, max_expected) != Greater, + "Semi-reduced value exceeds maximum expected"); + res } #[inline] @@ -315,7 +320,8 @@ pub trait DairaRepr { a } else { let b = sub(a, Self::ORDER); - debug_assert!(cmp(b, Self::ORDER) == Less, "Expected at most one reduction loop"); + debug_assert!(cmp(b, Self::ORDER) == Less, + "Expected at most one reduction loop"); b } } diff --git a/src/field/field.rs b/src/field/field.rs index 46ffeb6..ab4e8cc 100644 --- a/src/field/field.rs +++ b/src/field/field.rs @@ -409,10 +409,8 @@ pub trait Field: /// Like `Ord::cmp`. We can't implement `Ord` directly due to Rust's trait coherence rules, so /// instead we provide this helper which implementations can use to trivially implement `Ord`. fn cmp_helper(&self, other: &Self) -> Ordering { - //println!("Before"); let self_limbs = self.to_canonical_u64_vec().into_iter(); let other_limbs = other.to_canonical_u64_vec().into_iter(); - //println!("After"); let mut result = Equal; for (self_limb, other_limb) in self_limbs.zip(other_limbs) { diff --git a/src/field/tweedledee_base.rs b/src/field/tweedledee_base.rs index 2318e32..087b2b6 100644 --- a/src/field/tweedledee_base.rs +++ b/src/field/tweedledee_base.rs @@ -11,8 +11,7 @@ use crate::{cmp, field_to_biguint, DairaRepr, Field}; /// An element of the Tweedledee group's base field. -#[derive(Copy, Clone, Eq, PartialEq, Hash, Default)] -//#[derive(Copy, Clone, Hash, Default)] +#[derive(Copy, Clone, Hash, Default)] pub struct TweedledeeBase { /// Daira representation, encoded with little-endian u64 limbs. pub limbs: [u64; 4], @@ -185,7 +184,8 @@ impl PartialOrd for TweedledeeBase { } } -/* +/// NB: We need to implement this so that is_zero and == call +/// to_canonical (via cmp_helper) before comparing numbers. impl PartialEq for TweedledeeBase { fn eq(&self, other: &Self) -> bool { self.cmp_helper(other) == Equal @@ -193,7 +193,7 @@ impl PartialEq for TweedledeeBase { } impl Eq for TweedledeeBase { } -*/ + impl Display for TweedledeeBase { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { From bb4191cf789e3052337e7cdd37fa3b3a6607b154 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Tue, 19 Jan 2021 22:26:41 +1100 Subject: [PATCH 07/10] Reinstate final reduction to make tests from other crates pass. --- src/field/daira.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/field/daira.rs b/src/field/daira.rs index 9894ea3..7eb4cfd 100644 --- a/src/field/daira.rs +++ b/src/field/daira.rs @@ -298,7 +298,10 @@ pub trait DairaRepr { ]; debug_assert!(cmp(res, max_expected) != Greater, "Semi-reduced value exceeds maximum expected"); - res + // FIXME: This is not necessary in general and it wastes + // performance; only included to make tests of calling code + // pass (which require values to always be reduced). + Self::daira_to_canonical(res) } #[inline] From 21741d8ca8afd3d051e2cecb6e7242934c97fd54 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Tue, 19 Jan 2021 22:31:44 +1100 Subject: [PATCH 08/10] Add missing file. --- benches/tweedledum_base.rs | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 benches/tweedledum_base.rs diff --git a/benches/tweedledum_base.rs b/benches/tweedledum_base.rs new file mode 100644 index 0000000..33d49e9 --- /dev/null +++ b/benches/tweedledum_base.rs @@ -0,0 +1,37 @@ +use criterion::{black_box, Criterion}; +use criterion::criterion_group; +use criterion::criterion_main; + +use plonky::{Field, TweedledumBase}; + +fn criterion_benchmark(c: &mut Criterion) { + let x = TweedledumBase::from_canonical([11111111, 22222222, 33333333, 44444444]); + let y = TweedledumBase::from_canonical([44444444, 55555555, 66666666, 77777777]); + + c.bench_function("TweedledumBase field addition", move |b| b.iter(|| { + black_box(y) + black_box(x) + })); + + c.bench_function("TweedledumBase field subtraction (no underflow)", move |b| b.iter(|| { + black_box(y) - black_box(x) + })); + + c.bench_function("TweedledumBase field subtraction (underflow)", move |b| b.iter(|| { + black_box(x) - black_box(y) + })); + + c.bench_function("TweedledumBase field multiplication", move |b| b.iter(|| { + black_box(x) * black_box(y) + })); + + c.bench_function("TweedledumBase field squaring", move |b| b.iter(|| { + black_box(x).square() + })); + + c.bench_function("TweedledumBase field inversion", move |b| b.iter(|| { + black_box(x).multiplicative_inverse() + })); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From ea189cfef4b3248b41e918d951ec90a921638630 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Wed, 20 Jan 2021 22:38:06 +1100 Subject: [PATCH 09/10] Switch explicit constants into raw form from Monty form. --- src/curve/tweedledee_curve.rs | 8 ++++---- src/curve/tweedledum_curve.rs | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/curve/tweedledee_curve.rs b/src/curve/tweedledee_curve.rs index 159c055..da44973 100644 --- a/src/curve/tweedledee_curve.rs +++ b/src/curve/tweedledee_curve.rs @@ -21,10 +21,10 @@ impl Curve for Tweedledee { impl HaloCurve for Tweedledee { const ZETA: Self::BaseField = TweedledeeBase { limbs: [ - 1444470991491022206, - 3301226169728360777, - 72516509137424193, - 708688398506307241, + 4869731214443914493, + 7624468717990391489, + 9776623610414440446, + 3946962219815967320 ], }; const ZETA_SCALAR: Self::ScalarField = TweedledumBase { diff --git a/src/curve/tweedledum_curve.rs b/src/curve/tweedledum_curve.rs index 899875b..0bbf78c 100644 --- a/src/curve/tweedledum_curve.rs +++ b/src/curve/tweedledum_curve.rs @@ -44,10 +44,10 @@ impl HaloCurve for Tweedledum { }; const ZETA_SCALAR: Self::ScalarField = TweedledeeBase { limbs: [ - 9282944046338294407, - 16421485501699768486, - 18374227564572127422, - 3902997619921080662, + 4654449422605769475, + 11077468875262875656, + 8670120463295111169, + 664723798611420583 ] }; } From c2785b053b52df02921102c80f7d7d6d9a513a28 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Thu, 21 Jan 2021 21:29:16 +1100 Subject: [PATCH 10/10] Tidying up and comments. --- src/field/daira.rs | 19 ++++++++++++++----- src/field/field.rs | 25 ++++--------------------- src/field/tweedledee_base.rs | 3 +-- 3 files changed, 19 insertions(+), 28 deletions(-) diff --git a/src/field/daira.rs b/src/field/daira.rs index 7eb4cfd..81b7cb1 100644 --- a/src/field/daira.rs +++ b/src/field/daira.rs @@ -222,7 +222,8 @@ fn rebase_6(x: [u64; 6]) -> ([u64; 4], [u64; 2]) { /// adaptation of the tricks for moduli of the form 2^n + C. /// Source: https://hackmd.io/drzN-z-_So28zDLhK2tegw pub trait DairaRepr { - /// Order of the field (i.e. modulus for operations) + /// Order of the field (i.e. modulus for operations); equals 2^n + + /// C for some n and C. const ORDER: [u64; 4]; const ZERO: [u64; 4] = [0u64; 4]; @@ -238,7 +239,7 @@ pub trait DairaRepr { // TODO: This is copypasta from monty.rs // TODO: Daira's representation actually allows for some redundancy, - // so reducing is not always necessary. + // so reducing is not always necessary; same in sub fn daira_add(lhs: [u64; 4], rhs: [u64; 4]) -> [u64; 4] { let sum = add_no_overflow(lhs, rhs); if cmp(sum, Self::ORDER) == Less { @@ -268,6 +269,11 @@ pub trait DairaRepr { } } + /// Given an 8-word number (usually the result of multiplying or + /// squaring), reduce it modulo 2^n + C. + /// + /// The implementation is a direct translation of the formulae at + /// the end of https://hackmd.io/drzN-z-_So28zDLhK2tegw #[inline] fn _reduce(x: [u64; 8]) -> [u64; 4] { // x = (x0, x1, x2) @@ -298,9 +304,12 @@ pub trait DairaRepr { ]; debug_assert!(cmp(res, max_expected) != Greater, "Semi-reduced value exceeds maximum expected"); - // FIXME: This is not necessary in general and it wastes - // performance; only included to make tests of calling code - // pass (which require values to always be reduced). + // NB: This is not necessary in general; a different interface + // could accommodate semi-reduced results. Some performance + // testing suggested that the gain (as a proportion of + // mul/sqr) were too small to notice though. Must be included + // at the moment to make tests of calling code pass (which + // require values to always be reduced). Self::daira_to_canonical(res) } diff --git a/src/field/field.rs b/src/field/field.rs index f722426..03ee586 100644 --- a/src/field/field.rs +++ b/src/field/field.rs @@ -388,7 +388,9 @@ pub trait Field: if exp == Self::NEG_ONE { return false; } - panic!("Number theory is a lie! Expected {:?}^{:?} == 1 or p-1 but got {:?}", self, power, exp) + panic!("Number theory is a lie! \ + Expected {:?}^{:?} == 1 or p-1 \ + but got {:?}", self, power, exp) } /// The number of bits in the binary encoding of this field element. @@ -439,10 +441,7 @@ pub trait Field: /// roots, otherwise return `None`. /// Inspired by implementation in https://github.com/scipr-lab/zexe/blob/85bae796a411077733ddeefda042d02f4b4772e5/algebra-core/src/fields/arithmetic.rs fn square_root(&self) -> Option { - println!("square root input: {:?}", self); - let r = self.is_zero(); - println!(" is zero? {}", r); - if r { + if self.is_zero() { Some(*self) } else if self.is_quadratic_residue() { let mut z = Self::MULTIPLICATIVE_SUBGROUP_GENERATOR.exp(Self::T); @@ -569,14 +568,6 @@ pub mod field_tests { let output = inputs .iter() .map(|x| field_to_biguint(op(biguint_to_field(x.clone())))); - /* - for (x, y) in output.zip(expected) { - if x != y { - println!("got = {:?}; expected = {:?}", x, y); - assert!(false, "FAIL"); - } - } - */ // Compare expected outputs with actual outputs assert!( output.zip(expected).all(|(x, y)| x == y), @@ -616,14 +607,6 @@ pub mod field_tests { let output = inputs.iter().zip(shifted_inputs).map(|(x, y)| { field_to_biguint(op(biguint_to_field(x.clone()), biguint_to_field(y.clone()))) }); - /* - for (x, y) in output.zip(expected) { - if x != y { - println!("got = {:?}; expected = {:?}", x, y); - assert!(false, "FAIL"); - } - } - */ // Compare expected outputs with actual outputs assert!( output.zip(expected).all(|(x, y)| x == y), diff --git a/src/field/tweedledee_base.rs b/src/field/tweedledee_base.rs index 087b2b6..85ec10b 100644 --- a/src/field/tweedledee_base.rs +++ b/src/field/tweedledee_base.rs @@ -203,8 +203,7 @@ impl Display for TweedledeeBase { impl Debug for TweedledeeBase { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - //write!(f, "TweedledeeBase {}", field_to_biguint(*self)) - write!(f, "TweedledeeBase{:?}", self.limbs) + write!(f, "TweedledeeBase {}", field_to_biguint(*self)) } }