From 2648244626c1c965ee0b36b088504dfe57dfd4fa Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish@ivey-law.name>
Date: Wed, 28 Oct 2020 19:42:22 +1100
Subject: [PATCH 01/10] Add back in the old pen-and-paper multiplication
 implementation.

---
 src/field/daira.rs | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 src/field/daira.rs

diff --git a/src/field/daira.rs b/src/field/daira.rs
new file mode 100644
index 0000000..d021a8e
--- /dev/null
+++ b/src/field/daira.rs
@@ -0,0 +1,45 @@
+#[inline]
+#[unroll_for_loops]
+fn mul_4_4(a: [u64; 4], b: [u64; 4]) -> [u64; 8] {
+    // Grade school multiplication. To avoid carrying at each of
+    // O(n^2) steps, we first add each intermediate product to a
+    // 128-bit accumulator, then propagate carries at the end.
+    let mut acc128 = [0u128; 4 + 4];
+
+    for i in 0..4 {
+        for j in 0..4 {
+            let a_i_b_j = a[i] as u128 * b[j] as u128;
+            // Add the less significant chunk to the less significant
+            // accumulator.
+            acc128[i + j] += a_i_b_j as u64 as u128;
+            // Add the more significant chunk to the more significant
+            // accumulator.
+            acc128[i + j + 1] += a_i_b_j >> 64;
+        }
+    }
+
+    let mut acc = [0u64; 8];
+    acc[0] = acc128[0] as u64;
+    let mut carry = false;
+    for i in 1..8 {
+        let last_chunk_big = (acc128[i - 1] >> 64) as u64;
+        let curr_chunk_small = acc128[i] as u64;
+        // Note that last_chunk_big won't get anywhere near 2^64,
+        // since it's essentially a carry from some additions in the
+        // previous phase, so we can add the carry bit to it without
+        // fear of overflow.
+        let result = curr_chunk_small.overflowing_add(
+            last_chunk_big + carry as u64);
+        acc[i] += result.0;
+        carry = result.1;
+    }
+    debug_assert!(!carry);
+    acc
+}
+
+/// This modular arithmetic representation is based on Daira's
+/// adaptation of the tricks for moduli of the form 2^b + c.
+/// Source: https://hackmd.io/drzN-z-_So28zDLhK2tegw
+pub trait DairaRepr {
+
+}

From 0652e842bc2ce87301512a04c128bb20cc0978d0 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish@ivey-law.name>
Date: Wed, 28 Oct 2020 20:23:50 +1100
Subject: [PATCH 02/10] First draft of full pen-and-paper squaring.

---
 src/field/daira.rs | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/src/field/daira.rs b/src/field/daira.rs
index d021a8e..3d1d2aa 100644
--- a/src/field/daira.rs
+++ b/src/field/daira.rs
@@ -1,4 +1,47 @@
+use unroll::unroll_for_loops;
+use crate::{add_no_overflow, sub, cmp, mul2};
+
+// FIXME: These functions are copypasta from monty.rs
+#[inline]
+fn mul_add_cy_in(a: u64, b: u64, c: u64, cy_in: u64) -> (u64, u64) {
+    let t = (a as u128) * (b as u128) + (c as u128) + (cy_in as u128);
+    ((t >> 64) as u64, t as u64)
+}
+
 #[inline]
+fn mul_add(a: u64, b: u64, c: u64) -> (u64, u64) {
+    let t = (a as u128) * (b as u128) + (c as u128);
+    ((t >> 64) as u64, t as u64)
+}
+
+#[unroll_for_loops]
+fn sqr_4(a: [u64; 4]) -> [u64; 8] {
+    let mut res = [u64; 8];
+
+    // Calculate the off-diagonal part of the square
+    for i in 0 .. 4 {
+        let mut hi_in = 0u64;
+        for j in i+1 .. 4 {
+            let (hi_out, lo) = mul_add(a[j], a[i], hi_in);
+            res[i + j] = lo;
+            hi_in = hi_out;
+        }
+        res[i + 4] = hi_in;
+    }
+    res = mul2(res); // NB: Top and bottom words are zero
+
+    // Calculate and add in the diagonal part
+    let mut hi_in = 0u64;
+    for i in 0 .. 4 {
+        let (hi_out, lo) = mul_add_cy_in(a[i], a[i], res[2*i], hi_in);
+        res[2*i] = lo;
+        let (t, cy) = res[2*i + 1].overflowing_add(hi_out);
+        res[2*i + 1] = t;
+        hi_in = cy as u64;
+    }
+    res
+}
+
 #[unroll_for_loops]
 fn mul_4_4(a: [u64; 4], b: [u64; 4]) -> [u64; 8] {
     // Grade school multiplication. To avoid carrying at each of

From b1e5c15f113732875312b34712866aff1b579c54 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish@ivey-law.name>
Date: Mon, 18 Jan 2021 22:32:13 +1100
Subject: [PATCH 03/10] Full working version of Daira-reduction based
 arithmetic (needs tidying).

---
 src/field/daira.rs           | 272 +++++++++++++++++++++++++++++++++--
 src/field/field.rs           |  25 +++-
 src/field/mod.rs             |   2 +
 src/field/tweedledee_base.rs | 160 +++++++++------------
 4 files changed, 353 insertions(+), 106 deletions(-)

diff --git a/src/field/daira.rs b/src/field/daira.rs
index 3d1d2aa..c8f57f3 100644
--- a/src/field/daira.rs
+++ b/src/field/daira.rs
@@ -1,28 +1,39 @@
+use std::cmp::Ordering::{Less, Greater};
 use unroll::unroll_for_loops;
-use crate::{add_no_overflow, sub, cmp, mul2};
+use crate::{add_no_overflow, sub, cmp, nonzero_multiplicative_inverse};
 
 // FIXME: These functions are copypasta from monty.rs
-#[inline]
+#[inline(always)]
 fn mul_add_cy_in(a: u64, b: u64, c: u64, cy_in: u64) -> (u64, u64) {
     let t = (a as u128) * (b as u128) + (c as u128) + (cy_in as u128);
     ((t >> 64) as u64, t as u64)
 }
 
 #[inline]
-fn mul_add(a: u64, b: u64, c: u64) -> (u64, u64) {
-    let t = (a as u128) * (b as u128) + (c as u128);
-    ((t >> 64) as u64, t as u64)
+#[unroll_for_loops]
+fn mul2(x: [u64; 8]) -> [u64; 8] {
+    debug_assert_eq!(x[8-1] >> 63, 0, "Most significant bit should be clear");
+
+    let mut result = [0; 8];
+    result[0] = x[0] << 1;
+    for i in 1..8 {
+        result[i] = x[i] << 1 | x[i - 1] >> 63;
+    }
+    result
 }
 
+#[inline]
 #[unroll_for_loops]
 fn sqr_4(a: [u64; 4]) -> [u64; 8] {
-    let mut res = [u64; 8];
+    let mut res = [0u64; 8];
 
     // Calculate the off-diagonal part of the square
+    // TODO: Note that res is all zeros on the first itertion, so no
+    // need to add it
     for i in 0 .. 4 {
         let mut hi_in = 0u64;
         for j in i+1 .. 4 {
-            let (hi_out, lo) = mul_add(a[j], a[i], hi_in);
+            let (hi_out, lo) = mul_add_cy_in(a[j], a[i], res[i + j], hi_in);
             res[i + j] = lo;
             hi_in = hi_out;
         }
@@ -39,9 +50,11 @@ fn sqr_4(a: [u64; 4]) -> [u64; 8] {
         res[2*i + 1] = t;
         hi_in = cy as u64;
     }
+    debug_assert_eq!(hi_in, 0, "Unexpected carry detected");
     res
 }
 
+#[inline]
 #[unroll_for_loops]
 fn mul_4_4(a: [u64; 4], b: [u64; 4]) -> [u64; 8] {
     // Grade school multiplication. To avoid carrying at each of
@@ -80,9 +93,252 @@ fn mul_4_4(a: [u64; 4], b: [u64; 4]) -> [u64; 8] {
     acc
 }
 
+#[inline]
+#[unroll_for_loops]
+fn mul_2_2(a: [u64; 2], b: [u64; 2]) -> [u64; 4] {
+    // Grade school multiplication. To avoid carrying at each of
+    // O(n^2) steps, we first add each intermediate product to a
+    // 128-bit accumulator, then propagate carries at the end.
+    let mut acc128 = [0u128; 2 + 2];
+
+    for i in 0..2 {
+        for j in 0..2 {
+            let a_i_b_j = a[i] as u128 * b[j] as u128;
+            // Add the less significant chunk to the less significant
+            // accumulator.
+            acc128[i + j] += a_i_b_j as u64 as u128;
+            // Add the more significant chunk to the more significant
+            // accumulator.
+            acc128[i + j + 1] += a_i_b_j >> 64;
+        }
+    }
+
+    let mut acc = [0u64; 4];
+    acc[0] = acc128[0] as u64;
+    let mut carry = false;
+    for i in 1..4 {
+        let last_chunk_big = (acc128[i - 1] >> 64) as u64;
+        let curr_chunk_small = acc128[i] as u64;
+        // Note that last_chunk_big won't get anywhere near 2^64,
+        // since it's essentially a carry from some additions in the
+        // previous phase, so we can add the carry bit to it without
+        // fear of overflow.
+        let result = curr_chunk_small.overflowing_add(
+            last_chunk_big + carry as u64);
+        acc[i] += result.0;
+        carry = result.1;
+    }
+    debug_assert!(!carry);
+    acc
+}
+
+#[inline]
+#[unroll_for_loops]
+fn mul_2_4(a: [u64; 2], b: [u64; 4]) -> [u64; 6] {
+    // Grade school multiplication. To avoid carrying at each of
+    // O(n^2) steps, we first add each intermediate product to a
+    // 128-bit accumulator, then propagate carries at the end.
+    let mut acc128 = [0u128; 2 + 4];
+
+    for i in 0..2 {
+        for j in 0..4 {
+            let a_i_b_j = a[i] as u128 * b[j] as u128;
+            // Add the less significant chunk to the less significant
+            // accumulator.
+            acc128[i + j] += a_i_b_j as u64 as u128;
+            // Add the more significant chunk to the more significant
+            // accumulator.
+            acc128[i + j + 1] += a_i_b_j >> 64;
+        }
+    }
+
+    let mut acc = [0u64; 6];
+    acc[0] = acc128[0] as u64;
+    let mut carry = false;
+    for i in 1..6 {
+        let last_chunk_big = (acc128[i - 1] >> 64) as u64;
+        let curr_chunk_small = acc128[i] as u64;
+        // Note that last_chunk_big won't get anywhere near 2^64,
+        // since it's essentially a carry from some additions in the
+        // previous phase, so we can add the carry bit to it without
+        // fear of overflow.
+        let result = curr_chunk_small.overflowing_add(
+            last_chunk_big + carry as u64);
+        acc[i] += result.0;
+        carry = result.1;
+    }
+    debug_assert!(!carry);
+    acc
+}
+
+#[inline]
+fn add_6_4(a: [u64; 6], b: [u64; 4]) -> [u64; 6] {
+    // TODO: This is slightly wasteful, since we know the two high
+    // words are zero.
+    let c = [b[0], b[1], b[2], b[3], 0, 0];
+    add_no_overflow(a, c)
+}
+
+/// Given x = sum_{i=0}^7 xi (2^64)^i, with x < 2^512, return
+/// y1,y2,y3,y4,z1,z2,z3,z4 such that x = y + z * 2^254 + w * (2^254)^2
+/// where y = sum_{i=0}^3 yi (2^64)^i and z = sum_{i=0}^3 zi (2^64)^i
+/// are both < 2^254, and w < 16
+#[inline]
+fn rebase_8(x: [u64; 8]) -> ([u64; 4], [u64; 4], u64) {
+    const MASK: u64 = (1u64 << 62) - 1u64; // 2^62-1
+
+    // bottom half is the same, except the top two bits are dropped
+    let y = [x[0], x[1], x[2], x[3] & MASK];
+
+    // shift the top half words up by two bits
+    let z = [((x[4] << 2) | (x[3] >> 62)),
+             ((x[5] << 2) | (x[4] >> 62)),
+             ((x[6] << 2) | (x[5] >> 62)),
+             ((x[7] << 2) | (x[6] >> 62)) & MASK];
+
+    // save the very top two bits in w
+    let w = x[7] >> 60;
+
+    (y, z, w)
+}
+
+#[inline]
+fn rebase_6(x: [u64; 6]) -> ([u64; 4], [u64; 2]) {
+    const MASK: u64 = (1u64 << 62) - 1u64; // 2^62-1
+
+    debug_assert_eq!(x[5] >> 62, 0, "highest word of x is too big");
+
+    // bottom half is the same, except the top two bits are dropped
+    let y = [x[0], x[1], x[2], x[3] & MASK];
+
+    // shift the top words up by two bits
+    let z = [((x[4] << 2) | (x[3] >> 62)),
+             ((x[5] << 2) | (x[4] >> 62))];
+
+    (y, z)
+}
+
 /// This modular arithmetic representation is based on Daira's
-/// adaptation of the tricks for moduli of the form 2^b + c.
+/// adaptation of the tricks for moduli of the form 2^n + C.
 /// Source: https://hackmd.io/drzN-z-_So28zDLhK2tegw
 pub trait DairaRepr {
+    /// Order of the field (i.e. modulus for operations)
+    const ORDER: [u64; 4];
+
+    const ZERO: [u64; 4] = [0u64; 4];
+    const ONE: [u64; 4] = [1u64, 0u64, 0u64, 0u64];
+
+    /// The C in 2^n + C.
+    const C: [u64; 2];
+
+    /// The value of x*C^2 for x in 1..16
+    const C_SQR_TBL: [[u64; 4]; 15];
+
+    const K_M: [u64; 6];
+
+    // TODO: This is copypasta from monty.rs
+    // TODO: Daira's representation actually allows for some redundancy,
+    // so reducing is not always necessary.
+    fn daira_add(lhs: [u64; 4], rhs: [u64; 4]) -> [u64; 4] {
+        let sum = add_no_overflow(lhs, rhs);
+        let res = if cmp(sum, Self::ORDER) == Less {
+            sum
+        } else {
+            sub(sum, Self::ORDER)
+        };
+        Self::daira_to_canonical(res)
+    }
+
+    // TODO: This is copypasta from monty.rs
+    fn daira_sub(lhs: [u64; 4], rhs: [u64; 4]) -> [u64; 4] {
+        let res = if cmp(lhs, rhs) == Less {
+            // Underflow occurs, so we compute the difference as `self + (-rhs)`.
+            add_no_overflow(lhs, Self::daira_neg(rhs))
+        } else {
+            // No underflow, so it's faster to subtract directly.
+            sub(lhs, rhs)
+        };
+        Self::daira_to_canonical(res)
+    }
+
+    // TODO: This is copypasta from monty.rs
+    fn daira_neg(limbs: [u64; 4]) -> [u64; 4] {
+        let res = if limbs == Self::ZERO {
+            Self::ZERO
+        } else {
+            sub(Self::ORDER, limbs)
+        };
+        Self::daira_to_canonical(res)
+    }
+
+    #[inline]
+    fn _reduce(x: [u64; 8]) -> [u64; 4] {
+        // x = (x0, x1, x2)
+        let (x0, x1, x2) = rebase_8(x);
+        // s = C * x1
+        let s = mul_2_4(Self::C, x1);
+        // t = C^2 * x2 + x0
+        let t = if x2 == 0 {
+            x0
+        } else {
+            add_no_overflow(Self::C_SQR_TBL[(x2 - 1) as usize], x0)
+        };
+
+        // xp = kM - s + t
+        let xp = add_6_4(sub(Self::K_M, s), t);
+
+        // xp = (xp0, xp1)
+        let (xp0, xp1) = rebase_6(xp);
+        // u = C * xp1
+        let u = mul_2_2(Self::C, xp1);
 
+        // return M - u + xp0
+        //Self::daira_to_canonical(add_no_overflow(sub(Self::ORDER, u), xp0))
+        let res = add_no_overflow(sub(Self::ORDER, u), xp0);
+        let max_expected = [Self::ORDER[0] - 1, Self::ORDER[1], Self::ORDER[2], Self::ORDER[3] << 1];
+        debug_assert!(cmp(res, max_expected) != Greater);
+        Self::daira_to_canonical(res)
+    }
+
+    #[inline]
+    fn daira_multiply(a: [u64; 4], b: [u64; 4]) -> [u64; 4] {
+        Self::_reduce(mul_4_4(a, b))
+    }
+
+    #[inline]
+    fn daira_square(a: [u64; 4]) -> [u64; 4] {
+        Self::_reduce(sqr_4(a))
+    }
+
+    fn daira_inverse(a: [u64; 4]) -> [u64; 4] {
+        nonzero_multiplicative_inverse(a, Self::ORDER)
+    }
+
+    fn daira_to_canonical(a: [u64; 4]) -> [u64; 4] {
+        let mut b = a;
+        let mut count = 0;
+        while cmp(b, Self::ORDER) != Less && count < 10 {
+            b = sub(b, Self::ORDER);
+            count += 1;
+        }
+        //println!("Reducing ({}) {:?} -> {:?}", count, a, b);
+        /*
+        if count > 1 {
+            println!("Expected at most one reduction loop but did {}", count);
+        }
+        */
+        debug_assert!(count < 2, "Expected at most one reduction loop");
+        b
+    }
+
+    /*
+    fn daira_to_canonical(a: [u64; 4]) -> [u64; 4] {
+        let mut b = a;
+        if cmp(b, Self::ORDER) != Less {
+            b = sub(b, Self::ORDER);
+            debug_assert!(cmp(b, Self::ORDER) == Less, "Expected at most one reduction loop");
+        }
+        b
+    }
+    */
 }
diff --git a/src/field/field.rs b/src/field/field.rs
index ad7fc58..ee578c3 100644
--- a/src/field/field.rs
+++ b/src/field/field.rs
@@ -384,7 +384,7 @@ pub trait Field:
         if exp == Self::NEG_ONE {
             return false;
         }
-        panic!("Number theory is a lie!")
+        panic!("Number theory is a lie! Expected {:?}^{:?} == 1 or p-1 but got {:?}", self, power, exp)
     }
 
     /// The number of bits in the binary encoding of this field element.
@@ -403,8 +403,10 @@ pub trait Field:
     /// Like `Ord::cmp`. We can't implement `Ord` directly due to Rust's trait coherence rules, so
     /// instead we provide this helper which implementations can use to trivially implement `Ord`.
     fn cmp_helper(&self, other: &Self) -> Ordering {
+        //println!("Before");
         let self_limbs = self.to_canonical_u64_vec().into_iter();
         let other_limbs = other.to_canonical_u64_vec().into_iter();
+        //println!("After");
 
         let mut result = Equal;
         for (self_limb, other_limb) in self_limbs.zip(other_limbs) {
@@ -433,7 +435,10 @@ pub trait Field:
     /// roots, otherwise return `None`.
     /// Inspired by implementation in https://github.com/scipr-lab/zexe/blob/85bae796a411077733ddeefda042d02f4b4772e5/algebra-core/src/fields/arithmetic.rs
     fn square_root(&self) -> Option<Self> {
-        if self.is_zero() {
+        println!("square root input: {:?}", self);
+        let r = self.is_zero();
+        println!("  is zero? {}", r);
+        if r {
             Some(*self)
         } else if self.is_quadratic_residue() {
             let mut z = Self::MULTIPLICATIVE_SUBGROUP_GENERATOR.exp(Self::T);
@@ -558,6 +563,14 @@ pub mod field_tests {
         let output = inputs
             .iter()
             .map(|x| field_to_biguint(op(biguint_to_field(x.clone()))));
+        /*
+        for (x, y) in output.zip(expected) {
+            if x != y {
+                println!("got = {:?}; expected = {:?}", x, y);
+                assert!(false, "FAIL");
+            }
+        }
+        */
         // Compare expected outputs with actual outputs
         assert!(
             output.zip(expected).all(|(x, y)| x == y),
@@ -597,6 +610,14 @@ pub mod field_tests {
             let output = inputs.iter().zip(shifted_inputs).map(|(x, y)| {
                 field_to_biguint(op(biguint_to_field(x.clone()), biguint_to_field(y.clone())))
             });
+            /*
+            for (x, y) in output.zip(expected) {
+                if x != y {
+                    println!("got = {:?}; expected = {:?}", x, y);
+                    assert!(false, "FAIL");
+                }
+            }
+            */
             // Compare expected outputs with actual outputs
             assert!(
                 output.zip(expected).all(|(x, y)| x == y),
diff --git a/src/field/mod.rs b/src/field/mod.rs
index 3ca6abe..c402272 100644
--- a/src/field/mod.rs
+++ b/src/field/mod.rs
@@ -4,6 +4,7 @@ pub use field::*;
 pub use tweedledee_base::*;
 pub use tweedledum_base::*;
 pub use monty::*;
+pub use daira::*;
 
 mod bls12_377_base;
 mod bls12_377_scalar;
@@ -12,3 +13,4 @@ mod field;
 mod tweedledee_base;
 mod tweedledum_base;
 mod monty;
+mod daira;
diff --git a/src/field/tweedledee_base.rs b/src/field/tweedledee_base.rs
index d491548..2318e32 100644
--- a/src/field/tweedledee_base.rs
+++ b/src/field/tweedledee_base.rs
@@ -1,6 +1,6 @@
 use rand::Rng;
 use std::cmp::Ordering;
-use std::cmp::Ordering::Less;
+use std::cmp::Ordering::{Less, Equal};
 use std::convert::TryInto;
 use std::ops::{Add, Div, Mul, Neg, Sub};
 use std::fmt;
@@ -8,76 +8,69 @@ use std::fmt::{Debug, Display, Formatter};
 
 use crate::{cmp, field_to_biguint,
             rand_range, rand_range_from_rng,
-            MontyRepr, Field};
+            DairaRepr, Field};
 
 /// An element of the Tweedledee group's base field.
 #[derive(Copy, Clone, Eq, PartialEq, Hash, Default)]
+//#[derive(Copy, Clone, Hash, Default)]
 pub struct TweedledeeBase {
-    /// Montgomery representation, encoded with little-endian u64 limbs.
+    /// Daira representation, encoded with little-endian u64 limbs.
     pub limbs: [u64; 4],
 }
 
-impl MontyRepr for TweedledeeBase {
-    /// The order of the field: 28948022309329048855892746252171976963322203655954433126947083963168578338817
+impl DairaRepr for TweedledeeBase {
+    /// ORDER = 2^254 + C
     const ORDER: [u64; 4] = [
-        9524180637049683969,
-        255193519543715529,
-        0,
-        4611686018427387904,
+        0x842cafd400000001,
+        0x38aa127696286c9,
+        0x0,
+        0x4000000000000000
     ];
 
-    /// Twice the order of the field: 57896044618658097711785492504343953926644407311908866253894167926337156677634
-    #[allow(dead_code)]
-    const ORDER_X2: [u64; 4] = [
-        601617200389816322,
-        510387039087431059,
-        0,
-        9223372036854775808,
+    const C: [u64; 2] = [0x842cafd400000001, 0x38aa127696286c9];
+
+    /// Multiplication lookup table: ith element is (i+1)*C^2
+    const C_SQR_TBL: [[u64; 4]; 15] = [
+    [0x8595fa800000001, 0x27e16a565c689523, 0x3f4c0e6fcb03aa0a, 0xc8ad910688601],
+    [0x10b2bf5000000002, 0x4fc2d4acb8d12a46, 0x7e981cdf96075414, 0x1915b220d10c02],
+    [0x190c1ef800000003, 0x77a43f031539bf69, 0xbde42b4f610afe1e, 0x25a08b31399203],
+    [0x21657ea000000004, 0x9f85a95971a2548c, 0xfd3039bf2c0ea828, 0x322b6441a21804],
+    [0x29bede4800000005, 0xc76713afce0ae9af, 0x3c7c482ef7125232, 0x3eb63d520a9e06],
+    [0x32183df000000006, 0xef487e062a737ed2, 0x7bc8569ec215fc3c, 0x4b411662732407],
+    [0x3a719d9800000007, 0x1729e85c86dc13f5, 0xbb14650e8d19a647, 0x57cbef72dbaa08],
+    [0x42cafd4000000008, 0x3f0b52b2e344a918, 0xfa60737e581d5051, 0x6456c883443009],
+    [0x4b245ce800000009, 0x66ecbd093fad3e3b, 0x39ac81ee2320fa5b, 0x70e1a193acb60b],
+    [0x537dbc900000000a, 0x8ece275f9c15d35e, 0x78f8905dee24a465, 0x7d6c7aa4153c0c],
+    [0x5bd71c380000000b, 0xb6af91b5f87e6881, 0xb8449ecdb9284e6f, 0x89f753b47dc20d],
+    [0x64307be00000000c, 0xde90fc0c54e6fda4, 0xf790ad3d842bf879, 0x96822cc4e6480e],
+    [0x6c89db880000000d, 0x6726662b14f92c7, 0x36dcbbad4f2fa284, 0xa30d05d54ece10],
+    [0x74e33b300000000e, 0x2e53d0b90db827ea, 0x7628ca1d1a334c8e, 0xaf97dee5b75411],
+    [0x7d3c9ad80000000f, 0x56353b0f6a20bd0d, 0xb574d88ce536f698, 0xbc22b7f61fda12]
     ];
 
-    /// R in the context of the Montgomery reduction, i.e. 2^256 % |F|.
-    const R: [u64; 4] = [
-        8320946236270051325,
-        17681163515078405027,
-        18446744073709551615,
-        4611686018427387903,
+    /// K = 2*C, M = ORDER, K_M = K*M
+    const K_M: [u64; 6] = [
+        0x10b2bf5000000002, 0x4fc2d4acb8d12a46, 0x7e981cdf96075414,
+        0x801915b220d10c02, 0xc21657ea00000000, 0x1c55093b4b14364
     ];
-
-    /// R^2 in the context of the Montgomery reduction, i.e. 2^(256*2) % |F|.
-    const R2: [u64; 4] = [
-        9625875206237061136,
-        9085631154807722544,
-        17636350113745641634,
-        56485833733595155,
-    ];
-
-    /// R^3 in the context of the Montgomery reduction, i.e. 2^(256*3) % |F|.
-    const R3: [u64; 4] = [
-        11971961131424865118,
-        6311318431551332850,
-        14638507591886519234,
-        739379759776372087,
-    ];
-
-    /// In the context of Montgomery multiplication, µ = -|F|^-1 mod 2^64.
-    const MU: u64 = 9524180637049683967;
 }
 
 impl TweedledeeBase {
     pub fn from_canonical(c: [u64; 4]) -> Self {
-        Self { limbs: Self::from_monty(c) }
+        debug_assert!(cmp(c, Self::ORDER) == Less, "Given non-canonical input: {:?}", c);
+        Self { limbs: c }
     }
 
     pub fn to_canonical(&self) -> [u64; 4] {
-        Self::to_monty(self.limbs)
+        Self::daira_to_canonical(self.limbs)
     }
 }
 
 impl Add<TweedledeeBase> for TweedledeeBase {
     type Output = Self;
 
-    fn add(self, rhs: TweedledeeBase) -> Self::Output {
-        Self { limbs: Self::monty_add(self.limbs, rhs.limbs) }
+    fn add(self, rhs: TweedledeeBase) -> Self {
+        Self { limbs: Self::daira_add(self.limbs, rhs.limbs) }
     }
 }
 
@@ -85,7 +78,7 @@ impl Sub<TweedledeeBase> for TweedledeeBase {
     type Output = Self;
 
     fn sub(self, rhs: Self) -> Self {
-        Self { limbs: Self::monty_sub(self.limbs, rhs.limbs) }
+        Self { limbs: Self::daira_sub(self.limbs, rhs.limbs) }
     }
 }
 
@@ -93,7 +86,7 @@ impl Mul<TweedledeeBase> for TweedledeeBase {
     type Output = Self;
 
     fn mul(self, rhs: Self) -> Self {
-        Self { limbs: Self::monty_multiply(self.limbs, rhs.limbs) }
+        Self { limbs: Self::daira_multiply(self.limbs, rhs.limbs) }
     }
 }
 
@@ -109,49 +102,22 @@ impl Neg for TweedledeeBase {
     type Output = Self;
 
     fn neg(self) -> Self {
-        Self { limbs: Self::monty_neg(self.limbs) }
+        Self { limbs: Self::daira_neg(self.limbs) }
     }
 }
 
 impl Field for TweedledeeBase {
     const BITS: usize = 255;
     const BYTES: usize = 32;
-    const ZERO: Self = Self { limbs: <Self as MontyRepr>::ZERO };
-    const ONE: Self = Self { limbs: <Self as MontyRepr>::ONE };
-    const TWO: Self = Self {
-        limbs: [
-            7117711835490418681,
-            16660389436903542909,
-            18446744073709551615,
-            4611686018427387903,
-        ],
-    };
-    const THREE: Self = Self {
-        limbs: [
-            5914477434710786037,
-            15639615358728680791,
-            18446744073709551615,
-            4611686018427387903,
-        ],
-    };
-    const FOUR: Self = Self {
-        limbs: [
-            4711243033931153393,
-            14618841280553818673,
-            18446744073709551615,
-            4611686018427387903,
-        ],
-    };
-    const FIVE: Self = Self {
-        limbs: [
-            3508008633151520749,
-            13598067202378956555,
-            18446744073709551615,
-            4611686018427387903,
-        ],
-    };
+    const ZERO: Self = Self { limbs: <Self as DairaRepr>::ZERO };
+    const ONE: Self = Self { limbs: <Self as DairaRepr>::ONE };
+    const TWO: Self = Self { limbs: [2u64, 0, 0, 0] };
+    const THREE: Self = Self { limbs: [3u64, 0, 0, 0] };
+    const FOUR: Self = Self { limbs: [4u64, 0, 0, 0] };
+    const FIVE: Self = Self { limbs: [5u64, 0, 0, 0] };
+
     const NEG_ONE: Self = Self {
-        limbs: [1203234400779632644, 1020774078174862118, 0, 0],
+        limbs: [0x842cafd400000000, 0x38aa127696286c9, 0x0, 0x4000000000000000]
     };
 
     const MULTIPLICATIVE_SUBGROUP_GENERATOR: Self = Self::FIVE;
@@ -162,12 +128,7 @@ impl Field for TweedledeeBase {
 
     /// 1684996666696914987166688442938726917102595538363933628829375605749
     const T: Self = Self {
-        limbs: [
-            9524180637049683969,
-            255193519543715529,
-            0,
-            4611686017353646080,
-        ],
+        limbs: [0xda58a1b2610b2bf5, 0xe2a849, 0x0, 0x10000000]
     };
 
     fn to_canonical_u64_vec(&self) -> Vec<u64> {
@@ -188,7 +149,7 @@ impl Field for TweedledeeBase {
 
     fn multiplicative_inverse_assuming_nonzero(&self) -> Self {
         Self {
-            limbs: Self::monty_inverse(self.limbs)
+            limbs: Self::daira_inverse(self.limbs)
         }
     }
 
@@ -207,7 +168,7 @@ impl Field for TweedledeeBase {
     #[inline(always)]
     fn square(&self) -> Self {
         Self {
-            limbs: Self::monty_square(self.limbs),
+            limbs: Self::daira_square(self.limbs),
         }
     }
 }
@@ -224,6 +185,16 @@ impl PartialOrd for TweedledeeBase {
     }
 }
 
+/*
+impl PartialEq for TweedledeeBase {
+    fn eq(&self, other: &Self) -> bool {
+        self.cmp_helper(other) == Equal
+    }
+}
+
+impl Eq for TweedledeeBase { }
+*/
+
 impl Display for TweedledeeBase {
     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
         write!(f, "{}", field_to_biguint(*self))
@@ -232,7 +203,8 @@ impl Display for TweedledeeBase {
 
 impl Debug for TweedledeeBase {
     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
-        write!(f, "TweedledeeBase {}", field_to_biguint(*self))
+        //write!(f, "TweedledeeBase {}", field_to_biguint(*self))
+        write!(f, "TweedledeeBase{:?}", self.limbs)
     }
 }
 
@@ -241,7 +213,6 @@ mod tests {
     use crate::test_arithmetic;
     use crate::Field;
     use crate::TweedledeeBase;
-    use crate::MontyRepr; // This is just to access ORDER_X2.
 
     #[test]
     fn primitive_root_order() {
@@ -257,9 +228,6 @@ mod tests {
         let small = <TweedledeeBase as Field>::ONE.to_canonical_u64_vec();
         assert!(TweedledeeBase::is_valid_canonical_u64(&small));
 
-        let big = TweedledeeBase::ORDER_X2.to_vec();
-        assert_eq!(TweedledeeBase::is_valid_canonical_u64(&big), false);
-
         let limbs = vec![1, 2, 3, 4, 5];
         assert_eq!(TweedledeeBase::is_valid_canonical_u64(&limbs), false);
     }

From 5f9b93a1fb5b5e0c9a2c252a7a2e3dec00242e2c Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish@ivey-law.name>
Date: Tue, 19 Jan 2021 20:11:53 +1100
Subject: [PATCH 04/10] Fastest compilation flags with Daira (and Monty)
 reduction.

---
 Cargo.toml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Cargo.toml b/Cargo.toml
index 56bbf9e..db20cd2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -48,6 +48,10 @@ harness = false
 name = "tweedledee_base"
 harness = false
 
+[[bench]]
+name = "tweedledum_base"
+harness = false
+
 [[bench]]
 name = "bls12_g1"
 harness = false
@@ -66,6 +70,10 @@ harness = false
 
 [profile.release]
 opt-level = 3
+lto = "fat"
+codegen-units = 1
 
 [profile.bench]
 opt-level = 3
+lto = "fat"
+codegen-units = 1

From 4a490056de61151e4adf44476f32f5fb700a8914 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish@ivey-law.name>
Date: Tue, 19 Jan 2021 21:09:49 +1100
Subject: [PATCH 05/10] Remove unnecessary reductions where possible.

---
 src/field/daira.rs | 44 +++++++++++---------------------------------
 1 file changed, 11 insertions(+), 33 deletions(-)

diff --git a/src/field/daira.rs b/src/field/daira.rs
index c8f57f3..4ee3418 100644
--- a/src/field/daira.rs
+++ b/src/field/daira.rs
@@ -241,34 +241,31 @@ pub trait DairaRepr {
     // so reducing is not always necessary.
     fn daira_add(lhs: [u64; 4], rhs: [u64; 4]) -> [u64; 4] {
         let sum = add_no_overflow(lhs, rhs);
-        let res = if cmp(sum, Self::ORDER) == Less {
+        if cmp(sum, Self::ORDER) == Less {
             sum
         } else {
             sub(sum, Self::ORDER)
-        };
-        Self::daira_to_canonical(res)
+        }
     }
 
     // TODO: This is copypasta from monty.rs
     fn daira_sub(lhs: [u64; 4], rhs: [u64; 4]) -> [u64; 4] {
-        let res = if cmp(lhs, rhs) == Less {
+        if cmp(lhs, rhs) == Less {
             // Underflow occurs, so we compute the difference as `self + (-rhs)`.
             add_no_overflow(lhs, Self::daira_neg(rhs))
         } else {
             // No underflow, so it's faster to subtract directly.
             sub(lhs, rhs)
-        };
-        Self::daira_to_canonical(res)
+        }
     }
 
     // TODO: This is copypasta from monty.rs
     fn daira_neg(limbs: [u64; 4]) -> [u64; 4] {
-        let res = if limbs == Self::ZERO {
+        if limbs == Self::ZERO {
             Self::ZERO
         } else {
             sub(Self::ORDER, limbs)
-        };
-        Self::daira_to_canonical(res)
+        }
     }
 
     #[inline]
@@ -293,7 +290,6 @@ pub trait DairaRepr {
         let u = mul_2_2(Self::C, xp1);
 
         // return M - u + xp0
-        //Self::daira_to_canonical(add_no_overflow(sub(Self::ORDER, u), xp0))
         let res = add_no_overflow(sub(Self::ORDER, u), xp0);
         let max_expected = [Self::ORDER[0] - 1, Self::ORDER[1], Self::ORDER[2], Self::ORDER[3] << 1];
         debug_assert!(cmp(res, max_expected) != Greater);
@@ -315,30 +311,12 @@ pub trait DairaRepr {
     }
 
     fn daira_to_canonical(a: [u64; 4]) -> [u64; 4] {
-        let mut b = a;
-        let mut count = 0;
-        while cmp(b, Self::ORDER) != Less && count < 10 {
-            b = sub(b, Self::ORDER);
-            count += 1;
-        }
-        //println!("Reducing ({}) {:?} -> {:?}", count, a, b);
-        /*
-        if count > 1 {
-            println!("Expected at most one reduction loop but did {}", count);
-        }
-        */
-        debug_assert!(count < 2, "Expected at most one reduction loop");
-        b
-    }
-
-    /*
-    fn daira_to_canonical(a: [u64; 4]) -> [u64; 4] {
-        let mut b = a;
-        if cmp(b, Self::ORDER) != Less {
-            b = sub(b, Self::ORDER);
+        if cmp(a, Self::ORDER) == Less {
+            a
+        } else {
+            let b = sub(a, Self::ORDER);
             debug_assert!(cmp(b, Self::ORDER) == Less, "Expected at most one reduction loop");
+            b
         }
-        b
     }
-    */
 }

From 5ad97f60cc206ba4b402908d3326cea674228a37 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish@ivey-law.name>
Date: Tue, 19 Jan 2021 21:49:32 +1100
Subject: [PATCH 06/10] Remove final reduction from _reduce(); also some
 tidying.

---
 src/field/daira.rs           | 14 ++++++++++----
 src/field/field.rs           |  2 --
 src/field/tweedledee_base.rs |  8 ++++----
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/field/daira.rs b/src/field/daira.rs
index 4ee3418..9894ea3 100644
--- a/src/field/daira.rs
+++ b/src/field/daira.rs
@@ -291,9 +291,14 @@ pub trait DairaRepr {
 
         // return M - u + xp0
         let res = add_no_overflow(sub(Self::ORDER, u), xp0);
-        let max_expected = [Self::ORDER[0] - 1, Self::ORDER[1], Self::ORDER[2], Self::ORDER[3] << 1];
-        debug_assert!(cmp(res, max_expected) != Greater);
-        Self::daira_to_canonical(res)
+        // max_expected = 2^254 + M - 1 = 2^255 + c - 1
+        let max_expected = [
+            Self::ORDER[0] - 1, Self::ORDER[1],
+            Self::ORDER[2], Self::ORDER[3] << 1
+        ];
+        debug_assert!(cmp(res, max_expected) != Greater,
+                      "Semi-reduced value exceeds maximum expected");
+        res
     }
 
     #[inline]
@@ -315,7 +320,8 @@ pub trait DairaRepr {
             a
         } else {
             let b = sub(a, Self::ORDER);
-            debug_assert!(cmp(b, Self::ORDER) == Less, "Expected at most one reduction loop");
+            debug_assert!(cmp(b, Self::ORDER) == Less,
+                          "Expected at most one reduction loop");
             b
         }
     }
diff --git a/src/field/field.rs b/src/field/field.rs
index 46ffeb6..ab4e8cc 100644
--- a/src/field/field.rs
+++ b/src/field/field.rs
@@ -409,10 +409,8 @@ pub trait Field:
     /// Like `Ord::cmp`. We can't implement `Ord` directly due to Rust's trait coherence rules, so
     /// instead we provide this helper which implementations can use to trivially implement `Ord`.
     fn cmp_helper(&self, other: &Self) -> Ordering {
-        //println!("Before");
         let self_limbs = self.to_canonical_u64_vec().into_iter();
         let other_limbs = other.to_canonical_u64_vec().into_iter();
-        //println!("After");
 
         let mut result = Equal;
         for (self_limb, other_limb) in self_limbs.zip(other_limbs) {
diff --git a/src/field/tweedledee_base.rs b/src/field/tweedledee_base.rs
index 2318e32..087b2b6 100644
--- a/src/field/tweedledee_base.rs
+++ b/src/field/tweedledee_base.rs
@@ -11,8 +11,7 @@ use crate::{cmp, field_to_biguint,
             DairaRepr, Field};
 
 /// An element of the Tweedledee group's base field.
-#[derive(Copy, Clone, Eq, PartialEq, Hash, Default)]
-//#[derive(Copy, Clone, Hash, Default)]
+#[derive(Copy, Clone, Hash, Default)]
 pub struct TweedledeeBase {
     /// Daira representation, encoded with little-endian u64 limbs.
     pub limbs: [u64; 4],
@@ -185,7 +184,8 @@ impl PartialOrd for TweedledeeBase {
     }
 }
 
-/*
+/// NB: We need to implement this so that is_zero and == call
+/// to_canonical (via cmp_helper) before comparing numbers.
 impl PartialEq for TweedledeeBase {
     fn eq(&self, other: &Self) -> bool {
         self.cmp_helper(other) == Equal
@@ -193,7 +193,7 @@ impl PartialEq for TweedledeeBase {
 }
 
 impl Eq for TweedledeeBase { }
-*/
+
 
 impl Display for TweedledeeBase {
     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {

From bb4191cf789e3052337e7cdd37fa3b3a6607b154 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish@ivey-law.name>
Date: Tue, 19 Jan 2021 22:26:41 +1100
Subject: [PATCH 07/10] Reinstate final reduction to make tests from other
 crates pass.

---
 src/field/daira.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/field/daira.rs b/src/field/daira.rs
index 9894ea3..7eb4cfd 100644
--- a/src/field/daira.rs
+++ b/src/field/daira.rs
@@ -298,7 +298,10 @@ pub trait DairaRepr {
         ];
         debug_assert!(cmp(res, max_expected) != Greater,
                       "Semi-reduced value exceeds maximum expected");
-        res
+        // FIXME: This is not necessary in general and it wastes
+        // performance; only included to make tests of calling code
+        // pass (which require values to always be reduced).
+        Self::daira_to_canonical(res)
     }
 
     #[inline]

From 21741d8ca8afd3d051e2cecb6e7242934c97fd54 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish@ivey-law.name>
Date: Tue, 19 Jan 2021 22:31:44 +1100
Subject: [PATCH 08/10] Add missing file.

---
 benches/tweedledum_base.rs | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 benches/tweedledum_base.rs

diff --git a/benches/tweedledum_base.rs b/benches/tweedledum_base.rs
new file mode 100644
index 0000000..33d49e9
--- /dev/null
+++ b/benches/tweedledum_base.rs
@@ -0,0 +1,37 @@
+use criterion::{black_box, Criterion};
+use criterion::criterion_group;
+use criterion::criterion_main;
+
+use plonky::{Field, TweedledumBase};
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let x = TweedledumBase::from_canonical([11111111, 22222222, 33333333, 44444444]);
+    let y = TweedledumBase::from_canonical([44444444, 55555555, 66666666, 77777777]);
+
+    c.bench_function("TweedledumBase field addition", move |b| b.iter(|| {
+        black_box(y) + black_box(x)
+    }));
+
+    c.bench_function("TweedledumBase field subtraction (no underflow)", move |b| b.iter(|| {
+        black_box(y) - black_box(x)
+    }));
+
+    c.bench_function("TweedledumBase field subtraction (underflow)", move |b| b.iter(|| {
+        black_box(x) - black_box(y)
+    }));
+
+    c.bench_function("TweedledumBase field multiplication", move |b| b.iter(|| {
+        black_box(x) * black_box(y)
+    }));
+
+    c.bench_function("TweedledumBase field squaring", move |b| b.iter(|| {
+        black_box(x).square()
+    }));
+
+    c.bench_function("TweedledumBase field inversion", move |b| b.iter(|| {
+        black_box(x).multiplicative_inverse()
+    }));
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);

From ea189cfef4b3248b41e918d951ec90a921638630 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish@ivey-law.name>
Date: Wed, 20 Jan 2021 22:38:06 +1100
Subject: [PATCH 09/10] Switch explicit constants into raw form from Monty
 form.

---
 src/curve/tweedledee_curve.rs | 8 ++++----
 src/curve/tweedledum_curve.rs | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/curve/tweedledee_curve.rs b/src/curve/tweedledee_curve.rs
index 159c055..da44973 100644
--- a/src/curve/tweedledee_curve.rs
+++ b/src/curve/tweedledee_curve.rs
@@ -21,10 +21,10 @@ impl Curve for Tweedledee {
 impl HaloCurve for Tweedledee {
     const ZETA: Self::BaseField = TweedledeeBase {
         limbs: [
-            1444470991491022206,
-            3301226169728360777,
-            72516509137424193,
-            708688398506307241,
+            4869731214443914493,
+            7624468717990391489,
+            9776623610414440446,
+            3946962219815967320
         ],
     };
     const ZETA_SCALAR: Self::ScalarField = TweedledumBase {
diff --git a/src/curve/tweedledum_curve.rs b/src/curve/tweedledum_curve.rs
index 899875b..0bbf78c 100644
--- a/src/curve/tweedledum_curve.rs
+++ b/src/curve/tweedledum_curve.rs
@@ -44,10 +44,10 @@ impl HaloCurve for Tweedledum {
     };
     const ZETA_SCALAR: Self::ScalarField = TweedledeeBase {
         limbs: [
-            9282944046338294407,
-            16421485501699768486,
-            18374227564572127422,
-            3902997619921080662,
+            4654449422605769475,
+            11077468875262875656,
+            8670120463295111169,
+            664723798611420583
         ]
     };
 }

From c2785b053b52df02921102c80f7d7d6d9a513a28 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish@ivey-law.name>
Date: Thu, 21 Jan 2021 21:29:16 +1100
Subject: [PATCH 10/10] Tidying up and comments.

---
 src/field/daira.rs           | 19 ++++++++++++++-----
 src/field/field.rs           | 25 ++++---------------------
 src/field/tweedledee_base.rs |  3 +--
 3 files changed, 19 insertions(+), 28 deletions(-)

diff --git a/src/field/daira.rs b/src/field/daira.rs
index 7eb4cfd..81b7cb1 100644
--- a/src/field/daira.rs
+++ b/src/field/daira.rs
@@ -222,7 +222,8 @@ fn rebase_6(x: [u64; 6]) -> ([u64; 4], [u64; 2]) {
 /// adaptation of the tricks for moduli of the form 2^n + C.
 /// Source: https://hackmd.io/drzN-z-_So28zDLhK2tegw
 pub trait DairaRepr {
-    /// Order of the field (i.e. modulus for operations)
+    /// Order of the field (i.e. modulus for operations); equals 2^n +
+    /// C for some n and C.
     const ORDER: [u64; 4];
 
     const ZERO: [u64; 4] = [0u64; 4];
@@ -238,7 +239,7 @@ pub trait DairaRepr {
 
     // TODO: This is copypasta from monty.rs
     // TODO: Daira's representation actually allows for some redundancy,
-    // so reducing is not always necessary.
+    // so reducing is not always necessary; same in sub
     fn daira_add(lhs: [u64; 4], rhs: [u64; 4]) -> [u64; 4] {
         let sum = add_no_overflow(lhs, rhs);
         if cmp(sum, Self::ORDER) == Less {
@@ -268,6 +269,11 @@ pub trait DairaRepr {
         }
     }
 
+    /// Given an 8-word number (usually the result of multiplying or
+    /// squaring), reduce it modulo 2^n + C.
+    ///
+    /// The implementation is a direct translation of the formulae at
+    /// the end of https://hackmd.io/drzN-z-_So28zDLhK2tegw
     #[inline]
     fn _reduce(x: [u64; 8]) -> [u64; 4] {
         // x = (x0, x1, x2)
@@ -298,9 +304,12 @@ pub trait DairaRepr {
         ];
         debug_assert!(cmp(res, max_expected) != Greater,
                       "Semi-reduced value exceeds maximum expected");
-        // FIXME: This is not necessary in general and it wastes
-        // performance; only included to make tests of calling code
-        // pass (which require values to always be reduced).
+        // NB: This is not necessary in general; a different interface
+        // could accommodate semi-reduced results.  Some performance
+        // testing suggested that the gain (as a proportion of
+        // mul/sqr) were too small to notice though. Must be included
+        // at the moment to make tests of calling code pass (which
+        // require values to always be reduced).
         Self::daira_to_canonical(res)
     }
 
diff --git a/src/field/field.rs b/src/field/field.rs
index f722426..03ee586 100644
--- a/src/field/field.rs
+++ b/src/field/field.rs
@@ -388,7 +388,9 @@ pub trait Field:
         if exp == Self::NEG_ONE {
             return false;
         }
-        panic!("Number theory is a lie! Expected {:?}^{:?} == 1 or p-1 but got {:?}", self, power, exp)
+        panic!("Number theory is a lie! \
+                Expected {:?}^{:?} == 1 or p-1 \
+                but got {:?}", self, power, exp)
     }
 
     /// The number of bits in the binary encoding of this field element.
@@ -439,10 +441,7 @@ pub trait Field:
     /// roots, otherwise return `None`.
     /// Inspired by implementation in https://github.com/scipr-lab/zexe/blob/85bae796a411077733ddeefda042d02f4b4772e5/algebra-core/src/fields/arithmetic.rs
     fn square_root(&self) -> Option<Self> {
-        println!("square root input: {:?}", self);
-        let r = self.is_zero();
-        println!("  is zero? {}", r);
-        if r {
+        if self.is_zero() {
             Some(*self)
         } else if self.is_quadratic_residue() {
             let mut z = Self::MULTIPLICATIVE_SUBGROUP_GENERATOR.exp(Self::T);
@@ -569,14 +568,6 @@ pub mod field_tests {
         let output = inputs
             .iter()
             .map(|x| field_to_biguint(op(biguint_to_field(x.clone()))));
-        /*
-        for (x, y) in output.zip(expected) {
-            if x != y {
-                println!("got = {:?}; expected = {:?}", x, y);
-                assert!(false, "FAIL");
-            }
-        }
-        */
         // Compare expected outputs with actual outputs
         assert!(
             output.zip(expected).all(|(x, y)| x == y),
@@ -616,14 +607,6 @@ pub mod field_tests {
             let output = inputs.iter().zip(shifted_inputs).map(|(x, y)| {
                 field_to_biguint(op(biguint_to_field(x.clone()), biguint_to_field(y.clone())))
             });
-            /*
-            for (x, y) in output.zip(expected) {
-                if x != y {
-                    println!("got = {:?}; expected = {:?}", x, y);
-                    assert!(false, "FAIL");
-                }
-            }
-            */
             // Compare expected outputs with actual outputs
             assert!(
                 output.zip(expected).all(|(x, y)| x == y),
diff --git a/src/field/tweedledee_base.rs b/src/field/tweedledee_base.rs
index 087b2b6..85ec10b 100644
--- a/src/field/tweedledee_base.rs
+++ b/src/field/tweedledee_base.rs
@@ -203,8 +203,7 @@ impl Display for TweedledeeBase {
 
 impl Debug for TweedledeeBase {
     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
-        //write!(f, "TweedledeeBase {}", field_to_biguint(*self))
-        write!(f, "TweedledeeBase{:?}", self.limbs)
+        write!(f, "TweedledeeBase {}", field_to_biguint(*self))
     }
 }