From 790745e2b397a86af824b3ec1c9a4df68911f52d Mon Sep 17 00:00:00 2001
From: Michael Hamburg <mike@shiftleft.org>
Date: Fri, 29 Jan 2016 12:57:27 -0800
Subject: [PATCH] Restrict the gf_mulw multiplier to <32 bits instead of <64
 bits (in practice even less: one limb instead of two). Also, there is a bug
 when ed448 is compiled for arch_32 on a 64-bit machine; still tracing it.

---
 src/decaf.c                     | 39 ++++++++++---------------
 src/gen_headers/f_field_h.py    |  2 +-
 src/p25519/arch_32/f_impl.c     |  2 +-
 src/p25519/arch_ref64/f_impl.c  |  2 +-
 src/p25519/arch_x86_64/f_impl.c | 26 ++++++++++-------
 src/p448/arch_32/f_impl.c       | 17 ++++-------
 src/p448/arch_32/f_impl.h       |  4 +--
 src/p448/arch_arm_32/f_impl.c   | 52 ++++++++++++---------------------
 src/p448/arch_neon/f_impl.c     | 11 +++----
 src/p448/arch_ref64/f_impl.c    |  2 +-
 src/p448/arch_x86_64/f_impl.c   |  2 +-
 src/per_field.c                 |  2 +-
 12 files changed, 67 insertions(+), 94 deletions(-)

diff --git a/src/decaf.c b/src/decaf.c
index ae201b3..f4bb0a9 100644
--- a/src/decaf.c
+++ b/src/decaf.c
@@ -109,13 +109,6 @@ cond_neg(gf x, mask_t neg) {
 static INLINE void
 cond_swap(gf x, gf_s *__restrict__ y, mask_t swap) {
     constant_time_cond_swap(x,y,sizeof(gf_s),swap);
-    /*
-    UNROLL for (unsigned int i=0; i<sizeof(x->limb)/sizeof(x->limb[0]); i++) {
-        decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap;
-        x->limb[i] ^= s;
-        y->limb[i] ^= s;
-    }
-    */
 }
 
 /** Inverse square root using addition chain. */
@@ -133,7 +126,7 @@ static void
 gf_invert(gf y, const gf x) {
     gf t1, t2;
     gf_sqr(t1, x); // o^2
-    decaf_bool_t ret = gf_isqrt_chk(t2, t1, 0); // +-1/sqrt(o^2) = +-1/o
+    mask_t ret = gf_isqrt_chk(t2, t1, 0); // +-1/sqrt(o^2) = +-1/o
     (void)ret; assert(ret);
     gf_sqr(t1, t2);
     gf_mul(t2, t1, x); // not direct to y in case of alias.
@@ -142,7 +135,7 @@ gf_invert(gf y, const gf x) {
 
 /** Mul by signed int.  Not constant-time WRT the sign of that int. */
 static INLINE void
-gf_mulw_sgn(gf c, const gf a, int w) {
+gf_mulw_sgn(gf c, const gf a, int32_t w) {
     if (w>0) {
         gf_mulw(c, a, w);
     } else {
@@ -152,7 +145,7 @@ gf_mulw_sgn(gf c, const gf a, int w) {
 }
 
 /** Return high bit of x = low bit of 2x mod p */
-static decaf_word_t hibit(const gf x) {
+static mask_t hibit(const gf x) {
     gf y;
     gf_add(y,x,x);
     gf_strong_reduce(y);
@@ -161,7 +154,7 @@ static decaf_word_t hibit(const gf x) {
 
 #if COFACTOR==8
 /** Return high bit of x = low bit of 2x mod p */
-static decaf_word_t lobit(const gf x) {
+static mask_t lobit(const gf x) {
     gf y;
     gf_copy(y,x);
     gf_strong_reduce(y);
@@ -873,9 +866,9 @@ static INLINE void
 constant_time_lookup_xx (
     void *__restrict__ out_,
     const void *table_,
-    decaf_word_t elem_bytes,
-    decaf_word_t n_table,
-    decaf_word_t idx
+    word_t elem_bytes,
+    word_t n_table,
+    word_t idx
 ) {
     constant_time_lookup(out_,table_,elem_bytes,n_table,idx);
 }
@@ -928,12 +921,12 @@ void API_NS(point_scalarmul) (
 
     for (; i>=0; i-=WINDOW) {
         /* Fetch another block of bits */
-        decaf_word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS);
+        word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS);
         if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
             bits ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
         }
         bits &= WINDOW_MASK;
-        decaf_word_t inv = (bits>>(WINDOW-1))-1;
+        mask_t inv = (bits>>(WINDOW-1))-1;
         bits ^= inv;
     
         /* Add in from table.  Compute t only on last iteration. */
@@ -993,7 +986,7 @@ void API_NS(point_double_scalarmul) (
 
     for (; i>=0; i-=WINDOW) {
         /* Fetch another block of bits */
-        decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
+        word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
                      bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS);
         if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
             bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
@@ -1001,8 +994,8 @@ void API_NS(point_double_scalarmul) (
         }
         bits1 &= WINDOW_MASK;
         bits2 &= WINDOW_MASK;
-        decaf_word_t inv1 = (bits1>>(WINDOW-1))-1;
-        decaf_word_t inv2 = (bits2>>(WINDOW-1))-1;
+        mask_t inv1 = (bits1>>(WINDOW-1))-1;
+        mask_t inv2 = (bits2>>(WINDOW-1))-1;
         bits1 ^= inv1;
         bits2 ^= inv2;
     
@@ -1079,16 +1072,16 @@ void API_NS(point_dual_scalarmul) (
         }
         
         /* Fetch another block of bits */
-        decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
-                     bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS);
+        word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
+               bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS);
         if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
             bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
             bits2 ^= scalar2x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
         }
         bits1 &= WINDOW_MASK;
         bits2 &= WINDOW_MASK;
-        decaf_word_t inv1 = (bits1>>(WINDOW-1))-1;
-        decaf_word_t inv2 = (bits2>>(WINDOW-1))-1;
+        mask_t inv1 = (bits1>>(WINDOW-1))-1;
+        mask_t inv2 = (bits2>>(WINDOW-1))-1;
         bits1 ^= inv1;
         bits2 ^= inv2;
         
diff --git a/src/gen_headers/f_field_h.py b/src/gen_headers/f_field_h.py
index abc29d6..fa3fa5b 100644
--- a/src/gen_headers/f_field_h.py
+++ b/src/gen_headers/f_field_h.py
@@ -66,7 +66,7 @@ void gf_strong_reduce (gf inout);
 void gf_add (gf out, const gf a, const gf b);
 void gf_sub (gf out, const gf a, const gf b);
 void gf_mul (gf_s *__restrict__ out, const gf a, const gf b);
-void gf_mulw (gf_s *__restrict__ out, const gf a, uint64_t b);
+void gf_mulw (gf_s *__restrict__ out, const gf a, uint32_t b);
 void gf_sqr (gf_s *__restrict__ out, const gf a);
 void gf_serialize (uint8_t *serial, const gf x);
 void gf_isr(gf a, const gf x); /** a^2 x = 1, QNR, or 0 if x=0 */
diff --git a/src/p25519/arch_32/f_impl.c b/src/p25519/arch_32/f_impl.c
index 7c9ab84..656d9f7 100644
--- a/src/p25519/arch_32/f_impl.c
+++ b/src/p25519/arch_32/f_impl.c
@@ -51,7 +51,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[1] += accum;
 }
 
-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint32_t *a = as->limb, maske = ((1<<26)-1), masko = ((1<<25)-1);
     uint32_t blo = b & maske, bhi = b>>26, bhi2 = 2*bhi;
     uint32_t *c = cs->limb;
diff --git a/src/p25519/arch_ref64/f_impl.c b/src/p25519/arch_ref64/f_impl.c
index e8cd206..1f0e22d 100644
--- a/src/p25519/arch_ref64/f_impl.c
+++ b/src/p25519/arch_ref64/f_impl.c
@@ -34,7 +34,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[1] += accum;
 }
 
-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
     int i;
     
diff --git a/src/p25519/arch_x86_64/f_impl.c b/src/p25519/arch_x86_64/f_impl.c
index 2f94164..1ae69ef 100644
--- a/src/p25519/arch_x86_64/f_impl.c
+++ b/src/p25519/arch_x86_64/f_impl.c
@@ -4,6 +4,7 @@
 
 #include "f_field.h"
 
+/** Requires: input limbs < 9*2^51 */
 void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
     uint64_t *c = cs->limb;
@@ -65,6 +66,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     
     ai = a[4];
     mac_rm(&accum1, ai, &b[0]);
+    /* Here accum1 < 5*(9*2^51)^2 */
     
     c[3] = accum0 & mask;
     accum1 += shrld(accum0, 51);
@@ -72,13 +74,16 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     
     /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
      * = 2^(-13 + <13)
-     * PERF: good enough to fit into uint64_t?
+     * PERF: good enough to fit into uint64_t.
      */
     
     uint64_t a1 = shrld(accum1,51);
-    accum1 = (__uint128_t)a1 * 19 + c0;
+    /* Here a1 < (5*(9*2^51)^2 + small) >> 51 = 405 * 2^51 + small
+     * a1 * 19 + c0 < (405*19+1)*2^51 + small < 2^13 * 2^51.
+     */
+    accum1 = a1 * 19 + c0;
     c[0] = accum1 & mask;
-    c[1] = c1 + shrld(accum1,51);
+    c[1] = c1 + (accum1>>51);
 }
 
 void gf_sqr (gf_s *__restrict__ cs, const gf as) {
@@ -132,16 +137,15 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
     
     /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
      * = 2^(-13 + <13)
-     * PERF: good enough to fit into uint64_t?
      */
     
     uint64_t a1 = shrld(accum1,51);
-    accum1 = (__uint128_t)a1 * 19 + c0;
+    accum1 = a1 * 19 + c0;
     c[0] = accum1 & mask;
-    c[1] = c1 + shrld(accum1,51);
+    c[1] = c1 + (accum1>>51);
 }
 
-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
     uint64_t *c = cs->limb;
 
@@ -164,9 +168,9 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
     mac_rm(&accum, b, &a[4]);
     c[4] = accum & mask;
 
-    accum = shrld(accum,51);
-    accum = accum * 19 + c0;
+    uint64_t a1 = shrld(accum,51);
+    a1 = a1*19+c0;
     
-    c[0] = accum & mask;
-    c[1] = c1 + shrld(accum,51);
+    c[0] = a1 & mask;
+    c[1] = c1 + (a1>>51);
 }
diff --git a/src/p448/arch_32/f_impl.c b/src/p448/arch_32/f_impl.c
index f70b236..a07aae5 100644
--- a/src/p448/arch_32/f_impl.c
+++ b/src/p448/arch_32/f_impl.c
@@ -60,8 +60,8 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[1] += ((uint32_t)(accum1));
 }
 
-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
-    const uint32_t bhi = b>>28, blo = b & ((1<<28)-1);
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
+    assert(b<1<<28);
     
     const uint32_t *a = as->limb;
     uint32_t *c = cs->limb;
@@ -71,20 +71,15 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
 
     int i;
 
-    accum0 = widemul(blo, a[0]);
-    accum8 = widemul(blo, a[8]);
-    accum0 += widemul(bhi, a[15]);
-    accum8 += widemul(bhi, a[15] + a[7]);
+    accum0 = widemul(b, a[0]);
+    accum8 = widemul(b, a[8]);
 
     c[0] = accum0 & mask; accum0 >>= 28;
     c[8] = accum8 & mask; accum8 >>= 28;
     
     for (i=1; i<8; i++) {
-        accum0 += widemul(blo, a[i]);
-        accum8 += widemul(blo, a[i+8]);
-        
-        accum0 += widemul(bhi, a[i-1]);
-        accum8 += widemul(bhi, a[i+7]);
+        accum0 += widemul(b, a[i]);
+        accum8 += widemul(b, a[i+8]);
 
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
diff --git a/src/p448/arch_32/f_impl.h b/src/p448/arch_32/f_impl.h
index 330a29c..7eae599 100644
--- a/src/p448/arch_32/f_impl.h
+++ b/src/p448/arch_32/f_impl.h
@@ -43,8 +43,8 @@ void gf_bias (gf a, int amt) {
 }
 
 void gf_weak_reduce (gf a) {
-    uint64_t mask = (1ull<<28) - 1;
-    uint64_t tmp = a->limb[15] >> 28;
+    uint32_t mask = (1ull<<28) - 1;
+    uint32_t tmp = a->limb[15] >> 28;
     a->limb[8] += tmp;
     for (unsigned int i=15; i>0; i--) {
         a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28);
diff --git a/src/p448/arch_arm_32/f_impl.c b/src/p448/arch_arm_32/f_impl.c
index ddb0494..887c083 100644
--- a/src/p448/arch_arm_32/f_impl.c
+++ b/src/p448/arch_arm_32/f_impl.c
@@ -724,10 +724,10 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
 void gf_mulw (
     gf_s *__restrict__ cs,
     const gf as,
-    uint64_t b
+    uint32_t b
 ) {
     uint32_t mask = (1ull<<28)-1;  
-    const uint32_t bhi = b>>28, blo = b & mask;
+    assert(b <= mask);
     
     const uint32_t *a = as->limb;
     uint32_t *c = cs->limb;
@@ -737,11 +737,9 @@ void gf_mulw (
     int i;
 
     uint32_t c0, c8, n0, n8;
-    accum0 = widemul(bhi, a[15]);
-    accum8 = widemul(bhi, a[15] + a[7]);
     c0 = a[0]; c8 = a[8];
-    smlal(&accum0, blo, c0);
-    smlal(&accum8, blo, c8);
+    accum0 = widemul(b, c0);
+    accum8 = widemul(b, c8);
 
     c[0] = accum0 & mask; accum0 >>= 28;
     c[8] = accum8 & mask; accum8 >>= 28;
@@ -749,10 +747,8 @@ void gf_mulw (
     i=1;
     {
         n0 = a[i]; n8 = a[i+8];
-        smlal(&accum0, bhi, c0);
-        smlal(&accum8, bhi, c8);
-        smlal(&accum0, blo, n0);
-        smlal(&accum8, blo, n8);
+        smlal(&accum0, b, n0);
+        smlal(&accum8, b, n8);
         
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -760,10 +756,8 @@ void gf_mulw (
     }
     {
         c0 = a[i]; c8 = a[i+8];
-        smlal(&accum0, bhi, n0);
-        smlal(&accum8, bhi, n8);
-        smlal(&accum0, blo, c0);
-        smlal(&accum8, blo, c8);
+        smlal(&accum0, b, c0);
+        smlal(&accum8, b, c8);
 
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -771,10 +765,8 @@ void gf_mulw (
     }
     {
         n0 = a[i]; n8 = a[i+8];
-        smlal(&accum0, bhi, c0);
-        smlal(&accum8, bhi, c8);
-        smlal(&accum0, blo, n0);
-        smlal(&accum8, blo, n8);
+        smlal(&accum0, b, n0);
+        smlal(&accum8, b, n8);
 
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -782,10 +774,8 @@ void gf_mulw (
     }
     {
         c0 = a[i]; c8 = a[i+8];
-        smlal(&accum0, bhi, n0);
-        smlal(&accum8, bhi, n8);
-        smlal(&accum0, blo, c0);
-        smlal(&accum8, blo, c8);
+        smlal(&accum0, b, c0);
+        smlal(&accum8, b, c8);
 
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -793,10 +783,8 @@ void gf_mulw (
     }
     {
         n0 = a[i]; n8 = a[i+8];
-        smlal(&accum0, bhi, c0);
-        smlal(&accum8, bhi, c8);
-        smlal(&accum0, blo, n0);
-        smlal(&accum8, blo, n8);
+        smlal(&accum0, b, n0);
+        smlal(&accum8, b, n8);
 
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -804,10 +792,8 @@ void gf_mulw (
     }
     {
         c0 = a[i]; c8 = a[i+8];
-        smlal(&accum0, bhi, n0);
-        smlal(&accum8, bhi, n8);
-        smlal(&accum0, blo, c0);
-        smlal(&accum8, blo, c8);
+        smlal(&accum0, b, c0);
+        smlal(&accum8, b, c8);
         
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -815,10 +801,8 @@ void gf_mulw (
     }
     {
         n0 = a[i]; n8 = a[i+8];
-        smlal(&accum0, bhi, c0);
-        smlal(&accum8, bhi, c8);
-        smlal(&accum0, blo, n0);
-        smlal(&accum8, blo, n8);
+        smlal(&accum0, b, n0);
+        smlal(&accum8, b, n8);
 
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
diff --git a/src/p448/arch_neon/f_impl.c b/src/p448/arch_neon/f_impl.c
index 2319c7b..ba0e303 100644
--- a/src/p448/arch_neon/f_impl.c
+++ b/src/p448/arch_neon/f_impl.c
@@ -549,20 +549,18 @@ void gf_sqr (gf_s *__restrict__ cs, const gf bs) {
     );
 }
 
-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { 
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { 
     uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1};
+    assert(b<(1<<28));
     
     uint64x2_t accum;
     const uint32x2_t *va = (const uint32x2_t *) as->limb;
     uint32x2_t *vo = (uint32x2_t *) cs->limb;
     uint32x2_t vc, vn;
-    uint32x2_t vb = {b & ((1<<28)-1), b>>28};
-    
-    accum = vmull_lane_u32(va[7], vb, 1);
-    accum = xx_vaddup_u64(vrev128_u64(accum));
+    uint32x2_t vb = {b, 0};
     
     vc = va[0];
-    accum = vmlal_lane_u32(accum, vc, vb, 0);
+    accum = vmull_lane_u32(vc, vb, 0);
     vo[0] = vmovn_u64(accum) & vmask;
     accum = vshrq_n_u64(accum,28);
     
@@ -579,7 +577,6 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
     int i;
     for (i=1; i<8; i++) {
         vn = va[i];
-        accum = vmlal_lane_u32(accum, vc, vb, 1);
         accum = vmlal_lane_u32(accum, vn, vb, 0);
         vo[i] = vmovn_u64(accum) & vmask;
         accum = vshrq_n_u64(accum,28);
diff --git a/src/p448/arch_ref64/f_impl.c b/src/p448/arch_ref64/f_impl.c
index 22162aa..4273d3d 100644
--- a/src/p448/arch_ref64/f_impl.c
+++ b/src/p448/arch_ref64/f_impl.c
@@ -165,7 +165,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[1] += ((uint64_t)(accum1));
 }
 
-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint64_t *a = as->limb;
     uint64_t *c = cs->limb;
 
diff --git a/src/p448/arch_x86_64/f_impl.c b/src/p448/arch_x86_64/f_impl.c
index 943e80b..4989cb5 100644
--- a/src/p448/arch_x86_64/f_impl.c
+++ b/src/p448/arch_x86_64/f_impl.c
@@ -139,7 +139,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[0] += ((uint64_t)(accum1));
 }
 
-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint64_t *a = as->limb;
     uint64_t *c = cs->limb;
 
diff --git a/src/per_field.c b/src/per_field.c
index c76be14..c60b17f 100644
--- a/src/per_field.c
+++ b/src/per_field.c
@@ -1,6 +1,6 @@
 /**
  * @cond internal
- * @file decaf_crypto.c
+ * @file per_field.c
  * @copyright
  *   Copyright (c) 2015-2016 Cryptography Research, Inc.  \n
  *   Released under the MIT License.  See LICENSE.txt for license information.