From f5b98269598bcbea45aae7561944c7277b605976 Mon Sep 17 00:00:00 2001
From: Mike Hamburg <mike@shiftleft.org>
Date: Fri, 6 Mar 2015 16:00:31 -0800
Subject: [PATCH] precomputed scalarmul almost ported, but doesnt work yet

---
 src/decaf_fast.c | 213 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 191 insertions(+), 22 deletions(-)

diff --git a/src/decaf_fast.c b/src/decaf_fast.c
index 45a1e61..36c676b 100644
--- a/src/decaf_fast.c
+++ b/src/decaf_fast.c
@@ -89,12 +89,13 @@ const decaf_448_point_t decaf_448_point_base = {{
       LIMB(0x723a55709a3983),LIMB(0xe1c0107a823dd4) }
 }};
 
-struct decaf_448_precomputed_s {
-    decaf_448_point_t p[1];
-};
+/* Projective Niels coordinates */
+typedef struct { gf a, b, c; } niels_s, niels_t[1];
+typedef struct { niels_t n; gf z; } pniels_s, pniels_t[1];
+struct decaf_448_precomputed_s { niels_t table [5<<4]; /* MAGIC */ };
 
-const struct decaf_448_precomputed_s *decaf_448_precomputed_base =
-    (const struct decaf_448_precomputed_s *)decaf_448_point_base;
+const struct decaf_448_precomputed_s decaf_448_precomputed_base_s,
+    *decaf_448_precomputed_base = &decaf_448_precomputed_base_s;
 
 const size_t sizeof_decaf_448_precomputed_s = sizeof(struct decaf_448_precomputed_s);
 const size_t alignof_decaf_448_precomputed_s = 32;
@@ -704,16 +705,13 @@ void decaf_448_point_scalarmul_xxx (
     decaf_448_point_sub(a,w,tmp);
 }
 
-/* Projective Niels coordinates */
-typedef struct { gf a, b, c; } niels_s, niels_t[1];
-typedef struct { niels_t n; gf z; } pniels_s, pniels_t[1];
-
-static void cond_neg_pniels (
-    pniels_t b,
+/* Operations on [p]niels */
+static void cond_neg_niels (
+    niels_t n,
     decaf_bool_t neg
 ) {
-    cond_swap(b->n->a, b->n->b, neg);
-    cond_neg(b->n->c, neg);
+    cond_swap(n->a, n->b, neg);
+    cond_neg(n->c, neg);
 }
 
 static void pt_to_pniels (
@@ -739,6 +737,16 @@ static void pniels_to_pt (
     gf_sqr ( e->z, d->z );
 }
 
+static void niels_to_pt (
+    decaf_448_point_t e,
+    const niels_t n
+) {
+    gf_add ( e->y, n->b, n->a );
+    gf_sub ( e->x, n->b, n->a );
+    gf_mul ( e->t, e->y, e->x );
+    gf_cpy ( e->z, ONE );
+}
+
 static void add_niels_to_pt (
     decaf_448_point_t d,
     const niels_t e,
@@ -814,7 +822,7 @@ void decaf_448_point_scalarmul (
     bits ^= inv;
     
     constant_time_lookup(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_T_MASK);
-    cond_neg_pniels(pn, inv);
+    cond_neg_niels(pn->n, inv);
     pniels_to_pt(tmp, pn);
 
     for (i-=WINDOW; i>=0; i-=WINDOW) {
@@ -837,7 +845,7 @@ void decaf_448_point_scalarmul (
     
         /* Add in from table.  Compute t only on last iteration. */
         constant_time_lookup(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_T_MASK);
-        cond_neg_pniels(pn, inv);
+        cond_neg_niels(pn->n, inv);
         add_pniels_to_pt(tmp, pn, i ? -1 : 0);
     }
     
@@ -961,19 +969,180 @@ decaf_bool_t decaf_448_point_valid (
     return out;
 }
 
-void decaf_448_precompute (
-    decaf_448_precomputed_s *a,
-    const decaf_448_point_t b
+// void decaf_448_precompute (
+//     decaf_448_precomputed_s *a,
+//     const decaf_448_point_t b
+// ) {
+//     decaf_448_point_copy(a->p[0],b);
+// }
+
+// void decaf_448_precomputed_scalarmul (
+//     decaf_448_point_t a,
+//     const decaf_448_precomputed_s *b,
+//     const decaf_448_scalar_t scalar
+// ) {
+//     decaf_448_point_scalarmul(a,b->p[0],scalar);
+// }
+
+void gf_batch_invert (
+    gf *__restrict__ out,
+    const gf *in,
+    unsigned int n
 ) {
-    decaf_448_point_copy(a->p[0],b);
+    // if (n==0) {
+    //     return;
+    // } else if (n==1) {
+    //     field_inverse(out[0],in[0]);
+    //     return;
+    // }
+    assert(n>1);
+  
+    gf_cpy(out[1], in[0]);
+    int i;
+    for (i=1; i<(int) (n-1); i++) {
+        gf_mul(out[i+1], out[i], in[i]);
+    }
+    gf_mul(out[0], out[n-1], in[n-1]);
+
+    gf t1, t2;
+    gf_isqrt(t1, out[0]);
+    gf_sqr(t2, t1);
+    gf_sqr(t1, t2);
+    gf_mul(t2, t1, out[0]);
+    gf_cpy(out[0], t2);
+
+    for (i=n-1; i>0; i--) {
+        gf_mul(t1, out[i], out[0]);
+        gf_cpy(out[i], t1);
+        gf_mul(t1, out[0], in[i]);
+        gf_cpy(out[0], t1);
+    }
+}
+
+void
+decaf_448_precompute (
+    struct decaf_448_precomputed_s *table,
+    const decaf_448_point_t base
+) { 
+    const int n = 5, t = 5, s = 18; // TODO MAGIC
+    assert(n*t*s >= DECAF_448_SCALAR_BITS);
+  
+    decaf_448_point_t working, start, doubles[t-1];
+    decaf_448_point_copy(working, base);
+    pniels_t pn_tmp;
+  
+    gf zs[n<<(t-1)], zis[n<<(t-1)];
+  
+    unsigned int i,j,k;
+    
+    /* Compute n tables */
+    for (i=0; i<n; i++) {
+
+        /* Doubling phase */
+        for (j=0; j<t; j++) {
+            if (j) decaf_448_point_add(start, start, working);
+            else decaf_448_point_copy(start, working);
+
+            if (j==t-1 && i==n-1) break;
+
+            decaf_448_point_double(working, working);
+            if (j<t-1) decaf_448_point_copy(doubles[j], working);
+
+            for (k=0; k<s-1; k++)
+                decaf_448_point_double_internal(working, working, k<s-2);
+        }
+
+        /* Gray-code phase */
+        for (j=0;; j++) {
+            int gray = j ^ (j>>1);
+            int idx = (((i+1)<<(t-1))-1) ^ gray;
+
+            pt_to_pniels(pn_tmp, start);
+            memcpy(table->table[idx], pn_tmp->n, sizeof(pn_tmp->n));
+            gf_cpy(zs[idx], pn_tmp->z);
+			
+            if (j >= (1u<<(t-1)) - 1) break;
+            int delta = (j+1) ^ ((j+1)>>1) ^ gray;
+
+            for (k=0; delta>1; k++)
+                delta >>=1;
+            
+            if (gray & (1<<k)) {
+                decaf_448_point_add(start, start, doubles[k]);
+            } else {
+                decaf_448_point_sub(start, start, doubles[k]);
+            }
+        }
+    }
+    
+    gf_batch_invert(zis, zs, n<<(t-1));
+
+    gf product;
+    for (i=0; i<n<<(t-1); i++) {
+        gf_mul(product, table->table[i]->a, zis[i]);
+        gf_canon(product);
+        gf_cpy(table->table[i]->a, product);
+        
+        gf_mul(product, table->table[i]->b, zis[i]);
+        gf_canon(product);
+        gf_cpy(table->table[i]->b, product);
+        
+        gf_mul(product, table->table[i]->c, zis[i]);
+        gf_canon(product);
+        gf_cpy(table->table[i]->c, product);
+    }
 }
 
 void decaf_448_precomputed_scalarmul (
-    decaf_448_point_t a,
-    const decaf_448_precomputed_s *b,
+    decaf_448_point_t out,
+    const struct decaf_448_precomputed_s *table,
     const decaf_448_scalar_t scalar
 ) {
-    decaf_448_point_scalarmul(a,b->p[0],scalar);
+    unsigned int i,j,k;
+    const int n = 5, t = 5, s = 18, nbits = 450; // TODO MAGIC
+    
+    unsigned int scalar2_words = (nbits + WBITS - 1)/WBITS;
+    if (scalar2_words < SCALAR_WORDS) scalar2_words = SCALAR_WORDS;
+    
+    decaf_448_scalar_t scalar2, onehalf = {{{0}}}, two = {{{2}}}, arrr;
+    onehalf->limb[SCALAR_WORDS-1] = 1ull<<(WBITS-1);
+
+    /* FIXME PERF MAGIC precompute 2^449-1/2 mod q.  Could instead use 2^446-1/2 mod q though. */
+    decaf_448_montmul(arrr,two,decaf_448_scalar_r2,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR);
+
+    /* PERF dedicated halve */
+    decaf_448_scalar_sub(scalar2, scalar, decaf_448_scalar_one);
+    decaf_448_montmul(scalar2,scalar2,onehalf,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR);
+    decaf_448_scalar_add(scalar2, scalar2, arrr);
+    
+    niels_t ni;
+    
+    for (i=0; i<s; i++) {
+        if (i) decaf_448_point_double(out,out);
+        
+        for (j=0; j<n; j++) {
+            int tab = 0;
+            
+            for (k=0; k<t; k++) {
+                unsigned int bit = (s-1-i) + k*s + j*(s*t);
+                if (bit < scalar2_words * WBITS) {
+                    tab |= (scalar2->limb[bit/WBITS] >> (bit%WBITS) & 1) << k;
+                }
+            }
+            
+            mask_t invert = (tab>>(t-1))-1;
+            tab ^= invert;
+            tab &= (1<<(t-1)) - 1;
+            
+            constant_time_lookup(ni, &table->table[j<<(t-1)], sizeof(ni), 1<<(t-1), tab);
+            cond_neg_niels(ni, invert);
+            if (i||j) {
+                add_niels_to_pt(out, ni, (j==n-1 && i<s-1));
+            } else {
+                niels_to_pt(out, ni);
+            }
+        }
+    }
 }
 
 decaf_bool_t decaf_448_direct_scalarmul (