From 704b4249827641a29565f68c851feb22db37accc Mon Sep 17 00:00:00 2001
From: Mike Hamburg <mike@shiftleft.org>
Date: Tue, 24 Nov 2015 12:00:00 -0800
Subject: [PATCH] dual scalarmul because of TLS discussion

---
 src/decaf_fast.c                       | 100 +++++++++++++++++++++++++
 src/include/constant_time.h            |  67 +++++++++++++++++
 src/public_include/decaf/decaf_255.h   |  22 ++++++
 src/public_include/decaf/decaf_255.hxx |   7 ++
 src/public_include/decaf/decaf_448.h   |  22 ++++++
 src/public_include/decaf/decaf_448.hxx |   7 ++
 test/bench_decaf.cxx                   |   1 +
 test/test_decaf.cxx                    |   9 ++-
 8 files changed, 234 insertions(+), 1 deletion(-)

diff --git a/src/decaf_fast.c b/src/decaf_fast.c
index bf85a3d..2025ca3 100644
--- a/src/decaf_fast.c
+++ b/src/decaf_fast.c
@@ -1064,6 +1064,106 @@ void API_NS(point_double_scalarmul) (
     decaf_bzero(tmp,sizeof(tmp));
 }
 
+void API_NS(point_dual_scalarmul) (
+    point_t a1,
+    point_t a2,
+    const point_t b,
+    const scalar_t scalar1,
+    const scalar_t scalar2
+) {
+    const int WINDOW = DECAF_WINDOW_BITS,
+        WINDOW_MASK = (1<<WINDOW)-1,
+        WINDOW_T_MASK = WINDOW_MASK >> 1,
+        NTABLE = 1<<(WINDOW-1);
+    /* Recode both scalars: add point_scalarmul_adjustment, then sc_halve against sc_p. */
+    scalar_t scalar1x, scalar2x;
+    API_NS(scalar_add)(scalar1x, scalar1, API_NS(point_scalarmul_adjustment));
+    sc_halve(scalar1x,scalar1x,sc_p);
+    API_NS(scalar_add)(scalar2x, scalar2, API_NS(point_scalarmul_adjustment));
+    sc_halve(scalar2x,scalar2x,sc_p);
+    
+    /* One accumulator table per scalar: entry k sums working (conditionally negated) over windows with digit k. */
+    point_t multiples1[NTABLE], multiples2[NTABLE], working, tmp;
+    pniels_t pn;
+    
+    API_NS(point_copy)(working, b);
+
+    /* Initialize every table entry to the identity point. */
+    int i,j;
+    
+    for (i=0; i<NTABLE; i++) {
+        API_NS(point_copy)(multiples1[i], API_NS(point_identity));
+        API_NS(point_copy)(multiples2[i], API_NS(point_identity));
+    }
+
+    for (i=0; i<SCALAR_BITS; i+=WINDOW) {   
+        if (i) { /* not the first window: double working WINDOW times */
+            for (j=0; j<WINDOW-1; j++)
+                point_double_internal(working, working, -1);
+            point_double_internal(working, working, 0);
+        }
+        
+        /* Fetch the next WINDOW bits of each recoded scalar */
+        decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
+                     bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS);
+        if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
+            bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
+            bits2 ^= scalar2x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
+        } /* (window may run past this limb: merge in bits from the next one) */
+        bits1 &= WINDOW_MASK;
+        bits2 &= WINDOW_MASK;
+        decaf_word_t inv1 = (bits1>>(WINDOW-1))-1; /* 0 if the window's top bit is set, else 0-1 */
+        decaf_word_t inv2 = (bits2>>(WINDOW-1))-1; /* 0 if the window's top bit is set, else 0-1 */
+        bits1 ^= inv1;
+        bits2 ^= inv2;
+        
+        pt_to_pniels(pn, working);
+
+        constant_time_lookup_xx(tmp, multiples1, sizeof(tmp), NTABLE, bits1 & WINDOW_T_MASK);
+        cond_neg_niels(pn->n, inv1);
+        /* Direct-index form of this lookup/add/insert: add_pniels_to_pt(multiples1[bits1 & WINDOW_T_MASK], pn, 0); */
+        add_pniels_to_pt(tmp, pn, 0);
+        constant_time_insert(multiples1, tmp, sizeof(tmp), NTABLE, bits1 & WINDOW_T_MASK);
+        
+        /* Same for scalar 2; pn already had inv1 applied, so inv1^inv2 is passed this time. */
+        constant_time_lookup_xx(tmp, multiples2, sizeof(tmp), NTABLE, bits2 & WINDOW_T_MASK);
+        cond_neg_niels(pn->n, inv1^inv2);
+        /* Direct-index form of this lookup/add/insert: add_pniels_to_pt(multiples2[bits2 & WINDOW_T_MASK], pn, 0); */
+        add_pniels_to_pt(tmp, pn, 0);
+        constant_time_insert(multiples2, tmp, sizeof(tmp), NTABLE, bits2 & WINDOW_T_MASK);
+    }
+    /* Fold each table into its output: the steps below add up (2k+1)*multiples[k] over all k. */
+    if (NTABLE > 1) {
+        API_NS(point_copy)(working, multiples1[NTABLE-1]);
+        API_NS(point_copy)(tmp    , multiples2[NTABLE-1]);
+    
+        for (i=NTABLE-1; i>1; i--) {
+            API_NS(point_add)(multiples1[i-1], multiples1[i-1], multiples1[i]);
+            API_NS(point_add)(multiples2[i-1], multiples2[i-1], multiples2[i]);
+            API_NS(point_add)(working, working, multiples1[i-1]);
+            API_NS(point_add)(tmp,     tmp,     multiples2[i-1]);
+        }
+    
+        API_NS(point_add)(multiples1[0], multiples1[0], multiples1[1]);
+        API_NS(point_add)(multiples2[0], multiples2[0], multiples2[1]);
+        point_double_internal(working, working, 0);
+        point_double_internal(tmp,         tmp, 0);
+        API_NS(point_add)(a1, working, multiples1[0]);
+        API_NS(point_add)(a2, tmp,     multiples2[0]);
+    } else {
+        API_NS(point_copy)(a1, multiples1[0]);
+        API_NS(point_copy)(a2, multiples2[0]);
+    }
+    /* Zero the scalar copies and every point temporary before returning. */
+    decaf_bzero(scalar1x,sizeof(scalar1x));
+    decaf_bzero(scalar2x,sizeof(scalar2x));
+    decaf_bzero(pn,sizeof(pn));
+    decaf_bzero(multiples1,sizeof(multiples1));
+    decaf_bzero(multiples2,sizeof(multiples2));
+    decaf_bzero(tmp,sizeof(tmp));
+    decaf_bzero(working,sizeof(working));
+}
+
 decaf_bool_t API_NS(point_eq) ( const point_t p, const point_t q ) {
     /* equality mod 2-torsion compares x/y */
     gf a, b;
diff --git a/src/include/constant_time.h b/src/include/constant_time.h
index 170b4d9..2cc0ee4 100644
--- a/src/include/constant_time.h
+++ b/src/include/constant_time.h
@@ -184,6 +184,73 @@ constant_time_lookup (
     }
 }
 
+/**
+ * @brief Constant-time equivalent of memcpy(table + elem_bytes*idx, in, elem_bytes);
+ *
+ * All n_table entries are rewritten; only entry idx takes the bytes of in.  The table
+ * must be at least as aligned as elem_bytes.  The input must be word aligned, and if the
+ * output size is vector aligned it must also be vector aligned.  Table and input must not alias.
+ * NOTE(review): the first inner loop bound appears to wrap if elem_bytes < sizeof(big_register_t).
+ */
+static __inline__ void
+__attribute__((unused,always_inline))
+constant_time_insert (
+    void *__restrict__ table_,
+    const void *in_,
+    word_t elem_bytes,
+    word_t n_table,
+    word_t idx
+) {
+    big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx);
+    
+    /* Can't do pointer arithmetic on void* */
+    const unsigned char *in = (const unsigned char *)in_;
+    unsigned char *table = (unsigned char *)table_;
+    word_t j,k;
+    
+    for (j=0; j<n_table; j++, big_i-=big_one) {        
+        big_register_t br_mask = br_is_zero(big_i); /* presumably all-ones only when j == idx */
+        for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) {
+            if (elem_bytes % sizeof(big_register_t)) {
+                /* unaligned */
+                ((unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned
+                    = ( ((unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned & ~br_mask )
+                    | ( ((const unaligned_br_t *)(in+k))->unaligned & br_mask );
+            } else {
+                /* aligned */
+                *(big_register_t*)(&table[k+j*elem_bytes])
+                    = ( *(big_register_t*)(&table[k+j*elem_bytes]) & ~br_mask )
+                    | ( *(const big_register_t *)(in+k) & br_mask );
+            }
+        }
+        /* Remainder past the last whole big_register_t: whole words, then single bytes. */
+        word_t mask = word_is_zero(idx^j); /* word-width selector for the same j == idx test */
+        if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
+            for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) {
+                if (elem_bytes % sizeof(word_t)) {
+                    /* output unaligned, input aligned */
+                    ((unaligned_word_t*)(&table[k+j*elem_bytes]))->unaligned
+                        = ( ((unaligned_word_t*)(&table[k+j*elem_bytes]))->unaligned & ~mask )
+                        | ( *(const word_t *)(in+k) & mask );
+                } else {
+                    /* aligned */
+                    *(word_t*)(&table[k+j*elem_bytes])
+                        = ( *(word_t*)(&table[k+j*elem_bytes]) & ~mask )
+                        | ( *(const word_t *)(in+k) & mask );
+                }
+            }
+        }
+        
+        if (elem_bytes % sizeof(word_t)) {
+            for (; k<elem_bytes; k+=1) {
+                table[k+j*elem_bytes]
+                    = ( table[k+j*elem_bytes] & ~mask )
+                    | ( in[k] & mask );
+            }
+        }
+    }
+}
+
 /**
  * @brief Constant-time a = b&mask.
  *
diff --git a/src/public_include/decaf/decaf_255.h b/src/public_include/decaf/decaf_255.h
index 2853d8b..1397031 100644
--- a/src/public_include/decaf/decaf_255.h
+++ b/src/public_include/decaf/decaf_255.h
@@ -391,6 +391,28 @@ void decaf_255_point_double_scalarmul (
     const decaf_255_point_t base2,
     const decaf_255_scalar_t scalar2
 ) API_VIS NONNULL5 NOINLINE;
+    
+/**
+ * @brief Multiply one base point by two scalars:
+ * a1 = scalar1 * b
+ * a2 = scalar2 * b
+ *
+ * Equivalent to two calls to decaf_255_point_scalarmul, but may be
+ * faster.
+ *
+ * @param [out] a1 The first multiple
+ * @param [out] a2 The second multiple
+ * @param [in] b A point to be scaled.
+ * @param [in] scalar1 A first scalar to multiply by.
+ * @param [in] scalar2 A second scalar to multiply by.
+ */
+void decaf_255_point_dual_scalarmul (
+    decaf_255_point_t a1,
+    decaf_255_point_t a2,
+    const decaf_255_point_t b,
+    const decaf_255_scalar_t scalar1,
+    const decaf_255_scalar_t scalar2
+) API_VIS NONNULL5 NOINLINE;
 
 /**
  * @brief Multiply two base points by two scalars:
diff --git a/src/public_include/decaf/decaf_255.hxx b/src/public_include/decaf/decaf_255.hxx
index d2a4177..1229fe7 100644
--- a/src/public_include/decaf/decaf_255.hxx
+++ b/src/public_include/decaf/decaf_255.hxx
@@ -363,6 +363,13 @@ public:
         Point p((NOINIT())); decaf_255_point_double_scalarmul(p.p,q.p,qs.s,r.p,rs.s); return p;
     }
     
+    /** @brief Dual-scalar multiply: sets q1 = this*r1 and q2 = this*r2; may be faster than two multiplies. */
+    inline void dual_scalarmul (
+        Point &q1, Point &q2, const Scalar &r1, const Scalar &r2
+    ) const NOEXCEPT {
+        decaf_255_point_dual_scalarmul(q1.p,q2.p,p,r1.s,r2.s);
+    }
+    
     /**
      * @brief Double-scalar multiply, equivalent to q*qs + r*rs but faster.
      * For those who like their scalars before the point.
diff --git a/src/public_include/decaf/decaf_448.h b/src/public_include/decaf/decaf_448.h
index ba64bf7..98a2ad7 100644
--- a/src/public_include/decaf/decaf_448.h
+++ b/src/public_include/decaf/decaf_448.h
@@ -394,6 +394,28 @@ void decaf_448_point_double_scalarmul (
     const decaf_448_point_t base2,
     const decaf_448_scalar_t scalar2
 ) API_VIS NONNULL5 NOINLINE;
+    
+/**
+ * @brief Multiply one base point by two scalars:
+ * a1 = scalar1 * b
+ * a2 = scalar2 * b
+ *
+ * Equivalent to two calls to decaf_448_point_scalarmul, but may be
+ * faster.
+ *
+ * @param [out] a1 The first multiple
+ * @param [out] a2 The second multiple
+ * @param [in] b A point to be scaled.
+ * @param [in] scalar1 A first scalar to multiply by.
+ * @param [in] scalar2 A second scalar to multiply by.
+ */
+void decaf_448_point_dual_scalarmul (
+   decaf_448_point_t a1,
+   decaf_448_point_t a2,
+   const decaf_448_point_t b,
+   const decaf_448_scalar_t scalar1,
+   const decaf_448_scalar_t scalar2
+) API_VIS NONNULL5 NOINLINE;
 
 /**
  * @brief Multiply two base points by two scalars:
diff --git a/src/public_include/decaf/decaf_448.hxx b/src/public_include/decaf/decaf_448.hxx
index 85250af..3214565 100644
--- a/src/public_include/decaf/decaf_448.hxx
+++ b/src/public_include/decaf/decaf_448.hxx
@@ -374,6 +374,13 @@ public:
     ) NOEXCEPT {
         Point p((NOINIT())); decaf_448_point_double_scalarmul(p.p,q.p,qs.s,r.p,rs.s); return p;
     }
+
+    /** @brief Dual-scalar multiply: sets q1 = this*r1 and q2 = this*r2; may be faster than two multiplies. */
+    inline void dual_scalarmul (
+        Point &q1, Point &q2, const Scalar &r1, const Scalar &r2
+    ) const NOEXCEPT {
+        decaf_448_point_dual_scalarmul(q1.p,q2.p,p,r1.s,r2.s);
+    }
     
     /**
      * @brief Double-scalar multiply, equivalent to q*qs + r*rs but faster.
diff --git a/test/bench_decaf.cxx b/test/bench_decaf.cxx
index 861902f..43fb08e 100644
--- a/test/bench_decaf.cxx
+++ b/test/bench_decaf.cxx
@@ -358,6 +358,7 @@ static void micro() {
     for (Benchmark b("Point unhash uniform"); b.iter(); ) { ignore_result(p.invert_elligator(ep2,0)); }
     for (Benchmark b("Point steg"); b.iter(); ) { p.steg_encode(rng); }
     for (Benchmark b("Point double scalarmul"); b.iter(); ) { Point::double_scalarmul(p,s,q,t); }
+    for (Benchmark b("Point dual scalarmul"); b.iter(); ) { p.dual_scalarmul(p,q,s,t); }
     for (Benchmark b("Point precmp scalarmul"); b.iter(); ) { pBase * s; }
     for (Benchmark b("Point double scalarmul_v"); b.iter(); ) {
         s = Scalar(rng);
diff --git a/test/test_decaf.cxx b/test/test_decaf.cxx
index c603405..5c30794 100644
--- a/test/test_decaf.cxx
+++ b/test/test_decaf.cxx
@@ -286,6 +286,8 @@ static void test_ec() {
         Point p(rng);
         Point q(rng);
         
+        Point d1, d2;
+        
         SecureBuffer buffer(2*Point::HASH_BYTES);
         rng.read(buffer);
         Point r = Point::from_hash(buffer);
@@ -305,7 +307,12 @@ static void test_ec() {
         if (i%10) continue;
         point_check(test,p,q,r,x,0,x*(p+q),x*p+x*q,"distr mul");
         point_check(test,p,q,r,x,y,(x*y)*p,x*(y*p),"assoc mul");
-        point_check(test,p,q,r,x,y,x*p+y*q,Point::double_scalarmul(x,p,y,q),"ds mul");
+        point_check(test,p,q,r,x,y,x*p+y*q,Point::double_scalarmul(x,p,y,q),"double mul");
+        
+        p.dual_scalarmul(d1,d2,x,y);
+        point_check(test,p,q,r,x,y,x*p,d1,"dual mul 1");
+        point_check(test,p,q,r,x,y,y*p,d2,"dual mul 2");
+        
         point_check(test,base,q,r,x,y,x*base+y*q,q.non_secret_combo_with_base(y,x),"ds vt mul");
         point_check(test,p,q,r,x,0,Precomputed(p)*x,p*x,"precomp mul");
         point_check(test,p,q,r,0,0,r,