decaf scalarmul signed w=2 working. Surprisingly only twice as slow as Goldilocks

10 years ago · b3f7d97977
--- a/include/decaf.h
+++ b/include/decaf.h
@@ -5,7 +5,17 @@
 /**
 * @file decaf.h
 * @author Mike Hamburg
 * @brief Decaf high-level functions.
 * @brief A group of prime order p.
 *
 * The Decaf library implements cryptographic operations on an elliptic curve
 * group of prime order p.  It accomplishes this by using a twisted Edwards
 * curve (isogenous to Ed448-Goldilocks) and wiping out the cofactor.
 *
 * The formulas are all complete and have no special cases, except that
 * decaf_decode can fail because not every sequence of bytes is a valid group
 * element.
 *
 * The formulas contain no data-dependent branches, timing or memory accesses.
 */
 #ifndef __DECAF_H__
 #define __DECAF_H__ 1
@@ -21,7 +31,7 @@ typedef struct decaf_point_s {

 static const decaf_bool_t DECAF_SUCCESS = -(decaf_bool_t)1, DECAF_FAILURE = 0;

 const decaf_point_t decaf_identity_point;
 /**
  * The identity point.
  *
  * Declared extern: without it, every C translation unit that includes this
  * header gets its own tentative definition of the object, and C++ rejects
  * an uninitialized const object outright.  The object itself is defined
  * (with its initializer) in decaf.c.
  */
 extern const decaf_point_t decaf_identity;

 #ifdef __cplusplus
 extern "C" {
@@ -49,6 +59,11 @@ void decaf_add (
    const decaf_point_t c
 ) API_VIS NONNULL3;
    
 void decaf_copy (
    decaf_point_t a,
    const decaf_point_t b
 ) API_VIS NONNULL2;
    
 decaf_bool_t decaf_eq (
    const decaf_point_t a,
    const decaf_point_t b
@@ -66,6 +81,13 @@ void decaf_add_sub (
    const decaf_point_t c,
    decaf_bool_t do_sub
 ) API_VIS NONNULL3;

 void decaf_scalarmul (
    decaf_point_t a,
    const decaf_point_t b,
    const decaf_word_t *scalar,
    unsigned int scalar_words
 ) API_VIS NONNULL3;
    
 #undef API_VIS
 #undef WARN_UNUSED
--- a/src/decaf.c
+++ b/src/decaf.c
@@ -16,13 +16,13 @@ typedef __int128_t sdword_t;
 #define WBITS 64
 #define LBITS 56

 #define siv static inline void
 #define sv static void
 #define NLIMBS 8

 typedef word_t gf[NLIMBS];
 static const gf ZERO = {0}, ONE = {1}, TWO = {2};

 static const word_t LMASK = (1ull<<LBITS)-1;
 #define LMASK ((1ull<<LBITS)-1)
 static const gf P = { LMASK, LMASK, LMASK, LMASK, LMASK-1, LMASK, LMASK, LMASK };
 #define FOR_LIMB(i,op) { unsigned int i=0; \
   op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; \
@@ -30,9 +30,11 @@ static const gf P = { LMASK, LMASK, LMASK, LMASK, LMASK-1, LMASK, LMASK, LMASK }

 static const int EDWARDS_D = -39081;

 siv gf_cpy(gf x, const gf y) { FOR_LIMB(i, x[i] = y[i]); }
 /** Copy x = y */
 sv gf_cpy(gf x, const gf y) { FOR_LIMB(i, x[i] = y[i]); }

 static inline void __attribute__((always_inline)) gf_mul_inline (gf c, const gf a, const gf b) {
 /** Mostly-unoptimized multiply (PERF), but at least it's unrolled. */
 sv gf_mul (gf c, const gf a, const gf b) {
    gf aa;
    gf_cpy(aa,a);
    
@@ -52,13 +54,15 @@ static inline void __attribute__((always_inline)) gf_mul_inline (gf c, const gf
    FOR_LIMB(j, c[j] = accum[j] );
 }

 static void gf_mul( gf a, const gf b, const gf c ) { gf_mul_inline(a,b,c); }
 static void gf_sqr( gf a, const gf b ) { gf_mul_inline(a,b,b); }
 /** No dedicated square (PERF) */
 #define gf_sqr(c,a) gf_mul(c,a,a)

 static void gf_isqrt(gf y, const gf x) {
 /** Inverse square root using addition chain. */
 sv gf_isqrt(gf y, const gf x) {
    int i;
 #define STEP(s,m,n) gf_mul(s,m,c); gf_cpy(c,s); for (i=0;i<n;i++) gf_sqr(c,c);
    gf a, b, c;
    gf_sqr ( c,   x );
 #define STEP(s,m,n) {gf_mul(s,m,c); gf_cpy(c,s); int i; for (i=0;i<n;i++) gf_sqr(c,c);}
    STEP(b,x,1);
    STEP(b,x,3);
    STEP(a,b,3);
@@ -73,7 +77,8 @@ static void gf_isqrt(gf y, const gf x) {
    gf_mul(y,a,c);
 }

 siv gf_reduce(gf x) {
 /** Weak reduce mod p. */
 sv gf_reduce(gf x) {
    x[NLIMBS/2] += x[NLIMBS-1] >> LBITS;
    FOR_LIMB(j,{
        x[j] += x[(j-1)%NLIMBS] >> LBITS;
@@ -81,34 +86,32 @@ siv gf_reduce(gf x) {
    });
 }

 siv gf_add ( gf x, const gf y, const gf z ) {
 /**
  * Field addition: x = y + z, limb by limb.
  * Always finishes with a weak reduction, which is conservative. (PERF)
  */
 sv gf_add ( gf x, const gf y, const gf z ) {
    FOR_LIMB(k, x[k] = (y[k] + z[k]) );
    gf_reduce(x);
 }

 siv gf_sub ( gf x, const gf y, const gf z ) {
 /**
  * Field subtraction: x = y - z, limb by limb.
  * 2*P is added to every limb difference (a multiple of p, so the value
  * mod p is unchanged; presumably this keeps the limbs from underflowing).
  * Always finishes with a weak reduction, which is conservative. (PERF)
  */
 sv gf_sub ( gf x, const gf y, const gf z ) {
    FOR_LIMB(k, x[k] = (y[k] - z[k]) + 2*P[k] );
    gf_reduce(x);
 }

 siv gf_mlw(gf a, const gf b, word_t w) {
    if (w>0) {
        gf ww = {w};
        gf_mul_inline(a,b,ww);
    } else {
        gf ww = {-w};
        gf_mul_inline(a,b,ww);
        gf_sub(a,ZERO,a);
    }
 /** Constant time, x = is_z ? z : y */
 sv cond_sel(gf x, const gf y, const gf z, mask_t is_z) {
    FOR_LIMB(i, x[i] = (y[i] & ~is_z) | (z[i] & is_z) );
 }

 siv cond_neg(gf x, mask_t neg) {
 /** Constant time, if (neg) x=-x; */
 sv cond_neg(gf x, mask_t neg) {
    gf y;
    gf_sub(y,ZERO,x);
    FOR_LIMB(i, x[i] = (x[i] & ~neg) | (y[i] & neg) );
    cond_sel(x,x,y,neg);
 }

 siv cond_swap(gf x, gf y, mask_t swap) {
 /** Constant time, if (swap) (x,y) = (y,x); */
 sv cond_swap(gf x, gf y, mask_t swap) {
    FOR_LIMB(i, {
        word_t s = (x[i] ^ y[i]) & swap;
        x[i] ^= s;
@@ -116,7 +119,23 @@ siv cond_swap(gf x, gf y, mask_t swap) {
    });
 }

 static void gf_canon ( gf a ) {
 /**
  * Mul by signed int: a = b * w.
  *
  * Not constant-time WRT the sign of that int (the code branches on it).
  * Just uses a full mul (PERF).
  *
  * For w <= 0 the product b * |w| is computed and then negated by
  * subtracting it from ZERO.  The magnitude is formed in word_t rather than
  * int, so negating the most negative int does not overflow a signed type.
  */
 sv gf_mlw(gf a, const gf b, int w) {
    if (w>0) {
        gf ww = {(word_t)w};
        gf_mul(a,b,ww);
    } else {
        /* -(word_t)w equals |w| reduced mod 2^WBITS; no signed negation. */
        gf ww = {-(word_t)w};
        gf_mul(a,b,ww);
        gf_sub(a,ZERO,a);
    }
 }

 /** Canonicalize */
 sv gf_canon ( gf a ) {
    gf_reduce(a);

    /* subtract p with borrow */
@@ -138,53 +157,43 @@ static void gf_canon ( gf a ) {
    });
 }

 static inline word_t gf_eq(const gf a, const gf b) {
 /**
  * Compare a==b.
  *
  * Computes c = a - b, canonicalizes it, and ORs all limbs of c together, so
  * ret is zero exactly when every limb of the canonicalized difference is zero.
  *
  * @return An all-ones word when ret is zero, 0 otherwise (assumes dword_t
  * is an unsigned type at least 2*WBITS bits wide -- TODO confirm).
  */
 static word_t __attribute__((noinline)) gf_eq(const gf a, const gf b) {
    gf c;
    gf_sub(c,a,b);
    gf_canon(c);
    word_t ret=0;
    FOR_LIMB(i, ret |= c[i] );
    /* Hope the compiler is too dumb to optimize this, thus noinline */
    /* ret==0 wraps around in the wide type, so the bits above WBITS are all
     * ones; any nonzero ret leaves them zero. */
    return ((dword_t)ret - 1) >> WBITS;
 }

 static inline word_t hibit(const gf x) {
 /**
  * High bit of x, defined as the low bit of 2x mod p.
  * Returned as a mask: the negation of that bit (all ones if set, else 0).
  */
 static word_t hibit(const gf x) {
    gf twice;
    gf_add(twice,x,x);
    gf_canon(twice);
    word_t low = twice[0] & 1;
    return -low;
 }

 const decaf_point_t decaf_identity_point = {{{0},{1},{1},{0}}};

 siv add_sub_point (
    decaf_point_t p,
    const decaf_point_t q,
    const decaf_point_t r,
    mask_t sub
 /* a = use_c ? c : b */
 sv decaf_cond_sel (
    decaf_point_t a,
    const decaf_point_t b,
    const decaf_point_t c,
    mask_t use_c
 ) {
    gf a, b, c, d;
    gf_sub ( b, q->y, q->x );
    gf_sub ( c, r->y, r->x );
    gf_add ( d, r->y, r->x );
    cond_swap(c,d,sub);
    gf_mul ( a, c, b );
    gf_add ( b, q->y, q->x );
    gf_mul ( p->y, d, b );
    gf_mul ( b, r->t, q->t );
    gf_mlw ( p->x, b, 2-2*EDWARDS_D );
    gf_add ( b, a, p->y );
    gf_sub ( c, p->y, a );
    gf_mul ( a, q->z, r->z );
    gf_add ( a, a, a );
    gf_add ( p->y, a, p->x );
    gf_sub ( a, a, p->x );
    cond_swap(a,p->y,sub);
    gf_mul ( p->z, a, p->y );
    gf_mul ( p->x, p->y, c );
    gf_mul ( p->y, a, b );
    gf_mul ( p->t, b, c );
    cond_sel(a->x, b->x, c->x, use_c);
    cond_sel(a->y, b->y, c->y, use_c);
    cond_sel(a->z, b->z, c->z, use_c);
    cond_sel(a->t, b->t, c->t, use_c);
 }
    

 /* *** API begins here *** */    

 /** identity = (0,1) */
 const decaf_point_t decaf_identity = {{{0},{1},{1},{0}}};

 void decaf_encode( unsigned char ser[DECAF_SER_BYTES], const decaf_point_t p ) {
    gf a, b, c, d;
    gf_mlw ( a, p->y, 1-EDWARDS_D ); 
@@ -216,14 +225,11 @@ void decaf_encode( unsigned char ser[DECAF_SER_BYTES], const decaf_point_t p ) {
        }
    });
 }
    
 decaf_bool_t decaf_decode (
    decaf_point_t p,
    const unsigned char ser[DECAF_SER_BYTES],
    decaf_bool_t allow_identity
 ) {
    gf s, a, b, c, d, e;
    

 /**
 * Deserialize a field element, return TRUE if < p.
 */
 static decaf_bool_t gf_deser(gf s, const unsigned char ser[DECAF_SER_BYTES]) {
    // FIXME arch
    int j;
    FOR_LIMB(i, {
@@ -235,9 +241,17 @@ decaf_bool_t decaf_decode (
    });
    
    sdword_t accum = 0;
    FOR_LIMB(i, accum = (accum + P[i] - s[i]) >> WBITS );
    FOR_LIMB(i, accum = (accum + s[i] - P[i]) >> WBITS );
    return accum;
 }
    
    mask_t succ = ~accum;
 decaf_bool_t decaf_decode (
    decaf_point_t p,
    const unsigned char ser[DECAF_SER_BYTES],
    decaf_bool_t allow_identity
 ) {
    gf s, a, b, c, d, e;
    mask_t succ = gf_deser(s, ser);
    mask_t zero = gf_eq(s, ZERO);
    succ &= allow_identity | ~zero;
    succ &= ~hibit(s);
@@ -264,24 +278,92 @@ decaf_bool_t decaf_decode (
    return succ;
 }
    
 void decaf_add(decaf_point_t a, const decaf_point_t b, const decaf_point_t c) {
    add_sub_point(a,b,c,0);
 /**
  * Point addition or subtraction: p = do_sub ? q - r : q + r.
  *
  * @param p       Output point.
  * @param q       First operand.
  * @param r       Second operand.
  * @param do_sub  Mask handed to cond_swap; callers in this file pass 0
  *                (add) or all ones (subtract).
  *
  * Every read of q and r happens before the coordinate of p it could alias
  * is overwritten, and other code in this file calls this with p == q and
  * with q == r.  Keep the statement order intact.
  */
 void decaf_add_sub (
    decaf_point_t p,
    const decaf_point_t q,
    const decaf_point_t r,
    decaf_bool_t do_sub
 ) {
    /* Twisted Edwards formulas, complete when 4-torsion isn't involved */
    gf a, b, c, d;
    gf_sub ( b, q->y, q->x );
    gf_sub ( c, r->y, r->x );
    gf_add ( d, r->y, r->x );
    /* Subtracting: exchange (r.y - r.x) with (r.y + r.x), i.e. use -r.x. */
    cond_swap(c,d,do_sub);
    gf_mul ( a, c, b );
    gf_add ( b, q->y, q->x );
    gf_mul ( p->y, d, b );
    /* p->x = (2 - 2*EDWARDS_D) * q.t * r.t */
    gf_mul ( b, r->t, q->t );
    gf_mlw ( p->x, b, 2-2*EDWARDS_D );
    gf_add ( b, a, p->y );
    gf_sub ( c, p->y, a );
    /* a = 2 * q.z * r.z */
    gf_mul ( a, q->z, r->z );
    gf_add ( a, a, a );
    gf_add ( p->y, a, p->x );
    gf_sub ( a, a, p->x );
    /* Subtracting: exchange the sum and difference formed just above. */
    cond_swap(a,p->y,do_sub);
    /* Assemble the output coordinates from the four factors a, b, c, p->y. */
    gf_mul ( p->z, a, p->y );
    gf_mul ( p->x, p->y, c );
    gf_mul ( p->y, a, b );
    gf_mul ( p->t, b, c );
 }
    
 void decaf_sub(decaf_point_t a, const decaf_point_t b, const decaf_point_t c) {
    add_sub_point(a,b,c,-1);
    decaf_add_sub(a,b,c,-1);
 }
    
 void decaf_add_sub (
 /** a = b + c: the combined add/sub routine with the subtract mask cleared. */
 void decaf_add(decaf_point_t a, const decaf_point_t b, const decaf_point_t c) {
    const decaf_bool_t no_sub = 0;
    decaf_add_sub(a,b,c,no_sub);
 }

 /* No dedicated point double (PERF) */
 #define decaf_dbl(a,b) decaf_add(a,b,b)

 /** Copy point b into a, one coordinate (x, y, z, t) at a time. */
 void decaf_copy (
    decaf_point_t a,
    const decaf_point_t b
 ) {
    struct decaf_point_s *dst = a;
    const struct decaf_point_s *src = b;

    gf_cpy(dst->x, src->x);
    gf_cpy(dst->y, src->y);
    gf_cpy(dst->z, src->z);
    gf_cpy(dst->t, src->t);
 }

 void decaf_scalarmul (
    decaf_point_t a,
    const decaf_point_t b,
    const decaf_point_t c,
    decaf_bool_t do_sub
    const decaf_word_t *scalar,
    unsigned int scalar_words
 ) {
    add_sub_point(a,b,c,do_sub);
    if (scalar_words == 0) {
        decaf_copy(a,decaf_identity);
        return;
    }
    /* w=2 signed window uses about 1.5 adds per bit.
     * I figured a few extra lines was worth the 25% speedup.
     * NB: if adapting this function to scalarmul by a
     * possibly-odd number of unmasked bits, may need to mask.
     */
    decaf_point_t w,b3,tmp;
    decaf_dbl(w,b);
    /* b3 = b*3 */
    decaf_add(b3,w,b);
    int i;
    for (i=scalar_words*WBITS-2; i>0; i-=2) {
        decaf_word_t bits = scalar[i/WBITS]>>(i%WBITS);
        decaf_cond_sel(tmp,b,b3,((bits^(bits>>1))&1)-1);
        decaf_dbl(w,w);
        decaf_add_sub(w,w,tmp,((bits>>1)&1)-1);
        decaf_dbl(w,w);
    }
    decaf_add_sub(w,w,b,((scalar[0]>>1)&1)-1);
    /* low bit is special because of signed window */
    decaf_cond_sel(tmp,b,decaf_identity,-(scalar[0]&1));
    decaf_sub(a,w,tmp);
 }

 decaf_bool_t decaf_eq ( const decaf_point_t p, const decaf_point_t q ) {
    /* equality mod 2-torsion compares x/y */
    gf a, b;
    gf_mul ( a, p->y, q->x );
    gf_mul ( b, q->y, p->x );
--- a/src/ec_point.c
+++ b/src/ec_point.c
@@ -548,7 +548,7 @@ decaf_serialize_extensible (
    const extensible_a_t a
 ) {
    field_a_t L0, L1, L2, L3;
    field_mulw_scc ( L2, a->y, EDWARDS_D ); 
    field_mulw_scc_wr ( L2, a->y, EDWARDS_D ); 
    field_mul ( L3, L2, a->t ); 
    field_mul ( L2, L3, a->u ); 
    field_mul ( L0, a->x, a->z ); 
@@ -556,9 +556,9 @@ decaf_serialize_extensible (
    field_add ( L0, a->y, a->z ); 
    field_sub ( L1, a->z, a->y ); 
    field_mul ( L2, L1, L0 );
    field_mulw_scc ( L1, L2, 1-EDWARDS_D );
    field_mulw_scc_wr ( L1, L2, 1-EDWARDS_D );
    field_isr ( L0, L1 );
    field_mulw_scc ( L1, L0, 1-EDWARDS_D ); 
    field_mulw_scc_wr ( L1, L0, 1-EDWARDS_D ); 
    field_mul ( L2, L1, L0 );
    field_mul ( L0, L2, L3 );
    field_add ( L3, L1, L1 );        
--- a/test/bench.c
+++ b/test/bench.c
@@ -295,7 +295,7 @@ int main(int argc, char **argv) {
        decaf_add(Da,Db,Dc);
    }
    when = now() - when;
    printf("dec + dec :  %5.1fns\n", when * 1e9 / i);
    printf("dec + dec:   %5.1fns\n", when * 1e9 / i);
    
    convert_tw_extensible_to_tw_pniels(&pniels, &ext);
    when = now();
@@ -355,7 +355,14 @@ int main(int argc, char **argv) {
    }
    when = now() - when;
    printf("decafladder: %5.1fµs\n", when * 1e6 / i);
    
   
    when = now();
    for (i=0; i<nbase/10; i++) {
        decaf_scalarmul(Da,Db,sk,sizeof(sk)/sizeof(word_t));
    }
    when = now() - when;
    printf("decaf slow:  %5.1fµs\n", when * 1e6 / i);

    when = now();
    for (i=0; i<nbase/10; i++) {
        scalarmul(&ext,sk);
--- a/test/test_scalarmul.c
+++ b/test/test_scalarmul.c
@@ -3,6 +3,7 @@
 #include <stdio.h>

 #include "scalarmul.h"
 #include "decaf.h"
 #include "ec_point.h"
 #include "field.h"
 #include "crandom.h"
@@ -110,21 +111,29 @@ single_scalarmul_compatibility_test (
        scalarmul_vt(&work, scalar, nbits);
        untwist_and_double_and_serialize(vt, &work);
        
        decaf_point_t ed2;
    	tw_extended_a_t ed;
        convert_tw_extensible_to_tw_extended(ed, &text);
 	scalarmul_ed(ed, scalar);
 	field_copy(work.x, ed->x);
 	field_copy(work.y, ed->y);
 	field_copy(work.z, ed->z);
 	field_copy(work.t, ed->t);
 	field_set_ui(work.u, 1);
        decaf_scalarmul(ed2, (struct decaf_point_s *)ed, scalar, 7);

        scalarmul_ed(ed, scalar);
        field_copy(work.x, ed->x);
        field_copy(work.y, ed->y);
        field_copy(work.z, ed->z);
        field_copy(work.t, ed->t);
        field_set_ui(work.u, 1);
        untwist_and_double_and_serialize(sced, &work);

        uint8_t ser1[(FIELD_BITS+6)/8], ser2[(FIELD_BITS+6)/8];
        decaf_encode(ser1, (struct decaf_point_s *)ed);
        decaf_encode(ser2, ed2);

        /* check consistency mont vs window */
        consistent &= field_eq(mont, ct);
        consistent &= field_eq(mont, vl);
        consistent &= field_eq(mont, vt);
        consistent &= field_eq(mont, sced);
        consistent &= memcmp(ser1,ser2,sizeof(ser1)) ? 0 : -1;
    }
    
    /* check consistency mont vs combs */
@@ -141,7 +150,7 @@ single_scalarmul_compatibility_test (
    copy_tw_extensible(&work,&text);
    double_tw_extensible(&work);
    decaf_serialize_tw_extensible(decaf_s, &work);
    

    mask_t succ_dm, succ_dta;
    succ_dm  = decaf_montgomery_ladder(decaf_m, decaf_s, scalar, nbits);
    succ_dta = deserialize_and_twist_approx(&work, mont);