dual scalarmul because of TLS discussion

9 years ago · 704b424982
--- a/src/decaf_fast.c
+++ b/src/decaf_fast.c
@@ -1064,6 +1064,106 @@ void API_NS(point_double_scalarmul) (
    decaf_bzero(tmp,sizeof(tmp));
 }

 /**
 * Multiply a single point by two scalars in one pass:
 *   a1 = scalar1 * b,   a2 = scalar2 * b.
 *
 * The point `working` is doubled WINDOW times per iteration and the same
 * doubled value is fed to both scalars, so the doublings are shared between
 * the two multiplications.
 *
 * b is copied into a local before anything is written, and a1/a2 are only
 * written at the very end (a1 first, then a2).
 *
 * Table reads and writes are routed through constant_time_lookup_xx and
 * constant_time_insert instead of direct indexing with the scalar-derived
 * index.  NOTE(review): constant_time_lookup_xx is defined elsewhere; confirm
 * it has the same full-scan behaviour as constant_time_insert.
 */
 void API_NS(point_dual_scalarmul) (
    point_t a1,
    point_t a2,
    const point_t b,
    const scalar_t scalar1,
    const scalar_t scalar2
 ) {
    /* WINDOW_MASK selects one WINDOW-bit digit; WINDOW_T_MASK selects its low
     * WINDOW-1 bits, which index one of the NTABLE = 2^(WINDOW-1) buckets. */
    const int WINDOW = DECAF_WINDOW_BITS,
        WINDOW_MASK = (1<<WINDOW)-1,
        WINDOW_T_MASK = WINDOW_MASK >> 1,
        NTABLE = 1<<(WINDOW-1);
        
    /* Recode each scalar as (scalar + point_scalarmul_adjustment) halved via
     * sc_halve with sc_p (presumably a halving modulo sc_p -- confirm against
     * sc_halve).  The digits of the recoded scalar are consumed below as
     * signed odd digits. */
    scalar_t scalar1x, scalar2x;
    API_NS(scalar_add)(scalar1x, scalar1, API_NS(point_scalarmul_adjustment));
    sc_halve(scalar1x,scalar1x,sc_p);
    API_NS(scalar_add)(scalar2x, scalar2, API_NS(point_scalarmul_adjustment));
    sc_halve(scalar2x,scalar2x,sc_p);
    
    /* Bucket accumulators, not precomputed multiples: multiplesK[j] collects
     * every (signed) window multiple of b whose digit for scalar K selects
     * index j.  Bucket j is weighted by (2j+1) in the final combination. */
    point_t multiples1[NTABLE], multiples2[NTABLE], working, tmp;
    pniels_t pn;
    
    /* working starts as b and becomes 2^i * b at window offset i. */
    API_NS(point_copy)(working, b);

    /* Initialize. */
    int i,j;
    
    /* Every bucket starts out as the identity point. */
    for (i=0; i<NTABLE; i++) {
        API_NS(point_copy)(multiples1[i], API_NS(point_identity));
        API_NS(point_copy)(multiples2[i], API_NS(point_identity));
    }

    /* Walk the scalars from the least significant window upwards. */
    for (i=0; i<SCALAR_BITS; i+=WINDOW) {   
        if (i) {
            /* WINDOW doublings in total.  The last argument differs only on
             * the final doubling; its meaning is defined by
             * point_double_internal (not visible here). */
            for (j=0; j<WINDOW-1; j++)
                point_double_internal(working, working, -1);
            point_double_internal(working, working, 0);
        }
        
        /* Fetch another block of bits */
        decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
                     bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS);
        /* If the window may reach the top of this limb, merge in the low bits
         * of the next limb (when there is one). */
        if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
            bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
            bits2 ^= scalar2x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
        }
        bits1 &= WINDOW_MASK;
        bits2 &= WINDOW_MASK;
        /* invN is 0 when the top bit of the digit is set and (top bit clear)
         * 0 - 1, i.e. all-ones assuming decaf_word_t is unsigned.  XORing it
         * in complements the digit in the latter case; invN then doubles as
         * the negation mask for the point that gets added. */
        decaf_word_t inv1 = (bits1>>(WINDOW-1))-1;
        decaf_word_t inv2 = (bits2>>(WINDOW-1))-1;
        bits1 ^= inv1;
        bits2 ^= inv2;
        
        /* One conversion of the current multiple serves both scalars. */
        pt_to_pniels(pn, working);

        /* Scalar 1: bucket[index] += (conditionally negated) pn. */
        constant_time_lookup_xx(tmp, multiples1, sizeof(tmp), NTABLE, bits1 & WINDOW_T_MASK);
        cond_neg_niels(pn->n, inv1);
        /* Direct-index form of the lookup/add/insert sequence, for reference: */
        /* add_pniels_to_pt(multiples1[bits1 & WINDOW_T_MASK], pn, 0); */
        add_pniels_to_pt(tmp, pn, 0);
        constant_time_insert(multiples1, tmp, sizeof(tmp), NTABLE, bits1 & WINDOW_T_MASK);
        
        
        /* Scalar 2: pn still carries the inv1 conditional negation, so
         * applying inv1^inv2 leaves it negated exactly according to inv2. */
        constant_time_lookup_xx(tmp, multiples2, sizeof(tmp), NTABLE, bits2 & WINDOW_T_MASK);
        cond_neg_niels(pn->n, inv1^inv2);
        /* Direct-index form of the lookup/add/insert sequence, for reference: */
        /* add_pniels_to_pt(multiples2[bits2 & WINDOW_T_MASK], pn, 0); */
        add_pniels_to_pt(tmp, pn, 0);
        constant_time_insert(multiples2, tmp, sizeof(tmp), NTABLE, bits2 & WINDOW_T_MASK);
    }
    
    /* Combine the buckets: result = sum over j of (2j+1) * bucket[j].
     * Running suffix sums are built in place in multiplesK[], and
     * working/tmp accumulate sum_j j*bucket[j]; doubling that and adding the
     * total of all buckets gives the weighted sum. */
    if (NTABLE > 1) {
        API_NS(point_copy)(working, multiples1[NTABLE-1]);
        API_NS(point_copy)(tmp    , multiples2[NTABLE-1]);
    
        for (i=NTABLE-1; i>1; i--) {
            API_NS(point_add)(multiples1[i-1], multiples1[i-1], multiples1[i]);
            API_NS(point_add)(multiples2[i-1], multiples2[i-1], multiples2[i]);
            API_NS(point_add)(working, working, multiples1[i-1]);
            API_NS(point_add)(tmp,     tmp,     multiples2[i-1]);
        }
    
        /* multiplesK[0] becomes the sum of all buckets. */
        API_NS(point_add)(multiples1[0], multiples1[0], multiples1[1]);
        API_NS(point_add)(multiples2[0], multiples2[0], multiples2[1]);
        point_double_internal(working, working, 0);
        point_double_internal(tmp,         tmp, 0);
        API_NS(point_add)(a1, working, multiples1[0]);
        API_NS(point_add)(a2, tmp,     multiples2[0]);
    } else {
        /* Single bucket with weight 1: it already holds the result. */
        API_NS(point_copy)(a1, multiples1[0]);
        API_NS(point_copy)(a2, multiples2[0]);
    }

    /* Wipe the scalar-derived temporaries before returning. */
    decaf_bzero(scalar1x,sizeof(scalar1x));
    decaf_bzero(scalar2x,sizeof(scalar2x));
    decaf_bzero(pn,sizeof(pn));
    decaf_bzero(multiples1,sizeof(multiples1));
    decaf_bzero(multiples2,sizeof(multiples2));
    decaf_bzero(tmp,sizeof(tmp));
    decaf_bzero(working,sizeof(working));
 }

 decaf_bool_t API_NS(point_eq) ( const point_t p, const point_t q ) {
    /* equality mod 2-torsion compares x/y */
    gf a, b;
--- a/src/include/constant_time.h
+++ b/src/include/constant_time.h
@@ -184,6 +184,73 @@ constant_time_lookup (
    }
 }

 /**
 * @brief Constant-time equivalent of memcpy(table + elem_bytes*idx, in, elem_bytes);
 *
 * All n_table entries are read and written back; the entry at position idx
 * is the one replaced by the input, chosen with a mask rather than a branch
 * or an idx-dependent address.  (The masks come from br_is_zero and
 * word_is_zero, which are presumed to return all-ones on zero and zero
 * otherwise.)  Under that presumption, if idx >= n_table no entry matches
 * and the table is left unchanged.
 *
 * Each element is processed in three stages: big_register_t-sized chunks,
 * then word_t-sized chunks, then single bytes for whatever remains.  The
 * loop bounds are written as "k + size <= elem_bytes" so that an elem_bytes
 * smaller than the chunk size simply skips that stage instead of
 * underflowing the unsigned subtraction.
 *
 * The table must be at least as aligned as elem_bytes.  The input must be word aligned,
 * and if the output size is vector aligned it must also be vector aligned.
 *
 * The table and input must not alias.
 *
 * @param [in,out] table_ The table, n_table entries of elem_bytes each.
 * @param [in] in_ The element to store.
 * @param [in] elem_bytes Size of one table entry in bytes.
 * @param [in] n_table Number of entries in the table.
 * @param [in] idx Index of the entry to overwrite.
 */
 static __inline__ void
 __attribute__((unused,always_inline))
 constant_time_insert (
    void *__restrict__ table_,
    const void *in_,
    word_t elem_bytes,
    word_t n_table,
    word_t idx
 ) {
    /* big_i counts down from idx, so it reaches zero exactly when j == idx. */
    big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx);
    
    /* Can't do pointer arithmetic on void* */
    const unsigned char *in = (const unsigned char *)in_;
    unsigned char *table = (unsigned char *)table_;
    word_t j,k;
    
    for (j=0; j<n_table; j++, big_i-=big_one) {        
        /* Vector-width stage: select between old entry and input by mask. */
        big_register_t br_mask = br_is_zero(big_i);
        for (k=0; k+sizeof(big_register_t)<=elem_bytes; k+=sizeof(big_register_t)) {
            if (elem_bytes % sizeof(big_register_t)) {
                /* unaligned */
                ((unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned
                    = ( ((unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned & ~br_mask )
                    | ( ((const unaligned_br_t *)(in+k))->unaligned & br_mask );
            } else {
                /* aligned */
                *(big_register_t*)(&table[k+j*elem_bytes])
                    = ( *(big_register_t*)(&table[k+j*elem_bytes]) & ~br_mask )
                    | ( *(const big_register_t *)(in+k) & br_mask );
            }
        }

        /* Word-width stage for the tail that does not fill a big register. */
        word_t mask = word_is_zero(idx^j);
        if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
            for (; k+sizeof(word_t)<=elem_bytes; k+=sizeof(word_t)) {
                if (elem_bytes % sizeof(word_t)) {
                    /* output unaligned, input aligned */
                    ((unaligned_word_t*)(&table[k+j*elem_bytes]))->unaligned
                        = ( ((unaligned_word_t*)(&table[k+j*elem_bytes]))->unaligned & ~mask )
                        | ( *(const word_t *)(in+k) & mask );
                } else {
                    /* aligned */
                    *(word_t*)(&table[k+j*elem_bytes])
                        = ( *(word_t*)(&table[k+j*elem_bytes]) & ~mask )
                        | ( *(const word_t *)(in+k) & mask );
                }
            }
        }
        
        /* Byte-width stage for any remaining bytes. */
        if (elem_bytes % sizeof(word_t)) {
            for (; k<elem_bytes; k+=1) {
                table[k+j*elem_bytes]
                    = ( table[k+j*elem_bytes] & ~mask )
                    | ( in[k] & mask );
            }
        }
    }
 }

 /**
 * @brief Constant-time a = b&mask.
 *
--- a/src/public_include/decaf/decaf_255.h
+++ b/src/public_include/decaf/decaf_255.h
@@ -391,6 +391,28 @@ void decaf_255_point_double_scalarmul (
    const decaf_255_point_t base2,
    const decaf_255_scalar_t scalar2
 ) API_VIS NONNULL5 NOINLINE;
    
 /**
 * @brief Multiply one base point by two scalars:
 * a1 = scalar1 * b
 * a2 = scalar2 * b
 *
 * Equivalent to two calls to decaf_255_point_scalarmul, but may be
 * faster.
 *
 * @param [out] a1 The first multiple
 * @param [out] a2 The second multiple
 * @param [in] b A point to be scaled.
 * @param [in] scalar1 A first scalar to multiply by.
 * @param [in] scalar2 A second scalar to multiply by.
 */
 void decaf_255_point_dual_scalarmul (
    decaf_255_point_t a1,
    decaf_255_point_t a2,
    const decaf_255_point_t b,
    const decaf_255_scalar_t scalar1,
    const decaf_255_scalar_t scalar2
 ) API_VIS NONNULL5 NOINLINE;

 /**
 * @brief Multiply two base points by two scalars:
--- a/src/public_include/decaf/decaf_255.hxx
+++ b/src/public_include/decaf/decaf_255.hxx
@@ -363,6 +363,13 @@ public:
        Point p((NOINIT())); decaf_255_point_double_scalarmul(p.p,q.p,qs.s,r.p,rs.s); return p;
    }
    
    /**
     * @brief Dual-scalar multiply: sets q1 = (*this)*r1 and q2 = (*this)*r2.
     * Equivalent to two separate scalar multiplications of this point,
     * but intended to be faster.
     */
    inline void dual_scalarmul (
        Point &q1,
        Point &q2,
        const Scalar &r1,
        const Scalar &r2
    ) const NOEXCEPT {
        decaf_255_point_dual_scalarmul(q1.p, q2.p, this->p, r1.s, r2.s);
    }
    
    /**
     * @brief Double-scalar multiply, equivalent to q*qs + r*rs but faster.
     * For those who like their scalars before the point.
--- a/src/public_include/decaf/decaf_448.h
+++ b/src/public_include/decaf/decaf_448.h
@@ -394,6 +394,28 @@ void decaf_448_point_double_scalarmul (
    const decaf_448_point_t base2,
    const decaf_448_scalar_t scalar2
 ) API_VIS NONNULL5 NOINLINE;
    
 /**
 * @brief Multiply one base point by two scalars:
 * a1 = scalar1 * b
 * a2 = scalar2 * b
 *
 * Equivalent to two calls to decaf_448_point_scalarmul, but may be
 * faster.
 *
 * @param [out] a1 The first multiple
 * @param [out] a2 The second multiple
 * @param [in] b A point to be scaled.
 * @param [in] scalar1 A first scalar to multiply by.
 * @param [in] scalar2 A second scalar to multiply by.
 */
 void decaf_448_point_dual_scalarmul (
   decaf_448_point_t a1,
   decaf_448_point_t a2,
   const decaf_448_point_t b,
   const decaf_448_scalar_t scalar1,
   const decaf_448_scalar_t scalar2
 ) API_VIS NONNULL5 NOINLINE;

 /**
 * @brief Multiply two base points by two scalars:
--- a/src/public_include/decaf/decaf_448.hxx
+++ b/src/public_include/decaf/decaf_448.hxx
@@ -374,6 +374,13 @@ public:
    ) NOEXCEPT {
        Point p((NOINIT())); decaf_448_point_double_scalarmul(p.p,q.p,qs.s,r.p,rs.s); return p;
    }

    /**
     * @brief Dual-scalar multiply: sets q1 = (*this)*r1 and q2 = (*this)*r2.
     * Equivalent to two separate scalar multiplications of this point,
     * but intended to be faster.
     */
    inline void dual_scalarmul (
        Point &q1,
        Point &q2,
        const Scalar &r1,
        const Scalar &r2
    ) const NOEXCEPT {
        decaf_448_point_dual_scalarmul(q1.p, q2.p, this->p, r1.s, r2.s);
    }
    
    /**
     * @brief Double-scalar multiply, equivalent to q*qs + r*rs but faster.
--- a/test/bench_decaf.cxx
+++ b/test/bench_decaf.cxx
@@ -358,6 +358,7 @@ static void micro() {
    for (Benchmark b("Point unhash uniform"); b.iter(); ) { ignore_result(p.invert_elligator(ep2,0)); }
    for (Benchmark b("Point steg"); b.iter(); ) { p.steg_encode(rng); }
    for (Benchmark b("Point double scalarmul"); b.iter(); ) { Point::double_scalarmul(p,s,q,t); }
    for (Benchmark b("Point dual scalarmul"); b.iter(); ) { p.dual_scalarmul(p,q,s,t); }
    for (Benchmark b("Point precmp scalarmul"); b.iter(); ) { pBase * s; }
    for (Benchmark b("Point double scalarmul_v"); b.iter(); ) {
        s = Scalar(rng);
--- a/test/test_decaf.cxx
+++ b/test/test_decaf.cxx
@@ -286,6 +286,8 @@ static void test_ec() {
        Point p(rng);
        Point q(rng);
        
        Point d1, d2;
        
        SecureBuffer buffer(2*Point::HASH_BYTES);
        rng.read(buffer);
        Point r = Point::from_hash(buffer);
@@ -305,7 +307,12 @@ static void test_ec() {
        if (i%10) continue;
        point_check(test,p,q,r,x,0,x*(p+q),x*p+x*q,"distr mul");
        point_check(test,p,q,r,x,y,(x*y)*p,x*(y*p),"assoc mul");
        point_check(test,p,q,r,x,y,x*p+y*q,Point::double_scalarmul(x,p,y,q),"ds mul");
        point_check(test,p,q,r,x,y,x*p+y*q,Point::double_scalarmul(x,p,y,q),"double mul");
        
        p.dual_scalarmul(d1,d2,x,y);
        point_check(test,p,q,r,x,y,x*p,d1,"dual mul 1");
        point_check(test,p,q,r,x,y,y*p,d2,"dual mul 2");
        
        point_check(test,base,q,r,x,y,x*base+y*q,q.non_secret_combo_with_base(y,x),"ds vt mul");
        point_check(test,p,q,r,x,0,Precomputed(p)*x,p*x,"precomp mul");
        point_check(test,p,q,r,0,0,r,