From 704b4249827641a29565f68c851feb22db37accc Mon Sep 17 00:00:00 2001 From: Mike Hamburg Date: Tue, 24 Nov 2015 12:00:00 -0800 Subject: [PATCH] dual scalarmul because of TLS discussion --- src/decaf_fast.c | 100 +++++++++++++++++++++++++ src/include/constant_time.h | 67 +++++++++++++++++ src/public_include/decaf/decaf_255.h | 22 ++++++ src/public_include/decaf/decaf_255.hxx | 7 ++ src/public_include/decaf/decaf_448.h | 22 ++++++ src/public_include/decaf/decaf_448.hxx | 7 ++ test/bench_decaf.cxx | 1 + test/test_decaf.cxx | 9 ++- 8 files changed, 234 insertions(+), 1 deletion(-) diff --git a/src/decaf_fast.c b/src/decaf_fast.c index bf85a3d..2025ca3 100644 --- a/src/decaf_fast.c +++ b/src/decaf_fast.c @@ -1064,6 +1064,106 @@ void API_NS(point_double_scalarmul) ( decaf_bzero(tmp,sizeof(tmp)); } +/** Dual scalar multiplication: fills a1 and a2 from the single input point b, driving a1 with scalar1 and a2 with scalar2 (presumably a1 = scalar1 times b and a2 = scalar2 times b; confirm against the declarations in the public headers touched by this patch). Each iteration of the main loop converts the running point to pniels form once and adds it into one entry of each table; table entries are read and written through constant_time_lookup_xx / constant_time_insert rather than by direct indexing. NOTE(review): this copy of the patch looks extraction-damaged; in several places the text from a less-than sign up to the next greater-than sign is missing, so the code is not valid C as shown. Compare with the original commit before applying. */ void API_NS(point_dual_scalarmul) ( + point_t a1, + point_t a2, + const point_t b, + const scalar_t scalar1, + const scalar_t scalar2 +) { + const int WINDOW = DECAF_WINDOW_BITS, + /* NOTE(review): the next initializer is truncated in this copy; WINDOW_T_MASK is used further down but its declaration is not visible here. */ WINDOW_MASK = (1<> 1, + NTABLE = 1<<(WINDOW-1); + + /* Working copies of the scalars: each has point_scalarmul_adjustment added and is then passed through sc_halve with sc_p. */ scalar_t scalar1x, scalar2x; + API_NS(scalar_add)(scalar1x, scalar1, API_NS(point_scalarmul_adjustment)); + sc_halve(scalar1x,scalar1x,sc_p); + API_NS(scalar_add)(scalar2x, scalar2, API_NS(point_scalarmul_adjustment)); + sc_halve(scalar2x,scalar2x,sc_p); + + /* Set up a precomputed table with odd multiples of b. */ + point_t multiples1[NTABLE], multiples2[NTABLE], working, tmp; + pniels_t pn; + + API_NS(point_copy)(working, b); + + /* Initialize. 
*/ + int i,j; + + /* NOTE(review): the loop header(s) and the start of the bits1 declaration are missing from this copy; only the tail of the bits1 initializer survives. */ for (i=0; ilimb[i/WBITS] >> (i%WBITS), + bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS); + /* A window that straddles a limb boundary also takes bits from the following limb. NOTE(review): the rest of this condition and the matching bits1 update are missing from this copy. */ if (i%WBITS >= WBITS-WINDOW && i/WBITSlimb[i/WBITS+1] << (WBITS - (i%WBITS)); + bits2 ^= scalar2x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); + } + bits1 &= WINDOW_MASK; + bits2 &= WINDOW_MASK; + /* invN is 0 when the top bit of the masked window is set and, assuming decaf_word_t is unsigned, all ones otherwise; XORing it in complements the window in the latter case. */ decaf_word_t inv1 = (bits1>>(WINDOW-1))-1; + decaf_word_t inv2 = (bits2>>(WINDOW-1))-1; + bits1 ^= inv1; + bits2 ^= inv2; + + /* One pniels conversion of the running point serves both scalars. */ pt_to_pniels(pn, working); + + /* Read the selected entry into tmp, add pn, write it back: this replaces the commented-out directly indexed add below. */ constant_time_lookup_xx(tmp, multiples1, sizeof(tmp), NTABLE, bits1 & WINDOW_T_MASK); + cond_neg_niels(pn->n, inv1); + /* add_pniels_to_pt(multiples1[bits1 & WINDOW_T_MASK], pn, 0); */ + add_pniels_to_pt(tmp, pn, 0); + constant_time_insert(multiples1, tmp, sizeof(tmp), NTABLE, bits1 & WINDOW_T_MASK); + + + constant_time_lookup_xx(tmp, multiples2, sizeof(tmp), NTABLE, bits2 & WINDOW_T_MASK); + /* pn was already passed through cond_neg_niels with inv1, so the mask here is inv1^inv2; presumably this leaves pn negated according to inv2 alone (confirm cond_neg_niels semantics). */ cond_neg_niels(pn->n, inv1^inv2); + /* add_pniels_to_pt(multiples2[bits2 & WINDOW_T_MASK], pn, 0); */ + add_pniels_to_pt(tmp, pn, 0); + constant_time_insert(multiples2, tmp, sizeof(tmp), NTABLE, bits2 & WINDOW_T_MASK); + } + + /* Combine the table entries into the two results, walking from the highest index down: working accumulates from multiples1 and tmp from multiples2. */ if (NTABLE > 1) { + API_NS(point_copy)(working, multiples1[NTABLE-1]); + API_NS(point_copy)(tmp , multiples2[NTABLE-1]); + + for (i=NTABLE-1; i>1; i--) { + API_NS(point_add)(multiples1[i-1], multiples1[i-1], multiples1[i]); + API_NS(point_add)(multiples2[i-1], multiples2[i-1], multiples2[i]); + API_NS(point_add)(working, working, multiples1[i-1]); + API_NS(point_add)(tmp, tmp, multiples2[i-1]); + } + + API_NS(point_add)(multiples1[0], multiples1[0], multiples1[1]); + API_NS(point_add)(multiples2[0], multiples2[0], multiples2[1]); + point_double_internal(working, working, 0); + point_double_internal(tmp, tmp, 0); + API_NS(point_add)(a1, working, multiples1[0]); + API_NS(point_add)(a2, tmp, multiples2[0]); + } else { + /* Single-entry tables: the lone entry is copied out as the result. */ API_NS(point_copy)(a1, multiples1[0]); + API_NS(point_copy)(a2, multiples2[0]); + } + + /* Clear every local before returning (presumably so no scalar-derived data is left on the stack). */ decaf_bzero(scalar1x,sizeof(scalar1x)); + decaf_bzero(scalar2x,sizeof(scalar2x)); + 
decaf_bzero(pn,sizeof(pn)); + decaf_bzero(multiples1,sizeof(multiples1)); + decaf_bzero(multiples2,sizeof(multiples2)); + decaf_bzero(tmp,sizeof(tmp)); + decaf_bzero(working,sizeof(working)); +} + decaf_bool_t API_NS(point_eq) ( const point_t p, const point_t q ) { /* equality mod 2-torsion compares x/y */ gf a, b; diff --git a/src/include/constant_time.h b/src/include/constant_time.h index 170b4d9..2cc0ee4 100644 --- a/src/include/constant_time.h +++ b/src/include/constant_time.h @@ -184,6 +184,73 @@ constant_time_lookup ( } } +/** + * @brief Constant-time equivalent of memcpy(table + elem_bytes*idx, in, elem_bytes); + * + * The table must be at least as aligned as elem_bytes. The input must be word aligned, + * and if the output size is vector aligned it must also be vector aligned. + * + * The table and input must not alias. + */ +static __inline__ void +__attribute__((unused,always_inline)) +constant_time_insert ( + void *__restrict__ table_, + const void *in_, + word_t elem_bytes, + word_t n_table, + word_t idx +) { + big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx); + + /* Can't do pointer arithmetic on void* */ + const unsigned char *in = (const unsigned char *)in_; + unsigned char *table = (unsigned char *)table_; + word_t j,k; + + for (j=0; junaligned + = ( ((unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned & ~br_mask ) + | ( ((const unaligned_br_t *)(in+k))->unaligned & br_mask ); + } else { + /* aligned */ + *(big_register_t*)(&table[k+j*elem_bytes]) + = ( *(big_register_t*)(&table[k+j*elem_bytes]) & ~br_mask ) + | ( *(const big_register_t *)(in+k) & br_mask ); + } + } + + word_t mask = word_is_zero(idx^j); + if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) { + for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) { + if (elem_bytes % sizeof(word_t)) { + /* output unaligned, input aligned */ + ((unaligned_word_t*)(&table[k+j*elem_bytes]))->unaligned + = ( ((unaligned_word_t*)(&table[k+j*elem_bytes]))->unaligned & 
~mask ) + | ( *(const word_t *)(in+k) & mask ); + } else { + /* aligned */ + *(word_t*)(&table[k+j*elem_bytes]) + = ( *(word_t*)(&table[k+j*elem_bytes]) & ~mask ) + | ( *(const word_t *)(in+k) & mask ); + } + } + } + + if (elem_bytes % sizeof(word_t)) { + for (; k