From d36b1b0d1bc756ffa040e6cac57d5551fea06f06 Mon Sep 17 00:00:00 2001 From: Mike Hamburg Date: Thu, 5 Mar 2015 23:54:23 -0800 Subject: [PATCH] begin porting faster scalarmul algos (currently broken tho) --- src/decaf_fast.c | 158 ++++++++++++++++++++++++++++++++++++++++++----- src/scalarmul.c | 1 + 2 files changed, 145 insertions(+), 14 deletions(-) diff --git a/src/decaf_fast.c b/src/decaf_fast.c index 0145f8e..1874c87 100644 --- a/src/decaf_fast.c +++ b/src/decaf_fast.c @@ -13,7 +13,8 @@ #include #include "field.h" -#include "constant_time.h" /* TODO REMOVE */ + /* TODO REMOVE */ +#include "constant_time.h" #define WBITS DECAF_WORD_BITS @@ -547,7 +548,6 @@ void decaf_448_point_add ( gf_mul ( p->t, b, c ); } -/* No dedicated point double yet (PERF) */ void decaf_448_point_double(decaf_448_point_t p, const decaf_448_point_t q) { gf a, b, c, d; gf_sqr ( c, q->x ); @@ -563,6 +563,7 @@ void decaf_448_point_double(decaf_448_point_t p, const decaf_448_point_t q) { gf_mul ( p->x, a, b ); gf_mul ( p->z, p->t, a ); gf_mul ( p->y, p->t, d ); + /* TODO: conditional? */ gf_mul ( p->t, b, d ); } @@ -670,7 +671,7 @@ void decaf_448_scalar_encode( } } -void decaf_448_point_scalarmul ( +void decaf_448_point_scalarmul_xxx ( decaf_448_point_t a, const decaf_448_point_t b, const decaf_448_scalar_t scalar @@ -696,6 +697,137 @@ void decaf_448_point_scalarmul ( decaf_448_point_sub(a,w,tmp); } +/* Projective Niels coordinates */ +typedef struct { gf a, b, c; } niels_s, niels_t[1]; +typedef struct { niels_t n; gf z; } pniels_s, pniels_t[1]; + +static void cond_neg_pniels ( + pniels_t b, + decaf_bool_t neg +) { + cond_swap(b->n->a, b->n->b, neg); + cond_neg(b->n->c, neg); +} + +static void pt_to_pniels ( + pniels_t b, + const decaf_448_point_t a +) { + gf_sub ( b->n->a, a->y, a->x ); + gf_add ( b->n->b, a->x, a->y ); + gf_mlw ( b->n->c, a->t, 2*EDWARDS_D-2 ); + gf_add ( b->z, a->z, a->z ); +} + +static void pniels_to_pt ( + decaf_448_point_t e, + const pniels_t d +) { + gf eu; + gf_add ( eu, d->n->b, d->n->a ); + gf_sub ( e->y, d->n->b, d->n->a ); + gf_mul ( e->t, e->y, eu); + gf_mul ( e->x, d->z, e->y ); + gf_mul ( e->y, d->z, eu ); + gf_sqr ( e->z, d->z ); +} + +static void add_niels_to_pt ( + decaf_448_point_t d, + const niels_t e +) { + gf a, b, c; + gf_sub_nr ( b, d->y, d->x ); + gf_mul ( a, e->a, b ); + gf_add_nr ( b, d->x, d->y ); + gf_mul ( d->y, e->b, b ); + gf_mul ( d->x, e->c, d->t ); + gf_add_nr ( c, a, d->y ); + gf_sub_nr ( b, d->y, a ); + gf_sub_nr ( d->y, d->z, d->x ); + gf_add_nr ( a, d->x, d->z ); + gf_mul ( d->z, a, d->y ); + gf_mul ( d->x, d->y, b ); + gf_mul ( d->y, a, c ); + /* TODO: if... */ + gf_mul ( d->t, b, c ); +} + +static void add_pniels_to_pt ( + decaf_448_point_t p, + const pniels_t pn +) { + gf L0; + gf_mul ( L0, p->z, pn->z ); + gf_cpy ( p->z, L0 ); + add_niels_to_pt( p, pn->n ); +} + +void decaf_448_point_scalarmul ( + decaf_448_point_t a, + const decaf_448_point_t b, + const decaf_448_scalar_t scalar +) { + const int WINDOW = 5, + WINDOW_MASK = (1<> 1, + NTABLE = 1<<(WINDOW-1); + + decaf_448_scalar_t scalar2, onehalf = {{{0}}}, arrr; + onehalf->limb[SCALAR_WORDS-1] = 1ull<<(WBITS-1); + + /* PERF dedicated halve */ + decaf_448_scalar_sub(scalar2, scalar, decaf_448_scalar_one); + decaf_448_montmul(scalar2,scalar2,onehalf,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR); + + /* FIXME PERF precompute 2^447-1/2 mod q. Could instead use 2^446-1/2 mod q though. */ + decaf_448_montmul(arrr,decaf_448_scalar_one,decaf_448_scalar_r2,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR); + decaf_448_montmul(arrr,arrr,onehalf,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR); + + decaf_448_scalar_add(scalar2, scalar2, arrr); + + pniels_t pn, multiples[NTABLE]; + pt_to_pniels(multiples[0], b); + decaf_448_point_double(a, b); + pt_to_pniels(pn, a); + + int i,j; + for (i=1; ilimb[i/WBITS] >> (i%WBITS), + inv = (bits>>((448 % WINDOW)-1))-1; + bits ^= inv; + + constant_time_lookup(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_U_MASK); + cond_neg_pniels(pn, inv); + pniels_to_pt(a, pn); + + for (i-=WINDOW; i>=0; i-=WINDOW) { + for (j=0; jlimb[i/WBITS] >> (i%WBITS); + + if (i%WBITS >= WBITS-WINDOW) { + bits ^= scalar2->limb[i/WBITS+1] << (WBITS - (i%WBITS)); + } + + bits &= WINDOW_MASK; + inv = (bits>>(WINDOW-1))-1; + bits ^= inv; + + constant_time_lookup(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_T_MASK); + cond_neg_pniels(pn, inv); + add_pniels_to_pt(a, pn); + } +} + void decaf_448_point_double_scalarmul ( decaf_448_point_t a, const decaf_448_point_t b, @@ -854,10 +986,9 @@ decaf_bool_t decaf_448_direct_scalarmul ( decaf_bool_t pflip = 0; for (j=DECAF_448_SCALAR_BITS+1; j>=0; j--) { /* FIXME: -1, but the test cases use too many bits */ - /* TODO PERF: consider a selection-based ladder. It uses more memory but is probably faster. */ /* Augmented Montgomery ladder */ - decaf_bool_t flip = -((scalar->limb[j/WORD_BITS]>>(j%WORD_BITS))&1); + decaf_bool_t flip = -((scalar->limb[j/WBITS]>>(j%WBITS))&1); /* Differential add first... */ gf_add_nr ( xs, xa, za ); @@ -873,19 +1004,18 @@ decaf_bool_t decaf_448_direct_scalarmul ( gf_add_nr ( xs, xd, zd ); gf_sub_nr ( zd, xd, zd ); gf_mul ( zs, zd, s0 ); + gf_sqr ( xa, xs ); + gf_sqr ( za, zs ); /* ... and then double */ gf_sqr ( zd, L0 ); - gf_sqr ( xa, L1 ); - gf_sub_nr ( za, zd, xa ); - gf_mul ( xd, xa, zd ); - gf_mlw ( zd, za, 1-EDWARDS_D ); - gf_add_nr ( xa, xa, zd ); - gf_mul ( zd, xa, za ); + gf_sqr ( L0, L1 ); + gf_sub_nr ( L1, zd, L0 ); + gf_mul ( xd, L0, zd ); + gf_mlw ( zd, L1, 1-EDWARDS_D ); + gf_add_nr ( L0, L0, zd ); + gf_mul ( zd, L0, L1 ); - /* OK, finish the dadd */ - gf_sqr ( xa, xs ); - gf_sqr ( za, zs ); pflip = flip; } cond_swap(xa,xd,pflip); diff --git a/src/scalarmul.c b/src/scalarmul.c index 07d35b8..f7d7d09 100644 --- a/src/scalarmul.c +++ b/src/scalarmul.c @@ -153,6 +153,7 @@ scalarmul ( SCALARMUL_FIXED_WINDOW_ADJUSTMENT, SCALAR_WORDS ); + /* FIXME: tabulator is redundant */ tw_extensible_a_t tabulator; copy_tw_extensible(tabulator, working); double_tw_extensible(tabulator);