| @@ -548,7 +548,11 @@ void decaf_448_point_add ( | |||||
| gf_mul ( p->t, b, c ); | gf_mul ( p->t, b, c ); | ||||
| } | } | ||||
| void decaf_448_point_double(decaf_448_point_t p, const decaf_448_point_t q) { | |||||
| static void decaf_448_point_double_internal ( | |||||
| decaf_448_point_t p, | |||||
| const decaf_448_point_t q, | |||||
| decaf_bool_t before_double | |||||
| ) { | |||||
| gf a, b, c, d; | gf a, b, c, d; | ||||
| gf_sqr ( c, q->x ); | gf_sqr ( c, q->x ); | ||||
| gf_sqr ( a, q->y ); | gf_sqr ( a, q->y ); | ||||
| @@ -563,8 +567,11 @@ void decaf_448_point_double(decaf_448_point_t p, const decaf_448_point_t q) { | |||||
| gf_mul ( p->x, a, b ); | gf_mul ( p->x, a, b ); | ||||
| gf_mul ( p->z, p->t, a ); | gf_mul ( p->z, p->t, a ); | ||||
| gf_mul ( p->y, p->t, d ); | gf_mul ( p->y, p->t, d ); | ||||
| /* TODO: conditional? */ | |||||
| gf_mul ( p->t, b, d ); | |||||
| if (!before_double) gf_mul ( p->t, b, d ); | |||||
| } | |||||
| void decaf_448_point_double(decaf_448_point_t p, const decaf_448_point_t q) { | |||||
| decaf_448_point_double_internal(p,q,0); | |||||
| } | } | ||||
| void decaf_448_point_copy ( | void decaf_448_point_copy ( | ||||
| @@ -734,7 +741,8 @@ static void pniels_to_pt ( | |||||
| static void add_niels_to_pt ( | static void add_niels_to_pt ( | ||||
| decaf_448_point_t d, | decaf_448_point_t d, | ||||
| const niels_t e | |||||
| const niels_t e, | |||||
| decaf_bool_t before_double | |||||
| ) { | ) { | ||||
| gf a, b, c; | gf a, b, c; | ||||
| gf_sub_nr ( b, d->y, d->x ); | gf_sub_nr ( b, d->y, d->x ); | ||||
| @@ -749,18 +757,18 @@ static void add_niels_to_pt ( | |||||
| gf_mul ( d->z, a, d->y ); | gf_mul ( d->z, a, d->y ); | ||||
| gf_mul ( d->x, d->y, b ); | gf_mul ( d->x, d->y, b ); | ||||
| gf_mul ( d->y, a, c ); | gf_mul ( d->y, a, c ); | ||||
| /* TODO: if... */ | |||||
| gf_mul ( d->t, b, c ); | |||||
| if (!before_double) gf_mul ( d->t, b, c ); | |||||
| } | } | ||||
| static void add_pniels_to_pt ( | static void add_pniels_to_pt ( | ||||
| decaf_448_point_t p, | decaf_448_point_t p, | ||||
| const pniels_t pn | |||||
| const pniels_t pn, | |||||
| decaf_bool_t before_double | |||||
| ) { | ) { | ||||
| gf L0; | gf L0; | ||||
| gf_mul ( L0, p->z, pn->z ); | gf_mul ( L0, p->z, pn->z ); | ||||
| gf_cpy ( p->z, L0 ); | gf_cpy ( p->z, L0 ); | ||||
| add_niels_to_pt( p, pn->n ); | |||||
| add_niels_to_pt( p, pn->n, before_double ); | |||||
| } | } | ||||
| void decaf_448_point_scalarmul ( | void decaf_448_point_scalarmul ( | ||||
| @@ -768,64 +776,73 @@ void decaf_448_point_scalarmul ( | |||||
| const decaf_448_point_t b, | const decaf_448_point_t b, | ||||
| const decaf_448_scalar_t scalar | const decaf_448_scalar_t scalar | ||||
| ) { | ) { | ||||
| const int WINDOW = 5, | |||||
| const int WINDOW = 5, /* PERF: Make 4 on non hugevector platforms? */ | |||||
| WINDOW_MASK = (1<<WINDOW)-1, | WINDOW_MASK = (1<<WINDOW)-1, | ||||
| WINDOW_U_MASK = (1<<((448%WINDOW)-1))-1, | |||||
| WINDOW_T_MASK = WINDOW_MASK >> 1, | WINDOW_T_MASK = WINDOW_MASK >> 1, | ||||
| NTABLE = 1<<(WINDOW-1); | NTABLE = 1<<(WINDOW-1); | ||||
| decaf_448_scalar_t scalar2, onehalf = {{{0}}}, arrr; | |||||
| /* Adjust the scalar to SABS window. TODO: optimize, subroutinize */ | |||||
| decaf_448_scalar_t scalar2, onehalf = {{{0}}}, two = {{{2}}}, arrr; | |||||
| onehalf->limb[SCALAR_WORDS-1] = 1ull<<(WBITS-1); | onehalf->limb[SCALAR_WORDS-1] = 1ull<<(WBITS-1); | ||||
| /* FIXME PERF MAGIC precompute 2^449-1/2 mod q. Could instead use 2^446-1/2 mod q though. */ | |||||
| decaf_448_montmul(arrr,two,decaf_448_scalar_r2,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR); | |||||
| /* PERF dedicated halve */ | /* PERF dedicated halve */ | ||||
| decaf_448_scalar_sub(scalar2, scalar, decaf_448_scalar_one); | decaf_448_scalar_sub(scalar2, scalar, decaf_448_scalar_one); | ||||
| decaf_448_montmul(scalar2,scalar2,onehalf,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR); | decaf_448_montmul(scalar2,scalar2,onehalf,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR); | ||||
| /* FIXME PERF precompute 2^447-1/2 mod q. Could instead use 2^446-1/2 mod q though. */ | |||||
| decaf_448_montmul(arrr,decaf_448_scalar_one,decaf_448_scalar_r2,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR); | |||||
| decaf_448_montmul(arrr,arrr,onehalf,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR); | |||||
| decaf_448_scalar_add(scalar2, scalar2, arrr); | decaf_448_scalar_add(scalar2, scalar2, arrr); | ||||
| /* Set up a precomputed table with odd multiples of b. */ | |||||
| pniels_t pn, multiples[NTABLE]; | pniels_t pn, multiples[NTABLE]; | ||||
| decaf_448_point_t tmp; | |||||
| decaf_448_point_double(tmp, b); | |||||
| pt_to_pniels(pn, tmp); | |||||
| pt_to_pniels(multiples[0], b); | pt_to_pniels(multiples[0], b); | ||||
| decaf_448_point_double(a, b); | |||||
| pt_to_pniels(pn, a); | |||||
| decaf_448_point_copy(tmp, b); | |||||
| int i,j; | int i,j; | ||||
| for (i=1; i<NTABLE; i++) { | for (i=1; i<NTABLE; i++) { | ||||
| add_pniels_to_pt(a, pn); | |||||
| pt_to_pniels(multiples[i], a); | |||||
| add_pniels_to_pt(tmp, pn, 0); | |||||
| pt_to_pniels(multiples[i], tmp); | |||||
| } | } | ||||
| i = 448 - (448 % WINDOW); | |||||
| int bits = scalar2->limb[i/WBITS] >> (i%WBITS), | |||||
| inv = (bits>>((448 % WINDOW)-1))-1; | |||||
| /* Initialize. */ | |||||
| i = DECAF_448_SCALAR_BITS - ((DECAF_448_SCALAR_BITS-1) % WINDOW) - 1; | |||||
| int bits = scalar2->limb[i/WBITS] >> (i%WBITS) & WINDOW_MASK, | |||||
| inv = (bits>>(WINDOW-1))-1; | |||||
| bits ^= inv; | bits ^= inv; | ||||
| constant_time_lookup(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_U_MASK); | |||||
| constant_time_lookup(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_T_MASK); | |||||
| cond_neg_pniels(pn, inv); | cond_neg_pniels(pn, inv); | ||||
| pniels_to_pt(a, pn); | |||||
| pniels_to_pt(tmp, pn); | |||||
| for (i-=WINDOW; i>=0; i-=WINDOW) { | for (i-=WINDOW; i>=0; i-=WINDOW) { | ||||
| for (j=0; j<WINDOW; j++) { | |||||
| decaf_448_point_double(a, a); | |||||
| } | |||||
| /* Using Hisil et al's lookahead method instead of extensible here | |||||
| * for no particular reason. Double WINDOW times, but only compute t on | |||||
| * the last one. | |||||
| */ | |||||
| for (j=0; j<WINDOW-1; j++) | |||||
| decaf_448_point_double_internal(tmp, tmp, -1); | |||||
| decaf_448_point_double(tmp, tmp); | |||||
| /* Fetch another block of bits */ | |||||
| bits = scalar2->limb[i/WBITS] >> (i%WBITS); | bits = scalar2->limb[i/WBITS] >> (i%WBITS); | ||||
| if (i%WBITS >= WBITS-WINDOW) { | if (i%WBITS >= WBITS-WINDOW) { | ||||
| bits ^= scalar2->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | bits ^= scalar2->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | ||||
| } | } | ||||
| bits &= WINDOW_MASK; | bits &= WINDOW_MASK; | ||||
| inv = (bits>>(WINDOW-1))-1; | inv = (bits>>(WINDOW-1))-1; | ||||
| bits ^= inv; | bits ^= inv; | ||||
| /* Add in from table. Compute t only on last iteration. */ | |||||
| constant_time_lookup(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_T_MASK); | constant_time_lookup(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_T_MASK); | ||||
| cond_neg_pniels(pn, inv); | cond_neg_pniels(pn, inv); | ||||
| add_pniels_to_pt(a, pn); | |||||
| add_pniels_to_pt(tmp, pn, i ? -1 : 0); | |||||
| } | } | ||||
| /* Write out the answer */ | |||||
| decaf_448_point_copy(a,tmp); | |||||
| } | } | ||||
| void decaf_448_point_double_scalarmul ( | void decaf_448_point_double_scalarmul ( | ||||