Browse Source

Faster scalarmul is ported from Goldilocks, modulo a bit of magic. Of course, it's the one that doesn't matter as much because we have the monty ladder. Next up, port wNAF and recomputation?

master
Mike Hamburg 10 years ago
parent
commit
7c8a2a72c5
1 changed files with 50 additions and 33 deletions
  1. +50
    -33
      src/decaf_fast.c

+ 50
- 33
src/decaf_fast.c View File

@@ -548,7 +548,11 @@ void decaf_448_point_add (
gf_mul ( p->t, b, c );
}

void decaf_448_point_double(decaf_448_point_t p, const decaf_448_point_t q) {
static void decaf_448_point_double_internal (
decaf_448_point_t p,
const decaf_448_point_t q,
decaf_bool_t before_double
) {
gf a, b, c, d;
gf_sqr ( c, q->x );
gf_sqr ( a, q->y );
@@ -563,8 +567,11 @@ void decaf_448_point_double(decaf_448_point_t p, const decaf_448_point_t q) {
gf_mul ( p->x, a, b );
gf_mul ( p->z, p->t, a );
gf_mul ( p->y, p->t, d );
/* TODO: conditional? */
gf_mul ( p->t, b, d );
if (!before_double) gf_mul ( p->t, b, d );
}

void decaf_448_point_double(decaf_448_point_t p, const decaf_448_point_t q) {
decaf_448_point_double_internal(p,q,0);
}

void decaf_448_point_copy (
@@ -734,7 +741,8 @@ static void pniels_to_pt (

static void add_niels_to_pt (
decaf_448_point_t d,
const niels_t e
const niels_t e,
decaf_bool_t before_double
) {
gf a, b, c;
gf_sub_nr ( b, d->y, d->x );
@@ -749,18 +757,18 @@ static void add_niels_to_pt (
gf_mul ( d->z, a, d->y );
gf_mul ( d->x, d->y, b );
gf_mul ( d->y, a, c );
/* TODO: if... */
gf_mul ( d->t, b, c );
if (!before_double) gf_mul ( d->t, b, c );
}

static void add_pniels_to_pt (
decaf_448_point_t p,
const pniels_t pn
const pniels_t pn,
decaf_bool_t before_double
) {
gf L0;
gf_mul ( L0, p->z, pn->z );
gf_cpy ( p->z, L0 );
add_niels_to_pt( p, pn->n );
add_niels_to_pt( p, pn->n, before_double );
}

void decaf_448_point_scalarmul (
@@ -768,64 +776,73 @@ void decaf_448_point_scalarmul (
const decaf_448_point_t b,
const decaf_448_scalar_t scalar
) {
const int WINDOW = 5,
const int WINDOW = 5, /* PERF: Make 4 on non hugevector platforms? */
WINDOW_MASK = (1<<WINDOW)-1,
WINDOW_U_MASK = (1<<((448%WINDOW)-1))-1,
WINDOW_T_MASK = WINDOW_MASK >> 1,
NTABLE = 1<<(WINDOW-1);
decaf_448_scalar_t scalar2, onehalf = {{{0}}}, arrr;
/* Adjust the scalar to SABS window. TODO: optimize, subroutinize */
decaf_448_scalar_t scalar2, onehalf = {{{0}}}, two = {{{2}}}, arrr;
onehalf->limb[SCALAR_WORDS-1] = 1ull<<(WBITS-1);
/* FIXME PERF MAGIC precompute 2^449-1/2 mod q. Could instead use 2^446-1/2 mod q though. */
decaf_448_montmul(arrr,two,decaf_448_scalar_r2,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR);

/* PERF dedicated halve */
decaf_448_scalar_sub(scalar2, scalar, decaf_448_scalar_one);
decaf_448_montmul(scalar2,scalar2,onehalf,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR);

/* FIXME PERF precompute 2^447-1/2 mod q. Could instead use 2^446-1/2 mod q though. */
decaf_448_montmul(arrr,decaf_448_scalar_one,decaf_448_scalar_r2,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR);
decaf_448_montmul(arrr,arrr,onehalf,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR);

decaf_448_scalar_add(scalar2, scalar2, arrr);

/* Set up a precomputed table with odd multiples of b. */
pniels_t pn, multiples[NTABLE];
decaf_448_point_t tmp;
decaf_448_point_double(tmp, b);
pt_to_pniels(pn, tmp);
pt_to_pniels(multiples[0], b);
decaf_448_point_double(a, b);
pt_to_pniels(pn, a);
decaf_448_point_copy(tmp, b);

int i,j;
for (i=1; i<NTABLE; i++) {
add_pniels_to_pt(a, pn);
pt_to_pniels(multiples[i], a);
add_pniels_to_pt(tmp, pn, 0);
pt_to_pniels(multiples[i], tmp);
}

i = 448 - (448 % WINDOW);
int bits = scalar2->limb[i/WBITS] >> (i%WBITS),
inv = (bits>>((448 % WINDOW)-1))-1;
/* Initialize. */
i = DECAF_448_SCALAR_BITS - ((DECAF_448_SCALAR_BITS-1) % WINDOW) - 1;
int bits = scalar2->limb[i/WBITS] >> (i%WBITS) & WINDOW_MASK,
inv = (bits>>(WINDOW-1))-1;
bits ^= inv;
constant_time_lookup(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_U_MASK);
constant_time_lookup(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_T_MASK);
cond_neg_pniels(pn, inv);
pniels_to_pt(a, pn);
pniels_to_pt(tmp, pn);

for (i-=WINDOW; i>=0; i-=WINDOW) {
for (j=0; j<WINDOW; j++) {
decaf_448_point_double(a, a);
}

/* Using Hisil et al's lookahead method instead of extensible here
* for no particular reason. Double WINDOW times, but only compute t on
* the last one.
*/
for (j=0; j<WINDOW-1; j++)
decaf_448_point_double_internal(tmp, tmp, -1);
decaf_448_point_double(tmp, tmp);

/* Fetch another block of bits */
bits = scalar2->limb[i/WBITS] >> (i%WBITS);
if (i%WBITS >= WBITS-WINDOW) {
bits ^= scalar2->limb[i/WBITS+1] << (WBITS - (i%WBITS));
}
bits &= WINDOW_MASK;
inv = (bits>>(WINDOW-1))-1;
bits ^= inv;
/* Add in from table. Compute t only on last iteration. */
constant_time_lookup(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_T_MASK);
cond_neg_pniels(pn, inv);
add_pniels_to_pt(a, pn);
add_pniels_to_pt(tmp, pn, i ? -1 : 0);
}
/* Write out the answer */
decaf_448_point_copy(a,tmp);
}

void decaf_448_point_double_scalarmul (


Loading…
Cancel
Save