diff --git a/src/decaf_fast.c b/src/decaf_fast.c index f2e8ed4..9fb2c4c 100644 --- a/src/decaf_fast.c +++ b/src/decaf_fast.c @@ -817,14 +817,123 @@ decaf_bool_t decaf_448_direct_scalarmul ( decaf_bool_t allow_identity, decaf_bool_t short_circuit ) { - decaf_448_point_t basep; - decaf_bool_t succ = decaf_448_point_decode(basep, base, allow_identity); - /* FIXME: compiler can probably reorder this to something non-consttime even if - * !short_circuit. - */ - if (short_circuit && ~succ) return succ; - decaf_448_point_scalarmul(basep, basep, scalar); - decaf_448_point_encode(scaled, basep); + gf s0, xa, za, xd, zd, xs, zs; + decaf_bool_t succ = gf_deser ( s0, base ); + succ &= allow_identity |~ gf_eq(s0, ZERO); + (void) short_circuit; + gf_sqr ( xa, s0 ); + gf_cpy ( za, ONE ); + gf_cpy ( xd, ONE ); + gf_cpy ( zd, ZERO ); + + int j; + decaf_bool_t pflip = 0; + for (j=DECAF_448_SCALAR_BITS-1; j>=0; j--) { + decaf_bool_t flip = -((scalar->limb[j/WORD_BITS]>>(j%WORD_BITS))&1);; + cond_swap(xa,xd,flip^pflip); + cond_swap(za,zd,flip^pflip); + gf_add_nr ( xs, xa, za ); + gf_sub_nr ( zs, xa, za ); + gf_add_nr ( xa, xd, zd ); + gf_sub_nr ( za, xd, zd ); + gf_mul ( xd, xa, zs ); + gf_mul ( zd, xs, za ); + gf_add_nr ( xs, xd, zd ); + gf_sub_nr ( zd, xd, zd ); + gf_mul ( zs, zd, s0 ); + gf_sqr ( zd, xa ); + gf_sqr ( xa, za ); + gf_sub_nr ( za, zd, xa ); + gf_mul ( xd, xa, zd ); + gf_mlw ( zd, za, 1-EDWARDS_D ); + gf_add_nr ( xa, xa, zd ); + gf_mul ( zd, xa, za ); + gf_sqr ( xa, xs ); + gf_sqr ( za, zs ); + pflip = flip; + } + cond_swap(xa,xd,pflip); + cond_swap(za,zd,pflip); + + /* OK, time to reserialize! */ + gf xz_d, xz_a, x0, den, L0, L1, L2, L3, out; /* TODO: simplify */ + mask_t zcase, output_zero, sflip, za_zero; + gf_mul(xz_d, xd, zd); + gf_mul(xz_a, xa, za); + output_zero = gf_eq(xz_d, ZERO); + za_zero = gf_eq(za, ZERO); + cond_sel(xz_d, xz_d, ONE, output_zero); /* make xz_d always nonzero */ + zcase = output_zero | gf_eq(xz_a, ZERO); + + gf_sqr(x0, s0); + + /* Curve test in zcase */ + gf_cpy(L0,x0); + gf_add(L0,L0,ONE); + gf_sqr(L1,L0); + gf_mlw(L0,x0,-4*EDWARDS_D); + gf_add(L1,L1,L0); + cond_sel(xz_a,xz_a,L1,zcase); + + /* Compute denominator */ + gf_mul(L0, x0, xz_d); + gf_mlw(L2, L0, 4); + gf_mul(L1, L2, xz_a); + gf_isqrt(den, L1); + + /* Check squareness */ + gf_sqr(L2, den); + gf_mul(L0, L1, L2); + gf_add(L0, L0, ONE); + succ &= ~hibit(s0) & ~gf_eq(L0, ZERO); + + /* Compute y/x */ + gf_mul(L1, x0, xd); + gf_sub(L1, zd, L1); + gf_mul(L0, za, L1); /* L0 = "opq" */ + gf_mul(L1, x0, zd); + gf_sub(L1, L1, xd); + gf_mul(L2, xa, L1); /* L2 = "pqr" */ + + gf_sub(L1, L0, L2); + gf_add(L0, L0, L2); + gf_mul(L2, L1, den); /* L2 = y0 / x0 */ + gf_mul(L1, L0, den); /* L1 = yO / xO */ + sflip = hibit(L1) ^ hibit(L2) ^ za_zero; + cond_sel(L0, xd, zd, sflip); /* L0 = "times" */ + /* OK, done with y-coordinates */ + + /* OK, now correct for swappage */ + gf_add(den,den,den); + gf_mul(L1,den,s0); + gf_sqr(L2,L1); + gf_mul(L3,L2,xz_a); + cond_sel(den,L1,L3,pflip|zcase); + + /* compute the output */ + gf_mul(L1,L0,den); + + cond_sel(L2,zs,s0,zcase); /* zs, but s0 in zcase */ + gf_mul(L0,L1,L2); + + cond_sel(L3,xd,zd,za_zero); + cond_sel(L2,xs,L3,zcase); /* xs, but zq or qq in zcase */ + gf_mul(out,L0,L2); + + cond_sel(out,out,ZERO,output_zero); + cond_neg(out,hibit(out)); + + /* TODO: resubroutineize? */ + gf_canon(out); + int i, k=0, bits=0; + decaf_dword_t buf=0; + for (i=0; i=8 || i==DECAF_448_LIMBS-1) && k>=8) { + scaled[k++]=buf; + } + } + return succ; } diff --git a/test/bench.c b/test/bench.c index bb4d21b..d9edeae 100644 --- a/test/bench.c +++ b/test/bench.c @@ -373,6 +373,15 @@ int main(int argc, char **argv) { when = now() - when; printf("decaf slow: %5.1fµs\n", when * 1e6 / i); + uint8_t enc[DECAF_448_SER_BYTES]; + memset(enc,4,sizeof(enc)); + when = now(); + for (i=0; i