Browse Source

Optimizing and cleanup; there is still a perf regression in decaf_fast, but it is now smaller, and there isn't as big a space regression, thanks to more careful use of noinline.

master
Mike Hamburg 10 years ago
parent
commit
746b050584
4 changed files with 118 additions and 54 deletions
  1. +1
    -4
      src/decaf.c
  2. +62
    -49
      src/decaf_fast.c
  3. +32
    -1
      src/decaf_gen_tables.c
  4. +23
    -0
      test/bench.c

+ 1
- 4
src/decaf.c View File

@@ -792,10 +792,7 @@ decaf_bool_t decaf_448_direct_scalarmul (
) { ) {
decaf_448_point_t basep; decaf_448_point_t basep;
decaf_bool_t succ = decaf_448_point_decode(basep, base, allow_identity); decaf_bool_t succ = decaf_448_point_decode(basep, base, allow_identity);
/* FIXME: compiler can probably reorder this to something non-consttime even if
* !short_circuit.
*/
if (short_circuit && ~succ) return succ;
if (short_circuit & ~succ) return succ;
decaf_448_point_scalarmul(basep, basep, scalar); decaf_448_point_scalarmul(basep, basep, scalar);
decaf_448_point_encode(scaled, basep); decaf_448_point_encode(scaled, basep);
return succ; return succ;


+ 62
- 49
src/decaf_fast.c View File

@@ -38,6 +38,8 @@ typedef int64_t decaf_sdword_t;
static const int QUADRATIC_NONRESIDUE = -1; static const int QUADRATIC_NONRESIDUE = -1;


#define sv static void #define sv static void
#define snv static void __attribute__((noinline))
#define siv static inline void __attribute__((always_inline))
typedef decaf_word_t gf[DECAF_448_LIMBS] __attribute__((aligned(32))); typedef decaf_word_t gf[DECAF_448_LIMBS] __attribute__((aligned(32)));
static const gf ZERO = {0}, ONE = {1}, TWO = {2}; static const gf ZERO = {0}, ONE = {1}, TWO = {2};


@@ -120,54 +122,54 @@ const size_t alignof_decaf_448_precomputed_s = 32;
#endif #endif


/** Copy x = y */ /** Copy x = y */
sv gf_cpy(gf x, const gf y) { FOR_LIMB(i, x[i] = y[i]); }
siv gf_cpy(gf x, const gf y) { FOR_LIMB(i, x[i] = y[i]); }


/** Mostly-unoptimized multiply (PERF), but at least it's unrolled. */
static inline void gf_mul (gf c, const gf a, const gf b) {
/** Mostly-unoptimized multiply, but at least it's unrolled. */
siv gf_mul (gf c, const gf a, const gf b) {
field_mul((field_t *)c, (const field_t *)a, (const field_t *)b); field_mul((field_t *)c, (const field_t *)a, (const field_t *)b);
} }


/** No dedicated square (PERF) */
static inline void gf_sqr (gf c, const gf a) {
/** Dedicated square */
siv gf_sqr (gf c, const gf a) {
field_sqr((field_t *)c, (const field_t *)a); field_sqr((field_t *)c, (const field_t *)a);
} }


/** Inverse square root using addition chain. */ /** Inverse square root using addition chain. */
sv gf_isqrt(gf y, const gf x) {
siv gf_isqrt(gf y, const gf x) {
field_isr((field_t *)y, (const field_t *)x); field_isr((field_t *)y, (const field_t *)x);
} }


/** Add mod p. Conservatively always weak-reduce. (PERF) */
static inline void gf_add ( gf c, const gf a, const gf b ) {
/** Add mod p. Conservatively always weak-reduce. */
snv gf_add ( gf c, const gf a, const gf b ) {
field_add((field_t *)c, (const field_t *)a, (const field_t *)b); field_add((field_t *)c, (const field_t *)a, (const field_t *)b);
} }


/** Subtract mod p. Conservatively always weak-reduce. (PERF) */
static inline void gf_sub ( gf c, const gf a, const gf b ) {
/** Subtract mod p. Conservatively always weak-reduce. */
snv gf_sub ( gf c, const gf a, const gf b ) {
field_sub((field_t *)c, (const field_t *)a, (const field_t *)b); field_sub((field_t *)c, (const field_t *)a, (const field_t *)b);
} }


/** Add mod p. Conservatively always weak-reduce. (PERF) */
static inline void gf_bias ( gf c, int amt) {
/** Add mod p. Conservatively always weak-reduce. */
siv gf_bias ( gf c, int amt) {
field_bias((field_t *)c, amt); field_bias((field_t *)c, amt);
} }


/** Subtract mod p. Bias by 2 and don't reduce */ /** Subtract mod p. Bias by 2 and don't reduce */
static inline void gf_sub_nr ( gf c, const gf a, const gf b ) {
siv gf_sub_nr ( gf c, const gf a, const gf b ) {
ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO
field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b); field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
gf_bias(c, 2); gf_bias(c, 2);
} }


/** Subtract mod p. Bias by amt but don't reduce. */ /** Subtract mod p. Bias by amt but don't reduce. */
static inline void gf_sub_nr_x ( gf c, const gf a, const gf b, int amt ) {
siv gf_sub_nr_x ( gf c, const gf a, const gf b, int amt ) {
ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO
field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b); field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
gf_bias(c, amt); gf_bias(c, amt);
} }


/** Add mod p. Don't reduce. */ /** Add mod p. Don't reduce. */
static inline void gf_add_nr ( gf c, const gf a, const gf b ) {
siv gf_add_nr ( gf c, const gf a, const gf b ) {
ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO
field_add_nr((field_t *)c, (const field_t *)a, (const field_t *)b); field_add_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
} }
@@ -186,10 +188,13 @@ sv cond_neg(gf x, decaf_bool_t neg) {
} }


/** Constant time, if (swap) (x,y) = (y,x); */ /** Constant time, if (swap) (x,y) = (y,x); */
static inline void cond_swap(gf x, gf y, decaf_bool_t swap) {
siv cond_swap(gf x, decaf_word_t *__restrict__ y, decaf_bool_t swap) {
int i; int i;
/* PERF */
//_Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(4) interleave_count(2)")
#ifdef __clang__
#if 10*__clang_major__ + __clang_minor__ > 35
_Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(4) interleave_count(2)")
#endif
#endif
for (i=0; i<DECAF_448_LIMBS; i++) { for (i=0; i<DECAF_448_LIMBS; i++) {
decaf_word_t s = (x[i] ^ y[i]) & swap; decaf_word_t s = (x[i] ^ y[i]) & swap;
x[i] ^= s; x[i] ^= s;
@@ -201,7 +206,7 @@ static inline void cond_swap(gf x, gf y, decaf_bool_t swap) {
* Mul by signed int. Not constant-time WRT the sign of that int. * Mul by signed int. Not constant-time WRT the sign of that int.
* Just uses a full mul (PERF) * Just uses a full mul (PERF)
*/ */
static inline void gf_mlw(gf c, const gf a, int w) {
siv gf_mlw(gf c, const gf a, int w) {
if (w>0) { if (w>0) {
field_mulw((field_t *)c, (const field_t *)a, w); field_mulw((field_t *)c, (const field_t *)a, w);
} else { } else {
@@ -211,7 +216,7 @@ static inline void gf_mlw(gf c, const gf a, int w) {
} }


/** Canonicalize */ /** Canonicalize */
static inline void gf_canon ( gf a ) {
siv gf_canon ( gf a ) {
field_strong_reduce((field_t *)a); field_strong_reduce((field_t *)a);
} }


@@ -258,7 +263,7 @@ sv decaf_448_cond_sel (
/** {extra,accum} - sub +? p /** {extra,accum} - sub +? p
* Must have extra <= 1 * Must have extra <= 1
*/ */
sv decaf_448_subx(
snv decaf_448_subx(
decaf_448_scalar_t out, decaf_448_scalar_t out,
const decaf_word_t accum[DECAF_448_SCALAR_LIMBS], const decaf_word_t accum[DECAF_448_SCALAR_LIMBS],
const decaf_448_scalar_t sub, const decaf_448_scalar_t sub,
@@ -282,7 +287,7 @@ sv decaf_448_subx(
} }
} }


sv decaf_448_montmul (
snv decaf_448_montmul (
decaf_448_scalar_t out, decaf_448_scalar_t out,
const decaf_448_scalar_t a, const decaf_448_scalar_t a,
const decaf_448_scalar_t b, const decaf_448_scalar_t b,
@@ -354,6 +359,25 @@ void decaf_448_scalar_add (
decaf_448_subx(out, out->limb, decaf_448_scalar_p, decaf_448_scalar_p, chain); decaf_448_subx(out, out->limb, decaf_448_scalar_p, decaf_448_scalar_p, chain);
} }


/**
 * Halve a multi-limb scalar: out = (a + (a odd ? p : 0)) >> 1.
 *
 * When the low bit of a is set, p is added first so that the sum can be
 * shifted right by one bit; presumably p is an odd modulus, which makes the
 * sum even and the result equal to a/2 mod p (confirm against callers).
 * The parity is applied through a mask rather than a branch.
 *
 * @param out Receives the halved value; each limb of a is read before the
 *            same limb of out is written, so out may be the same object as a
 *            (the visible callers pass the same scalar for both).
 * @param a   Scalar to halve.
 * @param p   Value conditionally added when a is odd.
 */
snv decaf_448_halve (
decaf_448_scalar_t out,
const decaf_448_scalar_t a,
const decaf_448_scalar_t p
) {
/* All-ones when the low bit of a is set, zero otherwise. */
decaf_word_t mask = -(a->limb[0] & 1);
decaf_dword_t chain = 0;
unsigned int i;
/* out = a + (p & mask), carrying between limbs through the double-width chain. */
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) {
chain = (chain + a->limb[i]) + (p->limb[i] & mask);
out->limb[i] = chain;
chain >>= WBITS;
}
/* Shift the whole number right by one bit: each limb takes its new top bit
 * from the low bit of the next-higher limb. */
for (i=0; i<DECAF_448_SCALAR_LIMBS-1; i++) {
out->limb[i] = out->limb[i]>>1 | out->limb[i+1]<<(WBITS-1);
}
/* The top limb takes its new top bit from the carry left over from the addition. */
out->limb[i] = out->limb[i]>>1 | chain<<(WBITS-1);
}

void decaf_448_scalar_copy ( void decaf_448_scalar_copy (
decaf_448_scalar_t out, decaf_448_scalar_t out,
const decaf_448_scalar_t a const decaf_448_scalar_t a
@@ -551,7 +575,7 @@ void decaf_448_point_add (
gf_mul ( p->t, b, c ); gf_mul ( p->t, b, c );
} }


static void decaf_448_point_double_internal (
snv decaf_448_point_double_internal (
decaf_448_point_t p, decaf_448_point_t p,
const decaf_448_point_t q, const decaf_448_point_t q,
decaf_bool_t before_double decaf_bool_t before_double
@@ -682,7 +706,7 @@ void decaf_448_scalar_encode(
} }


/* Operations on [p]niels */ /* Operations on [p]niels */
static void cond_neg_niels (
siv cond_neg_niels (
niels_t n, niels_t n,
decaf_bool_t neg decaf_bool_t neg
) { ) {
@@ -713,7 +737,7 @@ static void pniels_to_pt (
gf_sqr ( e->z, d->z ); gf_sqr ( e->z, d->z );
} }


static void niels_to_pt (
snv niels_to_pt (
decaf_448_point_t e, decaf_448_point_t e,
const niels_t n const niels_t n
) { ) {
@@ -723,7 +747,7 @@ static void niels_to_pt (
gf_cpy ( e->z, ONE ); gf_cpy ( e->z, ONE );
} }


static void add_niels_to_pt (
snv add_niels_to_pt (
decaf_448_point_t d, decaf_448_point_t d,
const niels_t e, const niels_t e,
decaf_bool_t before_double decaf_bool_t before_double
@@ -744,7 +768,7 @@ static void add_niels_to_pt (
if (!before_double) gf_mul ( d->t, b, c ); if (!before_double) gf_mul ( d->t, b, c );
} }


static void add_pniels_to_pt (
sv add_pniels_to_pt (
decaf_448_point_t p, decaf_448_point_t p,
const pniels_t pn, const pniels_t pn,
decaf_bool_t before_double decaf_bool_t before_double
@@ -755,6 +779,8 @@ static void add_pniels_to_pt (
add_niels_to_pt( p, pn->n, before_double ); add_niels_to_pt( p, pn->n, before_double );
} }


extern const decaf_448_scalar_t decaf_448_point_scalarmul_adjustment;

void decaf_448_point_scalarmul ( void decaf_448_point_scalarmul (
decaf_448_point_t a, decaf_448_point_t a,
const decaf_448_point_t b, const decaf_448_point_t b,
@@ -764,18 +790,10 @@ void decaf_448_point_scalarmul (
WINDOW_MASK = (1<<WINDOW)-1, WINDOW_MASK = (1<<WINDOW)-1,
WINDOW_T_MASK = WINDOW_MASK >> 1, WINDOW_T_MASK = WINDOW_MASK >> 1,
NTABLE = 1<<(WINDOW-1); NTABLE = 1<<(WINDOW-1);
/* Adjust the scalar to SABS window. TODO: optimize, subroutinize */
decaf_448_scalar_t scalar2, onehalf = {{{0}}}, two = {{{2}}}, arrr;
onehalf->limb[SCALAR_WORDS-1] = 1ull<<(WBITS-1);
/* FIXME PERF MAGIC precompute 2^449-1/2 mod q. Could instead use 2^446-1/2 mod q though. */
decaf_448_montmul(arrr,two,decaf_448_scalar_r2,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR);

/* PERF dedicated halve */
decaf_448_scalar_sub(scalar2, scalar, decaf_448_scalar_one);
decaf_448_montmul(scalar2,scalar2,onehalf,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR);
decaf_448_scalar_add(scalar2, scalar2, arrr);
decaf_448_scalar_t scalar2;
decaf_448_scalar_add(scalar2, scalar, decaf_448_point_scalarmul_adjustment);
decaf_448_halve(scalar2,scalar2,decaf_448_scalar_p);
/* Set up a precomputed table with odd multiples of b. */ /* Set up a precomputed table with odd multiples of b. */
pniels_t pn, multiples[NTABLE]; pniels_t pn, multiples[NTABLE];
@@ -1054,6 +1072,8 @@ decaf_448_precompute (
} }
} }


extern const decaf_448_scalar_t decaf_448_precomputed_scalarmul_adjustment;

void decaf_448_precomputed_scalarmul ( void decaf_448_precomputed_scalarmul (
decaf_448_point_t out, decaf_448_point_t out,
const decaf_448_precomputed_s *table, const decaf_448_precomputed_s *table,
@@ -1062,16 +1082,9 @@ void decaf_448_precomputed_scalarmul (
unsigned int i,j,k; unsigned int i,j,k;
const unsigned int n = 5, t = 5, s = 18; // TODO MAGIC const unsigned int n = 5, t = 5, s = 18; // TODO MAGIC
decaf_448_scalar_t scalar2, onehalf = {{{0}}}, two = {{{2}}}, arrr;
onehalf->limb[SCALAR_WORDS-1] = 1ull<<(WBITS-1);

/* FIXME PERF MAGIC precompute 2^449-1/2 mod q. Could instead use 2^446-1/2 mod q though. */
decaf_448_montmul(arrr,two,decaf_448_scalar_r2,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR);

/* PERF dedicated halve */
decaf_448_scalar_sub(scalar2, scalar, decaf_448_scalar_one);
decaf_448_montmul(scalar2,scalar2,onehalf,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR);
decaf_448_scalar_add(scalar2, scalar2, arrr);
decaf_448_scalar_t scalar2;
decaf_448_scalar_add(scalar2, scalar, decaf_448_precomputed_scalarmul_adjustment);
decaf_448_halve(scalar2,scalar2,decaf_448_scalar_p);
niels_t ni; niels_t ni;


+ 32
- 1
src/decaf_gen_tables.c View File

@@ -13,7 +13,20 @@
#include <stdlib.h> #include <stdlib.h>
#include "decaf.h" #include "decaf.h"


const decaf_word_t decaf_448_precomputed_base_as_words[1]; /* To satisfy linker. */
/* To satisfy linker. */
const decaf_word_t decaf_448_precomputed_base_as_words[1];
const decaf_448_scalar_t decaf_448_precomputed_scalarmul_adjustment;
const decaf_448_scalar_t decaf_448_point_scalarmul_adjustment;

/**
 * Write a C source definition of a scalar constant to stdout.
 *
 * The output has the form
 *   const decaf_448_scalar_t NAME = {{{
 *   0x...ull, 0x...ull, ...}}};
 * followed by a blank line. Limbs are printed in index order as zero-padded
 * hexadecimal, two hex digits per byte of decaf_word_t.
 *
 * @param name Identifier to give the emitted constant.
 * @param sc   Scalar whose limbs are printed.
 */
void scalar_print(const char *name, const decaf_448_scalar_t sc) {
    const unsigned nwords = sizeof(decaf_448_scalar_t)/sizeof(decaf_word_t);
    const int hex_digits = (int)sizeof(decaf_word_t)*2;
    unsigned w = 0;

    printf("const decaf_448_scalar_t %s = {{{\n", name);
    while (w < nwords) {
        /* Separator goes before every limb except the first. */
        const char *sep = w ? ", " : "";
        printf("%s0x%0*llxull", sep, hex_digits, (unsigned long long)sc->limb[w]);
        w++;
    }
    printf("}}};\n\n");
}


int main(int argc, char **argv) { int main(int argc, char **argv) {
(void)argc; (void)argv; (void)argc; (void)argv;
@@ -40,5 +53,23 @@ int main(int argc, char **argv) {
} }
printf("\n};\n"); printf("\n};\n");
decaf_448_scalar_t smadj;
decaf_448_scalar_copy(smadj,decaf_448_scalar_one);

const unsigned int n = 5, t = 5, s = 18; // TODO MAGIC
for (i=0; i<n*t*s; i++) {
decaf_448_scalar_add(smadj,smadj,smadj);
}
decaf_448_scalar_sub(smadj, smadj, decaf_448_scalar_one);
scalar_print("decaf_448_precomputed_scalarmul_adjustment", smadj);
const unsigned int WINDOW=5; // TODO magic
decaf_448_scalar_copy(smadj,decaf_448_scalar_one);
for (i=0; i<DECAF_448_SCALAR_BITS-1 + WINDOW - ((DECAF_448_SCALAR_BITS-1)%WINDOW); i++) {
decaf_448_scalar_add(smadj,smadj,smadj);
}
decaf_448_scalar_sub(smadj, smadj, decaf_448_scalar_one);
scalar_print("decaf_448_point_scalarmul_adjustment", smadj);
return 0; return 0;
} }

+ 23
- 0
test/bench.c View File

@@ -388,6 +388,13 @@ int main(int argc, char **argv) {
} }
when = now() - when; when = now() - when;
printf("decaf slo2: %5.1fµs\n", when * 1e6 / i); printf("decaf slo2: %5.1fµs\n", when * 1e6 / i);
when = now();
for (i=0; i<nbase/10; i++) {
decaf_448_precomputed_scalarmul(Da,decaf_448_precomputed_base,bsc);
}
when = now() - when;
printf("decaf pres: %5.1fµs\n", when * 1e6 / i);


when = now(); when = now();
for (i=0; i<nbase/10; i++) { for (i=0; i<nbase/10; i++) {
@@ -731,6 +738,10 @@ int main(int argc, char **argv) {
when = now() - when; when = now() - when;
printf("sign: %5.1fµs\n", when * 1e6 / i); printf("sign: %5.1fµs\n", when * 1e6 / i);
if (memcmp(dshared[0], dshared[1], 32)) {
printf("BUG: mismatched shared secrets\n");
}
when = now(); when = now();
for (i=0; i<nbase/10; i++) { for (i=0; i<nbase/10; i++) {
decaf_bool_t ret = decaf_448_verify(dsig, dpub[0], decaf_bool_t ret = decaf_448_verify(dsig, dpub[0],
@@ -746,5 +757,17 @@ int main(int argc, char **argv) {
when = now() - when; when = now() - when;
printf("verify: %5.1fµs\n", when * 1e6 / i); printf("verify: %5.1fµs\n", when * 1e6 / i);
decaf_448_precomputed_s *dpre;
ignore_result(posix_memalign((void**)&dpre,
alignof_decaf_448_precomputed_s, sizeof_decaf_448_precomputed_s));
assert(dpre);
when = now();
for (i=0; i<nbase/10; i++) {
decaf_448_precompute(dpre, Da);
}
when = now() - when;
printf("pre: %5.1fµs\n", when * 1e6 / i);
free(dpre);
return 0; return 0;
} }

Loading…
Cancel
Save