From b2dc216b409280ff3569a87aa8040c5f1d26f027 Mon Sep 17 00:00:00 2001 From: Mike Hamburg Date: Tue, 24 Mar 2015 16:25:42 -0700 Subject: [PATCH] finish porting precomputed verify to decaf_fast. Remove tables as dependency of decaf slow --- Makefile | 6 +- src/decaf.c | 62 ++++++++------ src/decaf_fast.c | 180 ++++++++++++++++++++++++----------------- src/decaf_gen_tables.c | 28 ++++++- 4 files changed, 175 insertions(+), 101 deletions(-) diff --git a/Makefile b/Makefile index 107f4d9..f6b8379 100644 --- a/Makefile +++ b/Makefile @@ -70,8 +70,12 @@ LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \ build/$(FIELD).o build/ec_point.o build/scalarmul.o build/sha512.o build/magic.o \ build/f_arithmetic.o build/arithmetic.o -DECAFCOMPONENTS= build/$(DECAF).o build/shake.o build/decaf_crypto.o build/decaf_tables.o \ + +DECAFCOMPONENTS= build/$(DECAF).o build/shake.o build/decaf_crypto.o \ build/$(FIELD).o build/f_arithmetic.o # TODO +ifeq ($(DECAF),decaf_fast) +DECAFCOMPONENTS += build/decaf_tables.o +endif TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \ build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o build/magic.o \ diff --git a/src/decaf.c b/src/decaf.c index 3209801..7de3f56 100644 --- a/src/decaf.c +++ b/src/decaf.c @@ -89,35 +89,47 @@ const decaf_448_point_t decaf_448_point_base = {{ struct decaf_448_precomputed_s { decaf_448_point_t p[1]; }; /* FIXME: restore */ -// const struct decaf_448_precomputed_s *decaf_448_precomputed_base = -// (const struct decaf_448_precomputed_s *)decaf_448_point_base; - -extern const decaf_word_t decaf_448_precomputed_base_as_words[]; -const decaf_448_precomputed_s *decaf_448_precomputed_base = - (const decaf_448_precomputed_s *) &decaf_448_precomputed_base_as_words; +const struct decaf_448_precomputed_s *decaf_448_precomputed_base = + (const struct decaf_448_precomputed_s *)decaf_448_point_base; const size_t sizeof_decaf_448_precomputed_s = sizeof(struct decaf_448_precomputed_s); const size_t alignof_decaf_448_precomputed_s = 32; +#ifdef __clang__ +#if 100*__clang_major__ + __clang_minor__ > 305 +#define VECTORIZE _Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(8)") +#endif +#endif + +#ifndef VECTORIZE +#define VECTORIZE +#endif + #if (defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__)) || defined(DECAF_FORCE_UNROLL) #if DECAF_448_LIMBS==8 - #define FOR_LIMB(i,op) { unsigned int i=0; \ + #define FOR_LIMB_U(i,op) { unsigned int i=0; \ op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; \ } #elif DECAF_448_LIMBS==16 - #define FOR_LIMB(i,op) { unsigned int i=0; \ + #define FOR_LIMB_U(i,op) { unsigned int i=0; \ op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; \ op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; \ } #else - #define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; ilimb[i] = y->limb[i]); } +siv gf_cpy(gf x, const gf y) { FOR_LIMB_U(i, x->limb[i] = y->limb[i]); } /** Mostly-unoptimized multiply (PERF), but at least it's unrolled. */ snv gf_mul (gf c, const gf a, const gf b) { @@ -125,19 +137,19 @@ snv gf_mul (gf c, const gf a, const gf b) { gf_cpy(aa,a); decaf_dword_t accum[DECAF_448_LIMBS] = {0}; - FOR_LIMB(i, { - FOR_LIMB(j,{ accum[(i+j)%DECAF_448_LIMBS] += (decaf_dword_t)b->limb[i] * aa->limb[j]; }); + FOR_LIMB_U(i, { + FOR_LIMB_U(j,{ accum[(i+j)%DECAF_448_LIMBS] += (decaf_dword_t)b->limb[i] * aa->limb[j]; }); aa->limb[(DECAF_448_LIMBS-1-i)^(DECAF_448_LIMBS/2)] += aa->limb[DECAF_448_LIMBS-1-i]; }); accum[DECAF_448_LIMBS-1] += accum[DECAF_448_LIMBS-2] >> LBITS; accum[DECAF_448_LIMBS-2] &= LMASK; accum[DECAF_448_LIMBS/2] += accum[DECAF_448_LIMBS-1] >> LBITS; - FOR_LIMB(j,{ + FOR_LIMB_U(j,{ accum[j] += accum[(j-1)%DECAF_448_LIMBS] >> LBITS; accum[(j-1)%DECAF_448_LIMBS] &= LMASK; }); - FOR_LIMB(j, c->limb[j] = accum[j] ); + FOR_LIMB_U(j, c->limb[j] = accum[j] ); } /** No dedicated square (PERF) */ @@ -166,7 +178,7 @@ snv gf_isqrt(gf y, const gf x) { /** Weak reduce mod p. */ siv gf_reduce(gf x) { x->limb[DECAF_448_LIMBS/2] += x->limb[DECAF_448_LIMBS-1] >> LBITS; - FOR_LIMB(j,{ + FOR_LIMB_U(j,{ x->limb[j] += x->limb[(j-1)%DECAF_448_LIMBS] >> LBITS; x->limb[(j-1)%DECAF_448_LIMBS] &= LMASK; }); @@ -174,19 +186,19 @@ siv gf_reduce(gf x) { /** Add mod p. Conservatively always weak-reduce. (PERF) */ sv gf_add ( gf x, const gf y, const gf z ) { - FOR_LIMB(i, x->limb[i] = y->limb[i] + z->limb[i] ); + FOR_LIMB_U(i, x->limb[i] = y->limb[i] + z->limb[i] ); gf_reduce(x); } /** Subtract mod p. Conservatively always weak-reduce. (PERF) */ sv gf_sub ( gf x, const gf y, const gf z ) { - FOR_LIMB(i, x->limb[i] = y->limb[i] - z->limb[i] + 2*P->limb[i] ); + FOR_LIMB_U(i, x->limb[i] = y->limb[i] - z->limb[i] + 2*P->limb[i] ); gf_reduce(x); } /** Constant time, x = is_z ? z : y */ sv cond_sel(gf x, const gf y, const gf z, decaf_bool_t is_z) { - FOR_LIMB(i, x->limb[i] = (y->limb[i] & ~is_z) | (z->limb[i] & is_z) ); + FOR_LIMB_U(i, x->limb[i] = (y->limb[i] & ~is_z) | (z->limb[i] & is_z) ); } /** Constant time, if (neg) x=-x; */ @@ -198,7 +210,7 @@ siv cond_neg(gf x, decaf_bool_t neg) { /** Constant time, if (swap) (x,y) = (y,x); */ sv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) { - FOR_LIMB(i, { + FOR_LIMB_U(i, { decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap; x->limb[i] ^= s; y->limb[i] ^= s; @@ -388,9 +400,9 @@ decaf_bool_t decaf_448_scalar_eq ( const decaf_448_scalar_t a, const decaf_448_scalar_t b ) { + int i; decaf_word_t diff = 0; - unsigned int i; - for (i=0; ilimb[i] ^ b->limb[i]; } return (((decaf_dword_t)diff)-1)>>WBITS; @@ -424,14 +436,14 @@ void decaf_448_point_encode( unsigned char ser[DECAF_448_SER_BYTES], const decaf cond_neg ( a, hibit(a) ); gf_canon(a); - int i, k=0, bits=0; + int k=0, bits=0; decaf_dword_t buf=0; - for (i=0; ilimb[i]<=8 || i==DECAF_448_LIMBS-1) && k>=8) { ser[k++]=buf; } - } + }); } /** diff --git a/src/decaf_fast.c b/src/decaf_fast.c index e084216..00fa6de 100644 --- a/src/decaf_fast.c +++ b/src/decaf_fast.c @@ -1073,6 +1073,31 @@ static void gf_batch_invert ( } } +static void batch_normalize_niels ( + niels_t *table, + gf *zs, + gf *zis, + int n +) { + int i; + gf product; + gf_batch_invert(zis, zs, n); + + for (i=0; ia, zis[i]); + gf_canon(product); + gf_cpy(table[i]->a, product); + + gf_mul(product, table[i]->b, zis[i]); + gf_canon(product); + gf_cpy(table[i]->b, product); + + gf_mul(product, table[i]->c, zis[i]); + gf_canon(product); + gf_cpy(table[i]->c, product); + } +} + void decaf_448_precompute ( decaf_448_precomputed_s *table, @@ -1129,22 +1154,7 @@ decaf_448_precompute ( } } - gf_batch_invert(zis, zs, n<<(t-1)); - - gf product; - for (i=0; itable[i]->a, zis[i]); - gf_canon(product); - gf_cpy(table->table[i]->a, product); - - gf_mul(product, table->table[i]->b, zis[i]); - gf_canon(product); - gf_cpy(table->table[i]->b, product); - - gf_mul(product, table->table[i]->c, zis[i]); - gf_canon(product); - gf_cpy(table->table[i]->c, product); - } + batch_normalize_niels(table->table,zs,zis,n<<(t-1)); } extern const decaf_448_scalar_t decaf_448_precomputed_scalarmul_adjustment; @@ -1396,93 +1406,115 @@ static int recode_wnaf ( sv prepare_wnaf_table( pniels_t *output, - decaf_448_point_t working, + const decaf_448_point_t working, unsigned int tbits ) { + decaf_448_point_t tmp; int i; pt_to_pniels(output[0], working); if (tbits == 0) return; - decaf_448_point_double(working,working); + decaf_448_point_double(tmp,working); pniels_t twop; - pt_to_pniels(twop, working); + pt_to_pniels(twop, tmp); - add_pniels_to_pt(working, output[0],0); - pt_to_pniels(output[1], working); + add_pniels_to_pt(tmp, output[0],0); + pt_to_pniels(output[1], tmp); for (i=2; i < 1<n, sizeof(niels_t)); + gf_cpy(zs[i], tmp[i]->z); + } + batch_normalize_niels(out, zs, zis, 1<<5); +} + void decaf_448_base_double_scalarmul_non_secret ( decaf_448_point_t combo, const decaf_448_scalar_t scalar1, const decaf_448_point_t base2, const decaf_448_scalar_t scalar2 ) { - int i; - unsigned j,k; - const unsigned int n = 5, t = 5; - const int s = 18; // TODO MAGIC + const int table_bits_var = 3, table_bits_pre = 5; // TODO MAGIC + struct smvt_control control_var[DECAF_448_SCALAR_BITS/(table_bits_var+1)+3]; + struct smvt_control control_pre[DECAF_448_SCALAR_BITS/(table_bits_pre+1)+3]; - decaf_448_scalar_t scalar1x; - decaf_448_scalar_add(scalar1x, scalar1, decaf_448_precomputed_scalarmul_adjustment); - decaf_448_halve(scalar1x,scalar1x,decaf_448_scalar_p); - - decaf_448_point_copy(combo, base2); - const int table_bits = 4; // TODO MAGIC - struct smvt_control control[DECAF_448_SCALAR_BITS/(table_bits+1)+3]; - - int control_bits = recode_wnaf(control, scalar2, table_bits); + int ncb_pre = recode_wnaf(control_pre, scalar1, table_bits_pre); + int ncb_var = recode_wnaf(control_var, scalar2, table_bits_var); - pniels_t precmp[1<= 0; i--) { + if (i < 0) { + decaf_448_point_copy(combo, decaf_448_point_identity); + return; + } else if (i > control_pre[0].power) { + pniels_to_pt(combo, precmp_var[control_var[0].addend >> 1]); + contv++; + } else if (i == control_pre[0].power && i >=0 ) { + pniels_to_pt(combo, precmp_var[control_var[0].addend >> 1]); + add_niels_to_pt(combo, decaf_448_wnaf_base[control_pre[0].addend >> 1], i); + contv++; contp++; + } else { + i = control_pre[0].power; + niels_to_pt(combo, decaf_448_wnaf_base[control_pre[0].addend >> 1]); + contp++; + } + + for (i--; i >= 0; i--) { + int cv = (i==control_var[contv].power), cp = (i==control_pre[contp].power); + decaf_448_point_double_internal(combo,combo,i && !(cv||cp)); - if (i == control[conti].power) { - decaf_448_point_double_internal(combo,combo,0); - assert(control[conti].addend); + if (cv) { + assert(control_var[contv].addend); - if (control[conti].addend > 0) { - add_pniels_to_pt(combo, precmp[control[conti].addend >> 1], i>=s); // TODO PERF: internal + if (control_var[contv].addend > 0) { + add_pniels_to_pt(combo, precmp_var[control_var[contv].addend >> 1], i&&!cp); } else { - sub_pniels_from_pt(combo, precmp[(-control[conti].addend) >> 1], i>=s); // TODO PERF: internal + sub_pniels_from_pt(combo, precmp_var[(-control_var[contv].addend) >> 1], i&&!cp); } - conti++; - assert(conti <= control_bits); - } else { - decaf_448_point_double_internal(combo,combo,i>=s); + contv++; } - - if (i < s) { - /* comb component */ - for (j=0; jlimb[bit/WBITS] >> (bit%WBITS) & 1) << k; - } - } - - decaf_bool_t invert = (tab>>(t-1))-1; - tab ^= invert; - tab &= (1<<(t-1)) - 1; - - if (invert) { - sub_niels_from_pt(combo, decaf_448_precomputed_base->table[(j<<(t-1)) + tab], j==n-1 && i); - } else { - add_niels_to_pt(combo, decaf_448_precomputed_base->table[(j<<(t-1)) + tab], j==n-1 && i); - } + + if (cp) { + assert(control_pre[contp].addend); + + if (control_pre[contp].addend > 0) { + add_niels_to_pt(combo, decaf_448_wnaf_base[control_pre[contp].addend >> 1], i); + } else { + sub_niels_from_pt(combo, decaf_448_wnaf_base[(-control_pre[contp].addend) >> 1], i); } + contp++; } } + + assert(contv == ncb_var); (void)ncb_var; + assert(contp == ncb_pre); (void)ncb_pre; } diff --git a/src/decaf_gen_tables.c b/src/decaf_gen_tables.c index 03dd1f2..9b8572d 100644 --- a/src/decaf_gen_tables.c +++ b/src/decaf_gen_tables.c @@ -18,6 +18,15 @@ const decaf_word_t decaf_448_precomputed_base_as_words[1]; const decaf_448_scalar_t decaf_448_precomputed_scalarmul_adjustment; const decaf_448_scalar_t decaf_448_point_scalarmul_adjustment; +struct niels_s; +const decaf_word_t *decaf_448_precomputed_wnaf_as_words; +extern const size_t sizeof_decaf_448_precomputed_wnafs; + +void decaf_448_precompute_wnafs ( + struct niels_s *out, + const decaf_448_point_t base +); + static void scalar_print(const char *name, const decaf_448_scalar_t sc) { printf("const decaf_448_scalar_t %s = {{{\n", name); unsigned i; @@ -36,6 +45,11 @@ int main(int argc, char **argv) { if (ret || !pre) return 1; decaf_448_precompute(pre, decaf_448_point_base); + struct niels_s *preWnaf; + ret = posix_memalign((void**)&preWnaf, alignof_decaf_448_precomputed_s, sizeof_decaf_448_precomputed_wnafs); + if (ret || !preWnaf) return 1; + decaf_448_precompute_wnafs(preWnaf, decaf_448_point_base); + const decaf_word_t *output = (const decaf_word_t *)pre; unsigned i; @@ -43,7 +57,7 @@ int main(int argc, char **argv) { printf("#include \"decaf.h\"\n\n"); printf("const decaf_word_t decaf_448_precomputed_base_as_words[%d]\n", (int)(sizeof_decaf_448_precomputed_s / sizeof(decaf_word_t))); - printf("__attribute__((aligned(%d))) = {\n ", (int)alignof_decaf_448_precomputed_s); + printf("__attribute__((aligned(%d),visibility(\"hidden\"))) = {\n ", (int)alignof_decaf_448_precomputed_s); for (i=0; i < sizeof_decaf_448_precomputed_s; i+=sizeof(decaf_word_t)) { if (i && (i%8==0)) printf(",\n "); @@ -53,6 +67,18 @@ int main(int argc, char **argv) { } printf("\n};\n"); + output = (const decaf_word_t *)preWnaf; + printf("const decaf_word_t decaf_448_precomputed_wnaf_as_words[%d]\n", + (int)(sizeof_decaf_448_precomputed_wnafs / sizeof(decaf_word_t))); + printf("__attribute__((aligned(%d),visibility(\"hidden\"))) = {\n ", (int)alignof_decaf_448_precomputed_s); + for (i=0; i < sizeof_decaf_448_precomputed_wnafs; i+=sizeof(decaf_word_t)) { + if (i && (i%8==0)) printf(",\n "); + else if (i) printf(", "); + printf("0x%0*llxull", (int)sizeof(decaf_word_t)*2, (unsigned long long)*output ); + output++; + } + printf("\n};\n"); + decaf_448_scalar_t smadj; decaf_448_scalar_copy(smadj,decaf_448_scalar_one);