| @@ -70,8 +70,12 @@ LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \ | |||
| build/$(FIELD).o build/ec_point.o build/scalarmul.o build/sha512.o build/magic.o \ | |||
| build/f_arithmetic.o build/arithmetic.o | |||
| DECAFCOMPONENTS= build/$(DECAF).o build/shake.o build/decaf_crypto.o build/decaf_tables.o \ | |||
| DECAFCOMPONENTS= build/$(DECAF).o build/shake.o build/decaf_crypto.o \ | |||
| build/$(FIELD).o build/f_arithmetic.o # TODO | |||
| ifeq ($(DECAF),decaf_fast) | |||
| DECAFCOMPONENTS += build/decaf_tables.o | |||
| endif | |||
| TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \ | |||
| build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o build/magic.o \ | |||
| @@ -89,35 +89,47 @@ const decaf_448_point_t decaf_448_point_base = {{ | |||
| struct decaf_448_precomputed_s { decaf_448_point_t p[1]; }; /* table type as declared here: storage for exactly one point */ | |||
| /* FIXME: restore */ | |||
| // const struct decaf_448_precomputed_s *decaf_448_precomputed_base = | |||
| // (const struct decaf_448_precomputed_s *)decaf_448_point_base; | |||
| extern const decaf_word_t decaf_448_precomputed_base_as_words[]; /* word array defined in another translation unit */ | |||
| const decaf_448_precomputed_s *decaf_448_precomputed_base = | |||
| (const decaf_448_precomputed_s *) &decaf_448_precomputed_base_as_words; | |||
| const struct decaf_448_precomputed_s *decaf_448_precomputed_base = /* NOTE(review): second initialisation of the same name in this listing; looks like the other side of the change - confirm which one is live */ | |||
| (const struct decaf_448_precomputed_s *)decaf_448_point_base; | |||
| const size_t sizeof_decaf_448_precomputed_s = sizeof(struct decaf_448_precomputed_s); /* exported so callers can allocate a table without seeing the struct */ | |||
| const size_t alignof_decaf_448_precomputed_s = 32; /* alignment value; the table generator passes it to posix_memalign and to aligned(...) */ | |||
| #ifdef __clang__ /* VECTORIZE: loop pragma placed in front of the FOR_LIMB_V loop; only defined non-empty for clang */ | |||
| #if 100*__clang_major__ + __clang_minor__ > 305 /* version number above 3.5 */ | |||
| #define VECTORIZE _Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(8)") | |||
| #endif | |||
| #endif | |||
| #ifndef VECTORIZE /* fallback: expands to nothing when the pragma form was not selected above */ | |||
| #define VECTORIZE | |||
| #endif | |||
| #if (defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__)) || defined(DECAF_FORCE_UNROLL) /* optimizing (but not for size), or unrolling explicitly forced */ /* NOTE(review): each FOR_LIMB / FOR_LIMB_U pair below looks like the before/after spelling of one renamed line - confirm */ | |||
| #if DECAF_448_LIMBS==8 /* 8 limbs: op is written out 8 times with i stepping 0..7 */ | |||
| #define FOR_LIMB(i,op) { unsigned int i=0; \ | |||
| #define FOR_LIMB_U(i,op) { unsigned int i=0; \ | |||
| op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; \ | |||
| } | |||
| #elif DECAF_448_LIMBS==16 /* 16 limbs: op is written out 16 times with i stepping 0..15 */ | |||
| #define FOR_LIMB(i,op) { unsigned int i=0; \ | |||
| #define FOR_LIMB_U(i,op) { unsigned int i=0; \ | |||
| op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; \ | |||
| op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; \ | |||
| } | |||
| #else /* any other limb count: ordinary counted loop */ | |||
| #define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<DECAF_448_LIMBS; i++) { op; }} | |||
| #define FOR_LIMB_U(i,op) { unsigned int i=0; for (i=0; i<DECAF_448_LIMBS; i++) { op; }} | |||
| #endif | |||
| #else /* unrolling not selected: ordinary counted loop */ | |||
| #define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<DECAF_448_LIMBS; i++) { op; }} | |||
| #define FOR_LIMB_U(i,op) { unsigned int i=0; for (i=0; i<DECAF_448_LIMBS; i++) { op; }} | |||
| #endif | |||
| #define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<DECAF_448_LIMBS; i++) { op; }} /* unconditional form: always a counted loop running op for i = 0 .. DECAF_448_LIMBS-1 */ | |||
| /* TODO: figure out why this horribly degrades speed if you use it */ | |||
| #define FOR_LIMB_V(i,op) { unsigned int i=0; VECTORIZE for (i=0; i<DECAF_448_LIMBS; i++) { op; }} /* same counted loop with the VECTORIZE pragma in front of it */ | |||
| /** Copy x = y: every limb of y is assigned to the same limb of x. */ | |||
| siv gf_cpy(gf x, const gf y) { FOR_LIMB(i, x->limb[i] = y->limb[i]); } | |||
| siv gf_cpy(gf x, const gf y) { FOR_LIMB_U(i, x->limb[i] = y->limb[i]); } | |||
| /** Mostly-unoptimized multiply (PERF), but at least it's unrolled. */ | |||
| snv gf_mul (gf c, const gf a, const gf b) { | |||
| @@ -125,19 +137,19 @@ snv gf_mul (gf c, const gf a, const gf b) { | |||
| gf_cpy(aa,a); | |||
| decaf_dword_t accum[DECAF_448_LIMBS] = {0}; | |||
| FOR_LIMB(i, { | |||
| FOR_LIMB(j,{ accum[(i+j)%DECAF_448_LIMBS] += (decaf_dword_t)b->limb[i] * aa->limb[j]; }); | |||
| FOR_LIMB_U(i, { | |||
| FOR_LIMB_U(j,{ accum[(i+j)%DECAF_448_LIMBS] += (decaf_dword_t)b->limb[i] * aa->limb[j]; }); | |||
| aa->limb[(DECAF_448_LIMBS-1-i)^(DECAF_448_LIMBS/2)] += aa->limb[DECAF_448_LIMBS-1-i]; | |||
| }); | |||
| accum[DECAF_448_LIMBS-1] += accum[DECAF_448_LIMBS-2] >> LBITS; | |||
| accum[DECAF_448_LIMBS-2] &= LMASK; | |||
| accum[DECAF_448_LIMBS/2] += accum[DECAF_448_LIMBS-1] >> LBITS; | |||
| FOR_LIMB(j,{ | |||
| FOR_LIMB_U(j,{ | |||
| accum[j] += accum[(j-1)%DECAF_448_LIMBS] >> LBITS; | |||
| accum[(j-1)%DECAF_448_LIMBS] &= LMASK; | |||
| }); | |||
| FOR_LIMB(j, c->limb[j] = accum[j] ); | |||
| FOR_LIMB_U(j, c->limb[j] = accum[j] ); | |||
| } | |||
| /** No dedicated square (PERF) */ | |||
| @@ -166,7 +178,7 @@ snv gf_isqrt(gf y, const gf x) { | |||
| /** Weak reduce mod p. */ | |||
| siv gf_reduce(gf x) { | |||
| x->limb[DECAF_448_LIMBS/2] += x->limb[DECAF_448_LIMBS-1] >> LBITS; | |||
| FOR_LIMB(j,{ | |||
| FOR_LIMB_U(j,{ | |||
| x->limb[j] += x->limb[(j-1)%DECAF_448_LIMBS] >> LBITS; | |||
| x->limb[(j-1)%DECAF_448_LIMBS] &= LMASK; | |||
| }); | |||
| @@ -174,19 +186,19 @@ siv gf_reduce(gf x) { | |||
| /** Add mod p: x->limb[i] = y->limb[i] + z->limb[i] for every limb, followed by gf_reduce(x). Conservatively always weak-reduce. (PERF) */ | |||
| sv gf_add ( gf x, const gf y, const gf z ) { | |||
| FOR_LIMB(i, x->limb[i] = y->limb[i] + z->limb[i] ); | |||
| FOR_LIMB_U(i, x->limb[i] = y->limb[i] + z->limb[i] ); | |||
| gf_reduce(x); /* carry handling is left entirely to gf_reduce */ | |||
| } | |||
| /** Subtract mod p: x->limb[i] = y->limb[i] - z->limb[i] + 2*P->limb[i] for every limb, followed by gf_reduce(x). Conservatively always weak-reduce. (PERF) */ | |||
| sv gf_sub ( gf x, const gf y, const gf z ) { | |||
| FOR_LIMB(i, x->limb[i] = y->limb[i] - z->limb[i] + 2*P->limb[i] ); | |||
| FOR_LIMB_U(i, x->limb[i] = y->limb[i] - z->limb[i] + 2*P->limb[i] ); /* the 2*P term is presumably there so the limb-wise difference cannot go below zero - TODO confirm against the limb bounds */ | |||
| gf_reduce(x); | |||
| } | |||
| /** Constant time, x = is_z ? z : y. The choice is made per limb with AND/OR masking rather than a branch; assumes is_z is either all-zero or all-one bits - TODO confirm against decaf_bool_t. */ | |||
| sv cond_sel(gf x, const gf y, const gf z, decaf_bool_t is_z) { | |||
| FOR_LIMB(i, x->limb[i] = (y->limb[i] & ~is_z) | (z->limb[i] & is_z) ); | |||
| FOR_LIMB_U(i, x->limb[i] = (y->limb[i] & ~is_z) | (z->limb[i] & is_z) ); | |||
| } | |||
| /** Constant time, if (neg) x=-x; */ | |||
| @@ -198,7 +210,7 @@ siv cond_neg(gf x, decaf_bool_t neg) { | |||
| /** Constant time, if (swap) (x,y) = (y,x); */ | |||
| sv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) { | |||
| FOR_LIMB(i, { | |||
| FOR_LIMB_U(i, { | |||
| decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap; | |||
| x->limb[i] ^= s; | |||
| y->limb[i] ^= s; | |||
| @@ -388,9 +400,9 @@ decaf_bool_t decaf_448_scalar_eq ( | |||
| const decaf_448_scalar_t a, | |||
| const decaf_448_scalar_t b | |||
| ) { | |||
| int i; | |||
| decaf_word_t diff = 0; | |||
| unsigned int i; | |||
| for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) { | |||
| for(i=0; i<DECAF_448_SCALAR_LIMBS; i++) { | |||
| diff |= a->limb[i] ^ b->limb[i]; | |||
| } | |||
| return (((decaf_dword_t)diff)-1)>>WBITS; | |||
| @@ -424,14 +436,14 @@ void decaf_448_point_encode( unsigned char ser[DECAF_448_SER_BYTES], const decaf | |||
| cond_neg ( a, hibit(a) ); | |||
| gf_canon(a); | |||
| int i, k=0, bits=0; | |||
| int k=0, bits=0; | |||
| decaf_dword_t buf=0; | |||
| for (i=0; i<DECAF_448_LIMBS; i++) { | |||
| FOR_LIMB(i, { | |||
| buf |= (decaf_dword_t)a->limb[i]<<bits; | |||
| for (bits += LBITS; (bits>=8 || i==DECAF_448_LIMBS-1) && k<DECAF_448_SER_BYTES; bits-=8, buf>>=8) { | |||
| ser[k++]=buf; | |||
| } | |||
| } | |||
| }); | |||
| } | |||
| /** | |||
| @@ -1073,6 +1073,31 @@ static void gf_batch_invert ( | |||
| } | |||
| } | |||
| static void batch_normalize_niels ( /* multiply the a, b and c fields of n table entries by zis[i] and store each result in canonical form */ | |||
| niels_t *table, /* entries rewritten in place */ | |||
| gf *zs, /* input to gf_batch_invert; presumably the z value belonging to each entry */ | |||
| gf *zis, /* output of gf_batch_invert (presumably 1/zs[i]); also the per-entry multiplier below */ | |||
| int n /* number of entries in table, zs and zis */ | |||
| ) { | |||
| int i; | |||
| gf product; /* scratch: gf_mul writes here, then the canonical value is copied back */ | |||
| gf_batch_invert(zis, zs, n); /* one batched call for all n entries */ | |||
| for (i=0; i<n; i++) { | |||
| gf_mul(product, table[i]->a, zis[i]); /* a <- canon(a * zis[i]) */ | |||
| gf_canon(product); | |||
| gf_cpy(table[i]->a, product); | |||
| gf_mul(product, table[i]->b, zis[i]); /* b <- canon(b * zis[i]) */ | |||
| gf_canon(product); | |||
| gf_cpy(table[i]->b, product); | |||
| gf_mul(product, table[i]->c, zis[i]); /* c <- canon(c * zis[i]) */ | |||
| gf_canon(product); | |||
| gf_cpy(table[i]->c, product); | |||
| } | |||
| } | |||
| void | |||
| decaf_448_precompute ( | |||
| decaf_448_precomputed_s *table, | |||
| @@ -1129,22 +1154,7 @@ decaf_448_precompute ( | |||
| } | |||
| } | |||
| gf_batch_invert(zis, zs, n<<(t-1)); | |||
| gf product; | |||
| for (i=0; i<n<<(t-1); i++) { | |||
| gf_mul(product, table->table[i]->a, zis[i]); | |||
| gf_canon(product); | |||
| gf_cpy(table->table[i]->a, product); | |||
| gf_mul(product, table->table[i]->b, zis[i]); | |||
| gf_canon(product); | |||
| gf_cpy(table->table[i]->b, product); | |||
| gf_mul(product, table->table[i]->c, zis[i]); | |||
| gf_canon(product); | |||
| gf_cpy(table->table[i]->c, product); | |||
| } | |||
| batch_normalize_niels(table->table,zs,zis,n<<(t-1)); | |||
| } | |||
| extern const decaf_448_scalar_t decaf_448_precomputed_scalarmul_adjustment; | |||
| @@ -1396,93 +1406,115 @@ static int recode_wnaf ( | |||
| sv prepare_wnaf_table( /* fill output[0 .. (1<<tbits)-1]; going by the helper names, entry k ends up holding (2k+1) times the input point */ | |||
| pniels_t *output, /* destination table; caller provides 1<<tbits entries */ | |||
| decaf_448_point_t working, | |||
| const decaf_448_point_t working, /* input point; the rows that use tmp leave it unmodified */ | |||
| unsigned int tbits /* log2 of the number of table entries */ | |||
| ) { | |||
| decaf_448_point_t tmp; /* running point used by the tmp-based rows */ | |||
| int i; | |||
| pt_to_pniels(output[0], working); /* entry 0 is the input point itself */ | |||
| if (tbits == 0) return; /* one-entry table: nothing more to compute */ | |||
| decaf_448_point_double(working,working); | |||
| decaf_448_point_double(tmp,working); | |||
| pniels_t twop; /* the doubled point in pniels form; it is the step added for every later entry */ | |||
| pt_to_pniels(twop, working); | |||
| pt_to_pniels(twop, tmp); | |||
| add_pniels_to_pt(working, output[0],0); | |||
| pt_to_pniels(output[1], working); | |||
| add_pniels_to_pt(tmp, output[0],0); /* doubled point plus entry 0 */ | |||
| pt_to_pniels(output[1], tmp); | |||
| for (i=2; i < 1<<tbits; i++) { /* each remaining entry is the previous running point plus twop */ | |||
| add_pniels_to_pt(working, twop,0); | |||
| pt_to_pniels(output[i], working); | |||
| add_pniels_to_pt(tmp, twop,0); | |||
| pt_to_pniels(output[i], tmp); | |||
| } | |||
| } | |||
| extern const decaf_word_t decaf_448_precomputed_wnaf_as_words[]; /* word array defined in another translation unit */ | |||
| static const niels_t *decaf_448_wnaf_base = (const niels_t *)decaf_448_precomputed_wnaf_as_words; /* the same words viewed as an array of niels_t */ | |||
| const size_t sizeof_decaf_448_precomputed_wnafs __attribute((visibility("hidden"))) = sizeof(niels_t)<<5; /* byte size of 1<<5 niels_t entries */ | |||
| void decaf_448_precompute_wnafs ( /* prototype carrying the hidden-visibility attribute; the definition follows */ | |||
| niels_t out[1<<5], | |||
| const decaf_448_point_t base | |||
| ) __attribute__ ((visibility ("hidden"))); | |||
| void decaf_448_precompute_wnafs ( /* build the 1<<5-entry table for base and write it, normalized, into out */ | |||
| niels_t out[1<<5], /* receives the finished entries */ | |||
| const decaf_448_point_t base /* point the table is built from */ | |||
| ) { | |||
| // TODO MAGIC | |||
| pniels_t tmp[1<<5]; /* table as produced by prepare_wnaf_table, each entry still carrying its own z */ | |||
| gf zs[1<<5], zis[1<<5]; /* z of every entry, and the buffer batch_normalize_niels fills via gf_batch_invert */ | |||
| int i; | |||
| prepare_wnaf_table(tmp,base,5); /* tbits = 5 gives 1<<5 entries */ | |||
| for (i=0; i<1<<5; i++) { | |||
| memcpy(out[i], tmp[i]->n, sizeof(niels_t)); /* copy the n member of the pniels entry */ | |||
| gf_cpy(zs[i], tmp[i]->z); /* keep the z member for the batch step below */ | |||
| } | |||
| batch_normalize_niels(out, zs, zis, 1<<5); /* rescales every out[i] field by zis[i] and canonicalises it */ | |||
| } | |||
| void decaf_448_base_double_scalarmul_non_secret ( /* combines scalar1 (consumed against the fixed decaf_448_wnaf_base table) with scalar2 (consumed against a table built from base2); control flow depends on the recoded scalar digits */ | |||
| decaf_448_point_t combo, /* output point */ | |||
| const decaf_448_scalar_t scalar1, /* recoded into control_pre */ | |||
| const decaf_448_point_t base2, /* source point for precmp_var */ | |||
| const decaf_448_scalar_t scalar2 /* recoded into control_var */ | |||
| ) { | |||
| int i; | |||
| unsigned j,k; | |||
| const unsigned int n = 5, t = 5; | |||
| const int s = 18; // TODO MAGIC | |||
| const int table_bits_var = 3, table_bits_pre = 5; // TODO MAGIC | |||
| struct smvt_control control_var[DECAF_448_SCALAR_BITS/(table_bits_var+1)+3]; | |||
| struct smvt_control control_pre[DECAF_448_SCALAR_BITS/(table_bits_pre+1)+3]; | |||
| decaf_448_scalar_t scalar1x; | |||
| decaf_448_scalar_add(scalar1x, scalar1, decaf_448_precomputed_scalarmul_adjustment); | |||
| decaf_448_halve(scalar1x,scalar1x,decaf_448_scalar_p); | |||
| decaf_448_point_copy(combo, base2); | |||
| const int table_bits = 4; // TODO MAGIC | |||
| struct smvt_control control[DECAF_448_SCALAR_BITS/(table_bits+1)+3]; | |||
| int control_bits = recode_wnaf(control, scalar2, table_bits); | |||
| int ncb_pre = recode_wnaf(control_pre, scalar1, table_bits_pre); /* digit list for scalar1; return value is checked against contp at the end */ | |||
| int ncb_var = recode_wnaf(control_var, scalar2, table_bits_var); /* digit list for scalar2; return value is checked against contv at the end */ | |||
| pniels_t precmp[1<<table_bits]; | |||
| prepare_wnaf_table(precmp, combo, table_bits); | |||
| decaf_448_point_copy(combo, decaf_448_point_identity); | |||
| pniels_t precmp_var[1<<table_bits_var]; | |||
| prepare_wnaf_table(precmp_var, base2, table_bits_var); /* per-call table for base2 */ | |||
| int contp=0, contv=0, i = control_var[0].power; /* contp / contv: index of the next unused entry of control_pre / control_var; i starts at the first scalar2 entry's power */ | |||
| int conti = 0; | |||
| for (i = control[0].power; i >= 0; i--) { | |||
| if (i < 0) { /* NOTE(review): decided from control_var[0] alone; control_pre (scalar1) is not examined before the identity is returned - confirm this is intended */ | |||
| decaf_448_point_copy(combo, decaf_448_point_identity); | |||
| return; | |||
| } else if (i > control_pre[0].power) { /* scalar2's first entry has the higher power: start from its table entry only */ | |||
| pniels_to_pt(combo, precmp_var[control_var[0].addend >> 1]); /* NOTE(review): no sign check on this leading addend, unlike the loop below - presumably always positive, confirm */ | |||
| contv++; | |||
| } else if (i == control_pre[0].power && i >=0 ) { /* both lists start at the same power: start from the sum of both table entries */ | |||
| pniels_to_pt(combo, precmp_var[control_var[0].addend >> 1]); | |||
| add_niels_to_pt(combo, decaf_448_wnaf_base[control_pre[0].addend >> 1], i); | |||
| contv++; contp++; | |||
| } else { /* scalar1's first entry has the higher power: start there instead */ | |||
| i = control_pre[0].power; | |||
| niels_to_pt(combo, decaf_448_wnaf_base[control_pre[0].addend >> 1]); | |||
| contp++; | |||
| } | |||
| for (i--; i >= 0; i--) { /* one doubling per remaining power, then add or subtract whichever entries sit at that power */ | |||
| int cv = (i==control_var[contv].power), cp = (i==control_pre[contp].power); /* cv / cp: next scalar2 / scalar1 entry is located at power i */ | |||
| decaf_448_point_double_internal(combo,combo,i && !(cv||cp)); | |||
| if (i == control[conti].power) { | |||
| decaf_448_point_double_internal(combo,combo,0); | |||
| assert(control[conti].addend); | |||
| if (cv) { | |||
| assert(control_var[contv].addend); | |||
| if (control[conti].addend > 0) { | |||
| add_pniels_to_pt(combo, precmp[control[conti].addend >> 1], i>=s); // TODO PERF: internal | |||
| if (control_var[contv].addend > 0) { /* sign of the addend selects add vs subtract; the table index is |addend| >> 1 */ | |||
| add_pniels_to_pt(combo, precmp_var[control_var[contv].addend >> 1], i&&!cp); | |||
| } else { | |||
| sub_pniels_from_pt(combo, precmp[(-control[conti].addend) >> 1], i>=s); // TODO PERF: internal | |||
| sub_pniels_from_pt(combo, precmp_var[(-control_var[contv].addend) >> 1], i&&!cp); | |||
| } | |||
| conti++; | |||
| assert(conti <= control_bits); | |||
| } else { | |||
| decaf_448_point_double_internal(combo,combo,i>=s); | |||
| contv++; | |||
| } | |||
| if (i < s) { | |||
| /* comb component */ | |||
| for (j=0; j<n; j++) { | |||
| int tab = 0; | |||
| for (k=0; k<t; k++) { | |||
| unsigned int bit = i + s*(k + j*t); | |||
| if (bit < SCALAR_WORDS * WBITS) { | |||
| tab |= (scalar1x->limb[bit/WBITS] >> (bit%WBITS) & 1) << k; | |||
| } | |||
| } | |||
| decaf_bool_t invert = (tab>>(t-1))-1; | |||
| tab ^= invert; | |||
| tab &= (1<<(t-1)) - 1; | |||
| if (invert) { | |||
| sub_niels_from_pt(combo, decaf_448_precomputed_base->table[(j<<(t-1)) + tab], j==n-1 && i); | |||
| } else { | |||
| add_niels_to_pt(combo, decaf_448_precomputed_base->table[(j<<(t-1)) + tab], j==n-1 && i); | |||
| } | |||
| if (cp) { | |||
| assert(control_pre[contp].addend); | |||
| if (control_pre[contp].addend > 0) { /* same add/subtract rule, against the fixed table */ | |||
| add_niels_to_pt(combo, decaf_448_wnaf_base[control_pre[contp].addend >> 1], i); | |||
| } else { | |||
| sub_niels_from_pt(combo, decaf_448_wnaf_base[(-control_pre[contp].addend) >> 1], i); | |||
| } | |||
| contp++; | |||
| } | |||
| } | |||
| assert(contv == ncb_var); (void)ncb_var; /* every scalar2 entry consumed; the void cast keeps ncb_var referenced when assert compiles away */ | |||
| assert(contp == ncb_pre); (void)ncb_pre; /* every scalar1 entry consumed */ | |||
| } | |||
| @@ -18,6 +18,15 @@ const decaf_word_t decaf_448_precomputed_base_as_words[1]; | |||
| const decaf_448_scalar_t decaf_448_precomputed_scalarmul_adjustment; | |||
| const decaf_448_scalar_t decaf_448_point_scalarmul_adjustment; | |||
| struct niels_s; /* left incomplete here: this listing only handles it through pointers */ | |||
| const decaf_word_t *decaf_448_precomputed_wnaf_as_words; /* NOTE(review): a pointer object here, while the other visible declaration of this name is an array (decaf_448_precomputed_wnaf_as_words[]) - confirm the mismatch is an intentional link-time placeholder */ | |||
| extern const size_t sizeof_decaf_448_precomputed_wnafs; /* byte size of the wNAF table; main uses it as the allocation size and the dump-loop bound */ | |||
| void decaf_448_precompute_wnafs ( /* local prototype for the table builder */ | |||
| struct niels_s *out, | |||
| const decaf_448_point_t base | |||
| ); | |||
| static void scalar_print(const char *name, const decaf_448_scalar_t sc) { | |||
| printf("const decaf_448_scalar_t %s = {{{\n", name); | |||
| unsigned i; | |||
| @@ -36,6 +45,11 @@ int main(int argc, char **argv) { | |||
| if (ret || !pre) return 1; | |||
| decaf_448_precompute(pre, decaf_448_point_base); | |||
| struct niels_s *preWnaf; | |||
| ret = posix_memalign((void**)&preWnaf, alignof_decaf_448_precomputed_s, sizeof_decaf_448_precomputed_wnafs); | |||
| if (ret || !preWnaf) return 1; | |||
| decaf_448_precompute_wnafs(preWnaf, decaf_448_point_base); | |||
| const decaf_word_t *output = (const decaf_word_t *)pre; | |||
| unsigned i; | |||
| @@ -43,7 +57,7 @@ int main(int argc, char **argv) { | |||
| printf("#include \"decaf.h\"\n\n"); | |||
| printf("const decaf_word_t decaf_448_precomputed_base_as_words[%d]\n", | |||
| (int)(sizeof_decaf_448_precomputed_s / sizeof(decaf_word_t))); | |||
| printf("__attribute__((aligned(%d))) = {\n ", (int)alignof_decaf_448_precomputed_s); | |||
| printf("__attribute__((aligned(%d),visibility(\"hidden\"))) = {\n ", (int)alignof_decaf_448_precomputed_s); | |||
| for (i=0; i < sizeof_decaf_448_precomputed_s; i+=sizeof(decaf_word_t)) { | |||
| if (i && (i%8==0)) printf(",\n "); | |||
| @@ -53,6 +67,18 @@ int main(int argc, char **argv) { | |||
| } | |||
| printf("\n};\n"); | |||
| output = (const decaf_word_t *)preWnaf; | |||
| printf("const decaf_word_t decaf_448_precomputed_wnaf_as_words[%d]\n", | |||
| (int)(sizeof_decaf_448_precomputed_wnafs / sizeof(decaf_word_t))); | |||
| printf("__attribute__((aligned(%d),visibility(\"hidden\"))) = {\n ", (int)alignof_decaf_448_precomputed_s); | |||
| for (i=0; i < sizeof_decaf_448_precomputed_wnafs; i+=sizeof(decaf_word_t)) { | |||
| if (i && (i%8==0)) printf(",\n "); | |||
| else if (i) printf(", "); | |||
| printf("0x%0*llxull", (int)sizeof(decaf_word_t)*2, (unsigned long long)*output ); | |||
| output++; | |||
| } | |||
| printf("\n};\n"); | |||
| decaf_448_scalar_t smadj; | |||
| decaf_448_scalar_copy(smadj,decaf_448_scalar_one); | |||