@@ -70,8 +70,12 @@ LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \ | |||
build/$(FIELD).o build/ec_point.o build/scalarmul.o build/sha512.o build/magic.o \ | |||
build/f_arithmetic.o build/arithmetic.o | |||
DECAFCOMPONENTS= build/$(DECAF).o build/shake.o build/decaf_crypto.o build/decaf_tables.o \ | |||
DECAFCOMPONENTS= build/$(DECAF).o build/shake.o build/decaf_crypto.o \ | |||
build/$(FIELD).o build/f_arithmetic.o # TODO | |||
# The precomputed-table object is only added to DECAFCOMPONENTS when the
# selected DECAF implementation is decaf_fast.
ifeq ($(DECAF),decaf_fast) | |||
DECAFCOMPONENTS += build/decaf_tables.o | |||
endif | |||
TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \ | |||
build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o build/magic.o \ | |||
@@ -89,35 +89,47 @@ const decaf_448_point_t decaf_448_point_base = {{ | |||
/** Precomputed-table type for this implementation: a wrapper around one point. */
struct decaf_448_precomputed_s { decaf_448_point_t p[1]; }; | |||
/* FIXME: restore */ | |||
// const struct decaf_448_precomputed_s *decaf_448_precomputed_base = | |||
// (const struct decaf_448_precomputed_s *)decaf_448_point_base; | |||
/*
 * NOTE(review): decaf_448_precomputed_base is defined twice below -- once as
 * a view of the external word array decaf_448_precomputed_base_as_words and
 * once as a view of decaf_448_point_base.  This looks like both sides of a
 * change; only one definition can remain.  The first form also omits the
 * `struct` keyword -- confirm a typedef exists if it is the one kept.
 */
extern const decaf_word_t decaf_448_precomputed_base_as_words[]; | |||
const decaf_448_precomputed_s *decaf_448_precomputed_base = | |||
(const decaf_448_precomputed_s *) &decaf_448_precomputed_base_as_words; | |||
const struct decaf_448_precomputed_s *decaf_448_precomputed_base = | |||
(const struct decaf_448_precomputed_s *)decaf_448_point_base; | |||
/* Size of the table type, and the alignment value published alongside it
 * (presumably for callers that allocate a table without seeing the struct). */
const size_t sizeof_decaf_448_precomputed_s = sizeof(struct decaf_448_precomputed_s); | |||
const size_t alignof_decaf_448_precomputed_s = 32; | |||
/*
 * VECTORIZE: loop hint placed directly in front of a `for` statement.
 * On clang newer than 3.5 it expands to a `clang loop` pragma that disables
 * unrolling and enables vectorization with width 8; on every other compiler
 * (or if nothing defined it above) it expands to nothing.
 */
#ifdef __clang__ | |||
#if 100*__clang_major__ + __clang_minor__ > 305 | |||
#define VECTORIZE _Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(8)") | |||
#endif | |||
#endif | |||
#ifndef VECTORIZE | |||
#define VECTORIZE | |||
#endif | |||
/*
 * Limb-loop helpers.  Each macro opens its own scope, declares
 * `unsigned int i` (under the caller-chosen name) and evaluates `op` once for
 * every i in [0, DECAF_448_LIMBS).
 *
 *   FOR_LIMB_U  fully unrolled when DECAF_448_LIMBS is 8 or 16 and the build
 *               optimizes for speed (or DECAF_FORCE_UNROLL is defined);
 *               an ordinary loop in every other configuration.
 *   FOR_LIMB    always an ordinary loop.
 *   FOR_LIMB_V  ordinary loop prefixed with the VECTORIZE hint.
 *
 * `op` is pasted textually, so a top-level comma inside it must be wrapped in
 * parentheses (braces do not protect commas from the preprocessor).
 */
#if (defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__)) || defined(DECAF_FORCE_UNROLL)
    #if DECAF_448_LIMBS==8
        #define FOR_LIMB_U(i,op) { unsigned int i=0; \
            op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; \
        }
    #elif DECAF_448_LIMBS==16
        #define FOR_LIMB_U(i,op) { unsigned int i=0; \
            op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; \
            op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; \
        }
    #else
        #define FOR_LIMB_U(i,op) { unsigned int i=0; for (i=0; i<DECAF_448_LIMBS; i++) { op; }}
    #endif
#else
    #define FOR_LIMB_U(i,op) { unsigned int i=0; for (i=0; i<DECAF_448_LIMBS; i++) { op; }}
#endif

#define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<DECAF_448_LIMBS; i++) { op; }}

/* TODO: figure out why this horribly degrades speed if you use it */
#define FOR_LIMB_V(i,op) { unsigned int i=0; VECTORIZE for (i=0; i<DECAF_448_LIMBS; i++) { op; }}
/** Copy x = y */ | |||
siv gf_cpy(gf x, const gf y) { FOR_LIMB(i, x->limb[i] = y->limb[i]); } | |||
siv gf_cpy(gf x, const gf y) { FOR_LIMB_U(i, x->limb[i] = y->limb[i]); } | |||
/** Mostly-unoptimized multiply (PERF), but at least it's unrolled. */ | |||
snv gf_mul (gf c, const gf a, const gf b) { | |||
@@ -125,19 +137,19 @@ snv gf_mul (gf c, const gf a, const gf b) { | |||
gf_cpy(aa,a); | |||
decaf_dword_t accum[DECAF_448_LIMBS] = {0}; | |||
FOR_LIMB(i, { | |||
FOR_LIMB(j,{ accum[(i+j)%DECAF_448_LIMBS] += (decaf_dword_t)b->limb[i] * aa->limb[j]; }); | |||
FOR_LIMB_U(i, { | |||
FOR_LIMB_U(j,{ accum[(i+j)%DECAF_448_LIMBS] += (decaf_dword_t)b->limb[i] * aa->limb[j]; }); | |||
aa->limb[(DECAF_448_LIMBS-1-i)^(DECAF_448_LIMBS/2)] += aa->limb[DECAF_448_LIMBS-1-i]; | |||
}); | |||
accum[DECAF_448_LIMBS-1] += accum[DECAF_448_LIMBS-2] >> LBITS; | |||
accum[DECAF_448_LIMBS-2] &= LMASK; | |||
accum[DECAF_448_LIMBS/2] += accum[DECAF_448_LIMBS-1] >> LBITS; | |||
FOR_LIMB(j,{ | |||
FOR_LIMB_U(j,{ | |||
accum[j] += accum[(j-1)%DECAF_448_LIMBS] >> LBITS; | |||
accum[(j-1)%DECAF_448_LIMBS] &= LMASK; | |||
}); | |||
FOR_LIMB(j, c->limb[j] = accum[j] ); | |||
FOR_LIMB_U(j, c->limb[j] = accum[j] ); | |||
} | |||
/** No dedicated square (PERF) */ | |||
@@ -166,7 +178,7 @@ snv gf_isqrt(gf y, const gf x) { | |||
/** Weak reduce mod p. */ | |||
siv gf_reduce(gf x) { | |||
x->limb[DECAF_448_LIMBS/2] += x->limb[DECAF_448_LIMBS-1] >> LBITS; | |||
FOR_LIMB(j,{ | |||
FOR_LIMB_U(j,{ | |||
x->limb[j] += x->limb[(j-1)%DECAF_448_LIMBS] >> LBITS; | |||
x->limb[(j-1)%DECAF_448_LIMBS] &= LMASK; | |||
}); | |||
@@ -174,19 +186,19 @@ siv gf_reduce(gf x) { | |||
/** Add mod p. Conservatively always weak-reduce. (PERF) */ | |||
sv gf_add ( gf x, const gf y, const gf z ) { | |||
FOR_LIMB(i, x->limb[i] = y->limb[i] + z->limb[i] ); | |||
FOR_LIMB_U(i, x->limb[i] = y->limb[i] + z->limb[i] ); | |||
gf_reduce(x); | |||
} | |||
/** Subtract mod p. Conservatively always weak-reduce. (PERF) */ | |||
sv gf_sub ( gf x, const gf y, const gf z ) { | |||
FOR_LIMB(i, x->limb[i] = y->limb[i] - z->limb[i] + 2*P->limb[i] ); | |||
FOR_LIMB_U(i, x->limb[i] = y->limb[i] - z->limb[i] + 2*P->limb[i] ); | |||
gf_reduce(x); | |||
} | |||
/** Constant time, x = is_z ? z : y */ | |||
sv cond_sel(gf x, const gf y, const gf z, decaf_bool_t is_z) { | |||
FOR_LIMB(i, x->limb[i] = (y->limb[i] & ~is_z) | (z->limb[i] & is_z) ); | |||
FOR_LIMB_U(i, x->limb[i] = (y->limb[i] & ~is_z) | (z->limb[i] & is_z) ); | |||
} | |||
/** Constant time, if (neg) x=-x; */ | |||
@@ -198,7 +210,7 @@ siv cond_neg(gf x, decaf_bool_t neg) { | |||
/** Constant time, if (swap) (x,y) = (y,x); */ | |||
sv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) { | |||
FOR_LIMB(i, { | |||
FOR_LIMB_U(i, { | |||
decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap; | |||
x->limb[i] ^= s; | |||
y->limb[i] ^= s; | |||
@@ -388,9 +400,9 @@ decaf_bool_t decaf_448_scalar_eq ( | |||
const decaf_448_scalar_t a, | |||
const decaf_448_scalar_t b | |||
) { | |||
int i; | |||
decaf_word_t diff = 0; | |||
unsigned int i; | |||
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) { | |||
for(i=0; i<DECAF_448_SCALAR_LIMBS; i++) { | |||
diff |= a->limb[i] ^ b->limb[i]; | |||
} | |||
return (((decaf_dword_t)diff)-1)>>WBITS; | |||
@@ -424,14 +436,14 @@ void decaf_448_point_encode( unsigned char ser[DECAF_448_SER_BYTES], const decaf | |||
cond_neg ( a, hibit(a) ); | |||
gf_canon(a); | |||
int i, k=0, bits=0; | |||
int k=0, bits=0; | |||
decaf_dword_t buf=0; | |||
for (i=0; i<DECAF_448_LIMBS; i++) { | |||
FOR_LIMB(i, { | |||
buf |= (decaf_dword_t)a->limb[i]<<bits; | |||
for (bits += LBITS; (bits>=8 || i==DECAF_448_LIMBS-1) && k<DECAF_448_SER_BYTES; bits-=8, buf>>=8) { | |||
ser[k++]=buf; | |||
} | |||
} | |||
}); | |||
} | |||
/** | |||
@@ -1073,6 +1073,31 @@ static void gf_batch_invert ( | |||
} | |||
} | |||
static void batch_normalize_niels ( | |||
niels_t *table, | |||
gf *zs, | |||
gf *zis, | |||
int n | |||
) { | |||
int i; | |||
gf product; | |||
gf_batch_invert(zis, zs, n); | |||
for (i=0; i<n; i++) { | |||
gf_mul(product, table[i]->a, zis[i]); | |||
gf_canon(product); | |||
gf_cpy(table[i]->a, product); | |||
gf_mul(product, table[i]->b, zis[i]); | |||
gf_canon(product); | |||
gf_cpy(table[i]->b, product); | |||
gf_mul(product, table[i]->c, zis[i]); | |||
gf_canon(product); | |||
gf_cpy(table[i]->c, product); | |||
} | |||
} | |||
void | |||
decaf_448_precompute ( | |||
decaf_448_precomputed_s *table, | |||
@@ -1129,22 +1154,7 @@ decaf_448_precompute ( | |||
} | |||
} | |||
gf_batch_invert(zis, zs, n<<(t-1)); | |||
gf product; | |||
for (i=0; i<n<<(t-1); i++) { | |||
gf_mul(product, table->table[i]->a, zis[i]); | |||
gf_canon(product); | |||
gf_cpy(table->table[i]->a, product); | |||
gf_mul(product, table->table[i]->b, zis[i]); | |||
gf_canon(product); | |||
gf_cpy(table->table[i]->b, product); | |||
gf_mul(product, table->table[i]->c, zis[i]); | |||
gf_canon(product); | |||
gf_cpy(table->table[i]->c, product); | |||
} | |||
batch_normalize_niels(table->table,zs,zis,n<<(t-1)); | |||
} | |||
extern const decaf_448_scalar_t decaf_448_precomputed_scalarmul_adjustment; | |||
@@ -1396,93 +1406,115 @@ static int recode_wnaf ( | |||
sv prepare_wnaf_table( | |||
pniels_t *output, | |||
decaf_448_point_t working, | |||
const decaf_448_point_t working, | |||
unsigned int tbits | |||
) { | |||
decaf_448_point_t tmp; | |||
int i; | |||
pt_to_pniels(output[0], working); | |||
if (tbits == 0) return; | |||
decaf_448_point_double(working,working); | |||
decaf_448_point_double(tmp,working); | |||
pniels_t twop; | |||
pt_to_pniels(twop, working); | |||
pt_to_pniels(twop, tmp); | |||
add_pniels_to_pt(working, output[0],0); | |||
pt_to_pniels(output[1], working); | |||
add_pniels_to_pt(tmp, output[0],0); | |||
pt_to_pniels(output[1], tmp); | |||
for (i=2; i < 1<<tbits; i++) { | |||
add_pniels_to_pt(working, twop,0); | |||
pt_to_pniels(output[i], working); | |||
add_pniels_to_pt(tmp, twop,0); | |||
pt_to_pniels(output[i], tmp); | |||
} | |||
} | |||
/* Word array defined in another translation unit. */
extern const decaf_word_t decaf_448_precomputed_wnaf_as_words[]; | |||
/* The same words viewed as an array of niels_t entries. */
static const niels_t *decaf_448_wnaf_base = (const niels_t *)decaf_448_precomputed_wnaf_as_words; | |||
/* Byte size of a 32-entry (1<<5) niels_t table; hidden from the shared-library ABI. */
const size_t sizeof_decaf_448_precomputed_wnafs __attribute((visibility("hidden"))) = sizeof(niels_t)<<5; | |||
void decaf_448_precompute_wnafs ( | |||
niels_t out[1<<5], | |||
const decaf_448_point_t base | |||
) __attribute__ ((visibility ("hidden"))); | |||
void decaf_448_precompute_wnafs ( | |||
niels_t out[1<<5], | |||
const decaf_448_point_t base | |||
) { | |||
// TODO MAGIC | |||
pniels_t tmp[1<<5]; | |||
gf zs[1<<5], zis[1<<5]; | |||
int i; | |||
prepare_wnaf_table(tmp,base,5); | |||
for (i=0; i<1<<5; i++) { | |||
memcpy(out[i], tmp[i]->n, sizeof(niels_t)); | |||
gf_cpy(zs[i], tmp[i]->z); | |||
} | |||
batch_normalize_niels(out, zs, zis, 1<<5); | |||
} | |||
void decaf_448_base_double_scalarmul_non_secret ( | |||
decaf_448_point_t combo, | |||
const decaf_448_scalar_t scalar1, | |||
const decaf_448_point_t base2, | |||
const decaf_448_scalar_t scalar2 | |||
) { | |||
int i; | |||
unsigned j,k; | |||
const unsigned int n = 5, t = 5; | |||
const int s = 18; // TODO MAGIC | |||
const int table_bits_var = 3, table_bits_pre = 5; // TODO MAGIC | |||
struct smvt_control control_var[DECAF_448_SCALAR_BITS/(table_bits_var+1)+3]; | |||
struct smvt_control control_pre[DECAF_448_SCALAR_BITS/(table_bits_pre+1)+3]; | |||
decaf_448_scalar_t scalar1x; | |||
decaf_448_scalar_add(scalar1x, scalar1, decaf_448_precomputed_scalarmul_adjustment); | |||
decaf_448_halve(scalar1x,scalar1x,decaf_448_scalar_p); | |||
decaf_448_point_copy(combo, base2); | |||
const int table_bits = 4; // TODO MAGIC | |||
struct smvt_control control[DECAF_448_SCALAR_BITS/(table_bits+1)+3]; | |||
int control_bits = recode_wnaf(control, scalar2, table_bits); | |||
int ncb_pre = recode_wnaf(control_pre, scalar1, table_bits_pre); | |||
int ncb_var = recode_wnaf(control_var, scalar2, table_bits_var); | |||
pniels_t precmp[1<<table_bits]; | |||
prepare_wnaf_table(precmp, combo, table_bits); | |||
decaf_448_point_copy(combo, decaf_448_point_identity); | |||
pniels_t precmp_var[1<<table_bits_var]; | |||
prepare_wnaf_table(precmp_var, base2, table_bits_var); | |||
int contp=0, contv=0, i = control_var[0].power; | |||
int conti = 0; | |||
for (i = control[0].power; i >= 0; i--) { | |||
if (i < 0) { | |||
decaf_448_point_copy(combo, decaf_448_point_identity); | |||
return; | |||
} else if (i > control_pre[0].power) { | |||
pniels_to_pt(combo, precmp_var[control_var[0].addend >> 1]); | |||
contv++; | |||
} else if (i == control_pre[0].power && i >=0 ) { | |||
pniels_to_pt(combo, precmp_var[control_var[0].addend >> 1]); | |||
add_niels_to_pt(combo, decaf_448_wnaf_base[control_pre[0].addend >> 1], i); | |||
contv++; contp++; | |||
} else { | |||
i = control_pre[0].power; | |||
niels_to_pt(combo, decaf_448_wnaf_base[control_pre[0].addend >> 1]); | |||
contp++; | |||
} | |||
for (i--; i >= 0; i--) { | |||
int cv = (i==control_var[contv].power), cp = (i==control_pre[contp].power); | |||
decaf_448_point_double_internal(combo,combo,i && !(cv||cp)); | |||
if (i == control[conti].power) { | |||
decaf_448_point_double_internal(combo,combo,0); | |||
assert(control[conti].addend); | |||
if (cv) { | |||
assert(control_var[contv].addend); | |||
if (control[conti].addend > 0) { | |||
add_pniels_to_pt(combo, precmp[control[conti].addend >> 1], i>=s); // TODO PERF: internal | |||
if (control_var[contv].addend > 0) { | |||
add_pniels_to_pt(combo, precmp_var[control_var[contv].addend >> 1], i&&!cp); | |||
} else { | |||
sub_pniels_from_pt(combo, precmp[(-control[conti].addend) >> 1], i>=s); // TODO PERF: internal | |||
sub_pniels_from_pt(combo, precmp_var[(-control_var[contv].addend) >> 1], i&&!cp); | |||
} | |||
conti++; | |||
assert(conti <= control_bits); | |||
} else { | |||
decaf_448_point_double_internal(combo,combo,i>=s); | |||
contv++; | |||
} | |||
if (i < s) { | |||
/* comb component */ | |||
for (j=0; j<n; j++) { | |||
int tab = 0; | |||
for (k=0; k<t; k++) { | |||
unsigned int bit = i + s*(k + j*t); | |||
if (bit < SCALAR_WORDS * WBITS) { | |||
tab |= (scalar1x->limb[bit/WBITS] >> (bit%WBITS) & 1) << k; | |||
} | |||
} | |||
decaf_bool_t invert = (tab>>(t-1))-1; | |||
tab ^= invert; | |||
tab &= (1<<(t-1)) - 1; | |||
if (invert) { | |||
sub_niels_from_pt(combo, decaf_448_precomputed_base->table[(j<<(t-1)) + tab], j==n-1 && i); | |||
} else { | |||
add_niels_to_pt(combo, decaf_448_precomputed_base->table[(j<<(t-1)) + tab], j==n-1 && i); | |||
} | |||
if (cp) { | |||
assert(control_pre[contp].addend); | |||
if (control_pre[contp].addend > 0) { | |||
add_niels_to_pt(combo, decaf_448_wnaf_base[control_pre[contp].addend >> 1], i); | |||
} else { | |||
sub_niels_from_pt(combo, decaf_448_wnaf_base[(-control_pre[contp].addend) >> 1], i); | |||
} | |||
contp++; | |||
} | |||
} | |||
assert(contv == ncb_var); (void)ncb_var; | |||
assert(contp == ncb_pre); (void)ncb_pre; | |||
} |
@@ -18,6 +18,15 @@ const decaf_word_t decaf_448_precomputed_base_as_words[1]; | |||
const decaf_448_scalar_t decaf_448_precomputed_scalarmul_adjustment; | |||
const decaf_448_scalar_t decaf_448_point_scalarmul_adjustment; | |||
/* Opaque here: only pointers to niels_s are passed around in this file. */
struct niels_s; | |||
/* NOTE(review): defines a pointer (not an array) under the table's symbol
 * name -- presumably a placeholder so this program links without the table
 * it is about to generate; confirm. */
const decaf_word_t *decaf_448_precomputed_wnaf_as_words; | |||
/* Byte size of the wNAF table, defined in the decaf implementation. */
extern const size_t sizeof_decaf_448_precomputed_wnafs; | |||
/* Fills `out` with the wNAF table computed from `base`. */
void decaf_448_precompute_wnafs ( | |||
struct niels_s *out, | |||
const decaf_448_point_t base | |||
); | |||
static void scalar_print(const char *name, const decaf_448_scalar_t sc) { | |||
printf("const decaf_448_scalar_t %s = {{{\n", name); | |||
unsigned i; | |||
@@ -36,6 +45,11 @@ int main(int argc, char **argv) { | |||
if (ret || !pre) return 1; | |||
decaf_448_precompute(pre, decaf_448_point_base); | |||
struct niels_s *preWnaf; | |||
ret = posix_memalign((void**)&preWnaf, alignof_decaf_448_precomputed_s, sizeof_decaf_448_precomputed_wnafs); | |||
if (ret || !preWnaf) return 1; | |||
decaf_448_precompute_wnafs(preWnaf, decaf_448_point_base); | |||
const decaf_word_t *output = (const decaf_word_t *)pre; | |||
unsigned i; | |||
@@ -43,7 +57,7 @@ int main(int argc, char **argv) { | |||
printf("#include \"decaf.h\"\n\n"); | |||
printf("const decaf_word_t decaf_448_precomputed_base_as_words[%d]\n", | |||
(int)(sizeof_decaf_448_precomputed_s / sizeof(decaf_word_t))); | |||
printf("__attribute__((aligned(%d))) = {\n ", (int)alignof_decaf_448_precomputed_s); | |||
printf("__attribute__((aligned(%d),visibility(\"hidden\"))) = {\n ", (int)alignof_decaf_448_precomputed_s); | |||
for (i=0; i < sizeof_decaf_448_precomputed_s; i+=sizeof(decaf_word_t)) { | |||
if (i && (i%8==0)) printf(",\n "); | |||
@@ -53,6 +67,18 @@ int main(int argc, char **argv) { | |||
} | |||
printf("\n};\n"); | |||
output = (const decaf_word_t *)preWnaf; | |||
printf("const decaf_word_t decaf_448_precomputed_wnaf_as_words[%d]\n", | |||
(int)(sizeof_decaf_448_precomputed_wnafs / sizeof(decaf_word_t))); | |||
printf("__attribute__((aligned(%d),visibility(\"hidden\"))) = {\n ", (int)alignof_decaf_448_precomputed_s); | |||
for (i=0; i < sizeof_decaf_448_precomputed_wnafs; i+=sizeof(decaf_word_t)) { | |||
if (i && (i%8==0)) printf(",\n "); | |||
else if (i) printf(", "); | |||
printf("0x%0*llxull", (int)sizeof(decaf_word_t)*2, (unsigned long long)*output ); | |||
output++; | |||
} | |||
printf("\n};\n"); | |||
decaf_448_scalar_t smadj; | |||
decaf_448_scalar_copy(smadj,decaf_448_scalar_one); | |||