From 38455f34f2319686138859353ac669f8816d8683 Mon Sep 17 00:00:00 2001 From: Michael Hamburg Date: Fri, 15 Jan 2016 13:35:04 -0800 Subject: [PATCH] one ser/deser to rule them all (TODO test on NEON and other places with LIMBPERM) --- src/decaf.c | 126 ++++++++++++++++-------------- src/gen_headers/f_field_h.py | 11 ++- src/p25519/arch_32/f_impl.c | 77 +++--------------- src/p25519/arch_32/f_impl.h | 5 +- src/p25519/arch_ref64/f_impl.c | 42 ---------- src/p25519/arch_ref64/f_impl.h | 2 + src/p25519/arch_x86_64/f_impl.c | 42 ---------- src/p25519/arch_x86_64/f_impl.h | 2 + src/p25519/f_arithmetic.c | 7 +- src/p448/arch_32/f_impl.c | 50 ------------ src/p448/arch_32/f_impl.h | 2 + src/p448/arch_arm_32/f_impl.c | 60 +------------- src/p448/arch_arm_32/f_impl.h | 2 + src/p448/arch_neon/f_impl.c | 1 - src/p448/arch_neon/f_impl.h | 2 + src/p448/arch_ref64/f_impl.h | 2 + src/p448/arch_x86_64/f_impl.c | 48 ------------ src/p448/arch_x86_64/f_impl.h | 1 + src/p448/f_arithmetic.c | 5 ++ src/p480/arch_x86_64/f_impl.c | 62 --------------- src/p480/arch_x86_64/f_impl.h | 8 +- src/p480/f_arithmetic.c | 6 ++ src/p521/arch_ref64/f_impl.c | 46 ----------- src/p521/arch_ref64/f_impl.h | 2 + src/p521/arch_x86_64_r12/f_impl.c | 48 ------------ src/p521/arch_x86_64_r12/f_impl.h | 1 + src/p521/f_arithmetic.c | 6 ++ 27 files changed, 136 insertions(+), 530 deletions(-) diff --git a/src/decaf.c b/src/decaf.c index a690678..7a6e08d 100644 --- a/src/decaf.c +++ b/src/decaf.c @@ -86,12 +86,45 @@ const size_t API_NS2(alignof,precomputed_s) = 32; #define UNROLL #endif -#define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; ilimb[LIMBPERM(j)]) << fill; + fill += LIMB_PLACE_VALUE(LIMBPERM(j)); + j++; + } + serial[i] = buffer; + fill -= 8; + buffer >>= 8; + } +} -/** Copy x = y */ -static INLINE void -gf_cpy(gf x, const gf y) { x[0] = y[0]; } +mask_t gf_deserialize (gf x, const uint8_t serial[SER_BYTES]) { + unsigned int j=0, fill=0; + dword_t buffer = 0; + dsword_t scarry = 0; + UNROLL for (unsigned int i=0; ilimb[LIMBPERM(i)] = (i>= LIMB_PLACE_VALUE(LIMBPERM(i)); + scarry = (scarry + x->limb[LIMBPERM(i)] - MODULUS->limb[LIMBPERM(i)]) >> (8*sizeof(word_t)); + } + return word_is_zero(buffer) & ~word_is_zero(scarry); +} /** Constant time, x = is_z ? z : y */ static INLINE void @@ -120,9 +153,7 @@ cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) { /** Compare a==b */ /* Not static because it's used in inverse square root. */ decaf_word_t gf_eq(const gf a, const gf b); - -decaf_word_t -gf_eq(const gf a, const gf b) { +decaf_word_t gf_eq(const gf a, const gf b) { gf c; gf_sub(c,a,b); gf_strong_reduce(c); @@ -153,13 +184,10 @@ gf_invert(gf y, const gf x) { (void)ret; assert(ret); gf_sqr(t1, t2); gf_mul(t2, t1, x); // not direct to y in case of alias. - gf_cpy(y, t2); + gf_copy(y, t2); } -/** - * Mul by signed int. Not constant-time WRT the sign of that int. - * Just uses a full mul (PERF) - */ +/** Mul by signed int. Not constant-time WRT the sign of that int. */ static INLINE void gf_mulw_sgn(gf c, const gf a, int w) { if (w>0) { @@ -182,7 +210,7 @@ static decaf_word_t hibit(const gf x) { /** Return high bit of x = low bit of 2x mod p */ static decaf_word_t lobit(const gf x) { gf y; - gf_cpy(y,x); + gf_copy(y,x); gf_strong_reduce(y); return -(y->limb[0]&1); } @@ -394,16 +422,9 @@ API_NS(scalar_eq) ( return word_is_zero(diff); } -/* *** API begins here *** */ - /** identity = (0,1) */ const point_t API_NS(point_identity) = {{{{{0}}},{{{1}}},{{{1}}},{{{0}}}}}; -static void -gf_encode ( unsigned char ser[SER_BYTES], gf a ) { - gf_serialize(ser, (gf_s *)a); -} - static void deisogenize ( gf_s *__restrict__ s, @@ -508,14 +529,7 @@ deisogenize ( void API_NS(point_encode)( unsigned char ser[SER_BYTES], const point_t p ) { gf s, mtos; deisogenize(s,mtos,p,0,0,0); - gf_encode ( ser, s ); -} - -/** - * Deserialize a field element, return TRUE if < p. - */ -static decaf_bool_t gf_deser(gf s, const unsigned char ser[SER_BYTES]) { - return gf_deserialize((gf_s *)s, ser); + gf_serialize ( ser, s ); } decaf_error_t API_NS(point_decode) ( @@ -524,7 +538,7 @@ decaf_error_t API_NS(point_decode) ( decaf_bool_t allow_identity ) { gf s, a, b, c, d, e, f; - decaf_bool_t succ = gf_deser(s, ser), zero = gf_eq(s, ZERO); + decaf_bool_t succ = gf_deserialize(s, ser), zero = gf_eq(s, ZERO); allow_identity = ~word_is_zero(allow_identity); succ &= allow_identity | ~zero; succ &= ~hibit(s); @@ -592,8 +606,6 @@ decaf_error_t API_NS(point_decode) ( #define NEG_D 0 #endif - - void API_NS(point_sub) ( point_t p, const point_t q, @@ -688,8 +700,8 @@ void API_NS(point_negate) ( const point_t a ) { gf_sub(nega->x, ZERO, a->x); - gf_cpy(nega->y, a->y); - gf_cpy(nega->z, a->z); + gf_copy(nega->y, a->y); + gf_copy(nega->z, a->z); gf_sub(nega->t, ZERO, a->t); } @@ -827,7 +839,7 @@ niels_to_pt ( gf_add ( e->y, n->b, n->a ); gf_sub ( e->x, n->b, n->a ); gf_mul ( e->t, e->y, e->x ); - gf_cpy ( e->z, ONE ); + gf_copy ( e->z, ONE ); } static NOINLINE void @@ -882,7 +894,7 @@ add_pniels_to_pt ( ) { gf L0; gf_mul ( L0, p->z, pn->z ); - gf_cpy ( p->z, L0 ); + gf_copy ( p->z, L0 ); add_niels_to_pt( p, pn->n, before_double ); } @@ -894,7 +906,7 @@ sub_pniels_from_pt ( ) { gf L0; gf_mul ( L0, p->z, pn->z ); - gf_cpy ( p->z, L0 ); + gf_copy ( p->z, L0 ); sub_niels_from_pt( p, pn->n, before_double ); } @@ -1203,7 +1215,7 @@ void API_NS(point_from_hash_nonuniform) ( // TODO: simplify since we don't return a hint anymore // TODO: test pathological case ur0^2 = 1/(1-d) gf r0,r,a,b,c,dee,D,N,rN,e; - gf_deser(r0,ser); + gf_deserialize(r0,ser); gf_strong_reduce(r0); gf_sqr(a,r0); #if P_MOD_8 == 5 @@ -1265,7 +1277,7 @@ void API_NS(point_from_hash_nonuniform) ( /* isogenize */ #if IMAGINE_TWIST gf_mul(c,a,SQRT_MINUS_ONE); - gf_cpy(a,c); + gf_copy(a,c); #endif gf_sqr(c,a); /* s^2 */ @@ -1326,7 +1338,7 @@ API_NS(invert_elligator_nonuniform) ( succ &= ~(is_identity & sgn_ed_T); /* NB: there are no preimages of rotated identity. */ #endif - gf_encode(recovered_hash, b); + gf_serialize(recovered_hash, b); /* TODO: deal with overflow flag */ return decaf_succeed_if(succ); } @@ -1380,14 +1392,14 @@ void API_NS(point_debugging_torque) ( gf tmp; gf_mul(tmp,p->x,SQRT_MINUS_ONE); gf_mul(q->x,p->y,SQRT_MINUS_ONE); - gf_cpy(q->y,tmp); - gf_cpy(q->z,p->z); + gf_copy(q->y,tmp); + gf_copy(q->z,p->z); gf_sub(q->t,ZERO,p->t); #else gf_sub(q->x,ZERO,p->x); gf_sub(q->y,ZERO,p->y); - gf_cpy(q->z,p->z); - gf_cpy(q->t,p->t); + gf_copy(q->z,p->z); + gf_copy(q->t,p->t); #endif } @@ -1397,16 +1409,16 @@ void API_NS(point_debugging_pscale) ( const uint8_t factor[SER_BYTES] ) { gf gfac,tmp; - ignore_result(gf_deser(gfac,factor)); + ignore_result(gf_deserialize(gfac,factor)); cond_sel(gfac,gfac,ONE,gf_eq(gfac,ZERO)); gf_mul(tmp,p->x,gfac); - gf_cpy(q->x,tmp); + gf_copy(q->x,tmp); gf_mul(tmp,p->y,gfac); - gf_cpy(q->y,tmp); + gf_copy(q->y,tmp); gf_mul(tmp,p->z,gfac); - gf_cpy(q->z,tmp); + gf_copy(q->z,tmp); gf_mul(tmp,p->t,gfac); - gf_cpy(q->t,tmp); + gf_copy(q->t,tmp); } static void gf_batch_invert ( @@ -1417,7 +1429,7 @@ static void gf_batch_invert ( gf t1; assert(n>1); - gf_cpy(out[1], in[0]); + gf_copy(out[1], in[0]); int i; for (i=1; i<(int) (n-1); i++) { gf_mul(out[i+1], out[i], in[i]); @@ -1428,9 +1440,9 @@ static void gf_batch_invert ( for (i=n-1; i>0; i--) { gf_mul(t1, out[i], out[0]); - gf_cpy(out[i], t1); + gf_copy(out[i], t1); gf_mul(t1, out[0], in[i]); - gf_cpy(out[0], t1); + gf_copy(out[0], t1); } } @@ -1447,15 +1459,15 @@ static void batch_normalize_niels ( for (i=0; ia, zis[i]); gf_strong_reduce(product); - gf_cpy(table[i]->a, product); + gf_copy(table[i]->a, product); gf_mul(product, table[i]->b, zis[i]); gf_strong_reduce(product); - gf_cpy(table[i]->b, product); + gf_copy(table[i]->b, product); gf_mul(product, table[i]->c, zis[i]); gf_strong_reduce(product); - gf_cpy(table[i]->c, product); + gf_copy(table[i]->c, product); } decaf_bzero(product,sizeof(product)); @@ -1500,7 +1512,7 @@ void API_NS(precompute) ( pt_to_pniels(pn_tmp, start); memcpy(table->table[idx], pn_tmp->n, sizeof(pn_tmp->n)); - gf_cpy(zs[idx], pn_tmp->z); + gf_copy(zs[idx], pn_tmp->z); if (j >= (1u<<(t-1)) - 1) break; int delta = (j+1) ^ ((j+1)>>1) ^ gray; @@ -1733,7 +1745,7 @@ void API_NS(precompute_wnafs) ( prepare_wnaf_table(tmp,base,DECAF_WNAF_FIXED_TABLE_BITS); for (i=0; i<1<n, sizeof(niels_t)); - gf_cpy(zs[i], tmp[i]->z); + gf_copy(zs[i], tmp[i]->z); } batch_normalize_niels(out, (const gf *)zs, zis, 1<limb[0] += (a->limb[9]>>25)*19; - a->limb[9] &= masko; + a->limb[9] &= LIMB_MASK(9); /* now the total is less than 2p */ /* compute total_value - p. No need to reduce mod p. */ - int64_t scarry = 0; - int i; - for (i=0; i<10; /*i+=2*/) { - scarry = scarry + a->limb[i] - ((i==0)?maske-18:maske); - a->limb[i] = scarry & maske; - scarry >>= 26; - i++; - - scarry = scarry + a->limb[i] - masko; - a->limb[i] = scarry & masko; - scarry >>= 25; - i++; + dsword_t scarry = 0; + for (unsigned int i=0; i<10; i++) { + scarry = scarry + a->limb[i] - MODULUS->limb[i]; + a->limb[i] = scarry & LIMB_MASK(i); + scarry >>= LIMB_PLACE_VALUE(i); } /* uncommon case: it was >= p, so now scarry = 0 and this = x * common case: it was < p, so now scarry = -1 and this = x - p + 2^255 * so let's add back in p. will carry back off the top for 2^255. */ - assert(word_is_zero(scarry) | word_is_zero(scarry+1)); - uint32_t scarry_masko = scarry & masko, scarry_maske = scarry & maske; - uint64_t carry = 0; + word_t scarry_0 = scarry; + dword_t carry = 0; /* add it back */ - for (i=0; i<10; /*i+=2*/) { - carry = carry + a->limb[i] + ((i==0)?(scarry_maske&~18):scarry_maske); - a->limb[i] = carry & maske; - carry >>= 26; - i++; - - carry = carry + a->limb[i] + scarry_masko; - a->limb[i] = carry & masko; - carry >>= 25; + for (unsigned int i=0; i<10; i++) { + carry = carry + a->limb[i] + (scarry_0 & MODULUS->limb[i]); + a->limb[i] = carry & LIMB_MASK(i); + carry >>= LIMB_PLACE_VALUE(i); i++; } - assert(word_is_zero(carry + scarry)); -} - -#define LIMB_PLACE_VALUE(i) (((i)&1)?25:26) -void gf_serialize (uint8_t serial[32], const gf x) { - gf red; - gf_copy(red, x); - gf_strong_reduce(red); - unsigned int j=0, fill=0; - dword_t buffer = 0; - for (unsigned int i=0; i<32; i++) { - if (fill < 8 && j < sizeof(red->limb)/sizeof(red->limb[0])) { - buffer |= ((dword_t)red->limb[j]) << fill; - fill += LIMB_PLACE_VALUE(j); - j++; - } - serial[i] = buffer; - fill -= 8; - buffer >>= 8; - } + assert(word_is_zero(carry + scarry_0)); } -mask_t gf_deserialize (gf x, const uint8_t serial[32]) { - unsigned int j=0, fill=0; - dword_t buffer = 0; - for (unsigned int i=0; i<32; i++) { - buffer |= ((dword_t)serial[i]) << fill; - fill += 8; - if (fill >= LIMB_PLACE_VALUE(j) || i == 31) { - assert(j < sizeof(x->limb)/sizeof(x->limb[0])); - word_t mask = ((1ull)<limb[j] = (i==31) ? buffer : (buffer & mask); // FIXME: this can in theory truncate the buffer if it's not in field. - buffer >>= LIMB_PLACE_VALUE(j); - fill -= LIMB_PLACE_VALUE(j); - j++; - } - } - return -1; // FIXME: test whether in field. -} diff --git a/src/p25519/arch_32/f_impl.h b/src/p25519/arch_32/f_impl.h index 5e51bf0..f917fa0 100644 --- a/src/p25519/arch_32/f_impl.h +++ b/src/p25519/arch_32/f_impl.h @@ -3,8 +3,9 @@ */ #define LIMB(x) (x##ull)&((1ull<<26)-1), (x##ull)>>26 -#define FIELD_LITERAL(a,b,c,d,e) \ - {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e)}} +#define FIELD_LITERAL(a,b,c,d,e) {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e)}} + +#define LIMB_PLACE_VALUE(i) (((i)&1)?25:26) void gf_add_RAW (gf out, const gf a, const gf b) { for (unsigned int i=0; i<10; i++) { diff --git a/src/p25519/arch_ref64/f_impl.c b/src/p25519/arch_ref64/f_impl.c index 414fd66..ec98829 100644 --- a/src/p25519/arch_ref64/f_impl.c +++ b/src/p25519/arch_ref64/f_impl.c @@ -97,45 +97,3 @@ void gf_strong_reduce (gf a) { assert(word_is_zero(carry + scarry)); } - -void gf_serialize (uint8_t serial[32], const gf x) { - int i,j; - gf red; - gf_copy(red, x); - gf_strong_reduce(red); - uint64_t *r = red->limb; - uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12}; - for (i=0; i<4; i++) { - for (j=0; j<8; j++) { - serial[8*i+j] = ser64[i]; - ser64[i] >>= 8; - } - } -} - -mask_t gf_deserialize (gf x, const uint8_t serial[32]) { - int i,j; - uint64_t ser64[4], mask = ((1ull<<51)-1); - for (i=0; i<4; i++) { - uint64_t out = 0; - for (j=0; j<8; j++) { - out |= ((uint64_t)serial[8*i+j])<<(8*j); - } - ser64[i] = out; - } - - /* Test for >= 2^255-19 */ - uint64_t ge = -(((__uint128_t)ser64[0]+19)>>64); - ge &= ser64[1]; - ge &= ser64[2]; - ge &= (ser64[3]<<1) + 1; - ge |= -(((__uint128_t)ser64[3]+0x8000000000000000)>>64); - - x->limb[0] = ser64[0] & mask; - x->limb[1] = (ser64[0]>>51 | ser64[1]<<13) & mask; - x->limb[2] = (ser64[1]>>38 | ser64[2]<<26) & mask; - x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask; - x->limb[4] = ser64[3]>>12; - - return ~word_is_zero(~ge); -} diff --git a/src/p25519/arch_ref64/f_impl.h b/src/p25519/arch_ref64/f_impl.h index dcd097d..c4c472f 100644 --- a/src/p25519/arch_ref64/f_impl.h +++ b/src/p25519/arch_ref64/f_impl.h @@ -4,6 +4,8 @@ #define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }} +#define LIMB_PLACE_VALUE(i) 51 + void gf_add_RAW (gf out, const gf a, const gf b) { for (unsigned int i=0; i<5; i++) { out->limb[i] = a->limb[i] + b->limb[i]; diff --git a/src/p25519/arch_x86_64/f_impl.c b/src/p25519/arch_x86_64/f_impl.c index 0b02519..81af981 100644 --- a/src/p25519/arch_x86_64/f_impl.c +++ b/src/p25519/arch_x86_64/f_impl.c @@ -208,45 +208,3 @@ void gf_strong_reduce (gf a) { assert(word_is_zero(carry + scarry)); } - -void gf_serialize (uint8_t serial[32], const gf x) { - int i,j; - gf red; - gf_copy(red, x); - gf_strong_reduce(red); - uint64_t *r = red->limb; - uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12}; - for (i=0; i<4; i++) { - for (j=0; j<8; j++) { - serial[8*i+j] = ser64[i]; - ser64[i] >>= 8; - } - } -} - -mask_t gf_deserialize (gf x, const uint8_t serial[32]) { - int i,j; - uint64_t ser64[4], mask = ((1ull<<51)-1); - for (i=0; i<4; i++) { - uint64_t out = 0; - for (j=0; j<8; j++) { - out |= ((uint64_t)serial[8*i+j])<<(8*j); - } - ser64[i] = out; - } - - /* Test for >= 2^255-19 */ - uint64_t ge = -(((__uint128_t)ser64[0]+19)>>64); - ge &= ser64[1]; - ge &= ser64[2]; - ge &= (ser64[3]<<1) + 1; - ge |= -(((__uint128_t)ser64[3]+0x8000000000000000)>>64); - - x->limb[0] = ser64[0] & mask; - x->limb[1] = (ser64[0]>>51 | ser64[1]<<13) & mask; - x->limb[2] = (ser64[1]>>38 | ser64[2]<<26) & mask; - x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask; - x->limb[4] = ser64[3]>>12; - - return ~word_is_zero(~ge); -} diff --git a/src/p25519/arch_x86_64/f_impl.h b/src/p25519/arch_x86_64/f_impl.h index 3461a6c..647f966 100644 --- a/src/p25519/arch_x86_64/f_impl.h +++ b/src/p25519/arch_x86_64/f_impl.h @@ -4,6 +4,8 @@ #define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }} +#define LIMB_PLACE_VALUE(i) 51 + void gf_add_RAW (gf out, const gf a, const gf b) { for (unsigned int i=0; i<5; i++) { out->limb[i] = a->limb[i] + b->limb[i]; diff --git a/src/p25519/f_arithmetic.c b/src/p25519/f_arithmetic.c index a3749d6..f348307 100644 --- a/src/p25519/f_arithmetic.c +++ b/src/p25519/f_arithmetic.c @@ -18,14 +18,17 @@ const gf_25519_t P25519_SQRT_MINUS_ONE = {FIELD_LITERAL( 0x78595a6804c9e, 0x2b8324804fc1d )}; + +const gf MODULUS = {FIELD_LITERAL( + 0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff +)}; /* TODO put in header */ extern const gf_25519_t decaf_255_ONE; extern mask_t decaf_255_gf_eq(const gf_25519_t a, const gf_25519_t b); /* Guarantee: a^2 x = 0 if x = 0; else a^2 x = 1 or SQRT_MINUS_ONE; */ -void -gf_isr ( +void gf_isr ( gf_25519_t a, const gf_25519_t x ) { diff --git a/src/p448/arch_32/f_impl.c b/src/p448/arch_32/f_impl.c index 24e8fe2..1131def 100644 --- a/src/p448/arch_32/f_impl.c +++ b/src/p448/arch_32/f_impl.c @@ -142,53 +142,3 @@ void gf_strong_reduce (gf a) { assert(word_is_zero(carry + scarry)); } - -void gf_serialize (uint8_t *serial, const gf x) { - int i,j; - gf red; - gf_copy(red, x); - gf_strong_reduce(red); - for (i=0; i<8; i++) { - uint64_t limb = red->limb[2*i] + (((uint64_t)red->limb[2*i+1])<<28); - for (j=0; j<7; j++) { - serial[7*i+j] = limb; - limb >>= 8; - } - assert(limb == 0); - } -} - -mask_t gf_deserialize (gf x, const uint8_t serial[56]) { - int i,j; - for (i=0; i<8; i++) { - uint64_t out = 0; - for (j=0; j<7; j++) { - out |= ((uint64_t)serial[7*i+j])<<(8*j); - } - x->limb[2*i] = out & ((1ull<<28)-1); - x->limb[2*i+1] = out >> 28; - } - - /* Check for reduction. - * - * The idea is to create a variable ge which is all ones (rather, 56 ones) - * if and only if the low $i$ words of $x$ are >= those of p. - * - * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) - */ - uint32_t ge = -1, mask = (1ull<<28)-1; - for (i=0; i<8; i++) { - ge &= x->limb[i]; - } - - /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ - ge = (ge & (x->limb[8] + 1)) | word_is_zero(x->limb[8] ^ mask); - - /* Propagate the rest */ - for (i=9; i<16; i++) { - ge &= x->limb[i]; - } - - return ~word_is_zero(ge ^ mask); -} - diff --git a/src/p448/arch_32/f_impl.h b/src/p448/arch_32/f_impl.h index a82452f..330a29c 100644 --- a/src/p448/arch_32/f_impl.h +++ b/src/p448/arch_32/f_impl.h @@ -5,6 +5,8 @@ #define LIMB(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) \ {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}} + +#define LIMB_PLACE_VALUE(i) 28 void gf_add_RAW (gf out, const gf a, const gf b) { for (unsigned int i=0; i> 28; } -void gf_strong_reduce ( - gf a -) { +void gf_strong_reduce (gf a) { word_t mask = (1ull<<28)-1; /* first, clear high */ @@ -875,59 +873,3 @@ void gf_strong_reduce ( assert(word_is_zero(carry + scarry)); } - -void gf_serialize ( - uint8_t *serial, - const gf x -) { - int i,j; - gf red; - gf_copy(red, x); - gf_strong_reduce(red); - for (i=0; i<8; i++) { - uint64_t limb = red->limb[2*i] + (((uint64_t)red->limb[2*i+1])<<28); - for (j=0; j<7; j++) { - serial[7*i+j] = limb; - limb >>= 8; - } - assert(limb == 0); - } -} - -mask_t -gf_deserialize ( - gf x, - const uint8_t serial[56] -) { - int i,j; - for (i=0; i<8; i++) { - uint64_t out = 0; - for (j=0; j<7; j++) { - out |= ((uint64_t)serial[7*i+j])<<(8*j); - } - x->limb[2*i] = out & ((1ull<<28)-1); - x->limb[2*i+1] = out >> 28; - } - - /* Check for reduction. - * - * The idea is to create a variable ge which is all ones (rather, 56 ones) - * if and only if the low $i$ words of $x$ are >= those of p. - * - * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) - */ - uint32_t ge = -1, mask = (1ull<<28)-1; - for (i=0; i<8; i++) { - ge &= x->limb[i]; - } - - /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ - ge = (ge & (x->limb[8] + 1)) | word_is_zero(x->limb[8] ^ mask); - - /* Propagate the rest */ - for (i=9; i<16; i++) { - ge &= x->limb[i]; - } - - return ~word_is_zero(ge ^ mask); -} diff --git a/src/p448/arch_arm_32/f_impl.h b/src/p448/arch_arm_32/f_impl.h index 4392012..e193c34 100644 --- a/src/p448/arch_arm_32/f_impl.h +++ b/src/p448/arch_arm_32/f_impl.h @@ -5,6 +5,8 @@ #define LIMB(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) \ {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}} + +#define LIMB_PLACE_VALUE(i) 28 void gf_add_RAW (gf out, const gf a, const gf b) { for (unsigned int i=0; ilimb[i]; - red->limb[i] >>= 8; - } - assert(red->limb[i] == 0); - } -} - -mask_t gf_deserialize (gf x, const uint8_t serial[56]) { - int i,j; - for (i=0; i<8; i++) { - word_t out = 0; - for (j=0; j<7; j++) { - out |= ((word_t)serial[7*i+j])<<(8*j); - } - x->limb[i] = out; - } - - /* Check for reduction. - * - * The idea is to create a variable ge which is all ones (rather, 56 ones) - * if and only if the low $i$ words of $x$ are >= those of p. - * - * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) - */ - word_t ge = -1, mask = (1ull<<56)-1; - for (i=0; i<4; i++) { - ge &= x->limb[i]; - } - - /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ - ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask); - - /* Propagate the rest */ - for (i=5; i<8; i++) { - ge &= x->limb[i]; - } - - return ~word_is_zero(ge ^ mask); -} - diff --git a/src/p448/arch_x86_64/f_impl.h b/src/p448/arch_x86_64/f_impl.h index a62e1b4..f69ba1f 100644 --- a/src/p448/arch_x86_64/f_impl.h +++ b/src/p448/arch_x86_64/f_impl.h @@ -3,6 +3,7 @@ */ #define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}} +#define LIMB_PLACE_VALUE(i) 56 void gf_add_RAW (gf out, const gf a, const gf b) { for (unsigned int i=0; i>= 8; - } - assert(r<16); - r += red.limb[i+1]<<4; - for (j=0; j<8; j++) { - serial[k++] = r; - r >>= 8; - } - assert(r==0); - } -} - -mask_t gf_deserialize (gf *x, const uint8_t serial[60]) { - int i,j,k=0; - - for (i=0; i<8; i+=2) { - word_t r = 0; - for (j=0; j<8; j++) { - r |= ((word_t)serial[k++])<<(8*j); - } - x->limb[i] = r & ((1ull<<60)-1); - r >>= 60; - for (j=0; j<7; j++) { - r |= ((word_t)serial[k++])<<(8*j+4); - } - x->limb[i+1] = r; - } - - /* Check for reduction. - * - * The idea is to create a variable ge which is all ones (rather, 60 ones) - * if and only if the low $i$ words of $x$ are >= those of p. - * - * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) - */ - word_t ge = -1, mask = (1ull<<60)-1; - for (i=0; i<4; i++) { - ge &= x->limb[i]; - } - - /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ - ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask); - - /* Propagate the rest */ - for (i=5; i<8; i++) { - ge &= x->limb[i]; - } - - return ~word_is_zero(ge ^ mask); -} - diff --git a/src/p480/arch_x86_64/f_impl.h b/src/p480/arch_x86_64/f_impl.h index d501eb3..272125f 100644 --- a/src/p480/arch_x86_64/f_impl.h +++ b/src/p480/arch_x86_64/f_impl.h @@ -2,6 +2,8 @@ * Released under the MIT License. See LICENSE.txt for license information. */ +#define LIMB_PLACE_VALUE(i) 60 + void gf_add_RAW (gf out, const gf a, const gf b) { for (unsigned int i=0; i= 8; bits -= 8) { - serial[k++] = r; - r >>= 8; - } - assert(bits <= 6); - } - assert(bits); - serial[k++] = r; -} - -mask_t gf_deserialize (gf x, const uint8_t serial[66]) { - int i,k=0,bits=0; - __uint128_t out = 0; - uint64_t mask = (1ull<<58)-1; - for (i=0; i<9; i++) { - out >>= 58; - for (; bits<58; bits+=8) { - out |= ((__uint128_t)serial[k++])<limb[i] = out & mask; - bits -= 58; - } - - /* Check for reduction. First, high has to be < 2^57 */ - mask_t good = word_is_zero(out>>57); - - uint64_t and = -1ull; - for (i=0; i<8; i++) { - and &= x->limb[i]; - } - and &= (2*out+1); - good &= word_is_zero((and+1)>>58); - - return good; -} diff --git a/src/p521/arch_ref64/f_impl.h b/src/p521/arch_ref64/f_impl.h index e9d631a..42a37e6 100644 --- a/src/p521/arch_ref64/f_impl.h +++ b/src/p521/arch_ref64/f_impl.h @@ -2,6 +2,8 @@ * Released under the MIT License. See LICENSE.txt for license information. */ +#define LIMB_PLACE_VALUE(i) 58 + void gf_add_RAW (gf out, const gf a, const gf b) { for (unsigned int i=0; i<9; i++) { out->limb[i] = a->limb[i] + b->limb[i]; diff --git a/src/p521/arch_x86_64_r12/f_impl.c b/src/p521/arch_x86_64_r12/f_impl.c index 2040531..8de3642 100644 --- a/src/p521/arch_x86_64_r12/f_impl.c +++ b/src/p521/arch_x86_64_r12/f_impl.c @@ -389,51 +389,3 @@ void gf_strong_reduce (gf *a) { a->limb[3] = a->limb[7] = a->limb[11] = 0; } - -void gf_serialize (uint8_t *serial, const struct gf *x) { - unsigned int i,k=0; - gf red; - gf_copy(&red, x); - gf_strong_reduce(&red); - - uint64_t r=0; - int bits = 0; - for (i=0; i<9; i++) { - r |= red.limb[LIMBPERM(i)] << bits; - for (bits += 58; bits >= 8; bits -= 8) { - serial[k++] = r; - r >>= 8; - } - assert(bits <= 6); - } - assert(bits); - serial[k++] = r; -} - -mask_t gf_deserialize (gf *x, const uint8_t serial[LIMBPERM(66)]) { - int i,k=0,bits=0; - __uint128_t out = 0; - uint64_t mask = (1ull<<58)-1; - for (i=0; i<9; i++) { - out >>= 58; - for (; bits<58; bits+=8) { - out |= ((__uint128_t)serial[k++])<limb[LIMBPERM(i)] = out & mask; - bits -= 58; - } - - /* Check for reduction. First, high has to be < 2^57 */ - mask_t good = word_is_zero(out>>57); - - uint64_t and = -1ull; - for (i=0; i<8; i++) { - and &= x->limb[LIMBPERM(i)]; - } - and &= (2*out+1); - good &= word_is_zero((and+1)>>58); - - x->limb[3] = x->limb[7] = x->limb[11] = 0; - - return good; -} diff --git a/src/p521/arch_x86_64_r12/f_impl.h b/src/p521/arch_x86_64_r12/f_impl.h index 434a114..4f9e965 100644 --- a/src/p521/arch_x86_64_r12/f_impl.h +++ b/src/p521/arch_x86_64_r12/f_impl.h @@ -4,6 +4,7 @@ /* FIXME: Currently this file desn't work at all, because the struct is declared [9] and not [12] */ #define LIMBPERM(x) (((x)%3)*4 + (x)/3) +#define LIMB_PLACE_VALUE(i) ((((i)&4)==3) ? 0 : 57) #define USE_P521_3x3_TRANSPOSE typedef uint64x4_t uint64x3_t; /* fit it in a vector register */ diff --git a/src/p521/f_arithmetic.c b/src/p521/f_arithmetic.c index 7ce39d8..a0c774a 100644 --- a/src/p521/f_arithmetic.c +++ b/src/p521/f_arithmetic.c @@ -10,6 +10,12 @@ #include "field.h" +const gf MODULUS = {FIELD_LITERAL( + 0x3ffffffffffffff, 0x3ffffffffffffff, 0x3ffffffffffffff, + 0x3ffffffffffffff, 0x3ffffffffffffff, 0x3ffffffffffffff, + 0x3ffffffffffffff, 0x3ffffffffffffff, 0x1ffffffffffffff +)}; + void gf_isr ( gf_a_t a,