|
|
@@ -4,41 +4,20 @@ |
|
|
|
|
|
|
|
#include "p521.h" |
|
|
|
|
|
|
|
typedef uint64x4_t uint64x3_t; /* fit it in a vector register */ |
|
|
|
static const uint64x3_t mask58 = { (1ull<<58) - 1, (1ull<<58) - 1, (1ull<<58) - 1, 0 }; |
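/*
 * Representation note (a summary, assuming the layout used throughout this
 * file): elements of GF(2^521 - 1) are kept in nine 58-bit limbs (522 bits),
 * padded to twelve 64-bit words so that each group of three limbs fits a
 * vector register; limbs 3, 7 and 11 stay zero.  Each lane therefore has
 * about 6 bits of headroom for carries, and mask58 extracts one 58-bit digit
 * per lane.
 */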
|
|
|
|
|
|
|
typedef struct { |
|
|
|
uint64x3_t lo, hi; |
|
|
|
} hexad_t; |
|
|
|
|
|
|
|
/* Currently requires CLANG. Sorry. */ |
|
|
|
static inline uint64x3_t timesW (uint64x3_t u) { |
|
|
|
return u.zxyw + u.zwww; |
|
|
|
} |
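/*
 * Why the shuffle above multiplies by w (a sketch, assuming each uint64x3_t
 * holds the coefficients of x + y*w + z*w^2 with w = 2^(3*58) = 2^174 and
 * p = 2^521 - 1):
 *
 *   w*(x + y*w + z*w^2) = x*w + y*w^2 + z*w^3,   w^3 = 2^522 == 2 (mod p),
 *
 * so the result has coefficients (2*z, x, y), which is exactly
 * u.zxyw = (z,x,y,0) plus u.zwww = (z,0,0,0).
 */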
|
|
|
|
|
|
|
/* Store the low three words of a vector. Currently requires AVX2 (TODO: remove) */
|
|
|
static const uint64x4_t ls_mask_3 = { -1ull, -1ull, -1ull, 0 }; |
|
|
|
static void store3 (uint64_t *x, uint64x3_t v) { |
|
|
|
_mm256_maskstore_epi64((long long *) x, ls_mask_3, v); |
|
|
|
} |
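/*
 * A possible non-AVX2 replacement for store3 (a sketch for the TODO above,
 * not used anywhere in this file): write the three low lanes with plain
 * scalar stores.
 */
#if 0
static void store3_generic (uint64_t *x, uint64x3_t v) {
    /* clang vector extensions allow per-lane subscripting */
    x[0] = v[0];
    x[1] = v[1];
    x[2] = v[2];
}
#endif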
|
|
|
typedef struct {
uint64x3_t lo, hi, hier;
} nonad_t;
|
|
|
|
|
|
|
static __inline__ uint64_t is_zero(uint64_t a) { |
|
|
|
/* let's hope the compiler isn't clever enough to optimize this. */ |
|
|
|
return (((__uint128_t)a)-1)>>64; |
|
|
|
} |
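/*
 * Worked values for the branch-free test above: if a == 0, then
 * ((__uint128_t)a) - 1 == 2^128 - 1 and the shift returns 2^64 - 1 (an
 * all-ones mask); for any a != 0, a - 1 fits in 64 bits and the shift
 * returns 0.
 */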
|
|
|
|
|
|
|
static __inline__ __uint128_t widemul( |
|
|
|
const uint64_t a, |
|
|
|
const uint64_t b |
|
|
|
) { |
|
|
|
return ((__uint128_t)a) * ((__uint128_t)b); |
|
|
|
} |
|
|
|
|
|
|
|
static inline __uint128_t widemulu(uint64_t a, uint64_t b) {
|
|
|
return ((__uint128_t)(a)) * b; |
|
|
|
} |
|
|
|
|
|
|
|
static inline __int128_t widemuls(int64_t a, int64_t b) {
|
|
|
return ((__int128_t)(a)) * b; |
|
|
|
} |
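/*
 * A minimal sketch of how the widening multiplies get used (illustration
 * only; dot2 is a hypothetical helper, the real accumulation happens in the
 * nonad/hexad routines below): products of 58-bit limbs are at most 116 bits,
 * so a __uint128_t accumulator keeps roughly 12 bits of headroom for summing
 * partial products before any carry handling is needed.
 */
#if 0
static __uint128_t dot2 (const uint64_t *a, const uint64_t *b) {
    /* hypothetical helper, not part of the library */
    return widemulu(a[0], b[1]) + widemulu(a[1], b[0]);
}
#endif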
|
|
|
|
|
|
@@ -48,8 +27,9 @@ static inline uint64_t opacify(uint64_t x) { |
|
|
|
return x; |
|
|
|
} |
|
|
|
|
|
|
|
/* These used to be hexads, which gave about 10% better performance, but they had overflow issues. */
static inline void nonad_mul (
nonad_t *hex,
|
|
|
const uint64_t *a, |
|
|
|
const uint64_t *b |
|
|
|
) { |
|
|
@@ -76,15 +56,13 @@ static inline void hexad_mul ( |
|
|
|
lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 }, |
|
|
|
hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 }; |
|
|
|
|
|
|
|
hex->hier = hi>>52;
hex->hi = (hi<<12)>>6 | lo>>58;
hex->lo = lo & mask58;
|
|
|
} |
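/*
 * What the lo/hi/hier split above computes, lane by lane (a scalar reading,
 * not extra code): each partial product x = lo + 2^64*hi is cut into three
 * 58-bit digits,
 *
 *   d0 = x mod 2^58          = lo & mask58
 *   d1 = (x >> 58) mod 2^58  = (hi<<12)>>6 | lo>>58
 *   d2 = x >> 116            = hi >> 52
 */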
|
|
|
|
|
|
|
static inline void hexad_mul_signed ( |
|
|
|
nonad_t *hex,
|
|
|
const int64_t *a, |
|
|
|
const int64_t *b |
|
|
|
) { |
|
|
@@ -111,15 +89,18 @@ static inline void hexad_mul_signed ( |
|
|
|
lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 }, |
|
|
|
hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 }; |
|
|
|
|
|
|
|
/*
hex->hier = (uint64x4_t)((int64x4_t)hi>>52);
hex->hi = (hi<<12)>>6 | lo>>58;
hex->lo = lo & mask58;
*/

hex->hi = hi<<6 | lo>>58;
hex->lo = lo & mask58;
|
|
|
} |
|
|
|
|
|
|
|
static inline void nonad_sqr (
nonad_t *hex,
|
|
|
const uint64_t *a |
|
|
|
) { |
|
|
|
__uint128_t xu, xv, xw; |
|
|
@@ -143,15 +124,13 @@ static inline void hexad_sqr ( |
|
|
|
lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 }, |
|
|
|
hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 }; |
|
|
|
|
|
|
|
hex->hier = hi>>52;
hex->hi = (hi<<12)>>6 | lo>>58;
hex->lo = lo & mask58;
|
|
|
} |
|
|
|
|
|
|
|
static inline void hexad_sqr_signed ( |
|
|
|
nonad_t *hex,
|
|
|
const int64_t *a |
|
|
|
) { |
|
|
|
__uint128_t xu, xv, xw; |
|
|
@@ -175,11 +154,15 @@ static inline void hexad_sqr_signed ( |
|
|
|
lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 }, |
|
|
|
hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 }; |
|
|
|
|
|
|
|
/*
hex->hier = (uint64x4_t)((int64x4_t)hi>>52);
hex->hi = (hi<<12)>>6 | lo>>58;
hex->lo = lo & mask58;
*/

hex->hi = hi<<6 | lo>>58;
hex->lo = lo & mask58;
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@@ -190,51 +173,83 @@ p521_mul ( |
|
|
|
const p521_t *as, |
|
|
|
const p521_t *bs |
|
|
|
) { |
|
|
|
int i; |
|
|
|
|
|
|
|
#if 0 |
|
|
|
assert(as->limb[3] == 0 && as->limb[7] == 0 && as->limb[11] == 0); |
|
|
|
assert(bs->limb[3] == 0 && bs->limb[7] == 0 && bs->limb[11] == 0); |
|
|
|
for (i=0; i<12; i++) { |
|
|
|
assert(as->limb[i] < 5ull<<57); |
|
|
|
assert(bs->limb[i] < 5ull<<57); |
|
|
|
} |
|
|
|
#endif |
|
|
|
|
|
|
|
/* Bounds on the hexads and nonads.
*
* Limbs < 2<<58 + ep.
* Nonad mul < 1<<58, 1<<58, tiny
*   -> t0 < (3,2,2)<<58 + tiny
*      t1,t2 < 2<<58 + tiny
*      * w < (4,2,2)
* Hexad mul < +- (5,4,3) * 4<<116 -> 2^58 lo, +- (5,4,3) * 4<<58 + ep
* TimesW < (2,1,1)<<58, (6,5,4)*4<<58 + ep
*
* ot0 = t0 + timesW(t2 + t1 - acdf.hi - bcef.lo)
*     == (3,2,2) + (4,2,2) + (4,2,2) +- (6,5,4)*4 - (1) << 58
*     in (-25, +35) << 58
*
*   uint64x3_t ot0 = t0 + timesW(t2 + t1 - acdf.hi - bcef.lo);
*   uint64x3_t ot1 = t0 + t1 - abde.lo + timesW(t2 - bcef.hi);
*   uint64x3_t ot2 = t0 + t1 + t2 - abde.hi - acdf.lo + vhi2;
*/
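/*
 * Shape of the multiply below (a sketch, assuming LIMBPERM stores limb i at
 * word (i%3)*4 + i/3, so words 4k..4k+2 hold group k): with w = 2^174,
 *
 *   A = G0 + 2^58*G1 + 2^116*G2,   Gk = a[4k] + a[4k+1]*w + a[4k+2]*w^2,
 *
 * and likewise H for b.  ad, be, cf are G0*H0, G1*H1, G2*H2; the cross terms
 * are recovered Karatsuba-style from the three signed products, e.g.
 *
 *   G0*H1 + G1*H0 = G0*H0 + G1*H1 - (G0-G1)*(H0-H1)   (abde).
 */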
|
|
|
|
|
|
|
|
|
|
|
uint64_t *c = cs->limb; |
|
|
|
const uint64_t *a = as->limb, *b = bs->limb; |
|
|
|
|
|
|
|
nonad_t ad, be, cf, abde, bcef, acdf;
nonad_mul(&ad, &a[0], &b[0]);
nonad_mul(&be, &a[4], &b[4]);
nonad_mul(&cf, &a[8], &b[8]);

uint64_t amt = 26;
uint64x3_t vhi = { amt*((1ull<<58)-1), amt*((1ull<<58)-1), amt*((1ull<<58)-1), 0 },
|
|
|
vhi2 = { 0, 0, -amt<<57, 0 }; |
|
|
|
|
|
|
|
uint64x3_t t2 = cf.lo + be.hi + ad.hier, t0 = ad.lo + timesW(cf.hi + be.hier) + vhi, t1 = ad.hi + be.lo + timesW(cf.hier);
|
|
|
|
|
|
|
int64_t ta[4] VECTOR_ALIGNED, tb[4] VECTOR_ALIGNED; |
|
|
|
// it seems to be faster not to vectorize these loops |
|
|
|
for (i=0; i<3; i++) { |
|
|
|
ta[i] = a[i]-a[i+4]; |
|
|
|
tb[i] = b[i]-b[i+4]; |
|
|
|
} |
|
|
|
hexad_mul_signed(&abde,ta,tb); |
|
|
|
|
|
|
|
for (i=0; i<3; i++) { |
|
|
|
ta[i] = a[i+4]-a[i+8]; |
|
|
|
tb[i] = b[i+4]-b[i+8]; |
|
|
|
} |
|
|
|
hexad_mul_signed(&bcef,ta,tb); |
|
|
|
|
|
|
|
for (i=0; i<3; i++) { |
|
|
|
ta[i] = a[i]-a[i+8]; |
|
|
|
tb[i] = b[i]-b[i+8]; |
|
|
|
} |
|
|
|
hexad_mul_signed(&acdf,ta,tb); |
|
|
|
|
|
|
|
uint64x3_t ot0 = t0 + timesW(t2 + t1 - acdf.hi - bcef.lo); |
|
|
|
uint64x3_t ot1 = t0 + t1 - abde.lo + timesW(t2 - bcef.hi); |
|
|
|
uint64x3_t ot2 = t0 + t1 + t2 - abde.hi - acdf.lo + vhi2; |
|
|
|
|
|
|
|
uint64x3_t out0 = (ot0 & mask58) + timesW(ot2>>58); |
|
|
|
uint64x3_t out1 = (ot1 & mask58) + (ot0>>58); |
|
|
|
uint64x3_t out2 = (ot2 & mask58) + (ot1>>58); |
|
|
|
|
|
|
|
*(uint64x4_t *)&c[0] = out0; |
|
|
|
*(uint64x4_t *)&c[4] = out1; |
|
|
|
*(uint64x4_t *)&c[8] = out2; |
|
|
|
} |
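/*
 * The out0/out1/out2 step above is one round of lane-wise carrying (a sketch
 * of what the code does): ot0, ot1, ot2 hold the digits at scale 2^0, 2^58
 * and 2^116 within each w-group; each keeps its low 58 bits and passes its
 * overflow to the next digit, and the overflow out of the 2^116 digit lands
 * at scale 2^174 = w, hence timesW(ot2>>58).  The result is only loosely
 * carried, which is consistent with the < 5<<57 limb bound asserted at the
 * top of this function.
 */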
|
|
|
|
|
|
|
|
|
|
@@ -243,48 +258,58 @@ p521_sqr ( |
|
|
|
p521_t *__restrict__ cs, |
|
|
|
const p521_t *as |
|
|
|
) { |
|
|
|
|
|
|
|
|
|
|
|
int i; |
|
|
|
#if 0 |
|
|
|
assert(as->limb[3] == 0 && as->limb[7] == 0 && as->limb[11] == 0); |
|
|
|
for (i=0; i<12; i++) { |
|
|
|
assert(as->limb[i] < 5ull<<57); |
|
|
|
} |
|
|
|
#endif |
|
|
|
|
|
|
|
uint64_t *c = cs->limb; |
|
|
|
const uint64_t *a = as->limb; |
|
|
|
|
|
|
|
nonad_t ad, be, cf, abde, bcef, acdf;
nonad_sqr(&ad, &a[0]);
nonad_sqr(&be, &a[4]);
nonad_sqr(&cf, &a[8]);

uint64_t amt = 26;
uint64x3_t vhi = { amt*((1ull<<58)-1), amt*((1ull<<58)-1), amt*((1ull<<58)-1), 0 },
|
|
|
vhi2 = { 0, 0, -amt<<57, 0 }; |
|
|
|
|
|
|
|
uint64x3_t t2 = cf.lo + be.hi + ad.hier, t0 = ad.lo + timesW(cf.hi + be.hier) + vhi, t1 = ad.hi + be.lo + timesW(cf.hier); |
|
|
|
|
|
|
|
int64_t ta[4] VECTOR_ALIGNED; |
|
|
|
// it seems to be faster not to vectorize these loops |
|
|
|
for (i=0; i<3; i++) { |
|
|
|
ta[i] = a[i]-a[i+4]; |
|
|
|
} |
|
|
|
hexad_sqr_signed(&abde,ta); |
|
|
|
|
|
|
|
for (i=0; i<3; i++) { |
|
|
|
ta[i] = a[i+4]-a[i+8]; |
|
|
|
} |
|
|
|
hexad_sqr_signed(&bcef,ta); |
|
|
|
|
|
|
|
for (i=0; i<3; i++) { |
|
|
|
ta[i] = a[i]-a[i+8]; |
|
|
|
} |
|
|
|
hexad_sqr_signed(&acdf,ta); |
|
|
|
|
|
|
|
uint64x3_t ot0 = t0 + timesW(t2 + t1 - acdf.hi - bcef.lo); |
|
|
|
uint64x3_t ot1 = t0 + t1 - abde.lo + timesW(t2 - bcef.hi); |
|
|
|
uint64x3_t ot2 = t0 + t1 + t2 - abde.hi - acdf.lo + vhi2; |
|
|
|
|
|
|
|
uint64x3_t out0 = (ot0 & mask58) + timesW(ot2>>58);
uint64x3_t out1 = (ot1 & mask58) + (ot0>>58);
uint64x3_t out2 = (ot2 & mask58) + (ot1>>58);
|
|
|
*(uint64x4_t *)&c[0] = out0; |
|
|
|
*(uint64x4_t *)&c[4] = out1; |
|
|
|
*(uint64x4_t *)&c[8] = out2; |
|
|
|
} |
|
|
|
|
|
|
|
void |
|
|
@@ -293,37 +318,59 @@ p521_mulw ( |
|
|
|
const p521_t *as, |
|
|
|
uint64_t b |
|
|
|
) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#if 0 |
|
|
|
int i; |
|
|
|
assert(as->limb[3] == 0 && as->limb[7] == 0 && as->limb[11] == 0); |
|
|
|
for (i=0; i<12; i++) { |
|
|
|
assert(as->limb[i] < 1ull<<61); |
|
|
|
} |
|
|
|
assert(b < 1ull<<61); |
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
const uint64_t *a = as->limb; |
|
|
|
uint64_t *c = cs->limb; |
|
|
|
|
|
|
|
__uint128_t accum0 = 0, accum3 = 0, accum6 = 0; |
|
|
|
uint64_t mask = (1ull<<58) - 1;
|
|
|
|
|
|
|
accum0 += widemulu(b, a[0]); |
|
|
|
accum3 += widemulu(b, a[1]); |
|
|
|
accum6 += widemulu(b, a[2]); |
|
|
|
c[0] = accum0 & mask; accum0 >>= 58; |
|
|
|
c[1] = accum3 & mask; accum3 >>= 58; |
|
|
|
c[2] = accum6 & mask; accum6 >>= 58; |
|
|
|
|
|
|
|
accum0 += widemulu(b, a[4]); |
|
|
|
accum3 += widemulu(b, a[5]); |
|
|
|
accum6 += widemulu(b, a[6]); |
|
|
|
c[4] = accum0 & mask; accum0 >>= 58; |
|
|
|
c[5] = accum3 & mask; accum3 >>= 58; |
|
|
|
c[6] = accum6 & mask; accum6 >>= 58; |
|
|
|
|
|
|
|
accum0 += widemulu(b, a[8]); |
|
|
|
accum3 += widemulu(b, a[9]); |
|
|
|
accum6 += widemulu(b, a[10]); |
|
|
|
c[8] = accum0 & mask; accum0 >>= 58; |
|
|
|
c[9] = accum3 & mask; accum3 >>= 58; |
|
|
|
c[10] = accum6 & (mask>>1); accum6 >>= 57; |
|
|
|
|
|
|
|
accum0 += c[1];
|
|
|
c[1] = accum0 & mask; |
|
|
|
c[5] += accum0 >> 58; |
|
|
|
|
|
|
|
accum3 += c[2];
|
|
|
c[2] = accum3 & mask; |
|
|
|
c[6] += accum3 >> 58; |
|
|
|
|
|
|
|
accum6 += c[0];
|
|
|
c[0] = accum6 & mask; |
|
|
|
c[4] += accum6 >> 58; |
|
|
|
|
|
|
|
c[3] = c[7] = c[11] = 0; |
|
|
|
} |
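/*
 * Carry flow in p521_mulw (a sketch, assuming the LIMBPERM word order
 * described above): accum0, accum3 and accum6 each walk one column of three
 * digits.  The carry out of accum0 has weight 2^(3*58) and is folded into
 * c[1]; the carry out of accum3 has weight 2^(6*58) and goes into c[2]; the
 * carry out of the 57-bit top digit has weight 2^521 == 1 (mod p), so it
 * wraps around into c[0].
 */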
|
|
|
|
|
|
|
|
|
|
@@ -366,6 +413,8 @@ p521_strong_reduce ( |
|
|
|
} |
|
|
|
|
|
|
|
assert(is_zero(carry + scarry)); |
|
|
|
|
|
|
|
a->limb[3] = a->limb[7] = a->limb[11] = 0; |
|
|
|
} |
|
|
|
|
|
|
|
mask_t |
|
|
@@ -377,8 +426,8 @@ p521_is_zero ( |
|
|
|
p521_strong_reduce(&b); |
|
|
|
|
|
|
|
uint64_t any = 0; |
|
|
|
unsigned int i;
|
|
|
for (i=0; i<sizeof(b)/sizeof(b.limb[0]); i++) { |
|
|
|
any |= b.limb[i]; |
|
|
|
} |
|
|
|
return is_zero(any); |
|
|
@@ -389,7 +438,7 @@ p521_serialize ( |
|
|
|
uint8_t *serial, |
|
|
|
const struct p521_t *x |
|
|
|
) { |
|
|
|
unsigned int i,k=0;
|
|
|
p521_t red; |
|
|
|
p521_copy(&red, x); |
|
|
|
p521_strong_reduce(&red); |
|
|
@@ -430,10 +479,12 @@ p521_deserialize ( |
|
|
|
|
|
|
|
uint64_t and = -1ull; |
|
|
|
for (i=0; i<8; i++) { |
|
|
|
and &= x->limb[LIMBPERM(i)];
|
|
|
} |
|
|
|
and &= (2*out+1); |
|
|
|
good &= is_zero((and+1)>>58); |
|
|
|
|
|
|
|
x->limb[3] = x->limb[7] = x->limb[11] = 0; |
|
|
|
|
|
|
|
return good; |
|
|
|
} |
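/*
 * Note on the rejection test above (a sketch, assuming `out` holds the 57-bit
 * top digit at that point): the only representable non-canonical encoding is
 * p = 2^521 - 1 itself, whose digits are all ones.  `and` equals 2^58 - 1
 * exactly in that case, so (and+1)>>58 becomes nonzero and `good` is cleared.
 */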