diff --git a/src/include/arch_ref64/arch_intrinsics.h b/src/include/arch_ref64/arch_intrinsics.h index a1b1a74..4b34ea5 100644 --- a/src/include/arch_ref64/arch_intrinsics.h +++ b/src/include/arch_ref64/arch_intrinsics.h @@ -14,7 +14,7 @@ uint64_t word_is_zero(uint64_t a) { } static __inline__ __attribute((always_inline,unused)) -uint64_t widemul(uint64_t a, uint64_t b) { +__uint128_t widemul(uint64_t a, uint64_t b) { return ((__uint128_t)a) * b; } diff --git a/src/p25519/arch_32/f_impl.c b/src/p25519/arch_32/f_impl.c index 93d03bc..edaa62a 100644 --- a/src/p25519/arch_32/f_impl.c +++ b/src/p25519/arch_32/f_impl.c @@ -7,7 +7,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { const uint32_t *a = as->limb, *b = bs->limb, maske = ((1<<26)-1), masko = ((1<<25)-1); - uint64_t bh[9]; + uint32_t bh[9]; int i,j; for (i=0; i<9; i++) bh[i] = b[i+1] * 19; @@ -18,13 +18,13 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { /* Even case. */ for (j=0; j>= 26; @@ -53,25 +53,22 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) { const uint32_t *a = as->limb, maske = ((1<<26)-1), masko = ((1<<25)-1); - uint32_t blo = b & maske, bhi = b>>26, bhi2 = 2*bhi; uint32_t *c = cs->limb; - uint64_t accum = 0; - - accum = widemul(blo, a[0]) + widemul(bhi*38,a[9]); + uint64_t accum = widemul(b, a[0]); c[0] = accum & maske; accum >>= 26; - accum += widemul(blo, a[1]) + widemul(bhi,a[0]); + accum += widemul(b, a[1]); c[1] = accum & masko; accum >>= 25; for (int i=2; i<10; /*i+=2*/) { - accum += widemul(blo, a[i]) + widemul(bhi2, a[i-1]); + accum += widemul(b, a[i]); c[i] = accum & maske; accum >>= 26; i++; - accum += widemul(blo, a[i]) + widemul(bhi, a[i-1]); + accum += widemul(b, a[i]); c[i] = accum & masko; accum >>= 25; i++; diff --git a/src/p25519/arch_32/f_impl.h b/src/p25519/arch_32/f_impl.h index 322de05..d2d0de9 100644 --- a/src/p25519/arch_32/f_impl.h +++ b/src/p25519/arch_32/f_impl.h @@ -2,7 +2,7 @@ * Released under the MIT License. See LICENSE.txt for license information. */ -#define GF_HEADROOM 5 +#define GF_HEADROOM 3 /* Would be 5, but 3*19 * 2^26+small is all that fits in a uint32_t */ #define LIMB(x) (x##ull)&((1ull<<26)-1), (x##ull)>>26 #define FIELD_LITERAL(a,b,c,d,e) {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e)}} @@ -12,21 +12,20 @@ void gf_add_RAW (gf out, const gf a, const gf b) { for (unsigned int i=0; i<10; i++) { out->limb[i] = a->limb[i] + b->limb[i]; } - gf_weak_reduce(out); } void gf_sub_RAW (gf out, const gf a, const gf b) { - uint32_t coe = ((1ull<<26)-1)*2, coo = ((1ull<<25)-1)*2, co0 = coe-36; - for (unsigned int i=0; i<10; i+=2) { - out->limb[i] = a->limb[i] - b->limb[i] + ((i==0) ? co0 : coe); - out->limb[i+1] = a->limb[i+1] - b->limb[i+1] + coo; + for (unsigned int i=0; i<10; i++) { + out->limb[i] = a->limb[i] - b->limb[i]; } - gf_weak_reduce(out); } void gf_bias (gf a, int amt) { - (void) a; - (void) amt; + uint32_t coe = ((1ull<<26)-1)*amt, coo = ((1ull<<25)-1)*amt, co0 = coe-18*amt; + for (unsigned int i=0; i<10; i+=2) { + a->limb[i] += ((i==0) ? co0 : coe); + a->limb[i+1] += coo; + } } void gf_weak_reduce (gf a) { diff --git a/src/p25519/arch_ref64/f_impl.h b/src/p25519/arch_ref64/f_impl.h index 0e9e3ca..61d9e03 100644 --- a/src/p25519/arch_ref64/f_impl.h +++ b/src/p25519/arch_ref64/f_impl.h @@ -2,7 +2,7 @@ * Released under the MIT License. See LICENSE.txt for license information. */ -#define GF_HEADROOM 933 +#define GF_HEADROOM 9999 /* Always reduced */ #define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }} #define LIMB_PLACE_VALUE(i) 51