diff --git a/src/include/decaf_255_config.h b/src/include/decaf_255_config.h index be9d978..01de56a 100644 --- a/src/include/decaf_255_config.h +++ b/src/include/decaf_255_config.h @@ -20,7 +20,7 @@ * kilobytes at the cost of perhaps 5-10% overhead in direct scalarmul * time. */ -#define DECAF_USE_MONTGOMERY_LADDER 1 +#define DECAF_USE_MONTGOMERY_LADDER 0 /* FUTURE */ /** The number of comb tables for fixed base scalarmul. */ #define DECAF_COMBS_N 3 diff --git a/src/p25519/arch_x86_64/p25519.c b/src/p25519/arch_x86_64/p25519.c index 5959b0d..8d9044a 100644 --- a/src/p25519/arch_x86_64/p25519.c +++ b/src/p25519/arch_x86_64/p25519.c @@ -14,10 +14,6 @@ p255_mul ( const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); uint64_t *c = cs->limb; - uint64_t bh[4]; - int i; - for (i=0; i<4; i++) bh[i] = b[i+1] * 19; - __uint128_t accum0, accum1, accum2; uint64_t ai = a[0]; @@ -26,24 +22,26 @@ p255_mul ( accum2 = widemul_rm(ai, &b[2]); ai = a[1]; - mac_rm(&accum0, ai, &bh[3]); - mac_rm(&accum1, ai, &b[0]); mac_rm(&accum2, ai, &b[1]); + mac_rm(&accum1, ai, &b[0]); + ai *= 19; + mac_rm(&accum0, ai, &b[4]); ai = a[2]; - mac_rm(&accum0, ai, &bh[2]); - mac_rm(&accum1, ai, &bh[3]); mac_rm(&accum2, ai, &b[0]); + ai *= 19; + mac_rm(&accum0, ai, &b[3]); + mac_rm(&accum1, ai, &b[4]); - ai = a[3]; - mac_rm(&accum0, ai, &bh[1]); - mac_rm(&accum1, ai, &bh[2]); - mac_rm(&accum2, ai, &bh[3]); + ai = a[3] * 19; + mac_rm(&accum0, ai, &b[2]); + mac_rm(&accum1, ai, &b[3]); + mac_rm(&accum2, ai, &b[4]); - ai = a[4]; - mac_rm(&accum0, ai, &bh[0]); - mac_rm(&accum1, ai, &bh[1]); - mac_rm(&accum2, ai, &bh[2]); + ai = a[4] * 19; + mac_rm(&accum0, ai, &b[1]); + mac_rm(&accum1, ai, &b[2]); + mac_rm(&accum2, ai, &b[3]); uint64_t c0 = accum0 & mask; accum1 += accum0 >> 51; @@ -52,6 +50,8 @@ p255_mul ( c[2] = accum2 & mask; accum0 = accum2 >> 51; + + mac_rm(&accum0, ai, &b[4]); ai = a[0]; mac_rm(&accum0, ai, &b[3]); @@ -70,7 +70,6 @@ p255_mul ( mac_rm(&accum1, ai, &b[1]); ai = a[4]; - mac_rm(&accum0, ai, &bh[3]); mac_rm(&accum1, ai, &b[0]); c[3] = accum0 & mask; @@ -88,33 +87,6 @@ p255_mul ( c[1] = c1 + (accum1>>51); } -void -p255_mulw ( - p255_t *__restrict__ cs, - const p255_t *as, - uint64_t b -) { - const uint64_t *a = as->limb, mask = ((1ull<<51)-1); - int i; - - uint64_t *c = cs->limb; - - __uint128_t accum = 0; - for (i=0; i<5; i++) { - mac_rm(&accum, b, &a[i]); - c[i] = accum & mask; - accum >>= 51; - } - /* PERF: parallelize? eh well this is reference */ - accum *= 19; - accum += c[0]; - c[0] = accum & mask; - accum >>= 51; - - assert(accum < mask); - c[1] += accum; -} - void p255_sqr ( p255_t *__restrict__ cs, @@ -123,10 +95,6 @@ p255_sqr ( const uint64_t *a = as->limb, mask = ((1ull<<51)-1); uint64_t *c = cs->limb; - uint64_t ah[4]; - int i; - for (i=0; i<4; i++) ah[i] = a[i+1] * 19; - __uint128_t accum0, accum1, accum2; uint64_t ai = a[0]; @@ -137,17 +105,17 @@ p255_sqr ( ai = a[1]; mac_rr(&accum2, ai, ai); - ai *= 2; - mac_rm(&accum0, ai, &ah[3]); + ai *= 38; + mac_rm(&accum0, ai, &a[4]); - ai = a[2] * 2; - mac_rm(&accum0, ai, &ah[2]); - mac_rm(&accum1, ai, &ah[3]); + ai = a[2] * 38; + mac_rm(&accum0, ai, &a[3]); + mac_rm(&accum1, ai, &a[4]); - ai = a[3]; - mac_rm(&accum1, ai, &ah[2]); + ai = a[3] * 19; + mac_rm(&accum1, ai, &a[3]); ai *= 2; - mac_rm(&accum2, ai, &ah[3]); + mac_rm(&accum2, ai, &a[4]); uint64_t c0 = accum0 & mask; accum1 += accum0 >> 51; @@ -165,7 +133,7 @@ p255_sqr ( mac_rm(&accum0, ai, &a[2]); mac_rm(&accum1, ai, &a[3]); - mac_rm(&accum0, a[4], &ah[3]); + mac_rr(&accum0, a[4]*19, a[4]); mac_rr(&accum1, a[2], a[2]); c[3] = accum0 & mask; @@ -183,6 +151,33 @@ p255_sqr ( c[1] = c1 + (accum1>>51); } +void +p255_mulw ( + p255_t *__restrict__ cs, + const p255_t *as, + uint64_t b +) { + const uint64_t *a = as->limb, mask = ((1ull<<51)-1); + int i; + + uint64_t *c = cs->limb; + + __uint128_t accum = 0; + for (i=0; i<5; i++) { + mac_rm(&accum, b, &a[i]); + c[i] = accum & mask; + accum >>= 51; + } + /* PERF: parallelize? eh well this is reference */ + accum *= 19; + accum += c[0]; + c[0] = accum & mask; + accum >>= 51; + + assert(accum < mask); + c[1] += accum; +} + void p255_strong_reduce ( p255_t *a