| @@ -5,6 +5,10 @@ | |||||
| #include "p25519.h" | #include "p25519.h" | ||||
| #include "x86-64-arith.h" | #include "x86-64-arith.h" | ||||
| static inline uint64_t shr(__uint128_t x, int n) { /* Right-shift the 128-bit value x by n bits and return the result as a 64-bit value. The call sites visible in this file all pass n = 51. */ | |||||
| return x>>n; /* the 128-bit shift result is implicitly narrowed to the uint64_t return type, so bits above the low 64 are dropped */ | |||||
| } | |||||
| void | void | ||||
| p255_mul ( | p255_mul ( | ||||
| p255_t *__restrict__ cs, | p255_t *__restrict__ cs, | ||||
| @@ -44,12 +48,12 @@ p255_mul ( | |||||
| mac_rm(&accum2, ai, &b[3]); | mac_rm(&accum2, ai, &b[3]); | ||||
| uint64_t c0 = accum0 & mask; | uint64_t c0 = accum0 & mask; | ||||
| accum1 += accum0 >> 51; | |||||
| accum1 += shr(accum0, 51); | |||||
| uint64_t c1 = accum1 & mask; | uint64_t c1 = accum1 & mask; | ||||
| accum2 += accum1 >> 51; | |||||
| accum2 += shr(accum1, 51); | |||||
| c[2] = accum2 & mask; | c[2] = accum2 & mask; | ||||
| accum0 = accum2 >> 51; | |||||
| accum0 = shr(accum2, 51); | |||||
| mac_rm(&accum0, ai, &b[4]); | mac_rm(&accum0, ai, &b[4]); | ||||
| @@ -73,7 +77,7 @@ p255_mul ( | |||||
| mac_rm(&accum1, ai, &b[0]); | mac_rm(&accum1, ai, &b[0]); | ||||
| c[3] = accum0 & mask; | c[3] = accum0 & mask; | ||||
| accum1 += accum0 >> 51; | |||||
| accum1 += shr(accum0, 51); | |||||
| c[4] = accum1 & mask; | c[4] = accum1 & mask; | ||||
| /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | ||||
| @@ -81,10 +85,10 @@ p255_mul ( | |||||
| * PERF: good enough to fit into uint64_t? | * PERF: good enough to fit into uint64_t? | ||||
| */ | */ | ||||
| uint64_t a1 = accum1>>51; | |||||
| uint64_t a1 = shr(accum1,51); | |||||
| accum1 = (__uint128_t)a1 * 19 + c0; | accum1 = (__uint128_t)a1 * 19 + c0; | ||||
| c[0] = accum1 & mask; | c[0] = accum1 & mask; | ||||
| c[1] = c1 + (accum1>>51); | |||||
| c[1] = c1 + shr(accum1,51); | |||||
| } | } | ||||
| void | void | ||||
| @@ -118,9 +122,9 @@ p255_sqr ( | |||||
| mac_rm(&accum2, ai, &a[4]); | mac_rm(&accum2, ai, &a[4]); | ||||
| uint64_t c0 = accum0 & mask; | uint64_t c0 = accum0 & mask; | ||||
| accum1 += accum0 >> 51; | |||||
| accum1 += shr(accum0, 51); | |||||
| uint64_t c1 = accum1 & mask; | uint64_t c1 = accum1 & mask; | ||||
| accum2 += accum1 >> 51; | |||||
| accum2 += shr(accum1, 51); | |||||
| c[2] = accum2 & mask; | c[2] = accum2 & mask; | ||||
| accum0 = accum2 >> 51; | accum0 = accum2 >> 51; | ||||
| @@ -137,7 +141,7 @@ p255_sqr ( | |||||
| mac_rr(&accum1, a[2], a[2]); | mac_rr(&accum1, a[2], a[2]); | ||||
| c[3] = accum0 & mask; | c[3] = accum0 & mask; | ||||
| accum1 += accum0 >> 51; | |||||
| accum1 += shr(accum0, 51); | |||||
| c[4] = accum1 & mask; | c[4] = accum1 & mask; | ||||
| /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | ||||
| @@ -145,10 +149,10 @@ p255_sqr ( | |||||
| * PERF: good enough to fit into uint64_t? | * PERF: good enough to fit into uint64_t? | ||||
| */ | */ | ||||
| uint64_t a1 = accum1>>51; | |||||
| uint64_t a1 = shr(accum1,51); | |||||
| accum1 = (__uint128_t)a1 * 19 + c0; | accum1 = (__uint128_t)a1 * 19 + c0; | ||||
| c[0] = accum1 & mask; | c[0] = accum1 & mask; | ||||
| c[1] = c1 + (accum1>>51); | |||||
| c[1] = c1 + shr(accum1,51); | |||||
| } | } | ||||
| void | void | ||||
| @@ -162,28 +166,28 @@ p255_mulw ( | |||||
| __uint128_t accum = widemul_rm(b, &a[0]); | __uint128_t accum = widemul_rm(b, &a[0]); | ||||
| uint64_t c0 = accum & mask; | uint64_t c0 = accum & mask; | ||||
| accum >>= 51; | |||||
| accum = shr(accum,51); | |||||
| mac_rm(&accum, b, &a[1]); | mac_rm(&accum, b, &a[1]); | ||||
| uint64_t c1 = accum & mask; | uint64_t c1 = accum & mask; | ||||
| accum >>= 51; | |||||
| accum = shr(accum,51); | |||||
| mac_rm(&accum, b, &a[2]); | mac_rm(&accum, b, &a[2]); | ||||
| c[2] = accum & mask; | c[2] = accum & mask; | ||||
| accum >>= 51; | |||||
| accum = shr(accum,51); | |||||
| mac_rm(&accum, b, &a[3]); | mac_rm(&accum, b, &a[3]); | ||||
| c[3] = accum & mask; | c[3] = accum & mask; | ||||
| accum >>= 51; | |||||
| accum = shr(accum,51); | |||||
| mac_rm(&accum, b, &a[4]); | mac_rm(&accum, b, &a[4]); | ||||
| c[4] = accum & mask; | c[4] = accum & mask; | ||||
| uint64_t a1 = accum>>51; | |||||
| accum = (__uint128_t)a1 * 19 + c0; | |||||
| accum = shr(accum,51); | |||||
| accum = accum * 19 + c0; | |||||
| c[0] = accum & mask; | c[0] = accum & mask; | ||||
| c[1] = c1 + (accum>>51); | |||||
| c[1] = c1 + shr(accum,51); | |||||
| } | } | ||||
| void | void | ||||