|
|
@@ -5,6 +5,10 @@ |
|
|
|
#include "p25519.h" |
|
|
|
#include "x86-64-arith.h" |
|
|
|
|
|
|
|
/**
 * Shift a 128-bit value right by n bits and return the low 64 bits.
 *
 * The result is deliberately truncated: any bits of (x >> n) at bit
 * position 64 or above are discarded.
 *
 * @param x  128-bit value to shift.
 * @param n  shift count; must satisfy 0 <= n < 128, since a negative or
 *           over-wide shift count is undefined behavior in C.
 * @return   the low 64 bits of x >> n.
 */
static inline uint64_t shr(__uint128_t x, int n) {
    /* Explicit cast: the 128 -> 64 bit narrowing is intentional. */
    return (uint64_t)(x >> n);
}
|
|
|
|
|
|
|
void |
|
|
|
p255_mul ( |
|
|
|
p255_t *__restrict__ cs, |
|
|
@@ -44,12 +48,12 @@ p255_mul ( |
|
|
|
mac_rm(&accum2, ai, &b[3]); |
|
|
|
|
|
|
|
uint64_t c0 = accum0 & mask; |
|
|
|
accum1 += accum0 >> 51; |
|
|
|
accum1 += shr(accum0, 51); |
|
|
|
uint64_t c1 = accum1 & mask; |
|
|
|
accum2 += accum1 >> 51; |
|
|
|
accum2 += shr(accum1, 51); |
|
|
|
c[2] = accum2 & mask; |
|
|
|
|
|
|
|
accum0 = accum2 >> 51; |
|
|
|
accum0 = shr(accum2, 51); |
|
|
|
|
|
|
|
mac_rm(&accum0, ai, &b[4]); |
|
|
|
|
|
|
@@ -73,7 +77,7 @@ p255_mul ( |
|
|
|
mac_rm(&accum1, ai, &b[0]); |
|
|
|
|
|
|
|
c[3] = accum0 & mask; |
|
|
|
accum1 += accum0 >> 51; |
|
|
|
accum1 += shr(accum0, 51); |
|
|
|
c[4] = accum1 & mask; |
|
|
|
|
|
|
|
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 |
|
|
@@ -81,10 +85,10 @@ p255_mul ( |
|
|
|
* PERF: good enough to fit into uint64_t? |
|
|
|
*/ |
|
|
|
|
|
|
|
uint64_t a1 = accum1>>51; |
|
|
|
uint64_t a1 = shr(accum1,51); |
|
|
|
accum1 = (__uint128_t)a1 * 19 + c0; |
|
|
|
c[0] = accum1 & mask; |
|
|
|
c[1] = c1 + (accum1>>51); |
|
|
|
c[1] = c1 + shr(accum1,51); |
|
|
|
} |
|
|
|
|
|
|
|
void |
|
|
@@ -118,9 +122,9 @@ p255_sqr ( |
|
|
|
mac_rm(&accum2, ai, &a[4]); |
|
|
|
|
|
|
|
uint64_t c0 = accum0 & mask; |
|
|
|
accum1 += accum0 >> 51; |
|
|
|
accum1 += shr(accum0, 51); |
|
|
|
uint64_t c1 = accum1 & mask; |
|
|
|
accum2 += accum1 >> 51; |
|
|
|
accum2 += shr(accum1, 51); |
|
|
|
c[2] = accum2 & mask; |
|
|
|
|
|
|
|
accum0 = accum2 >> 51; |
|
|
@@ -137,7 +141,7 @@ p255_sqr ( |
|
|
|
mac_rr(&accum1, a[2], a[2]); |
|
|
|
|
|
|
|
c[3] = accum0 & mask; |
|
|
|
accum1 += accum0 >> 51; |
|
|
|
accum1 += shr(accum0, 51); |
|
|
|
c[4] = accum1 & mask; |
|
|
|
|
|
|
|
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 |
|
|
@@ -145,10 +149,10 @@ p255_sqr ( |
|
|
|
* PERF: good enough to fit into uint64_t? |
|
|
|
*/ |
|
|
|
|
|
|
|
uint64_t a1 = accum1>>51; |
|
|
|
uint64_t a1 = shr(accum1,51); |
|
|
|
accum1 = (__uint128_t)a1 * 19 + c0; |
|
|
|
c[0] = accum1 & mask; |
|
|
|
c[1] = c1 + (accum1>>51); |
|
|
|
c[1] = c1 + shr(accum1,51); |
|
|
|
} |
|
|
|
|
|
|
|
void |
|
|
@@ -162,28 +166,28 @@ p255_mulw ( |
|
|
|
|
|
|
|
__uint128_t accum = widemul_rm(b, &a[0]); |
|
|
|
uint64_t c0 = accum & mask; |
|
|
|
accum >>= 51; |
|
|
|
accum = shr(accum,51); |
|
|
|
|
|
|
|
mac_rm(&accum, b, &a[1]); |
|
|
|
uint64_t c1 = accum & mask; |
|
|
|
accum >>= 51; |
|
|
|
accum = shr(accum,51); |
|
|
|
|
|
|
|
mac_rm(&accum, b, &a[2]); |
|
|
|
c[2] = accum & mask; |
|
|
|
accum >>= 51; |
|
|
|
accum = shr(accum,51); |
|
|
|
|
|
|
|
mac_rm(&accum, b, &a[3]); |
|
|
|
c[3] = accum & mask; |
|
|
|
accum >>= 51; |
|
|
|
accum = shr(accum,51); |
|
|
|
|
|
|
|
mac_rm(&accum, b, &a[4]); |
|
|
|
c[4] = accum & mask; |
|
|
|
|
|
|
|
uint64_t a1 = accum>>51; |
|
|
|
accum = (__uint128_t)a1 * 19 + c0; |
|
|
|
|
|
|
|
accum = shr(accum,51); |
|
|
|
accum = accum * 19 + c0; |
|
|
|
|
|
|
|
c[0] = accum & mask; |
|
|
|
c[1] = c1 + (accum>>51); |
|
|
|
c[1] = c1 + shr(accum,51); |
|
|
|
} |
|
|
|
|
|
|
|
void |
|
|
|