From e273155e35e2c1e4dc454b9b19eb100f85e7a82c Mon Sep 17 00:00:00 2001 From: Michael Hamburg Date: Wed, 1 Jul 2015 19:01:18 -0700 Subject: [PATCH] switch to 64-bit shift; still slower than donna? --- src/p25519/arch_x86_64/p25519.c | 42 ++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/src/p25519/arch_x86_64/p25519.c b/src/p25519/arch_x86_64/p25519.c index fedf83f..464522c 100644 --- a/src/p25519/arch_x86_64/p25519.c +++ b/src/p25519/arch_x86_64/p25519.c @@ -5,6 +5,10 @@ #include "p25519.h" #include "x86-64-arith.h" +static inline uint64_t shr(__uint128_t x, int n) { + return x>>n; +} + void p255_mul ( p255_t *__restrict__ cs, @@ -44,12 +48,12 @@ p255_mul ( mac_rm(&accum2, ai, &b[3]); uint64_t c0 = accum0 & mask; - accum1 += accum0 >> 51; + accum1 += shr(accum0, 51); uint64_t c1 = accum1 & mask; - accum2 += accum1 >> 51; + accum2 += shr(accum1, 51); c[2] = accum2 & mask; - accum0 = accum2 >> 51; + accum0 = shr(accum2, 51); mac_rm(&accum0, ai, &b[4]); @@ -73,7 +77,7 @@ p255_mul ( mac_rm(&accum1, ai, &b[0]); c[3] = accum0 & mask; - accum1 += accum0 >> 51; + accum1 += shr(accum0, 51); c[4] = accum1 & mask; /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 @@ -81,10 +85,10 @@ p255_mul ( * PERF: good enough to fit into uint64_t? */ - uint64_t a1 = accum1>>51; + uint64_t a1 = shr(accum1,51); accum1 = (__uint128_t)a1 * 19 + c0; c[0] = accum1 & mask; - c[1] = c1 + (accum1>>51); + c[1] = c1 + shr(accum1,51); } void @@ -118,9 +122,9 @@ p255_sqr ( mac_rm(&accum2, ai, &a[4]); uint64_t c0 = accum0 & mask; - accum1 += accum0 >> 51; + accum1 += shr(accum0, 51); uint64_t c1 = accum1 & mask; - accum2 += accum1 >> 51; + accum2 += shr(accum1, 51); c[2] = accum2 & mask; accum0 = accum2 >> 51; @@ -137,7 +141,7 @@ p255_sqr ( mac_rr(&accum1, a[2], a[2]); c[3] = accum0 & mask; - accum1 += accum0 >> 51; + accum1 += shr(accum0, 51); c[4] = accum1 & mask; /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 @@ -145,10 +149,10 @@ p255_sqr ( * PERF: good enough to fit into uint64_t? */ - uint64_t a1 = accum1>>51; + uint64_t a1 = shr(accum1,51); accum1 = (__uint128_t)a1 * 19 + c0; c[0] = accum1 & mask; - c[1] = c1 + (accum1>>51); + c[1] = c1 + shr(accum1,51); } void @@ -162,28 +166,28 @@ p255_mulw ( __uint128_t accum = widemul_rm(b, &a[0]); uint64_t c0 = accum & mask; - accum >>= 51; + accum = shr(accum,51); mac_rm(&accum, b, &a[1]); uint64_t c1 = accum & mask; - accum >>= 51; + accum = shr(accum,51); mac_rm(&accum, b, &a[2]); c[2] = accum & mask; - accum >>= 51; + accum = shr(accum,51); mac_rm(&accum, b, &a[3]); c[3] = accum & mask; - accum >>= 51; + accum = shr(accum,51); mac_rm(&accum, b, &a[4]); c[4] = accum & mask; - - uint64_t a1 = accum>>51; - accum = (__uint128_t)a1 * 19 + c0; + + accum = shr(accum,51); + accum = accum * 19 + c0; c[0] = accum & mask; - c[1] = c1 + (accum>>51); + c[1] = c1 + shr(accum,51); } void