From ba9f2019013ce0ff05203d73a19380577d5cf4c5 Mon Sep 17 00:00:00 2001 From: Michael Hamburg Date: Wed, 1 Jul 2015 16:57:34 -0700 Subject: [PATCH] faster mulw? --- src/p25519/arch_x86_64/p25519.c | 36 ++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/p25519/arch_x86_64/p25519.c b/src/p25519/arch_x86_64/p25519.c index 8d9044a..fedf83f 100644 --- a/src/p25519/arch_x86_64/p25519.c +++ b/src/p25519/arch_x86_64/p25519.c @@ -158,24 +158,32 @@ p255_mulw ( uint64_t b ) { const uint64_t *a = as->limb, mask = ((1ull<<51)-1); - int i; - uint64_t *c = cs->limb; - __uint128_t accum = 0; - for (i=0; i<5; i++) { - mac_rm(&accum, b, &a[i]); - c[i] = accum & mask; - accum >>= 51; - } - /* PERF: parallelize? eh well this is reference */ - accum *= 19; - accum += c[0]; - c[0] = accum & mask; + __uint128_t accum = widemul_rm(b, &a[0]); + uint64_t c0 = accum & mask; + accum >>= 51; + + mac_rm(&accum, b, &a[1]); + uint64_t c1 = accum & mask; accum >>= 51; - assert(accum < mask); - c[1] += accum; + mac_rm(&accum, b, &a[2]); + c[2] = accum & mask; + accum >>= 51; + + mac_rm(&accum, b, &a[3]); + c[3] = accum & mask; + accum >>= 51; + + mac_rm(&accum, b, &a[4]); + c[4] = accum & mask; + + uint64_t a1 = accum>>51; + accum = (__uint128_t)a1 * 19 + c0; + + c[0] = accum & mask; + c[1] = c1 + (accum>>51); } void