From 42a561d018920db87b17e9cf03142082e7488df5 Mon Sep 17 00:00:00 2001 From: Michael Hamburg Date: Wed, 1 Jul 2015 16:36:55 -0700 Subject: [PATCH] some accel in for curve25519 --- Makefile | 2 +- src/decaf_fast.c | 6 +- src/p25519/arch_x86_64/arch_config.h | 1 + src/p25519/arch_x86_64/p25519.c | 275 ++++++++++++++++++++++++++ src/p25519/arch_x86_64/p25519.h | 166 ++++++++++++++++ src/p25519/arch_x86_64/x86-64-arith.h | 1 + src/p448/arch_x86_64/x86-64-arith.h | 44 +++++ 7 files changed, 491 insertions(+), 4 deletions(-) create mode 100644 src/p25519/arch_x86_64/arch_config.h create mode 100644 src/p25519/arch_x86_64/p25519.c create mode 100644 src/p25519/arch_x86_64/p25519.h create mode 120000 src/p25519/arch_x86_64/x86-64-arith.h diff --git a/Makefile b/Makefile index f43541f..eae38d9 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ ASM ?= $(CC) DECAF ?= decaf_fast ifneq (,$(findstring x86_64,$(MACHINE))) -ARCH ?= arch_ref64 +ARCH ?= arch_x86_64 else # no i386 port yet ARCH ?= arch_ref32 diff --git a/src/decaf_fast.c b/src/decaf_fast.c index bfde995..956ef24 100644 --- a/src/decaf_fast.c +++ b/src/decaf_fast.c @@ -361,7 +361,7 @@ decaf_bool_t API_NS(scalar_invert) ( } return ~API_NS(scalar_eq)(out,API_NS(scalar_zero)); #else - decaf_255_scalar_t b, ma; + scalar_t b, ma; int i; sc_montmul(b,API_NS(scalar_one),sc_r2); sc_montmul(ma,a,sc_r2); @@ -378,10 +378,10 @@ decaf_bool_t API_NS(scalar_invert) ( } } - sc_montmul(out,b,decaf_255_scalar_one); + sc_montmul(out,b,API_NS(scalar_one)); API_NS(scalar_destroy)(b); API_NS(scalar_destroy)(ma); - return ~API_NS(scalar_eq)(out,decaf_255_scalar_zero); + return ~API_NS(scalar_eq)(out,API_NS(scalar_zero)); #endif } diff --git a/src/p25519/arch_x86_64/arch_config.h b/src/p25519/arch_x86_64/arch_config.h new file mode 100644 index 0000000..58758cc --- /dev/null +++ b/src/p25519/arch_x86_64/arch_config.h @@ -0,0 +1 @@ +#define WORD_BITS 64 diff --git a/src/p25519/arch_x86_64/p25519.c b/src/p25519/arch_x86_64/p25519.c new file mode 100644 index 0000000..5959b0d --- /dev/null +++ b/src/p25519/arch_x86_64/p25519.c @@ -0,0 +1,275 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#include "p25519.h" +#include "x86-64-arith.h" + +void +p255_mul ( + p255_t *__restrict__ cs, + const p255_t *as, + const p255_t *bs +) { + const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); + uint64_t *c = cs->limb; + + uint64_t bh[4]; + int i; + for (i=0; i<4; i++) bh[i] = b[i+1] * 19; + + __uint128_t accum0, accum1, accum2; + + uint64_t ai = a[0]; + accum0 = widemul_rm(ai, &b[0]); + accum1 = widemul_rm(ai, &b[1]); + accum2 = widemul_rm(ai, &b[2]); + + ai = a[1]; + mac_rm(&accum0, ai, &bh[3]); + mac_rm(&accum1, ai, &b[0]); + mac_rm(&accum2, ai, &b[1]); + + ai = a[2]; + mac_rm(&accum0, ai, &bh[2]); + mac_rm(&accum1, ai, &bh[3]); + mac_rm(&accum2, ai, &b[0]); + + ai = a[3]; + mac_rm(&accum0, ai, &bh[1]); + mac_rm(&accum1, ai, &bh[2]); + mac_rm(&accum2, ai, &bh[3]); + + ai = a[4]; + mac_rm(&accum0, ai, &bh[0]); + mac_rm(&accum1, ai, &bh[1]); + mac_rm(&accum2, ai, &bh[2]); + + uint64_t c0 = accum0 & mask; + accum1 += accum0 >> 51; + uint64_t c1 = accum1 & mask; + accum2 += accum1 >> 51; + c[2] = accum2 & mask; + + accum0 = accum2 >> 51; + + ai = a[0]; + mac_rm(&accum0, ai, &b[3]); + accum1 = widemul_rm(ai, &b[4]); + + ai = a[1]; + mac_rm(&accum0, ai, &b[2]); + mac_rm(&accum1, ai, &b[3]); + + ai = a[2]; + mac_rm(&accum0, ai, &b[1]); + mac_rm(&accum1, ai, &b[2]); + + ai = a[3]; + mac_rm(&accum0, ai, &b[0]); + mac_rm(&accum1, ai, &b[1]); + + ai = a[4]; + mac_rm(&accum0, ai, &bh[3]); + mac_rm(&accum1, ai, &b[0]); + + c[3] = accum0 & mask; + accum1 += accum0 >> 51; + c[4] = accum1 & mask; + + /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 + * = 2^(-13 + <13) + * PERF: good enough to fit into uint64_t? + */ + + uint64_t a1 = accum1>>51; + accum1 = (__uint128_t)a1 * 19 + c0; + c[0] = accum1 & mask; + c[1] = c1 + (accum1>>51); +} + +void +p255_mulw ( + p255_t *__restrict__ cs, + const p255_t *as, + uint64_t b +) { + const uint64_t *a = as->limb, mask = ((1ull<<51)-1); + int i; + + uint64_t *c = cs->limb; + + __uint128_t accum = 0; + for (i=0; i<5; i++) { + mac_rm(&accum, b, &a[i]); + c[i] = accum & mask; + accum >>= 51; + } + /* PERF: parallelize? eh well this is reference */ + accum *= 19; + accum += c[0]; + c[0] = accum & mask; + accum >>= 51; + + assert(accum < mask); + c[1] += accum; +} + +void +p255_sqr ( + p255_t *__restrict__ cs, + const p255_t *as +) { + const uint64_t *a = as->limb, mask = ((1ull<<51)-1); + uint64_t *c = cs->limb; + + uint64_t ah[4]; + int i; + for (i=0; i<4; i++) ah[i] = a[i+1] * 19; + + __uint128_t accum0, accum1, accum2; + + uint64_t ai = a[0]; + accum0 = widemul_rr(ai, ai); + ai *= 2; + accum1 = widemul_rm(ai, &a[1]); + accum2 = widemul_rm(ai, &a[2]); + + ai = a[1]; + mac_rr(&accum2, ai, ai); + ai *= 2; + mac_rm(&accum0, ai, &ah[3]); + + ai = a[2] * 2; + mac_rm(&accum0, ai, &ah[2]); + mac_rm(&accum1, ai, &ah[3]); + + ai = a[3]; + mac_rm(&accum1, ai, &ah[2]); + ai *= 2; + mac_rm(&accum2, ai, &ah[3]); + + uint64_t c0 = accum0 & mask; + accum1 += accum0 >> 51; + uint64_t c1 = accum1 & mask; + accum2 += accum1 >> 51; + c[2] = accum2 & mask; + + accum0 = accum2 >> 51; + + ai = a[0]*2; + mac_rm(&accum0, ai, &a[3]); + accum1 = widemul_rm(ai, &a[4]); + + ai = a[1]*2; + mac_rm(&accum0, ai, &a[2]); + mac_rm(&accum1, ai, &a[3]); + + mac_rm(&accum0, a[4], &ah[3]); + mac_rr(&accum1, a[2], a[2]); + + c[3] = accum0 & mask; + accum1 += accum0 >> 51; + c[4] = accum1 & mask; + + /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 + * = 2^(-13 + <13) + * PERF: good enough to fit into uint64_t? + */ + + uint64_t a1 = accum1>>51; + accum1 = (__uint128_t)a1 * 19 + c0; + c[0] = accum1 & mask; + c[1] = c1 + (accum1>>51); +} + +void +p255_strong_reduce ( + p255_t *a +) { + uint64_t mask = (1ull<<51)-1; + + /* first, clear high */ + a->limb[0] += (a->limb[4]>>51)*19; + a->limb[4] &= mask; + + /* now the total is less than 2p */ + + /* compute total_value - p. No need to reduce mod p. */ + __int128_t scarry = 0; + int i; + for (i=0; i<5; i++) { + scarry = scarry + a->limb[i] - ((i==0)?mask-18:mask); + a->limb[i] = scarry & mask; + scarry >>= 51; + } + + /* uncommon case: it was >= p, so now scarry = 0 and this = x + * common case: it was < p, so now scarry = -1 and this = x - p + 2^255 + * so let's add back in p. will carry back off the top for 2^255. + */ + + assert(is_zero(scarry) | is_zero(scarry+1)); + + uint64_t scarry_mask = scarry & mask; + __uint128_t carry = 0; + + /* add it back */ + for (i=0; i<5; i++) { + carry = carry + a->limb[i] + ((i==0)?(scarry_mask&~18):scarry_mask); + a->limb[i] = carry & mask; + carry >>= 51; + } + + assert(is_zero(carry + scarry)); +} + +void +p255_serialize ( + uint8_t serial[32], + const struct p255_t *x +) { + int i,j; + p255_t red; + p255_copy(&red, x); + p255_strong_reduce(&red); + uint64_t *r = red.limb; + uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12}; + for (i=0; i<4; i++) { + for (j=0; j<8; j++) { + serial[8*i+j] = ser64[i]; + ser64[i] >>= 8; + } + } +} + +mask_t +p255_deserialize ( + p255_t *x, + const uint8_t serial[32] +) { + int i,j; + uint64_t ser64[4], mask = ((1ull<<51)-1); + for (i=0; i<4; i++) { + uint64_t out = 0; + for (j=0; j<8; j++) { + out |= ((uint64_t)serial[8*i+j])<<(8*j); + } + ser64[i] = out; + } + + /* Test for >= 2^255-19 */ + uint64_t ge = -(((__uint128_t)ser64[0]+19)>>64); + ge &= ser64[1]; + ge &= ser64[2]; + ge &= (ser64[3]<<1) + 1; + ge |= -(((__uint128_t)ser64[3]+0x8000000000000000)>>64); + + x->limb[0] = ser64[0] & mask; + x->limb[1] = (ser64[0]>>51 | ser64[1]<<13) & mask; + x->limb[2] = (ser64[1]>>38 | ser64[2]<<26) & mask; + x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask; + x->limb[4] = ser64[3]>>12; + + return ~is_zero(~ge); +} diff --git a/src/p25519/arch_x86_64/p25519.h b/src/p25519/arch_x86_64/p25519.h new file mode 100644 index 0000000..4106fcc --- /dev/null +++ b/src/p25519/arch_x86_64/p25519.h @@ -0,0 +1,166 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ +#ifndef __P255_H__ +#define __P255_H__ 1 + +#include +#include +#include + +#include "word.h" + +typedef struct p255_t { + uint64_t limb[5]; +} p255_t; + +#define LBITS 51 +#define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }} + +/* +#define FIELD_LITERAL(a,b,c,d) {{ \ + (a##ull) & LMASK, \ + ((a##ull)>>51 | (b##ull)<<13) & LMASK, \ + ((b##ull)>>38 | (c##ull)<<26) & LMASK, \ + ((c##ull)>>25 | (d##ull)<<39) & LMASK, \ + (d##ull)>>12 \ +}} +*/ + +#ifdef __cplusplus +extern "C" { +#endif + +static __inline__ void +p255_add_RAW ( + p255_t *out, + const p255_t *a, + const p255_t *b +) __attribute__((unused)); + +static __inline__ void +p255_sub_RAW ( + p255_t *out, + const p255_t *a, + const p255_t *b +) __attribute__((unused)); + +static __inline__ void +p255_copy ( + p255_t *out, + const p255_t *a +) __attribute__((unused)); + +static __inline__ void +p255_weak_reduce ( + p255_t *inout +) __attribute__((unused)); + +void +p255_strong_reduce ( + p255_t *inout +); + +static __inline__ void +p255_bias ( + p255_t *inout, + int amount +) __attribute__((unused)); + +void +p255_mul ( + p255_t *__restrict__ out, + const p255_t *a, + const p255_t *b +); + +void +p255_mulw ( + p255_t *__restrict__ out, + const p255_t *a, + uint64_t b +); + +void +p255_sqr ( + p255_t *__restrict__ out, + const p255_t *a +); + +void +p255_serialize ( + uint8_t serial[32], + const struct p255_t *x +); + +mask_t +p255_deserialize ( + p255_t *x, + const uint8_t serial[32] +); + +/* -------------- Inline functions begin here -------------- */ + +void +p255_add_RAW ( + p255_t *out, + const p255_t *a, + const p255_t *b +) { + unsigned int i; + for (i=0; i<5; i++) { + out->limb[i] = a->limb[i] + b->limb[i]; + } +} + +void +p255_sub_RAW ( + p255_t *out, + const p255_t *a, + const p255_t *b +) { + unsigned int i; + uint64_t co1 = ((1ull<<51)-1)*2, co2 = co1-36; + for (i=0; i<5; i++) { + out->limb[i] = a->limb[i] - b->limb[i] + ((i==0) ? co2 : co1); + } +} + +void +p255_copy ( + p255_t *out, + const p255_t *a +) { + memcpy(out,a,sizeof(*a)); +} + +void +p255_bias ( + p255_t *a, + int amt +) { + a->limb[0] += ((uint64_t)(amt)<<52) - 38*amt; + int i; + for (i=1; i<5; i++) { + a->limb[i] += ((uint64_t)(amt)<<52)-2*amt; + } +} + +void +p255_weak_reduce ( + p255_t *a +) { + uint64_t mask = (1ull<<51) - 1; + uint64_t tmp = a->limb[4] >> 51; + int i; + for (i=4; i>0; i--) { + a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>51); + } + a->limb[0] = (a->limb[0] & mask) + tmp*19; +} + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __P255_H__ */ diff --git a/src/p25519/arch_x86_64/x86-64-arith.h b/src/p25519/arch_x86_64/x86-64-arith.h new file mode 120000 index 0000000..93c6c47 --- /dev/null +++ b/src/p25519/arch_x86_64/x86-64-arith.h @@ -0,0 +1 @@ +../../p448/arch_x86_64/x86-64-arith.h \ No newline at end of file diff --git a/src/p448/arch_x86_64/x86-64-arith.h b/src/p448/arch_x86_64/x86-64-arith.h index 32ee832..00fcc1e 100644 --- a/src/p448/arch_x86_64/x86-64-arith.h +++ b/src/p448/arch_x86_64/x86-64-arith.h @@ -53,6 +53,25 @@ static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) { #endif } +static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) { + #ifndef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("mulq %[b];" + : [c]"=a"(c), [d]"=d"(d) + : [b]"r"(b), "a"(a) + : "cc"); + return (((__uint128_t)(d))<<64) | c; + #else + uint64_t c,d; + __asm__ volatile + ("mulx %[b], %[c], %[d];" + : [c]"=r"(c), [d]"=r"(d) + : [b]"r"(b), [a]"d"(a)); + return (((__uint128_t)(d))<<64) | c; + #endif +} + static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) { #ifndef __BMI2__ uint64_t c,d; @@ -163,6 +182,31 @@ static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) { *acc = (((__uint128_t)(hi))<<64) | lo; } +static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) { + uint64_t lo = *acc, hi = *acc>>64; + + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("mulx %[b], %[c], %[d]; " + "addq %[c], %[lo]; " + "adcq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"r"(b), [a]"d"(a) + : "cc"); + #else + __asm__ volatile + ("mulq %[b]; " + "addq %%rax, %[lo]; " + "adcq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"r"(b), "a"(a) + : "rax", "rdx", "cc"); + #endif + + *acc = (((__uint128_t)(hi))<<64) | lo; +} + static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { uint64_t lo = *acc, hi = *acc>>64;