From 6bc97fb756cbd1fa962bbeed3d50fb55063dff91 Mon Sep 17 00:00:00 2001 From: Michael Hamburg Date: Thu, 14 Jan 2016 18:11:00 -0800 Subject: [PATCH] need an include/arch_*/arch_intrinsics.h for other arches --- Makefile | 14 +- src/include/arch_x86_64/arch_intrinsics.h | 306 ++++++++++++++++++++ src/include/word.h | 3 +- src/p25519/arch_ref64/f_impl.c | 45 +-- src/p25519/arch_x86_64/f_impl.c | 82 ++---- src/p25519/arch_x86_64/x86-64-arith.h | 323 ---------------------- src/p448/arch_32/f_impl.c | 80 ++---- src/p448/arch_arm_32/f_impl.c | 51 ++-- src/p448/arch_neon_experimental/f_impl.c | 43 +-- src/p448/arch_ref64/f_impl.c | 43 +-- src/p448/arch_x86_64/f_impl.c | 46 +-- src/p448/arch_x86_64/x86-64-arith.h | 323 ---------------------- src/p480/arch_x86_64/f_impl.c | 43 +-- src/p480/arch_x86_64/x86-64-arith.h | 275 ------------------ src/p521/arch_ref64/f_impl.c | 43 +-- src/p521/arch_x86_64_r12/f_impl.c | 48 +--- 16 files changed, 453 insertions(+), 1315 deletions(-) create mode 100644 src/include/arch_x86_64/arch_intrinsics.h delete mode 100644 src/p25519/arch_x86_64/x86-64-arith.h delete mode 100644 src/p448/arch_x86_64/x86-64-arith.h delete mode 100644 src/p480/arch_x86_64/x86-64-arith.h diff --git a/Makefile b/Makefile index a3c3ff4..16eb948 100644 --- a/Makefile +++ b/Makefile @@ -146,10 +146,14 @@ COMPONENTS_OF_$(1) = $$(BUILD_OBJ)/$(1)_impl.o $$(BUILD_OBJ)/$(1)_arithmetic.o LIBCOMPONENTS += $$(COMPONENTS_OF_$(1)) $$(BUILD_ASM)/$(1)_arithmetic.s: src/$(1)/f_arithmetic.c $$(HEADERS) - $$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) -I $(BUILD_H)/$(1)/$(2) -S -c -o $$@ $$< + $$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \ + -I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \ + -S -c -o $$@ $$< $$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$(2)/f_impl.c $$(HEADERS) - $$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) -I $(BUILD_H)/$(1)/$(2) -S -c -o $$@ $$< + $$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \ + -I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \ + -S -c -o $$@ $$< endef ################################################################ @@ -166,18 +170,18 @@ $$(BUILD_C)/decaf_tables_$(1).c: $$(BUILD_IBIN)/decaf_gen_tables_$(1) $$(BUILD_ASM)/decaf_tables_$(1).s: $$(BUILD_C)/decaf_tables_$(1).c $$(HEADERS) $$(CC) $$(CFLAGS) -S -c -o $$@ $$< \ - -I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \ + -I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) -I src/include/$$(ARCH_FOR_$(2)) \ -I $(BUILD_H)/curve_$(1) -I $(BUILD_H)/$(2) -I $(BUILD_H)/$(2)/$$(ARCH_FOR_$(2)) $$(BUILD_ASM)/decaf_gen_tables_$(1).s: src/decaf_gen_tables.c $$(HEADERS) $$(CC) $$(CFLAGS) \ - -I src/curve_$(1) -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \ + -I src/curve_$(1) -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) -I src/include/$$(ARCH_FOR_$(2)) \ -I $(BUILD_H)/curve_$(1) -I $(BUILD_H)/$(2) -I $(BUILD_H)/$(2)/$$(ARCH_FOR_$(2)) \ -S -c -o $$@ $$< $$(BUILD_ASM)/decaf_$(1).s: src/decaf.c $$(HEADERS) $$(CC) $$(CFLAGS) \ - -I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \ + -I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) -I src/include/$$(ARCH_FOR_$(2)) \ -I $(BUILD_H)/curve_$(1) -I $(BUILD_H)/$(2) -I $(BUILD_H)/$(2)/$$(ARCH_FOR_$(2)) \ -S -c -o $$@ $$< diff --git a/src/include/arch_x86_64/arch_intrinsics.h b/src/include/arch_x86_64/arch_intrinsics.h new file mode 100644 index 0000000..d2b03e1 --- /dev/null +++ b/src/include/arch_x86_64/arch_intrinsics.h @@ -0,0 +1,306 @@ +/* Copyright (c) 2014-2016 Cryptography Research, 
Inc.
+ * Released under the MIT License. See LICENSE.txt for license information.
+ */
+
+#ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__
+#define __ARCH_X86_64_ARCH_INTRINSICS_H__
+
+#include <stdint.h>
+
+/* FUTURE: non x86-64 versions of these.
+ * FUTURE: autogenerate
+ */
+
+static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
+    uint64_t c,d;
+    #ifndef __BMI2__
+        __asm__ volatile
+            ("movq %[a], %%rax;"
+             "mulq %[b];"
+             : [c]"=a"(c), [d]"=d"(d)
+             : [b]"m"(*b), [a]"m"(*a)
+             : "cc");
+    #else
+        __asm__ volatile
+            ("movq %[a], %%rdx;"
+             "mulx %[b], %[c], %[d];"
+             : [c]"=r"(c), [d]"=r"(d)
+             : [b]"m"(*b), [a]"m"(*a)
+             : "rdx");
+    #endif
+    return (((__uint128_t)(d))<<64) | c;
+}
+
+static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
+    uint64_t c,d;
+    #ifndef __BMI2__
+        __asm__ volatile
+            ("movq %[a], %%rax;"
+             "mulq %[b];"
+             : [c]"=a"(c), [d]"=d"(d)
+             : [b]"m"(*b), [a]"r"(a)
+             : "cc");
+    #else
+        __asm__ volatile
+            ("mulx %[b], %[c], %[d];"
+             : [c]"=r"(c), [d]"=r"(d)
+             : [b]"m"(*b), [a]"d"(a));
+    #endif
+    return (((__uint128_t)(d))<<64) | c;
+}
+
+static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) {
+    uint64_t c,d;
+    #ifndef __BMI2__
+        __asm__ volatile
+            ("mulq %[b];"
+             : [c]"=a"(c), [d]"=d"(d)
+             : [b]"r"(b), "a"(a)
+             : "cc");
+    #else
+        __asm__ volatile
+            ("mulx %[b], %[c], %[d];"
+             : [c]"=r"(c), [d]"=r"(d)
+             : [b]"r"(b), [a]"d"(a));
+    #endif
+    return (((__uint128_t)(d))<<64) | c;
+}
+
+static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
+    uint64_t c,d;
+    #ifndef __BMI2__
+        __asm__ volatile
+            ("movq %[a], %%rax; "
+             "addq %%rax, %%rax; "
+             "mulq %[b];"
+             : [c]"=a"(c), [d]"=d"(d)
+             : [b]"m"(*b), [a]"m"(*a)
+             : "cc");
+    #else
+        __asm__ volatile
+            ("movq %[a], %%rdx;"
+             "leaq (,%%rdx,2), %%rdx;"
+             "mulx %[b], %[c], %[d];"
+             : [c]"=r"(c), [d]"=r"(d)
+             : [b]"m"(*b), [a]"m"(*a)
+             : "rdx");
+    #endif
+    return (((__uint128_t)(d))<<64) | c;
+}
+
+static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
+    uint64_t lo = *acc, hi = *acc>>64;
+
+    #ifdef __BMI2__
+        uint64_t c,d;
+        __asm__ volatile
+            ("movq %[a], %%rdx; "
+             "mulx %[b], %[c], %[d]; "
+             "addq %[c], %[lo]; "
+             "adcq %[d], %[hi]; "
+             : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
+             : [b]"m"(*b), [a]"m"(*a)
+             : "rdx", "cc");
+    #else
+        __asm__ volatile
+            ("movq %[a], %%rax; "
+             "mulq %[b]; "
+             "addq %%rax, %[lo]; "
+             "adcq %%rdx, %[hi]; "
+             : [lo]"+r"(lo), [hi]"+r"(hi)
+             : [b]"m"(*b), [a]"m"(*a)
+             : "rax", "rdx", "cc");
+    #endif
+
+    *acc = (((__uint128_t)(hi))<<64) | lo;
+}
+
+static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
+    uint64_t lo = *acc, hi = *acc>>64;
+    uint64_t lo2 = *acc2, hi2 = *acc2>>64;
+
+    #ifdef __BMI2__
+        uint64_t c,d;
+        __asm__ volatile
+            ("movq %[a], %%rdx; "
+             "mulx %[b], %[c], %[d]; "
+             "addq %[c], %[lo]; "
+             "adcq %[d], %[hi]; "
+             "addq %[c], %[lo2]; "
+             "adcq %[d], %[hi2]; "
+             : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
+             : [b]"m"(*b), [a]"m"(*a)
+             : "rdx", "cc");
+    #else
+        __asm__ volatile
+            ("movq %[a], %%rax; "
+             "mulq %[b]; "
+             "addq %%rax, %[lo]; "
+             "adcq %%rdx, %[hi]; "
+             "addq %%rax, %[lo2]; "
+             "adcq %%rdx, %[hi2]; "
+             : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
+             : [b]"m"(*b), [a]"m"(*a)
+             : "rax", "rdx", "cc");
+    #endif
+
+    *acc = (((__uint128_t)(hi))<<64) | lo;
+    *acc2 = (((__uint128_t)(hi2))<<64) | lo2;
+}
+
+static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
+    
uint64_t lo = *acc, hi = *acc>>64; + + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("mulx %[b], %[c], %[d]; " + "addq %[c], %[lo]; " + "adcq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"d"(a) + : "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "mulq %[b]; " + "addq %%rax, %[lo]; " + "adcq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"r"(a) + : "rax", "rdx", "cc"); + #endif + + *acc = (((__uint128_t)(hi))<<64) | lo; +} + +static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) { + uint64_t lo = *acc, hi = *acc>>64; + + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("mulx %[b], %[c], %[d]; " + "addq %[c], %[lo]; " + "adcq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"r"(b), [a]"d"(a) + : "cc"); + #else + __asm__ volatile + ("mulq %[b]; " + "addq %%rax, %[lo]; " + "adcq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"r"(b), "a"(a) + : "rax", "rdx", "cc"); + #endif + + *acc = (((__uint128_t)(hi))<<64) | lo; +} + +static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx; " + "addq %%rdx, %%rdx; " + "mulx %[b], %[c], %[d]; " + "addq %[c], %[lo]; " + "adcq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "addq %%rax, %%rax; " + "mulq %[b]; " + "addq %%rax, %[lo]; " + "adcq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rax", "rdx", "cc"); + #endif + + *acc = (((__uint128_t)(hi))<<64) | lo; +} + +static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx; " + "mulx %[b], %[c], %[d]; " + "subq %[c], %[lo]; " + "sbbq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "mulq %[b]; " + "subq %%rax, %[lo]; " + "sbbq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rax", "rdx", "cc"); + #endif + *acc = (((__uint128_t)(hi))<<64) | lo; +} + +static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx; " + "addq %%rdx, %%rdx; " + "mulx %[b], %[c], %[d]; " + "subq %[c], %[lo]; " + "sbbq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "addq %%rax, %%rax; " + "mulq %[b]; " + "subq %%rax, %[lo]; " + "sbbq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rax", "rdx", "cc"); + #endif + *acc = (((__uint128_t)(hi))<<64) | lo; + +} + +static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t c,d, lo = *acc, hi = *acc>>64; + __asm__ volatile + ("movq %[a], %%rdx; " + "mulx %[b], %[c], %[d]; " + "subq %[lo], %[c]; " + "sbbq %[hi], %[d]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + *acc = (((__uint128_t)(d))<<64) | c; +} + +static __inline__ uint64_t is_zero(uint64_t x) { + __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x)); 
+    return ~x;
+}
+
+static inline uint64_t shrld(__uint128_t x, int n) {
+    return x>>n;
+}
+
+#endif /* __ARCH_X86_64_ARCH_INTRINSICS_H__ */
diff --git a/src/include/word.h b/src/include/word.h
index 0ba17ee..b44a92e 100644
--- a/src/include/word.h
+++ b/src/include/word.h
@@ -9,6 +9,7 @@
 #define _XOPEN_SOURCE 600
 #include "arch_config.h"
+#include "arch_intrinsics.h"
 #include <stdint.h>
@@ -32,7 +33,6 @@
 #endif
 
 #if (WORD_BITS == 64)
-    typedef uint32_t hword_t;
     typedef uint64_t word_t, mask_t;
     typedef __uint128_t dword_t;
     typedef int32_t hsword_t;
@@ -50,7 +50,6 @@
 #define letohWORD letoh64
 #define SC_LIMB(x) (x##ull)
 #elif (WORD_BITS == 32)
-    typedef uint16_t hword_t;
     typedef uint32_t word_t, mask_t;
     typedef uint64_t dword_t;
     typedef int16_t hsword_t;
diff --git a/src/p25519/arch_ref64/f_impl.c b/src/p25519/arch_ref64/f_impl.c
index 8f24012..7afd485 100644
--- a/src/p25519/arch_ref64/f_impl.c
+++ b/src/p25519/arch_ref64/f_impl.c
@@ -16,12 +16,7 @@ static __inline__ uint64_t is_zero(uint64_t a) {
     return (((__uint128_t)a)-1)>>64;
 }
 
-void
-gf_25519_mul (
-    gf_25519_t __restrict__ cs,
-    const gf_25519_t as,
-    const gf_25519_t bs
-) {
+void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
 
     uint64_t bh[4];
@@ -51,12 +46,7 @@ gf_25519_mul (
     c[1] += accum;
 }
 
-void
-gf_25519_mulw (
-    gf_25519_t __restrict__ cs,
-    const gf_25519_t as,
-    uint64_t b
-) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
     const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
 
     int i;
@@ -78,18 +68,11 @@ gf_25519_mulw (
     c[1] += accum;
 }
 
-void
-gf_25519_sqr (
-    gf_25519_t __restrict__ cs,
-    const gf_25519_t as
-) {
-    gf_25519_mul(cs,as,as); // PERF
+void gf_sqr (gf_s *__restrict__ cs, const gf as) {
+    gf_mul(cs,as,as); // PERF
 }
 
-void
-gf_25519_strong_reduce (
-    gf_25519_t a
-) {
+void gf_strong_reduce (gf a) {
     uint64_t mask = (1ull<<51)-1;
 
     /* first, clear high */
@@ -127,15 +110,11 @@ gf_25519_strong_reduce (
     assert(is_zero(carry + scarry));
 }
 
-void
-gf_25519_serialize (
-    uint8_t serial[32],
-    const struct gf_25519_t x
-) {
+void gf_serialize (uint8_t serial[32], const struct gf x) {
     int i,j;
-    gf_25519_t red;
-    gf_25519_copy(&red, x);
-    gf_25519_strong_reduce(&red);
+    gf red;
+    gf_copy(&red, x);
+    gf_strong_reduce(&red);
     uint64_t *r = red.limb;
     uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
     for (i=0; i<4; i++) {
@@ -146,11 +125,7 @@ gf_25519_serialize (
     }
 }
 
-mask_t
-gf_25519_deserialize (
-    gf_25519_t x,
-    const uint8_t serial[32]
-) {
+mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
     int i,j;
     uint64_t ser64[4], mask = ((1ull<<51)-1);
     for (i=0; i<4; i++) {
diff --git a/src/p25519/arch_x86_64/f_impl.c b/src/p25519/arch_x86_64/f_impl.c
index 377252c..168dbd5 100644
--- a/src/p25519/arch_x86_64/f_impl.c
+++ b/src/p25519/arch_x86_64/f_impl.c
@@ -3,18 +3,8 @@
  */
 
 #include "f_field.h"
-#include "x86-64-arith.h"
 
-static inline uint64_t shr(__uint128_t x, int n) {
-    return x>>n;
-}
-
-void
-gf_25519_mul (
-    gf_25519_s *__restrict__ cs,
-    const gf_25519_t as,
-    const gf_25519_t bs
-) {
+void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
     uint64_t *c = cs->limb;
@@ -48,12 +38,12 @@
     mac_rm(&accum2, ai, &b[3]);
 
     uint64_t c0 = accum0 & mask;
-    accum1 += shr(accum0, 51);
+    accum1 += shrld(accum0, 51);
     uint64_t c1 = accum1 & mask;
-    accum2 += shr(accum1, 51);
+    accum2 += shrld(accum1, 51);
     c[2] = accum2 &
mask; - accum0 = shr(accum2, 51); + accum0 = shrld(accum2, 51); mac_rm(&accum0, ai, &b[4]); @@ -77,7 +67,7 @@ gf_25519_mul ( mac_rm(&accum1, ai, &b[0]); c[3] = accum0 & mask; - accum1 += shr(accum0, 51); + accum1 += shrld(accum0, 51); c[4] = accum1 & mask; /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 @@ -85,17 +75,13 @@ gf_25519_mul ( * PERF: good enough to fit into uint64_t? */ - uint64_t a1 = shr(accum1,51); + uint64_t a1 = shrld(accum1,51); accum1 = (__uint128_t)a1 * 19 + c0; c[0] = accum1 & mask; - c[1] = c1 + shr(accum1,51); + c[1] = c1 + shrld(accum1,51); } -void -gf_25519_sqr ( - gf_25519_s *__restrict__ cs, - const gf_25519_t as -) { +void gf_sqr (gf_s *__restrict__ cs, const gf as) { const uint64_t *a = as->limb, mask = ((1ull<<51)-1); uint64_t *c = cs->limb; @@ -122,9 +108,9 @@ gf_25519_sqr ( mac_rm(&accum2, ai, &a[4]); uint64_t c0 = accum0 & mask; - accum1 += shr(accum0, 51); + accum1 += shrld(accum0, 51); uint64_t c1 = accum1 & mask; - accum2 += shr(accum1, 51); + accum2 += shrld(accum1, 51); c[2] = accum2 & mask; accum0 = accum2 >> 51; @@ -141,7 +127,7 @@ gf_25519_sqr ( mac_rr(&accum1, a[2], a[2]); c[3] = accum0 & mask; - accum1 += shr(accum0, 51); + accum1 += shrld(accum0, 51); c[4] = accum1 & mask; /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 @@ -149,51 +135,43 @@ gf_25519_sqr ( * PERF: good enough to fit into uint64_t? */ - uint64_t a1 = shr(accum1,51); + uint64_t a1 = shrld(accum1,51); accum1 = (__uint128_t)a1 * 19 + c0; c[0] = accum1 & mask; - c[1] = c1 + shr(accum1,51); + c[1] = c1 + shrld(accum1,51); } -void -gf_25519_mulw ( - gf_25519_s *__restrict__ cs, - const gf_25519_t as, - uint64_t b -) { +void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { const uint64_t *a = as->limb, mask = ((1ull<<51)-1); uint64_t *c = cs->limb; __uint128_t accum = widemul_rm(b, &a[0]); uint64_t c0 = accum & mask; - accum = shr(accum,51); + accum = shrld(accum,51); mac_rm(&accum, b, &a[1]); uint64_t c1 = accum & mask; - accum = shr(accum,51); + accum = shrld(accum,51); mac_rm(&accum, b, &a[2]); c[2] = accum & mask; - accum = shr(accum,51); + accum = shrld(accum,51); mac_rm(&accum, b, &a[3]); c[3] = accum & mask; - accum = shr(accum,51); + accum = shrld(accum,51); mac_rm(&accum, b, &a[4]); c[4] = accum & mask; - accum = shr(accum,51); + accum = shrld(accum,51); accum = accum * 19 + c0; c[0] = accum & mask; - c[1] = c1 + shr(accum,51); + c[1] = c1 + shrld(accum,51); } -void -gf_25519_strong_reduce ( - gf_25519_t a -) { +void gf_strong_reduce (gf a) { uint64_t mask = (1ull<<51)-1; /* first, clear high */ @@ -231,15 +209,11 @@ gf_25519_strong_reduce ( assert(is_zero(carry + scarry)); } -void -gf_25519_serialize ( - uint8_t serial[32], - const gf_25519_t x -) { +void gf_serialize (uint8_t serial[32], const gf x) { int i,j; - gf_25519_t red; - gf_25519_copy(red, x); - gf_25519_strong_reduce(red); + gf red; + gf_copy(red, x); + gf_strong_reduce(red); uint64_t *r = red->limb; uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12}; for (i=0; i<4; i++) { @@ -250,11 +224,7 @@ gf_25519_serialize ( } } -mask_t -gf_25519_deserialize ( - gf_25519_t x, - const uint8_t serial[32] -) { +mask_t gf_deserialize (gf x, const uint8_t serial[32]) { int i,j; uint64_t ser64[4], mask = ((1ull<<51)-1); for (i=0; i<4; i++) { diff --git a/src/p25519/arch_x86_64/x86-64-arith.h b/src/p25519/arch_x86_64/x86-64-arith.h deleted file mode 100644 index 00fcc1e..0000000 --- a/src/p25519/arch_x86_64/x86-64-arith.h +++ /dev/null @@ -1,323 +0,0 @@ -/* Copyright (c) 2014 Cryptography Research, 
Inc.
- * Released under the MIT License. See LICENSE.txt for license information.
- */
-
-#ifndef __X86_64_ARITH_H__
-#define __X86_64_ARITH_H__
-
-#include <stdint.h>
-
-/* TODO: non x86-64 versions of these.
- * FUTURE: autogenerate
- */
-
-static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax;"
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx;"
-             "mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx");
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax;"
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"r"(a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"d"(a));
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"r"(b), "a"(a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"r"(b), [a]"d"(a));
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax; "
-             "addq %%rax, %%rax; "
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx;"
-             "leaq (,%%rdx,2), %%rdx;"
-             "mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx");
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
-    uint64_t lo = *acc, hi = *acc>>64;
-
-    #ifdef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx; "
-             "mulx %[b], %[c], %[d]; "
-             "addq %[c], %[lo]; "
-             "adcq %[d], %[hi]; "
-             : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx", "cc");
-    #else
-        __asm__ volatile
-            ("movq %[a], %%rax; "
-             "mulq %[b]; "
-             "addq %%rax, %[lo]; "
-             "adcq %%rdx, %[hi]; "
-             : [lo]"+r"(lo), [hi]"+r"(hi)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rax", "rdx", "cc");
-    #endif
-
-    *acc = (((__uint128_t)(hi))<<64) | lo;
-}
-
-static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
-    uint64_t lo = *acc, hi = *acc>>64;
-    uint64_t lo2 = *acc2, hi2 = *acc2>>64;
-
-    #ifdef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx; "
-             "mulx %[b], %[c], %[d]; "
-             "addq %[c], %[lo]; "
-             "adcq %[d], %[hi]; "
-             "addq %[c], %[lo2]; "
-             "adcq %[d], %[hi2]; "
-             : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx", "cc");
-    #else
-        __asm__ volatile
-            ("movq %[a], %%rax; "
-             "mulq %[b]; "
-             "addq %%rax, %[lo]; "
-             "adcq %%rdx, %[hi]; "
-             "addq %%rax, %[lo2]; "
-             "adcq %%rdx, %[hi2]; "
-             : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rax", "rdx", "cc");
-    
#endif - - *acc = (((__uint128_t)(hi))<<64) | lo; - *acc2 = (((__uint128_t)(hi2))<<64) | lo2; -} - -static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"d"(a) - : "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"r"(a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"r"(b), [a]"d"(a) - : "cc"); - #else - __asm__ volatile - ("mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"r"(b), "a"(a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "addq %%rdx, %%rdx; " - "mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "addq %%rax, %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[c], %[lo]; " - "sbbq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "subq %%rax, %[lo]; " - "sbbq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "addq %%rdx, %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[c], %[lo]; " - "sbbq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "addq %%rax, %%rax; " - "mulq %[b]; " - "subq %%rax, %[lo]; " - "sbbq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - *acc = (((__uint128_t)(hi))<<64) | lo; - -} - -static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t c,d, lo = *acc, hi = *acc>>64; - __asm__ volatile - ("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[lo], %[c]; " - "sbbq %[hi], %[d]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : 
[b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - *acc = (((__uint128_t)(d))<<64) | c; -} - -static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) { - return ((__uint128_t)(a)) * b; -} - -static __inline__ __int128_t widemuls(int64_t a, int64_t b) { - return ((__int128_t)(a)) * b; -} - -static __inline__ uint64_t opacify(uint64_t x) { - __asm__ volatile("" : "+r"(x)); - return x; -} - -static __inline__ mask_t is_zero(uint64_t x) { - __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x)); - return ~x; -} - -#endif /* __X86_64_ARITH_H__ */ diff --git a/src/p448/arch_32/f_impl.c b/src/p448/arch_32/f_impl.c index bd900c6..739b1fb 100644 --- a/src/p448/arch_32/f_impl.c +++ b/src/p448/arch_32/f_impl.c @@ -4,28 +4,20 @@ #include "f_field.h" -static inline mask_t __attribute__((always_inline)) -is_zero ( - word_t x -) { +static inline mask_t is_zero (word_t x) { dword_t xx = x; xx--; return xx >> WORD_BITS; } -static uint64_t widemul_32 ( +static uint64_t widemul ( const uint32_t a, const uint32_t b ) { return ((uint64_t)a)* b; } -void -gf_448_mul ( - gf_448_s *__restrict__ cs, - const gf_448_t as, - const gf_448_t bs -) { +void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { const uint32_t *a = as->limb, *b = bs->limb; uint32_t *c = cs->limb; @@ -44,9 +36,9 @@ gf_448_mul ( accum2 = 0; for (i=0; i<=j; i++) { - accum2 += widemul_32(a[j-i],b[i]); - accum1 += widemul_32(aa[j-i],bb[i]); - accum0 += widemul_32(a[8+j-i], b[8+i]); + accum2 += widemul(a[j-i],b[i]); + accum1 += widemul(aa[j-i],bb[i]); + accum0 += widemul(a[8+j-i], b[8+i]); } accum1 -= accum2; @@ -54,9 +46,9 @@ gf_448_mul ( accum2 = 0; for (; i<8; i++) { - accum0 -= widemul_32(a[8+j-i], b[i]); - accum2 += widemul_32(aa[8+j-i], bb[i]); - accum1 += widemul_32(a[16+j-i], b[8+i]); + accum0 -= widemul(a[8+j-i], b[i]); + accum2 += widemul(aa[8+j-i], bb[i]); + accum1 += widemul(a[16+j-i], b[8+i]); } accum1 += accum2; @@ -81,12 +73,7 @@ gf_448_mul ( c[1] += ((uint32_t)(accum1)); } -void -gf_448_mulw ( - gf_448_s *__restrict__ cs, - const gf_448_t as, - uint64_t b -) { +void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { const uint32_t bhi = b>>28, blo = b & ((1<<28)-1); const uint32_t *a = as->limb; @@ -97,20 +84,20 @@ gf_448_mulw ( int i; - accum0 = widemul_32(blo, a[0]); - accum8 = widemul_32(blo, a[8]); - accum0 += widemul_32(bhi, a[15]); - accum8 += widemul_32(bhi, a[15] + a[7]); + accum0 = widemul(blo, a[0]); + accum8 = widemul(blo, a[8]); + accum0 += widemul(bhi, a[15]); + accum8 += widemul(bhi, a[15] + a[7]); c[0] = accum0 & mask; accum0 >>= 28; c[8] = accum8 & mask; accum8 >>= 28; for (i=1; i<8; i++) { - accum0 += widemul_32(blo, a[i]); - accum8 += widemul_32(blo, a[i+8]); + accum0 += widemul(blo, a[i]); + accum8 += widemul(blo, a[i+8]); - accum0 += widemul_32(bhi, a[i-1]); - accum8 += widemul_32(bhi, a[i+7]); + accum0 += widemul(bhi, a[i-1]); + accum8 += widemul(bhi, a[i+7]); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; @@ -125,18 +112,11 @@ gf_448_mulw ( c[1] += accum8 >> 28; } -void -gf_448_sqr ( - gf_448_s *__restrict__ cs, - const gf_448_t as -) { - gf_448_mul(cs,as,as); /* PERF */ +void gf_sqr (gf_s *__restrict__ cs, const gf as) { + gf_mul(cs,as,as); /* PERF */ } -void -gf_448_strong_reduce ( - gf_448_t a -) { +void gf_strong_reduce (gf a) { word_t mask = (1ull<<28)-1; /* first, clear high */ @@ -176,15 +156,11 @@ gf_448_strong_reduce ( assert(is_zero(carry + scarry)); } -void -gf_448_serialize ( - uint8_t *serial, - const gf_448_t x -) { +void gf_serialize (uint8_t *serial, const 
gf x) { int i,j; - gf_448_t red; - gf_448_copy(red, x); - gf_448_strong_reduce(red); + gf red; + gf_copy(red, x); + gf_strong_reduce(red); for (i=0; i<8; i++) { uint64_t limb = red->limb[2*i] + (((uint64_t)red->limb[2*i+1])<<28); for (j=0; j<7; j++) { @@ -195,11 +171,7 @@ gf_448_serialize ( } } -mask_t -gf_448_deserialize ( - gf_448_t x, - const uint8_t serial[56] -) { +mask_t gf_deserialize (gf x, const uint8_t serial[56]) { int i,j; for (i=0; i<8; i++) { uint64_t out = 0; diff --git a/src/p448/arch_arm_32/f_impl.c b/src/p448/arch_arm_32/f_impl.c index ea831f3..62eda0f 100644 --- a/src/p448/arch_arm_32/f_impl.c +++ b/src/p448/arch_arm_32/f_impl.c @@ -4,16 +4,13 @@ #include "f_field.h" -static inline mask_t __attribute__((always_inline)) -is_zero ( - word_t x -) { +static inline mask_t is_zero (word_t x) { dword_t xx = x; xx--; return xx >> WORD_BITS; } -static uint64_t widemul_32 ( +static uint64_t widemul ( const uint32_t a, const uint32_t b ) { @@ -97,12 +94,7 @@ smull2 ( #endif } -void -gf_448_mul ( - gf_448_s *__restrict__ cs, - const gf_448_t as, - const gf_448_t bs -) { +void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { const uint32_t *a = as->limb, *b = bs->limb; uint32_t *c = cs->limb; @@ -448,11 +440,7 @@ gf_448_mul ( c[1] += ((uint32_t)(accum1)); } -void -gf_448_sqr ( - gf_448_s *__restrict__ cs, - const gf_448_t as -) { +void gf_sqr (gf_s *__restrict__ cs, const gf as) { const uint32_t *a = as->limb; uint32_t *c = cs->limb; @@ -746,10 +734,9 @@ gf_448_sqr ( c[1] += ((uint32_t)(accum1)); } -void -gf_448_mulw ( - gf_448_s *__restrict__ cs, - const gf_448_t as, +void gf_mulw ( + gf_s *__restrict__ cs, + const gf as, uint64_t b ) { uint32_t mask = (1ull<<28)-1; @@ -763,8 +750,8 @@ gf_448_mulw ( int i; uint32_t c0, c8, n0, n8; - accum0 = widemul_32(bhi, a[15]); - accum8 = widemul_32(bhi, a[15] + a[7]); + accum0 = widemul(bhi, a[15]); + accum8 = widemul(bhi, a[15] + a[7]); c0 = a[0]; c8 = a[8]; smlal(&accum0, blo, c0); smlal(&accum8, blo, c8); @@ -860,9 +847,8 @@ gf_448_mulw ( c[1] += accum8 >> 28; } -void -gf_448_strong_reduce ( - gf_448_t a +void gf_strong_reduce ( + gf a ) { word_t mask = (1ull<<28)-1; @@ -903,15 +889,14 @@ gf_448_strong_reduce ( assert(is_zero(carry + scarry)); } -void -gf_448_serialize ( +void gf_serialize ( uint8_t *serial, - const gf_448_t x + const gf x ) { int i,j; - gf_448_t red; - gf_448_copy(red, x); - gf_448_strong_reduce(red); + gf red; + gf_copy(red, x); + gf_strong_reduce(red); for (i=0; i<8; i++) { uint64_t limb = red->limb[2*i] + (((uint64_t)red->limb[2*i+1])<<28); for (j=0; j<7; j++) { @@ -923,8 +908,8 @@ gf_448_serialize ( } mask_t -gf_448_deserialize ( - gf_448_t x, +gf_deserialize ( + gf x, const uint8_t serial[56] ) { int i,j; diff --git a/src/p448/arch_neon_experimental/f_impl.c b/src/p448/arch_neon_experimental/f_impl.c index 002ef40..1225f5e 100644 --- a/src/p448/arch_neon_experimental/f_impl.c +++ b/src/p448/arch_neon_experimental/f_impl.c @@ -67,12 +67,7 @@ smull2 ( *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; } -void -gf_448_mul ( - gf_448_s *__restrict__ cs, - const gf_448_t as, - const gf_448_t bs -) { +void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { #define _bl0 "q0" #define _bl0_0 "d0" #define _bl0_1 "d1" @@ -366,11 +361,7 @@ gf_448_mul ( ); } -void -gf_448_sqr ( - gf_448_s *__restrict__ cs, - const gf_448_t bs -) { +void gf_sqr (gf_s *__restrict__ cs, const gf bs) { int32x2_t *vc = (int32x2_t*) cs->limb; __asm__ __volatile__ ( @@ -567,12 +558,7 @@ gf_448_sqr ( ); } -void -gf_448_mulw ( - 
gf_448_s *__restrict__ cs, - const gf_448_t as, - uint64_t b -) { +void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1}; uint64x2_t accum; @@ -618,10 +604,7 @@ gf_448_mulw ( } /* PERF: vectorize? */ -void -gf_448_strong_reduce ( - gf_448_t a -) { +void gf_strong_reduce (gf a) { word_t mask = (1ull<<28)-1; /* first, clear high */ @@ -661,15 +644,11 @@ gf_448_strong_reduce ( assert(is_zero(carry + scarry)); } -void -gf_448_serialize ( - uint8_t *serial, - const gf_448_t x -) { +void gf_serialize (uint8_t *serial, const gf x) { int i,j; - gf_448_t red; - gf_448_copy(red, x); - gf_448_strong_reduce(red); + gf red; + gf_copy(red, x); + gf_strong_reduce(red); for (i=0; i<8; i++) { uint64_t limb = red->limb[LIMBPERM(2*i)] + (((uint64_t)red->limb[LIMBPERM(2*i+1)])<<28); @@ -681,11 +660,7 @@ gf_448_serialize ( } } -mask_t -gf_448_deserialize ( - gf_448_t x, - const uint8_t serial[56] -) { +mask_t gf_deserialize (gf x, const uint8_t serial[56]) { int i,j; for (i=0; i<8; i++) { uint64_t out = 0; diff --git a/src/p448/arch_ref64/f_impl.c b/src/p448/arch_ref64/f_impl.c index 88bef61..74aeeb1 100644 --- a/src/p448/arch_ref64/f_impl.c +++ b/src/p448/arch_ref64/f_impl.c @@ -16,12 +16,7 @@ static __inline__ uint64_t is_zero(uint64_t a) { return (((__uint128_t)a)-1)>>64; } -void -gf_448_mul ( - gf_448_s *__restrict__ cs, - const gf_448_t as, - const gf_448_t bs -) { +void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { const uint64_t *a = as->limb, *b = bs->limb; uint64_t *c = cs->limb; @@ -182,12 +177,7 @@ gf_448_mul ( c[1] += ((uint64_t)(accum1)); } -void -gf_448_mulw ( - gf_448_s *__restrict__ cs, - const gf_448_t as, - uint64_t b -) { +void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { const uint64_t *a = as->limb; uint64_t *c = cs->limb; @@ -211,11 +201,7 @@ gf_448_mulw ( c[1] += accum4 >> 56; } -void -gf_448_sqr ( - gf_448_s *__restrict__ cs, - const gf_448_t as -) { +void gf_sqr (gf_s *__restrict__ cs, const gf as) { const uint64_t *a = as->limb; uint64_t *c = cs->limb; @@ -326,10 +312,7 @@ gf_448_sqr ( c[0] += ((uint64_t)(accum1)); } -void -gf_448_strong_reduce ( - gf_448_t a -) { +void gf_strong_reduce (gf a) { uint64_t mask = (1ull<<56)-1; /* first, clear high */ @@ -369,15 +352,11 @@ gf_448_strong_reduce ( assert(is_zero(carry + scarry)); } -void -gf_448_serialize ( - uint8_t *serial, - const gf_448_t x -) { +void gf_serialize (uint8_t *serial, const gf x) { int i,j; - gf_448_t red; - gf_448_copy(red, x); - gf_448_strong_reduce(red); + gf red; + gf_copy(red, x); + gf_strong_reduce(red); for (i=0; i<8; i++) { for (j=0; j<7; j++) { serial[7*i+j] = red->limb[i]; @@ -387,11 +366,7 @@ gf_448_serialize ( } } -mask_t -gf_448_deserialize ( - gf_448_t x, - const uint8_t serial[56] -) { +mask_t gf_deserialize (gf x, const uint8_t serial[56]) { int i,j; for (i=0; i<8; i++) { uint64_t out = 0; diff --git a/src/p448/arch_x86_64/f_impl.c b/src/p448/arch_x86_64/f_impl.c index 9c02d84..07744fa 100644 --- a/src/p448/arch_x86_64/f_impl.c +++ b/src/p448/arch_x86_64/f_impl.c @@ -3,14 +3,8 @@ */ #include "f_field.h" -#include "x86-64-arith.h" - -void -gf_448_mul ( - gf_448_s *__restrict__ cs, - const gf_448_t as, - const gf_448_t bs -) { + +void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { const uint64_t *a = as->limb, *b = bs->limb; uint64_t *c = cs->limb; @@ -145,12 +139,7 @@ gf_448_mul ( c[0] += ((uint64_t)(accum1)); } -void -gf_448_mulw ( - gf_448_s *__restrict__ cs, - const gf_448_t as, - uint64_t b -) { +void gf_mulw 
(gf_s *__restrict__ cs, const gf as, uint64_t b) {
     const uint64_t *a = as->limb;
     uint64_t *c = cs->limb;
@@ -190,11 +179,7 @@ gf_448_mulw (
     c[1] += accum4 >> 56;
 }
 
-void
-gf_448_sqr (
-    gf_448_s *__restrict__ cs,
-    const gf_448_t as
-) {
+void gf_sqr (gf_s *__restrict__ cs, const gf as) {
     const uint64_t *a = as->limb;
     uint64_t *c = cs->limb;
@@ -305,10 +290,7 @@ gf_448_sqr (
     c[0] += ((uint64_t)(accum1));
 }
 
-void
-gf_448_strong_reduce (
-    gf_448_t a
-) {
+void gf_strong_reduce (gf a) {
     uint64_t mask = (1ull<<56)-1;
 
     /* first, clear high */
@@ -348,15 +330,11 @@ gf_448_strong_reduce (
     assert(is_zero(carry + scarry));
 }
 
-void
-gf_448_serialize (
-    uint8_t *serial,
-    const gf_448_t x
-) {
+void gf_serialize (uint8_t *serial, const gf x) {
     int i,j;
-    gf_448_t red;
-    gf_448_copy(red, x);
-    gf_448_strong_reduce(red);
+    gf red;
+    gf_copy(red, x);
+    gf_strong_reduce(red);
     for (i=0; i<8; i++) {
         for (j=0; j<7; j++) {
             serial[7*i+j] = red->limb[i];
@@ -366,11 +344,7 @@ gf_448_serialize (
     }
 }
 
-mask_t
-gf_448_deserialize (
-    gf_448_t x,
-    const uint8_t serial[56]
-) {
+mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
     int i,j;
     for (i=0; i<8; i++) {
         word_t out = 0;
diff --git a/src/p448/arch_x86_64/x86-64-arith.h b/src/p448/arch_x86_64/x86-64-arith.h
deleted file mode 100644
index 4f38723..0000000
--- a/src/p448/arch_x86_64/x86-64-arith.h
+++ /dev/null
@@ -1,323 +0,0 @@
-/* Copyright (c) 2014 Cryptography Research, Inc.
- * Released under the MIT License. See LICENSE.txt for license information.
- */
-
-#ifndef __X86_64_ARITH_H__
-#define __X86_64_ARITH_H__
-
-#include <stdint.h>
-
-/* FUTURE: non x86-64 versions of these.
- * FUTURE: autogenerate
- */
-
-static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax;"
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx;"
-             "mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx");
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax;"
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"r"(a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"d"(a));
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"r"(b), "a"(a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"r"(b), [a]"d"(a));
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax; "
-             "addq %%rax, %%rax; "
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx;"
-             "leaq (,%%rdx,2), %%rdx;"
-             "mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx");
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - uint64_t lo2 = *acc2, hi2 = *acc2>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - "addq %[c], %[lo2]; " - "adcq %[d], %[hi2]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - "addq %%rax, %[lo2]; " - "adcq %%rdx, %[hi2]; " - : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; - *acc2 = (((__uint128_t)(hi2))<<64) | lo2; -} - -static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"d"(a) - : "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"r"(a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"r"(b), [a]"d"(a) - : "cc"); - #else - __asm__ volatile - ("mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"r"(b), "a"(a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "addq %%rdx, %%rdx; " - "mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "addq %%rax, %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - 
("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[c], %[lo]; " - "sbbq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "subq %%rax, %[lo]; " - "sbbq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "addq %%rdx, %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[c], %[lo]; " - "sbbq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "addq %%rax, %%rax; " - "mulq %[b]; " - "subq %%rax, %[lo]; " - "sbbq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - *acc = (((__uint128_t)(hi))<<64) | lo; - -} - -static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t c,d, lo = *acc, hi = *acc>>64; - __asm__ volatile - ("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[lo], %[c]; " - "sbbq %[hi], %[d]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - *acc = (((__uint128_t)(d))<<64) | c; -} - -static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) { - return ((__uint128_t)(a)) * b; -} - -static __inline__ __int128_t widemuls(int64_t a, int64_t b) { - return ((__int128_t)(a)) * b; -} - -static __inline__ uint64_t opacify(uint64_t x) { - __asm__ volatile("" : "+r"(x)); - return x; -} - -static __inline__ mask_t is_zero(uint64_t x) { - __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x)); - return ~x; -} - -#endif /* __X86_64_ARITH_H__ */ diff --git a/src/p480/arch_x86_64/f_impl.c b/src/p480/arch_x86_64/f_impl.c index 7aea1f0..b3c565b 100644 --- a/src/p480/arch_x86_64/f_impl.c +++ b/src/p480/arch_x86_64/f_impl.c @@ -4,12 +4,7 @@ #include "f_field.h" -void -gf_480_mul ( - gf_480_t *__restrict__ cs, - const gf_480_t *as, - const gf_480_t *bs -) { +void gf_mul (gf *__restrict__ cs, const gf *as, const gf *bs) { const uint64_t *a = as->limb, *b = bs->limb; uint64_t *c = cs->limb; @@ -144,12 +139,7 @@ gf_480_mul ( c[0] += ((uint64_t)(accum1)); } -void -gf_480_mulw ( - gf_480_t *__restrict__ cs, - const gf_480_t *as, - uint64_t b -) { +void gf_mulw (gf *__restrict__ cs, const gf *as, uint64_t b) { const uint64_t *a = as->limb; uint64_t *c = cs->limb; @@ -189,11 +179,7 @@ gf_480_mulw ( c[1] += accum4 >> 60; } -void -gf_480_sqr ( - gf_480_t *__restrict__ cs, - const gf_480_t *as -) { +void gf_sqr (gf *__restrict__ cs, const gf *as) { const uint64_t *a = as->limb; uint64_t *c = cs->limb; @@ -304,10 +290,7 @@ gf_480_sqr ( c[0] += ((uint64_t)(accum1)); } -void -gf_480_strong_reduce ( - gf_480_t *a -) { +void gf_strong_reduce (gf *a) { uint64_t mask = (1ull<<60)-1; /* first, clear high */ @@ -347,15 +330,11 @@ gf_480_strong_reduce ( assert(is_zero(carry + scarry)); } -void -gf_480_serialize ( - uint8_t *serial, - const struct gf_480_t *x -) { +void gf_serialize (uint8_t *serial, const struct gf *x) { int i,j,k=0; - gf_480_t red; - gf_480_copy(&red, x); - gf_480_strong_reduce(&red); + gf red; + gf_copy(&red, x); + gf_strong_reduce(&red); word_t r = 0; for (i=0; i<8; i+=2) { r = red.limb[i]; @@ -373,11 
+352,7 @@ gf_480_serialize (
     }
 }
 
-mask_t
-gf_480_deserialize (
-    gf_480_t *x,
-    const uint8_t serial[60]
-) {
+mask_t gf_deserialize (gf *x, const uint8_t serial[60]) {
     int i,j,k=0;
 
     for (i=0; i<8; i+=2) {
diff --git a/src/p480/arch_x86_64/x86-64-arith.h b/src/p480/arch_x86_64/x86-64-arith.h
deleted file mode 100644
index a4d40da..0000000
--- a/src/p480/arch_x86_64/x86-64-arith.h
+++ /dev/null
@@ -1,275 +0,0 @@
-/* Copyright (c) 2014 Cryptography Research, Inc.
- * Released under the MIT License. See LICENSE.txt for license information.
- */
-
-#ifndef __X86_64_ARITH_H__
-#define __X86_64_ARITH_H__
-
-#include <stdint.h>
-
-static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax;"
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx;"
-             "mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx");
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax;"
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"r"(a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"d"(a));
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax; "
-             "addq %%rax, %%rax; "
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx;"
-             "leaq (,%%rdx,2), %%rdx;"
-             "mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx");
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
-    uint64_t lo = *acc, hi = *acc>>64;
-
-    #ifdef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx; "
-             "mulx %[b], %[c], %[d]; "
-             "addq %[c], %[lo]; "
-             "adcq %[d], %[hi]; "
-             : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx", "cc");
-    #else
-        __asm__ volatile
-            ("movq %[a], %%rax; "
-             "mulq %[b]; "
-             "addq %%rax, %[lo]; "
-             "adcq %%rdx, %[hi]; "
-             : [lo]"+r"(lo), [hi]"+r"(hi)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rax", "rdx", "cc");
-    #endif
-
-    *acc = (((__uint128_t)(hi))<<64) | lo;
-}
-
-static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
-    uint64_t lo = *acc, hi = *acc>>64;
-    uint64_t lo2 = *acc2, hi2 = *acc2>>64;
-
-    #ifdef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx; "
-             "mulx %[b], %[c], %[d]; "
-             "addq %[c], %[lo]; "
-             "adcq %[d], %[hi]; "
-             "addq %[c], %[lo2]; "
-             "adcq %[d], %[hi2]; "
-             : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx", "cc");
-    #else
-        __asm__ volatile
-            ("movq %[a], %%rax; "
-             "mulq %[b]; "
-             "addq %%rax, %[lo]; "
-             "adcq %%rdx, %[hi]; "
-             "addq %%rax, %[lo2]; "
-             "adcq %%rdx, %[hi2]; "
-             : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rax", "rdx", "cc");
-    #endif
-
-    *acc = 
(((__uint128_t)(hi))<<64) | lo; - *acc2 = (((__uint128_t)(hi2))<<64) | lo2; -} - -static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"d"(a) - : "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"r"(a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "addq %%rdx, %%rdx; " - "mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "addq %%rax, %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[c], %[lo]; " - "sbbq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "subq %%rax, %[lo]; " - "sbbq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "addq %%rdx, %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[c], %[lo]; " - "sbbq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "addq %%rax, %%rax; " - "mulq %[b]; " - "subq %%rax, %[lo]; " - "sbbq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - *acc = (((__uint128_t)(hi))<<64) | lo; - -} - -static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t c,d, lo = *acc, hi = *acc>>64; - __asm__ volatile - ("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[lo], %[c]; " - "sbbq %[hi], %[d]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - *acc = (((__uint128_t)(d))<<64) | c; -} - -static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) { - return ((__uint128_t)(a)) * b; -} - -static __inline__ __int128_t widemuls(int64_t a, int64_t b) { - return ((__int128_t)(a)) * b; -} - -static __inline__ uint64_t opacify(uint64_t x) { - __asm__ volatile("" : "+r"(x)); - return x; -} - -static __inline__ mask_t is_zero(uint64_t x) { - __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x)); - return ~x; -} - -#endif /* __X86_64_ARITH_H__ */ diff --git a/src/p521/arch_ref64/f_impl.c 
b/src/p521/arch_ref64/f_impl.c index 8670cd6..03c98ee 100644 --- a/src/p521/arch_ref64/f_impl.c +++ b/src/p521/arch_ref64/f_impl.c @@ -16,12 +16,7 @@ static __inline__ uint64_t is_zero(uint64_t a) { return (((__uint128_t)a)-1)>>64; } -void -gf_521_mul ( - gf_521_t *__restrict__ cs, - const gf_521_t *as, - const gf_521_t *bs -) { +void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { uint64_t *c = cs->limb; const uint64_t *a = as->limb, *b = bs->limb; __uint128_t accum0, accum1; @@ -157,10 +152,9 @@ gf_521_mul ( c[8] += accum1 >> 58; } -void -gf_521_mulw ( - gf_521_t *__restrict__ cs, - const gf_521_t *as, +void gf_mulw ( + gf_s *__restrict__ cs, + const gf as, uint64_t b ) { const uint64_t *a = as->limb; @@ -196,11 +190,7 @@ gf_521_mulw ( c[1] += accum6 >> 58; } -void -gf_521_sqr ( - gf_521_t *__restrict__ cs, - const gf_521_t *as -) { +void gf_sqr (gf_s *__restrict__ cs, const gf as) { uint64_t *c = cs->limb; const uint64_t *a = as->limb; __uint128_t accum0, accum1; @@ -305,10 +295,7 @@ gf_521_sqr ( c[8] += accum1 >> 58; } -void -gf_521_strong_reduce ( - gf_521_t *a -) { +void gf_strong_reduce (gf a) { uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1; /* first, clear high */ @@ -346,15 +333,11 @@ gf_521_strong_reduce ( assert(is_zero(carry + scarry)); } -void -gf_521_serialize ( - uint8_t *serial, - const struct gf_521_t *x -) { +void gf_serialize (uint8_t *serial, const struct gf x) { int i,k=0; - gf_521_t red; - gf_521_copy(&red, x); - gf_521_strong_reduce(&red); + gf red; + gf_copy(&red, x); + gf_strong_reduce(&red); uint64_t r=0; int bits = 0; @@ -370,11 +353,7 @@ gf_521_serialize ( serial[k++] = r; } -mask_t -gf_521_deserialize ( - gf_521_t *x, - const uint8_t serial[66] -) { +mask_t gf_deserialize (gf x, const uint8_t serial[66]) { int i,k=0,bits=0; __uint128_t out = 0; uint64_t mask = (1ull<<58)-1; diff --git a/src/p521/arch_x86_64_r12/f_impl.c b/src/p521/arch_x86_64_r12/f_impl.c index 0b42a4b..39d0f1e 100644 --- a/src/p521/arch_x86_64_r12/f_impl.c +++ b/src/p521/arch_x86_64_r12/f_impl.c @@ -167,12 +167,7 @@ static inline void hexad_sqr_signed ( -void -gf_521_mul ( - gf_521_t *__restrict__ cs, - const gf_521_t *as, - const gf_521_t *bs -) { +void gf_mul (gf *__restrict__ cs, const gf *as, const gf *bs) { int i; #if 0 @@ -253,13 +248,7 @@ gf_521_mul ( } -void -gf_521_sqr ( - gf_521_t *__restrict__ cs, - const gf_521_t *as -) { - - +void gf_sqr (gf *__restrict__ cs, const gf *as) { int i; #if 0 assert(as->limb[3] == 0 && as->limb[7] == 0 && as->limb[11] == 0); @@ -312,15 +301,7 @@ gf_521_sqr ( *(uint64x4_t *)&c[8] = out2; } -void -gf_521_mulw ( - gf_521_t *__restrict__ cs, - const gf_521_t *as, - uint64_t b -) { - - - +void gf_mulw (gf *__restrict__ cs, const gf *as, uint64_t b) { #if 0 int i; assert(as->limb[3] == 0 && as->limb[7] == 0 && as->limb[11] == 0); @@ -374,10 +355,7 @@ gf_521_mulw ( } -void -gf_521_strong_reduce ( - gf_521_t *a -) { +void gf_strong_reduce (gf *a) { uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1; /* first, clear high */ @@ -417,15 +395,11 @@ gf_521_strong_reduce ( a->limb[3] = a->limb[7] = a->limb[11] = 0; } -void -gf_521_serialize ( - uint8_t *serial, - const struct gf_521_t *x -) { +void gf_serialize (uint8_t *serial, const struct gf *x) { unsigned int i,k=0; - gf_521_t red; - gf_521_copy(&red, x); - gf_521_strong_reduce(&red); + gf red; + gf_copy(&red, x); + gf_strong_reduce(&red); uint64_t r=0; int bits = 0; @@ -441,11 +415,7 @@ gf_521_serialize ( serial[k++] = r; } -mask_t -gf_521_deserialize ( - gf_521_t *x, - const uint8_t 
serial[LIMBPERM(66)] -) { +mask_t gf_deserialize (gf *x, const uint8_t serial[LIMBPERM(66)]) { int i,k=0,bits=0; __uint128_t out = 0; uint64_t mask = (1ull<<58)-1;
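
Every f_impl.c rewritten above leans on the same pattern from the new header: widening 64x64->128 multiplies accumulated into a __uint128_t with mac(), with carries peeled off via shrld(). A minimal sketch of one such step, assuming a radix-2^51 limb layout like p25519's (this helper is illustrative only, not part of the patch; the header is found via the -I src/include/$(ARCH) paths added in the Makefile):

    #include <stdint.h>
    #include "arch_intrinsics.h"  /* widemul, mac, shrld from this patch */

    /* One schoolbook column: c[0] gets the low 51 bits of
     * a[0]*b[0] + a[1]*b[1]; c[1] gets the carry out. */
    static void column_sketch(uint64_t c[2], const uint64_t a[2], const uint64_t b[2]) {
        const uint64_t mask = (1ull<<51)-1;
        __uint128_t accum = widemul(&a[0], &b[0]); /* accum  = a[0]*b[0] */
        mac(&accum, &a[1], &b[1]);                 /* accum += a[1]*b[1] */
        c[0] = (uint64_t)accum & mask;             /* output limb */
        c[1] = shrld(accum, 51);                   /* carry into next column */
    }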