@@ -146,10 +146,14 @@ COMPONENTS_OF_$(1) = $$(BUILD_OBJ)/$(1)_impl.o $$(BUILD_OBJ)/$(1)_arithmetic.o | |||||
LIBCOMPONENTS += $$(COMPONENTS_OF_$(1)) | LIBCOMPONENTS += $$(COMPONENTS_OF_$(1)) | ||||
$$(BUILD_ASM)/$(1)_arithmetic.s: src/$(1)/f_arithmetic.c $$(HEADERS) | $$(BUILD_ASM)/$(1)_arithmetic.s: src/$(1)/f_arithmetic.c $$(HEADERS) | ||||
$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) -I $(BUILD_H)/$(1)/$(2) -S -c -o $$@ $$< | $$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \ | ||||
-I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \ | |||||
-S -c -o $$@ $$< | |||||
$$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$(2)/f_impl.c $$(HEADERS) | $$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$(2)/f_impl.c $$(HEADERS) | ||||
$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) -I $(BUILD_H)/$(1)/$(2) -S -c -o $$@ $$< | $$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \ | ||||
-I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \ | |||||
-S -c -o $$@ $$< | |||||
endef | endef | ||||
################################################################ | ################################################################ | ||||
@@ -166,18 +170,18 @@ $$(BUILD_C)/decaf_tables_$(1).c: $$(BUILD_IBIN)/decaf_gen_tables_$(1) | |||||
$$(BUILD_ASM)/decaf_tables_$(1).s: $$(BUILD_C)/decaf_tables_$(1).c $$(HEADERS) | $$(BUILD_ASM)/decaf_tables_$(1).s: $$(BUILD_C)/decaf_tables_$(1).c $$(HEADERS) | ||||
$$(CC) $$(CFLAGS) -S -c -o $$@ $$< \ | $$(CC) $$(CFLAGS) -S -c -o $$@ $$< \ | ||||
-I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \ | -I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) -I src/include/$$(ARCH_FOR_$(2)) \ | ||||
-I $(BUILD_H)/curve_$(1) -I $(BUILD_H)/$(2) -I $(BUILD_H)/$(2)/$$(ARCH_FOR_$(2)) | -I $(BUILD_H)/curve_$(1) -I $(BUILD_H)/$(2) -I $(BUILD_H)/$(2)/$$(ARCH_FOR_$(2)) | ||||
$$(BUILD_ASM)/decaf_gen_tables_$(1).s: src/decaf_gen_tables.c $$(HEADERS) | $$(BUILD_ASM)/decaf_gen_tables_$(1).s: src/decaf_gen_tables.c $$(HEADERS) | ||||
$$(CC) $$(CFLAGS) \ | $$(CC) $$(CFLAGS) \ | ||||
-I src/curve_$(1) -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \ | -I src/curve_$(1) -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) -I src/include/$$(ARCH_FOR_$(2)) \ | ||||
-I $(BUILD_H)/curve_$(1) -I $(BUILD_H)/$(2) -I $(BUILD_H)/$(2)/$$(ARCH_FOR_$(2)) \ | -I $(BUILD_H)/curve_$(1) -I $(BUILD_H)/$(2) -I $(BUILD_H)/$(2)/$$(ARCH_FOR_$(2)) \ | ||||
-S -c -o $$@ $$< | -S -c -o $$@ $$< | ||||
$$(BUILD_ASM)/decaf_$(1).s: src/decaf.c $$(HEADERS) | $$(BUILD_ASM)/decaf_$(1).s: src/decaf.c $$(HEADERS) | ||||
$$(CC) $$(CFLAGS) \ | $$(CC) $$(CFLAGS) \ | ||||
-I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \ | -I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) -I src/include/$$(ARCH_FOR_$(2)) \ | ||||
-I $(BUILD_H)/curve_$(1) -I $(BUILD_H)/$(2) -I $(BUILD_H)/$(2)/$$(ARCH_FOR_$(2)) \ | -I $(BUILD_H)/curve_$(1) -I $(BUILD_H)/$(2) -I $(BUILD_H)/$(2)/$$(ARCH_FOR_$(2)) \ | ||||
-S -c -o $$@ $$< | -S -c -o $$@ $$< | ||||
@@ -0,0 +1,306 @@ | |||||
/* Copyright (c) 2014-2016 Cryptography Research, Inc. | |||||
* Released under the MIT License. See LICENSE.txt for license information. | |||||
*/ | |||||
#ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__ | |||||
#define __ARCH_X86_64_ARCH_INTRINSICS_H__ | |||||
#include <stdint.h> | |||||
/* FUTURE: non x86-64 versions of these. | |||||
* FUTURE: autogenerate | |||||
*/ | |||||
/* Widening multiply: returns (*a) * (*b) as a full 128-bit product.
 * Both operands are loaded from memory ("mm" form).
 * The BMI2 path uses MULX, which leaves the flags untouched; the
 * fallback uses MULQ, which clobbers the flags ("cc").
 */
static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
    uint64_t c,d;
#ifndef __BMI2__
    /* MULQ: RDX:RAX = RAX * m64; low half comes back in c (RAX), high in d (RDX). */
    __asm__ volatile
        ("movq %[a], %%rax;"
         "mulq %[b];"
         : [c]"=a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "cc");
#else
    /* MULX reads the implicit multiplicand from RDX and writes the
     * low/high halves into arbitrary registers, flag-free. */
    __asm__ volatile
        ("movq %[a], %%rdx;"
         "mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx");
#endif
    /* Recombine the two 64-bit halves into one 128-bit value. */
    return (((__uint128_t)(d))<<64) | c;
}
/* Widening multiply, register x memory ("rm"): returns a * (*b) as a
 * 128-bit product.  Like widemul() but the first operand is already in
 * a register, saving one load.
 */
static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
    uint64_t c,d;
#ifndef __BMI2__
    __asm__ volatile
        ("movq %[a], %%rax;"
         "mulq %[b];"
         : [c]"=a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"r"(a)
         : "cc");
#else
    /* [a]"d"(a) pins a into RDX, MULX's implicit operand; MULX sets no
     * flags, so no "cc" clobber is needed here. */
    __asm__ volatile
        ("mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"d"(a));
#endif
    return (((__uint128_t)(d))<<64) | c;
}
/* Widening multiply, register x register ("rr"): returns a * b as a
 * 128-bit product.  Both operands are expected to already be in
 * registers; no memory access is performed by the asm.
 */
static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) {
    uint64_t c,d;
#ifndef __BMI2__
    /* "a"(a) places the multiplicand directly in RAX for MULQ. */
    __asm__ volatile
        ("mulq %[b];"
         : [c]"=a"(c), [d]"=d"(d)
         : [b]"r"(b), "a"(a)
         : "cc");
#else
    __asm__ volatile
        ("mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"r"(b), [a]"d"(a));
#endif
    return (((__uint128_t)(d))<<64) | c;
}
/* Doubled widening multiply: returns 2 * (*a) * (*b) as a 128-bit
 * product.  The doubling is folded into the first operand before the
 * multiply (ADDQ in the MULQ path, LEA-scale in the MULX path), which
 * is how the 25519/448 field code gets its "times two" cross terms.
 */
static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
    uint64_t c,d;
#ifndef __BMI2__
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "
         "mulq %[b];"
         : [c]"=a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "cc");
#else
    /* LEA (,rdx,2) doubles without disturbing the flags. */
    __asm__ volatile
        ("movq %[a], %%rdx;"
         "leaq (,%%rdx,2), %%rdx;"
         "mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx");
#endif
    return (((__uint128_t)(d))<<64) | c;
}
/* Multiply-accumulate: *acc += (*a) * (*b), computed mod 2^128 (any
 * carry out of the 128-bit accumulator is discarded).  The accumulator
 * is split into 64-bit halves so the add can use the ADD/ADC chain.
 */
static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    /* Reassemble the updated accumulator. */
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
/* Dual multiply-accumulate: adds the single product (*a) * (*b) into
 * BOTH 128-bit accumulators, *acc and *acc2 (mod 2^128 each).  Saves
 * one multiply compared with calling mac() twice on the same inputs.
 */
static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
    /* Split both accumulators into 64-bit halves for the ADD/ADC chains. */
    uint64_t lo = *acc, hi = *acc>>64;
    uint64_t lo2 = *acc2, hi2 = *acc2>>64;
#ifdef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         "addq %[c], %[lo2]; "
         "adcq %[d], %[hi2]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         "addq %%rax, %[lo2]; "
         "adcq %%rdx, %[hi2]; "
         : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    /* Reassemble both updated accumulators. */
    *acc = (((__uint128_t)(hi))<<64) | lo;
    *acc2 = (((__uint128_t)(hi2))<<64) | lo2;
}
/* Multiply-accumulate, register x memory: *acc += a * (*b), mod 2^128.
 * Like mac() but the first operand is already in a register.
 */
static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;
    /* [a]"d"(a) pins a into RDX, MULX's implicit multiplicand. */
    __asm__ volatile
        ("mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"d"(a)
         : "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"r"(a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
/* Multiply-accumulate, register x register: *acc += a * b, mod 2^128.
 * Both operands are expected in registers; no memory access in the asm.
 */
static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"r"(b), [a]"d"(a)
         : "cc");
#else
    /* "a"(a) places the multiplicand directly in RAX for MULQ. */
    __asm__ volatile
        ("mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"r"(b), "a"(a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
/* Doubled multiply-accumulate: *acc += 2 * (*a) * (*b), mod 2^128.
 * The doubling is folded into the first operand before the multiply,
 * as in widemul2().
 */
static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "addq %%rdx, %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
/* Multiply-subtract: *acc -= (*a) * (*b), computed mod 2^128 via a
 * SUB/SBB borrow chain on the accumulator's two 64-bit halves.
 */
static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "subq %[c], %[lo]; "
         "sbbq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "subq %%rax, %[lo]; "
         "sbbq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
/* Doubled multiply-subtract: *acc -= 2 * (*a) * (*b), mod 2^128.
 * Combines the operand doubling of mac2() with the borrow chain of msb().
 */
static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "addq %%rdx, %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "subq %[c], %[lo]; "
         "sbbq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "
         "mulq %[b]; "
         "subq %%rax, %[lo]; "
         "sbbq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
/* Multiply-reverse-subtract: *acc = (*a) * (*b) - *acc, mod 2^128.
 * Note the operand order: the old accumulator is subtracted FROM the
 * fresh product — the reverse of msb().
 *
 * Fix: this helper previously used MULX unconditionally, so the header
 * failed to assemble on non-BMI2 targets even though every sibling
 * helper has a MULQ fallback.  A matching fallback is added here; the
 * BMI2 path is unchanged.
 */
static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t c,d, lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "subq %[lo], %[c]; "
         "sbbq %[hi], %[d]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    /* RDX:RAX = (*a) * (*b), then subtract the old accumulator halves.
     * The outputs are earlyclobber ("=&") because RAX/RDX are written
     * by MULQ before the lo/hi inputs are read. */
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "subq %[lo], %%rax; "
         "sbbq %[hi], %%rdx; "
         : [c]"=&a"(c), [d]"=&d"(d)
         : [b]"m"(*b), [a]"m"(*a), [lo]"r"(lo), [hi]"r"(hi)
         : "cc");
#endif
    *acc = (((__uint128_t)(d))<<64) | c;
}
/* Constant-time zero test: returns all-ones (~0) if x == 0, else 0.
 * NEG sets CF iff x != 0; SBB x,x then yields -1 (x != 0) or 0 (x == 0),
 * and the final complement turns that into the desired mask.  Done in
 * asm so the compiler cannot reintroduce a branch.
 * Fix: declare the "cc" clobber explicitly (NEG/SBB modify the flags),
 * matching the convention used by the other asm blocks in this header.
 */
static __inline__ uint64_t is_zero(uint64_t x) {
    __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x) : : "cc");
    return ~x;
}
/* Logical right shift of a 128-bit value by n bits, truncated to the
 * low 64 bits of the result. */
static inline uint64_t shrld(__uint128_t x, int n) {
    __uint128_t shifted = x >> n;
    return (uint64_t)shifted;
}
#endif /* __ARCH_X86_64_ARCH_INTRINSICS_H__ */ |
@@ -9,6 +9,7 @@ | |||||
#define _XOPEN_SOURCE 600 | #define _XOPEN_SOURCE 600 | ||||
#include "arch_config.h" | #include "arch_config.h" | ||||
#include "arch_intrinsics.h" | |||||
#include <decaf/common.h> | #include <decaf/common.h> | ||||
@@ -32,7 +33,6 @@ | |||||
#endif | #endif | ||||
#if (WORD_BITS == 64) | #if (WORD_BITS == 64) | ||||
typedef uint32_t hword_t; | |||||
typedef uint64_t word_t, mask_t; | typedef uint64_t word_t, mask_t; | ||||
typedef __uint128_t dword_t; | typedef __uint128_t dword_t; | ||||
typedef int32_t hsword_t; | typedef int32_t hsword_t; | ||||
@@ -50,7 +50,6 @@ | |||||
#define letohWORD letoh64 | #define letohWORD letoh64 | ||||
#define SC_LIMB(x) (x##ull) | #define SC_LIMB(x) (x##ull) | ||||
#elif (WORD_BITS == 32) | #elif (WORD_BITS == 32) | ||||
typedef uint16_t hword_t; | |||||
typedef uint32_t word_t, mask_t; | typedef uint32_t word_t, mask_t; | ||||
typedef uint64_t dword_t; | typedef uint64_t dword_t; | ||||
typedef int16_t hsword_t; | typedef int16_t hsword_t; | ||||
@@ -16,12 +16,7 @@ static __inline__ uint64_t is_zero(uint64_t a) { | |||||
return (((__uint128_t)a)-1)>>64; | return (((__uint128_t)a)-1)>>64; | ||||
} | } | ||||
void | void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | ||||
gf_25519_mul ( | |||||
gf_25519_t __restrict__ cs, | |||||
const gf_25519_t as, | |||||
const gf_25519_t bs | |||||
) { | |||||
const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); | const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); | ||||
uint64_t bh[4]; | uint64_t bh[4]; | ||||
@@ -51,12 +46,7 @@ gf_25519_mul ( | |||||
c[1] += accum; | c[1] += accum; | ||||
} | } | ||||
void | void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | ||||
gf_25519_mulw ( | |||||
gf_25519_t __restrict__ cs, | |||||
const gf_25519_t as, | |||||
uint64_t b | |||||
) { | |||||
const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | ||||
int i; | int i; | ||||
@@ -78,18 +68,11 @@ gf_25519_mulw ( | |||||
c[1] += accum; | c[1] += accum; | ||||
} | } | ||||
void | void gf_sqr (gf_s *__restrict__ cs, const gf as) { | ||||
gf_25519_sqr ( | gf_mul(cs,as,as); // PERF | ||||
gf_25519_t __restrict__ cs, | |||||
const gf_25519_t as | |||||
) { | |||||
gf_25519_mul(cs,as,as); // PERF | |||||
} | } | ||||
void | void gf_strong_reduce (gf a) { | ||||
gf_25519_strong_reduce ( | |||||
gf_25519_t a | |||||
) { | |||||
uint64_t mask = (1ull<<51)-1; | uint64_t mask = (1ull<<51)-1; | ||||
/* first, clear high */ | /* first, clear high */ | ||||
@@ -127,15 +110,11 @@ gf_25519_strong_reduce ( | |||||
assert(is_zero(carry + scarry)); | assert(is_zero(carry + scarry)); | ||||
} | } | ||||
void | void gf_serialize (uint8_t serial[32], const struct gf x) { | ||||
gf_25519_serialize ( | |||||
uint8_t serial[32], | |||||
const struct gf_25519_t x | |||||
) { | |||||
int i,j; | int i,j; | ||||
gf_25519_t red; | gf red; | ||||
gf_25519_copy(&red, x); | gf_copy(&red, x); | ||||
gf_25519_t trong_reduce(&red); | gf_strong_reduce(&red); | ||||
uint64_t *r = red.limb; | uint64_t *r = red.limb; | ||||
uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12}; | uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12}; | ||||
for (i=0; i<4; i++) { | for (i=0; i<4; i++) { | ||||
@@ -146,11 +125,7 @@ gf_25519_serialize ( | |||||
} | } | ||||
} | } | ||||
mask_t | mask_t gf_deserialize (gf x, const uint8_t serial[32]) { | ||||
gf_25519_deserialize ( | |||||
gf_25519_t x, | |||||
const uint8_t serial[32] | |||||
) { | |||||
int i,j; | int i,j; | ||||
uint64_t ser64[4], mask = ((1ull<<51)-1); | uint64_t ser64[4], mask = ((1ull<<51)-1); | ||||
for (i=0; i<4; i++) { | for (i=0; i<4; i++) { | ||||
@@ -3,18 +3,8 @@ | |||||
*/ | */ | ||||
#include "f_field.h" | #include "f_field.h" | ||||
#include "x86-64-arith.h" | |||||
static inline uint64_t shr(__uint128_t x, int n) { | void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | ||||
return x>>n; | |||||
} | |||||
void | |||||
gf_25519_mul ( | |||||
gf_25519_s *__restrict__ cs, | |||||
const gf_25519_t as, | |||||
const gf_25519_t bs | |||||
) { | |||||
const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); | const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); | ||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
@@ -48,12 +38,12 @@ gf_25519_mul ( | |||||
mac_rm(&accum2, ai, &b[3]); | mac_rm(&accum2, ai, &b[3]); | ||||
uint64_t c0 = accum0 & mask; | uint64_t c0 = accum0 & mask; | ||||
accum1 += shr(accum0, 51); | accum1 += shrld(accum0, 51); | ||||
uint64_t c1 = accum1 & mask; | uint64_t c1 = accum1 & mask; | ||||
accum2 += shr(accum1, 51); | accum2 += shrld(accum1, 51); | ||||
c[2] = accum2 & mask; | c[2] = accum2 & mask; | ||||
accum0 = shr(accum2, 51); | accum0 = shrld(accum2, 51); | ||||
mac_rm(&accum0, ai, &b[4]); | mac_rm(&accum0, ai, &b[4]); | ||||
@@ -77,7 +67,7 @@ gf_25519_mul ( | |||||
mac_rm(&accum1, ai, &b[0]); | mac_rm(&accum1, ai, &b[0]); | ||||
c[3] = accum0 & mask; | c[3] = accum0 & mask; | ||||
accum1 += shr(accum0, 51); | accum1 += shrld(accum0, 51); | ||||
c[4] = accum1 & mask; | c[4] = accum1 & mask; | ||||
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | ||||
@@ -85,17 +75,13 @@ gf_25519_mul ( | |||||
* PERF: good enough to fit into uint64_t? | * PERF: good enough to fit into uint64_t? | ||||
*/ | */ | ||||
uint64_t a1 = shr(accum1,51); | uint64_t a1 = shrld(accum1,51); | ||||
accum1 = (__uint128_t)a1 * 19 + c0; | accum1 = (__uint128_t)a1 * 19 + c0; | ||||
c[0] = accum1 & mask; | c[0] = accum1 & mask; | ||||
c[1] = c1 + shr(accum1,51); | c[1] = c1 + shrld(accum1,51); | ||||
} | } | ||||
void | void gf_sqr (gf_s *__restrict__ cs, const gf as) { | ||||
gf_25519_sqr ( | |||||
gf_25519_s *__restrict__ cs, | |||||
const gf_25519_t as | |||||
) { | |||||
const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | ||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
@@ -122,9 +108,9 @@ gf_25519_sqr ( | |||||
mac_rm(&accum2, ai, &a[4]); | mac_rm(&accum2, ai, &a[4]); | ||||
uint64_t c0 = accum0 & mask; | uint64_t c0 = accum0 & mask; | ||||
accum1 += shr(accum0, 51); | accum1 += shrld(accum0, 51); | ||||
uint64_t c1 = accum1 & mask; | uint64_t c1 = accum1 & mask; | ||||
accum2 += shr(accum1, 51); | accum2 += shrld(accum1, 51); | ||||
c[2] = accum2 & mask; | c[2] = accum2 & mask; | ||||
accum0 = accum2 >> 51; | accum0 = accum2 >> 51; | ||||
@@ -141,7 +127,7 @@ gf_25519_sqr ( | |||||
mac_rr(&accum1, a[2], a[2]); | mac_rr(&accum1, a[2], a[2]); | ||||
c[3] = accum0 & mask; | c[3] = accum0 & mask; | ||||
accum1 += shr(accum0, 51); | accum1 += shrld(accum0, 51); | ||||
c[4] = accum1 & mask; | c[4] = accum1 & mask; | ||||
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | ||||
@@ -149,51 +135,43 @@ gf_25519_sqr ( | |||||
* PERF: good enough to fit into uint64_t? | * PERF: good enough to fit into uint64_t? | ||||
*/ | */ | ||||
uint64_t a1 = shr(accum1,51); | uint64_t a1 = shrld(accum1,51); | ||||
accum1 = (__uint128_t)a1 * 19 + c0; | accum1 = (__uint128_t)a1 * 19 + c0; | ||||
c[0] = accum1 & mask; | c[0] = accum1 & mask; | ||||
c[1] = c1 + shr(accum1,51); | c[1] = c1 + shrld(accum1,51); | ||||
} | } | ||||
void | void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | ||||
gf_25519_mulw ( | |||||
gf_25519_s *__restrict__ cs, | |||||
const gf_25519_t as, | |||||
uint64_t b | |||||
) { | |||||
const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | ||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
__uint128_t accum = widemul_rm(b, &a[0]); | __uint128_t accum = widemul_rm(b, &a[0]); | ||||
uint64_t c0 = accum & mask; | uint64_t c0 = accum & mask; | ||||
accum = shr(accum,51); | accum = shrld(accum,51); | ||||
mac_rm(&accum, b, &a[1]); | mac_rm(&accum, b, &a[1]); | ||||
uint64_t c1 = accum & mask; | uint64_t c1 = accum & mask; | ||||
accum = shr(accum,51); | accum = shrld(accum,51); | ||||
mac_rm(&accum, b, &a[2]); | mac_rm(&accum, b, &a[2]); | ||||
c[2] = accum & mask; | c[2] = accum & mask; | ||||
accum = shr(accum,51); | accum = shrld(accum,51); | ||||
mac_rm(&accum, b, &a[3]); | mac_rm(&accum, b, &a[3]); | ||||
c[3] = accum & mask; | c[3] = accum & mask; | ||||
accum = shr(accum,51); | accum = shrld(accum,51); | ||||
mac_rm(&accum, b, &a[4]); | mac_rm(&accum, b, &a[4]); | ||||
c[4] = accum & mask; | c[4] = accum & mask; | ||||
accum = shr(accum,51); | accum = shrld(accum,51); | ||||
accum = accum * 19 + c0; | accum = accum * 19 + c0; | ||||
c[0] = accum & mask; | c[0] = accum & mask; | ||||
c[1] = c1 + shr(accum,51); | c[1] = c1 + shrld(accum,51); | ||||
} | } | ||||
void | void gf_strong_reduce (gf a) { | ||||
gf_25519_strong_reduce ( | |||||
gf_25519_t a | |||||
) { | |||||
uint64_t mask = (1ull<<51)-1; | uint64_t mask = (1ull<<51)-1; | ||||
/* first, clear high */ | /* first, clear high */ | ||||
@@ -231,15 +209,11 @@ gf_25519_strong_reduce ( | |||||
assert(is_zero(carry + scarry)); | assert(is_zero(carry + scarry)); | ||||
} | } | ||||
void | void gf_serialize (uint8_t serial[32], const gf x) { | ||||
gf_25519_serialize ( | |||||
uint8_t serial[32], | |||||
const gf_25519_t x | |||||
) { | |||||
int i,j; | int i,j; | ||||
gf_25519_t red; | gf red; | ||||
gf_25519_copy(red, x); | gf_copy(red, x); | ||||
gf_25519_strong_reduce(red); | gf_strong_reduce(red); | ||||
uint64_t *r = red->limb; | uint64_t *r = red->limb; | ||||
uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12}; | uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12}; | ||||
for (i=0; i<4; i++) { | for (i=0; i<4; i++) { | ||||
@@ -250,11 +224,7 @@ gf_25519_serialize ( | |||||
} | } | ||||
} | } | ||||
mask_t | mask_t gf_deserialize (gf x, const uint8_t serial[32]) { | ||||
gf_25519_deserialize ( | |||||
gf_25519_t x, | |||||
const uint8_t serial[32] | |||||
) { | |||||
int i,j; | int i,j; | ||||
uint64_t ser64[4], mask = ((1ull<<51)-1); | uint64_t ser64[4], mask = ((1ull<<51)-1); | ||||
for (i=0; i<4; i++) { | for (i=0; i<4; i++) { | ||||
@@ -1,323 +0,0 @@ | |||||
/* Copyright (c) 2014 Cryptography Research, Inc. | |||||
* Released under the MIT License. See LICENSE.txt for license information. | |||||
*/ | |||||
#ifndef __X86_64_ARITH_H__ | |||||
#define __X86_64_ARITH_H__ | |||||
#include <stdint.h> | |||||
/* TODO: non x86-64 versions of these. | |||||
* FUTURE: autogenerate | |||||
*/ | |||||
static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) { | |||||
#ifndef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rax;" | |||||
"mulq %[b];" | |||||
: [c]"=a"(c), [d]"=d"(d) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "cc"); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#else | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx;" | |||||
"mulx %[b], %[c], %[d];" | |||||
: [c]"=r"(c), [d]"=r"(d) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx"); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#endif | |||||
} | |||||
static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) { | |||||
#ifndef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rax;" | |||||
"mulq %[b];" | |||||
: [c]"=a"(c), [d]"=d"(d) | |||||
: [b]"m"(*b), [a]"r"(a) | |||||
: "cc"); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#else | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("mulx %[b], %[c], %[d];" | |||||
: [c]"=r"(c), [d]"=r"(d) | |||||
: [b]"m"(*b), [a]"d"(a)); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#endif | |||||
} | |||||
static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) { | |||||
#ifndef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("mulq %[b];" | |||||
: [c]"=a"(c), [d]"=d"(d) | |||||
: [b]"r"(b), "a"(a) | |||||
: "cc"); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#else | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("mulx %[b], %[c], %[d];" | |||||
: [c]"=r"(c), [d]"=r"(d) | |||||
: [b]"r"(b), [a]"d"(a)); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#endif | |||||
} | |||||
static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) { | |||||
#ifndef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rax; " | |||||
"addq %%rax, %%rax; " | |||||
"mulq %[b];" | |||||
: [c]"=a"(c), [d]"=d"(d) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "cc"); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#else | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx;" | |||||
"leaq (,%%rdx,2), %%rdx;" | |||||
"mulx %[b], %[c], %[d];" | |||||
: [c]"=r"(c), [d]"=r"(d) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx"); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#endif | |||||
} | |||||
static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { | |||||
uint64_t lo = *acc, hi = *acc>>64; | |||||
#ifdef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx; " | |||||
"mulx %[b], %[c], %[d]; " | |||||
"addq %[c], %[lo]; " | |||||
"adcq %[d], %[hi]; " | |||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx", "cc"); | |||||
#else | |||||
__asm__ volatile | |||||
("movq %[a], %%rax; " | |||||
"mulq %[b]; " | |||||
"addq %%rax, %[lo]; " | |||||
"adcq %%rdx, %[hi]; " | |||||
: [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rax", "rdx", "cc"); | |||||
#endif | |||||
*acc = (((__uint128_t)(hi))<<64) | lo; | |||||
} | |||||
static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) { | |||||
uint64_t lo = *acc, hi = *acc>>64; | |||||
uint64_t lo2 = *acc2, hi2 = *acc2>>64; | |||||
#ifdef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx; " | |||||
"mulx %[b], %[c], %[d]; " | |||||
"addq %[c], %[lo]; " | |||||
"adcq %[d], %[hi]; " | |||||
"addq %[c], %[lo2]; " | |||||
"adcq %[d], %[hi2]; " | |||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx", "cc"); | |||||
#else | |||||
__asm__ volatile | |||||
("movq %[a], %%rax; " | |||||
"mulq %[b]; " | |||||
"addq %%rax, %[lo]; " | |||||
"adcq %%rdx, %[hi]; " | |||||
"addq %%rax, %[lo2]; " | |||||
"adcq %%rdx, %[hi2]; " | |||||
: [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rax", "rdx", "cc"); | |||||
#endif | |||||
*acc = (((__uint128_t)(hi))<<64) | lo; | |||||
*acc2 = (((__uint128_t)(hi2))<<64) | lo2; | |||||
} | |||||
static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) { | |||||
uint64_t lo = *acc, hi = *acc>>64; | |||||
#ifdef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("mulx %[b], %[c], %[d]; " | |||||
"addq %[c], %[lo]; " | |||||
"adcq %[d], %[hi]; " | |||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"d"(a) | |||||
: "cc"); | |||||
#else | |||||
__asm__ volatile | |||||
("movq %[a], %%rax; " | |||||
"mulq %[b]; " | |||||
"addq %%rax, %[lo]; " | |||||
"adcq %%rdx, %[hi]; " | |||||
: [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"r"(a) | |||||
: "rax", "rdx", "cc"); | |||||
#endif | |||||
*acc = (((__uint128_t)(hi))<<64) | lo; | |||||
} | |||||
static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) { | |||||
uint64_t lo = *acc, hi = *acc>>64; | |||||
#ifdef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("mulx %[b], %[c], %[d]; " | |||||
"addq %[c], %[lo]; " | |||||
"adcq %[d], %[hi]; " | |||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"r"(b), [a]"d"(a) | |||||
: "cc"); | |||||
#else | |||||
__asm__ volatile | |||||
("mulq %[b]; " | |||||
"addq %%rax, %[lo]; " | |||||
"adcq %%rdx, %[hi]; " | |||||
: [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"r"(b), "a"(a) | |||||
: "rax", "rdx", "cc"); | |||||
#endif | |||||
*acc = (((__uint128_t)(hi))<<64) | lo; | |||||
} | |||||
static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { | |||||
uint64_t lo = *acc, hi = *acc>>64; | |||||
#ifdef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx; " | |||||
"addq %%rdx, %%rdx; " | |||||
"mulx %[b], %[c], %[d]; " | |||||
"addq %[c], %[lo]; " | |||||
"adcq %[d], %[hi]; " | |||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx", "cc"); | |||||
#else | |||||
__asm__ volatile | |||||
("movq %[a], %%rax; " | |||||
"addq %%rax, %%rax; " | |||||
"mulq %[b]; " | |||||
"addq %%rax, %[lo]; " | |||||
"adcq %%rdx, %[hi]; " | |||||
: [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rax", "rdx", "cc"); | |||||
#endif | |||||
*acc = (((__uint128_t)(hi))<<64) | lo; | |||||
} | |||||
static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { | |||||
uint64_t lo = *acc, hi = *acc>>64; | |||||
#ifdef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx; " | |||||
"mulx %[b], %[c], %[d]; " | |||||
"subq %[c], %[lo]; " | |||||
"sbbq %[d], %[hi]; " | |||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx", "cc"); | |||||
#else | |||||
__asm__ volatile | |||||
("movq %[a], %%rax; " | |||||
"mulq %[b]; " | |||||
"subq %%rax, %[lo]; " | |||||
"sbbq %%rdx, %[hi]; " | |||||
: [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rax", "rdx", "cc"); | |||||
#endif | |||||
*acc = (((__uint128_t)(hi))<<64) | lo; | |||||
} | |||||
static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { | |||||
uint64_t lo = *acc, hi = *acc>>64; | |||||
#ifdef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx; " | |||||
"addq %%rdx, %%rdx; " | |||||
"mulx %[b], %[c], %[d]; " | |||||
"subq %[c], %[lo]; " | |||||
"sbbq %[d], %[hi]; " | |||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx", "cc"); | |||||
#else | |||||
__asm__ volatile | |||||
("movq %[a], %%rax; " | |||||
"addq %%rax, %%rax; " | |||||
"mulq %[b]; " | |||||
"subq %%rax, %[lo]; " | |||||
"sbbq %%rdx, %[hi]; " | |||||
: [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rax", "rdx", "cc"); | |||||
#endif | |||||
*acc = (((__uint128_t)(hi))<<64) | lo; | |||||
} | |||||
static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { | |||||
uint64_t c,d, lo = *acc, hi = *acc>>64; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx; " | |||||
"mulx %[b], %[c], %[d]; " | |||||
"subq %[lo], %[c]; " | |||||
"sbbq %[hi], %[d]; " | |||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx", "cc"); | |||||
*acc = (((__uint128_t)(d))<<64) | c; | |||||
} | |||||
static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) { | |||||
return ((__uint128_t)(a)) * b; | |||||
} | |||||
static __inline__ __int128_t widemuls(int64_t a, int64_t b) { | |||||
return ((__int128_t)(a)) * b; | |||||
} | |||||
static __inline__ uint64_t opacify(uint64_t x) { | |||||
__asm__ volatile("" : "+r"(x)); | |||||
return x; | |||||
} | |||||
static __inline__ mask_t is_zero(uint64_t x) { | |||||
__asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x)); | |||||
return ~x; | |||||
} | |||||
#endif /* __X86_64_ARITH_H__ */ |
@@ -4,28 +4,20 @@ | |||||
#include "f_field.h" | #include "f_field.h" | ||||
static inline mask_t __attribute__((always_inline)) | static inline mask_t is_zero (word_t x) { | ||||
is_zero ( | |||||
word_t x | |||||
) { | |||||
dword_t xx = x; | dword_t xx = x; | ||||
xx--; | xx--; | ||||
return xx >> WORD_BITS; | return xx >> WORD_BITS; | ||||
} | } | ||||
static uint64_t widemul_32 ( | static uint64_t widemul ( | ||||
const uint32_t a, | const uint32_t a, | ||||
const uint32_t b | const uint32_t b | ||||
) { | ) { | ||||
return ((uint64_t)a)* b; | return ((uint64_t)a)* b; | ||||
} | } | ||||
void | void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | ||||
gf_448_mul ( | |||||
gf_448_s *__restrict__ cs, | |||||
const gf_448_t as, | |||||
const gf_448_t bs | |||||
) { | |||||
const uint32_t *a = as->limb, *b = bs->limb; | const uint32_t *a = as->limb, *b = bs->limb; | ||||
uint32_t *c = cs->limb; | uint32_t *c = cs->limb; | ||||
@@ -44,9 +36,9 @@ gf_448_mul ( | |||||
accum2 = 0; | accum2 = 0; | ||||
for (i=0; i<=j; i++) { | for (i=0; i<=j; i++) { | ||||
accum2 += widemul_32(a[j-i],b[i]); | accum2 += widemul(a[j-i],b[i]); | ||||
accum1 += widemul_32(aa[j-i],bb[i]); | accum1 += widemul(aa[j-i],bb[i]); | ||||
accum0 += widemul_32(a[8+j-i], b[8+i]); | accum0 += widemul(a[8+j-i], b[8+i]); | ||||
} | } | ||||
accum1 -= accum2; | accum1 -= accum2; | ||||
@@ -54,9 +46,9 @@ gf_448_mul ( | |||||
accum2 = 0; | accum2 = 0; | ||||
for (; i<8; i++) { | for (; i<8; i++) { | ||||
accum0 -= widemul_32(a[8+j-i], b[i]); | accum0 -= widemul(a[8+j-i], b[i]); | ||||
accum2 += widemul_32(aa[8+j-i], bb[i]); | accum2 += widemul(aa[8+j-i], bb[i]); | ||||
accum1 += widemul_32(a[16+j-i], b[8+i]); | accum1 += widemul(a[16+j-i], b[8+i]); | ||||
} | } | ||||
accum1 += accum2; | accum1 += accum2; | ||||
@@ -81,12 +73,7 @@ gf_448_mul ( | |||||
c[1] += ((uint32_t)(accum1)); | c[1] += ((uint32_t)(accum1)); | ||||
} | } | ||||
void | void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | ||||
gf_448_mulw ( | |||||
gf_448_s *__restrict__ cs, | |||||
const gf_448_t as, | |||||
uint64_t b | |||||
) { | |||||
const uint32_t bhi = b>>28, blo = b & ((1<<28)-1); | const uint32_t bhi = b>>28, blo = b & ((1<<28)-1); | ||||
const uint32_t *a = as->limb; | const uint32_t *a = as->limb; | ||||
@@ -97,20 +84,20 @@ gf_448_mulw ( | |||||
int i; | int i; | ||||
accum0 = widemul_32(blo, a[0]); | accum0 = widemul(blo, a[0]); | ||||
accum8 = widemul_32(blo, a[8]); | accum8 = widemul(blo, a[8]); | ||||
accum0 += widemul_32(bhi, a[15]); | accum0 += widemul(bhi, a[15]); | ||||
accum8 += widemul_32(bhi, a[15] + a[7]); | accum8 += widemul(bhi, a[15] + a[7]); | ||||
c[0] = accum0 & mask; accum0 >>= 28; | c[0] = accum0 & mask; accum0 >>= 28; | ||||
c[8] = accum8 & mask; accum8 >>= 28; | c[8] = accum8 & mask; accum8 >>= 28; | ||||
for (i=1; i<8; i++) { | for (i=1; i<8; i++) { | ||||
accum0 += widemul_32(blo, a[i]); | accum0 += widemul(blo, a[i]); | ||||
accum8 += widemul_32(blo, a[i+8]); | accum8 += widemul(blo, a[i+8]); | ||||
accum0 += widemul_32(bhi, a[i-1]); | accum0 += widemul(bhi, a[i-1]); | ||||
accum8 += widemul_32(bhi, a[i+7]); | accum8 += widemul(bhi, a[i+7]); | ||||
c[i] = accum0 & mask; accum0 >>= 28; | c[i] = accum0 & mask; accum0 >>= 28; | ||||
c[i+8] = accum8 & mask; accum8 >>= 28; | c[i+8] = accum8 & mask; accum8 >>= 28; | ||||
@@ -125,18 +112,11 @@ gf_448_mulw ( | |||||
c[1] += accum8 >> 28; | c[1] += accum8 >> 28; | ||||
} | } | ||||
void | void gf_sqr (gf_s *__restrict__ cs, const gf as) { | ||||
gf_448_sqr ( | gf_mul(cs,as,as); /* PERF */ | ||||
gf_448_s *__restrict__ cs, | |||||
const gf_448_t as | |||||
) { | |||||
gf_448_mul(cs,as,as); /* PERF */ | |||||
} | } | ||||
void | void gf_strong_reduce (gf a) { | ||||
gf_448_strong_reduce ( | |||||
gf_448_t a | |||||
) { | |||||
word_t mask = (1ull<<28)-1; | word_t mask = (1ull<<28)-1; | ||||
/* first, clear high */ | /* first, clear high */ | ||||
@@ -176,15 +156,11 @@ gf_448_strong_reduce ( | |||||
assert(is_zero(carry + scarry)); | assert(is_zero(carry + scarry)); | ||||
} | } | ||||
void | void gf_serialize (uint8_t *serial, const gf x) { | ||||
gf_448_serialize ( | |||||
uint8_t *serial, | |||||
const gf_448_t x | |||||
) { | |||||
int i,j; | int i,j; | ||||
gf_448_t red; | gf red; | ||||
gf_448_copy(red, x); | gf_copy(red, x); | ||||
gf_448_strong_reduce(red); | gf_strong_reduce(red); | ||||
for (i=0; i<8; i++) { | for (i=0; i<8; i++) { | ||||
uint64_t limb = red->limb[2*i] + (((uint64_t)red->limb[2*i+1])<<28); | uint64_t limb = red->limb[2*i] + (((uint64_t)red->limb[2*i+1])<<28); | ||||
for (j=0; j<7; j++) { | for (j=0; j<7; j++) { | ||||
@@ -195,11 +171,7 @@ gf_448_serialize ( | |||||
} | } | ||||
} | } | ||||
mask_t | mask_t gf_deserialize (gf x, const uint8_t serial[56]) { | ||||
gf_448_deserialize ( | |||||
gf_448_t x, | |||||
const uint8_t serial[56] | |||||
) { | |||||
int i,j; | int i,j; | ||||
for (i=0; i<8; i++) { | for (i=0; i<8; i++) { | ||||
uint64_t out = 0; | uint64_t out = 0; | ||||
@@ -4,16 +4,13 @@ | |||||
#include "f_field.h" | #include "f_field.h" | ||||
static inline mask_t __attribute__((always_inline)) | static inline mask_t is_zero (word_t x) { | ||||
is_zero ( | |||||
word_t x | |||||
) { | |||||
dword_t xx = x; | dword_t xx = x; | ||||
xx--; | xx--; | ||||
return xx >> WORD_BITS; | return xx >> WORD_BITS; | ||||
} | } | ||||
static uint64_t widemul_32 ( | static uint64_t widemul ( | ||||
const uint32_t a, | const uint32_t a, | ||||
const uint32_t b | const uint32_t b | ||||
) { | ) { | ||||
@@ -97,12 +94,7 @@ smull2 ( | |||||
#endif | #endif | ||||
} | } | ||||
void | void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | ||||
gf_448_mul ( | |||||
gf_448_s *__restrict__ cs, | |||||
const gf_448_t as, | |||||
const gf_448_t bs | |||||
) { | |||||
const uint32_t *a = as->limb, *b = bs->limb; | const uint32_t *a = as->limb, *b = bs->limb; | ||||
uint32_t *c = cs->limb; | uint32_t *c = cs->limb; | ||||
@@ -448,11 +440,7 @@ gf_448_mul ( | |||||
c[1] += ((uint32_t)(accum1)); | c[1] += ((uint32_t)(accum1)); | ||||
} | } | ||||
void | void gf_sqr (gf_s *__restrict__ cs, const gf as) { | ||||
gf_448_sqr ( | |||||
gf_448_s *__restrict__ cs, | |||||
const gf_448_t as | |||||
) { | |||||
const uint32_t *a = as->limb; | const uint32_t *a = as->limb; | ||||
uint32_t *c = cs->limb; | uint32_t *c = cs->limb; | ||||
@@ -746,10 +734,9 @@ gf_448_sqr ( | |||||
c[1] += ((uint32_t)(accum1)); | c[1] += ((uint32_t)(accum1)); | ||||
} | } | ||||
void | void gf_mulw ( | ||||
gf_448_mulw ( | gf_s *__restrict__ cs, | ||||
gf_448_s *__restrict__ cs, | const gf as, | ||||
const gf_448_t as, | |||||
uint64_t b | uint64_t b | ||||
) { | ) { | ||||
uint32_t mask = (1ull<<28)-1; | uint32_t mask = (1ull<<28)-1; | ||||
@@ -763,8 +750,8 @@ gf_448_mulw ( | |||||
int i; | int i; | ||||
uint32_t c0, c8, n0, n8; | uint32_t c0, c8, n0, n8; | ||||
accum0 = widemul_32(bhi, a[15]); | accum0 = widemul(bhi, a[15]); | ||||
accum8 = widemul_32(bhi, a[15] + a[7]); | accum8 = widemul(bhi, a[15] + a[7]); | ||||
c0 = a[0]; c8 = a[8]; | c0 = a[0]; c8 = a[8]; | ||||
smlal(&accum0, blo, c0); | smlal(&accum0, blo, c0); | ||||
smlal(&accum8, blo, c8); | smlal(&accum8, blo, c8); | ||||
@@ -860,9 +847,8 @@ gf_448_mulw ( | |||||
c[1] += accum8 >> 28; | c[1] += accum8 >> 28; | ||||
} | } | ||||
void | void gf_strong_reduce ( | ||||
gf_448_strong_reduce ( | gf a | ||||
gf_448_t a | |||||
) { | ) { | ||||
word_t mask = (1ull<<28)-1; | word_t mask = (1ull<<28)-1; | ||||
@@ -903,15 +889,14 @@ gf_448_strong_reduce ( | |||||
assert(is_zero(carry + scarry)); | assert(is_zero(carry + scarry)); | ||||
} | } | ||||
void | void gf_serialize ( | ||||
gf_448_serialize ( | |||||
uint8_t *serial, | uint8_t *serial, | ||||
const gf_448_t x | const gf x | ||||
) { | ) { | ||||
int i,j; | int i,j; | ||||
gf_448_t red; | gf red; | ||||
gf_448_copy(red, x); | gf_copy(red, x); | ||||
gf_448_strong_reduce(red); | gf_strong_reduce(red); | ||||
for (i=0; i<8; i++) { | for (i=0; i<8; i++) { | ||||
uint64_t limb = red->limb[2*i] + (((uint64_t)red->limb[2*i+1])<<28); | uint64_t limb = red->limb[2*i] + (((uint64_t)red->limb[2*i+1])<<28); | ||||
for (j=0; j<7; j++) { | for (j=0; j<7; j++) { | ||||
@@ -923,8 +908,8 @@ gf_448_serialize ( | |||||
} | } | ||||
mask_t | mask_t | ||||
gf_448_deserialize ( | gf_deserialize ( | ||||
gf_448_t x, | gf x, | ||||
const uint8_t serial[56] | const uint8_t serial[56] | ||||
) { | ) { | ||||
int i,j; | int i,j; | ||||
@@ -67,12 +67,7 @@ smull2 ( | |||||
*acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; | *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; | ||||
} | } | ||||
void | void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | ||||
gf_448_mul ( | |||||
gf_448_s *__restrict__ cs, | |||||
const gf_448_t as, | |||||
const gf_448_t bs | |||||
) { | |||||
#define _bl0 "q0" | #define _bl0 "q0" | ||||
#define _bl0_0 "d0" | #define _bl0_0 "d0" | ||||
#define _bl0_1 "d1" | #define _bl0_1 "d1" | ||||
@@ -366,11 +361,7 @@ gf_448_mul ( | |||||
); | ); | ||||
} | } | ||||
void | void gf_sqr (gf_s *__restrict__ cs, const gf bs) { | ||||
gf_448_sqr ( | |||||
gf_448_s *__restrict__ cs, | |||||
const gf_448_t bs | |||||
) { | |||||
int32x2_t *vc = (int32x2_t*) cs->limb; | int32x2_t *vc = (int32x2_t*) cs->limb; | ||||
__asm__ __volatile__ ( | __asm__ __volatile__ ( | ||||
@@ -567,12 +558,7 @@ gf_448_sqr ( | |||||
); | ); | ||||
} | } | ||||
void | void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | ||||
gf_448_mulw ( | |||||
gf_448_s *__restrict__ cs, | |||||
const gf_448_t as, | |||||
uint64_t b | |||||
) { | |||||
uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1}; | uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1}; | ||||
uint64x2_t accum; | uint64x2_t accum; | ||||
@@ -618,10 +604,7 @@ gf_448_mulw ( | |||||
} | } | ||||
/* PERF: vectorize? */ | /* PERF: vectorize? */ | ||||
void | void gf_strong_reduce (gf a) { | ||||
gf_448_strong_reduce ( | |||||
gf_448_t a | |||||
) { | |||||
word_t mask = (1ull<<28)-1; | word_t mask = (1ull<<28)-1; | ||||
/* first, clear high */ | /* first, clear high */ | ||||
@@ -661,15 +644,11 @@ gf_448_strong_reduce ( | |||||
assert(is_zero(carry + scarry)); | assert(is_zero(carry + scarry)); | ||||
} | } | ||||
void | void gf_serialize (uint8_t *serial, const gf x) { | ||||
gf_448_serialize ( | |||||
uint8_t *serial, | |||||
const gf_448_t x | |||||
) { | |||||
int i,j; | int i,j; | ||||
gf_448_t red; | gf red; | ||||
gf_448_copy(red, x); | gf_copy(red, x); | ||||
gf_448_strong_reduce(red); | gf_strong_reduce(red); | ||||
for (i=0; i<8; i++) { | for (i=0; i<8; i++) { | ||||
uint64_t limb = red->limb[LIMBPERM(2*i)] + (((uint64_t)red->limb[LIMBPERM(2*i+1)])<<28); | uint64_t limb = red->limb[LIMBPERM(2*i)] + (((uint64_t)red->limb[LIMBPERM(2*i+1)])<<28); | ||||
@@ -681,11 +660,7 @@ gf_448_serialize ( | |||||
} | } | ||||
} | } | ||||
mask_t | mask_t gf_deserialize (gf x, const uint8_t serial[56]) { | ||||
gf_448_deserialize ( | |||||
gf_448_t x, | |||||
const uint8_t serial[56] | |||||
) { | |||||
int i,j; | int i,j; | ||||
for (i=0; i<8; i++) { | for (i=0; i<8; i++) { | ||||
uint64_t out = 0; | uint64_t out = 0; | ||||
@@ -16,12 +16,7 @@ static __inline__ uint64_t is_zero(uint64_t a) { | |||||
return (((__uint128_t)a)-1)>>64; | return (((__uint128_t)a)-1)>>64; | ||||
} | } | ||||
void | void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | ||||
gf_448_mul ( | |||||
gf_448_s *__restrict__ cs, | |||||
const gf_448_t as, | |||||
const gf_448_t bs | |||||
) { | |||||
const uint64_t *a = as->limb, *b = bs->limb; | const uint64_t *a = as->limb, *b = bs->limb; | ||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
@@ -182,12 +177,7 @@ gf_448_mul ( | |||||
c[1] += ((uint64_t)(accum1)); | c[1] += ((uint64_t)(accum1)); | ||||
} | } | ||||
void | void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | ||||
gf_448_mulw ( | |||||
gf_448_s *__restrict__ cs, | |||||
const gf_448_t as, | |||||
uint64_t b | |||||
) { | |||||
const uint64_t *a = as->limb; | const uint64_t *a = as->limb; | ||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
@@ -211,11 +201,7 @@ gf_448_mulw ( | |||||
c[1] += accum4 >> 56; | c[1] += accum4 >> 56; | ||||
} | } | ||||
void | void gf_sqr (gf_s *__restrict__ cs, const gf as) { | ||||
gf_448_sqr ( | |||||
gf_448_s *__restrict__ cs, | |||||
const gf_448_t as | |||||
) { | |||||
const uint64_t *a = as->limb; | const uint64_t *a = as->limb; | ||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
@@ -326,10 +312,7 @@ gf_448_sqr ( | |||||
c[0] += ((uint64_t)(accum1)); | c[0] += ((uint64_t)(accum1)); | ||||
} | } | ||||
void | void gf_strong_reduce (gf a) { | ||||
gf_448_strong_reduce ( | |||||
gf_448_t a | |||||
) { | |||||
uint64_t mask = (1ull<<56)-1; | uint64_t mask = (1ull<<56)-1; | ||||
/* first, clear high */ | /* first, clear high */ | ||||
@@ -369,15 +352,11 @@ gf_448_strong_reduce ( | |||||
assert(is_zero(carry + scarry)); | assert(is_zero(carry + scarry)); | ||||
} | } | ||||
void | void gf_serialize (uint8_t *serial, const gf x) { | ||||
gf_448_serialize ( | |||||
uint8_t *serial, | |||||
const gf_448_t x | |||||
) { | |||||
int i,j; | int i,j; | ||||
gf_448_t red; | gf red; | ||||
gf_448_copy(red, x); | gf_copy(red, x); | ||||
gf_448_strong_reduce(red); | gf_strong_reduce(red); | ||||
for (i=0; i<8; i++) { | for (i=0; i<8; i++) { | ||||
for (j=0; j<7; j++) { | for (j=0; j<7; j++) { | ||||
serial[7*i+j] = red->limb[i]; | serial[7*i+j] = red->limb[i]; | ||||
@@ -387,11 +366,7 @@ gf_448_serialize ( | |||||
} | } | ||||
} | } | ||||
mask_t | mask_t gf_deserialize (gf x, const uint8_t serial[56]) { | ||||
gf_448_deserialize ( | |||||
gf_448_t x, | |||||
const uint8_t serial[56] | |||||
) { | |||||
int i,j; | int i,j; | ||||
for (i=0; i<8; i++) { | for (i=0; i<8; i++) { | ||||
uint64_t out = 0; | uint64_t out = 0; | ||||
@@ -3,14 +3,8 @@ | |||||
*/ | */ | ||||
#include "f_field.h" | #include "f_field.h" | ||||
#include "x86-64-arith.h" | void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | ||||
void | |||||
gf_448_mul ( | |||||
gf_448_s *__restrict__ cs, | |||||
const gf_448_t as, | |||||
const gf_448_t bs | |||||
) { | |||||
const uint64_t *a = as->limb, *b = bs->limb; | const uint64_t *a = as->limb, *b = bs->limb; | ||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
@@ -145,12 +139,7 @@ gf_448_mul ( | |||||
c[0] += ((uint64_t)(accum1)); | c[0] += ((uint64_t)(accum1)); | ||||
} | } | ||||
void | void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | ||||
gf_448_mulw ( | |||||
gf_448_s *__restrict__ cs, | |||||
const gf_448_t as, | |||||
uint64_t b | |||||
) { | |||||
const uint64_t *a = as->limb; | const uint64_t *a = as->limb; | ||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
@@ -190,11 +179,7 @@ gf_448_mulw ( | |||||
c[1] += accum4 >> 56; | c[1] += accum4 >> 56; | ||||
} | } | ||||
void | void gf_sqr (gf_s *__restrict__ cs, const gf as) { | ||||
gf_448_sqr ( | |||||
gf_448_s *__restrict__ cs, | |||||
const gf_448_t as | |||||
) { | |||||
const uint64_t *a = as->limb; | const uint64_t *a = as->limb; | ||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
@@ -305,10 +290,7 @@ gf_448_sqr ( | |||||
c[0] += ((uint64_t)(accum1)); | c[0] += ((uint64_t)(accum1)); | ||||
} | } | ||||
void | void gf_strong_reduce (gf a) { | ||||
gf_448_strong_reduce ( | |||||
gf_448_t a | |||||
) { | |||||
uint64_t mask = (1ull<<56)-1; | uint64_t mask = (1ull<<56)-1; | ||||
/* first, clear high */ | /* first, clear high */ | ||||
@@ -348,15 +330,11 @@ gf_448_strong_reduce ( | |||||
assert(is_zero(carry + scarry)); | assert(is_zero(carry + scarry)); | ||||
} | } | ||||
void | void gf_serialize (uint8_t *serial, const gf x) { | ||||
gf_448_serialize ( | |||||
uint8_t *serial, | |||||
const gf_448_t x | |||||
) { | |||||
int i,j; | int i,j; | ||||
gf_448_t red; | gf red; | ||||
gf_448_copy(red, x); | gf_copy(red, x); | ||||
gf_448_strong_reduce(red); | gf_strong_reduce(red); | ||||
for (i=0; i<8; i++) { | for (i=0; i<8; i++) { | ||||
for (j=0; j<7; j++) { | for (j=0; j<7; j++) { | ||||
serial[7*i+j] = red->limb[i]; | serial[7*i+j] = red->limb[i]; | ||||
@@ -366,11 +344,7 @@ gf_448_serialize ( | |||||
} | } | ||||
} | } | ||||
mask_t | mask_t gf_deserialize (gf x, const uint8_t serial[56]) { | ||||
gf_448_deserialize ( | |||||
gf_448_t x, | |||||
const uint8_t serial[56] | |||||
) { | |||||
int i,j; | int i,j; | ||||
for (i=0; i<8; i++) { | for (i=0; i<8; i++) { | ||||
word_t out = 0; | word_t out = 0; | ||||
@@ -1,323 +0,0 @@ | |||||
/* Copyright (c) 2014 Cryptography Research, Inc. | |||||
* Released under the MIT License. See LICENSE.txt for license information. | |||||
*/ | |||||
#ifndef __X86_64_ARITH_H__ | |||||
#define __X86_64_ARITH_H__ | |||||
#include <stdint.h> | |||||
/* FUTURE: non x86-64 versions of these. | |||||
* FUTURE: autogenerate | |||||
*/ | |||||
static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) { | |||||
#ifndef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rax;" | |||||
"mulq %[b];" | |||||
: [c]"=a"(c), [d]"=d"(d) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "cc"); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#else | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx;" | |||||
"mulx %[b], %[c], %[d];" | |||||
: [c]"=r"(c), [d]"=r"(d) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx"); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#endif | |||||
} | |||||
static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) { | |||||
#ifndef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rax;" | |||||
"mulq %[b];" | |||||
: [c]"=a"(c), [d]"=d"(d) | |||||
: [b]"m"(*b), [a]"r"(a) | |||||
: "cc"); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#else | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("mulx %[b], %[c], %[d];" | |||||
: [c]"=r"(c), [d]"=r"(d) | |||||
: [b]"m"(*b), [a]"d"(a)); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#endif | |||||
} | |||||
static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) { | |||||
#ifndef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("mulq %[b];" | |||||
: [c]"=a"(c), [d]"=d"(d) | |||||
: [b]"r"(b), "a"(a) | |||||
: "cc"); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#else | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("mulx %[b], %[c], %[d];" | |||||
: [c]"=r"(c), [d]"=r"(d) | |||||
: [b]"r"(b), [a]"d"(a)); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#endif | |||||
} | |||||
static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) { | |||||
#ifndef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rax; " | |||||
"addq %%rax, %%rax; " | |||||
"mulq %[b];" | |||||
: [c]"=a"(c), [d]"=d"(d) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "cc"); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#else | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx;" | |||||
"leaq (,%%rdx,2), %%rdx;" | |||||
"mulx %[b], %[c], %[d];" | |||||
: [c]"=r"(c), [d]"=r"(d) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx"); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#endif | |||||
} | |||||
static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { | |||||
uint64_t lo = *acc, hi = *acc>>64; | |||||
#ifdef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx; " | |||||
"mulx %[b], %[c], %[d]; " | |||||
"addq %[c], %[lo]; " | |||||
"adcq %[d], %[hi]; " | |||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx", "cc"); | |||||
#else | |||||
__asm__ volatile | |||||
("movq %[a], %%rax; " | |||||
"mulq %[b]; " | |||||
"addq %%rax, %[lo]; " | |||||
"adcq %%rdx, %[hi]; " | |||||
: [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rax", "rdx", "cc"); | |||||
#endif | |||||
*acc = (((__uint128_t)(hi))<<64) | lo; | |||||
} | |||||
static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) { | |||||
uint64_t lo = *acc, hi = *acc>>64; | |||||
uint64_t lo2 = *acc2, hi2 = *acc2>>64; | |||||
#ifdef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx; " | |||||
"mulx %[b], %[c], %[d]; " | |||||
"addq %[c], %[lo]; " | |||||
"adcq %[d], %[hi]; " | |||||
"addq %[c], %[lo2]; " | |||||
"adcq %[d], %[hi2]; " | |||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx", "cc"); | |||||
#else | |||||
__asm__ volatile | |||||
("movq %[a], %%rax; " | |||||
"mulq %[b]; " | |||||
"addq %%rax, %[lo]; " | |||||
"adcq %%rdx, %[hi]; " | |||||
"addq %%rax, %[lo2]; " | |||||
"adcq %%rdx, %[hi2]; " | |||||
: [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rax", "rdx", "cc"); | |||||
#endif | |||||
*acc = (((__uint128_t)(hi))<<64) | lo; | |||||
*acc2 = (((__uint128_t)(hi2))<<64) | lo2; | |||||
} | |||||
static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) { | |||||
uint64_t lo = *acc, hi = *acc>>64; | |||||
#ifdef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("mulx %[b], %[c], %[d]; " | |||||
"addq %[c], %[lo]; " | |||||
"adcq %[d], %[hi]; " | |||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"d"(a) | |||||
: "cc"); | |||||
#else | |||||
__asm__ volatile | |||||
("movq %[a], %%rax; " | |||||
"mulq %[b]; " | |||||
"addq %%rax, %[lo]; " | |||||
"adcq %%rdx, %[hi]; " | |||||
: [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"r"(a) | |||||
: "rax", "rdx", "cc"); | |||||
#endif | |||||
*acc = (((__uint128_t)(hi))<<64) | lo; | |||||
} | |||||
static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) { | |||||
uint64_t lo = *acc, hi = *acc>>64; | |||||
#ifdef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("mulx %[b], %[c], %[d]; " | |||||
"addq %[c], %[lo]; " | |||||
"adcq %[d], %[hi]; " | |||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"r"(b), [a]"d"(a) | |||||
: "cc"); | |||||
#else | |||||
__asm__ volatile | |||||
("mulq %[b]; " | |||||
"addq %%rax, %[lo]; " | |||||
"adcq %%rdx, %[hi]; " | |||||
: [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"r"(b), "a"(a) | |||||
: "rax", "rdx", "cc"); | |||||
#endif | |||||
*acc = (((__uint128_t)(hi))<<64) | lo; | |||||
} | |||||
static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { | |||||
uint64_t lo = *acc, hi = *acc>>64; | |||||
#ifdef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx; " | |||||
"addq %%rdx, %%rdx; " | |||||
"mulx %[b], %[c], %[d]; " | |||||
"addq %[c], %[lo]; " | |||||
"adcq %[d], %[hi]; " | |||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx", "cc"); | |||||
#else | |||||
__asm__ volatile | |||||
("movq %[a], %%rax; " | |||||
"addq %%rax, %%rax; " | |||||
"mulq %[b]; " | |||||
"addq %%rax, %[lo]; " | |||||
"adcq %%rdx, %[hi]; " | |||||
: [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rax", "rdx", "cc"); | |||||
#endif | |||||
*acc = (((__uint128_t)(hi))<<64) | lo; | |||||
} | |||||
static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { | |||||
uint64_t lo = *acc, hi = *acc>>64; | |||||
#ifdef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx; " | |||||
"mulx %[b], %[c], %[d]; " | |||||
"subq %[c], %[lo]; " | |||||
"sbbq %[d], %[hi]; " | |||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx", "cc"); | |||||
#else | |||||
__asm__ volatile | |||||
("movq %[a], %%rax; " | |||||
"mulq %[b]; " | |||||
"subq %%rax, %[lo]; " | |||||
"sbbq %%rdx, %[hi]; " | |||||
: [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rax", "rdx", "cc"); | |||||
#endif | |||||
*acc = (((__uint128_t)(hi))<<64) | lo; | |||||
} | |||||
static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { | |||||
uint64_t lo = *acc, hi = *acc>>64; | |||||
#ifdef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx; " | |||||
"addq %%rdx, %%rdx; " | |||||
"mulx %[b], %[c], %[d]; " | |||||
"subq %[c], %[lo]; " | |||||
"sbbq %[d], %[hi]; " | |||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx", "cc"); | |||||
#else | |||||
__asm__ volatile | |||||
("movq %[a], %%rax; " | |||||
"addq %%rax, %%rax; " | |||||
"mulq %[b]; " | |||||
"subq %%rax, %[lo]; " | |||||
"sbbq %%rdx, %[hi]; " | |||||
: [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rax", "rdx", "cc"); | |||||
#endif | |||||
*acc = (((__uint128_t)(hi))<<64) | lo; | |||||
} | |||||
static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { | |||||
uint64_t c,d, lo = *acc, hi = *acc>>64; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx; " | |||||
"mulx %[b], %[c], %[d]; " | |||||
"subq %[lo], %[c]; " | |||||
"sbbq %[hi], %[d]; " | |||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx", "cc"); | |||||
*acc = (((__uint128_t)(d))<<64) | c; | |||||
} | |||||
static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) { | |||||
return ((__uint128_t)(a)) * b; | |||||
} | |||||
static __inline__ __int128_t widemuls(int64_t a, int64_t b) { | |||||
return ((__int128_t)(a)) * b; | |||||
} | |||||
static __inline__ uint64_t opacify(uint64_t x) { | |||||
__asm__ volatile("" : "+r"(x)); | |||||
return x; | |||||
} | |||||
static __inline__ mask_t is_zero(uint64_t x) { | |||||
__asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x)); | |||||
return ~x; | |||||
} | |||||
#endif /* __X86_64_ARITH_H__ */ |
@@ -4,12 +4,7 @@ | |||||
#include "f_field.h" | #include "f_field.h" | ||||
void | void gf_mul (gf *__restrict__ cs, const gf *as, const gf *bs) { | ||||
gf_480_mul ( | |||||
gf_480_t *__restrict__ cs, | |||||
const gf_480_t *as, | |||||
const gf_480_t *bs | |||||
) { | |||||
const uint64_t *a = as->limb, *b = bs->limb; | const uint64_t *a = as->limb, *b = bs->limb; | ||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
@@ -144,12 +139,7 @@ gf_480_mul ( | |||||
c[0] += ((uint64_t)(accum1)); | c[0] += ((uint64_t)(accum1)); | ||||
} | } | ||||
void | void gf_mulw (gf *__restrict__ cs, const gf *as, uint64_t b) { | ||||
gf_480_mulw ( | |||||
gf_480_t *__restrict__ cs, | |||||
const gf_480_t *as, | |||||
uint64_t b | |||||
) { | |||||
const uint64_t *a = as->limb; | const uint64_t *a = as->limb; | ||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
@@ -189,11 +179,7 @@ gf_480_mulw ( | |||||
c[1] += accum4 >> 60; | c[1] += accum4 >> 60; | ||||
} | } | ||||
void | void gf_sqr (gf *__restrict__ cs, const gf *as) { | ||||
gf_480_sqr ( | |||||
gf_480_t *__restrict__ cs, | |||||
const gf_480_t *as | |||||
) { | |||||
const uint64_t *a = as->limb; | const uint64_t *a = as->limb; | ||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
@@ -304,10 +290,7 @@ gf_480_sqr ( | |||||
c[0] += ((uint64_t)(accum1)); | c[0] += ((uint64_t)(accum1)); | ||||
} | } | ||||
void | void gf_strong_reduce (gf *a) { | ||||
gf_480_strong_reduce ( | |||||
gf_480_t *a | |||||
) { | |||||
uint64_t mask = (1ull<<60)-1; | uint64_t mask = (1ull<<60)-1; | ||||
/* first, clear high */ | /* first, clear high */ | ||||
@@ -347,15 +330,11 @@ gf_480_strong_reduce ( | |||||
assert(is_zero(carry + scarry)); | assert(is_zero(carry + scarry)); | ||||
} | } | ||||
void | void gf_serialize (uint8_t *serial, const struct gf *x) { | ||||
gf_480_serialize ( | |||||
uint8_t *serial, | |||||
const struct gf_480_t *x | |||||
) { | |||||
int i,j,k=0; | int i,j,k=0; | ||||
gf_480_t red; | gf red; | ||||
gf_480_copy(&red, x); | gf_copy(&red, x); | ||||
gf_480_strong_reduce(&red); | gf_strong_reduce(&red); | ||||
word_t r = 0; | word_t r = 0; | ||||
for (i=0; i<8; i+=2) { | for (i=0; i<8; i+=2) { | ||||
r = red.limb[i]; | r = red.limb[i]; | ||||
@@ -373,11 +352,7 @@ gf_480_serialize ( | |||||
} | } | ||||
} | } | ||||
mask_t | mask_t gf_deserialize (gf *x, const uint8_t serial[60]) { | ||||
gf_480_deserialize ( | |||||
gf_480_t *x, | |||||
const uint8_t serial[60] | |||||
) { | |||||
int i,j,k=0; | int i,j,k=0; | ||||
for (i=0; i<8; i+=2) { | for (i=0; i<8; i+=2) { | ||||
@@ -1,275 +0,0 @@ | |||||
/* Copyright (c) 2014 Cryptography Research, Inc. | |||||
* Released under the MIT License. See LICENSE.txt for license information. | |||||
*/ | |||||
#ifndef __X86_64_ARITH_H__ | |||||
#define __X86_64_ARITH_H__ | |||||
#include <stdint.h> | |||||
static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) { | |||||
#ifndef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rax;" | |||||
"mulq %[b];" | |||||
: [c]"=a"(c), [d]"=d"(d) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "cc"); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#else | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx;" | |||||
"mulx %[b], %[c], %[d];" | |||||
: [c]"=r"(c), [d]"=r"(d) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx"); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#endif | |||||
} | |||||
static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) { | |||||
#ifndef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rax;" | |||||
"mulq %[b];" | |||||
: [c]"=a"(c), [d]"=d"(d) | |||||
: [b]"m"(*b), [a]"r"(a) | |||||
: "cc"); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#else | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("mulx %[b], %[c], %[d];" | |||||
: [c]"=r"(c), [d]"=r"(d) | |||||
: [b]"m"(*b), [a]"d"(a)); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#endif | |||||
} | |||||
static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) { | |||||
#ifndef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rax; " | |||||
"addq %%rax, %%rax; " | |||||
"mulq %[b];" | |||||
: [c]"=a"(c), [d]"=d"(d) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "cc"); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#else | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx;" | |||||
"leaq (,%%rdx,2), %%rdx;" | |||||
"mulx %[b], %[c], %[d];" | |||||
: [c]"=r"(c), [d]"=r"(d) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx"); | |||||
return (((__uint128_t)(d))<<64) | c; | |||||
#endif | |||||
} | |||||
static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { | |||||
uint64_t lo = *acc, hi = *acc>>64; | |||||
#ifdef __BMI2__ | |||||
uint64_t c,d; | |||||
__asm__ volatile | |||||
("movq %[a], %%rdx; " | |||||
"mulx %[b], %[c], %[d]; " | |||||
"addq %[c], %[lo]; " | |||||
"adcq %[d], %[hi]; " | |||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rdx", "cc"); | |||||
#else | |||||
__asm__ volatile | |||||
("movq %[a], %%rax; " | |||||
"mulq %[b]; " | |||||
"addq %%rax, %[lo]; " | |||||
"adcq %%rdx, %[hi]; " | |||||
: [lo]"+r"(lo), [hi]"+r"(hi) | |||||
: [b]"m"(*b), [a]"m"(*a) | |||||
: "rax", "rdx", "cc"); | |||||
#endif | |||||
*acc = (((__uint128_t)(hi))<<64) | lo; | |||||
} | |||||
/* Dual multiply-accumulate: computes the product p = (*a) * (*b) once and
 * adds it into BOTH 128-bit accumulators: *acc += p and *acc2 += p
 * (each mod 2^128).  Saves a multiply relative to calling mac() twice.
 */
static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
    uint64_t lo2 = *acc2, hi2 = *acc2>>64;
#ifdef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "      /* c:d = low:high of a*b */
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         "addq %[c], %[lo2]; "          /* reuse the same product for the second accumulator */
         "adcq %[d], %[hi2]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "                  /* rdx:rax = a*b */
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         "addq %%rax, %[lo2]; "
         "adcq %%rdx, %[hi2]; "
         : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
    *acc2 = (((__uint128_t)(hi2))<<64) | lo2;
}
/* Multiply-accumulate, register * memory variant: *acc += a * (*b), mod
 * 2^128.  Unlike mac(), the first factor is passed by value; in the BMI2
 * path the "d" constraint pins it directly into rdx, the implicit source
 * operand of MULX, so no mov is needed.
 */
static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("mulx %[b], %[c], %[d]; "      /* c:d = low:high of a*b, a already in rdx */
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"d"(a)        /* "d" = force a into rdx for mulx */
         : "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "                  /* rdx:rax = a*b */
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"r"(a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
/* Doubled multiply-accumulate: *acc += 2 * (*a) * (*b), mod 2^128.
 * NOTE(review): the doubling is done on the 64-bit factor before the
 * multiply, so 2*(*a) is computed mod 2^64 — callers presumably guarantee
 * *a < 2^63 (reduced limbs); confirm at call sites.
 */
static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "addq %%rdx, %%rdx; "          /* rdx = 2*a (top bit of a is lost) */
         "mulx %[b], %[c], %[d]; "      /* c:d = low:high of (2*a)*b */
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "          /* rax = 2*a (top bit of a is lost) */
         "mulq %[b]; "                  /* rdx:rax = (2*a)*b */
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
/* Multiply-subtract: *acc -= (*a) * (*b), mod 2^128.  Same structure as
 * mac() but with an explicit sub/sbb borrow chain instead of add/adc.
 */
static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "      /* c:d = low:high of a*b */
         "subq %[c], %[lo]; "
         "sbbq %[d], %[hi]; "           /* borrow out of hi is discarded (mod 2^128) */
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "                  /* rdx:rax = a*b */
         "subq %%rax, %[lo]; "
         "sbbq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
/* Doubled multiply-subtract: *acc -= 2 * (*a) * (*b), mod 2^128.
 * NOTE(review): as in mac2(), the doubling of *a happens mod 2^64, so the
 * top bit of *a is lost — presumably callers keep limbs below 2^63;
 * confirm at call sites.
 */
static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "addq %%rdx, %%rdx; "          /* rdx = 2*a */
         "mulx %[b], %[c], %[d]; "      /* c:d = low:high of (2*a)*b */
         "subq %[c], %[lo]; "
         "sbbq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "          /* rax = 2*a */
         "mulq %[b]; "                  /* rdx:rax = (2*a)*b */
         "subq %%rax, %[lo]; "
         "sbbq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
/* Multiply / reverse-subtract: *acc = (*a) * (*b) - *acc, mod 2^128.
 * Unlike msb(), the accumulator is subtracted FROM the product.
 *
 * Fix: the original used the BMI2 MULX instruction unconditionally, so the
 * file would not assemble (or would fault at runtime) on x86_64 CPUs
 * without BMI2.  Add the same #ifdef __BMI2__ / MULQ fallback pattern that
 * every sibling helper (mac, msb, ...) already uses.
 */
static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t c,d, lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "      /* c:d = low:high of a*b */
         "subq %[lo], %[c]; "           /* c:d = a*b - acc */
         "sbbq %[hi], %[d]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "                  /* rdx:rax = a*b */
         "subq %[lo], %%rax; "          /* rdx:rax = a*b - acc */
         "sbbq %[hi], %%rdx; "
         "movq %%rax, %[c]; "
         "movq %%rdx, %[d]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(d))<<64) | c;
}
/* Full unsigned 64x64 -> 128-bit product; cannot overflow. */
static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) {
    __uint128_t wide = a;   /* widen one factor first so the multiply is 128-bit */
    wide *= b;
    return wide;
}
/* Full signed 64x64 -> 128-bit product; cannot overflow. */
static __inline__ __int128_t widemuls(int64_t a, int64_t b) {
    __int128_t wide = a;    /* widen one factor first so the multiply is 128-bit */
    wide *= b;
    return wide;
}
/* Optimization barrier: returns x unchanged, but the empty volatile asm
 * with a "+r" constraint forces the compiler to treat the value as opaque,
 * preventing constant folding / value tracking through it.  Used so that
 * arithmetic stays in the intended (constant-time) instruction sequence.
 */
static __inline__ uint64_t opacify(uint64_t x) {
    __asm__ volatile("" : "+r"(x));
    return x;
}
/* Branchless zero test: returns an all-ones mask if x == 0, else 0.
 * neg sets CF iff x was nonzero; sbb x,x then yields x = -CF, i.e.
 * all-ones for nonzero input and 0 for zero input; ~x inverts that.
 * (mask_t is a project-declared mask type.)
 */
static __inline__ mask_t is_zero(uint64_t x) {
    __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
    return ~x;
}
#endif /* __X86_64_ARITH_H__ */ |
@@ -16,12 +16,7 @@ static __inline__ uint64_t is_zero(uint64_t a) { | |||||
return (((__uint128_t)a)-1)>>64; | return (((__uint128_t)a)-1)>>64; | ||||
} | } | ||||
void | void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | ||||
gf_521_mul ( | |||||
gf_521_t *__restrict__ cs, | |||||
const gf_521_t *as, | |||||
const gf_521_t *bs | |||||
) { | |||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
const uint64_t *a = as->limb, *b = bs->limb; | const uint64_t *a = as->limb, *b = bs->limb; | ||||
__uint128_t accum0, accum1; | __uint128_t accum0, accum1; | ||||
@@ -157,10 +152,9 @@ gf_521_mul ( | |||||
c[8] += accum1 >> 58; | c[8] += accum1 >> 58; | ||||
} | } | ||||
void | void gf_mulw ( | ||||
gf_521_mulw ( | gf_s *__restrict__ cs, | ||||
gf_521_t *__restrict__ cs, | const gf as, | ||||
const gf_521_t *as, | |||||
uint64_t b | uint64_t b | ||||
) { | ) { | ||||
const uint64_t *a = as->limb; | const uint64_t *a = as->limb; | ||||
@@ -196,11 +190,7 @@ gf_521_mulw ( | |||||
c[1] += accum6 >> 58; | c[1] += accum6 >> 58; | ||||
} | } | ||||
void | void gf_sqr (gf_s *__restrict__ cs, const gf as) { | ||||
gf_521_sqr ( | |||||
gf_521_t *__restrict__ cs, | |||||
const gf_521_t *as | |||||
) { | |||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
const uint64_t *a = as->limb; | const uint64_t *a = as->limb; | ||||
__uint128_t accum0, accum1; | __uint128_t accum0, accum1; | ||||
@@ -305,10 +295,7 @@ gf_521_sqr ( | |||||
c[8] += accum1 >> 58; | c[8] += accum1 >> 58; | ||||
} | } | ||||
void | void gf_strong_reduce (gf a) { | ||||
gf_521_strong_reduce ( | |||||
gf_521_t *a | |||||
) { | |||||
uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1; | uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1; | ||||
/* first, clear high */ | /* first, clear high */ | ||||
@@ -346,15 +333,11 @@ gf_521_strong_reduce ( | |||||
assert(is_zero(carry + scarry)); | assert(is_zero(carry + scarry)); | ||||
} | } | ||||
void | void gf_serialize (uint8_t *serial, const struct gf x) { | ||||
gf_521_serialize ( | |||||
uint8_t *serial, | |||||
const struct gf_521_t *x | |||||
) { | |||||
int i,k=0; | int i,k=0; | ||||
gf_521_t red; | gf red; | ||||
gf_521_copy(&red, x); | gf_copy(&red, x); | ||||
gf_521_strong_reduce(&red); | gf_strong_reduce(&red); | ||||
uint64_t r=0; | uint64_t r=0; | ||||
int bits = 0; | int bits = 0; | ||||
@@ -370,11 +353,7 @@ gf_521_serialize ( | |||||
serial[k++] = r; | serial[k++] = r; | ||||
} | } | ||||
mask_t | mask_t gf_deserialize (gf x, const uint8_t serial[66]) { | ||||
gf_521_deserialize ( | |||||
gf_521_t *x, | |||||
const uint8_t serial[66] | |||||
) { | |||||
int i,k=0,bits=0; | int i,k=0,bits=0; | ||||
__uint128_t out = 0; | __uint128_t out = 0; | ||||
uint64_t mask = (1ull<<58)-1; | uint64_t mask = (1ull<<58)-1; | ||||
@@ -167,12 +167,7 @@ static inline void hexad_sqr_signed ( | |||||
void | void gf_mul (gf *__restrict__ cs, const gf *as, const gf *bs) { | ||||
gf_521_mul ( | |||||
gf_521_t *__restrict__ cs, | |||||
const gf_521_t *as, | |||||
const gf_521_t *bs | |||||
) { | |||||
int i; | int i; | ||||
#if 0 | #if 0 | ||||
@@ -253,13 +248,7 @@ gf_521_mul ( | |||||
} | } | ||||
void | void gf_sqr (gf *__restrict__ cs, const gf *as) { | ||||
gf_521_sqr ( | |||||
gf_521_t *__restrict__ cs, | |||||
const gf_521_t *as | |||||
) { | |||||
int i; | int i; | ||||
#if 0 | #if 0 | ||||
assert(as->limb[3] == 0 && as->limb[7] == 0 && as->limb[11] == 0); | assert(as->limb[3] == 0 && as->limb[7] == 0 && as->limb[11] == 0); | ||||
@@ -312,15 +301,7 @@ gf_521_sqr ( | |||||
*(uint64x4_t *)&c[8] = out2; | *(uint64x4_t *)&c[8] = out2; | ||||
} | } | ||||
void | void gf_mulw (gf *__restrict__ cs, const gf *as, uint64_t b) { | ||||
gf_521_mulw ( | |||||
gf_521_t *__restrict__ cs, | |||||
const gf_521_t *as, | |||||
uint64_t b | |||||
) { | |||||
#if 0 | #if 0 | ||||
int i; | int i; | ||||
assert(as->limb[3] == 0 && as->limb[7] == 0 && as->limb[11] == 0); | assert(as->limb[3] == 0 && as->limb[7] == 0 && as->limb[11] == 0); | ||||
@@ -374,10 +355,7 @@ gf_521_mulw ( | |||||
} | } | ||||
void | void gf_strong_reduce (gf *a) { | ||||
gf_521_strong_reduce ( | |||||
gf_521_t *a | |||||
) { | |||||
uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1; | uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1; | ||||
/* first, clear high */ | /* first, clear high */ | ||||
@@ -417,15 +395,11 @@ gf_521_strong_reduce ( | |||||
a->limb[3] = a->limb[7] = a->limb[11] = 0; | a->limb[3] = a->limb[7] = a->limb[11] = 0; | ||||
} | } | ||||
void | void gf_serialize (uint8_t *serial, const struct gf *x) { | ||||
gf_521_serialize ( | |||||
uint8_t *serial, | |||||
const struct gf_521_t *x | |||||
) { | |||||
unsigned int i,k=0; | unsigned int i,k=0; | ||||
gf_521_t red; | gf red; | ||||
gf_521_copy(&red, x); | gf_copy(&red, x); | ||||
gf_521_strong_reduce(&red); | gf_strong_reduce(&red); | ||||
uint64_t r=0; | uint64_t r=0; | ||||
int bits = 0; | int bits = 0; | ||||
@@ -441,11 +415,7 @@ gf_521_serialize ( | |||||
serial[k++] = r; | serial[k++] = r; | ||||
} | } | ||||
mask_t | mask_t gf_deserialize (gf *x, const uint8_t serial[LIMBPERM(66)]) { | ||||
gf_521_deserialize ( | |||||
gf_521_t *x, | |||||
const uint8_t serial[LIMBPERM(66)] | |||||
) { | |||||
int i,k=0,bits=0; | int i,k=0,bits=0; | ||||
__uint128_t out = 0; | __uint128_t out = 0; | ||||
uint64_t mask = (1ull<<58)-1; | uint64_t mask = (1ull<<58)-1; | ||||