From 2e23ac747b700526820be43deee85ac8086512a2 Mon Sep 17 00:00:00 2001 From: Michael Hamburg Date: Fri, 15 Jan 2016 14:13:47 -0800 Subject: [PATCH] move (some of the?) per-field code to src/per_field.c --- Makefile | 11 ++- src/curve_ed25519/curve_data.inc.c | 1 - src/curve_ed448goldilocks/curve_data.inc.c | 1 - src/decaf.c | 108 --------------------- src/decaf_crypto.c | 1 + src/gen_headers/f_field_h.py | 9 +- src/include/word.h | 11 +++ src/p25519/f_arithmetic.c | 10 +- src/per_field.c | 99 +++++++++++++++++++ 9 files changed, 129 insertions(+), 122 deletions(-) create mode 100644 src/per_field.c diff --git a/Makefile b/Makefile index 420f010..abc59cb 100644 --- a/Makefile +++ b/Makefile @@ -126,7 +126,7 @@ $(GEN_HEADERS): src/gen_headers/*.py src/public_include/decaf/* ################################################################ define define_field ARCH_FOR_$(1) ?= $(2) -COMPONENTS_OF_$(1) = $$(BUILD_OBJ)/$(1)_impl.o $$(BUILD_OBJ)/$(1)_arithmetic.o +COMPONENTS_OF_$(1) = $$(BUILD_OBJ)/$(1)_impl.o $$(BUILD_OBJ)/$(1)_arithmetic.o $$(BUILD_OBJ)/$(1)_per_field.o LIBCOMPONENTS += $$(COMPONENTS_OF_$(1)) $$(BUILD_ASM)/$(1)_arithmetic.s: src/$(1)/f_arithmetic.c $$(HEADERS) @@ -138,6 +138,11 @@ $$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$$(ARCH_FOR_$(1))/f_impl.c $$(HEADERS) $$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$$(ARCH_FOR_$(1)) -I $(BUILD_H)/$(1) \ -I $(BUILD_H)/$(1)/$$(ARCH_FOR_$(1)) -I src/include/$$(ARCH_FOR_$(1)) \ -S -c -o $$@ $$< + +$$(BUILD_ASM)/$(1)_per_field.s: src/per_field.c $$(HEADERS) + $$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$$(ARCH_FOR_$(1)) -I $(BUILD_H)/$(1) \ + -I $(BUILD_H)/$(1)/$$(ARCH_FOR_$(1)) -I src/include/$$(ARCH_FOR_$(1)) \ + -S -c -o $$@ $$< endef ################################################################ @@ -171,8 +176,8 @@ $$(BUILD_ASM)/decaf_$(1).s: src/decaf.c $$(HEADERS) $$(BUILD_ASM)/decaf_crypto_$(1).s: src/decaf_crypto.c $$(HEADERS) $$(CC) $$(CFLAGS) \ - -I src/curve_$(1)/ \ - -I $(BUILD_H)/curve_$(1) \ + -I src/curve_$(1)/ -I 
src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) -I src/include/$$(ARCH_FOR_$(2)) \ + -I $(BUILD_H)/curve_$(1) -I $(BUILD_H)/$(2) -I $(BUILD_H)/$(2)/$$(ARCH_FOR_$(2)) \ -S -c -o $$@ $$< LIBCOMPONENTS += $$(BUILD_OBJ)/decaf_$(1).o $$(BUILD_OBJ)/decaf_tables_$(1).o diff --git a/src/curve_ed25519/curve_data.inc.c b/src/curve_ed25519/curve_data.inc.c index 9012b4c..e9b302a 100644 --- a/src/curve_ed25519/curve_data.inc.c +++ b/src/curve_ed25519/curve_data.inc.c @@ -8,7 +8,6 @@ #define scalar_t decaf_255_scalar_t #define point_t decaf_255_point_t #define precomputed_s decaf_255_precomputed_s -#define SER_BYTES DECAF_255_SER_BYTES #define IMAGINE_TWIST 1 #define P_MOD_8 5 #define COFACTOR 8 diff --git a/src/curve_ed448goldilocks/curve_data.inc.c b/src/curve_ed448goldilocks/curve_data.inc.c index b5c8217..89b0cd0 100644 --- a/src/curve_ed448goldilocks/curve_data.inc.c +++ b/src/curve_ed448goldilocks/curve_data.inc.c @@ -7,7 +7,6 @@ #define scalar_t decaf_448_scalar_t #define point_t decaf_448_point_t #define precomputed_s decaf_448_precomputed_s -#define SER_BYTES DECAF_448_SER_BYTES #define IMAGINE_TWIST 0 #define P_MOD_8 7 #define COFACTOR 4 diff --git a/src/decaf.c b/src/decaf.c index d2564e9..ca19f23 100644 --- a/src/decaf.c +++ b/src/decaf.c @@ -50,17 +50,8 @@ extern const gf SQRT_ONE_MINUS_D; /* TODO: Intern this? 
*/ const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}}; extern const scalar_t API_NS(sc_r2); extern const decaf_word_t API_NS(MONTGOMERY_FACTOR); - extern const point_t API_NS(point_base); -/* These are externally exposed (but private) instead of static so that - * f_arithmetic.c can use it - */ -#define ONE API_NS(ONE) -#define ZERO API_NS(ZERO) -#define gf_eq API_NS(gf_eq) -const gf ZERO = {{{0}}}, ONE = {{{1}}}; - /* Projective Niels coordinates */ typedef struct { gf a, b, c; } niels_s, niels_t[1]; typedef struct { niels_t n; gf z; } __attribute__((aligned(32))) pniels_s, pniels_t[1]; /* MAGIC alignment */ @@ -75,93 +66,9 @@ const precomputed_s *API_NS(precomputed_base) = const size_t API_NS2(sizeof,precomputed_s) = sizeof(precomputed_s); const size_t API_NS2(alignof,precomputed_s) = 32; -/* TODO PERF: Vectorize vs unroll */ -#ifdef __clang__ -#if 100*__clang_major__ + __clang_minor__ > 305 -#define UNROLL _Pragma("clang loop unroll(full)") // PERF TODO: vectorize? -#endif -#endif - -#ifndef UNROLL -#define UNROLL -#endif - #define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; ilimb[LIMBPERM(j)]) << fill; - fill += LIMB_PLACE_VALUE(LIMBPERM(j)); - j++; - } - serial[i] = buffer; - fill -= 8; - buffer >>= 8; - } -} - -mask_t gf_deserialize (gf x, const uint8_t serial[SER_BYTES]) { - unsigned int j=0, fill=0; - dword_t buffer = 0; - dsword_t scarry = 0; - UNROLL for (unsigned int i=0; ilimb[LIMBPERM(i)] = (i>= LIMB_PLACE_VALUE(LIMBPERM(i)); - scarry = (scarry + x->limb[LIMBPERM(i)] - MODULUS->limb[LIMBPERM(i)]) >> (8*sizeof(word_t)); - } - return word_is_zero(buffer) & ~word_is_zero(scarry); -} - -void gf_strong_reduce (gf a) { - /* first, clear high */ - gf_weak_reduce(a); /* PERF: only really need one step of this, but whatevs */ - - /* now the total is less than 2p */ - - /* compute total_value - p. No need to reduce mod p. 
*/ - dsword_t scarry = 0; - for (unsigned int i=0; ilimb[LIMBPERM(i)] - MODULUS->limb[LIMBPERM(i)]; - a->limb[i] = scarry & LIMB_MASK(LIMBPERM(i)); - scarry >>= LIMB_PLACE_VALUE(LIMBPERM(i)); - } - - /* uncommon case: it was >= p, so now scarry = 0 and this = x - * common case: it was < p, so now scarry = -1 and this = x - p + 2^255 - * so let's add back in p. will carry back off the top for 2^255. - */ - assert(word_is_zero(scarry) | word_is_zero(scarry+1)); - - word_t scarry_0 = scarry; - dword_t carry = 0; - - /* add it back */ - for (unsigned int i=0; ilimb[LIMBPERM(i)] + (scarry_0 & MODULUS->limb[LIMBPERM(i)]); - a->limb[i] = carry & LIMB_MASK(LIMBPERM(i)); - carry >>= LIMB_PLACE_VALUE(LIMBPERM(i)); - } - - assert(word_is_zero(carry + scarry_0)); -} - /** Constant time, x = is_z ? z : y */ static INLINE void cond_sel(gf x, const gf y, const gf z, decaf_bool_t is_z) { @@ -186,21 +93,6 @@ cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) { } } -/** Compare a==b */ -/* Not static because it's used in inverse square root. */ -decaf_word_t gf_eq(const gf a, const gf b); -decaf_word_t gf_eq(const gf a, const gf b) { - gf c; - gf_sub(c,a,b); - gf_strong_reduce(c); - decaf_word_t ret=0; - for (unsigned int i=0; ilimb)/sizeof(c->limb[0]); i++) { - ret |= c->limb[i]; - } - - return word_is_zero(ret); -} - /** Inverse square root using addition chain. */ static decaf_bool_t gf_isqrt_chk(gf y, const gf x, decaf_bool_t allow_zero) { diff --git a/src/decaf_crypto.c b/src/decaf_crypto.c index 47c42ee..3a74ce0 100644 --- a/src/decaf_crypto.c +++ b/src/decaf_crypto.c @@ -8,6 +8,7 @@ * @brief Example Decaf crypto routines */ +#include "f_field.h" /* for SER_BYTES; FUTURE: find a better way to do this? 
*/ #include #include diff --git a/src/gen_headers/f_field_h.py b/src/gen_headers/f_field_h.py index 420b588..44fbcee 100644 --- a/src/gen_headers/f_field_h.py +++ b/src/gen_headers/f_field_h.py @@ -14,14 +14,19 @@ f_field_h = gen_file( #define __DECAF_%(gf_shortname)s_GF_DEFINED__ 1 #define NLIMBS (%(gf_impl_bits)d/sizeof(word_t)/8) +#define SER_BYTES ((%(gf_bits)d-1)/8 + 1) typedef struct gf_%(gf_shortname)s_s { word_t limb[NLIMBS]; } __attribute__((aligned(32))) gf_%(gf_shortname)s_s, gf_%(gf_shortname)s_t[1]; #define GF_LIT_LIMB_BITS %(gf_lit_limb_bits)d #define GF_BITS %(gf_bits)d +#define ZERO gf_%(gf_shortname)s_ZERO +#define ONE gf_%(gf_shortname)s_ONE +#define MODULUS gf_%(gf_shortname)s_MODULUS #define gf gf_%(gf_shortname)s_t #define gf_s gf_%(gf_shortname)s_s +#define gf_eq gf_%(gf_shortname)s_eq #define gf_copy gf_%(gf_shortname)s_copy #define gf_add_RAW gf_%(gf_shortname)s_add_RAW #define gf_sub_RAW gf_%(gf_shortname)s_sub_RAW @@ -34,7 +39,6 @@ typedef struct gf_%(gf_shortname)s_s { #define gf_isr gf_%(gf_shortname)s_isr #define gf_serialize gf_%(gf_shortname)s_serialize #define gf_deserialize gf_%(gf_shortname)s_deserialize -#define MODULUS gf_%(gf_shortname)s_MODULUS #define SQRT_MINUS_ONE P%(gf_shortname)s_SQRT_MINUS_ONE /* might not be defined */ @@ -44,7 +48,7 @@ typedef struct gf_%(gf_shortname)s_s { extern "C" { #endif -const gf MODULUS; +const gf MODULUS, ZERO, ONE; /* Defined below in f_impl.h */ static INLINE_UNUSED void gf_copy (gf out, const gf a) { *out = *a; } @@ -58,6 +62,7 @@ void gf_mul (gf_s *__restrict__ out, const gf a, const gf b); void gf_mulw (gf_s *__restrict__ out, const gf a, uint64_t b); void gf_sqr (gf_s *__restrict__ out, const gf a); void gf_serialize (uint8_t *serial, const gf x); +mask_t gf_eq (const gf x, const gf y); mask_t gf_deserialize (gf x, const uint8_t serial[(GF_BITS-1)/8+1]); #ifdef __cplusplus diff --git a/src/include/word.h b/src/include/word.h index 2261b13..54f2ff8 100644 --- a/src/include/word.h +++ 
b/src/include/word.h @@ -241,4 +241,15 @@ malloc_vector(size_t size) { } } +/* PERF: vectorize vs unroll */ +#ifdef __clang__ +#if 100*__clang_major__ + __clang_minor__ > 305 +#define UNROLL _Pragma("clang loop unroll(full)") // PERF TODO: vectorize? +#endif +#endif + +#ifndef UNROLL +#define UNROLL +#endif + #endif /* __WORD_H__ */ diff --git a/src/p25519/f_arithmetic.c b/src/p25519/f_arithmetic.c index f348307..9d42892 100644 --- a/src/p25519/f_arithmetic.c +++ b/src/p25519/f_arithmetic.c @@ -11,7 +11,7 @@ #include "field.h" #include "constant_time.h" -const gf_25519_t P25519_SQRT_MINUS_ONE = {FIELD_LITERAL( +const gf_25519_t SQRT_MINUS_ONE = {FIELD_LITERAL( 0x61b274a0ea0b0, 0x0d5a5fc8f189d, 0x7ef5e9cbd0c60, @@ -22,10 +22,6 @@ const gf_25519_t P25519_SQRT_MINUS_ONE = {FIELD_LITERAL( const gf MODULUS = {FIELD_LITERAL( 0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff )}; - -/* TODO put in header */ -extern const gf_25519_t decaf_255_ONE; -extern mask_t decaf_255_gf_eq(const gf_25519_t a, const gf_25519_t b); /* Guarantee: a^2 x = 0 if x = 0; else a^2 x = 1 or SQRT_MINUS_ONE; */ void gf_isr ( @@ -44,8 +40,8 @@ void gf_isr ( st[i&1][0] = tmp2[0]; } - mask_t mask = decaf_255_gf_eq(st[1],decaf_255_ONE) | decaf_255_gf_eq(st[1],SQRT_MINUS_ONE); + mask_t mask = gf_eq(st[1],ONE) | gf_eq(st[1],SQRT_MINUS_ONE); - constant_time_select(tmp1, decaf_255_ONE, SQRT_MINUS_ONE, sizeof(tmp1), mask, 0); + constant_time_select(tmp1, ONE, SQRT_MINUS_ONE, sizeof(tmp1), mask, 0); gf_mul(a,tmp1,st[0]); } diff --git a/src/per_field.c b/src/per_field.c new file mode 100644 index 0000000..b826b40 --- /dev/null +++ b/src/per_field.c @@ -0,0 +1,99 @@ +/** + * @cond internal + * @file per_field.c + * @copyright + * Copyright (c) 2015-2016 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. + * @author Mike Hamburg + * @brief Generic arithmetic which has to be compiled per field. 
+ */ + +#include "field.h" + +const gf ZERO = {{{0}}}, ONE = {{{1}}}; + +/** Serialize to wire format. */ +void gf_serialize (uint8_t serial[SER_BYTES], const gf x) { + gf red; + gf_copy(red, x); + gf_strong_reduce(red); + + unsigned int j=0, fill=0; + dword_t buffer = 0; + UNROLL for (unsigned int i=0; ilimb[LIMBPERM(j)]) << fill; + fill += LIMB_PLACE_VALUE(LIMBPERM(j)); + j++; + } + serial[i] = buffer; + fill -= 8; + buffer >>= 8; + } +} + +/** Deserialize from wire format; return -1 on success and 0 on failure. */ +mask_t gf_deserialize (gf x, const uint8_t serial[SER_BYTES]) { + unsigned int j=0, fill=0; + dword_t buffer = 0; + dsword_t scarry = 0; + UNROLL for (unsigned int i=0; ilimb[LIMBPERM(i)] = (i>= LIMB_PLACE_VALUE(LIMBPERM(i)); + scarry = (scarry + x->limb[LIMBPERM(i)] - MODULUS->limb[LIMBPERM(i)]) >> (8*sizeof(word_t)); + } + return word_is_zero(buffer) & ~word_is_zero(scarry); +} + +/** Reduce to canonical form. */ +void gf_strong_reduce (gf a) { + /* first, clear high */ + gf_weak_reduce(a); /* Determined to have negligible perf impact. */ + + /* now the total is less than 2p */ + + /* compute total_value - p. No need to reduce mod p. */ + dsword_t scarry = 0; + for (unsigned int i=0; ilimb[LIMBPERM(i)] - MODULUS->limb[LIMBPERM(i)]; + a->limb[i] = scarry & LIMB_MASK(LIMBPERM(i)); + scarry >>= LIMB_PLACE_VALUE(LIMBPERM(i)); + } + + /* uncommon case: it was >= p, so now scarry = 0 and this = x + * common case: it was < p, so now scarry = -1 and this = x - p + 2^255 + * so let's add back in p. will carry back off the top for 2^255. 
+ */ + assert(word_is_zero(scarry) | word_is_zero(scarry+1)); + + word_t scarry_0 = scarry; + dword_t carry = 0; + + /* add it back */ + for (unsigned int i=0; ilimb[LIMBPERM(i)] + (scarry_0 & MODULUS->limb[LIMBPERM(i)]); + a->limb[i] = carry & LIMB_MASK(LIMBPERM(i)); + carry >>= LIMB_PLACE_VALUE(LIMBPERM(i)); + } + + assert(word_is_zero(carry + scarry_0)); +} + +/** Compare a==b */ +mask_t gf_eq(const gf a, const gf b) { + gf c; + gf_sub(c,a,b); + gf_strong_reduce(c); + mask_t ret=0; + for (unsigned int i=0; ilimb)/sizeof(c->limb[0]); i++) { + ret |= c->limb[i]; + } + + return word_is_zero(ret); +}