@@ -19,13 +19,13 @@ ASM ?= $(CC) | |||||
DECAF ?= decaf_fast | DECAF ?= decaf_fast | ||||
ifneq (,$(findstring x86_64,$(MACHINE))) | ifneq (,$(findstring x86_64,$(MACHINE))) | ||||
ARCH ?= arch_x86_64 | |||||
ARCH ?= arch_ref64 | |||||
else | else | ||||
# no i386 port yet | # no i386 port yet | ||||
ARCH ?= arch_arm_32 | |||||
ARCH ?= arch_ref32 | |||||
endif | endif | ||||
FIELD ?= p255 | |||||
FIELD ?= p25519 | |||||
WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ | WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ | ||||
-Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN) | -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN) | ||||
@@ -35,7 +35,7 @@ INCFLAGS = -Isrc/include -Iinclude -Isrc/$(FIELD) -Isrc/$(FIELD)/$(ARCH) | |||||
LANGFLAGS = -std=c99 -fno-strict-aliasing | LANGFLAGS = -std=c99 -fno-strict-aliasing | ||||
LANGXXFLAGS = -fno-strict-aliasing | LANGXXFLAGS = -fno-strict-aliasing | ||||
GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC | GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC | ||||
OFLAGS ?= -O3 | |||||
OFLAGS ?= -O2 | |||||
TODAY = $(shell date "+%Y-%m-%d") | TODAY = $(shell date "+%Y-%m-%d") | ||||
@@ -61,7 +61,7 @@ typedef uint64_t decaf_dword_t; | |||||
/** Galois field element internal structure */ | /** Galois field element internal structure */ | ||||
typedef struct gf_s { | typedef struct gf_s { | ||||
decaf_word_t limb[DECAF_255_LIMBS]; | decaf_word_t limb[DECAF_255_LIMBS]; | ||||
} __attribute__((aligned(32))) gf_s, gf[1]; | |||||
} gf_s, gf[1]; | |||||
/** @endcond */ | /** @endcond */ | ||||
/** Number of bytes in a serialized point. */ | /** Number of bytes in a serialized point. */ | ||||
@@ -18,7 +18,7 @@ | |||||
#include "shake.h" | #include "shake.h" | ||||
/** Number of bytes for a symmetric key (expanded to full key) */ | /** Number of bytes for a symmetric key (expanded to full key) */ | ||||
#define DECAF_448_SYMMETRIC_KEY_BYTES 32 | |||||
#define DECAF_255_SYMMETRIC_KEY_BYTES 32 | |||||
/** @cond internal */ | /** @cond internal */ | ||||
#define API_VIS __attribute__((visibility("default"))) __attribute__((noinline)) // TODO: synergize with decaf.h | #define API_VIS __attribute__((visibility("default"))) __attribute__((noinline)) // TODO: synergize with decaf.h | ||||
@@ -31,29 +31,29 @@ | |||||
/** @endcond */ | /** @endcond */ | ||||
/** A symmetric key, the compressed point of a private key. */ | /** A symmetric key, the compressed point of a private key. */ | ||||
typedef unsigned char decaf_448_symmetric_key_t[DECAF_448_SYMMETRIC_KEY_BYTES]; | |||||
typedef unsigned char decaf_255_symmetric_key_t[DECAF_255_SYMMETRIC_KEY_BYTES]; | |||||
/** An encoded public key. */ | /** An encoded public key. */ | ||||
typedef unsigned char decaf_448_public_key_t[DECAF_448_SER_BYTES]; | |||||
typedef unsigned char decaf_255_public_key_t[DECAF_255_SER_BYTES]; | |||||
/** A signature. */ | /** A signature. */ | ||||
typedef unsigned char decaf_448_signature_t[DECAF_448_SER_BYTES + DECAF_448_SCALAR_BYTES]; | |||||
typedef unsigned char decaf_255_signature_t[DECAF_255_SER_BYTES + DECAF_255_SCALAR_BYTES]; | |||||
typedef struct { | typedef struct { | ||||
/** @cond internal */ | /** @cond internal */ | ||||
/** The symmetric key from which everything is expanded */ | /** The symmetric key from which everything is expanded */ | ||||
decaf_448_symmetric_key_t sym; | |||||
decaf_255_symmetric_key_t sym; | |||||
/** The scalar x */ | /** The scalar x */ | ||||
decaf_448_scalar_t secret_scalar; | |||||
decaf_255_scalar_t secret_scalar; | |||||
/** x*Base */ | /** x*Base */ | ||||
decaf_448_public_key_t pub; | |||||
decaf_255_public_key_t pub; | |||||
/** @endcond */ | /** @endcond */ | ||||
} /** Private key structure for pointers. */ | } /** Private key structure for pointers. */ | ||||
decaf_448_private_key_s, | |||||
decaf_255_private_key_s, | |||||
/** A private key (gmp array[1] style). */ | /** A private key (gmp array[1] style). */ | ||||
decaf_448_private_key_t[1]; | |||||
decaf_255_private_key_t[1]; | |||||
#ifdef __cplusplus | #ifdef __cplusplus | ||||
extern "C" { | extern "C" { | ||||
@@ -64,16 +64,16 @@ extern "C" { | |||||
* @param [out] priv The derived private key. | * @param [out] priv The derived private key. | ||||
* @param [in] proto The compressed or proto-key, which must be 32 random bytes. | * @param [in] proto The compressed or proto-key, which must be 32 random bytes. | ||||
*/ | */ | ||||
void decaf_448_derive_private_key ( | |||||
decaf_448_private_key_t priv, | |||||
const decaf_448_symmetric_key_t proto | |||||
void decaf_255_derive_private_key ( | |||||
decaf_255_private_key_t priv, | |||||
const decaf_255_symmetric_key_t proto | |||||
) NONNULL2 API_VIS; | ) NONNULL2 API_VIS; | ||||
/** | /** | ||||
* @brief Destroy a private key. | * @brief Destroy a private key. | ||||
*/ | */ | ||||
void decaf_448_destroy_private_key ( | |||||
decaf_448_private_key_t priv | |||||
void decaf_255_destroy_private_key ( | |||||
decaf_255_private_key_t priv | |||||
) NONNULL1 API_VIS; | ) NONNULL1 API_VIS; | ||||
/** | /** | ||||
@@ -81,9 +81,9 @@ void decaf_448_destroy_private_key ( | |||||
* @param [out] pub The extracted public key. | * @param [out] pub The extracted public key. | ||||
* @param [in] priv The private key. | * @param [in] priv The private key. | ||||
*/ | */ | ||||
void decaf_448_private_to_public ( | |||||
decaf_448_public_key_t pub, | |||||
const decaf_448_private_key_t priv | |||||
void decaf_255_private_to_public ( | |||||
decaf_255_public_key_t pub, | |||||
const decaf_255_private_key_t priv | |||||
) NONNULL2 API_VIS; | ) NONNULL2 API_VIS; | ||||
/** | /** | ||||
@@ -104,11 +104,11 @@ void decaf_448_private_to_public ( | |||||
* and will almost definitely change in the future. | * and will almost definitely change in the future. | ||||
*/ | */ | ||||
decaf_bool_t | decaf_bool_t | ||||
decaf_448_shared_secret ( | |||||
decaf_255_shared_secret ( | |||||
uint8_t *shared, | uint8_t *shared, | ||||
size_t shared_bytes, | size_t shared_bytes, | ||||
const decaf_448_private_key_t my_privkey, | |||||
const decaf_448_public_key_t your_pubkey | |||||
const decaf_255_private_key_t my_privkey, | |||||
const decaf_255_public_key_t your_pubkey | |||||
) NONNULL134 WARN_UNUSED API_VIS; | ) NONNULL134 WARN_UNUSED API_VIS; | ||||
/** | /** | ||||
@@ -119,9 +119,9 @@ decaf_448_shared_secret ( | |||||
* @param [in] shake A SHAKE256 context with the message. | * @param [in] shake A SHAKE256 context with the message. | ||||
*/ | */ | ||||
void | void | ||||
decaf_448_sign_shake ( | |||||
decaf_448_signature_t sig, | |||||
const decaf_448_private_key_t priv, | |||||
decaf_255_sign_shake ( | |||||
decaf_255_signature_t sig, | |||||
const decaf_255_private_key_t priv, | |||||
const keccak_sponge_t shake | const keccak_sponge_t shake | ||||
) NONNULL3 API_VIS; | ) NONNULL3 API_VIS; | ||||
@@ -134,9 +134,9 @@ decaf_448_sign_shake ( | |||||
* @param [in] message_len The message's length. | * @param [in] message_len The message's length. | ||||
*/ | */ | ||||
void | void | ||||
decaf_448_sign ( | |||||
decaf_448_signature_t sig, | |||||
const decaf_448_private_key_t priv, | |||||
decaf_255_sign ( | |||||
decaf_255_signature_t sig, | |||||
const decaf_255_private_key_t priv, | |||||
const unsigned char *message, | const unsigned char *message, | ||||
size_t message_len | size_t message_len | ||||
) NONNULL3 API_VIS; | ) NONNULL3 API_VIS; | ||||
@@ -149,9 +149,9 @@ decaf_448_sign ( | |||||
* @param [in] shake A SHAKE256 context with the message. | * @param [in] shake A SHAKE256 context with the message. | ||||
*/ | */ | ||||
decaf_bool_t | decaf_bool_t | ||||
decaf_448_verify_shake ( | |||||
const decaf_448_signature_t sig, | |||||
const decaf_448_public_key_t pub, | |||||
decaf_255_verify_shake ( | |||||
const decaf_255_signature_t sig, | |||||
const decaf_255_public_key_t pub, | |||||
const keccak_sponge_t shake | const keccak_sponge_t shake | ||||
) NONNULL3 API_VIS WARN_UNUSED; | ) NONNULL3 API_VIS WARN_UNUSED; | ||||
@@ -164,9 +164,9 @@ decaf_448_verify_shake ( | |||||
* @param [in] message_len The message's length. | * @param [in] message_len The message's length. | ||||
*/ | */ | ||||
decaf_bool_t | decaf_bool_t | ||||
decaf_448_verify ( | |||||
const decaf_448_signature_t sig, | |||||
const decaf_448_public_key_t pub, | |||||
decaf_255_verify ( | |||||
const decaf_255_signature_t sig, | |||||
const decaf_255_public_key_t pub, | |||||
const unsigned char *message, | const unsigned char *message, | ||||
size_t message_len | size_t message_len | ||||
) NONNULL3 API_VIS WARN_UNUSED; | ) NONNULL3 API_VIS WARN_UNUSED; | ||||
@@ -192,18 +192,18 @@ private: | |||||
}; | }; | ||||
/**@cond internal*/ | /**@cond internal*/ | ||||
inline Ed448::Scalar::Scalar(SpongeRng &rng) NOEXCEPT { | |||||
inline Ed255::Scalar::Scalar(SpongeRng &rng) NOEXCEPT { | |||||
*this = rng.read(SER_BYTES); | *this = rng.read(SER_BYTES); | ||||
} | } | ||||
inline Ed448::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT { | |||||
inline Ed255::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT { | |||||
SecureBuffer buffer((uniform ? 2 : 1) * HASH_BYTES); | SecureBuffer buffer((uniform ? 2 : 1) * HASH_BYTES); | ||||
rng.read(buffer); | rng.read(buffer); | ||||
set_to_hash(buffer); | set_to_hash(buffer); | ||||
} | } | ||||
inline SecureBuffer Ed448::Point::steg_encode(SpongeRng &rng) const NOEXCEPT { | |||||
inline SecureBuffer Ed255::Point::steg_encode(SpongeRng &rng) const NOEXCEPT { | |||||
SecureBuffer out(STEG_BYTES); | SecureBuffer out(STEG_BYTES); | ||||
bool done; | bool done; | ||||
do { | do { | ||||
@@ -45,14 +45,22 @@ typedef int64_t decaf_sdword_t; | |||||
#define siv static inline void __attribute__((always_inline)) | #define siv static inline void __attribute__((always_inline)) | ||||
static const gf ZERO = {{{0}}}, ONE = {{{1}}}, TWO = {{{2}}}; | static const gf ZERO = {{{0}}}, ONE = {{{1}}}, TWO = {{{2}}}; | ||||
static const int EDWARDS_D = 121665; | |||||
static const int EDWARDS_D = -89747; | |||||
// Gonna test with PinkBikeShed until the math works... | |||||
// Curve25519: 121665; | |||||
static const scalar_t sc_p = {{{ | static const scalar_t sc_p = {{{ | ||||
// Gonna test with PinkBikeShed until the math works... | |||||
SC_LIMB(0xb6b98fd8849faf35), | |||||
SC_LIMB(0x16241e6093b2ce59), | |||||
SC_LIMB(0), | |||||
SC_LIMB(0x2000000000000000) | |||||
/* Curve25519: | |||||
SC_LIMB(0x5812631a5cf5d3ed), | SC_LIMB(0x5812631a5cf5d3ed), | ||||
SC_LIMB(0x14def9dea2f79cd6), | SC_LIMB(0x14def9dea2f79cd6), | ||||
SC_LIMB(0), | SC_LIMB(0), | ||||
SC_LIMB(0), | |||||
SC_LIMB(0x1000000000000000) | SC_LIMB(0x1000000000000000) | ||||
*/ | |||||
}}}; | }}}; | ||||
const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}}; | const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}}; | ||||
@@ -61,7 +69,7 @@ extern const decaf_word_t MONTGOMERY_FACTOR; | |||||
/* sqrt(9) = 3 from the curve spec. Not exported, but used by pregen tool. */ | /* sqrt(9) = 3 from the curve spec. Not exported, but used by pregen tool. */ | ||||
const unsigned char base_point_ser_for_pregen[SER_BYTES] = { | const unsigned char base_point_ser_for_pregen[SER_BYTES] = { | ||||
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 | |||||
5 /*PinkBikeShed. Curve25519: 3*/, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 | |||||
}; | }; | ||||
extern const point_t API_NS(point_base); | extern const point_t API_NS(point_base); | ||||
@@ -82,16 +90,16 @@ const size_t API_NS2(alignof,precomputed_s) = 32; | |||||
#ifdef __clang__ | #ifdef __clang__ | ||||
#if 100*__clang_major__ + __clang_minor__ > 305 | #if 100*__clang_major__ + __clang_minor__ > 305 | ||||
#define VECTORIZE _Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(8)") | |||||
#define UNROLL _Pragma("clang loop unroll(full)") // FIXME: vectorize? | |||||
#endif | #endif | ||||
#endif | #endif | ||||
#ifndef VECTORIZE | |||||
#define VECTORIZE | |||||
#ifndef UNROLL | |||||
#define UNROLL | |||||
#endif | #endif | ||||
#define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++) { op; }} | #define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++) { op; }} | ||||
#define FOR_LIMB_V(i,op) { unsigned int i=0; VECTORIZE for (i=0; i<NLIMBS; i++) { op; }} | |||||
#define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<NLIMBS; i++) { op; }} | |||||
/** Copy x = y */ | /** Copy x = y */ | ||||
siv gf_cpy(gf x, const gf y) { x[0] = y[0]; } | siv gf_cpy(gf x, const gf y) { x[0] = y[0]; } | ||||
@@ -138,7 +146,7 @@ siv gf_bias ( gf c, int amt) { | |||||
/** Subtract mod p. Bias by 2 and don't reduce */ | /** Subtract mod p. Bias by 2 and don't reduce */ | ||||
siv gf_sub_nr ( gf_s *__restrict__ c, const gf a, const gf b ) { | siv gf_sub_nr ( gf_s *__restrict__ c, const gf a, const gf b ) { | ||||
// FOR_LIMB_V(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] ); | |||||
// FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] ); | |||||
ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO | ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO | ||||
field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b); | field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b); | ||||
gf_bias(c, 2); | gf_bias(c, 2); | ||||
@@ -155,7 +163,7 @@ siv gf_sub_nr_x ( gf c, const gf a, const gf b, int amt ) { | |||||
/** Add mod p. Don't reduce. */ | /** Add mod p. Don't reduce. */ | ||||
siv gf_add_nr ( gf c, const gf a, const gf b ) { | siv gf_add_nr ( gf c, const gf a, const gf b ) { | ||||
// FOR_LIMB_V(i, c->limb[i] = a->limb[i] + b->limb[i]); | |||||
// FOR_LIMB_U(i, c->limb[i] = a->limb[i] + b->limb[i]); | |||||
ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO | ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO | ||||
field_add_nr((field_t *)c, (const field_t *)a, (const field_t *)b); | field_add_nr((field_t *)c, (const field_t *)a, (const field_t *)b); | ||||
} | } | ||||
@@ -183,7 +191,7 @@ sv cond_neg(gf x, decaf_bool_t neg) { | |||||
/** Constant time, if (swap) (x,y) = (y,x); */ | /** Constant time, if (swap) (x,y) = (y,x); */ | ||||
siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) { | siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) { | ||||
FOR_LIMB_V(i, { | |||||
FOR_LIMB_U(i, { | |||||
decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap; | decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap; | ||||
x->limb[i] ^= s; | x->limb[i] ^= s; | ||||
y->limb[i] ^= s; | y->limb[i] ^= s; | ||||
@@ -371,9 +379,27 @@ decaf_bool_t API_NS(scalar_invert) ( | |||||
} | } | ||||
return ~API_NS(scalar_eq)(out,API_NS(scalar_zero)); | return ~API_NS(scalar_eq)(out,API_NS(scalar_zero)); | ||||
#else | #else | ||||
(void)out; | |||||
(void)a; | |||||
return 0; | |||||
decaf_255_scalar_t b, ma; | |||||
int i; | |||||
sc_montmul(b,API_NS(scalar_one),sc_r2); | |||||
sc_montmul(ma,a,sc_r2); | |||||
for (i=SCALAR_BITS-1; i>=0; i--) { | |||||
sc_montsqr(b,b); | |||||
decaf_word_t w = sc_p->limb[i/WBITS]; | |||||
if (i<WBITS) { | |||||
assert(w >= 2); | |||||
w-=2; | |||||
} | |||||
if (1 & w>>(i%WBITS)) { | |||||
sc_montmul(b,b,ma); | |||||
} | |||||
} | |||||
sc_montmul(out,b,decaf_255_scalar_one); | |||||
API_NS(scalar_destroy)(b); | |||||
API_NS(scalar_destroy)(ma); | |||||
return ~API_NS(scalar_eq)(out,decaf_255_scalar_zero); | |||||
#endif | #endif | ||||
} | } | ||||
@@ -0,0 +1,50 @@ | |||||
/** | |||||
* @file decaf_config.h | |||||
* @author Mike Hamburg | |||||
* | |||||
* @copyright | |||||
* Copyright (c) 2015 Cryptography Research, Inc. \n | |||||
* Released under the MIT License. See LICENSE.txt for license information. | |||||
* | |||||
* @brief Configuration for decaf_fast.c | |||||
*/ | |||||
#ifndef __DECAF_255_CONFIG_H__ | |||||
#define __DECAF_255_CONFIG_H__ 1 | |||||
/** | |||||
* Use the Montgomery ladder for direct scalarmul. | |||||
* | |||||
* The Montgomery ladder is faster than Edwards scalarmul, but providing | |||||
* the features Decaf supports (cofactor elimination, twist rejection) | |||||
* makes it complicated and adds code. Removing the ladder saves a few | |||||
* kilobytes at the cost of perhaps 5-10% overhead in direct scalarmul | |||||
* time. | |||||
*/ | |||||
#define DECAF_USE_MONTGOMERY_LADDER 1 | |||||
/** The number of comb tables for fixed base scalarmul. */ | |||||
#define DECAF_COMBS_N 3 | |||||
/** The number of teeth per comb for fixed base scalarmul. */ | |||||
#define DECAF_COMBS_T 5 | |||||
/** The comb spacing for fixed base scalarmul. */ | |||||
#define DECAF_COMBS_S 17 | |||||
/** Performance tuning: the width of the fixed window for scalar mul. */ | |||||
#define DECAF_WINDOW_BITS 4 | |||||
/** | |||||
* The number of bits used for the precomputed table in variable-time | |||||
* double scalarmul. | |||||
*/ | |||||
#define DECAF_WNAF_FIXED_TABLE_BITS 5 | |||||
/** | |||||
* Performance tuning: bits used for the variable table in variable-time | |||||
* double scalarmul. | |||||
*/ | |||||
#define DECAF_WNAF_VAR_TABLE_BITS 3 | |||||
#endif /* __DECAF_255_CONFIG_H__ */ |
@@ -22,164 +22,33 @@ p255_mul ( | |||||
const p255_t *as, | const p255_t *as, | ||||
const p255_t *bs | const p255_t *bs | ||||
) { | ) { | ||||
const uint64_t *a = as->limb, *b = bs->limb; | |||||
const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); | |||||
uint64_t bh[4]; | |||||
int i,j; | |||||
for (i=0; i<4; i++) bh[i] = b[i+1] * 19; | |||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
__uint128_t accum0 = 0, accum1 = 0, accum2; | |||||
uint64_t mask = (1ull<<51) - 1; | |||||
uint64_t aa[4], bb[4], bbb[4]; | |||||
unsigned int i; | |||||
for (i=0; i<4; i++) { | |||||
aa[i] = a[i] + a[i+4]; | |||||
bb[i] = b[i] + b[i+4]; | |||||
bbb[i] = bb[i] + b[i+4]; | |||||
} | |||||
int I_HATE_UNROLLED_LOOPS = 0; | |||||
if (I_HATE_UNROLLED_LOOPS) { | |||||
/* The compiler probably won't unroll this, | |||||
* so it's like 80% slower. | |||||
*/ | |||||
for (i=0; i<4; i++) { | |||||
accum2 = 0; | |||||
unsigned int j; | |||||
for (j=0; j<=i; j++) { | |||||
accum2 += widemul(a[j], b[i-j]); | |||||
accum1 += widemul(aa[j], bb[i-j]); | |||||
accum0 += widemul(a[j+4], b[i-j+4]); | |||||
} | |||||
for (; j<4; j++) { | |||||
accum2 += widemul(a[j], b[i-j+8]); | |||||
accum1 += widemul(aa[j], bbb[i-j+4]); | |||||
accum0 += widemul(a[j+4], bb[i-j+4]); | |||||
} | |||||
accum1 -= accum2; | |||||
accum0 += accum2; | |||||
c[i] = ((uint64_t)(accum0)) & mask; | |||||
c[i+4] = ((uint64_t)(accum1)) & mask; | |||||
accum0 >>= 56; | |||||
accum1 >>= 56; | |||||
__uint128_t accum = 0; | |||||
for (i=0; i<5; i++) { | |||||
for (j=0; j<=i; j++) { | |||||
accum += widemul(b[i-j], a[j]); | |||||
} | } | ||||
} else { | |||||
accum2 = widemul(a[0], b[0]); | |||||
accum1 += widemul(aa[0], bb[0]); | |||||
accum0 += widemul(a[4], b[4]); | |||||
accum2 += widemul(a[1], b[7]); | |||||
accum1 += widemul(aa[1], bbb[3]); | |||||
accum0 += widemul(a[5], bb[3]); | |||||
accum2 += widemul(a[2], b[6]); | |||||
accum1 += widemul(aa[2], bbb[2]); | |||||
accum0 += widemul(a[6], bb[2]); | |||||
accum2 += widemul(a[3], b[5]); | |||||
accum1 += widemul(aa[3], bbb[1]); | |||||
accum0 += widemul(a[7], bb[1]); | |||||
accum1 -= accum2; | |||||
accum0 += accum2; | |||||
c[0] = ((uint64_t)(accum0)) & mask; | |||||
c[4] = ((uint64_t)(accum1)) & mask; | |||||
accum0 >>= 56; | |||||
accum1 >>= 56; | |||||
accum2 = widemul(a[0], b[1]); | |||||
accum1 += widemul(aa[0], bb[1]); | |||||
accum0 += widemul(a[4], b[5]); | |||||
accum2 += widemul(a[1], b[0]); | |||||
accum1 += widemul(aa[1], bb[0]); | |||||
accum0 += widemul(a[5], b[4]); | |||||
accum2 += widemul(a[2], b[7]); | |||||
accum1 += widemul(aa[2], bbb[3]); | |||||
accum0 += widemul(a[6], bb[3]); | |||||
accum2 += widemul(a[3], b[6]); | |||||
accum1 += widemul(aa[3], bbb[2]); | |||||
accum0 += widemul(a[7], bb[2]); | |||||
accum1 -= accum2; | |||||
accum0 += accum2; | |||||
c[1] = ((uint64_t)(accum0)) & mask; | |||||
c[5] = ((uint64_t)(accum1)) & mask; | |||||
accum0 >>= 56; | |||||
accum1 >>= 56; | |||||
accum2 = widemul(a[0], b[2]); | |||||
accum1 += widemul(aa[0], bb[2]); | |||||
accum0 += widemul(a[4], b[6]); | |||||
accum2 += widemul(a[1], b[1]); | |||||
accum1 += widemul(aa[1], bb[1]); | |||||
accum0 += widemul(a[5], b[5]); | |||||
accum2 += widemul(a[2], b[0]); | |||||
accum1 += widemul(aa[2], bb[0]); | |||||
accum0 += widemul(a[6], b[4]); | |||||
accum2 += widemul(a[3], b[7]); | |||||
accum1 += widemul(aa[3], bbb[3]); | |||||
accum0 += widemul(a[7], bb[3]); | |||||
accum1 -= accum2; | |||||
accum0 += accum2; | |||||
c[2] = ((uint64_t)(accum0)) & mask; | |||||
c[6] = ((uint64_t)(accum1)) & mask; | |||||
accum0 >>= 56; | |||||
accum1 >>= 56; | |||||
accum2 = widemul(a[0], b[3]); | |||||
accum1 += widemul(aa[0], bb[3]); | |||||
accum0 += widemul(a[4], b[7]); | |||||
accum2 += widemul(a[1], b[2]); | |||||
accum1 += widemul(aa[1], bb[2]); | |||||
accum0 += widemul(a[5], b[6]); | |||||
accum2 += widemul(a[2], b[1]); | |||||
accum1 += widemul(aa[2], bb[1]); | |||||
accum0 += widemul(a[6], b[5]); | |||||
accum2 += widemul(a[3], b[0]); | |||||
accum1 += widemul(aa[3], bb[0]); | |||||
accum0 += widemul(a[7], b[4]); | |||||
accum1 -= accum2; | |||||
accum0 += accum2; | |||||
c[3] = ((uint64_t)(accum0)) & mask; | |||||
c[7] = ((uint64_t)(accum1)) & mask; | |||||
accum0 >>= 56; | |||||
accum1 >>= 56; | |||||
} /* !I_HATE_UNROLLED_LOOPS */ | |||||
accum0 += accum1; | |||||
accum0 += c[4]; | |||||
accum1 += c[0]; | |||||
c[4] = ((uint64_t)(accum0)) & mask; | |||||
c[0] = ((uint64_t)(accum1)) & mask; | |||||
accum0 >>= 56; | |||||
accum1 >>= 56; | |||||
c[5] += ((uint64_t)(accum0)); | |||||
c[1] += ((uint64_t)(accum1)); | |||||
for (; j<5; j++) { | |||||
accum += widemul(bh[i-j+4], a[j]); | |||||
} | |||||
c[i] = accum & mask; | |||||
accum >>= 51; | |||||
} | |||||
/* PERF: parallelize? eh well this is reference */ | |||||
accum *= 19; | |||||
accum += c[0]; | |||||
c[0] = accum & mask; | |||||
accum >>= 51; | |||||
assert(accum < mask); | |||||
c[1] += accum; | |||||
} | } | ||||
void | void | ||||
@@ -188,27 +57,25 @@ p255_mulw ( | |||||
const p255_t *as, | const p255_t *as, | ||||
uint64_t b | uint64_t b | ||||
) { | ) { | ||||
const uint64_t *a = as->limb; | |||||
const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | |||||
int i; | |||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
__uint128_t accum0 = 0, accum4 = 0; | |||||
uint64_t mask = (1ull<<56) - 1; | |||||
int i; | |||||
for (i=0; i<4; i++) { | |||||
accum0 += widemul(b, a[i]); | |||||
accum4 += widemul(b, a[i+4]); | |||||
c[i] = accum0 & mask; accum0 >>= 56; | |||||
c[i+4] = accum4 & mask; accum4 >>= 56; | |||||
__uint128_t accum = 0; | |||||
for (i=0; i<5; i++) { | |||||
accum += widemul(b, a[i]); | |||||
c[i] = accum & mask; | |||||
accum >>= 51; | |||||
} | } | ||||
/* PERF: parallelize? eh well this is reference */ | |||||
accum *= 19; | |||||
accum += c[0]; | |||||
c[0] = accum & mask; | |||||
accum >>= 51; | |||||
accum0 += accum4 + c[4]; | |||||
c[4] = accum0 & mask; | |||||
c[5] += accum0 >> 56; | |||||
accum4 += c[0]; | |||||
c[0] = accum4 & mask; | |||||
c[1] += accum4 >> 56; | |||||
assert(accum < mask); | |||||
c[1] += accum; | |||||
} | } | ||||
void | void | ||||
@@ -223,23 +90,21 @@ void | |||||
p255_strong_reduce ( | p255_strong_reduce ( | ||||
p255_t *a | p255_t *a | ||||
) { | ) { | ||||
uint64_t mask = (1ull<<56)-1; | |||||
uint64_t mask = (1ull<<51)-1; | |||||
/* first, clear high */ | /* first, clear high */ | ||||
a->limb[4] += a->limb[7]>>56; | |||||
a->limb[0] += a->limb[7]>>56; | |||||
a->limb[7] &= mask; | |||||
a->limb[0] += (a->limb[4]>>51)*19; | |||||
a->limb[4] &= mask; | |||||
/* now the total is less than 2^255 - 2^(255-56) + 2^(255-56+8) < 2p */ | |||||
/* now the total is less than 2p */ | |||||
/* compute total_value - p. No need to reduce mod p. */ | /* compute total_value - p. No need to reduce mod p. */ | ||||
__int128_t scarry = 0; | __int128_t scarry = 0; | ||||
int i; | int i; | ||||
for (i=0; i<8; i++) { | |||||
scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask); | |||||
for (i=0; i<5; i++) { | |||||
scarry = scarry + a->limb[i] - ((i==0)?mask-18:mask); | |||||
a->limb[i] = scarry & mask; | a->limb[i] = scarry & mask; | ||||
scarry >>= 56; | |||||
scarry >>= 51; | |||||
} | } | ||||
/* uncommon case: it was >= p, so now scarry = 0 and this = x | /* uncommon case: it was >= p, so now scarry = 0 and this = x | ||||
@@ -253,10 +118,10 @@ p255_strong_reduce ( | |||||
__uint128_t carry = 0; | __uint128_t carry = 0; | ||||
/* add it back */ | /* add it back */ | ||||
for (i=0; i<8; i++) { | |||||
carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask); | |||||
for (i=0; i<5; i++) { | |||||
carry = carry + a->limb[i] + ((i==0)?(scarry_mask&~18):scarry_mask); | |||||
a->limb[i] = carry & mask; | a->limb[i] = carry & mask; | ||||
carry >>= 56; | |||||
carry >>= 51; | |||||
} | } | ||||
assert(is_zero(carry + scarry)); | assert(is_zero(carry + scarry)); | ||||
@@ -271,12 +136,13 @@ p255_serialize ( | |||||
p255_t red; | p255_t red; | ||||
p255_copy(&red, x); | p255_copy(&red, x); | ||||
p255_strong_reduce(&red); | p255_strong_reduce(&red); | ||||
for (i=0; i<8; i++) { | |||||
for (j=0; j<7; j++) { | |||||
serial[7*i+j] = red.limb[i]; | |||||
red.limb[i] >>= 8; | |||||
uint64_t *r = red.limb; | |||||
uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12}; | |||||
for (i=0; i<4; i++) { | |||||
for (j=0; j<8; j++) { | |||||
serial[8*i+j] = ser64[i]; | |||||
ser64[i] >>= 8; | |||||
} | } | ||||
assert(red.limb[i] == 0); | |||||
} | } | ||||
} | } | ||||
@@ -286,33 +152,27 @@ p255_deserialize ( | |||||
const uint8_t serial[32] | const uint8_t serial[32] | ||||
) { | ) { | ||||
int i,j; | int i,j; | ||||
for (i=0; i<8; i++) { | |||||
uint64_t ser64[4], mask = ((1ull<<51)-1); | |||||
for (i=0; i<4; i++) { | |||||
uint64_t out = 0; | uint64_t out = 0; | ||||
for (j=0; j<7; j++) { | |||||
out |= ((uint64_t)serial[7*i+j])<<(8*j); | |||||
for (j=0; j<8; j++) { | |||||
out |= ((uint64_t)serial[8*i+j])<<(8*j); | |||||
} | } | ||||
x->limb[i] = out; | |||||
ser64[i] = out; | |||||
} | } | ||||
/* Check for reduction. | |||||
* | |||||
* The idea is to create a variable ge which is all ones (rather, 56 ones) | |||||
* if and only if the low $i$ words of $x$ are >= those of p. | |||||
* | |||||
* Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) | |||||
*/ | |||||
uint64_t ge = -1, mask = (1ull<<56)-1; | |||||
for (i=0; i<4; i++) { | |||||
ge &= x->limb[i]; | |||||
} | |||||
/* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ | |||||
ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask); | |||||
/* Test for >= 2^255-19 */ | |||||
uint64_t ge = -(((__uint128_t)ser64[0]+19)>>64); | |||||
ge &= ser64[1]; | |||||
ge &= ser64[2]; | |||||
ge &= (ser64[3]<<1) + 1; | |||||
ge |= -(((__uint128_t)ser64[3]+0x8000000000000000)>>64); | |||||
/* Propagate the rest */ | |||||
for (i=5; i<8; i++) { | |||||
ge &= x->limb[i]; | |||||
} | |||||
x->limb[0] = ser64[0] & mask; | |||||
x->limb[1] = (ser64[0]>>51 | ser64[1]<<13) & mask; | |||||
x->limb[2] = (ser64[1]>>38 | ser64[2]<<26) & mask; | |||||
x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask; | |||||
x->limb[4] = ser64[3]>>12; | |||||
return ~is_zero(ge ^ mask); | |||||
return ~is_zero(~ge); | |||||
} | } |
@@ -15,7 +15,17 @@ typedef struct p255_t { | |||||
} p255_t; | } p255_t; | ||||
#define LBITS 51 | #define LBITS 51 | ||||
#define FIELD_LITERAL(a,b,c,d,e) {{a,b,c,d,e}} | |||||
#define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }} | |||||
/* | |||||
#define FIELD_LITERAL(a,b,c,d) {{ \ | |||||
(a##ull) & LMASK, \ | |||||
((a##ull)>>51 | (b##ull)<<13) & LMASK, \ | |||||
((b##ull)>>38 | (c##ull)<<26) & LMASK, \ | |||||
((c##ull)>>25 | (d##ull)<<39) & LMASK, \ | |||||
(d##ull)>>12 \ | |||||
}} | |||||
*/ | |||||
#ifdef __cplusplus | #ifdef __cplusplus | ||||
extern "C" { | extern "C" { | ||||
@@ -140,9 +150,9 @@ p255_weak_reduce ( | |||||
p255_t *a | p255_t *a | ||||
) { | ) { | ||||
uint64_t mask = (1ull<<51) - 1; | uint64_t mask = (1ull<<51) - 1; | ||||
uint64_t tmp = a->limb[5] >> 51; | |||||
uint64_t tmp = a->limb[4] >> 51; | |||||
int i; | int i; | ||||
for (i=7; i>0; i--) { | |||||
for (i=4; i>0; i--) { | |||||
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>51); | a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>51); | ||||
} | } | ||||
a->limb[0] = (a->limb[0] & mask) + tmp*19; | a->limb[0] = (a->limb[0] & mask) + tmp*19; | ||||
@@ -10,58 +10,51 @@ | |||||
#include "field.h" | #include "field.h" | ||||
extern field_a_t ONE; // TODO | |||||
static const field_a_t SQRT_MINUS_ONE = FIELD_LITERAL( // FIXME goes elsewhere? | |||||
static const field_a_t SQRT_MINUS_ONE = {FIELD_LITERAL( // FIXME goes elsewhere? | |||||
0x61b274a0ea0b0, | 0x61b274a0ea0b0, | ||||
0x0d5a5fc8f189d, | 0x0d5a5fc8f189d, | ||||
0x7ef5e9cbd0c60, | 0x7ef5e9cbd0c60, | ||||
0x78595a6804c9e, | 0x78595a6804c9e, | ||||
0x2b8324804fc1d | 0x2b8324804fc1d | ||||
); | |||||
)}; | |||||
static const field_a_t ONE = {FIELD_LITERAL( // FIXME copy-pasted | |||||
1,0,0,0,0 | |||||
)}; | |||||
void | |||||
field_isr ( | |||||
field_a_t a, | |||||
const field_a_t x | |||||
) { | |||||
field_a_t st[3], tmp1, tmp2; | |||||
const struct { unsigned char sh, idx } ops[] = { | |||||
{1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2} | |||||
}; | |||||
field_cpy(st[0],x); | |||||
field_cpy(st[1],x); | |||||
field_cpy(st[2],x); | |||||
// ARCH MAGIC FIXME copy-pasted from decaf_fast.c | |||||
static mask_t gf_eq(const field_a_t a, const field_a_t b) { | |||||
field_a_t c; | |||||
field_sub(c,a,b); | |||||
field_strong_reduce(c); | |||||
mask_t ret=0; | |||||
int i; | int i; | ||||
for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) { | |||||
field_sqrn(tmp1, st[1^i&1], ops[i].sh); | |||||
field_mul(tmp2, tmp1, st[ops[i].idx]); | |||||
field_cpy(st[i&1], tmp2); | |||||
} | |||||
mask_t m = field_eq(st[1], ONE); | |||||
cond_sel(tmp1,SQRT_MINUS_ONE,ONE,m); | |||||
field_mul(a,tmp1,st[0]); | |||||
}; | |||||
for (i=0; i<5; i++) { ret |= c->limb[i]; } | |||||
return ((__uint128_t)ret - 1) >> 64; | |||||
} | |||||
/* Guarantee: a^2 x = 0 if x = 0; else a^2 x = 1 or SQRT_MINUS_ONE; */ | |||||
void | void | ||||
field_isr ( | field_isr ( | ||||
field_a_t a, | field_a_t a, | ||||
const field_a_t x | const field_a_t x | ||||
) { | ) { | ||||
field_a_t st[3], tmp1, tmp2; | field_a_t st[3], tmp1, tmp2; | ||||
const struct { unsigned char sh, idx } ops[] = { | |||||
const struct { unsigned char sh, idx; } ops[] = { | |||||
{1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2} | {1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2} | ||||
}; | }; | ||||
field_cpy(st[0],x); | |||||
field_cpy(st[1],x); | |||||
field_cpy(st[2],x); | |||||
int i; | |||||
st[0][0] = st[1][0] = st[2][0] = x[0]; | |||||
unsigned int i; | |||||
for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) { | for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) { | ||||
field_sqrn(tmp1, st[1^i&1], ops[i].sh); | field_sqrn(tmp1, st[1^i&1], ops[i].sh); | ||||
field_mul(tmp2, tmp1, st[ops[i].idx]); | field_mul(tmp2, tmp1, st[ops[i].idx]); | ||||
field_cpy(st[i&1], tmp2); | |||||
st[i&1][0] = tmp2[0]; | |||||
} | } | ||||
mask_t m = field_eq(st[1], ONE); | |||||
mask_t mask = gf_eq(st[1],ONE) | gf_eq(st[1],SQRT_MINUS_ONE); | |||||
// ARCH MAGIC FIXME: should be cond_sel | |||||
for (i=0; i<5; i++) tmp1->limb[i] = (ONE->limb[i] & mask) | |||||
| (SQRT_MINUS_ONE->limb[i] & ~mask); | |||||
field_mul(a,tmp1,st[0]); | |||||
} | } |