@@ -19,13 +19,13 @@ ASM ?= $(CC) | |||
DECAF ?= decaf_fast | |||
ifneq (,$(findstring x86_64,$(MACHINE))) | |||
ARCH ?= arch_x86_64 | |||
ARCH ?= arch_ref64 | |||
else | |||
# no i386 port yet | |||
ARCH ?= arch_arm_32 | |||
ARCH ?= arch_ref32 | |||
endif | |||
FIELD ?= p255 | |||
FIELD ?= p25519 | |||
WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ | |||
-Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN) | |||
@@ -35,7 +35,7 @@ INCFLAGS = -Isrc/include -Iinclude -Isrc/$(FIELD) -Isrc/$(FIELD)/$(ARCH) | |||
LANGFLAGS = -std=c99 -fno-strict-aliasing | |||
LANGXXFLAGS = -fno-strict-aliasing | |||
GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC | |||
OFLAGS ?= -O3 | |||
OFLAGS ?= -O2 | |||
TODAY = $(shell date "+%Y-%m-%d") | |||
@@ -61,7 +61,7 @@ typedef uint64_t decaf_dword_t; | |||
/** Galois field element internal structure */ | |||
typedef struct gf_s { | |||
decaf_word_t limb[DECAF_255_LIMBS]; | |||
} __attribute__((aligned(32))) gf_s, gf[1]; | |||
} gf_s, gf[1]; | |||
/** @endcond */ | |||
/** Number of bytes in a serialized point. */ | |||
@@ -18,7 +18,7 @@ | |||
#include "shake.h" | |||
/** Number of bytes for a symmetric key (expanded to full key) */ | |||
#define DECAF_448_SYMMETRIC_KEY_BYTES 32 | |||
#define DECAF_255_SYMMETRIC_KEY_BYTES 32 | |||
/** @cond internal */ | |||
#define API_VIS __attribute__((visibility("default"))) __attribute__((noinline)) // TODO: synergize with decaf.h | |||
@@ -31,29 +31,29 @@ | |||
/** @endcond */ | |||
/** A symmetric key, the compressed point of a private key. */ | |||
typedef unsigned char decaf_448_symmetric_key_t[DECAF_448_SYMMETRIC_KEY_BYTES]; | |||
typedef unsigned char decaf_255_symmetric_key_t[DECAF_255_SYMMETRIC_KEY_BYTES]; | |||
/** An encoded public key. */ | |||
typedef unsigned char decaf_448_public_key_t[DECAF_448_SER_BYTES]; | |||
typedef unsigned char decaf_255_public_key_t[DECAF_255_SER_BYTES]; | |||
/** A signature. */ | |||
typedef unsigned char decaf_448_signature_t[DECAF_448_SER_BYTES + DECAF_448_SCALAR_BYTES]; | |||
typedef unsigned char decaf_255_signature_t[DECAF_255_SER_BYTES + DECAF_255_SCALAR_BYTES]; | |||
typedef struct { | |||
/** @cond internal */ | |||
/** The symmetric key from which everything is expanded */ | |||
decaf_448_symmetric_key_t sym; | |||
decaf_255_symmetric_key_t sym; | |||
/** The scalar x */ | |||
decaf_448_scalar_t secret_scalar; | |||
decaf_255_scalar_t secret_scalar; | |||
/** x*Base */ | |||
decaf_448_public_key_t pub; | |||
decaf_255_public_key_t pub; | |||
/** @endcond */ | |||
} /** Private key structure for pointers. */ | |||
decaf_448_private_key_s, | |||
decaf_255_private_key_s, | |||
/** A private key (gmp array[1] style). */ | |||
decaf_448_private_key_t[1]; | |||
decaf_255_private_key_t[1]; | |||
#ifdef __cplusplus | |||
extern "C" { | |||
@@ -64,16 +64,16 @@ extern "C" { | |||
* @param [out] priv The derived private key. | |||
* @param [in] proto The compressed or proto-key, which must be 32 random bytes. | |||
*/ | |||
void decaf_448_derive_private_key ( | |||
decaf_448_private_key_t priv, | |||
const decaf_448_symmetric_key_t proto | |||
void decaf_255_derive_private_key ( | |||
decaf_255_private_key_t priv, | |||
const decaf_255_symmetric_key_t proto | |||
) NONNULL2 API_VIS; | |||
/** | |||
* @brief Destroy a private key. | |||
*/ | |||
void decaf_448_destroy_private_key ( | |||
decaf_448_private_key_t priv | |||
void decaf_255_destroy_private_key ( | |||
decaf_255_private_key_t priv | |||
) NONNULL1 API_VIS; | |||
/** | |||
@@ -81,9 +81,9 @@ void decaf_448_destroy_private_key ( | |||
* @param [out] pub The extracted public key. | |||
* @param [in] priv The private key. | |||
*/ | |||
void decaf_448_private_to_public ( | |||
decaf_448_public_key_t pub, | |||
const decaf_448_private_key_t priv | |||
void decaf_255_private_to_public ( | |||
decaf_255_public_key_t pub, | |||
const decaf_255_private_key_t priv | |||
) NONNULL2 API_VIS; | |||
/** | |||
@@ -104,11 +104,11 @@ void decaf_448_private_to_public ( | |||
* and will almost definitely change in the future. | |||
*/ | |||
decaf_bool_t | |||
decaf_448_shared_secret ( | |||
decaf_255_shared_secret ( | |||
uint8_t *shared, | |||
size_t shared_bytes, | |||
const decaf_448_private_key_t my_privkey, | |||
const decaf_448_public_key_t your_pubkey | |||
const decaf_255_private_key_t my_privkey, | |||
const decaf_255_public_key_t your_pubkey | |||
) NONNULL134 WARN_UNUSED API_VIS; | |||
/** | |||
@@ -119,9 +119,9 @@ decaf_448_shared_secret ( | |||
* @param [in] shake A SHAKE256 context with the message. | |||
*/ | |||
void | |||
decaf_448_sign_shake ( | |||
decaf_448_signature_t sig, | |||
const decaf_448_private_key_t priv, | |||
decaf_255_sign_shake ( | |||
decaf_255_signature_t sig, | |||
const decaf_255_private_key_t priv, | |||
const keccak_sponge_t shake | |||
) NONNULL3 API_VIS; | |||
@@ -134,9 +134,9 @@ decaf_448_sign_shake ( | |||
* @param [in] message_len The message's length. | |||
*/ | |||
void | |||
decaf_448_sign ( | |||
decaf_448_signature_t sig, | |||
const decaf_448_private_key_t priv, | |||
decaf_255_sign ( | |||
decaf_255_signature_t sig, | |||
const decaf_255_private_key_t priv, | |||
const unsigned char *message, | |||
size_t message_len | |||
) NONNULL3 API_VIS; | |||
@@ -149,9 +149,9 @@ decaf_448_sign ( | |||
* @param [in] shake A SHAKE256 context with the message. | |||
*/ | |||
decaf_bool_t | |||
decaf_448_verify_shake ( | |||
const decaf_448_signature_t sig, | |||
const decaf_448_public_key_t pub, | |||
decaf_255_verify_shake ( | |||
const decaf_255_signature_t sig, | |||
const decaf_255_public_key_t pub, | |||
const keccak_sponge_t shake | |||
) NONNULL3 API_VIS WARN_UNUSED; | |||
@@ -164,9 +164,9 @@ decaf_448_verify_shake ( | |||
* @param [in] message_len The message's length. | |||
*/ | |||
decaf_bool_t | |||
decaf_448_verify ( | |||
const decaf_448_signature_t sig, | |||
const decaf_448_public_key_t pub, | |||
decaf_255_verify ( | |||
const decaf_255_signature_t sig, | |||
const decaf_255_public_key_t pub, | |||
const unsigned char *message, | |||
size_t message_len | |||
) NONNULL3 API_VIS WARN_UNUSED; | |||
@@ -192,18 +192,18 @@ private: | |||
}; | |||
/**@cond internal*/ | |||
inline Ed448::Scalar::Scalar(SpongeRng &rng) NOEXCEPT { | |||
inline Ed255::Scalar::Scalar(SpongeRng &rng) NOEXCEPT { | |||
*this = rng.read(SER_BYTES); | |||
} | |||
inline Ed448::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT { | |||
inline Ed255::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT { | |||
SecureBuffer buffer((uniform ? 2 : 1) * HASH_BYTES); | |||
rng.read(buffer); | |||
set_to_hash(buffer); | |||
} | |||
inline SecureBuffer Ed448::Point::steg_encode(SpongeRng &rng) const NOEXCEPT { | |||
inline SecureBuffer Ed255::Point::steg_encode(SpongeRng &rng) const NOEXCEPT { | |||
SecureBuffer out(STEG_BYTES); | |||
bool done; | |||
do { | |||
@@ -45,14 +45,22 @@ typedef int64_t decaf_sdword_t; | |||
#define siv static inline void __attribute__((always_inline)) | |||
static const gf ZERO = {{{0}}}, ONE = {{{1}}}, TWO = {{{2}}}; | |||
static const int EDWARDS_D = 121665; | |||
static const int EDWARDS_D = -89747; | |||
// Gonna test with PinkBikeShed until the math works... | |||
// Curve25519: 121665; | |||
static const scalar_t sc_p = {{{ | |||
// Gonna test with PinkBikeShed until the math works... | |||
SC_LIMB(0xb6b98fd8849faf35), | |||
SC_LIMB(0x16241e6093b2ce59), | |||
SC_LIMB(0), | |||
SC_LIMB(0x2000000000000000) | |||
/* Curve25519: | |||
SC_LIMB(0x5812631a5cf5d3ed), | |||
SC_LIMB(0x14def9dea2f79cd6), | |||
SC_LIMB(0), | |||
SC_LIMB(0), | |||
SC_LIMB(0x1000000000000000) | |||
*/ | |||
}}}; | |||
const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}}; | |||
@@ -61,7 +69,7 @@ extern const decaf_word_t MONTGOMERY_FACTOR; | |||
/* sqrt(9) = 3 from the curve spec. Not exported, but used by pregen tool. */ | |||
const unsigned char base_point_ser_for_pregen[SER_BYTES] = { | |||
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 | |||
5 /*PinkBikeShed. Curve25519: 3*/, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 | |||
}; | |||
extern const point_t API_NS(point_base); | |||
@@ -82,16 +90,16 @@ const size_t API_NS2(alignof,precomputed_s) = 32; | |||
#ifdef __clang__ | |||
#if 100*__clang_major__ + __clang_minor__ > 305 | |||
#define VECTORIZE _Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(8)") | |||
#define UNROLL _Pragma("clang loop unroll(full)") // FIXME: vectorize? | |||
#endif | |||
#endif | |||
#ifndef VECTORIZE | |||
#define VECTORIZE | |||
#ifndef UNROLL | |||
#define UNROLL | |||
#endif | |||
#define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++) { op; }} | |||
#define FOR_LIMB_V(i,op) { unsigned int i=0; VECTORIZE for (i=0; i<NLIMBS; i++) { op; }} | |||
#define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<NLIMBS; i++) { op; }} | |||
/** Copy x = y */ | |||
siv gf_cpy(gf x, const gf y) { x[0] = y[0]; } | |||
@@ -138,7 +146,7 @@ siv gf_bias ( gf c, int amt) { | |||
/** Subtract mod p. Bias by 2 and don't reduce */ | |||
siv gf_sub_nr ( gf_s *__restrict__ c, const gf a, const gf b ) { | |||
// FOR_LIMB_V(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] ); | |||
// FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] ); | |||
ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO | |||
field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b); | |||
gf_bias(c, 2); | |||
@@ -155,7 +163,7 @@ siv gf_sub_nr_x ( gf c, const gf a, const gf b, int amt ) { | |||
/** Add mod p. Don't reduce. */ | |||
siv gf_add_nr ( gf c, const gf a, const gf b ) { | |||
// FOR_LIMB_V(i, c->limb[i] = a->limb[i] + b->limb[i]); | |||
// FOR_LIMB_U(i, c->limb[i] = a->limb[i] + b->limb[i]); | |||
ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO | |||
field_add_nr((field_t *)c, (const field_t *)a, (const field_t *)b); | |||
} | |||
@@ -183,7 +191,7 @@ sv cond_neg(gf x, decaf_bool_t neg) { | |||
/** Constant time, if (swap) (x,y) = (y,x); */ | |||
siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) { | |||
FOR_LIMB_V(i, { | |||
FOR_LIMB_U(i, { | |||
decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap; | |||
x->limb[i] ^= s; | |||
y->limb[i] ^= s; | |||
@@ -371,9 +379,27 @@ decaf_bool_t API_NS(scalar_invert) ( | |||
} | |||
return ~API_NS(scalar_eq)(out,API_NS(scalar_zero)); | |||
#else | |||
(void)out; | |||
(void)a; | |||
return 0; | |||
decaf_255_scalar_t b, ma; | |||
int i; | |||
sc_montmul(b,API_NS(scalar_one),sc_r2); | |||
sc_montmul(ma,a,sc_r2); | |||
for (i=SCALAR_BITS-1; i>=0; i--) { | |||
sc_montsqr(b,b); | |||
decaf_word_t w = sc_p->limb[i/WBITS]; | |||
if (i<WBITS) { | |||
assert(w >= 2); | |||
w-=2; | |||
} | |||
if (1 & w>>(i%WBITS)) { | |||
sc_montmul(b,b,ma); | |||
} | |||
} | |||
sc_montmul(out,b,decaf_255_scalar_one); | |||
API_NS(scalar_destroy)(b); | |||
API_NS(scalar_destroy)(ma); | |||
return ~API_NS(scalar_eq)(out,decaf_255_scalar_zero); | |||
#endif | |||
} | |||
@@ -0,0 +1,50 @@ | |||
/** | |||
* @file decaf_config.h | |||
* @author Mike Hamburg | |||
* | |||
* @copyright | |||
* Copyright (c) 2015 Cryptography Research, Inc. \n | |||
* Released under the MIT License. See LICENSE.txt for license information. | |||
* | |||
* @brief Configuration for decaf_fast.c | |||
*/ | |||
#ifndef __DECAF_255_CONFIG_H__ | |||
#define __DECAF_255_CONFIG_H__ 1 | |||
/** | |||
* Use the Montgomery ladder for direct scalarmul. | |||
* | |||
* The Montgomery ladder is faster than Edwards scalarmul, but providing | |||
* the features Decaf supports (cofactor elimination, twist rejection) | |||
* makes it complicated and adds code. Removing the ladder saves a few | |||
* kilobytes at the cost of perhaps 5-10% overhead in direct scalarmul | |||
* time. | |||
*/ | |||
#define DECAF_USE_MONTGOMERY_LADDER 1 | |||
/** The number of comb tables for fixed base scalarmul. */ | |||
#define DECAF_COMBS_N 3 | |||
/** The number of teeth per comb for fixed base scalarmul. */ | |||
#define DECAF_COMBS_T 5 | |||
/**
* The comb spacing for fixed base scalarmul.
*
* NOTE(review): with the values in this file, N * T * S = 3 * 5 * 17 = 255.
* Presumably this product has to cover the scalar bit length -- confirm
* against the comb code in decaf_fast.c before changing any of the three.
*/ | |||
#define DECAF_COMBS_S 17 | |||
/** Performance tuning: the width of the fixed window for scalar mul. */ | |||
#define DECAF_WINDOW_BITS 4 | |||
/** | |||
* The number of bits used for the precomputed table in variable-time | |||
* double scalarmul. | |||
*/ | |||
#define DECAF_WNAF_FIXED_TABLE_BITS 5 | |||
/** | |||
* Performance tuning: bits used for the variable table in variable-time | |||
* double scalarmul. | |||
*/ | |||
#define DECAF_WNAF_VAR_TABLE_BITS 3 | |||
#endif /* __DECAF_255_CONFIG_H__ */ |
@@ -22,164 +22,33 @@ p255_mul ( | |||
const p255_t *as, | |||
const p255_t *bs | |||
) { | |||
const uint64_t *a = as->limb, *b = bs->limb; | |||
const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); | |||
uint64_t bh[4]; | |||
int i,j; | |||
for (i=0; i<4; i++) bh[i] = b[i+1] * 19; | |||
uint64_t *c = cs->limb; | |||
__uint128_t accum0 = 0, accum1 = 0, accum2; | |||
uint64_t mask = (1ull<<51) - 1; | |||
uint64_t aa[4], bb[4], bbb[4]; | |||
unsigned int i; | |||
for (i=0; i<4; i++) { | |||
aa[i] = a[i] + a[i+4]; | |||
bb[i] = b[i] + b[i+4]; | |||
bbb[i] = bb[i] + b[i+4]; | |||
} | |||
int I_HATE_UNROLLED_LOOPS = 0; | |||
if (I_HATE_UNROLLED_LOOPS) { | |||
/* The compiler probably won't unroll this, | |||
* so it's like 80% slower. | |||
*/ | |||
for (i=0; i<4; i++) { | |||
accum2 = 0; | |||
unsigned int j; | |||
for (j=0; j<=i; j++) { | |||
accum2 += widemul(a[j], b[i-j]); | |||
accum1 += widemul(aa[j], bb[i-j]); | |||
accum0 += widemul(a[j+4], b[i-j+4]); | |||
} | |||
for (; j<4; j++) { | |||
accum2 += widemul(a[j], b[i-j+8]); | |||
accum1 += widemul(aa[j], bbb[i-j+4]); | |||
accum0 += widemul(a[j+4], bb[i-j+4]); | |||
} | |||
accum1 -= accum2; | |||
accum0 += accum2; | |||
c[i] = ((uint64_t)(accum0)) & mask; | |||
c[i+4] = ((uint64_t)(accum1)) & mask; | |||
accum0 >>= 56; | |||
accum1 >>= 56; | |||
__uint128_t accum = 0; | |||
for (i=0; i<5; i++) { | |||
for (j=0; j<=i; j++) { | |||
accum += widemul(b[i-j], a[j]); | |||
} | |||
} else { | |||
accum2 = widemul(a[0], b[0]); | |||
accum1 += widemul(aa[0], bb[0]); | |||
accum0 += widemul(a[4], b[4]); | |||
accum2 += widemul(a[1], b[7]); | |||
accum1 += widemul(aa[1], bbb[3]); | |||
accum0 += widemul(a[5], bb[3]); | |||
accum2 += widemul(a[2], b[6]); | |||
accum1 += widemul(aa[2], bbb[2]); | |||
accum0 += widemul(a[6], bb[2]); | |||
accum2 += widemul(a[3], b[5]); | |||
accum1 += widemul(aa[3], bbb[1]); | |||
accum0 += widemul(a[7], bb[1]); | |||
accum1 -= accum2; | |||
accum0 += accum2; | |||
c[0] = ((uint64_t)(accum0)) & mask; | |||
c[4] = ((uint64_t)(accum1)) & mask; | |||
accum0 >>= 56; | |||
accum1 >>= 56; | |||
accum2 = widemul(a[0], b[1]); | |||
accum1 += widemul(aa[0], bb[1]); | |||
accum0 += widemul(a[4], b[5]); | |||
accum2 += widemul(a[1], b[0]); | |||
accum1 += widemul(aa[1], bb[0]); | |||
accum0 += widemul(a[5], b[4]); | |||
accum2 += widemul(a[2], b[7]); | |||
accum1 += widemul(aa[2], bbb[3]); | |||
accum0 += widemul(a[6], bb[3]); | |||
accum2 += widemul(a[3], b[6]); | |||
accum1 += widemul(aa[3], bbb[2]); | |||
accum0 += widemul(a[7], bb[2]); | |||
accum1 -= accum2; | |||
accum0 += accum2; | |||
c[1] = ((uint64_t)(accum0)) & mask; | |||
c[5] = ((uint64_t)(accum1)) & mask; | |||
accum0 >>= 56; | |||
accum1 >>= 56; | |||
accum2 = widemul(a[0], b[2]); | |||
accum1 += widemul(aa[0], bb[2]); | |||
accum0 += widemul(a[4], b[6]); | |||
accum2 += widemul(a[1], b[1]); | |||
accum1 += widemul(aa[1], bb[1]); | |||
accum0 += widemul(a[5], b[5]); | |||
accum2 += widemul(a[2], b[0]); | |||
accum1 += widemul(aa[2], bb[0]); | |||
accum0 += widemul(a[6], b[4]); | |||
accum2 += widemul(a[3], b[7]); | |||
accum1 += widemul(aa[3], bbb[3]); | |||
accum0 += widemul(a[7], bb[3]); | |||
accum1 -= accum2; | |||
accum0 += accum2; | |||
c[2] = ((uint64_t)(accum0)) & mask; | |||
c[6] = ((uint64_t)(accum1)) & mask; | |||
accum0 >>= 56; | |||
accum1 >>= 56; | |||
accum2 = widemul(a[0], b[3]); | |||
accum1 += widemul(aa[0], bb[3]); | |||
accum0 += widemul(a[4], b[7]); | |||
accum2 += widemul(a[1], b[2]); | |||
accum1 += widemul(aa[1], bb[2]); | |||
accum0 += widemul(a[5], b[6]); | |||
accum2 += widemul(a[2], b[1]); | |||
accum1 += widemul(aa[2], bb[1]); | |||
accum0 += widemul(a[6], b[5]); | |||
accum2 += widemul(a[3], b[0]); | |||
accum1 += widemul(aa[3], bb[0]); | |||
accum0 += widemul(a[7], b[4]); | |||
accum1 -= accum2; | |||
accum0 += accum2; | |||
c[3] = ((uint64_t)(accum0)) & mask; | |||
c[7] = ((uint64_t)(accum1)) & mask; | |||
accum0 >>= 56; | |||
accum1 >>= 56; | |||
} /* !I_HATE_UNROLLED_LOOPS */ | |||
accum0 += accum1; | |||
accum0 += c[4]; | |||
accum1 += c[0]; | |||
c[4] = ((uint64_t)(accum0)) & mask; | |||
c[0] = ((uint64_t)(accum1)) & mask; | |||
accum0 >>= 56; | |||
accum1 >>= 56; | |||
c[5] += ((uint64_t)(accum0)); | |||
c[1] += ((uint64_t)(accum1)); | |||
for (; j<5; j++) { | |||
accum += widemul(bh[i-j+4], a[j]); | |||
} | |||
c[i] = accum & mask; | |||
accum >>= 51; | |||
} | |||
/* PERF: parallelize? eh well this is reference */ | |||
accum *= 19; | |||
accum += c[0]; | |||
c[0] = accum & mask; | |||
accum >>= 51; | |||
assert(accum < mask); | |||
c[1] += accum; | |||
} | |||
void | |||
@@ -188,27 +57,25 @@ p255_mulw ( | |||
const p255_t *as, | |||
uint64_t b | |||
) { | |||
const uint64_t *a = as->limb; | |||
const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | |||
int i; | |||
uint64_t *c = cs->limb; | |||
__uint128_t accum0 = 0, accum4 = 0; | |||
uint64_t mask = (1ull<<56) - 1; | |||
int i; | |||
for (i=0; i<4; i++) { | |||
accum0 += widemul(b, a[i]); | |||
accum4 += widemul(b, a[i+4]); | |||
c[i] = accum0 & mask; accum0 >>= 56; | |||
c[i+4] = accum4 & mask; accum4 >>= 56; | |||
__uint128_t accum = 0; | |||
for (i=0; i<5; i++) { | |||
accum += widemul(b, a[i]); | |||
c[i] = accum & mask; | |||
accum >>= 51; | |||
} | |||
/* PERF: parallelize? eh well this is reference */ | |||
accum *= 19; | |||
accum += c[0]; | |||
c[0] = accum & mask; | |||
accum >>= 51; | |||
accum0 += accum4 + c[4]; | |||
c[4] = accum0 & mask; | |||
c[5] += accum0 >> 56; | |||
accum4 += c[0]; | |||
c[0] = accum4 & mask; | |||
c[1] += accum4 >> 56; | |||
assert(accum < mask); | |||
c[1] += accum; | |||
} | |||
void | |||
@@ -223,23 +90,21 @@ void | |||
p255_strong_reduce ( | |||
p255_t *a | |||
) { | |||
uint64_t mask = (1ull<<56)-1; | |||
uint64_t mask = (1ull<<51)-1; | |||
/* first, clear high */ | |||
a->limb[4] += a->limb[7]>>56; | |||
a->limb[0] += a->limb[7]>>56; | |||
a->limb[7] &= mask; | |||
a->limb[0] += (a->limb[4]>>51)*19; | |||
a->limb[4] &= mask; | |||
/* now the total is less than 2^255 - 2^(255-56) + 2^(255-56+8) < 2p */ | |||
/* now the total is less than 2p */ | |||
/* compute total_value - p. No need to reduce mod p. */ | |||
__int128_t scarry = 0; | |||
int i; | |||
for (i=0; i<8; i++) { | |||
scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask); | |||
for (i=0; i<5; i++) { | |||
scarry = scarry + a->limb[i] - ((i==0)?mask-18:mask); | |||
a->limb[i] = scarry & mask; | |||
scarry >>= 56; | |||
scarry >>= 51; | |||
} | |||
/* uncommon case: it was >= p, so now scarry = 0 and this = x | |||
@@ -253,10 +118,10 @@ p255_strong_reduce ( | |||
__uint128_t carry = 0; | |||
/* add it back */ | |||
for (i=0; i<8; i++) { | |||
carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask); | |||
for (i=0; i<5; i++) { | |||
carry = carry + a->limb[i] + ((i==0)?(scarry_mask&~18):scarry_mask); | |||
a->limb[i] = carry & mask; | |||
carry >>= 56; | |||
carry >>= 51; | |||
} | |||
assert(is_zero(carry + scarry)); | |||
@@ -271,12 +136,13 @@ p255_serialize ( | |||
p255_t red; | |||
p255_copy(&red, x); | |||
p255_strong_reduce(&red); | |||
for (i=0; i<8; i++) { | |||
for (j=0; j<7; j++) { | |||
serial[7*i+j] = red.limb[i]; | |||
red.limb[i] >>= 8; | |||
uint64_t *r = red.limb; | |||
uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12}; | |||
for (i=0; i<4; i++) { | |||
for (j=0; j<8; j++) { | |||
serial[8*i+j] = ser64[i]; | |||
ser64[i] >>= 8; | |||
} | |||
assert(red.limb[i] == 0); | |||
} | |||
} | |||
@@ -286,33 +152,27 @@ p255_deserialize ( | |||
const uint8_t serial[32] | |||
) { | |||
int i,j; | |||
for (i=0; i<8; i++) { | |||
uint64_t ser64[4], mask = ((1ull<<51)-1); | |||
for (i=0; i<4; i++) { | |||
uint64_t out = 0; | |||
for (j=0; j<7; j++) { | |||
out |= ((uint64_t)serial[7*i+j])<<(8*j); | |||
for (j=0; j<8; j++) { | |||
out |= ((uint64_t)serial[8*i+j])<<(8*j); | |||
} | |||
x->limb[i] = out; | |||
ser64[i] = out; | |||
} | |||
/* Check for reduction. | |||
* | |||
* The idea is to create a variable ge which is all ones (rather, 56 ones) | |||
* if and only if the low $i$ words of $x$ are >= those of p. | |||
* | |||
* Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) | |||
*/ | |||
uint64_t ge = -1, mask = (1ull<<56)-1; | |||
for (i=0; i<4; i++) { | |||
ge &= x->limb[i]; | |||
} | |||
/* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ | |||
ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask); | |||
/* Test for >= 2^255-19 */ | |||
uint64_t ge = -(((__uint128_t)ser64[0]+19)>>64); | |||
ge &= ser64[1]; | |||
ge &= ser64[2]; | |||
ge &= (ser64[3]<<1) + 1; | |||
ge |= -(((__uint128_t)ser64[3]+0x8000000000000000)>>64); | |||
/* Propagate the rest */ | |||
for (i=5; i<8; i++) { | |||
ge &= x->limb[i]; | |||
} | |||
x->limb[0] = ser64[0] & mask; | |||
x->limb[1] = (ser64[0]>>51 | ser64[1]<<13) & mask; | |||
x->limb[2] = (ser64[1]>>38 | ser64[2]<<26) & mask; | |||
x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask; | |||
x->limb[4] = ser64[3]>>12; | |||
return ~is_zero(ge ^ mask); | |||
return ~is_zero(~ge); | |||
} |
@@ -15,7 +15,17 @@ typedef struct p255_t { | |||
} p255_t; | |||
#define LBITS 51 | |||
#define FIELD_LITERAL(a,b,c,d,e) {{a,b,c,d,e}} | |||
#define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }} | |||
/* | |||
#define FIELD_LITERAL(a,b,c,d) {{ \ | |||
(a##ull) & LMASK, \ | |||
((a##ull)>>51 | (b##ull)<<13) & LMASK, \ | |||
((b##ull)>>38 | (c##ull)<<26) & LMASK, \ | |||
((c##ull)>>25 | (d##ull)<<39) & LMASK, \ | |||
(d##ull)>>12 \ | |||
}} | |||
*/ | |||
#ifdef __cplusplus | |||
extern "C" { | |||
@@ -140,9 +150,9 @@ p255_weak_reduce ( | |||
p255_t *a | |||
) { | |||
uint64_t mask = (1ull<<51) - 1; | |||
uint64_t tmp = a->limb[5] >> 51; | |||
uint64_t tmp = a->limb[4] >> 51; | |||
int i; | |||
for (i=7; i>0; i--) { | |||
for (i=4; i>0; i--) { | |||
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>51); | |||
} | |||
a->limb[0] = (a->limb[0] & mask) + tmp*19; | |||
@@ -10,58 +10,51 @@ | |||
#include "field.h" | |||
extern field_a_t ONE; // TODO | |||
static const field_a_t SQRT_MINUS_ONE = FIELD_LITERAL( // FIXME goes elsewhere? | |||
static const field_a_t SQRT_MINUS_ONE = {FIELD_LITERAL( // FIXME goes elsewhere? | |||
0x61b274a0ea0b0, | |||
0x0d5a5fc8f189d, | |||
0x7ef5e9cbd0c60, | |||
0x78595a6804c9e, | |||
0x2b8324804fc1d | |||
); | |||
)}; | |||
static const field_a_t ONE = {FIELD_LITERAL( // FIXME copy-pasted | |||
1,0,0,0,0 | |||
)}; | |||
void | |||
field_isr ( | |||
field_a_t a, | |||
const field_a_t x | |||
) { | |||
field_a_t st[3], tmp1, tmp2; | |||
const struct { unsigned char sh, idx } ops[] = { | |||
{1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2} | |||
}; | |||
field_cpy(st[0],x); | |||
field_cpy(st[1],x); | |||
field_cpy(st[2],x); | |||
// ARCH MAGIC FIXME copy-pasted from decaf_fast.c | |||
static mask_t gf_eq(const field_a_t a, const field_a_t b) { | |||
field_a_t c; | |||
field_sub(c,a,b); | |||
field_strong_reduce(c); | |||
mask_t ret=0; | |||
int i; | |||
for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) { | |||
field_sqrn(tmp1, st[1^i&1], ops[i].sh); | |||
field_mul(tmp2, tmp1, st[ops[i].idx]); | |||
field_cpy(st[i&1], tmp2); | |||
} | |||
mask_t m = field_eq(st[1], ONE); | |||
cond_sel(tmp1,SQRT_MINUS_ONE,ONE,m); | |||
field_mul(a,tmp1,st[0]); | |||
}; | |||
for (i=0; i<5; i++) { ret |= c->limb[i]; } | |||
return ((__uint128_t)ret - 1) >> 64; | |||
} | |||
/* Guarantee: a^2 x = 0 if x = 0; else a^2 x = 1 or SQRT_MINUS_ONE; */ | |||
void | |||
field_isr ( | |||
field_a_t a, | |||
const field_a_t x | |||
) { | |||
field_a_t st[3], tmp1, tmp2; | |||
const struct { unsigned char sh, idx } ops[] = { | |||
const struct { unsigned char sh, idx; } ops[] = { | |||
{1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2} | |||
}; | |||
field_cpy(st[0],x); | |||
field_cpy(st[1],x); | |||
field_cpy(st[2],x); | |||
int i; | |||
st[0][0] = st[1][0] = st[2][0] = x[0]; | |||
unsigned int i; | |||
for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) { | |||
field_sqrn(tmp1, st[1^i&1], ops[i].sh); | |||
field_mul(tmp2, tmp1, st[ops[i].idx]); | |||
field_cpy(st[i&1], tmp2); | |||
st[i&1][0] = tmp2[0]; | |||
} | |||
mask_t m = field_eq(st[1], ONE); | |||
mask_t mask = gf_eq(st[1],ONE) | gf_eq(st[1],SQRT_MINUS_ONE); | |||
// ARCH MAGIC FIXME: should be cond_sel | |||
for (i=0; i<5; i++) tmp1->limb[i] = (ONE->limb[i] & mask) | |||
| (SQRT_MINUS_ONE->limb[i] & ~mask); | |||
field_mul(a,tmp1,st[0]); | |||
} |