| @@ -19,13 +19,13 @@ ASM ?= $(CC) | |||
| DECAF ?= decaf_fast | |||
| ifneq (,$(findstring x86_64,$(MACHINE))) | |||
| ARCH ?= arch_x86_64 | |||
| ARCH ?= arch_ref64 | |||
| else | |||
| # no i386 port yet | |||
| ARCH ?= arch_arm_32 | |||
| ARCH ?= arch_ref32 | |||
| endif | |||
| FIELD ?= p255 | |||
| FIELD ?= p25519 | |||
| WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ | |||
| -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN) | |||
| @@ -35,7 +35,7 @@ INCFLAGS = -Isrc/include -Iinclude -Isrc/$(FIELD) -Isrc/$(FIELD)/$(ARCH) | |||
| LANGFLAGS = -std=c99 -fno-strict-aliasing | |||
| LANGXXFLAGS = -fno-strict-aliasing | |||
| GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC | |||
| OFLAGS ?= -O3 | |||
| OFLAGS ?= -O2 | |||
| TODAY = $(shell date "+%Y-%m-%d") | |||
| @@ -61,7 +61,7 @@ typedef uint64_t decaf_dword_t; | |||
| /** Galois field element internal structure */ | |||
| typedef struct gf_s { | |||
| decaf_word_t limb[DECAF_255_LIMBS]; | |||
| } __attribute__((aligned(32))) gf_s, gf[1]; | |||
| } gf_s, gf[1]; | |||
| /** @endcond */ | |||
| /** Number of bytes in a serialized point. */ | |||
| @@ -18,7 +18,7 @@ | |||
| #include "shake.h" | |||
| /** Number of bytes for a symmetric key (expanded to full key) */ | |||
| #define DECAF_448_SYMMETRIC_KEY_BYTES 32 | |||
| #define DECAF_255_SYMMETRIC_KEY_BYTES 32 | |||
| /** @cond internal */ | |||
| #define API_VIS __attribute__((visibility("default"))) __attribute__((noinline)) // TODO: synergize with decaf.h | |||
| @@ -31,29 +31,29 @@ | |||
| /** @endcond */ | |||
| /** A symmetric key, the compressed point of a private key. */ | |||
| typedef unsigned char decaf_448_symmetric_key_t[DECAF_448_SYMMETRIC_KEY_BYTES]; | |||
| typedef unsigned char decaf_255_symmetric_key_t[DECAF_255_SYMMETRIC_KEY_BYTES]; | |||
| /** An encoded public key. */ | |||
| typedef unsigned char decaf_448_public_key_t[DECAF_448_SER_BYTES]; | |||
| typedef unsigned char decaf_255_public_key_t[DECAF_255_SER_BYTES]; | |||
| /** A signature. */ | |||
| typedef unsigned char decaf_448_signature_t[DECAF_448_SER_BYTES + DECAF_448_SCALAR_BYTES]; | |||
| typedef unsigned char decaf_255_signature_t[DECAF_255_SER_BYTES + DECAF_255_SCALAR_BYTES]; | |||
| typedef struct { | |||
| /** @cond internal */ | |||
| /** The symmetric key from which everything is expanded */ | |||
| decaf_448_symmetric_key_t sym; | |||
| decaf_255_symmetric_key_t sym; | |||
| /** The scalar x */ | |||
| decaf_448_scalar_t secret_scalar; | |||
| decaf_255_scalar_t secret_scalar; | |||
| /** x*Base */ | |||
| decaf_448_public_key_t pub; | |||
| decaf_255_public_key_t pub; | |||
| /** @endcond */ | |||
| } /** Private key structure for pointers. */ | |||
| decaf_448_private_key_s, | |||
| decaf_255_private_key_s, | |||
| /** A private key (gmp array[1] style). */ | |||
| decaf_448_private_key_t[1]; | |||
| decaf_255_private_key_t[1]; | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| @@ -64,16 +64,16 @@ extern "C" { | |||
| * @param [out] priv The derived private key. | |||
| * @param [in] proto The compressed or proto-key, which must be 32 random bytes. | |||
| */ | |||
| void decaf_448_derive_private_key ( | |||
| decaf_448_private_key_t priv, | |||
| const decaf_448_symmetric_key_t proto | |||
| void decaf_255_derive_private_key ( | |||
| decaf_255_private_key_t priv, | |||
| const decaf_255_symmetric_key_t proto | |||
| ) NONNULL2 API_VIS; | |||
| /** | |||
| * @brief Destroy a private key. | |||
| */ | |||
| void decaf_448_destroy_private_key ( | |||
| decaf_448_private_key_t priv | |||
| void decaf_255_destroy_private_key ( | |||
| decaf_255_private_key_t priv | |||
| ) NONNULL1 API_VIS; | |||
| /** | |||
| @@ -81,9 +81,9 @@ void decaf_448_destroy_private_key ( | |||
| * @param [out] pub The extracted public key. | |||
| * @param [in] priv The private key. | |||
| */ | |||
| void decaf_448_private_to_public ( | |||
| decaf_448_public_key_t pub, | |||
| const decaf_448_private_key_t priv | |||
| void decaf_255_private_to_public ( | |||
| decaf_255_public_key_t pub, | |||
| const decaf_255_private_key_t priv | |||
| ) NONNULL2 API_VIS; | |||
| /** | |||
| @@ -104,11 +104,11 @@ void decaf_448_private_to_public ( | |||
| * and will almost definitely change in the future. | |||
| */ | |||
| decaf_bool_t | |||
| decaf_448_shared_secret ( | |||
| decaf_255_shared_secret ( | |||
| uint8_t *shared, | |||
| size_t shared_bytes, | |||
| const decaf_448_private_key_t my_privkey, | |||
| const decaf_448_public_key_t your_pubkey | |||
| const decaf_255_private_key_t my_privkey, | |||
| const decaf_255_public_key_t your_pubkey | |||
| ) NONNULL134 WARN_UNUSED API_VIS; | |||
| /** | |||
| @@ -119,9 +119,9 @@ decaf_448_shared_secret ( | |||
| * @param [in] shake A SHAKE256 context with the message. | |||
| */ | |||
| void | |||
| decaf_448_sign_shake ( | |||
| decaf_448_signature_t sig, | |||
| const decaf_448_private_key_t priv, | |||
| decaf_255_sign_shake ( | |||
| decaf_255_signature_t sig, | |||
| const decaf_255_private_key_t priv, | |||
| const keccak_sponge_t shake | |||
| ) NONNULL3 API_VIS; | |||
| @@ -134,9 +134,9 @@ decaf_448_sign_shake ( | |||
| * @param [in] message_len The message's length. | |||
| */ | |||
| void | |||
| decaf_448_sign ( | |||
| decaf_448_signature_t sig, | |||
| const decaf_448_private_key_t priv, | |||
| decaf_255_sign ( | |||
| decaf_255_signature_t sig, | |||
| const decaf_255_private_key_t priv, | |||
| const unsigned char *message, | |||
| size_t message_len | |||
| ) NONNULL3 API_VIS; | |||
| @@ -149,9 +149,9 @@ decaf_448_sign ( | |||
| * @param [in] shake A SHAKE256 context with the message. | |||
| */ | |||
| decaf_bool_t | |||
| decaf_448_verify_shake ( | |||
| const decaf_448_signature_t sig, | |||
| const decaf_448_public_key_t pub, | |||
| decaf_255_verify_shake ( | |||
| const decaf_255_signature_t sig, | |||
| const decaf_255_public_key_t pub, | |||
| const keccak_sponge_t shake | |||
| ) NONNULL3 API_VIS WARN_UNUSED; | |||
| @@ -164,9 +164,9 @@ decaf_448_verify_shake ( | |||
| * @param [in] message_len The message's length. | |||
| */ | |||
| decaf_bool_t | |||
| decaf_448_verify ( | |||
| const decaf_448_signature_t sig, | |||
| const decaf_448_public_key_t pub, | |||
| decaf_255_verify ( | |||
| const decaf_255_signature_t sig, | |||
| const decaf_255_public_key_t pub, | |||
| const unsigned char *message, | |||
| size_t message_len | |||
| ) NONNULL3 API_VIS WARN_UNUSED; | |||
| @@ -192,18 +192,18 @@ private: | |||
| }; | |||
| /**@cond internal*/ | |||
| inline Ed448::Scalar::Scalar(SpongeRng &rng) NOEXCEPT { | |||
| inline Ed255::Scalar::Scalar(SpongeRng &rng) NOEXCEPT { | |||
| *this = rng.read(SER_BYTES); | |||
| } | |||
| inline Ed448::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT { | |||
| inline Ed255::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT { | |||
| SecureBuffer buffer((uniform ? 2 : 1) * HASH_BYTES); | |||
| rng.read(buffer); | |||
| set_to_hash(buffer); | |||
| } | |||
| inline SecureBuffer Ed448::Point::steg_encode(SpongeRng &rng) const NOEXCEPT { | |||
| inline SecureBuffer Ed255::Point::steg_encode(SpongeRng &rng) const NOEXCEPT { | |||
| SecureBuffer out(STEG_BYTES); | |||
| bool done; | |||
| do { | |||
| @@ -45,14 +45,22 @@ typedef int64_t decaf_sdword_t; | |||
| #define siv static inline void __attribute__((always_inline)) | |||
| static const gf ZERO = {{{0}}}, ONE = {{{1}}}, TWO = {{{2}}}; | |||
| static const int EDWARDS_D = 121665; | |||
| static const int EDWARDS_D = -89747; | |||
| // Gonna test with PinkBikeShed until the math works... | |||
| // Curve25519: 121665; | |||
| static const scalar_t sc_p = {{{ | |||
| // Gonna test with PinkBikeShed until the math works... | |||
| SC_LIMB(0xb6b98fd8849faf35), | |||
| SC_LIMB(0x16241e6093b2ce59), | |||
| SC_LIMB(0), | |||
| SC_LIMB(0x2000000000000000) | |||
| /* Curve25519: | |||
| SC_LIMB(0x5812631a5cf5d3ed), | |||
| SC_LIMB(0x14def9dea2f79cd6), | |||
| SC_LIMB(0), | |||
| SC_LIMB(0), | |||
| SC_LIMB(0x1000000000000000) | |||
| */ | |||
| }}}; | |||
| const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}}; | |||
| @@ -61,7 +69,7 @@ extern const decaf_word_t MONTGOMERY_FACTOR; | |||
| /* sqrt(9) = 3 from the curve spec. Not exported, but used by pregen tool. */ | |||
| const unsigned char base_point_ser_for_pregen[SER_BYTES] = { | |||
| 3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 | |||
| 5 /*PinkBikeShed. Curve25519: 3*/, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 | |||
| }; | |||
| extern const point_t API_NS(point_base); | |||
| @@ -82,16 +90,16 @@ const size_t API_NS2(alignof,precomputed_s) = 32; | |||
| #ifdef __clang__ | |||
| #if 100*__clang_major__ + __clang_minor__ > 305 | |||
| #define VECTORIZE _Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(8)") | |||
| #define UNROLL _Pragma("clang loop unroll(full)") // FIXME: vectorize? | |||
| #endif | |||
| #endif | |||
| #ifndef VECTORIZE | |||
| #define VECTORIZE | |||
| #ifndef UNROLL | |||
| #define UNROLL | |||
| #endif | |||
| #define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++) { op; }} | |||
| #define FOR_LIMB_V(i,op) { unsigned int i=0; VECTORIZE for (i=0; i<NLIMBS; i++) { op; }} | |||
| #define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<NLIMBS; i++) { op; }} | |||
| /** Copy x = y */ | |||
| siv gf_cpy(gf x, const gf y) { x[0] = y[0]; } | |||
| @@ -138,7 +146,7 @@ siv gf_bias ( gf c, int amt) { | |||
| /** Subtract mod p. Bias by 2 and don't reduce */ | |||
| siv gf_sub_nr ( gf_s *__restrict__ c, const gf a, const gf b ) { | |||
| // FOR_LIMB_V(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] ); | |||
| // FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] ); | |||
| ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO | |||
| field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b); | |||
| gf_bias(c, 2); | |||
| @@ -155,7 +163,7 @@ siv gf_sub_nr_x ( gf c, const gf a, const gf b, int amt ) { | |||
| /** Add mod p. Don't reduce. */ | |||
| siv gf_add_nr ( gf c, const gf a, const gf b ) { | |||
| // FOR_LIMB_V(i, c->limb[i] = a->limb[i] + b->limb[i]); | |||
| // FOR_LIMB_U(i, c->limb[i] = a->limb[i] + b->limb[i]); | |||
| ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO | |||
| field_add_nr((field_t *)c, (const field_t *)a, (const field_t *)b); | |||
| } | |||
| @@ -183,7 +191,7 @@ sv cond_neg(gf x, decaf_bool_t neg) { | |||
| /** Constant time, if (swap) (x,y) = (y,x); */ | |||
| siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) { | |||
| FOR_LIMB_V(i, { | |||
| FOR_LIMB_U(i, { | |||
| decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap; | |||
| x->limb[i] ^= s; | |||
| y->limb[i] ^= s; | |||
| @@ -371,9 +379,27 @@ decaf_bool_t API_NS(scalar_invert) ( | |||
| } | |||
| return ~API_NS(scalar_eq)(out,API_NS(scalar_zero)); | |||
| #else | |||
| (void)out; | |||
| (void)a; | |||
| return 0; | |||
| decaf_255_scalar_t b, ma; | |||
| int i; | |||
| sc_montmul(b,API_NS(scalar_one),sc_r2); | |||
| sc_montmul(ma,a,sc_r2); | |||
| for (i=SCALAR_BITS-1; i>=0; i--) { | |||
| sc_montsqr(b,b); | |||
| decaf_word_t w = sc_p->limb[i/WBITS]; | |||
| if (i<WBITS) { | |||
| assert(w >= 2); | |||
| w-=2; | |||
| } | |||
| if (1 & w>>(i%WBITS)) { | |||
| sc_montmul(b,b,ma); | |||
| } | |||
| } | |||
| sc_montmul(out,b,decaf_255_scalar_one); | |||
| API_NS(scalar_destroy)(b); | |||
| API_NS(scalar_destroy)(ma); | |||
| return ~API_NS(scalar_eq)(out,decaf_255_scalar_zero); | |||
| #endif | |||
| } | |||
| @@ -0,0 +1,50 @@ | |||
| /** | |||
| * @file decaf_config.h | |||
| * @author Mike Hamburg | |||
| * | |||
| * @copyright | |||
| * Copyright (c) 2015 Cryptography Research, Inc. \n | |||
| * Released under the MIT License. See LICENSE.txt for license information. | |||
| * | |||
| * @brief Configuration for decaf_fast.c | |||
| */ | |||
| #ifndef __DECAF_255_CONFIG_H__ | |||
| #define __DECAF_255_CONFIG_H__ 1 | |||
| /** | |||
| * Use the Montgomery ladder for direct scalarmul. | |||
| * | |||
| * The Montgomery ladder is faster than Edwards scalarmul, but providing | |||
| * the features Decaf supports (cofactor elimination, twist rejection) | |||
| * makes it complicated and adds code. Removing the ladder saves a few | |||
| * kilobytes at the cost of perhaps 5-10% overhead in direct scalarmul | |||
| * time. | |||
| * | |||
| * NOTE(review): this header only defines the macro; whether consumers test | |||
| * it with #if or #ifdef is not visible here -- confirm before setting it to 0. | |||
| */ | |||
| #define DECAF_USE_MONTGOMERY_LADDER 1 | |||
| /** | |||
| * The number of comb tables for fixed base scalarmul. | |||
| * | |||
| * With the values in this file, DECAF_COMBS_N * DECAF_COMBS_T * DECAF_COMBS_S | |||
| * = 3 * 5 * 17 = 255. Presumably the product must cover the scalar bit | |||
| * length, so the three comb parameters should be retuned together -- TODO | |||
| * confirm against the fixed-base scalarmul code. | |||
| */ | |||
| #define DECAF_COMBS_N 3 | |||
| /** The number of teeth per comb for fixed base scalarmul. */ | |||
| #define DECAF_COMBS_T 5 | |||
| /** The comb spacing for fixed base scalarmul. */ | |||
| #define DECAF_COMBS_S 17 | |||
| /** Performance tuning: the width, in bits, of the fixed window for scalar mul. */ | |||
| #define DECAF_WINDOW_BITS 4 | |||
| /** | |||
| * The number of bits used for the precomputed table in variable-time | |||
| * double scalarmul. | |||
| */ | |||
| #define DECAF_WNAF_FIXED_TABLE_BITS 5 | |||
| /** | |||
| * Performance tuning: bits used for the variable table in variable-time | |||
| * double scalarmul. | |||
| */ | |||
| #define DECAF_WNAF_VAR_TABLE_BITS 3 | |||
| #endif /* __DECAF_255_CONFIG_H__ */ | |||
| @@ -22,164 +22,33 @@ p255_mul ( | |||
| const p255_t *as, | |||
| const p255_t *bs | |||
| ) { | |||
| const uint64_t *a = as->limb, *b = bs->limb; | |||
| const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); | |||
| uint64_t bh[4]; | |||
| int i,j; | |||
| for (i=0; i<4; i++) bh[i] = b[i+1] * 19; | |||
| uint64_t *c = cs->limb; | |||
| __uint128_t accum0 = 0, accum1 = 0, accum2; | |||
| uint64_t mask = (1ull<<51) - 1; | |||
| uint64_t aa[4], bb[4], bbb[4]; | |||
| unsigned int i; | |||
| for (i=0; i<4; i++) { | |||
| aa[i] = a[i] + a[i+4]; | |||
| bb[i] = b[i] + b[i+4]; | |||
| bbb[i] = bb[i] + b[i+4]; | |||
| } | |||
| int I_HATE_UNROLLED_LOOPS = 0; | |||
| if (I_HATE_UNROLLED_LOOPS) { | |||
| /* The compiler probably won't unroll this, | |||
| * so it's like 80% slower. | |||
| */ | |||
| for (i=0; i<4; i++) { | |||
| accum2 = 0; | |||
| unsigned int j; | |||
| for (j=0; j<=i; j++) { | |||
| accum2 += widemul(a[j], b[i-j]); | |||
| accum1 += widemul(aa[j], bb[i-j]); | |||
| accum0 += widemul(a[j+4], b[i-j+4]); | |||
| } | |||
| for (; j<4; j++) { | |||
| accum2 += widemul(a[j], b[i-j+8]); | |||
| accum1 += widemul(aa[j], bbb[i-j+4]); | |||
| accum0 += widemul(a[j+4], bb[i-j+4]); | |||
| } | |||
| accum1 -= accum2; | |||
| accum0 += accum2; | |||
| c[i] = ((uint64_t)(accum0)) & mask; | |||
| c[i+4] = ((uint64_t)(accum1)) & mask; | |||
| accum0 >>= 56; | |||
| accum1 >>= 56; | |||
| __uint128_t accum = 0; | |||
| for (i=0; i<5; i++) { | |||
| for (j=0; j<=i; j++) { | |||
| accum += widemul(b[i-j], a[j]); | |||
| } | |||
| } else { | |||
| accum2 = widemul(a[0], b[0]); | |||
| accum1 += widemul(aa[0], bb[0]); | |||
| accum0 += widemul(a[4], b[4]); | |||
| accum2 += widemul(a[1], b[7]); | |||
| accum1 += widemul(aa[1], bbb[3]); | |||
| accum0 += widemul(a[5], bb[3]); | |||
| accum2 += widemul(a[2], b[6]); | |||
| accum1 += widemul(aa[2], bbb[2]); | |||
| accum0 += widemul(a[6], bb[2]); | |||
| accum2 += widemul(a[3], b[5]); | |||
| accum1 += widemul(aa[3], bbb[1]); | |||
| accum0 += widemul(a[7], bb[1]); | |||
| accum1 -= accum2; | |||
| accum0 += accum2; | |||
| c[0] = ((uint64_t)(accum0)) & mask; | |||
| c[4] = ((uint64_t)(accum1)) & mask; | |||
| accum0 >>= 56; | |||
| accum1 >>= 56; | |||
| accum2 = widemul(a[0], b[1]); | |||
| accum1 += widemul(aa[0], bb[1]); | |||
| accum0 += widemul(a[4], b[5]); | |||
| accum2 += widemul(a[1], b[0]); | |||
| accum1 += widemul(aa[1], bb[0]); | |||
| accum0 += widemul(a[5], b[4]); | |||
| accum2 += widemul(a[2], b[7]); | |||
| accum1 += widemul(aa[2], bbb[3]); | |||
| accum0 += widemul(a[6], bb[3]); | |||
| accum2 += widemul(a[3], b[6]); | |||
| accum1 += widemul(aa[3], bbb[2]); | |||
| accum0 += widemul(a[7], bb[2]); | |||
| accum1 -= accum2; | |||
| accum0 += accum2; | |||
| c[1] = ((uint64_t)(accum0)) & mask; | |||
| c[5] = ((uint64_t)(accum1)) & mask; | |||
| accum0 >>= 56; | |||
| accum1 >>= 56; | |||
| accum2 = widemul(a[0], b[2]); | |||
| accum1 += widemul(aa[0], bb[2]); | |||
| accum0 += widemul(a[4], b[6]); | |||
| accum2 += widemul(a[1], b[1]); | |||
| accum1 += widemul(aa[1], bb[1]); | |||
| accum0 += widemul(a[5], b[5]); | |||
| accum2 += widemul(a[2], b[0]); | |||
| accum1 += widemul(aa[2], bb[0]); | |||
| accum0 += widemul(a[6], b[4]); | |||
| accum2 += widemul(a[3], b[7]); | |||
| accum1 += widemul(aa[3], bbb[3]); | |||
| accum0 += widemul(a[7], bb[3]); | |||
| accum1 -= accum2; | |||
| accum0 += accum2; | |||
| c[2] = ((uint64_t)(accum0)) & mask; | |||
| c[6] = ((uint64_t)(accum1)) & mask; | |||
| accum0 >>= 56; | |||
| accum1 >>= 56; | |||
| accum2 = widemul(a[0], b[3]); | |||
| accum1 += widemul(aa[0], bb[3]); | |||
| accum0 += widemul(a[4], b[7]); | |||
| accum2 += widemul(a[1], b[2]); | |||
| accum1 += widemul(aa[1], bb[2]); | |||
| accum0 += widemul(a[5], b[6]); | |||
| accum2 += widemul(a[2], b[1]); | |||
| accum1 += widemul(aa[2], bb[1]); | |||
| accum0 += widemul(a[6], b[5]); | |||
| accum2 += widemul(a[3], b[0]); | |||
| accum1 += widemul(aa[3], bb[0]); | |||
| accum0 += widemul(a[7], b[4]); | |||
| accum1 -= accum2; | |||
| accum0 += accum2; | |||
| c[3] = ((uint64_t)(accum0)) & mask; | |||
| c[7] = ((uint64_t)(accum1)) & mask; | |||
| accum0 >>= 56; | |||
| accum1 >>= 56; | |||
| } /* !I_HATE_UNROLLED_LOOPS */ | |||
| accum0 += accum1; | |||
| accum0 += c[4]; | |||
| accum1 += c[0]; | |||
| c[4] = ((uint64_t)(accum0)) & mask; | |||
| c[0] = ((uint64_t)(accum1)) & mask; | |||
| accum0 >>= 56; | |||
| accum1 >>= 56; | |||
| c[5] += ((uint64_t)(accum0)); | |||
| c[1] += ((uint64_t)(accum1)); | |||
| for (; j<5; j++) { | |||
| accum += widemul(bh[i-j+4], a[j]); | |||
| } | |||
| c[i] = accum & mask; | |||
| accum >>= 51; | |||
| } | |||
| /* PERF: parallelize? eh well this is reference */ | |||
| accum *= 19; | |||
| accum += c[0]; | |||
| c[0] = accum & mask; | |||
| accum >>= 51; | |||
| assert(accum < mask); | |||
| c[1] += accum; | |||
| } | |||
| void | |||
| @@ -188,27 +57,25 @@ p255_mulw ( | |||
| const p255_t *as, | |||
| uint64_t b | |||
| ) { | |||
| const uint64_t *a = as->limb; | |||
| const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | |||
| int i; | |||
| uint64_t *c = cs->limb; | |||
| __uint128_t accum0 = 0, accum4 = 0; | |||
| uint64_t mask = (1ull<<56) - 1; | |||
| int i; | |||
| for (i=0; i<4; i++) { | |||
| accum0 += widemul(b, a[i]); | |||
| accum4 += widemul(b, a[i+4]); | |||
| c[i] = accum0 & mask; accum0 >>= 56; | |||
| c[i+4] = accum4 & mask; accum4 >>= 56; | |||
| __uint128_t accum = 0; | |||
| for (i=0; i<5; i++) { | |||
| accum += widemul(b, a[i]); | |||
| c[i] = accum & mask; | |||
| accum >>= 51; | |||
| } | |||
| /* PERF: parallelize? eh well this is reference */ | |||
| accum *= 19; | |||
| accum += c[0]; | |||
| c[0] = accum & mask; | |||
| accum >>= 51; | |||
| accum0 += accum4 + c[4]; | |||
| c[4] = accum0 & mask; | |||
| c[5] += accum0 >> 56; | |||
| accum4 += c[0]; | |||
| c[0] = accum4 & mask; | |||
| c[1] += accum4 >> 56; | |||
| assert(accum < mask); | |||
| c[1] += accum; | |||
| } | |||
| void | |||
| @@ -223,23 +90,21 @@ void | |||
| p255_strong_reduce ( | |||
| p255_t *a | |||
| ) { | |||
| uint64_t mask = (1ull<<56)-1; | |||
| uint64_t mask = (1ull<<51)-1; | |||
| /* first, clear high */ | |||
| a->limb[4] += a->limb[7]>>56; | |||
| a->limb[0] += a->limb[7]>>56; | |||
| a->limb[7] &= mask; | |||
| a->limb[0] += (a->limb[4]>>51)*19; | |||
| a->limb[4] &= mask; | |||
| /* now the total is less than 2^255 - 2^(255-56) + 2^(255-56+8) < 2p */ | |||
| /* now the total is less than 2p */ | |||
| /* compute total_value - p. No need to reduce mod p. */ | |||
| __int128_t scarry = 0; | |||
| int i; | |||
| for (i=0; i<8; i++) { | |||
| scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask); | |||
| for (i=0; i<5; i++) { | |||
| scarry = scarry + a->limb[i] - ((i==0)?mask-18:mask); | |||
| a->limb[i] = scarry & mask; | |||
| scarry >>= 56; | |||
| scarry >>= 51; | |||
| } | |||
| /* uncommon case: it was >= p, so now scarry = 0 and this = x | |||
| @@ -253,10 +118,10 @@ p255_strong_reduce ( | |||
| __uint128_t carry = 0; | |||
| /* add it back */ | |||
| for (i=0; i<8; i++) { | |||
| carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask); | |||
| for (i=0; i<5; i++) { | |||
| carry = carry + a->limb[i] + ((i==0)?(scarry_mask&~18):scarry_mask); | |||
| a->limb[i] = carry & mask; | |||
| carry >>= 56; | |||
| carry >>= 51; | |||
| } | |||
| assert(is_zero(carry + scarry)); | |||
| @@ -271,12 +136,13 @@ p255_serialize ( | |||
| p255_t red; | |||
| p255_copy(&red, x); | |||
| p255_strong_reduce(&red); | |||
| for (i=0; i<8; i++) { | |||
| for (j=0; j<7; j++) { | |||
| serial[7*i+j] = red.limb[i]; | |||
| red.limb[i] >>= 8; | |||
| uint64_t *r = red.limb; | |||
| uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12}; | |||
| for (i=0; i<4; i++) { | |||
| for (j=0; j<8; j++) { | |||
| serial[8*i+j] = ser64[i]; | |||
| ser64[i] >>= 8; | |||
| } | |||
| assert(red.limb[i] == 0); | |||
| } | |||
| } | |||
| @@ -286,33 +152,27 @@ p255_deserialize ( | |||
| const uint8_t serial[32] | |||
| ) { | |||
| int i,j; | |||
| for (i=0; i<8; i++) { | |||
| uint64_t ser64[4], mask = ((1ull<<51)-1); | |||
| for (i=0; i<4; i++) { | |||
| uint64_t out = 0; | |||
| for (j=0; j<7; j++) { | |||
| out |= ((uint64_t)serial[7*i+j])<<(8*j); | |||
| for (j=0; j<8; j++) { | |||
| out |= ((uint64_t)serial[8*i+j])<<(8*j); | |||
| } | |||
| x->limb[i] = out; | |||
| ser64[i] = out; | |||
| } | |||
| /* Check for reduction. | |||
| * | |||
| * The idea is to create a variable ge which is all ones (rather, 56 ones) | |||
| * if and only if the low $i$ words of $x$ are >= those of p. | |||
| * | |||
| * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) | |||
| */ | |||
| uint64_t ge = -1, mask = (1ull<<56)-1; | |||
| for (i=0; i<4; i++) { | |||
| ge &= x->limb[i]; | |||
| } | |||
| /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ | |||
| ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask); | |||
| /* Test for >= 2^255-19 */ | |||
| uint64_t ge = -(((__uint128_t)ser64[0]+19)>>64); | |||
| ge &= ser64[1]; | |||
| ge &= ser64[2]; | |||
| ge &= (ser64[3]<<1) + 1; | |||
| ge |= -(((__uint128_t)ser64[3]+0x8000000000000000)>>64); | |||
| /* Propagate the rest */ | |||
| for (i=5; i<8; i++) { | |||
| ge &= x->limb[i]; | |||
| } | |||
| x->limb[0] = ser64[0] & mask; | |||
| x->limb[1] = (ser64[0]>>51 | ser64[1]<<13) & mask; | |||
| x->limb[2] = (ser64[1]>>38 | ser64[2]<<26) & mask; | |||
| x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask; | |||
| x->limb[4] = ser64[3]>>12; | |||
| return ~is_zero(ge ^ mask); | |||
| return ~is_zero(~ge); | |||
| } | |||
| @@ -15,7 +15,17 @@ typedef struct p255_t { | |||
| } p255_t; | |||
| #define LBITS 51 | |||
| #define FIELD_LITERAL(a,b,c,d,e) {{a,b,c,d,e}} | |||
| #define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }} | |||
| /* | |||
| #define FIELD_LITERAL(a,b,c,d) {{ \ | |||
| (a##ull) & LMASK, \ | |||
| ((a##ull)>>51 | (b##ull)<<13) & LMASK, \ | |||
| ((b##ull)>>38 | (c##ull)<<26) & LMASK, \ | |||
| ((c##ull)>>25 | (d##ull)<<39) & LMASK, \ | |||
| (d##ull)>>12 \ | |||
| }} | |||
| */ | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| @@ -140,9 +150,9 @@ p255_weak_reduce ( | |||
| p255_t *a | |||
| ) { | |||
| uint64_t mask = (1ull<<51) - 1; | |||
| uint64_t tmp = a->limb[5] >> 51; | |||
| uint64_t tmp = a->limb[4] >> 51; | |||
| int i; | |||
| for (i=7; i>0; i--) { | |||
| for (i=4; i>0; i--) { | |||
| a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>51); | |||
| } | |||
| a->limb[0] = (a->limb[0] & mask) + tmp*19; | |||
| @@ -10,58 +10,51 @@ | |||
| #include "field.h" | |||
| extern field_a_t ONE; // TODO | |||
| static const field_a_t SQRT_MINUS_ONE = FIELD_LITERAL( // FIXME goes elsewhere? | |||
| static const field_a_t SQRT_MINUS_ONE = {FIELD_LITERAL( // FIXME goes elsewhere? | |||
| 0x61b274a0ea0b0, | |||
| 0x0d5a5fc8f189d, | |||
| 0x7ef5e9cbd0c60, | |||
| 0x78595a6804c9e, | |||
| 0x2b8324804fc1d | |||
| ); | |||
| )}; | |||
| static const field_a_t ONE = {FIELD_LITERAL( // FIXME copy-pasted | |||
| 1,0,0,0,0 | |||
| )}; | |||
| void | |||
| field_isr ( | |||
| field_a_t a, | |||
| const field_a_t x | |||
| ) { | |||
| field_a_t st[3], tmp1, tmp2; | |||
| const struct { unsigned char sh, idx } ops[] = { | |||
| {1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2} | |||
| }; | |||
| field_cpy(st[0],x); | |||
| field_cpy(st[1],x); | |||
| field_cpy(st[2],x); | |||
| // ARCH MAGIC FIXME copy-pasted from decaf_fast.c | |||
| static mask_t gf_eq(const field_a_t a, const field_a_t b) { | |||
| field_a_t c; | |||
| field_sub(c,a,b); | |||
| field_strong_reduce(c); | |||
| mask_t ret=0; | |||
| int i; | |||
| for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) { | |||
| field_sqrn(tmp1, st[1^i&1], ops[i].sh); | |||
| field_mul(tmp2, tmp1, st[ops[i].idx]); | |||
| field_cpy(st[i&1], tmp2); | |||
| } | |||
| mask_t m = field_eq(st[1], ONE); | |||
| cond_sel(tmp1,SQRT_MINUS_ONE,ONE,m); | |||
| field_mul(a,tmp1,st[0]); | |||
| }; | |||
| for (i=0; i<5; i++) { ret |= c->limb[i]; } | |||
| return ((__uint128_t)ret - 1) >> 64; | |||
| } | |||
| /* Guarantee: a^2 x = 0 if x = 0; else a^2 x = 1 or SQRT_MINUS_ONE; */ | |||
| void | |||
| field_isr ( | |||
| field_a_t a, | |||
| const field_a_t x | |||
| ) { | |||
| field_a_t st[3], tmp1, tmp2; | |||
| const struct { unsigned char sh, idx } ops[] = { | |||
| const struct { unsigned char sh, idx; } ops[] = { | |||
| {1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2} | |||
| }; | |||
| field_cpy(st[0],x); | |||
| field_cpy(st[1],x); | |||
| field_cpy(st[2],x); | |||
| int i; | |||
| st[0][0] = st[1][0] = st[2][0] = x[0]; | |||
| unsigned int i; | |||
| for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) { | |||
| field_sqrn(tmp1, st[1^i&1], ops[i].sh); | |||
| field_mul(tmp2, tmp1, st[ops[i].idx]); | |||
| field_cpy(st[i&1], tmp2); | |||
| st[i&1][0] = tmp2[0]; | |||
| } | |||
| mask_t m = field_eq(st[1], ONE); | |||
| mask_t mask = gf_eq(st[1],ONE) | gf_eq(st[1],SQRT_MINUS_ONE); | |||
| // ARCH MAGIC FIXME: should be cond_sel | |||
| for (i=0; i<5; i++) tmp1->limb[i] = (ONE->limb[i] & mask) | |||
| | (SQRT_MINUS_ONE->limb[i] & ~mask); | |||
| field_mul(a,tmp1,st[0]); | |||
| } | |||