From 03ecad0551fe624cb47dbabcce9932c2efe07ffa Mon Sep 17 00:00:00 2001 From: Michael Hamburg Date: Fri, 19 Jun 2015 14:15:20 -0700 Subject: [PATCH] it compiles, but it certainly doesnt work yet --- Makefile | 8 +- include/decaf_255.h | 2 +- include/decaf_crypto.h | 64 ++++---- include/shake.hxx | 6 +- src/decaf_fast.c | 52 ++++-- src/include/decaf_255_config.h | 50 ++++++ src/p25519/arch_ref64/p25519.c | 284 +++++++++------------------------ src/p25519/arch_ref64/p25519.h | 16 +- src/p25519/f_arithmetic.c | 59 +++---- 9 files changed, 240 insertions(+), 301 deletions(-) create mode 100644 src/include/decaf_255_config.h diff --git a/Makefile b/Makefile index c3a295e..4ac57aa 100644 --- a/Makefile +++ b/Makefile @@ -19,13 +19,13 @@ ASM ?= $(CC) DECAF ?= decaf_fast ifneq (,$(findstring x86_64,$(MACHINE))) -ARCH ?= arch_x86_64 +ARCH ?= arch_ref64 else # no i386 port yet -ARCH ?= arch_arm_32 +ARCH ?= arch_ref32 endif -FIELD ?= p255 +FIELD ?= p25519 WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN) @@ -35,7 +35,7 @@ INCFLAGS = -Isrc/include -Iinclude -Isrc/$(FIELD) -Isrc/$(FIELD)/$(ARCH) LANGFLAGS = -std=c99 -fno-strict-aliasing LANGXXFLAGS = -fno-strict-aliasing GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC -OFLAGS ?= -O3 +OFLAGS ?= -O2 TODAY = $(shell date "+%Y-%m-%d") diff --git a/include/decaf_255.h b/include/decaf_255.h index 10e3b74..b978155 100644 --- a/include/decaf_255.h +++ b/include/decaf_255.h @@ -61,7 +61,7 @@ typedef uint64_t decaf_dword_t; /** Galois field element internal structure */ typedef struct gf_s { decaf_word_t limb[DECAF_255_LIMBS]; -} __attribute__((aligned(32))) gf_s, gf[1]; +} gf_s, gf[1]; /** @endcond */ /** Number of bytes in a serialized point. */ diff --git a/include/decaf_crypto.h b/include/decaf_crypto.h index 6e428fc..6e34bdd 100644 --- a/include/decaf_crypto.h +++ b/include/decaf_crypto.h @@ -18,7 +18,7 @@ #include "shake.h" /** Number of bytes for a symmetric key (expanded to full key) */ -#define DECAF_448_SYMMETRIC_KEY_BYTES 32 +#define DECAF_255_SYMMETRIC_KEY_BYTES 32 /** @cond internal */ #define API_VIS __attribute__((visibility("default"))) __attribute__((noinline)) // TODO: synergize with decaf.h @@ -31,29 +31,29 @@ /** @endcond */ /** A symmetric key, the compressed point of a private key. */ -typedef unsigned char decaf_448_symmetric_key_t[DECAF_448_SYMMETRIC_KEY_BYTES]; +typedef unsigned char decaf_255_symmetric_key_t[DECAF_255_SYMMETRIC_KEY_BYTES]; /** An encoded public key. */ -typedef unsigned char decaf_448_public_key_t[DECAF_448_SER_BYTES]; +typedef unsigned char decaf_255_public_key_t[DECAF_255_SER_BYTES]; /** A signature. */ -typedef unsigned char decaf_448_signature_t[DECAF_448_SER_BYTES + DECAF_448_SCALAR_BYTES]; +typedef unsigned char decaf_255_signature_t[DECAF_255_SER_BYTES + DECAF_255_SCALAR_BYTES]; typedef struct { /** @cond intetrnal */ /** The symmetric key from which everything is expanded */ - decaf_448_symmetric_key_t sym; + decaf_255_symmetric_key_t sym; /** The scalar x */ - decaf_448_scalar_t secret_scalar; + decaf_255_scalar_t secret_scalar; /** x*Base */ - decaf_448_public_key_t pub; + decaf_255_public_key_t pub; /** @endcond */ } /** Private key structure for pointers. */ - decaf_448_private_key_s, + decaf_255_private_key_s, /** A private key (gmp array[1] style). */ - decaf_448_private_key_t[1]; + decaf_255_private_key_t[1]; #ifdef __cplusplus extern "C" { @@ -64,16 +64,16 @@ extern "C" { * @param [out] priv The derived private key. * @param [in] proto The compressed or proto-key, which must be 32 random bytes. */ -void decaf_448_derive_private_key ( - decaf_448_private_key_t priv, - const decaf_448_symmetric_key_t proto +void decaf_255_derive_private_key ( + decaf_255_private_key_t priv, + const decaf_255_symmetric_key_t proto ) NONNULL2 API_VIS; /** * @brief Destroy a private key. */ -void decaf_448_destroy_private_key ( - decaf_448_private_key_t priv +void decaf_255_destroy_private_key ( + decaf_255_private_key_t priv ) NONNULL1 API_VIS; /** @@ -81,9 +81,9 @@ void decaf_448_destroy_private_key ( * @param [out] pub The extracted private key. * @param [in] priv The private key. */ -void decaf_448_private_to_public ( - decaf_448_public_key_t pub, - const decaf_448_private_key_t priv +void decaf_255_private_to_public ( + decaf_255_public_key_t pub, + const decaf_255_private_key_t priv ) NONNULL2 API_VIS; /** @@ -104,11 +104,11 @@ void decaf_448_private_to_public ( * and will almost definitely change in the future. */ decaf_bool_t -decaf_448_shared_secret ( +decaf_255_shared_secret ( uint8_t *shared, size_t shared_bytes, - const decaf_448_private_key_t my_privkey, - const decaf_448_public_key_t your_pubkey + const decaf_255_private_key_t my_privkey, + const decaf_255_public_key_t your_pubkey ) NONNULL134 WARN_UNUSED API_VIS; /** @@ -119,9 +119,9 @@ decaf_448_shared_secret ( * @param [in] shake A SHAKE256 context with the message. */ void -decaf_448_sign_shake ( - decaf_448_signature_t sig, - const decaf_448_private_key_t priv, +decaf_255_sign_shake ( + decaf_255_signature_t sig, + const decaf_255_private_key_t priv, const keccak_sponge_t shake ) NONNULL3 API_VIS; @@ -134,9 +134,9 @@ decaf_448_sign_shake ( * @param [in] message_len The message's length. */ void -decaf_448_sign ( - decaf_448_signature_t sig, - const decaf_448_private_key_t priv, +decaf_255_sign ( + decaf_255_signature_t sig, + const decaf_255_private_key_t priv, const unsigned char *message, size_t message_len ) NONNULL3 API_VIS; @@ -149,9 +149,9 @@ decaf_448_sign ( * @param [in] shake A SHAKE256 context with the message. */ decaf_bool_t -decaf_448_verify_shake ( - const decaf_448_signature_t sig, - const decaf_448_public_key_t pub, +decaf_255_verify_shake ( + const decaf_255_signature_t sig, + const decaf_255_public_key_t pub, const keccak_sponge_t shake ) NONNULL3 API_VIS WARN_UNUSED; @@ -164,9 +164,9 @@ decaf_448_verify_shake ( * @param [in] message_len The message's length. */ decaf_bool_t -decaf_448_verify ( - const decaf_448_signature_t sig, - const decaf_448_public_key_t pub, +decaf_255_verify ( + const decaf_255_signature_t sig, + const decaf_255_public_key_t pub, const unsigned char *message, size_t message_len ) NONNULL3 API_VIS WARN_UNUSED; diff --git a/include/shake.hxx b/include/shake.hxx index 95b5d00..97edec5 100644 --- a/include/shake.hxx +++ b/include/shake.hxx @@ -192,18 +192,18 @@ private: }; /**@cond internal*/ -inline Ed448::Scalar::Scalar(SpongeRng &rng) NOEXCEPT { +inline Ed255::Scalar::Scalar(SpongeRng &rng) NOEXCEPT { *this = rng.read(SER_BYTES); } -inline Ed448::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT { +inline Ed255::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT { SecureBuffer buffer((uniform ? 2 : 1) * HASH_BYTES); rng.read(buffer); set_to_hash(buffer); } -inline SecureBuffer Ed448::Point::steg_encode(SpongeRng &rng) const NOEXCEPT { +inline SecureBuffer Ed255::Point::steg_encode(SpongeRng &rng) const NOEXCEPT { SecureBuffer out(STEG_BYTES); bool done; do { diff --git a/src/decaf_fast.c b/src/decaf_fast.c index 7a404b9..faa684c 100644 --- a/src/decaf_fast.c +++ b/src/decaf_fast.c @@ -45,14 +45,22 @@ typedef int64_t decaf_sdword_t; #define siv static inline void __attribute__((always_inline)) static const gf ZERO = {{{0}}}, ONE = {{{1}}}, TWO = {{{2}}}; -static const int EDWARDS_D = 121665; +static const int EDWARDS_D = -89747; + // Gonna test with PinkBikeShed until the math works... + // Curve25519: 121665; static const scalar_t sc_p = {{{ + // Gonna test with PinkBikeShed until the math works... + SC_LIMB(0xb6b98fd8849faf35), + SC_LIMB(0x16241e6093b2ce59), + SC_LIMB(0), + SC_LIMB(0x2000000000000000) + /* Curve25519: SC_LIMB(0x5812631a5cf5d3ed), SC_LIMB(0x14def9dea2f79cd6), SC_LIMB(0), - SC_LIMB(0), SC_LIMB(0x1000000000000000) + */ }}}; const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}}; @@ -61,7 +69,7 @@ extern const decaf_word_t MONTGOMERY_FACTOR; /* sqrt(9) = 3 from the curve spec. Not exported, but used by pregen tool. */ const unsigned char base_point_ser_for_pregen[SER_BYTES] = { - 3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + 5 /*PinkBikeShed. Curve25519: 3*/, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; extern const point_t API_NS(point_base); @@ -82,16 +90,16 @@ const size_t API_NS2(alignof,precomputed_s) = 32; #ifdef __clang__ #if 100*__clang_major__ + __clang_minor__ > 305 -#define VECTORIZE _Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(8)") +#define UNROLL _Pragma("clang loop unroll(full)") // FIXME: vectorize? #endif #endif -#ifndef VECTORIZE -#define VECTORIZE +#ifndef UNROLL +#define UNROLL #endif #define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; ilimb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] ); +// FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] ); ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b); gf_bias(c, 2); @@ -155,7 +163,7 @@ siv gf_sub_nr_x ( gf c, const gf a, const gf b, int amt ) { /** Add mod p. Don't reduce. */ siv gf_add_nr ( gf c, const gf a, const gf b ) { -// FOR_LIMB_V(i, c->limb[i] = a->limb[i] + b->limb[i]); +// FOR_LIMB_U(i, c->limb[i] = a->limb[i] + b->limb[i]); ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO field_add_nr((field_t *)c, (const field_t *)a, (const field_t *)b); } @@ -183,7 +191,7 @@ sv cond_neg(gf x, decaf_bool_t neg) { /** Constant time, if (swap) (x,y) = (y,x); */ siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) { - FOR_LIMB_V(i, { + FOR_LIMB_U(i, { decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap; x->limb[i] ^= s; y->limb[i] ^= s; @@ -371,9 +379,27 @@ decaf_bool_t API_NS(scalar_invert) ( } return ~API_NS(scalar_eq)(out,API_NS(scalar_zero)); #else - (void)out; - (void)a; - return 0; + decaf_255_scalar_t b, ma; + int i; + sc_montmul(b,API_NS(scalar_one),sc_r2); + sc_montmul(ma,a,sc_r2); + for (i=SCALAR_BITS-1; i>=0; i--) { + sc_montsqr(b,b); + + decaf_word_t w = sc_p->limb[i/WBITS]; + if (i= 2); + w-=2; + } + if (1 & w>>(i%WBITS)) { + sc_montmul(b,b,ma); + } + } + + sc_montmul(out,b,decaf_255_scalar_one); + API_NS(scalar_destroy)(b); + API_NS(scalar_destroy)(ma); + return ~API_NS(scalar_eq)(out,decaf_255_scalar_zero); #endif } diff --git a/src/include/decaf_255_config.h b/src/include/decaf_255_config.h new file mode 100644 index 0000000..be9d978 --- /dev/null +++ b/src/include/decaf_255_config.h @@ -0,0 +1,50 @@ +/** + * @file decaf_config.h + * @author Mike Hamburg + * + * @copyright + * Copyright (c) 2015 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. + * + * @brief Configuration for decaf_fast.c + */ +#ifndef __DECAF_255_CONFIG_H__ +#define __DECAF_255_CONFIG_H__ 1 + +/** + * Use the Montgomery ladder for direct scalarmul. + * + * The Montgomery ladder is faster than Edwards scalarmul, but providing + * the features Decaf supports (cofactor elimination, twist rejection) + * makes it complicated and adds code. Removing the ladder saves a few + * kilobytes at the cost of perhaps 5-10% overhead in direct scalarmul + * time. + */ +#define DECAF_USE_MONTGOMERY_LADDER 1 + +/** The number of comb tables for fixed base scalarmul. */ +#define DECAF_COMBS_N 3 + +/** The number of teeth per comb for fixed base scalarmul. */ +#define DECAF_COMBS_T 5 + +/** The comb spacing fixed base scalarmul. */ +#define DECAF_COMBS_S 17 + +/** Performance tuning: the width of the fixed window for scalar mul. */ +#define DECAF_WINDOW_BITS 4 + +/** + * The number of bits used for the precomputed table in variable-time + * double scalarmul. + */ +#define DECAF_WNAF_FIXED_TABLE_BITS 5 + +/** + * Performance tuning: bits used for the variable table in variable-time + * double scalarmul. + */ +#define DECAF_WNAF_VAR_TABLE_BITS 3 + + +#endif /* __DECAF_255_CONFIG_H__ */ diff --git a/src/p25519/arch_ref64/p25519.c b/src/p25519/arch_ref64/p25519.c index b5892fb..37cedb0 100644 --- a/src/p25519/arch_ref64/p25519.c +++ b/src/p25519/arch_ref64/p25519.c @@ -22,164 +22,33 @@ p255_mul ( const p255_t *as, const p255_t *bs ) { - const uint64_t *a = as->limb, *b = bs->limb; + const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); + + uint64_t bh[4]; + int i,j; + for (i=0; i<4; i++) bh[i] = b[i+1] * 19; + uint64_t *c = cs->limb; - __uint128_t accum0 = 0, accum1 = 0, accum2; - uint64_t mask = (1ull<<51) - 1; - - uint64_t aa[4], bb[4], bbb[4]; - - unsigned int i; - for (i=0; i<4; i++) { - aa[i] = a[i] + a[i+4]; - bb[i] = b[i] + b[i+4]; - bbb[i] = bb[i] + b[i+4]; - } - - int I_HATE_UNROLLED_LOOPS = 0; - - if (I_HATE_UNROLLED_LOOPS) { - /* The compiler probably won't unroll this, - * so it's like 80% slower. - */ - for (i=0; i<4; i++) { - accum2 = 0; - - unsigned int j; - for (j=0; j<=i; j++) { - accum2 += widemul(a[j], b[i-j]); - accum1 += widemul(aa[j], bb[i-j]); - accum0 += widemul(a[j+4], b[i-j+4]); - } - for (; j<4; j++) { - accum2 += widemul(a[j], b[i-j+8]); - accum1 += widemul(aa[j], bbb[i-j+4]); - accum0 += widemul(a[j+4], bb[i-j+4]); - } - - accum1 -= accum2; - accum0 += accum2; - - c[i] = ((uint64_t)(accum0)) & mask; - c[i+4] = ((uint64_t)(accum1)) & mask; - - accum0 >>= 56; - accum1 >>= 56; + __uint128_t accum = 0; + for (i=0; i<5; i++) { + for (j=0; j<=i; j++) { + accum += widemul(b[i-j], a[j]); } - } else { - accum2 = widemul(a[0], b[0]); - accum1 += widemul(aa[0], bb[0]); - accum0 += widemul(a[4], b[4]); - - accum2 += widemul(a[1], b[7]); - accum1 += widemul(aa[1], bbb[3]); - accum0 += widemul(a[5], bb[3]); - - accum2 += widemul(a[2], b[6]); - accum1 += widemul(aa[2], bbb[2]); - accum0 += widemul(a[6], bb[2]); - - accum2 += widemul(a[3], b[5]); - accum1 += widemul(aa[3], bbb[1]); - accum0 += widemul(a[7], bb[1]); - - accum1 -= accum2; - accum0 += accum2; - - c[0] = ((uint64_t)(accum0)) & mask; - c[4] = ((uint64_t)(accum1)) & mask; - - accum0 >>= 56; - accum1 >>= 56; - - accum2 = widemul(a[0], b[1]); - accum1 += widemul(aa[0], bb[1]); - accum0 += widemul(a[4], b[5]); - - accum2 += widemul(a[1], b[0]); - accum1 += widemul(aa[1], bb[0]); - accum0 += widemul(a[5], b[4]); - - accum2 += widemul(a[2], b[7]); - accum1 += widemul(aa[2], bbb[3]); - accum0 += widemul(a[6], bb[3]); - - accum2 += widemul(a[3], b[6]); - accum1 += widemul(aa[3], bbb[2]); - accum0 += widemul(a[7], bb[2]); - - accum1 -= accum2; - accum0 += accum2; - - c[1] = ((uint64_t)(accum0)) & mask; - c[5] = ((uint64_t)(accum1)) & mask; - - accum0 >>= 56; - accum1 >>= 56; - - accum2 = widemul(a[0], b[2]); - accum1 += widemul(aa[0], bb[2]); - accum0 += widemul(a[4], b[6]); - - accum2 += widemul(a[1], b[1]); - accum1 += widemul(aa[1], bb[1]); - accum0 += widemul(a[5], b[5]); - - accum2 += widemul(a[2], b[0]); - accum1 += widemul(aa[2], bb[0]); - accum0 += widemul(a[6], b[4]); - - accum2 += widemul(a[3], b[7]); - accum1 += widemul(aa[3], bbb[3]); - accum0 += widemul(a[7], bb[3]); - - accum1 -= accum2; - accum0 += accum2; - - c[2] = ((uint64_t)(accum0)) & mask; - c[6] = ((uint64_t)(accum1)) & mask; - - accum0 >>= 56; - accum1 >>= 56; - - accum2 = widemul(a[0], b[3]); - accum1 += widemul(aa[0], bb[3]); - accum0 += widemul(a[4], b[7]); - - accum2 += widemul(a[1], b[2]); - accum1 += widemul(aa[1], bb[2]); - accum0 += widemul(a[5], b[6]); - - accum2 += widemul(a[2], b[1]); - accum1 += widemul(aa[2], bb[1]); - accum0 += widemul(a[6], b[5]); - - accum2 += widemul(a[3], b[0]); - accum1 += widemul(aa[3], bb[0]); - accum0 += widemul(a[7], b[4]); - - accum1 -= accum2; - accum0 += accum2; - - c[3] = ((uint64_t)(accum0)) & mask; - c[7] = ((uint64_t)(accum1)) & mask; - - accum0 >>= 56; - accum1 >>= 56; - } /* !I_HATE_UNROLLED_LOOPS */ - - accum0 += accum1; - accum0 += c[4]; - accum1 += c[0]; - c[4] = ((uint64_t)(accum0)) & mask; - c[0] = ((uint64_t)(accum1)) & mask; - - accum0 >>= 56; - accum1 >>= 56; - - c[5] += ((uint64_t)(accum0)); - c[1] += ((uint64_t)(accum1)); + for (; j<5; j++) { + accum += widemul(bh[i-j+4], a[j]); + } + c[i] = accum & mask; + accum >>= 51; + } + /* PERF: parallelize? eh well this is reference */ + accum *= 19; + accum += c[0]; + c[0] = accum & mask; + accum >>= 51; + + assert(accum < mask); + c[1] += accum; } void @@ -188,27 +57,25 @@ p255_mulw ( const p255_t *as, uint64_t b ) { - const uint64_t *a = as->limb; + const uint64_t *a = as->limb, mask = ((1ull<<51)-1); + int i; + uint64_t *c = cs->limb; - __uint128_t accum0 = 0, accum4 = 0; - uint64_t mask = (1ull<<56) - 1; - - int i; - for (i=0; i<4; i++) { - accum0 += widemul(b, a[i]); - accum4 += widemul(b, a[i+4]); - c[i] = accum0 & mask; accum0 >>= 56; - c[i+4] = accum4 & mask; accum4 >>= 56; + __uint128_t accum = 0; + for (i=0; i<5; i++) { + accum += widemul(b, a[i]); + c[i] = accum & mask; + accum >>= 51; } + /* PERF: parallelize? eh well this is reference */ + accum *= 19; + accum += c[0]; + c[0] = accum & mask; + accum >>= 51; - accum0 += accum4 + c[4]; - c[4] = accum0 & mask; - c[5] += accum0 >> 56; - - accum4 += c[0]; - c[0] = accum4 & mask; - c[1] += accum4 >> 56; + assert(accum < mask); + c[1] += accum; } void @@ -223,23 +90,21 @@ void p255_strong_reduce ( p255_t *a ) { - uint64_t mask = (1ull<<56)-1; + uint64_t mask = (1ull<<51)-1; /* first, clear high */ - a->limb[4] += a->limb[7]>>56; - a->limb[0] += a->limb[7]>>56; - a->limb[7] &= mask; + a->limb[0] += (a->limb[4]>>51)*19; + a->limb[4] &= mask; - /* now the total is less than 2^255 - 2^(255-56) + 2^(255-56+8) < 2p */ + /* now the total is less than 2p */ /* compute total_value - p. No need to reduce mod p. */ - __int128_t scarry = 0; int i; - for (i=0; i<8; i++) { - scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask); + for (i=0; i<5; i++) { + scarry = scarry + a->limb[i] - ((i==0)?mask-18:mask); a->limb[i] = scarry & mask; - scarry >>= 56; + scarry >>= 51; } /* uncommon case: it was >= p, so now scarry = 0 and this = x @@ -253,10 +118,10 @@ p255_strong_reduce ( __uint128_t carry = 0; /* add it back */ - for (i=0; i<8; i++) { - carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask); + for (i=0; i<5; i++) { + carry = carry + a->limb[i] + ((i==0)?(scarry_mask&~18):scarry_mask); a->limb[i] = carry & mask; - carry >>= 56; + carry >>= 51; } assert(is_zero(carry + scarry)); @@ -271,12 +136,13 @@ p255_serialize ( p255_t red; p255_copy(&red, x); p255_strong_reduce(&red); - for (i=0; i<8; i++) { - for (j=0; j<7; j++) { - serial[7*i+j] = red.limb[i]; - red.limb[i] >>= 8; + uint64_t *r = red.limb; + uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12}; + for (i=0; i<4; i++) { + for (j=0; j<8; j++) { + serial[8*i+j] = ser64[i]; + ser64[i] >>= 8; } - assert(red.limb[i] == 0); } } @@ -286,33 +152,27 @@ p255_deserialize ( const uint8_t serial[32] ) { int i,j; - for (i=0; i<8; i++) { + uint64_t ser64[4], mask = ((1ull<<51)-1); + for (i=0; i<4; i++) { uint64_t out = 0; - for (j=0; j<7; j++) { - out |= ((uint64_t)serial[7*i+j])<<(8*j); + for (j=0; j<8; j++) { + out |= ((uint64_t)serial[8*i+j])<<(8*j); } - x->limb[i] = out; + ser64[i] = out; } - /* Check for reduction. - * - * The idea is to create a variable ge which is all ones (rather, 56 ones) - * if and only if the low $i$ words of $x$ are >= those of p. - * - * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) - */ - uint64_t ge = -1, mask = (1ull<<56)-1; - for (i=0; i<4; i++) { - ge &= x->limb[i]; - } - - /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ - ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask); + /* Test for >= 2^255-19 */ + uint64_t ge = -(((__uint128_t)ser64[0]+19)>>64); + ge &= ser64[1]; + ge &= ser64[2]; + ge &= (ser64[3]<<1) + 1; + ge |= -(((__uint128_t)ser64[3]+0x8000000000000000)>>64); - /* Propagate the rest */ - for (i=5; i<8; i++) { - ge &= x->limb[i]; - } + x->limb[0] = ser64[0] & mask; + x->limb[1] = (ser64[0]>>51 | ser64[1]<<13) & mask; + x->limb[2] = (ser64[1]>>38 | ser64[2]<<26) & mask; + x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask; + x->limb[4] = ser64[3]>>12; - return ~is_zero(ge ^ mask); + return ~is_zero(~ge); } diff --git a/src/p25519/arch_ref64/p25519.h b/src/p25519/arch_ref64/p25519.h index d291222..be64923 100644 --- a/src/p25519/arch_ref64/p25519.h +++ b/src/p25519/arch_ref64/p25519.h @@ -15,7 +15,17 @@ typedef struct p255_t { } p255_t; #define LBITS 51 -#define FIELD_LITERAL(a,b,c,d,e) {{a,b,c,d,e}} +#define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }} + +/* +#define FIELD_LITERAL(a,b,c,d) {{ \ + (a##ull) & LMASK, \ + ((a##ull)>>51 | (b##ull)<<13) & LMASK, \ + ((b##ull)>>38 | (c##ull)<<26) & LMASK, \ + ((c##ull)>>25 | (d##ull)<<39) & LMASK, \ + (d##ull)>>12 \ +}} +*/ #ifdef __cplusplus extern "C" { @@ -140,9 +150,9 @@ p255_weak_reduce ( p255_t *a ) { uint64_t mask = (1ull<<51) - 1; - uint64_t tmp = a->limb[5] >> 51; + uint64_t tmp = a->limb[4] >> 51; int i; - for (i=7; i>0; i--) { + for (i=4; i>0; i--) { a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>51); } a->limb[0] = (a->limb[0] & mask) + tmp*19; diff --git a/src/p25519/f_arithmetic.c b/src/p25519/f_arithmetic.c index 07c140b..eab2640 100644 --- a/src/p25519/f_arithmetic.c +++ b/src/p25519/f_arithmetic.c @@ -10,58 +10,51 @@ #include "field.h" -extern field_a_t ONE; // TODO - -static const field_a_t SQRT_MINUS_ONE = FIELD_LITERAL( // FIXME goes elsewhere? +static const field_a_t SQRT_MINUS_ONE = {FIELD_LITERAL( // FIXME goes elsewhere? 0x61b274a0ea0b0, 0x0d5a5fc8f189d, 0x7ef5e9cbd0c60, 0x78595a6804c9e, 0x2b8324804fc1d -); +)}; + +static const field_a_t ONE = {FIELD_LITERAL( // FIXME copy-pasted + 1,0,0,0,0 +)}; -void -field_isr ( - field_a_t a, - const field_a_t x -) { - field_a_t st[3], tmp1, tmp2; - const struct { unsigned char sh, idx } ops[] = { - {1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2} - }; - field_cpy(st[0],x); - field_cpy(st[1],x); - field_cpy(st[2],x); +// ARCH MAGIC FIXME copy-pasted from decaf_fast.c +static mask_t gf_eq(const field_a_t a, const field_a_t b) { + field_a_t c; + field_sub(c,a,b); + field_strong_reduce(c); + mask_t ret=0; int i; - for (i=0; ilimb[i]; } + return ((__uint128_t)ret - 1) >> 64; +} +/* Guarantee: a^2 x = 0 if x = 0; else a^2 x = 1 or SQRT_MINUS_ONE; */ void field_isr ( field_a_t a, const field_a_t x ) { field_a_t st[3], tmp1, tmp2; - const struct { unsigned char sh, idx } ops[] = { + const struct { unsigned char sh, idx; } ops[] = { {1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2} }; - field_cpy(st[0],x); - field_cpy(st[1],x); - field_cpy(st[2],x); - int i; + st[0][0] = st[1][0] = st[2][0] = x[0]; + unsigned int i; for (i=0; ilimb[i] = (ONE->limb[i] & mask) + | (SQRT_MINUS_ONE->limb[i] & ~mask); + field_mul(a,tmp1,st[0]); }