Browse Source

It compiles, but it certainly doesn't work yet

master
Michael Hamburg 9 years ago
parent
commit
03ecad0551
9 changed files with 240 additions and 301 deletions
  1. +4
    -4
      Makefile
  2. +1
    -1
      include/decaf_255.h
  3. +32
    -32
      include/decaf_crypto.h
  4. +3
    -3
      include/shake.hxx
  5. +39
    -13
      src/decaf_fast.c
  6. +50
    -0
      src/include/decaf_255_config.h
  7. +72
    -212
      src/p25519/arch_ref64/p25519.c
  8. +13
    -3
      src/p25519/arch_ref64/p25519.h
  9. +26
    -33
      src/p25519/f_arithmetic.c

+ 4
- 4
Makefile View File

@@ -19,13 +19,13 @@ ASM ?= $(CC)
DECAF ?= decaf_fast DECAF ?= decaf_fast


ifneq (,$(findstring x86_64,$(MACHINE))) ifneq (,$(findstring x86_64,$(MACHINE)))
ARCH ?= arch_x86_64
ARCH ?= arch_ref64
else else
# no i386 port yet # no i386 port yet
ARCH ?= arch_arm_32
ARCH ?= arch_ref32
endif endif


FIELD ?= p255
FIELD ?= p25519


WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
-Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN) -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)
@@ -35,7 +35,7 @@ INCFLAGS = -Isrc/include -Iinclude -Isrc/$(FIELD) -Isrc/$(FIELD)/$(ARCH)
LANGFLAGS = -std=c99 -fno-strict-aliasing LANGFLAGS = -std=c99 -fno-strict-aliasing
LANGXXFLAGS = -fno-strict-aliasing LANGXXFLAGS = -fno-strict-aliasing
GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC
OFLAGS ?= -O3
OFLAGS ?= -O2


TODAY = $(shell date "+%Y-%m-%d") TODAY = $(shell date "+%Y-%m-%d")




+ 1
- 1
include/decaf_255.h View File

@@ -61,7 +61,7 @@ typedef uint64_t decaf_dword_t;
/** Galois field element internal structure */ /** Galois field element internal structure */
typedef struct gf_s { typedef struct gf_s {
decaf_word_t limb[DECAF_255_LIMBS]; decaf_word_t limb[DECAF_255_LIMBS];
} __attribute__((aligned(32))) gf_s, gf[1];
} gf_s, gf[1];
/** @endcond */ /** @endcond */


/** Number of bytes in a serialized point. */ /** Number of bytes in a serialized point. */


+ 32
- 32
include/decaf_crypto.h View File

@@ -18,7 +18,7 @@
#include "shake.h" #include "shake.h"


/** Number of bytes for a symmetric key (expanded to full key) */ /** Number of bytes for a symmetric key (expanded to full key) */
#define DECAF_448_SYMMETRIC_KEY_BYTES 32
#define DECAF_255_SYMMETRIC_KEY_BYTES 32


/** @cond internal */ /** @cond internal */
#define API_VIS __attribute__((visibility("default"))) __attribute__((noinline)) // TODO: synergize with decaf.h #define API_VIS __attribute__((visibility("default"))) __attribute__((noinline)) // TODO: synergize with decaf.h
@@ -31,29 +31,29 @@
/** @endcond */ /** @endcond */


/** A symmetric key, the compressed point of a private key. */ /** A symmetric key, the compressed point of a private key. */
typedef unsigned char decaf_448_symmetric_key_t[DECAF_448_SYMMETRIC_KEY_BYTES];
typedef unsigned char decaf_255_symmetric_key_t[DECAF_255_SYMMETRIC_KEY_BYTES];


/** An encoded public key. */ /** An encoded public key. */
typedef unsigned char decaf_448_public_key_t[DECAF_448_SER_BYTES];
typedef unsigned char decaf_255_public_key_t[DECAF_255_SER_BYTES];


/** A signature. */ /** A signature. */
typedef unsigned char decaf_448_signature_t[DECAF_448_SER_BYTES + DECAF_448_SCALAR_BYTES];
typedef unsigned char decaf_255_signature_t[DECAF_255_SER_BYTES + DECAF_255_SCALAR_BYTES];


typedef struct { typedef struct {
/** @cond internal */ /** @cond internal */
/** The symmetric key from which everything is expanded */ /** The symmetric key from which everything is expanded */
decaf_448_symmetric_key_t sym;
decaf_255_symmetric_key_t sym;
/** The scalar x */ /** The scalar x */
decaf_448_scalar_t secret_scalar;
decaf_255_scalar_t secret_scalar;
/** x*Base */ /** x*Base */
decaf_448_public_key_t pub;
decaf_255_public_key_t pub;
/** @endcond */ /** @endcond */
} /** Private key structure for pointers. */ } /** Private key structure for pointers. */
decaf_448_private_key_s,
decaf_255_private_key_s,
/** A private key (gmp array[1] style). */ /** A private key (gmp array[1] style). */
decaf_448_private_key_t[1];
decaf_255_private_key_t[1];


#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@@ -64,16 +64,16 @@ extern "C" {
* @param [out] priv The derived private key. * @param [out] priv The derived private key.
* @param [in] proto The compressed or proto-key, which must be 32 random bytes. * @param [in] proto The compressed or proto-key, which must be 32 random bytes.
*/ */
void decaf_448_derive_private_key (
decaf_448_private_key_t priv,
const decaf_448_symmetric_key_t proto
void decaf_255_derive_private_key (
decaf_255_private_key_t priv,
const decaf_255_symmetric_key_t proto
) NONNULL2 API_VIS; ) NONNULL2 API_VIS;


/** /**
* @brief Destroy a private key. * @brief Destroy a private key.
*/ */
void decaf_448_destroy_private_key (
decaf_448_private_key_t priv
void decaf_255_destroy_private_key (
decaf_255_private_key_t priv
) NONNULL1 API_VIS; ) NONNULL1 API_VIS;


/** /**
@@ -81,9 +81,9 @@ void decaf_448_destroy_private_key (
* @param [out] pub The extracted public key. * @param [out] pub The extracted public key.
* @param [in] priv The private key. * @param [in] priv The private key.
*/ */
void decaf_448_private_to_public (
decaf_448_public_key_t pub,
const decaf_448_private_key_t priv
void decaf_255_private_to_public (
decaf_255_public_key_t pub,
const decaf_255_private_key_t priv
) NONNULL2 API_VIS; ) NONNULL2 API_VIS;
/** /**
@@ -104,11 +104,11 @@ void decaf_448_private_to_public (
* and will almost definitely change in the future. * and will almost definitely change in the future.
*/ */
decaf_bool_t decaf_bool_t
decaf_448_shared_secret (
decaf_255_shared_secret (
uint8_t *shared, uint8_t *shared,
size_t shared_bytes, size_t shared_bytes,
const decaf_448_private_key_t my_privkey,
const decaf_448_public_key_t your_pubkey
const decaf_255_private_key_t my_privkey,
const decaf_255_public_key_t your_pubkey
) NONNULL134 WARN_UNUSED API_VIS; ) NONNULL134 WARN_UNUSED API_VIS;
/** /**
@@ -119,9 +119,9 @@ decaf_448_shared_secret (
* @param [in] shake A SHAKE256 context with the message. * @param [in] shake A SHAKE256 context with the message.
*/ */
void void
decaf_448_sign_shake (
decaf_448_signature_t sig,
const decaf_448_private_key_t priv,
decaf_255_sign_shake (
decaf_255_signature_t sig,
const decaf_255_private_key_t priv,
const keccak_sponge_t shake const keccak_sponge_t shake
) NONNULL3 API_VIS; ) NONNULL3 API_VIS;


@@ -134,9 +134,9 @@ decaf_448_sign_shake (
* @param [in] message_len The message's length. * @param [in] message_len The message's length.
*/ */
void void
decaf_448_sign (
decaf_448_signature_t sig,
const decaf_448_private_key_t priv,
decaf_255_sign (
decaf_255_signature_t sig,
const decaf_255_private_key_t priv,
const unsigned char *message, const unsigned char *message,
size_t message_len size_t message_len
) NONNULL3 API_VIS; ) NONNULL3 API_VIS;
@@ -149,9 +149,9 @@ decaf_448_sign (
* @param [in] shake A SHAKE256 context with the message. * @param [in] shake A SHAKE256 context with the message.
*/ */
decaf_bool_t decaf_bool_t
decaf_448_verify_shake (
const decaf_448_signature_t sig,
const decaf_448_public_key_t pub,
decaf_255_verify_shake (
const decaf_255_signature_t sig,
const decaf_255_public_key_t pub,
const keccak_sponge_t shake const keccak_sponge_t shake
) NONNULL3 API_VIS WARN_UNUSED; ) NONNULL3 API_VIS WARN_UNUSED;


@@ -164,9 +164,9 @@ decaf_448_verify_shake (
* @param [in] message_len The message's length. * @param [in] message_len The message's length.
*/ */
decaf_bool_t decaf_bool_t
decaf_448_verify (
const decaf_448_signature_t sig,
const decaf_448_public_key_t pub,
decaf_255_verify (
const decaf_255_signature_t sig,
const decaf_255_public_key_t pub,
const unsigned char *message, const unsigned char *message,
size_t message_len size_t message_len
) NONNULL3 API_VIS WARN_UNUSED; ) NONNULL3 API_VIS WARN_UNUSED;


+ 3
- 3
include/shake.hxx View File

@@ -192,18 +192,18 @@ private:
}; };


/**@cond internal*/ /**@cond internal*/
inline Ed448::Scalar::Scalar(SpongeRng &rng) NOEXCEPT {
inline Ed255::Scalar::Scalar(SpongeRng &rng) NOEXCEPT {
*this = rng.read(SER_BYTES); *this = rng.read(SER_BYTES);
} }


inline Ed448::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT {
inline Ed255::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT {
SecureBuffer buffer((uniform ? 2 : 1) * HASH_BYTES); SecureBuffer buffer((uniform ? 2 : 1) * HASH_BYTES);
rng.read(buffer); rng.read(buffer);
set_to_hash(buffer); set_to_hash(buffer);
} }




inline SecureBuffer Ed448::Point::steg_encode(SpongeRng &rng) const NOEXCEPT {
inline SecureBuffer Ed255::Point::steg_encode(SpongeRng &rng) const NOEXCEPT {
SecureBuffer out(STEG_BYTES); SecureBuffer out(STEG_BYTES);
bool done; bool done;
do { do {


+ 39
- 13
src/decaf_fast.c View File

@@ -45,14 +45,22 @@ typedef int64_t decaf_sdword_t;
#define siv static inline void __attribute__((always_inline)) #define siv static inline void __attribute__((always_inline))
static const gf ZERO = {{{0}}}, ONE = {{{1}}}, TWO = {{{2}}}; static const gf ZERO = {{{0}}}, ONE = {{{1}}}, TWO = {{{2}}};


static const int EDWARDS_D = 121665;
static const int EDWARDS_D = -89747;
// Gonna test with PinkBikeShed until the math works...
// Curve25519: 121665;


static const scalar_t sc_p = {{{ static const scalar_t sc_p = {{{
// Gonna test with PinkBikeShed until the math works...
SC_LIMB(0xb6b98fd8849faf35),
SC_LIMB(0x16241e6093b2ce59),
SC_LIMB(0),
SC_LIMB(0x2000000000000000)
/* Curve25519:
SC_LIMB(0x5812631a5cf5d3ed), SC_LIMB(0x5812631a5cf5d3ed),
SC_LIMB(0x14def9dea2f79cd6), SC_LIMB(0x14def9dea2f79cd6),
SC_LIMB(0), SC_LIMB(0),
SC_LIMB(0),
SC_LIMB(0x1000000000000000) SC_LIMB(0x1000000000000000)
*/
}}}; }}};


const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}}; const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}};
@@ -61,7 +69,7 @@ extern const decaf_word_t MONTGOMERY_FACTOR;


/* sqrt(9) = 3 from the curve spec. Not exported, but used by pregen tool. */ /* sqrt(9) = 3 from the curve spec. Not exported, but used by pregen tool. */
const unsigned char base_point_ser_for_pregen[SER_BYTES] = { const unsigned char base_point_ser_for_pregen[SER_BYTES] = {
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5 /*PinkBikeShed. Curve25519: 3*/, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
}; };


extern const point_t API_NS(point_base); extern const point_t API_NS(point_base);
@@ -82,16 +90,16 @@ const size_t API_NS2(alignof,precomputed_s) = 32;


#ifdef __clang__ #ifdef __clang__
#if 100*__clang_major__ + __clang_minor__ > 305 #if 100*__clang_major__ + __clang_minor__ > 305
#define VECTORIZE _Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(8)")
#define UNROLL _Pragma("clang loop unroll(full)") // FIXME: vectorize?
#endif #endif
#endif #endif


#ifndef VECTORIZE
#define VECTORIZE
#ifndef UNROLL
#define UNROLL
#endif #endif


#define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++) { op; }} #define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++) { op; }}
#define FOR_LIMB_V(i,op) { unsigned int i=0; VECTORIZE for (i=0; i<NLIMBS; i++) { op; }}
#define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<NLIMBS; i++) { op; }}


/** Copy x = y */ /** Copy x = y */
siv gf_cpy(gf x, const gf y) { x[0] = y[0]; } siv gf_cpy(gf x, const gf y) { x[0] = y[0]; }
@@ -138,7 +146,7 @@ siv gf_bias ( gf c, int amt) {


/** Subtract mod p. Bias by 2 and don't reduce */ /** Subtract mod p. Bias by 2 and don't reduce */
siv gf_sub_nr ( gf_s *__restrict__ c, const gf a, const gf b ) { siv gf_sub_nr ( gf_s *__restrict__ c, const gf a, const gf b ) {
// FOR_LIMB_V(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] );
// FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] );
ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO
field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b); field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
gf_bias(c, 2); gf_bias(c, 2);
@@ -155,7 +163,7 @@ siv gf_sub_nr_x ( gf c, const gf a, const gf b, int amt ) {


/** Add mod p. Don't reduce. */ /** Add mod p. Don't reduce. */
siv gf_add_nr ( gf c, const gf a, const gf b ) { siv gf_add_nr ( gf c, const gf a, const gf b ) {
// FOR_LIMB_V(i, c->limb[i] = a->limb[i] + b->limb[i]);
// FOR_LIMB_U(i, c->limb[i] = a->limb[i] + b->limb[i]);
ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO
field_add_nr((field_t *)c, (const field_t *)a, (const field_t *)b); field_add_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
} }
@@ -183,7 +191,7 @@ sv cond_neg(gf x, decaf_bool_t neg) {


/** Constant time, if (swap) (x,y) = (y,x); */ /** Constant time, if (swap) (x,y) = (y,x); */
siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) { siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) {
FOR_LIMB_V(i, {
FOR_LIMB_U(i, {
decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap; decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap;
x->limb[i] ^= s; x->limb[i] ^= s;
y->limb[i] ^= s; y->limb[i] ^= s;
@@ -371,9 +379,27 @@ decaf_bool_t API_NS(scalar_invert) (
} }
return ~API_NS(scalar_eq)(out,API_NS(scalar_zero)); return ~API_NS(scalar_eq)(out,API_NS(scalar_zero));
#else #else
(void)out;
(void)a;
return 0;
decaf_255_scalar_t b, ma;
int i;
sc_montmul(b,API_NS(scalar_one),sc_r2);
sc_montmul(ma,a,sc_r2);
for (i=SCALAR_BITS-1; i>=0; i--) {
sc_montsqr(b,b);
decaf_word_t w = sc_p->limb[i/WBITS];
if (i<WBITS) {
assert(w >= 2);
w-=2;
}
if (1 & w>>(i%WBITS)) {
sc_montmul(b,b,ma);
}
}

sc_montmul(out,b,decaf_255_scalar_one);
API_NS(scalar_destroy)(b);
API_NS(scalar_destroy)(ma);
return ~API_NS(scalar_eq)(out,decaf_255_scalar_zero);
#endif #endif
} }




+ 50
- 0
src/include/decaf_255_config.h View File

@@ -0,0 +1,50 @@
/**
* @file decaf_255_config.h
* @author Mike Hamburg
*
* @copyright
* Copyright (c) 2015 Cryptography Research, Inc. \n
* Released under the MIT License. See LICENSE.txt for license information.
*
* @brief Configuration for decaf_fast.c
*
* Every setting below is a plain preprocessor constant, so it can be used
* both in `#if` conditions and in ordinary expressions.
*/
#ifndef __DECAF_255_CONFIG_H__
#define __DECAF_255_CONFIG_H__ 1

/**
* Use the Montgomery ladder for direct scalarmul.
*
* The Montgomery ladder is faster than Edwards scalarmul, but providing
* the features Decaf supports (cofactor elimination, twist rejection)
* makes it complicated and adds code. Removing the ladder saves a few
* kilobytes at the cost of perhaps 5-10% overhead in direct scalarmul
* time.
*/
#define DECAF_USE_MONTGOMERY_LADDER 1

/** The number of comb tables for fixed base scalarmul. */
#define DECAF_COMBS_N 3

/** The number of teeth per comb for fixed base scalarmul. */
#define DECAF_COMBS_T 5

/**
* The comb spacing for fixed base scalarmul.
*
* NOTE(review): with the values in this file, N*T*S = 3*5*17 = 255;
* presumably the product has to cover the scalar bit length -- confirm
* against the comb code in decaf_fast.c before changing any of the three.
*/
#define DECAF_COMBS_S 17

/** Performance tuning: the width of the fixed window for scalar mul. */
#define DECAF_WINDOW_BITS 4

/**
* The number of bits used for the precomputed table in variable-time
* double scalarmul.
*/
#define DECAF_WNAF_FIXED_TABLE_BITS 5

/**
* Performance tuning: bits used for the variable table in variable-time
* double scalarmul.
*/
#define DECAF_WNAF_VAR_TABLE_BITS 3


#endif /* __DECAF_255_CONFIG_H__ */

+ 72
- 212
src/p25519/arch_ref64/p25519.c View File

@@ -22,164 +22,33 @@ p255_mul (
const p255_t *as, const p255_t *as,
const p255_t *bs const p255_t *bs
) { ) {
const uint64_t *a = as->limb, *b = bs->limb;
const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
uint64_t bh[4];
int i,j;
for (i=0; i<4; i++) bh[i] = b[i+1] * 19;
uint64_t *c = cs->limb; uint64_t *c = cs->limb;


__uint128_t accum0 = 0, accum1 = 0, accum2;
uint64_t mask = (1ull<<51) - 1;

uint64_t aa[4], bb[4], bbb[4];

unsigned int i;
for (i=0; i<4; i++) {
aa[i] = a[i] + a[i+4];
bb[i] = b[i] + b[i+4];
bbb[i] = bb[i] + b[i+4];
}

int I_HATE_UNROLLED_LOOPS = 0;

if (I_HATE_UNROLLED_LOOPS) {
/* The compiler probably won't unroll this,
* so it's like 80% slower.
*/
for (i=0; i<4; i++) {
accum2 = 0;

unsigned int j;
for (j=0; j<=i; j++) {
accum2 += widemul(a[j], b[i-j]);
accum1 += widemul(aa[j], bb[i-j]);
accum0 += widemul(a[j+4], b[i-j+4]);
}
for (; j<4; j++) {
accum2 += widemul(a[j], b[i-j+8]);
accum1 += widemul(aa[j], bbb[i-j+4]);
accum0 += widemul(a[j+4], bb[i-j+4]);
}

accum1 -= accum2;
accum0 += accum2;

c[i] = ((uint64_t)(accum0)) & mask;
c[i+4] = ((uint64_t)(accum1)) & mask;

accum0 >>= 56;
accum1 >>= 56;
__uint128_t accum = 0;
for (i=0; i<5; i++) {
for (j=0; j<=i; j++) {
accum += widemul(b[i-j], a[j]);
} }
} else {
accum2 = widemul(a[0], b[0]);
accum1 += widemul(aa[0], bb[0]);
accum0 += widemul(a[4], b[4]);

accum2 += widemul(a[1], b[7]);
accum1 += widemul(aa[1], bbb[3]);
accum0 += widemul(a[5], bb[3]);

accum2 += widemul(a[2], b[6]);
accum1 += widemul(aa[2], bbb[2]);
accum0 += widemul(a[6], bb[2]);

accum2 += widemul(a[3], b[5]);
accum1 += widemul(aa[3], bbb[1]);
accum0 += widemul(a[7], bb[1]);

accum1 -= accum2;
accum0 += accum2;

c[0] = ((uint64_t)(accum0)) & mask;
c[4] = ((uint64_t)(accum1)) & mask;

accum0 >>= 56;
accum1 >>= 56;

accum2 = widemul(a[0], b[1]);
accum1 += widemul(aa[0], bb[1]);
accum0 += widemul(a[4], b[5]);

accum2 += widemul(a[1], b[0]);
accum1 += widemul(aa[1], bb[0]);
accum0 += widemul(a[5], b[4]);

accum2 += widemul(a[2], b[7]);
accum1 += widemul(aa[2], bbb[3]);
accum0 += widemul(a[6], bb[3]);

accum2 += widemul(a[3], b[6]);
accum1 += widemul(aa[3], bbb[2]);
accum0 += widemul(a[7], bb[2]);

accum1 -= accum2;
accum0 += accum2;

c[1] = ((uint64_t)(accum0)) & mask;
c[5] = ((uint64_t)(accum1)) & mask;

accum0 >>= 56;
accum1 >>= 56;

accum2 = widemul(a[0], b[2]);
accum1 += widemul(aa[0], bb[2]);
accum0 += widemul(a[4], b[6]);

accum2 += widemul(a[1], b[1]);
accum1 += widemul(aa[1], bb[1]);
accum0 += widemul(a[5], b[5]);

accum2 += widemul(a[2], b[0]);
accum1 += widemul(aa[2], bb[0]);
accum0 += widemul(a[6], b[4]);

accum2 += widemul(a[3], b[7]);
accum1 += widemul(aa[3], bbb[3]);
accum0 += widemul(a[7], bb[3]);

accum1 -= accum2;
accum0 += accum2;

c[2] = ((uint64_t)(accum0)) & mask;
c[6] = ((uint64_t)(accum1)) & mask;

accum0 >>= 56;
accum1 >>= 56;

accum2 = widemul(a[0], b[3]);
accum1 += widemul(aa[0], bb[3]);
accum0 += widemul(a[4], b[7]);

accum2 += widemul(a[1], b[2]);
accum1 += widemul(aa[1], bb[2]);
accum0 += widemul(a[5], b[6]);

accum2 += widemul(a[2], b[1]);
accum1 += widemul(aa[2], bb[1]);
accum0 += widemul(a[6], b[5]);

accum2 += widemul(a[3], b[0]);
accum1 += widemul(aa[3], bb[0]);
accum0 += widemul(a[7], b[4]);

accum1 -= accum2;
accum0 += accum2;

c[3] = ((uint64_t)(accum0)) & mask;
c[7] = ((uint64_t)(accum1)) & mask;

accum0 >>= 56;
accum1 >>= 56;
} /* !I_HATE_UNROLLED_LOOPS */

accum0 += accum1;
accum0 += c[4];
accum1 += c[0];
c[4] = ((uint64_t)(accum0)) & mask;
c[0] = ((uint64_t)(accum1)) & mask;

accum0 >>= 56;
accum1 >>= 56;

c[5] += ((uint64_t)(accum0));
c[1] += ((uint64_t)(accum1));
for (; j<5; j++) {
accum += widemul(bh[i-j+4], a[j]);
}
c[i] = accum & mask;
accum >>= 51;
}
/* PERF: parallelize? eh well this is reference */
accum *= 19;
accum += c[0];
c[0] = accum & mask;
accum >>= 51;
assert(accum < mask);
c[1] += accum;
} }


void void
@@ -188,27 +57,25 @@ p255_mulw (
const p255_t *as, const p255_t *as,
uint64_t b uint64_t b
) { ) {
const uint64_t *a = as->limb;
const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
int i;
uint64_t *c = cs->limb; uint64_t *c = cs->limb;


__uint128_t accum0 = 0, accum4 = 0;
uint64_t mask = (1ull<<56) - 1;

int i;
for (i=0; i<4; i++) {
accum0 += widemul(b, a[i]);
accum4 += widemul(b, a[i+4]);
c[i] = accum0 & mask; accum0 >>= 56;
c[i+4] = accum4 & mask; accum4 >>= 56;
__uint128_t accum = 0;
for (i=0; i<5; i++) {
accum += widemul(b, a[i]);
c[i] = accum & mask;
accum >>= 51;
} }
/* PERF: parallelize? eh well this is reference */
accum *= 19;
accum += c[0];
c[0] = accum & mask;
accum >>= 51;
accum0 += accum4 + c[4];
c[4] = accum0 & mask;
c[5] += accum0 >> 56;

accum4 += c[0];
c[0] = accum4 & mask;
c[1] += accum4 >> 56;
assert(accum < mask);
c[1] += accum;
} }


void void
@@ -223,23 +90,21 @@ void
p255_strong_reduce ( p255_strong_reduce (
p255_t *a p255_t *a
) { ) {
uint64_t mask = (1ull<<56)-1;
uint64_t mask = (1ull<<51)-1;


/* first, clear high */ /* first, clear high */
a->limb[4] += a->limb[7]>>56;
a->limb[0] += a->limb[7]>>56;
a->limb[7] &= mask;
a->limb[0] += (a->limb[4]>>51)*19;
a->limb[4] &= mask;


/* now the total is less than 2^255 - 2^(255-56) + 2^(255-56+8) < 2p */
/* now the total is less than 2p */


/* compute total_value - p. No need to reduce mod p. */ /* compute total_value - p. No need to reduce mod p. */

__int128_t scarry = 0; __int128_t scarry = 0;
int i; int i;
for (i=0; i<8; i++) {
scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask);
for (i=0; i<5; i++) {
scarry = scarry + a->limb[i] - ((i==0)?mask-18:mask);
a->limb[i] = scarry & mask; a->limb[i] = scarry & mask;
scarry >>= 56;
scarry >>= 51;
} }


/* uncommon case: it was >= p, so now scarry = 0 and this = x /* uncommon case: it was >= p, so now scarry = 0 and this = x
@@ -253,10 +118,10 @@ p255_strong_reduce (
__uint128_t carry = 0; __uint128_t carry = 0;


/* add it back */ /* add it back */
for (i=0; i<8; i++) {
carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask);
for (i=0; i<5; i++) {
carry = carry + a->limb[i] + ((i==0)?(scarry_mask&~18):scarry_mask);
a->limb[i] = carry & mask; a->limb[i] = carry & mask;
carry >>= 56;
carry >>= 51;
} }


assert(is_zero(carry + scarry)); assert(is_zero(carry + scarry));
@@ -271,12 +136,13 @@ p255_serialize (
p255_t red; p255_t red;
p255_copy(&red, x); p255_copy(&red, x);
p255_strong_reduce(&red); p255_strong_reduce(&red);
for (i=0; i<8; i++) {
for (j=0; j<7; j++) {
serial[7*i+j] = red.limb[i];
red.limb[i] >>= 8;
uint64_t *r = red.limb;
uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
for (i=0; i<4; i++) {
for (j=0; j<8; j++) {
serial[8*i+j] = ser64[i];
ser64[i] >>= 8;
} }
assert(red.limb[i] == 0);
} }
} }


@@ -286,33 +152,27 @@ p255_deserialize (
const uint8_t serial[32] const uint8_t serial[32]
) { ) {
int i,j; int i,j;
for (i=0; i<8; i++) {
uint64_t ser64[4], mask = ((1ull<<51)-1);
for (i=0; i<4; i++) {
uint64_t out = 0; uint64_t out = 0;
for (j=0; j<7; j++) {
out |= ((uint64_t)serial[7*i+j])<<(8*j);
for (j=0; j<8; j++) {
out |= ((uint64_t)serial[8*i+j])<<(8*j);
} }
x->limb[i] = out;
ser64[i] = out;
} }
/* Check for reduction.
*
* The idea is to create a variable ge which is all ones (rather, 56 ones)
* if and only if the low $i$ words of $x$ are >= those of p.
*
* Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
*/
uint64_t ge = -1, mask = (1ull<<56)-1;
for (i=0; i<4; i++) {
ge &= x->limb[i];
}
/* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */
ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
/* Test for >= 2^255-19 */
uint64_t ge = -(((__uint128_t)ser64[0]+19)>>64);
ge &= ser64[1];
ge &= ser64[2];
ge &= (ser64[3]<<1) + 1;
ge |= -(((__uint128_t)ser64[3]+0x8000000000000000)>>64);
/* Propagate the rest */
for (i=5; i<8; i++) {
ge &= x->limb[i];
}
x->limb[0] = ser64[0] & mask;
x->limb[1] = (ser64[0]>>51 | ser64[1]<<13) & mask;
x->limb[2] = (ser64[1]>>38 | ser64[2]<<26) & mask;
x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask;
x->limb[4] = ser64[3]>>12;
return ~is_zero(ge ^ mask);
return ~is_zero(~ge);
} }

+ 13
- 3
src/p25519/arch_ref64/p25519.h View File

@@ -15,7 +15,17 @@ typedef struct p255_t {
} p255_t; } p255_t;


#define LBITS 51 #define LBITS 51
#define FIELD_LITERAL(a,b,c,d,e) {{a,b,c,d,e}}
#define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}

/*
#define FIELD_LITERAL(a,b,c,d) {{ \
(a##ull) & LMASK, \
((a##ull)>>51 | (b##ull)<<13) & LMASK, \
((b##ull)>>38 | (c##ull)<<26) & LMASK, \
((c##ull)>>25 | (d##ull)<<39) & LMASK, \
(d##ull)>>12 \
}}
*/


#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@@ -140,9 +150,9 @@ p255_weak_reduce (
p255_t *a p255_t *a
) { ) {
uint64_t mask = (1ull<<51) - 1; uint64_t mask = (1ull<<51) - 1;
uint64_t tmp = a->limb[5] >> 51;
uint64_t tmp = a->limb[4] >> 51;
int i; int i;
for (i=7; i>0; i--) {
for (i=4; i>0; i--) {
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>51); a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>51);
} }
a->limb[0] = (a->limb[0] & mask) + tmp*19; a->limb[0] = (a->limb[0] & mask) + tmp*19;


+ 26
- 33
src/p25519/f_arithmetic.c View File

@@ -10,58 +10,51 @@


#include "field.h" #include "field.h"


extern field_a_t ONE; // TODO

static const field_a_t SQRT_MINUS_ONE = FIELD_LITERAL( // FIXME goes elsewhere?
static const field_a_t SQRT_MINUS_ONE = {FIELD_LITERAL( // FIXME goes elsewhere?
0x61b274a0ea0b0, 0x61b274a0ea0b0,
0x0d5a5fc8f189d, 0x0d5a5fc8f189d,
0x7ef5e9cbd0c60, 0x7ef5e9cbd0c60,
0x78595a6804c9e, 0x78595a6804c9e,
0x2b8324804fc1d 0x2b8324804fc1d
);
)};
static const field_a_t ONE = {FIELD_LITERAL( // FIXME copy-pasted
1,0,0,0,0
)};


void
field_isr (
field_a_t a,
const field_a_t x
) {
field_a_t st[3], tmp1, tmp2;
const struct { unsigned char sh, idx } ops[] = {
{1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2}
};
field_cpy(st[0],x);
field_cpy(st[1],x);
field_cpy(st[2],x);
// ARCH MAGIC FIXME copy-pasted from decaf_fast.c
static mask_t gf_eq(const field_a_t a, const field_a_t b) {
field_a_t c;
field_sub(c,a,b);
field_strong_reduce(c);
mask_t ret=0;
int i; int i;
for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) {
field_sqrn(tmp1, st[1^i&1], ops[i].sh);
field_mul(tmp2, tmp1, st[ops[i].idx]);
field_cpy(st[i&1], tmp2);
}
mask_t m = field_eq(st[1], ONE);
cond_sel(tmp1,SQRT_MINUS_ONE,ONE,m);
field_mul(a,tmp1,st[0]);
};
for (i=0; i<5; i++) { ret |= c->limb[i]; }
return ((__uint128_t)ret - 1) >> 64;
}


/* Guarantee: a^2 x = 0 if x = 0; else a^2 x = 1 or SQRT_MINUS_ONE; */
void void
field_isr ( field_isr (
field_a_t a, field_a_t a,
const field_a_t x const field_a_t x
) { ) {
field_a_t st[3], tmp1, tmp2; field_a_t st[3], tmp1, tmp2;
const struct { unsigned char sh, idx } ops[] = {
const struct { unsigned char sh, idx; } ops[] = {
{1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2} {1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2}
}; };
field_cpy(st[0],x);
field_cpy(st[1],x);
field_cpy(st[2],x);
int i;
st[0][0] = st[1][0] = st[2][0] = x[0];
unsigned int i;
for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) { for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) {
field_sqrn(tmp1, st[1^i&1], ops[i].sh); field_sqrn(tmp1, st[1^i&1], ops[i].sh);
field_mul(tmp2, tmp1, st[ops[i].idx]); field_mul(tmp2, tmp1, st[ops[i].idx]);
field_cpy(st[i&1], tmp2);
st[i&1][0] = tmp2[0];
} }
mask_t m = field_eq(st[1], ONE);
mask_t mask = gf_eq(st[1],ONE) | gf_eq(st[1],SQRT_MINUS_ONE);
// ARCH MAGIC FIXME: should be cond_sel
for (i=0; i<5; i++) tmp1->limb[i] = (ONE->limb[i] & mask)
| (SQRT_MINUS_ONE->limb[i] & ~mask);
field_mul(a,tmp1,st[0]);
} }

Loading…
Cancel
Save