From edb25d093c9df135cb75c2aba8cf1bc3a80b18c7 Mon Sep 17 00:00:00 2001 From: Mike Hamburg Date: Sun, 22 Mar 2015 18:46:03 -0700 Subject: [PATCH] perf improvement in keygen, sign; fix perf regression for decaf versions of these --- include/decaf.h | 45 ++++++++++++++-------------- include/decaf_crypto.h | 2 +- src/decaf_crypto.c | 4 ++- src/decaf_fast.c | 59 +++++++++++++++++++++++++++++-------- src/include/constant_time.h | 3 +- src/scalarmul.c | 2 +- 6 files changed, 76 insertions(+), 39 deletions(-) diff --git a/include/decaf.h b/include/decaf.h index 40203ea..7609f34 100644 --- a/include/decaf.h +++ b/include/decaf.h @@ -33,6 +33,7 @@ #define __attribute__((x)) #endif #define API_VIS __attribute__((visibility("default"))) +#define NOINLINE __attribute__((noinline)) #define WARN_UNUSED __attribute__((warn_unused_result)) #define NONNULL1 __attribute__((nonnull(1))) #define NONNULL2 __attribute__((nonnull(1,2))) @@ -130,7 +131,7 @@ extern "C" { decaf_bool_t decaf_448_scalar_decode ( decaf_448_scalar_t s, const unsigned char ser[DECAF_448_SCALAR_BYTES] -) API_VIS WARN_UNUSED NONNULL2; +) API_VIS WARN_UNUSED NONNULL2 NOINLINE; /** * @brief Read a scalar from wire format or from bytes. Reduces mod @@ -144,7 +145,7 @@ void decaf_448_scalar_decode_long ( decaf_448_scalar_t s, const unsigned char *ser, size_t ser_len -) API_VIS NONNULL2; +) API_VIS NONNULL2 NOINLINE; /** * @brief Serialize a scalar to wire format. @@ -155,7 +156,7 @@ void decaf_448_scalar_decode_long ( void decaf_448_scalar_encode ( unsigned char ser[DECAF_448_SCALAR_BYTES], const decaf_448_scalar_t s -) API_VIS NONNULL2; +) API_VIS NONNULL2 NOINLINE NOINLINE; /** * @brief Add two scalars. The scalars may use the same memory. @@ -167,7 +168,7 @@ void decaf_448_scalar_add ( decaf_448_scalar_t out, const decaf_448_scalar_t a, const decaf_448_scalar_t b -) API_VIS NONNULL3; +) API_VIS NONNULL3 NOINLINE; /** * @brief Compare two scalars. @@ -179,7 +180,7 @@ void decaf_448_scalar_add ( decaf_bool_t decaf_448_scalar_eq ( const decaf_448_scalar_t a, const decaf_448_scalar_t b -) API_VIS WARN_UNUSED NONNULL2; +) API_VIS WARN_UNUSED NONNULL2 NOINLINE; /** * @brief Subtract two scalars. The scalars may use the same memory. @@ -191,7 +192,7 @@ void decaf_448_scalar_sub ( decaf_448_scalar_t out, const decaf_448_scalar_t a, const decaf_448_scalar_t b -) API_VIS NONNULL3; +) API_VIS NONNULL3 NOINLINE; /** * @brief Multiply two scalars. The scalars may use the same memory. @@ -203,7 +204,7 @@ void decaf_448_scalar_mul ( decaf_448_scalar_t out, const decaf_448_scalar_t a, const decaf_448_scalar_t b -) API_VIS NONNULL3; +) API_VIS NONNULL3 NOINLINE; /** * @brief Copy a scalar. The scalars may use the same memory, in which @@ -225,7 +226,7 @@ void decaf_448_scalar_copy ( void decaf_448_point_encode ( uint8_t ser[DECAF_448_SER_BYTES], const decaf_448_point_t pt -) API_VIS NONNULL2; +) API_VIS NONNULL2 NOINLINE; /** * @brief Decode a point from a sequence of bytes. @@ -244,7 +245,7 @@ decaf_bool_t decaf_448_point_decode ( decaf_448_point_t pt, const uint8_t ser[DECAF_448_SER_BYTES], decaf_bool_t allow_identity -) API_VIS WARN_UNUSED NONNULL2; +) API_VIS WARN_UNUSED NONNULL2 NOINLINE; /** * @brief Copy a point. The input and output may alias, @@ -270,7 +271,7 @@ void decaf_448_point_copy ( decaf_bool_t decaf_448_point_eq ( const decaf_448_point_t a, const decaf_448_point_t b -) API_VIS WARN_UNUSED NONNULL2; +) API_VIS WARN_UNUSED NONNULL2 NOINLINE; /** * @brief Add two points to produce a third point. The @@ -285,7 +286,7 @@ void decaf_448_point_add ( decaf_448_point_t sum, const decaf_448_point_t a, const decaf_448_point_t b -) API_VIS NONNULL3; +) API_VIS NONNULL3; // TODO: NOINLINE? /** * @brief Double a point. Equivalent to @@ -297,7 +298,7 @@ void decaf_448_point_add ( void decaf_448_point_double ( decaf_448_point_t two_a, const decaf_448_point_t a -) API_VIS NONNULL2; +) API_VIS NONNULL2; // TODO: NOINLINE? /** * @brief Subtract two points to produce a third point. The @@ -312,7 +313,7 @@ void decaf_448_point_sub ( decaf_448_point_t diff, const decaf_448_point_t a, const decaf_448_point_t b -) API_VIS NONNULL3; +) API_VIS NONNULL3; // TODO: NOINLINE? /** * @brief Multiply a base point by a scalar: scaled = scalar*base. @@ -325,7 +326,7 @@ void decaf_448_point_scalarmul ( decaf_448_point_t scaled, const decaf_448_point_t base, const decaf_448_scalar_t scalar -) API_VIS NONNULL3; +) API_VIS NONNULL3 NOINLINE; /** * @brief Multiply a base point by a scalar: scaled = scalar*base. @@ -350,7 +351,7 @@ decaf_bool_t decaf_448_direct_scalarmul ( const decaf_448_scalar_t scalar, decaf_bool_t allow_identity, decaf_bool_t short_circuit -) API_VIS NONNULL3 WARN_UNUSED; +) API_VIS NONNULL3 WARN_UNUSED NOINLINE; /** * @brief Precompute a table for fast scalar multiplication. @@ -364,7 +365,7 @@ decaf_bool_t decaf_448_direct_scalarmul ( void decaf_448_precompute ( decaf_448_precomputed_s *a, const decaf_448_point_t b -) API_VIS NONNULL2; +) API_VIS NONNULL2 NOINLINE; /** * @brief Multiply a precomputed base point by a scalar: @@ -381,7 +382,7 @@ void decaf_448_precomputed_scalarmul ( decaf_448_point_t scaled, const decaf_448_precomputed_s *base, const decaf_448_scalar_t scalar -) API_VIS NONNULL3; +) API_VIS NONNULL3 NOINLINE; /** * @brief Multiply two base points by two scalars: @@ -405,7 +406,7 @@ void decaf_448_point_double_scalarmul ( const decaf_448_scalar_t scalar1, const decaf_448_point_t base2, const decaf_448_scalar_t scalar2 -) API_VIS NONNULL5; +) API_VIS NONNULL5 NOINLINE; /** * @brief Test that a point is valid, for debugging purposes. @@ -416,7 +417,7 @@ void decaf_448_point_double_scalarmul ( */ decaf_bool_t decaf_448_point_valid ( const decaf_448_point_t toTest -) API_VIS WARN_UNUSED NONNULL1; +) API_VIS WARN_UNUSED NONNULL1 NOINLINE; /** * @brief Almost-Elligator-like hash to curve. @@ -448,7 +449,7 @@ decaf_bool_t decaf_448_point_valid ( void decaf_448_point_from_hash_nonuniform ( decaf_448_point_t pt, const unsigned char hashed_data[DECAF_448_SER_BYTES] -) API_VIS NONNULL2; +) API_VIS NONNULL2 NOINLINE; /** * @brief Indifferentiable hash function encoding to curve. @@ -461,7 +462,7 @@ void decaf_448_point_from_hash_nonuniform ( void decaf_448_point_from_hash_uniform ( decaf_448_point_t pt, const unsigned char hashed_data[2*DECAF_448_SER_BYTES] -) API_VIS NONNULL2; +) API_VIS NONNULL2 NOINLINE; /** * @brief Overwrite data with zeros. Use memset_s if available. @@ -469,7 +470,7 @@ void decaf_448_point_from_hash_uniform ( void decaf_bzero ( void *data, size_t size -) NONNULL1 API_VIS; +) NONNULL1 API_VIS NOINLINE; /** * @brief Overwrite scalar with zeros. diff --git a/include/decaf_crypto.h b/include/decaf_crypto.h index f967fa9..ae370b8 100644 --- a/include/decaf_crypto.h +++ b/include/decaf_crypto.h @@ -16,7 +16,7 @@ #define DECAF_448_SYMMETRIC_KEY_BYTES 32 /** @cond internal */ -#define API_VIS __attribute__((visibility("default"))) +#define API_VIS __attribute__((visibility("default"))) __attribute__((noinline)) // TODO: synergize with decaf.h #define WARN_UNUSED __attribute__((warn_unused_result)) #define NONNULL1 __attribute__((nonnull(1))) #define NONNULL2 __attribute__((nonnull(1,2))) diff --git a/src/decaf_crypto.c b/src/decaf_crypto.c index 70aebd7..886f287 100644 --- a/src/decaf_crypto.c +++ b/src/decaf_crypto.c @@ -10,6 +10,7 @@ #include "decaf_crypto.h" #include +#include "sha512.h" static const unsigned int DECAF_448_SCALAR_OVERKILL_BYTES = DECAF_448_SCALAR_BYTES + 8; @@ -18,9 +19,10 @@ void decaf_448_derive_private_key ( const decaf_448_symmetric_key_t proto ) { const char *magic = "decaf_448_derive_private_key"; - keccak_sponge_t sponge; uint8_t encoded_scalar[DECAF_448_SCALAR_OVERKILL_BYTES]; decaf_448_point_t pub; + + keccak_sponge_t sponge; shake256_init(sponge); shake256_update(sponge, proto, sizeof(decaf_448_symmetric_key_t)); shake256_update(sponge, (const unsigned char *)magic, strlen(magic)); diff --git a/src/decaf_fast.c b/src/decaf_fast.c index 227a833..3809e8a 100644 --- a/src/decaf_fast.c +++ b/src/decaf_fast.c @@ -71,6 +71,16 @@ static const decaf_448_scalar_t decaf_448_scalar_r2 = {{{ SC_LIMB(0x3402a939f823b729) }}}; +static const decaf_448_scalar_t decaf_448_scalar_r1 = {{{ + SC_LIMB(0x721cf5b5529eec34), + SC_LIMB(0x7a4cf635c8e9c2ab), + SC_LIMB(0xeec492d944a725bf), + SC_LIMB(0x000000020cd77058), + SC_LIMB(0), + SC_LIMB(0), + SC_LIMB(0) +}}}; + static const decaf_word_t DECAF_MONTGOMERY_FACTOR = (decaf_word_t)(0x3bd440fae918bc5ull); /** base = twist of Goldilocks base point (~,19). */ @@ -611,25 +621,33 @@ void decaf_448_point_copy ( gf_cpy(a->t, b->t); } -decaf_bool_t decaf_448_scalar_decode( +siv decaf_448_scalar_decode_short ( decaf_448_scalar_t s, - const unsigned char ser[DECAF_448_SER_BYTES] + const unsigned char ser[DECAF_448_SER_BYTES], + unsigned int nbytes ) { unsigned int i,j,k=0; for (i=0; ilimb[i] = out; } - +} + +decaf_bool_t decaf_448_scalar_decode( + decaf_448_scalar_t s, + const unsigned char ser[DECAF_448_SER_BYTES] +) { + unsigned int i; + decaf_448_scalar_decode_short(s, ser, DECAF_448_SER_BYTES); decaf_sdword_t accum = 0; for (i=0; ilimb[i] - decaf_448_scalar_p->limb[i]) >> WBITS; } - decaf_448_scalar_mul(s,s,decaf_448_scalar_one); /* ham-handed reduce */ + decaf_448_montmul(s,s,decaf_448_scalar_r1,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR); /* ham-handed reduce */ return accum; } @@ -671,16 +689,21 @@ void decaf_448_scalar_decode_long( } size_t i; - unsigned char tmp[DECAF_448_SER_BYTES] = {0}; decaf_448_scalar_t t1, t2; i = ser_len - (ser_len%DECAF_448_SER_BYTES); if (i==ser_len) i -= DECAF_448_SER_BYTES; - - memcpy(tmp, ser+i, ser_len - i); - ignore_result( decaf_448_scalar_decode(t1, tmp) ); - decaf_bzero(tmp, sizeof(tmp)); + decaf_448_scalar_decode_short(t1, &ser[i], ser_len-i); + + if (ser_len == sizeof(*ser)) { + assert(i==0); + /* ham-handed reduce */ + decaf_448_montmul(s,t1,decaf_448_scalar_r1,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR); + decaf_448_scalar_destroy(t1); + return; + } + while (i) { i -= DECAF_448_SER_BYTES; decaf_448_montmul(t1,t1,decaf_448_scalar_r2,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR); @@ -1075,6 +1098,15 @@ decaf_448_precompute ( extern const decaf_448_scalar_t decaf_448_precomputed_scalarmul_adjustment; +siv constant_time_lookup_niels ( + niels_s *__restrict__ ni, + const niels_t *table, + int nelts, + int idx +) { + constant_time_lookup(ni, table, sizeof(niels_s), nelts, idx); +} + void decaf_448_precomputed_scalarmul ( decaf_448_point_t out, const decaf_448_precomputed_s *table, @@ -1094,7 +1126,7 @@ void decaf_448_precomputed_scalarmul ( for (j=0; j>(t-1))-1; tab ^= invert; tab &= (1<<(t-1)) - 1; - - constant_time_lookup(ni, &table->table[j<<(t-1)], sizeof(ni), 1<<(t-1), tab); + + constant_time_lookup_niels(ni, &table->table[j<<(t-1)], 1<<(t-1), tab); + cond_neg_niels(ni, invert); if (i||j) { add_niels_to_pt(out, ni, j==n-1 && i /* * Constant-time operations on hopefully-compile-time-sized memory @@ -148,7 +149,7 @@ constant_time_lookup ( const unsigned char *table = (const unsigned char *)table_; word_t j,k; - really_memset(out, 0, elem_bytes); + memset(out, 0, elem_bytes); for (j=0; j