From 6546660199372e66f581e8b24483cc555c2a7d9b Mon Sep 17 00:00:00 2001 From: Mike Hamburg Date: Sat, 25 Oct 2014 17:26:05 -0700 Subject: [PATCH] E-521-related changes. Not quite ready yet... This is largely a save-your-work checkin. Created p521/arch_ref64 code to make sure E-521 basically works. Fixed some of the testing code around E-521. It doesn't quite pass everything yet. Created p521/arch_x86_64 code with optimized multiply. In this checkin, the multiply is fast and works, but all the other code in that directory is the completely unoptimized ref64 build which reduces after every add and sub. So the whole thing isn't fast yet. --- src/goldilocks.c | 37 ++- src/include/constant_time.h | 9 +- src/include/word.h | 5 + src/p521/arch_ref64/arch_config.h | 1 + src/p521/arch_ref64/p521.c | 417 +++++++++++++++++++++++++++ src/p521/arch_ref64/p521.h | 244 ++++++++++++++++ src/p521/arch_x86_64/arch_config.h | 1 + src/p521/arch_x86_64/p521.c | 439 +++++++++++++++++++++++++++++ src/p521/arch_x86_64/p521.h | 247 ++++++++++++++++ src/p521/magic.c | 38 ++- src/scalarmul.c | 8 +- test/bench.c | 4 +- test/test_arithmetic.c | 7 +- test/test_pointops.c | 5 + test/test_scalarmul.c | 19 +- 15 files changed, 1455 insertions(+), 26 deletions(-) create mode 100644 src/p521/arch_ref64/arch_config.h create mode 100644 src/p521/arch_ref64/p521.c create mode 100644 src/p521/arch_ref64/p521.h create mode 100644 src/p521/arch_x86_64/arch_config.h create mode 100644 src/p521/arch_x86_64/p521.c create mode 100644 src/p521/arch_x86_64/p521.h diff --git a/src/goldilocks.c b/src/goldilocks.c index 11ccdfb..c3a4ca3 100644 --- a/src/goldilocks.c +++ b/src/goldilocks.c @@ -34,6 +34,27 @@ #define GOLDI_DIVERSIFY_BYTES 8 + +#if FIELD_BYTES <= SHA512_OUTPUT_BYTES +#define FIELD_HASH_BYTES SHA512_OUTPUT_BYTES +#define field_hash_final sha512_final +#else +#define FIELD_HASH_BYTES (SHA512_OUTPUT_BYTES * ((FIELD_BYTES-1)/SHA512_OUTPUT_BYTES + 1)) +static inline void field_hash_final ( + struct sha512_ctx_t *ctx, + unsigned char out[FIELD_HASH_BYTES] +) { + /* SHA PRNG I guess? I really should have used SHAKE */ + int i; + for (i=0; i<= (FIELD_BYTES-1) / SHA512_OUTPUT_BYTES; i++) { + if (i) + sha512_update(ctx, &out[(i-1)*SHA512_OUTPUT_BYTES], SHA512_OUTPUT_BYTES); + sha512_final(ctx, &out[i*SHA512_OUTPUT_BYTES]); + } +} +#endif + + /* These are just unique identifiers */ static const char *G_INITING = "initializing"; static const char *G_INITED = "initialized"; @@ -135,7 +156,7 @@ goldilocks_derive_private_key ( memcpy(&privkey->opaque[2*GOLDI_FIELD_BYTES], proto, GOLDI_SYMKEY_BYTES); - unsigned char skb[SHA512_OUTPUT_BYTES]; + unsigned char skb[FIELD_HASH_BYTES]; word_t sk[GOLDI_FIELD_WORDS]; assert(sizeof(skb) >= sizeof(sk)); @@ -146,9 +167,9 @@ goldilocks_derive_private_key ( sha512_init(&ctx); sha512_update(&ctx, (const unsigned char *)"derivepk", GOLDI_DIVERSIFY_BYTES); sha512_update(&ctx, proto, GOLDI_SYMKEY_BYTES); - sha512_final(&ctx, (unsigned char *)skb); + field_hash_final(&ctx, (unsigned char *)skb); - barrett_deserialize_and_reduce(sk, skb, SHA512_OUTPUT_BYTES, &curve_prime_order); + barrett_deserialize_and_reduce(sk, skb, sizeof(skb), &curve_prime_order); barrett_serialize(privkey->opaque, sk, GOLDI_FIELD_BYTES); scalarmul_fixed_base(&exta, sk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base); @@ -316,13 +337,13 @@ goldilocks_derive_challenge( uint64_t message_len ) { /* challenge = H(pk, [nonceG], message). 
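 * For P-521 the field element is 66 bytes, wider than one SHA-512 digest, so
 * the FIELD_HASH_BYTES / field_hash_final construction added above chains a
 * second SHA-512 block; sizeof(sha_out) then covers a full field element
 * before the Barrett reduction modulo the group order.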
*/ - unsigned char sha_out[SHA512_OUTPUT_BYTES]; + unsigned char sha_out[FIELD_HASH_BYTES]; struct sha512_ctx_t ctx; sha512_init(&ctx); sha512_update(&ctx, pubkey, GOLDI_FIELD_BYTES); sha512_update(&ctx, gnonce, GOLDI_FIELD_BYTES); sha512_update(&ctx, message, message_len); - sha512_final(&ctx, sha_out); + field_hash_final(&ctx, sha_out); barrett_deserialize_and_reduce(challenge, sha_out, sizeof(sha_out), &curve_prime_order); } @@ -346,7 +367,7 @@ goldilocks_sign ( } /* Derive a nonce. TODO: use HMAC. FUTURE: factor. */ - unsigned char sha_out[SHA512_OUTPUT_BYTES]; + unsigned char sha_out[FIELD_HASH_BYTES]; word_t tk[GOLDI_FIELD_WORDS]; struct sha512_ctx_t ctx; sha512_init(&ctx); @@ -354,8 +375,8 @@ goldilocks_sign ( sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES); sha512_update(&ctx, message, message_len); sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES); - sha512_final(&ctx, sha_out); - barrett_deserialize_and_reduce(tk, sha_out, SHA512_OUTPUT_BYTES, &curve_prime_order); + field_hash_final(&ctx, sha_out); + barrett_deserialize_and_reduce(tk, sha_out, sizeof(sha_out), &curve_prime_order); /* 4[nonce]G */ uint8_t signature_tmp[GOLDI_FIELD_BYTES]; diff --git a/src/include/constant_time.h b/src/include/constant_time.h index 686a508..405c2f5 100644 --- a/src/include/constant_time.h +++ b/src/include/constant_time.h @@ -127,7 +127,9 @@ constant_time_cond_swap ( /** * @brief Constant-time equivalent of memcpy(out, table + elem_bytes*idx, elem_bytes); * - * The table must be at least as aligned as elem_bytes. The output must be vector aligned. + * The table must be at least as aligned as elem_bytes. The output must be word aligned, + * and if the input size is vector aligned it must also be vector aligned. + * * The table and output must not alias. 
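 *
 * When elem_bytes is not a multiple of the big-register size, the copy loop
 * falls back to unaligned big-register accesses on both the table and the
 * output, so only word alignment can be assumed there; when it is a multiple,
 * the aligned fast path dereferences big_register_t pointers directly and the
 * output must honor the vector alignment.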
*/ static __inline__ void @@ -151,8 +153,9 @@ constant_time_lookup ( big_register_t br_mask = br_is_zero(big_i); for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) { if (elem_bytes % sizeof(big_register_t)) { - /* input unaligned, output aligned */ - *(big_register_t *)(out+k) |= br_mask & ((const unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned; + /* unaligned */ + ((unaligned_br_t *)(out+k))->unaligned + |= br_mask & ((const unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned; } else { /* aligned */ *(big_register_t *)(out+k) |= br_mask & *(const big_register_t*)(&table[k+j*elem_bytes]); diff --git a/src/include/word.h b/src/include/word.h index 644edc7..31af42c 100644 --- a/src/include/word.h +++ b/src/include/word.h @@ -102,6 +102,7 @@ typedef word_t vecmask_t __attribute__((vector_size(32))); #endif #if __AVX2__ + #define VECTOR_ALIGNED __attribute__((aligned(32))) typedef uint32x8_t big_register_t; typedef uint64x4_t uint64xn_t; typedef uint32x8_t uint32xn_t; @@ -113,6 +114,7 @@ typedef word_t vecmask_t __attribute__((vector_size(32))); return ret; } #elif __SSE2__ + #define VECTOR_ALIGNED __attribute__((aligned(16))) typedef uint32x4_t big_register_t; typedef uint64x2_t uint64xn_t; typedef uint32x4_t uint32xn_t; @@ -124,6 +126,7 @@ typedef word_t vecmask_t __attribute__((vector_size(32))); return ret; } #elif __ARM_NEON__ + #define VECTOR_ALIGNED __attribute__((aligned(16))) typedef uint32x4_t big_register_t; typedef uint64x2_t uint64xn_t; typedef uint32x4_t uint32xn_t; @@ -132,6 +135,7 @@ typedef word_t vecmask_t __attribute__((vector_size(32))); return vdupq_n_u32(x); } #elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__ + #define VECTOR_ALIGNED __attribute__((aligned(8))) typedef uint64_t big_register_t, uint64xn_t; typedef uint32_t uint32xn_t; @@ -140,6 +144,7 @@ typedef word_t vecmask_t __attribute__((vector_size(32))); return (big_register_t)x; } #else + #define VECTOR_ALIGNED __attribute__((aligned(4))) typedef uint64_t uint64xn_t; typedef uint32_t uint32xn_t; typedef uint32_t big_register_t; diff --git a/src/p521/arch_ref64/arch_config.h b/src/p521/arch_ref64/arch_config.h new file mode 100644 index 0000000..58758cc --- /dev/null +++ b/src/p521/arch_ref64/arch_config.h @@ -0,0 +1 @@ +#define WORD_BITS 64 diff --git a/src/p521/arch_ref64/p521.c b/src/p521/arch_ref64/p521.c new file mode 100644 index 0000000..8238699 --- /dev/null +++ b/src/p521/arch_ref64/p521.c @@ -0,0 +1,417 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#include "p521.h" + +static __inline__ __uint128_t widemul( + const uint64_t a, + const uint64_t b +) { + return ((__uint128_t)a) * ((__uint128_t)b); +} + +static __inline__ uint64_t is_zero(uint64_t a) { + /* let's hope the compiler isn't clever enough to optimize this. 
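 * (The 128-bit subtraction makes a == 0 wrap to 2^128 - 1, whose top 64 bits
 * are all ones, so the function returns an all-ones mask for zero and 0 for
 * any nonzero input.)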
*/ + return (((__uint128_t)a)-1)>>64; +} + +void +p521_mul ( + p521_t *__restrict__ cs, + const p521_t *as, + const p521_t *bs +) { + uint64_t *c = cs->limb; + const uint64_t *a = as->limb, *b = bs->limb; + __uint128_t accum0, accum1; + + accum0 = widemul(2*a[8], b[8]); + accum1 = widemul(a[0], b[7]); + accum0 += widemul(a[1], b[6]); + accum1 += widemul(a[2], b[5]); + accum0 += widemul(a[3], b[4]); + accum1 += widemul(a[4], b[3]); + accum0 += widemul(a[5], b[2]); + accum1 += widemul(a[6], b[1]); + accum0 += widemul(a[7], b[0]); + accum1 += accum0; + c[7] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum1 += widemul(a[0], b[8-0]); + accum0 += widemul(a[1], b[8-1]); + accum1 += widemul(a[2], b[8-2]); + accum0 += widemul(a[3], b[8-3]); + accum1 += widemul(a[4], b[8-4]); + accum0 += widemul(a[5], b[8-5]); + accum1 += widemul(a[6], b[8-6]); + accum0 += widemul(a[7], b[8-7]); + accum1 += widemul(a[8], b[8-8]); + accum1 += accum0; + c[8] = accum1 & ((1ull<<57)-1); + accum1 >>= 57; + + accum0 = 0; + accum0 += widemul(a[1], b[0+9-1]); + accum0 += widemul(a[2], b[0+9-2]); + accum0 += widemul(a[3], b[0+9-3]); + accum0 += widemul(a[4], b[0+9-4]); + accum1 += widemul(a[0], b[0-0]); + accum0 += widemul(a[5], b[0+9-5]); + accum0 += widemul(a[6], b[0+9-6]); + accum0 += widemul(a[7], b[0+9-7]); + accum0 += widemul(a[8], b[0+9-8]); + accum1 += accum0 << 1; + c[0] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum0 += widemul(a[2], b[1+9-2]); + accum0 += widemul(a[3], b[1+9-3]); + accum1 += widemul(a[0], b[1-0]); + accum0 += widemul(a[4], b[1+9-4]); + accum0 += widemul(a[5], b[1+9-5]); + accum1 += widemul(a[1], b[1-1]); + accum0 += widemul(a[6], b[1+9-6]); + accum0 += widemul(a[7], b[1+9-7]); + accum0 += widemul(a[8], b[1+9-8]); + accum1 += accum0 << 1; + c[1] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum0 += widemul(a[3], b[2+9-3]); + accum1 += widemul(a[0], b[2-0]); + accum0 += widemul(a[4], b[2+9-4]); + accum0 += widemul(a[5], b[2+9-5]); + accum1 += widemul(a[1], b[2-1]); + accum0 += widemul(a[6], b[2+9-6]); + accum0 += widemul(a[7], b[2+9-7]); + accum1 += widemul(a[2], b[2-2]); + accum0 += widemul(a[8], b[2+9-8]); + accum1 += accum0 << 1; + c[2] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum0 += widemul(a[4], b[3+9-4]); + accum1 += widemul(a[0], b[3-0]); + accum0 += widemul(a[5], b[3+9-5]); + accum1 += widemul(a[1], b[3-1]); + accum0 += widemul(a[6], b[3+9-6]); + accum1 += widemul(a[2], b[3-2]); + accum0 += widemul(a[7], b[3+9-7]); + accum1 += widemul(a[3], b[3-3]); + accum0 += widemul(a[8], b[3+9-8]); + accum1 += accum0 << 1; + c[3] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum1 += widemul(a[0], b[4-0]); + accum0 += widemul(a[5], b[4+9-5]); + accum1 += widemul(a[1], b[4-1]); + accum0 += widemul(a[6], b[4+9-6]); + accum1 += widemul(a[2], b[4-2]); + accum0 += widemul(a[7], b[4+9-7]); + accum1 += widemul(a[3], b[4-3]); + accum0 += widemul(a[8], b[4+9-8]); + accum1 += widemul(a[4], b[4-4]); + accum1 += accum0 << 1; + c[4] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum1 += widemul(a[0], b[5-0]); + accum0 += widemul(a[6], b[5+9-6]); + accum1 += widemul(a[1], b[5-1]); + accum1 += widemul(a[2], b[5-2]); + accum0 += widemul(a[7], b[5+9-7]); + accum1 += widemul(a[3], b[5-3]); + accum1 += widemul(a[4], b[5-4]); + accum0 += widemul(a[8], b[5+9-8]); + accum1 += widemul(a[5], b[5-5]); + accum1 += accum0 << 1; + c[5] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum1 += 
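/* Throughout p521_mul, a[i]*b[j] accumulates into limb (i+j) mod 9.  Limbs
 * 0..7 hold 58 bits and limb 8 holds 57, so 2^(58*9) = 2^522 == 2 mod p:
 * products that wrap past the top (i+j >= 9) are counted twice, via the
 * 2*a[8] operand and the accum0 << 1 recombinations. */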
widemul(a[0], b[6-0]); + accum1 += widemul(a[1], b[6-1]); + accum0 += widemul(a[7], b[6+9-7]); + accum1 += widemul(a[2], b[6-2]); + accum1 += widemul(a[3], b[6-3]); + accum1 += widemul(a[4], b[6-4]); + accum0 += widemul(a[8], b[6+9-8]); + accum1 += widemul(a[5], b[6-5]); + accum1 += widemul(a[6], b[6-6]); + accum1 += accum0 << 1; + c[6] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum1 += c[7]; + c[7] = accum1 & ((1ull<<58)-1); + + c[8] += accum1 >> 58; +} + +void +p521_mulw ( + p521_t *__restrict__ cs, + const p521_t *as, + uint64_t b +) { + const uint64_t *a = as->limb; + uint64_t *c = cs->limb; + + __uint128_t accum0 = 0, accum3 = 0, accum6 = 0; + uint64_t mask = (1ull<<58) - 1; + + int i; + for (i=0; i<3; i++) { + accum0 += widemul(b, a[i]); + accum3 += widemul(b, a[i+3]); + accum6 += widemul(b, a[i+6]); + c[i] = accum0 & mask; accum0 >>= 58; + c[i+3] = accum3 & mask; accum3 >>= 58; + if (i==2) { + c[i+6] = accum6 & (mask>>1); accum6 >>= 57; + } else { + c[i+6] = accum6 & mask; accum6 >>= 58; + } + } + + accum0 += c[3]; + c[3] = accum0 & mask; + c[4] += accum0 >> 58; + + accum3 += c[6]; + c[6] = accum3 & mask; + c[7] += accum3 >> 58; + + accum6 += c[0]; + c[0] = accum6 & mask; + c[1] += accum6 >> 58; +} + +void +p521_sqr ( + p521_t *__restrict__ cs, + const p521_t *as +) { + uint64_t *c = cs->limb; + const uint64_t *a = as->limb; + __uint128_t accum0, accum1; + + accum0 = widemul(a[8], a[8]); + accum1 = widemul(a[0], a[7]); + accum0 += widemul(a[1], a[6]); + accum1 += widemul(a[2], a[5]); + accum0 += widemul(a[3], a[4]); + accum1 += accum0; + c[7] = 2 * (accum1 & ((1ull<<57)-1)); + accum1 >>= 57; + + accum0 = 0; + accum0 = 0; + accum1 += widemul(a[4], a[4]); + accum0 += widemul(a[1], a[7]); + accum1 += widemul(2*a[2], a[6]); + accum0 += widemul(a[3], a[5]); + accum1 += widemul(2*a[0], a[8]); + accum1 += 2*accum0; + c[8] = accum1 & ((1ull<<57)-1); + accum1 >>= 57; + + accum0 = 0; + accum1 += widemul(a[0], a[0]); + accum0 += widemul(a[1], a[8]); + accum0 += widemul(a[2], a[7]); + accum0 += widemul(a[3], a[6]); + accum0 += widemul(a[4], a[5]); + accum1 += accum0 << 2; + c[0] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum0 += widemul(a[2], a[8]); + accum0 += widemul(a[3], a[7]); + accum0 += widemul(a[4], a[6]); + accum0 <<= 1; + accum0 += widemul(a[5], a[5]); + accum0 += widemul(a[0], a[1]); + accum1 += accum0 << 1; + c[1] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum1 += widemul(a[1], a[1]); + + accum0 += widemul(a[3], a[8]); + accum0 += widemul(a[4], a[7]); + accum0 += widemul(a[5], a[6]); + accum0 <<= 1; + accum0 += widemul(a[0], a[2]); + accum1 += accum0 << 1; + c[2] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum0 += widemul(a[6], a[6]); + accum0 += widemul(2*a[5], a[7]); + accum0 += widemul(2*a[4], a[8]); + accum0 += widemul(a[0], a[3]); + accum0 += widemul(a[1], a[2]); + accum1 += accum0 << 1; + c[3] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum0 += widemul(a[6], a[7]); + accum0 += widemul(a[5], a[8]); + accum0 <<= 1; + accum1 += widemul(a[2], a[2]); + accum0 += widemul(a[0], a[4]); + accum0 += widemul(a[1], a[3]); + accum1 += accum0 << 1; + c[4] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum0 += widemul(2*a[6], a[8]); + accum0 += widemul(a[7], a[7]); + accum0 += widemul(a[0], a[5]); + accum0 += widemul(a[1], a[4]); + accum0 += widemul(a[2], a[3]); + accum1 += accum0 << 1; + c[5] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum1 += widemul(a[3], a[3]); + 
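/* As in p521_mul, a[i]*a[j] lands in limb (i+j) mod 9.  Two doublings are at
 * work here: symmetric cross terms (i != j) count twice, and anything that
 * wraps past the 57-bit top limb picks up a further factor of two because
 * 2^522 == 2 mod p; the code applies them either as 2*a[...] operands or via
 * the final shifts into accum1. */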
accum0 += widemul(a[0], a[6]); + accum0 += widemul(a[1], a[5]); + accum0 += widemul(2*a[7], a[8]); + accum0 += widemul(a[2], a[4]); + accum1 += accum0 << 1; + c[6] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum1 += c[7]; + c[7] = accum1 & ((1ull<<58)-1); + + c[8] += accum1 >> 58; +} + +void +p521_strong_reduce ( + p521_t *a +) { + uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1; + + /* first, clear high */ + __int128_t scarry = a->limb[8]>>57; + a->limb[8] &= mask2; + + /* now the total is less than 2p */ + + /* compute total_value - p. No need to reduce mod p. */ + + int i; + for (i=0; i<9; i++) { + scarry = scarry + a->limb[i] - ((i==8) ? mask2 : mask); + a->limb[i] = scarry & ((i==8) ? mask2 : mask); + scarry >>= (i==8) ? 57 : 58; + } + + /* uncommon case: it was >= p, so now scarry = 0 and this = x + * common case: it was < p, so now scarry = -1 and this = x - p + 2^521 + * so let's add back in p. will carry back off the top for 2^521. + */ + + assert(is_zero(scarry) | is_zero(scarry+1)); + + uint64_t scarry_mask = scarry & mask; + __uint128_t carry = 0; + + /* add it back */ + for (i=0; i<9; i++) { + carry = carry + a->limb[i] + ((i==8)?(scarry_mask>>1):scarry_mask); + a->limb[i] = carry & ((i==8) ? mask>>1 : mask); + carry >>= (i==8) ? 57 : 58; + } + + assert(is_zero(carry + scarry)); +} + +mask_t +p521_is_zero ( + const struct p521_t *a +) { + struct p521_t b; + p521_copy(&b,a); + p521_strong_reduce(&b); + + uint64_t any = 0; + int i; + for (i=0; i<9; i++) { + any |= b.limb[i]; + } + return is_zero(any); +} + +void +p521_serialize ( + uint8_t *serial, + const struct p521_t *x +) { + int i,k=0; + p521_t red; + p521_copy(&red, x); + p521_strong_reduce(&red); + + uint64_t r=0; + int bits = 0; + for (i=0; i<9; i++) { + r |= red.limb[i] << bits; + for (bits += 58; bits >= 8; bits -= 8) { + serial[k++] = r; + r >>= 8; + } + assert(bits <= 6); + } + assert(bits); + serial[k++] = r; +} + +mask_t +p521_deserialize ( + p521_t *x, + const uint8_t serial[66] +) { + int i,k=0,bits=0; + __uint128_t out = 0; + uint64_t mask = (1ull<<58)-1; + for (i=0; i<9; i++) { + out >>= 58; + for (; bits<58; bits+=8) { + out |= ((__uint128_t)serial[k++])<limb[i] = out & mask; + bits -= 58; + } + + /* Check for reduction. First, high has to be < 2^57 */ + mask_t good = is_zero(out>>57); + + uint64_t and = -1ull; + for (i=0; i<8; i++) { + and &= x->limb[i]; + } + and &= (2*out+1); + good &= is_zero((and+1)>>58); + + return good; +} diff --git a/src/p521/arch_ref64/p521.h b/src/p521/arch_ref64/p521.h new file mode 100644 index 0000000..c4dbf69 --- /dev/null +++ b/src/p521/arch_ref64/p521.h @@ -0,0 +1,244 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
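 *
 * Reference 64-bit arithmetic for p = 2^521 - 1: a p521_t is nine uint64_t
 * limbs, eight of 58 bits plus a 57-bit top limb.  add/sub/neg, bias and
 * weak_reduce are inline below; mul, mulw, sqr, strong_reduce and the 66-byte
 * little-endian (de)serialization live in p521.c.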
+ */ +#ifndef __P521_H__ +#define __P521_H__ 1 + +#include +#include +#include + +#include "word.h" + +typedef struct p521_t { + uint64_t limb[9]; +} p521_t; + +#ifdef __cplusplus +extern "C" { +#endif + +static __inline__ void +p521_set_ui ( + p521_t *out, + uint64_t x +) __attribute__((unused)); + +static __inline__ void +p521_add ( + p521_t *out, + const p521_t *a, + const p521_t *b +) __attribute__((unused)); + +static __inline__ void +p521_sub ( + p521_t *out, + const p521_t *a, + const p521_t *b +) __attribute__((unused)); + +static __inline__ void +p521_neg ( + p521_t *out, + const p521_t *a +) __attribute__((unused)); + +static __inline__ void +p521_addw ( + p521_t *a, + uint64_t x +) __attribute__((unused)); + +static __inline__ void +p521_subw ( + p521_t *a, + uint64_t x +) __attribute__((unused)); + +static __inline__ void +p521_copy ( + p521_t *out, + const p521_t *a +) __attribute__((unused)); + +static __inline__ void +p521_weak_reduce ( + p521_t *inout +) __attribute__((unused)); + +void +p521_strong_reduce ( + p521_t *inout +); + +mask_t +p521_is_zero ( + const p521_t *in +); + +static __inline__ void +p521_bias ( + p521_t *inout, + int amount +) __attribute__((unused)); + +static __inline__ void +p521_really_bias ( + p521_t *inout, + int amount +) __attribute__((unused)); + +void +p521_mul ( + p521_t *__restrict__ out, + const p521_t *a, + const p521_t *b +); + +void +p521_mulw ( + p521_t *__restrict__ out, + const p521_t *a, + uint64_t b +); + +void +p521_sqr ( + p521_t *__restrict__ out, + const p521_t *a +); + +void +p521_serialize ( + uint8_t *serial, + const struct p521_t *x +); + +mask_t +p521_deserialize ( + p521_t *x, + const uint8_t serial[66] +); + +/* -------------- Inline functions begin here -------------- */ + +void +p521_set_ui ( + p521_t *out, + uint64_t x +) { + int i; + out->limb[0] = x; + for (i=1; i<9; i++) { + out->limb[i] = 0; + } +} + +void +p521_add ( + p521_t *out, + const p521_t *a, + const p521_t *b +) { + unsigned int i; + for (i=0; i<9; i++) { + out->limb[i] = a->limb[i] + b->limb[i]; + } + p521_weak_reduce(out); +} + +void +p521_sub ( + p521_t *out, + const p521_t *a, + const p521_t *b +) { + unsigned int i; + uint64_t co1 = ((1ull<<58)-1)*4, co2 = ((1ull<<57)-1)*4; + for (i=0; i<9; i++) { + out->limb[i] = a->limb[i] - b->limb[i] + ((i==8) ? co2 : co1); + } + p521_weak_reduce(out); +} + +void +p521_neg ( + struct p521_t *out, + const p521_t *a +) { + unsigned int i; + uint64_t co1 = ((1ull<<58)-1)*4, co2 = ((1ull<<57)-1)*4; + for (i=0; i<9; i++) { + out->limb[i] = ((i==8) ? co2 : co1) - a->limb[i]; + } + p521_weak_reduce(out); +} + +void +p521_addw ( + p521_t *a, + uint64_t x +) { + a->limb[0] += x; + a->limb[1] += a->limb[0]>>58; + a->limb[0] &= (1ull<<58)-1; +} + +void +p521_subw ( + p521_t *a, + uint64_t x +) { + a->limb[0] -= x; + p521_really_bias(a, 1); + p521_weak_reduce(a); +} + +void +p521_copy ( + p521_t *out, + const p521_t *a +) { + memcpy(out,a,sizeof(*a)); +} + +void +p521_really_bias ( + p521_t *a, + int amt +) { + uint64_t co1 = ((1ull<<58)-1)*2*amt, co2 = ((1ull<<57)-1)*2*amt; + int i; + for (i=0; i<9; i++) { + a->limb[i] += (i==8) ? co2 : co1; + } +} + +void +p521_bias ( + p521_t *a, + int amt +) { + (void) a; + (void) amt; +} + +void +p521_weak_reduce ( + p521_t *a +) { + uint64_t mask = (1ull<<58) - 1; + uint64_t tmp = a->limb[8] >> 57; + int i; + for (i=8; i>0; i--) { + a->limb[i] = (a->limb[i] & ((i==8) ? 
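/* weak_reduce trims each limb to its nominal width (58 bits, 57 for limb 8)
 * and pushes the excess into the next limb up; the bits shifted off the top
 * limb re-enter at limb 0, since 2^521 == 1 mod p. */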
mask>>1 : mask)) + (a->limb[i-1]>>58); + } + a->limb[0] = (a->limb[0] & mask) + tmp; +} + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __P521_H__ */ diff --git a/src/p521/arch_x86_64/arch_config.h b/src/p521/arch_x86_64/arch_config.h new file mode 100644 index 0000000..58758cc --- /dev/null +++ b/src/p521/arch_x86_64/arch_config.h @@ -0,0 +1 @@ +#define WORD_BITS 64 diff --git a/src/p521/arch_x86_64/p521.c b/src/p521/arch_x86_64/p521.c new file mode 100644 index 0000000..55d82fc --- /dev/null +++ b/src/p521/arch_x86_64/p521.c @@ -0,0 +1,439 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#include "p521.h" + +typedef uint64x4_t uint64x3_t; /* fit it in a vector register */ +static const uint64x3_t mask58 = { (1ull<<58) - 1, (1ull<<58) - 1, (1ull<<58) - 1, 0 }; + +typedef struct { + uint64x3_t lo, hi; +} hexad_t; + +/* Currently requires CLANG. Sorry. */ +static inline uint64x3_t timesW (uint64x3_t u) { + return u.zxyw + u.zwww; +} + +/* Store three vectors. Currently requries AVX2 (TODO: remove) */ +static const uint64x4_t ls_mask_3 = { -1ull, -1ull, -1ull, 0 }; +static void store3 (uint64_t *x, uint64x3_t v) { + _mm256_maskstore_epi64((long long *) x, ls_mask_3, v); +} + +static __inline__ uint64_t is_zero(uint64_t a) { + /* let's hope the compiler isn't clever enough to optimize this. */ + return (((__uint128_t)a)-1)>>64; +} + +static __inline__ __uint128_t widemul( + const uint64_t a, + const uint64_t b +) { + return ((__uint128_t)a) * ((__uint128_t)b); +} + +static inline __uint128_t widemulu(const uint64_t a, const uint64_t b) { + return ((__uint128_t)(a)) * b; +} + +static inline __int128_t widemuls(const int64_t a, const int64_t b) { + return ((__int128_t)(a)) * b; +} + +/* This is a trick to prevent terrible register allocation by hiding things from clang's optimizer */ +static inline uint64_t opacify(uint64_t x) { + __asm__ volatile("" : "+r"(x)); + return x; +} + +static inline void hexad_mul ( + hexad_t *hex, + const uint64_t *a, + const uint64_t *b +) { + __uint128_t xu, xv, xw; + + uint64_t tmp = opacify(a[2]); + xw = widemulu(tmp, b[0]); + tmp <<= 1; + xu = widemulu(tmp, b[1]); + xv = widemulu(tmp, b[2]); + + tmp = opacify(a[1]); + xw += widemulu(tmp, b[1]); + xv += widemulu(tmp, b[0]); + tmp <<= 1; + xu += widemulu(tmp, b[2]); + + tmp = opacify(a[0]); + xu += widemulu(tmp, b[0]); + xv += widemulu(tmp, b[1]); + xw += widemulu(tmp, b[2]); + + uint64x3_t + lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 }, + hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 }; + + hi = hi<<6 | lo>>58; + lo &= mask58; + + hex->lo = lo; + hex->hi = hi; +} + +static inline void hexad_mul_signed ( + hexad_t *hex, + const int64_t *a, + const int64_t *b +) { + __int128_t xu, xv, xw; + + int64_t tmp = opacify(a[2]); + xw = widemuls(tmp, b[0]); + tmp <<= 1; + xu = widemuls(tmp, b[1]); + xv = widemuls(tmp, b[2]); + + tmp = opacify(a[1]); + xw += widemuls(tmp, b[1]); + xv += widemuls(tmp, b[0]); + tmp <<= 1; + xu += widemuls(tmp, b[2]); + + tmp = opacify(a[0]); + xu += widemuls(tmp, b[0]); + xv += widemuls(tmp, b[1]); + xw += widemuls(tmp, b[2]); + + uint64x3_t + lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 }, + hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 }; + + hi = hi<<6 | lo>>58; + lo &= mask58; + + hex->lo = lo; + hex->hi = hi; +} + +static inline void hexad_sqr ( + hexad_t *hex, + const uint64_t *a +) { + __uint128_t xu, xv, xw; + 
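/* Same shape as hexad_mul above, specialized for squaring.  The three inputs
 * are one group of the 3x3-transposed representation, i.e. limbs 2^174 apart,
 * and xu/xv/xw become the three coefficients of the product reduced modulo
 * Y^3 == 2 with Y = 2^174 (valid because Y^3 = 2^522 == 2 mod p).  The result
 * is then split at 58 bits into hex->lo / hex->hi for the caller to fold. */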
+ int64_t tmp = a[2]; + tmp <<= 1; + xw = widemulu(tmp, a[0]); + xv = widemulu(tmp, a[2]); + tmp <<= 1; + xu = widemulu(tmp, a[1]); + + tmp = a[1]; + xw += widemulu(tmp, a[1]); + tmp <<= 1; + xv += widemulu(tmp, a[0]); + + tmp = a[0]; + xu += widemulu(tmp, a[0]); + + uint64x3_t + lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 }, + hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 }; + + hi = hi<<6 | lo>>58; + lo &= mask58; + + hex->lo = lo; + hex->hi = hi; +} + +static inline void hexad_sqr_signed ( + hexad_t *hex, + const int64_t *a +) { + __uint128_t xu, xv, xw; + + int64_t tmp = a[2]; + tmp <<= 1; + xw = widemuls(tmp, a[0]); + xv = widemuls(tmp, a[2]); + tmp <<= 1; + xu = widemuls(tmp, a[1]); + + tmp = a[1]; + xw += widemuls(tmp, a[1]); + tmp <<= 1; + xv += widemuls(tmp, a[0]); + + tmp = a[0]; + xu += widemuls(tmp, a[0]); + + uint64x3_t + lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 }, + hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 }; + + hi = hi<<6 | lo>>58; + lo &= mask58; + + hex->lo = lo; + hex->hi = hi; +} + + + +void +p521_mul ( + p521_t *__restrict__ cs, + const p521_t *as, + const p521_t *bs +) { + uint64_t *c = cs->limb; + const uint64_t *a = as->limb, *b = bs->limb; + + hexad_t ad, be, cf, abde, bcef, acdf; + hexad_mul(&ad, &a[0], &b[0]); + hexad_mul(&be, &a[3], &b[3]); + hexad_mul(&cf, &a[6], &b[6]); + + uint64_t amt = 32; + uint64x3_t vhi = { amt*((1ull<<58)-1), amt*((1ull<<58)-1), amt*((1ull<<58)-1), 0 }, + vhi2 = { 0, 0, -amt<<57, 0 }; + + uint64x3_t t0 = cf.lo + be.hi, t1 = ad.lo + timesW(cf.hi) + vhi, t2 = ad.hi + be.lo; + + int64_t ta[3], tb[3]; + // it seems to be faster not to vectorize these loops + for (int i=0; i<3; i++) { + ta[i] = a[i]-a[i+3]; + tb[i] = b[i]-b[i+3]; + } + hexad_mul_signed(&abde,ta,tb); + + for (int i=0; i<3; i++) { + ta[i] = a[i+3]-a[i+6]; + tb[i] = b[i+3]-b[i+6]; + } + hexad_mul_signed(&bcef,ta,tb); + + for (int i=0; i<3; i++) { + ta[i] = a[i]-a[i+6]; + tb[i] = b[i]-b[i+6]; + } + hexad_mul_signed(&acdf,ta,tb); + + uint64x3_t ot0 = t1 + timesW(t0 + t2 - acdf.hi - bcef.lo); + uint64x3_t ot1 = t1 + t2 - abde.lo + timesW(t0 - bcef.hi); + uint64x3_t ot2 = t1 + t2 + t0 - abde.hi - acdf.lo + vhi2; + + uint64x3_t out0 = (ot0 & mask58) + timesW(ot2>>58); + uint64x3_t out1 = (ot1 & mask58) + (ot0>>58); + uint64x3_t out2 = (ot2 & mask58) + (ot1>>58); + + store3(&c[0], out0); + store3(&c[3], out1); + store3(&c[6], out2); +} + + +void +p521_sqr ( + p521_t *__restrict__ cs, + const p521_t *as +) { + uint64_t *c = cs->limb; + const uint64_t *a = as->limb; + + hexad_t ad, be, cf, abde, bcef, acdf; + hexad_sqr(&ad, &a[0]); + hexad_sqr(&be, &a[3]); + hexad_sqr(&cf, &a[6]); + + uint64_t amt = 32; + uint64x3_t vhi = { amt*((1ull<<58)-1), amt*((1ull<<58)-1), amt*((1ull<<58)-1), 0 }, + vhi2 = { 0, 0, -amt<<57, 0 }; + + uint64x3_t t0 = cf.lo + be.hi, t1 = ad.lo + timesW(cf.hi) + vhi, t2 = ad.hi + be.lo; + + int64_t ta[3]; + // it seems to be faster not to vectorize these loops + for (int i=0; i<3; i++) { + ta[i] = a[i]-a[i+3]; + } + hexad_sqr_signed(&abde,ta); + + for (int i=0; i<3; i++) { + ta[i] = a[i+3]-a[i+6]; + } + hexad_sqr_signed(&bcef,ta); + + for (int i=0; i<3; i++) { + ta[i] = a[i]-a[i+6]; + } + hexad_sqr_signed(&acdf,ta); + + uint64x3_t ot0 = t1 + timesW(t0 + t2 - acdf.hi - bcef.lo); + uint64x3_t ot1 = t1 + t2 - abde.lo + timesW(t0 - bcef.hi); + uint64x3_t ot2 = t1 + t2 + t0 - abde.hi - acdf.lo + vhi2; + + uint64x3_t out0 = (ot0 & mask58) + timesW(ot2>>58); + uint64x3_t out1 = (ot1 & mask58) + 
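/* Final recombination, as in p521_mul above: the vhi/vhi2 constants add a
 * multiple of p so the signed Karatsuba differences cannot drive a lane
 * negative, each lane is then split at 58 bits, and the carry off the top
 * group is folded back into the bottom group through timesW (the carry off
 * the very top limb gets doubled, since 2^522 == 2 mod p). */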
(ot0>>58); + uint64x3_t out2 = (ot2 & mask58) + (ot1>>58); + + store3(&c[0], out0); + store3(&c[3], out1); + store3(&c[6], out2); +} + +void +p521_mulw ( + p521_t *__restrict__ cs, + const p521_t *as, + uint64_t b +) { + const uint64_t *a = as->limb; + uint64_t *c = cs->limb; + + __uint128_t accum0 = 0, accum3 = 0, accum6 = 0; + uint64_t mask = (1ull<<58) - 1; + + int i; + for (i=0; i<3; i++) { + accum0 += widemul(b, a[LIMBPERM(i)]); + accum3 += widemul(b, a[LIMBPERM(i+3)]); + accum6 += widemul(b, a[LIMBPERM(i+6)]); + c[LIMBPERM(i)] = accum0 & mask; accum0 >>= 58; + c[LIMBPERM(i+3)] = accum3 & mask; accum3 >>= 58; + if (i==2) { + c[LIMBPERM(i+6)] = accum6 & (mask>>1); accum6 >>= 57; + } else { + c[LIMBPERM(i+6)] = accum6 & mask; accum6 >>= 58; + } + } + + accum0 += c[LIMBPERM(3)]; + c[LIMBPERM(3)] = accum0 & mask; + c[LIMBPERM(4)] += accum0 >> 58; + + accum3 += c[LIMBPERM(6)]; + c[LIMBPERM(6)] = accum3 & mask; + c[LIMBPERM(7)] += accum3 >> 58; + + accum6 += c[LIMBPERM(0)]; + c[LIMBPERM(0)] = accum6 & mask; + c[LIMBPERM(1)] += accum6 >> 58; +} + + +void +p521_strong_reduce ( + p521_t *a +) { + uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1; + + /* first, clear high */ + __int128_t scarry = a->limb[LIMBPERM(8)]>>57; + a->limb[LIMBPERM(8)] &= mask2; + + /* now the total is less than 2p */ + + /* compute total_value - p. No need to reduce mod p. */ + + int i; + for (i=0; i<9; i++) { + scarry = scarry + a->limb[LIMBPERM(i)] - ((i==8) ? mask2 : mask); + a->limb[LIMBPERM(i)] = scarry & ((i==8) ? mask2 : mask); + scarry >>= (i==8) ? 57 : 58; + } + + /* uncommon case: it was >= p, so now scarry = 0 and this = x + * common case: it was < p, so now scarry = -1 and this = x - p + 2^521 + * so let's add back in p. will carry back off the top for 2^521. + */ + + assert(is_zero(scarry) | is_zero(scarry+1)); + + uint64_t scarry_mask = scarry & mask; + __uint128_t carry = 0; + + /* add it back */ + for (i=0; i<9; i++) { + carry = carry + a->limb[LIMBPERM(i)] + ((i==8)?(scarry_mask>>1):scarry_mask); + a->limb[LIMBPERM(i)] = carry & ((i==8) ? mask>>1 : mask); + carry >>= (i==8) ? 57 : 58; + } + + assert(is_zero(carry + scarry)); +} + +mask_t +p521_is_zero ( + const struct p521_t *a +) { + struct p521_t b; + p521_copy(&b,a); + p521_strong_reduce(&b); + + uint64_t any = 0; + int i; + for (i=0; i<9; i++) { + any |= b.limb[i]; + } + return is_zero(any); +} + +void +p521_serialize ( + uint8_t *serial, + const struct p521_t *x +) { + int i,k=0; + p521_t red; + p521_copy(&red, x); + p521_strong_reduce(&red); + + uint64_t r=0; + int bits = 0; + for (i=0; i<9; i++) { + r |= red.limb[LIMBPERM(i)] << bits; + for (bits += 58; bits >= 8; bits -= 8) { + serial[k++] = r; + r >>= 8; + } + assert(bits <= 6); + } + assert(bits); + serial[k++] = r; +} + +mask_t +p521_deserialize ( + p521_t *x, + const uint8_t serial[LIMBPERM(66)] +) { + int i,k=0,bits=0; + __uint128_t out = 0; + uint64_t mask = (1ull<<58)-1; + for (i=0; i<9; i++) { + out >>= 58; + for (; bits<58; bits+=8) { + out |= ((__uint128_t)serial[k++])<limb[LIMBPERM(i)] = out & mask; + bits -= 58; + } + + /* Check for reduction. First, high has to be < 2^57 */ + mask_t good = is_zero(out>>57); + + uint64_t and = -1ull; + for (i=0; i<8; i++) { + and &= x->limb[i]; + } + and &= (2*out+1); + good &= is_zero((and+1)>>58); + + return good; +} diff --git a/src/p521/arch_x86_64/p521.h b/src/p521/arch_x86_64/p521.h new file mode 100644 index 0000000..c173b9e --- /dev/null +++ b/src/p521/arch_x86_64/p521.h @@ -0,0 +1,247 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. 
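 *
 * This backend keeps the nine limbs in 3x3-transposed order: logical limb i
 * is stored at index LIMBPERM(i) = (i%3)*3 + i/3, giving the memory order
 * l0,l3,l6, l1,l4,l7, l2,l5,l8, so each three-limb hexad group holds limbs
 * 2^174 apart.  The constants in magic.c are laid out in this order when
 * USE_P521_3x3_TRANSPOSE is defined.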
+ * Released under the MIT License. See LICENSE.txt for license information. + */ +#ifndef __P521_H__ +#define __P521_H__ 1 + +#include +#include +#include + +#include "word.h" + +#define LIMBPERM(x) (((x)%3)*3 + (x)/3) +#define USE_P521_3x3_TRANSPOSE + +typedef struct p521_t { + uint64_t limb[9]; +} p521_t; + +#ifdef __cplusplus +extern "C" { +#endif + +static __inline__ void +p521_set_ui ( + p521_t *out, + uint64_t x +) __attribute__((unused)); + +static __inline__ void +p521_add ( + p521_t *out, + const p521_t *a, + const p521_t *b +) __attribute__((unused)); + +static __inline__ void +p521_sub ( + p521_t *out, + const p521_t *a, + const p521_t *b +) __attribute__((unused)); + +static __inline__ void +p521_neg ( + p521_t *out, + const p521_t *a +) __attribute__((unused)); + +static __inline__ void +p521_addw ( + p521_t *a, + uint64_t x +) __attribute__((unused)); + +static __inline__ void +p521_subw ( + p521_t *a, + uint64_t x +) __attribute__((unused)); + +static __inline__ void +p521_copy ( + p521_t *out, + const p521_t *a +) __attribute__((unused)); + +static __inline__ void +p521_weak_reduce ( + p521_t *inout +) __attribute__((unused)); + +void +p521_strong_reduce ( + p521_t *inout +); + +mask_t +p521_is_zero ( + const p521_t *in +); + +static __inline__ void +p521_bias ( + p521_t *inout, + int amount +) __attribute__((unused)); + +static __inline__ void +p521_really_bias ( + p521_t *inout, + int amount +) __attribute__((unused)); + +void +p521_mul ( + p521_t *__restrict__ out, + const p521_t *a, + const p521_t *b +); + +void +p521_mulw ( + p521_t *__restrict__ out, + const p521_t *a, + uint64_t b +); + +void +p521_sqr ( + p521_t *__restrict__ out, + const p521_t *a +); + +void +p521_serialize ( + uint8_t *serial, + const struct p521_t *x +); + +mask_t +p521_deserialize ( + p521_t *x, + const uint8_t serial[66] +); + +/* -------------- Inline functions begin here -------------- */ + +void +p521_set_ui ( + p521_t *out, + uint64_t x +) { + int i; + out->limb[0] = x; + for (i=1; i<9; i++) { + out->limb[i] = 0; + } +} + +void +p521_add ( + p521_t *out, + const p521_t *a, + const p521_t *b +) { + unsigned int i; + for (i=0; i<9; i++) { + out->limb[i] = a->limb[i] + b->limb[i]; + } + p521_weak_reduce(out); +} + +void +p521_sub ( + p521_t *out, + const p521_t *a, + const p521_t *b +) { + unsigned int i; + uint64_t co1 = ((1ull<<58)-1)*4, co2 = ((1ull<<57)-1)*4; + for (i=0; i<9; i++) { + out->limb[i] = a->limb[i] - b->limb[i] + ((i==8) ? co2 : co1); + } + p521_weak_reduce(out); +} + +void +p521_neg ( + struct p521_t *out, + const p521_t *a +) { + unsigned int i; + uint64_t co1 = ((1ull<<58)-1)*4, co2 = ((1ull<<57)-1)*4; + for (i=0; i<9; i++) { + out->limb[i] = ((i==8) ? co2 : co1) - a->limb[i]; + } + p521_weak_reduce(out); +} + +void +p521_addw ( + p521_t *a, + uint64_t x +) { + a->limb[0] += x; + a->limb[LIMBPERM(1)] += a->limb[0]>>58; + a->limb[0] &= (1ull<<58)-1; +} + +void +p521_subw ( + p521_t *a, + uint64_t x +) { + a->limb[0] -= x; + p521_really_bias(a, 1); + p521_weak_reduce(a); +} + +void +p521_copy ( + p521_t *out, + const p521_t *a +) { + memcpy(out,a,sizeof(*a)); +} + +void +p521_really_bias ( + p521_t *a, + int amt +) { + uint64_t co1 = ((1ull<<58)-1)*2*amt, co2 = ((1ull<<57)-1)*2*amt; + int i; + for (i=0; i<9; i++) { + a->limb[i] += (i==8) ? 
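/* really_bias adds 2*amt copies of p limb-wise (2^58-1 to limbs 0..7 and
 * 2^57-1 to limb 8), keeping limbs nonnegative across subtractions; p521_bias
 * itself is a no-op in this representation because sub/neg already fold a 4p
 * bias into their co1/co2 constants. */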
co2 : co1; + } +} + +void +p521_bias ( + p521_t *a, + int amt +) { + (void) a; + (void) amt; +} + +void +p521_weak_reduce ( + p521_t *a +) { + uint64_t mask = (1ull<<58) - 1; + uint64_t tmp = a->limb[8] >> 57; + int i; + for (i=8; i>0; i--) { + a->limb[LIMBPERM(i)] = (a->limb[LIMBPERM(i)] & ((i==8) ? mask>>1 : mask)) + (a->limb[LIMBPERM(i-1)]>>58); + } + a->limb[0] = (a->limb[0] & mask) + tmp; +} + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __P521_H__ */ diff --git a/src/p521/magic.c b/src/p521/magic.c index 7a34886..75d93c0 100644 --- a/src/p521/magic.c +++ b/src/p521/magic.c @@ -18,13 +18,13 @@ const uint8_t FIELD_MODULUS[FIELD_BYTES] = { const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = { U64LE(0xbf15dbca0ae7f294), - U60LE(0x04273ba96570e0ba), - U60LE(0xc94750a1813ac0fb), - U60LE(0xea4939b8b9037a08), - U60LE(0x0000000000000002), - U60LE(0x0000000000000000), - U60LE(0x0000000000000000), - U60LE(0x0000000000000000), + U64LE(0x04273ba96570e0ba), + U64LE(0xc94750a1813ac0fb), + U64LE(0xea4939b8b9037a08), + U64LE(0x0000000000000002), + U64LE(0x0000000000000000), + U64LE(0x0000000000000000), + U64LE(0x0000000000000000), 0x80, U64LE(0x7e2bb79415cfe529), @@ -40,6 +40,17 @@ const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = { const struct affine_t goldilocks_base_point = { {{ +#ifdef USE_P521_3x3_TRANSPOSE + U58LE(0x02a940a2f19ba6c), + U58LE(0x3331c90d2c6ba52), + U58LE(0x2878a3bfd9f42fc), + U58LE(0x03ec4cd920e2a8c), + U58LE(0x0c6203913f6ecc5), + U58LE(0x06277e432c8a5ac), + U58LE(0x1d568fc99c6059d), + U58LE(0x1b2063b22fcf270), + U58LE(0x0752cb45c48648b) +#else U58LE(0x02a940a2f19ba6c), U58LE(0x03ec4cd920e2a8c), U58LE(0x1d568fc99c6059d), @@ -49,6 +60,7 @@ const struct affine_t goldilocks_base_point = { U58LE(0x2878a3bfd9f42fc), U58LE(0x06277e432c8a5ac), U58LE(0x0752cb45c48648b) +#endif }}, {{ 12 }} }; @@ -69,6 +81,17 @@ const struct barrett_prime_t curve_prime_order = { const struct field_t sqrt_d_minus_1 = {{ +#ifdef USE_P521_3x3_TRANSPOSE + U58LE(0x1e2be72c1c81990), + U58LE(0x207dfc238a33e46), + U58LE(0x2264cfb418c4c30), + U58LE(0x1135002ad596c69), + U58LE(0x0e30107cd79d1f6), + U58LE(0x0524b9e715937f5), + U58LE(0x2ab3a257a22666d), + U58LE(0x2d80cc2936a1824), + U58LE(0x0a9ea3ac10d6aed) +#else U58LE(0x1e2be72c1c81990), U58LE(0x1135002ad596c69), U58LE(0x2ab3a257a22666d), @@ -78,4 +101,5 @@ sqrt_d_minus_1 = {{ U58LE(0x2264cfb418c4c30), U58LE(0x0524b9e715937f5), U58LE(0x0a9ea3ac10d6aed) +#endif }}; diff --git a/src/scalarmul.c b/src/scalarmul.c index 502dd3f..b85a42c 100644 --- a/src/scalarmul.c +++ b/src/scalarmul.c @@ -163,7 +163,9 @@ scalarmul ( copy_tw_extensible(&tabulator, working); double_tw_extensible(&tabulator); - struct tw_pniels_t pn, multiples[NTABLE]; + struct tw_pniels_t + pn VECTOR_ALIGNED, + multiples[NTABLE] VECTOR_ALIGNED; convert_tw_extensible_to_tw_pniels(&pn, &tabulator); convert_tw_extensible_to_tw_pniels(&multiples[0], working); @@ -225,7 +227,9 @@ scalarmul_vlook ( copy_tw_extensible(&tabulator, working); double_tw_extensible(&tabulator); - struct tw_pniels_t pn, multiples[NTABLE]; + struct tw_pniels_t + pn VECTOR_ALIGNED, + multiples[NTABLE] VECTOR_ALIGNED; convert_tw_extensible_to_tw_pniels(&pn, &tabulator); convert_tw_extensible_to_tw_pniels(&multiples[0], working); diff --git a/test/bench.c b/test/bench.c index 399337d..69a5dd0 100644 --- a/test/bench.c +++ b/test/bench.c @@ -535,11 +535,11 @@ int main(int argc, char **argv) { printf("%02x", hsk.opaque[i]); } printf("\nss1 = "); - for (i=0; ilimb[0]); i++) { - int radix_bits 
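/* radix_bits now rounds up (58 for the nine-limb p521 representation) instead
 * of truncating to 57, so the per-limb bound check below measures limbs
 * against the radix the field code actually uses. */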
= sizeof(x->limb[0]) * FIELD_BITS / sizeof(*x); + int radix_bits = 1 + (sizeof(x->limb[0]) * FIELD_BITS - 1) / sizeof(*x); word_t yardstick = (i==sizeof(*x)/sizeof(x->limb[0])/2) ? (1ull<limb[i] < yardstick * lowBound || x->limb[i] > yardstick * highBound) { @@ -184,6 +184,11 @@ static mask_t test_isr ( return succ; } +void dbg_gmp_printf(const mpz_t x); +void dbg_gmp_printf(const mpz_t x) { + gmp_printf("DEBUG: 0x%Zx\n", x); +} + int test_arithmetic (void) { int j, ntests = 100000; diff --git a/test/test_pointops.c b/test/test_pointops.c index 6d4230d..4f29868 100644 --- a/test/test_pointops.c +++ b/test/test_pointops.c @@ -270,6 +270,11 @@ int test_pointops (void) { for (i=0; i<1000; i++) { uint8_t ser[FIELD_BYTES]; crandom_generate(&crand, ser, sizeof(ser)); + + + #if (FIELD_BITS % 8) + ser[FIELD_BYTES-1] &= (1<<(FIELD_BITS%8)) - 1; + #endif /* TODO: we need a field generate, which can return random or pathological. */ mask_t succ = field_deserialize(&serf, ser); diff --git a/test/test_scalarmul.c b/test/test_scalarmul.c index 80636cf..b1a9c41 100644 --- a/test/test_scalarmul.c +++ b/test/test_scalarmul.c @@ -283,6 +283,19 @@ single_scalarmul_commutativity_test ( } } +static void crandom_generate_f(struct crandom_state_t *crand, uint8_t *scalar, int n) { + crandom_generate(crand, scalar, n); + int i; + for (i = FIELD_BYTES; i= FIELD_BYTES) { + scalar[FIELD_BYTES-1] &= (1<<(FIELD_BITS%8)) - 1; + } +#endif +} + int test_scalarmul_commutativity (void) { int i,j,k,got; @@ -296,7 +309,7 @@ int test_scalarmul_commutativity (void) { for (k=0; k<128 && !got; k++) { uint8_t ser[FIELD_BYTES]; word_t scalar1[SCALAR_WORDS], scalar2[SCALAR_WORDS]; - crandom_generate(&crand, ser, sizeof(ser)); + crandom_generate_f(&crand, ser, sizeof(ser)); crandom_generate(&crand, (uint8_t *)scalar1, sizeof(scalar1)); crandom_generate(&crand, (uint8_t *)scalar2, sizeof(scalar2)); @@ -338,7 +351,7 @@ int test_linear_combo (void) { crandom_generate(&crand, (uint8_t *)scalar2, sizeof(scalar2)); field_t base1; - crandom_generate(&crand, ser, sizeof(ser)); + crandom_generate_f(&crand, ser, sizeof(ser)); mask_t succ = field_deserialize(&base1, ser); if (!succ) continue; @@ -377,7 +390,7 @@ int test_scalarmul_compatibility (void) { for (k=0; k<128 && !got; k++) { uint8_t ser[FIELD_BYTES]; word_t scalar[SCALAR_WORDS]; - crandom_generate(&crand, ser, sizeof(ser)); + crandom_generate_f(&crand, ser, sizeof(ser)); crandom_generate(&crand, (uint8_t *)scalar, sizeof(scalar)); field_t base;
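
For reference, a standalone sketch (not part of the patch, illustrative only) of two invariants the new p521 code relies on: the limb schedule of nine limbs, eight of 58 bits plus a 57-bit top limb, totalling 521 bits and 66 serialized bytes, and the all-ones-mask behaviour of the is_zero() helper. It assumes a GCC/Clang-style __uint128_t, as the patch itself does.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors is_zero() in p521.c: returns an all-ones mask iff a == 0. */
static uint64_t is_zero_mask(uint64_t a) {
    return (uint64_t)((((__uint128_t)a) - 1) >> 64);
}

int main(void) {
    /* 8*58 + 57 = 521 bits, which serializes to ceil(521/8) = 66 bytes. */
    int bits = 0, i;
    for (i = 0; i < 9; i++)
        bits += (i == 8) ? 57 : 58;
    assert(bits == 521);
    assert((bits + 7) / 8 == 66);

    /* 0 maps to the all-ones mask, any nonzero value maps to 0. */
    assert(is_zero_mask(0) == UINT64_MAX);
    assert(is_zero_mask(7) == 0);

    printf("9 limbs carry %d bits -> %d serialized bytes\n", bits, (bits + 7) / 8);
    return 0;
}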