diff --git a/HISTORY.txt b/HISTORY.txt
index 3e5f946..702513e 100644
--- a/HISTORY.txt
+++ b/HISTORY.txt
@@ -1,3 +1,46 @@
+May 3, 2014:
+    Minor changes to internal routines mean that this version is not
+    compatible with the previous one.
+
+    Added ARM NEON code.
+
+    Added the ability to precompute multiples of a partner's public key. This
+    takes slightly longer than a signature verification, but reduces future
+    verifications with the precomputed key by ~63% and ECDH by ~70%.
+
+        goldilocks_precompute_public_key
+        goldilocks_destroy_precomputed_public_key
+        goldilocks_verify_precomputed
+        goldilocks_shared_secret_precomputed
+
+    The precomputation feature is protected by a macro
+        GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
+    which can be #defined to 0 to compile these functions out. Unlike most
+    of Goldilocks' functions, goldilocks_precompute_public_key uses malloc()
+    (and goldilocks_destroy_precomputed_public_key uses free()).
+
+    Changed private keys to be derived from just the symmetric part. This
+    means that you can compress them to 32 bytes for cold storage, or derive
+    keypairs from crypto secrets from other systems.
+        goldilocks_derive_private_key
+        goldilocks_underive_private_key
+        goldilocks_private_to_public
+
+    Fixed a number of bugs related to vector alignment on Sandy Bridge, which
+    has AVX but uses SSE2 alignment (because it doesn't have AVX2). Maybe I
+    should just switch it to use AVX2 alignment?
+
+    Beginning to factor out curve-specific magic, so as to build other curves
+    with the Goldilocks framework. That would enable fair tests against eg
+    E-521, Ed25519 etc. Still would be a lot of work.
+
+    More thorough testing of arithmetic. Now uses GMP for the testing framework,
+    but not in the actual library.
+
+    Added some high-level tests for the whole library, including some (bs)
+    negative testing. Obviously, effective negative testing is a very difficult
+    proposition in a crypto library.
+
 March 29, 2014:
     Added a test directory with various tests. Currently testing SHA512
     Monte Carlo, compatibility of the different scalarmul functions, and some
diff --git a/Makefile b/Makefile
index 3e03193..49a1e3f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,23 +1,57 @@
 # Copyright (c) 2014 Cryptography Research, Inc.
 # Released under the MIT License. See LICENSE.txt for license information.
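Illustrative usage sketch (not patch content) of the derivation and precomputation entry points listed in the history entry above, written against the declarations this change adds to include/goldilocks.h; error handling is collapsed to GOLDI_EOK checks.

#include <stdint.h>
#include "goldilocks.h"

/* Derive a keypair from a 32-byte cold-storage secret, then verify a
 * signature against a precomputed copy of the resulting public key. */
static int verify_with_precomputed_key (
    const unsigned char proto[GOLDI_SYMKEY_BYTES],
    const uint8_t signature[GOLDI_SIGNATURE_BYTES],
    const uint8_t *message,
    uint64_t message_len
) {
    struct goldilocks_private_key_t priv;
    struct goldilocks_public_key_t pub;
    struct goldilocks_precomputed_public_key_t *pre;
    int err, ret;

    err = goldilocks_init();
    if (err != GOLDI_EOK && err != GOLDI_EALREADYINIT) return err;
    if ((err = goldilocks_derive_private_key(&priv, proto)) != GOLDI_EOK) return err;
    if ((err = goldilocks_private_to_public(&pub, &priv)) != GOLDI_EOK) return err;

    pre = goldilocks_precompute_public_key(&pub);   /* uses malloc(); NULL on failure */
    if (!pre) return -1;   /* no dedicated error code is defined for this case yet */

    ret = goldilocks_verify_precomputed(signature, message, message_len, pre);
    goldilocks_destroy_precomputed_public_key(pre); /* zeroizes, then free()s */
    return ret;   /* GOLDI_EOK when the signature checks out */
}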
+ +UNAME := $(shell uname) +MACHINE := $(shell uname -m) + +ifeq ($(UNAME),Darwin) CC = clang -LD = clang +else +CC = gcc +endif +LD = $(CC) + +ifneq (,$(findstring x86_64,$(MACHINE))) +ARCH ?= arch_x86_64 +else +# no i386 port yet +ARCH ?= arch_arm_32 +endif -ARCH = arch_x86_64 WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ - -Wgcc-compat -Wmissing-declarations + -Wmissing-declarations -Wunused-function $(EXWARN) + + INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH) LANGFLAGS = -std=c99 -GENFLAGS = -ffunction-sections -fdata-sections -fomit-frame-pointer -fPIC +GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC OFLAGS = -O3 -#XFLAGS = -DN_TESTS_BASE=1000 -ARCHFLAGS = -mssse3 -maes -mavx2 -DMUST_HAVE_AVX2 -mbmi2 -#ARCHFLAGS = -m32 -mcpu=cortex-a9 -mfpu=vfpv3-d16 -CFLAGS = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XFLAGS) -LDFLAGS = $(ARCHFLAGS) +ifneq (,$(findstring arm,$(MACHINE))) +ifneq (,$(findstring neon,$(ARCH))) +ARCHFLAGS += -mfpu=neon +else +ARCHFLAGS += -mfpu=vfpv3-d16 +endif +ARCHFLAGS += -mcpu=cortex-a9 # FIXME +GENFLAGS = -DN_TESTS_BASE=1000 # sooooo sloooooow +else +ARCHFLAGS += -mssse3 -maes -mavx -mavx2 -DMUST_HAVE_AVX2 -mbmi2 #TODO +endif + +ifeq ($(CC),clang) +WARNFLAGS += -Wgcc-compat +endif + +ifeq (,$(findstring 64,$(ARCH))$(findstring gcc,$(CC))) +# ARCHFLAGS += -m32 +ARCHFLAGS += -DGOLDI_FORCE_32_BIT=1 +endif + +CFLAGS = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XCFLAGS) +LDFLAGS = $(ARCHFLAGS) $(XLDFLAGS) ASFLAGS = $(ARCHFLAGS) .PHONY: clean all test bench todo doc lib @@ -29,7 +63,7 @@ LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \ build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \ - build/test_pointops.o + build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o BENCHCOMPONENTS=build/bench.o @@ -45,15 +79,20 @@ build/bench: $(LIBCOMPONENTS) $(BENCHCOMPONENTS) $(LD) $(LDFLAGS) -o $@ $^ build/test: $(LIBCOMPONENTS) $(TESTCOMPONENTS) - $(LD) $(LDFLAGS) -o $@ $^ + $(LD) $(LDFLAGS) -o $@ $^ -lgmp lib: build/goldilocks.so build/goldilocks.so: $(LIBCOMPONENTS) rm -f $@ +ifeq ($(UNAME),Darwin) libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \ - -exported_symbols_list src/exported.sym \ $(LIBCOMPONENTS) +else + $(LD) -shared -Wl,-soname,goldilocks.so.1 -Wl,--gc-sections -o $@ $(LIBCOMPONENTS) + strip --discard-all $@ + ln -sf $@ build/goldilocks.so.1 +endif build/timestamp: mkdir -p build @@ -80,9 +119,9 @@ doc: Doxyfile doc/timestamp src/*.c src/include/*.h src/$(ARCH)/*.c src/$(ARCH)/ todo:: @(find * -name '*.h'; find * -name '*.c') | xargs egrep --color=auto -w \ - 'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE' + 'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE|MAGIC' @echo '=============================' - @(for i in FIXME BUG XXX TODO HACK PERF FUTURE REMOVE; do \ + @(for i in FIXME BUG XXX TODO HACK PERF FUTURE REMOVE MAGIC; do \ (find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i > /dev/null || continue; \ /bin/echo -n $$i' ' | head -c 10; \ (find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i| wc -l; \ @@ -90,7 +129,7 @@ todo:: @echo '=============================' @echo -n 'Total ' @(find * -name '*.h'; find * -name '*.c') | xargs egrep -w \ - 'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE' | wc -l + 'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE|MAGIC' | wc -l bench: build/bench ./$< diff 
--git a/include/goldilocks.h b/include/goldilocks.h
index 7476a6c..f012adb 100644
--- a/include/goldilocks.h
+++ b/include/goldilocks.h
@@ -12,13 +12,42 @@
 #include <stdint.h>
 
+#ifndef GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
+/** If nonzero, implement precomputation for verify and ECDH. */
+#define GOLDI_IMPLEMENT_PRECOMPUTED_KEYS 1
+#endif
+
+/** The size of the Goldilocks field, in bits. */
+#define GOLDI_FIELD_BITS 448
+
+/** The size of the Goldilocks scalars, in bits. */
+#define GOLDI_SCALAR_BITS 446
+
+/** The size of the Goldilocks field, in bytes. */
+#define GOLDI_FIELD_BYTES (GOLDI_FIELD_BITS/8)
+
+/** The size of a Goldilocks public key, in bytes. */
+#define GOLDI_PUBLIC_KEY_BYTES GOLDI_FIELD_BYTES
+
+/** The extra bytes in a Goldilocks private key for the symmetric key. */
+#define GOLDI_SYMKEY_BYTES 32
+
+/** The size of a shared secret. */
+#define GOLDI_SHARED_SECRET_BYTES 64
+
+/** The size of a Goldilocks private key, in bytes. */
+#define GOLDI_PRIVATE_KEY_BYTES (2*GOLDI_FIELD_BYTES + GOLDI_SYMKEY_BYTES)
+
+/** The size of a Goldilocks signature, in bytes. */
+#define GOLDI_SIGNATURE_BYTES (2*GOLDI_FIELD_BYTES)
+
 /**
  * @brief Serialized form of a Goldilocks public key.
  *
  * @warning This isn't even my final form!
  */
 struct goldilocks_public_key_t {
-    uint8_t opaque[56]; /**< Serialized data. */
+    uint8_t opaque[GOLDI_PUBLIC_KEY_BYTES]; /**< Serialized data. */
 };
 
 /**
@@ -30,7 +59,7 @@ struct goldilocks_public_key_t {
  * @warning This isn't even my final form!
  */
 struct goldilocks_private_key_t {
-    uint8_t opaque[144]; /**< Serialized data. */
+    uint8_t opaque[GOLDI_PRIVATE_KEY_BYTES]; /**< Serialized data. */
 };
 
 #ifdef __cplusplus
@@ -72,7 +101,7 @@ static const int GOLDI_EALREADYINIT = 44805;
  */
 int
 goldilocks_init ()
-__attribute__((warn_unused_result));
+__attribute__((warn_unused_result,visibility ("default")));
 
 
 /**
@@ -90,7 +119,40 @@ int
 goldilocks_keygen (
     struct goldilocks_private_key_t *privkey,
     struct goldilocks_public_key_t *pubkey
-) __attribute__((warn_unused_result,nonnull(1,2)));
+) __attribute__((warn_unused_result,nonnull(1,2),visibility ("default")));
+
+/**
+ * @brief Derive a key from its compressed form.
+ * @param [out] privkey The derived private key.
+ * @param [in] proto The compressed or proto-key, which must be 32 random bytes.
+ *
+ * @warning This isn't even my final form!
+ *
+ * @retval GOLDI_EOK Success.
+ * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
+ */
+int
+goldilocks_derive_private_key (
+    struct goldilocks_private_key_t *privkey,
+    const unsigned char proto[GOLDI_SYMKEY_BYTES]
+) __attribute__((nonnull(1,2),visibility ("default")));
+
+/**
+ * @brief Compress a private key (by copying out the proto-key)
+ * @param [out] proto The proto-key.
+ * @param [in] privkey The private key.
+ *
+ * @warning This isn't even my final form!
+ * @todo test.
+ *
+ * @retval GOLDI_EOK Success.
+ * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
+ */
+void
+goldilocks_underive_private_key (
+    unsigned char proto[GOLDI_SYMKEY_BYTES],
+    const struct goldilocks_private_key_t *privkey
+) __attribute__((nonnull(1,2),visibility ("default")));
 
 /**
  * @brief Extract the public key from a private key.
@@ -107,7 +169,7 @@ int
 goldilocks_private_to_public (
     struct goldilocks_public_key_t *pubkey,
     const struct goldilocks_private_key_t *privkey
-) __attribute__((nonnull(1,2)));
+) __attribute__((nonnull(1,2),visibility ("default")));
 
 /**
  * @brief Generate a Diffie-Hellman shared secret in constant time.
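For reference, a minimal ECDH sketch against the declarations in this header (illustrative, not patch content): both calls below should produce the same GOLDI_SHARED_SECRET_BYTES-byte secret, the second via the precomputed form declared further down, assuming GOLDI_IMPLEMENT_PRECOMPUTED_KEYS is left at its default of 1.

#include <stdint.h>
#include "goldilocks.h"

/* Alice computes the secret directly; Bob reuses a precomputed copy of
 * Alice's public key, which pays off when he contacts her repeatedly. */
static int ecdh_both_ways (
    uint8_t alice_view[GOLDI_SHARED_SECRET_BYTES],
    uint8_t bob_view[GOLDI_SHARED_SECRET_BYTES],
    const struct goldilocks_private_key_t *alice_priv,
    const struct goldilocks_public_key_t  *alice_pub,
    const struct goldilocks_private_key_t *bob_priv,
    const struct goldilocks_public_key_t  *bob_pub
) {
    int err = goldilocks_shared_secret(alice_view, alice_priv, bob_pub);
    if (err != GOLDI_EOK) return err;

    struct goldilocks_precomputed_public_key_t *alice_pre =
        goldilocks_precompute_public_key(alice_pub);
    if (!alice_pre) return -1;            /* NULL is the only failure signal for now */

    err = goldilocks_shared_secret_precomputed(bob_view, bob_priv, alice_pre);
    goldilocks_destroy_precomputed_public_key(alice_pre);
    return err;                           /* GOLDI_EOK: alice_view matches bob_view */
}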
@@ -140,10 +202,10 @@ goldilocks_private_to_public ( */ int goldilocks_shared_secret ( - uint8_t shared[64], + uint8_t shared[GOLDI_SHARED_SECRET_BYTES], const struct goldilocks_private_key_t *my_privkey, const struct goldilocks_public_key_t *your_pubkey -) __attribute__((warn_unused_result,nonnull(1,2,3))); +) __attribute__((warn_unused_result,nonnull(1,2,3),visibility ("default"))); /** * @brief Sign a message. @@ -166,11 +228,11 @@ goldilocks_shared_secret ( */ int goldilocks_sign ( - uint8_t signature_out[56*2], + uint8_t signature_out[GOLDI_SIGNATURE_BYTES], const uint8_t *message, uint64_t message_len, const struct goldilocks_private_key_t *privkey -) __attribute__((nonnull(1,2,4))); +) __attribute__((nonnull(1,2,4),visibility ("default"))); /** * @brief Verify a signature. @@ -197,11 +259,108 @@ goldilocks_sign ( */ int goldilocks_verify ( - const uint8_t signature[56*2], + const uint8_t signature[GOLDI_SIGNATURE_BYTES], const uint8_t *message, uint64_t message_len, const struct goldilocks_public_key_t *pubkey -) __attribute__((warn_unused_result,nonnull(1,2,4))); +) __attribute__((warn_unused_result,nonnull(1,2,4),visibility ("default"))); + +#if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS + +/** A public key which has been expanded by precomputation for higher speed. */ +struct goldilocks_precomputed_public_key_t; + +/** + * @brief Expand a public key by precomputation. + * + * @todo Give actual error returns, instead of ambiguous NULL. + * + * @warning This isn't even my final form! + * + * @param [in] pub The public key. + * @retval NULL We ran out of memory, or the + */ +struct goldilocks_precomputed_public_key_t * +goldilocks_precompute_public_key ( + const struct goldilocks_public_key_t *pub +) __attribute__((warn_unused_result,nonnull(1),visibility ("default"))); + +/** + * @brief Overwrite an expanded public key with zeros, then destroy it. + * + * If the input is NULL, this function does nothing. + * + * @param [in] precom The public key. + */ +void +goldilocks_destroy_precomputed_public_key ( + struct goldilocks_precomputed_public_key_t *precom +) __attribute__((visibility ("default"))); + +/** + * @brief Verify a signature. + * + * This function is fairly strict. It will correctly detect when + * the signature has the wrong cofactor component, or when the sig + * values aren't less than p or q. + * + * @warning This isn't even my final form! + * + * @param [in] signature The signature. + * @param [in] message The message to be verified. + * @param [in] message_len The length of the message to be verified. + * @param [in] pubkey The signer's public key, expanded by precomputation. + * + * @retval GOLDI_EOK Success. + * @retval GOLDI_EINVAL The public key or signature is corrupt. + * @retval GOLDI_EUNINIT You must call goldilocks_init() first. + */ +int +goldilocks_verify_precomputed ( + const uint8_t signature[GOLDI_SIGNATURE_BYTES], + const uint8_t *message, + uint64_t message_len, + const struct goldilocks_precomputed_public_key_t *pubkey +) __attribute__((warn_unused_result,nonnull(1,2,4),visibility ("default"))); + +/** + * @brief Generate a Diffie-Hellman shared secret in constant time. + * Uses a precomputation on the other party's public key for efficiency. + * + * This function uses some compile-time flags whose merit remains to + * be decided. + * + * If the flag EXPERIMENT_ECDH_OBLITERATE_CT is set, prepend 40 bytes + * of zeros to the secret before hashing. 
In the case that the other + * party's key is detectably corrupt, instead the symmetric part + * of the secret key is used to produce a pseudorandom value. + * + * If EXPERIMENT_ECDH_STIR_IN_PUBKEYS is set, the sum and product of + * the two parties' public keys is prepended to the hash. + * + * In the current version, this function can safely be run even without + * goldilocks_init(). But this property is not guaranteed for future + * versions, so call it anyway. + * + * @warning This isn't even my final form! + * + * @param [out] shared The shared secret established with the other party. + * @param [in] my_privkey My private key. + * @param [in] your_pubkey The other party's precomputed public key. + * + * @retval GOLDI_EOK Success. + * @retval GOLDI_ECORRUPT My key is corrupt. + * @retval GOLDI_EINVAL The other party's key is corrupt. + * @retval GOLDI_EUNINIT You must call goldilocks_init() first. + */ +int +goldilocks_shared_secret_precomputed ( + uint8_t shared[GOLDI_SHARED_SECRET_BYTES], + const struct goldilocks_private_key_t *my_privkey, + const struct goldilocks_precomputed_public_key_t *your_pubkey +) __attribute__((warn_unused_result,nonnull(1,2,3),visibility ("default"))); + +#endif /* GOLDI_IMPLEMENT_PRECOMPUTED_KEYS */ #ifdef __cplusplus }; /* extern "C" */ diff --git a/src/arch_arm_32/p448.c b/src/arch_arm_32/p448.c index c764955..fa3c583 100644 --- a/src/arch_arm_32/p448.c +++ b/src/arch_arm_32/p448.c @@ -28,6 +28,8 @@ smlal ( const uint32_t a, const uint32_t b ) { + +#ifdef __ARMEL__ uint32_t lo = *acc, hi = (*acc)>>32; __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" @@ -35,6 +37,9 @@ smlal ( : [a]"r"(a), [b]"r"(b)); *acc = lo + (((uint64_t)hi)<<32); +#else + *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b; +#endif } static inline void __attribute__((gnu_inline,always_inline)) @@ -43,6 +48,7 @@ smlal2 ( const uint32_t a, const uint32_t b ) { +#ifdef __ARMEL__ uint32_t lo = *acc, hi = (*acc)>>32; __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" @@ -50,6 +56,9 @@ smlal2 ( : [a]"r"(a), [b]"r"(2*b)); *acc = lo + (((uint64_t)hi)<<32); +#else + *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2); +#endif } static inline void __attribute__((gnu_inline,always_inline)) @@ -58,6 +67,7 @@ smull ( const uint32_t a, const uint32_t b ) { +#ifdef __ARMEL__ uint32_t lo, hi; __asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]" @@ -65,6 +75,9 @@ smull ( : [a]"r"(a), [b]"r"(b)); *acc = lo + (((uint64_t)hi)<<32); +#else + *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b; +#endif } static inline void __attribute__((gnu_inline,always_inline)) @@ -73,6 +86,7 @@ smull2 ( const uint32_t a, const uint32_t b ) { +#ifdef __ARMEL__ uint32_t lo, hi; __asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]" @@ -80,6 +94,9 @@ smull2 ( : [a]"r"(a), [b]"r"(2*b)); *acc = lo + (((uint64_t)hi)<<32); +#else + *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2); +#endif } void @@ -760,13 +777,13 @@ p448_mulw ( const p448_t *as, uint64_t b ) { - const uint32_t bhi = b>>28, blo = b & (1<<28)-1; + uint32_t mask = (1ull<<28)-1; + const uint32_t bhi = b>>28, blo = b & mask; const uint32_t *a = as->limb; uint32_t *c = cs->limb; uint64_t accum0, accum8; - uint32_t mask = (1ull<<28)-1; int i; @@ -957,7 +974,7 @@ p448_deserialize ( for (j=0; j<7; j++) { out |= ((uint64_t)serial[7*i+j])<<(8*j); } - x->limb[2*i] = out & (1ull<<28)-1; + x->limb[2*i] = out & ((1ull<<28)-1); x->limb[2*i+1] = out >> 28; } diff --git a/src/arch_arm_32/p448.h b/src/arch_arm_32/p448.h index 4628a89..befc9e0 100644 --- 
a/src/arch_arm_32/p448.h +++ b/src/arch_arm_32/p448.h @@ -173,7 +173,7 @@ p448_set_ui ( uint64_t x ) { int i; - out->limb[0] = x & (1<<28)-1; + out->limb[0] = x & ((1<<28)-1); out->limb[1] = x>>28; for (i=2; i<16; i++) { out->limb[i] = 0; @@ -188,7 +188,11 @@ p448_cond_swap ( ) { big_register_t *aa = (big_register_t*)a; big_register_t *bb = (big_register_t*)b; +#if __ARM_NEON__ + big_register_t m = vdupq_n_u32(doswap); +#else big_register_t m = doswap; +#endif unsigned int i; for (i=0; iy, &d->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &e->a, &L1 ); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->b, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_weak_reduce( &d->t ); + p448_sub ( &d->y, &d->z, &d->x ); + p448_bias ( &d->y, 2 ); + p448_weak_reduce( &d->y ); + p448_add ( &L0, &d->x, &d->z ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +sub_tw_niels_from_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &e->b, &L1 ); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->a, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_weak_reduce( &d->t ); + p448_add ( &d->y, &d->x, &d->z ); + p448_sub ( &L0, &d->z, &d->x ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +add_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + add_tw_niels_to_tw_extensible( e, &a->n ); +} + +void +sub_tw_pniels_from_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + sub_tw_niels_from_tw_extensible( e, &a->n ); +} + +void +double_tw_extensible ( + struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &a->u, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &L1, &a->t ); + p448_sub ( &a->t, &L1, &a->u ); + p448_bias ( &a->t, 3 ); + p448_weak_reduce( &a->t ); + p448_sub ( &L1, &L0, &L2 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 1 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->z, &L1, &L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +double_extensible ( + struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &L1, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &a->u, &a->t ); + p448_sub ( &a->t, &a->u, &L1 ); + p448_bias ( &a->t, 3 ); + p448_weak_reduce( &a->t ); + p448_sub ( &a->u, &L0, &L2 ); + p448_bias ( &a->u, 2 ); + p448_weak_reduce( &a->u ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 2 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->z, &L1, &L0 ); + 
p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +twist_and_double ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &b->u, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &L0, &b->t ); + p448_sub ( &b->t, &L0, &b->u ); + p448_bias ( &b->t, 3 ); + p448_weak_reduce( &b->t ); + p448_sub ( &L0, &b->z, &b->x ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 2 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +untwist_and_double ( + struct extensible_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &L0, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &b->u, &b->t ); + p448_sub ( &b->t, &b->u, &L0 ); + p448_bias ( &b->t, 3 ); + p448_weak_reduce( &b->t ); + p448_sub ( &b->u, &b->z, &b->x ); + p448_bias ( &b->u, 2 ); + p448_weak_reduce( &b->u ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 1 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +convert_tw_affine_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_affine_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( &b->n.c, &a->y, &a->x ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_set_ui( &b->z, 2 ); +} + +void +convert_tw_affine_to_tw_extensible ( + struct tw_extensible_t* b, + const struct tw_affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_affine_to_extensible ( + struct extensible_t* b, + const struct affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_tw_extensible_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_extensible_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( &b->n.c, &a->u, &a->t ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_add ( &b->z, &a->z, &a->z ); + p448_weak_reduce( &b->z ); +} + +void +convert_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* d +) { + p448_add ( &e->u, &d->n.b, &d->n.a ); + p448_sub ( &e->t, &d->n.b, &d->n.a ); + p448_bias ( &e->t, 2 ); + p448_weak_reduce( &e->t ); + p448_mul ( &e->x, &d->z, &e->t ); + p448_mul ( &e->y, &d->z, &e->u ); + p448_sqr ( &e->z, &d->z ); +} + +void +convert_tw_niels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_niels_t* d +) { + p448_add ( &e->y, &d->b, &d->a ); + p448_weak_reduce( &e->y ); + p448_sub ( &e->x, &d->b, &d->a ); + p448_bias ( &e->x, 
2 ); + p448_weak_reduce( &e->x ); + p448_set_ui( &e->z, 1 ); + p448_copy ( &e->t, &e->x ); + p448_copy ( &e->u, &e->y ); +} + +void +montgomery_step ( + struct montgomery_t* a +) { + struct p448_t L0, L1; + p448_add ( &L0, &a->zd, &a->xd ); + p448_sub ( &L1, &a->xd, &a->zd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_sub ( &a->zd, &a->xa, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_weak_reduce( &a->zd ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_add ( &a->zd, &a->za, &a->xa ); + p448_mul ( &a->za, &L1, &a->zd ); + p448_add ( &a->xa, &a->za, &a->xd ); + p448_sqr ( &a->zd, &a->xa ); + p448_mul ( &a->xa, &a->z0, &a->zd ); + p448_sub ( &a->zd, &a->xd, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_weak_reduce( &a->zd ); + p448_sqr ( &a->za, &a->zd ); + p448_sqr ( &a->xd, &L0 ); + p448_sqr ( &L0, &L1 ); + p448_mulw ( &a->zd, &a->xd, 39082 ); + p448_sub ( &L1, &a->xd, &L0 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_sub ( &L0, &a->zd, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->zd, &L0, &L1 ); +} + +void +deserialize_montgomery ( + struct montgomery_t* a, + const struct p448_t* sbz +) { + p448_sqr ( &a->z0, sbz ); + p448_set_ui( &a->xd, 1 ); + p448_set_ui( &a->zd, 0 ); + p448_set_ui( &a->xa, 1 ); + p448_copy ( &a->za, &a->z0 ); +} + +mask_t +serialize_montgomery ( + struct p448_t* b, + const struct montgomery_t* a, + const struct p448_t* sbz +) { + mask_t L0, L1, L2; + struct p448_t L3, L4, L5, L6; + p448_mul ( &L6, &a->z0, &a->zd ); + p448_sub ( &L4, &L6, &a->xd ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L6, &a->za, &L4 ); + p448_mul ( &L5, &a->z0, &a->xd ); + p448_sub ( &L4, &L5, &a->zd ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L3, &a->xa, &L4 ); + p448_add ( &L5, &L3, &L6 ); + p448_sub ( &L4, &L6, &L3 ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L6, &L4, &L5 ); + p448_copy ( &L5, &a->z0 ); + p448_addw ( &L5, 1 ); + p448_sqr ( &L4, &L5 ); + p448_mulw ( &L5, &L4, 39082 ); + p448_neg ( &L4, &L5 ); + p448_add ( &L5, &a->z0, &a->z0 ); + p448_bias ( &L5, 1 ); + p448_add ( &L3, &L5, &L5 ); + p448_add ( &L5, &L3, &L4 ); + p448_weak_reduce( &L5 ); + p448_mul ( &L3, &a->xd, &L5 ); + L1 = p448_is_zero( &a->zd ); + L2 = - L1; + p448_mask ( &L4, &L3, L1 ); + p448_add ( &L5, &L4, &a->zd ); + L0 = ~ L1; + p448_mul ( &L4, sbz, &L6 ); + p448_addw ( &L4, L2 ); + p448_mul ( &L6, &L5, &L4 ); + p448_mul ( &L4, &L6, &L5 ); + p448_mul ( &L5, &L6, &a->xd ); + p448_mul ( &L6, &L4, &L5 ); + p448_isr ( &L3, &L6 ); + p448_mul ( &L5, &L4, &L3 ); + p448_sqr ( &L4, &L3 ); + p448_mul ( &L3, &L6, &L4 ); + p448_mask ( b, &L5, L0 ); + p448_subw ( &L3, 1 ); + p448_bias ( &L3, 1 ); + L1 = p448_is_zero( &L3 ); + L0 = p448_is_zero( sbz ); + return L1 | L0; +} + +void +serialize_extensible ( + struct p448_t* b, + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sub ( &L0, &a->y, &a->z ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_add ( b, &a->z, &a->y ); + p448_mul ( &L1, &a->z, &a->x ); + p448_mul ( &L2, &L0, &L1 ); + p448_mul ( &L1, &L2, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( &L2, &L1, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( b, &L1, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, &L2, &L1 ); +} + +void +untwist_and_double_and_serialize ( + struct p448_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_mul ( &L3, &a->y, &a->x ); + p448_add ( b, &a->y, &a->x ); + p448_sqr ( &L1, b 
); + p448_add ( &L2, &L3, &L3 ); + p448_sub ( b, &L1, &L2 ); + p448_bias ( b, 3 ); + p448_weak_reduce( b ); + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &L2 ); + p448_add ( &L2, b, b ); + p448_mulw ( b, &L2, 39082 ); + p448_neg ( &L2, b ); + p448_bias ( &L2, 2 ); + p448_mulw ( &L0, &L2, 39082 ); + p448_neg ( b, &L0 ); + p448_bias ( b, 2 ); + p448_mul ( &L0, &L2, &L1 ); + p448_mul ( &L2, b, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( &L1, b, &L0 ); + p448_sqr ( b, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( b, &L1, &L3 ); +} + +void +twist_even ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L0, L1; + p448_sqr ( &b->y, &a->z ); + p448_sqr ( &b->z, &a->x ); + p448_sub ( &b->u, &b->y, &b->z ); + p448_bias ( &b->u, 2 ); + p448_weak_reduce( &b->u ); + p448_sub ( &b->z, &a->z, &a->x ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->y, &b->z, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->x, &b->z, &b->y ); + p448_mul ( &b->t, &b->x, &b->u ); + p448_mul ( &b->y, &b->x, &b->t ); + p448_isr ( &b->t, &b->y ); + p448_mul ( &b->u, &b->x, &b->t ); + p448_sqr ( &b->x, &b->t ); + p448_mul ( &b->t, &b->y, &b->x ); + p448_mul ( &b->x, &a->x, &b->u ); + p448_mul ( &b->y, &a->y, &b->u ); + L1 = p448_is_zero( &b->z ); + L0 = - L1; + p448_addw ( &b->y, L0 ); + p448_weak_reduce( &b->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +void +test_only_twist ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L0, L1; + struct p448_t L2, L3; + p448_sqr ( &b->u, &a->z ); + p448_sqr ( &b->y, &a->x ); + p448_sub ( &b->z, &b->u, &b->y ); + p448_bias ( &b->z, 2 ); + p448_add ( &b->y, &b->z, &b->z ); + p448_add ( &b->u, &b->y, &b->y ); + p448_weak_reduce( &b->u ); + p448_sub ( &b->y, &a->z, &a->x ); + p448_bias ( &b->y, 2 ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->x, &b->y, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->t, &b->z, &b->x ); + p448_mul ( &L3, &b->t, &b->u ); + p448_mul ( &b->x, &b->t, &L3 ); + p448_isr ( &L2, &b->x ); + p448_mul ( &b->u, &b->t, &L2 ); + p448_sqr ( &L3, &L2 ); + p448_mul ( &b->t, &b->x, &L3 ); + p448_add ( &b->x, &a->y, &a->x ); + p448_weak_reduce( &b->x ); + p448_sub ( &L2, &a->x, &a->y ); + p448_bias ( &L2, 2 ); + p448_weak_reduce( &L2 ); + p448_mul ( &L3, &b->t, &L2 ); + p448_add ( &L2, &L3, &b->x ); + p448_sub ( &b->t, &b->x, &L3 ); + p448_bias ( &b->t, 2 ); + p448_weak_reduce( &b->t ); + p448_mul ( &b->x, &L2, &b->u ); + L0 = p448_is_zero( &b->y ); + L1 = - L0; + p448_addw ( &b->x, L1 ); + p448_weak_reduce( &b->x ); + p448_mul ( &b->y, &b->t, &b->u ); + L0 = p448_is_zero( &b->z ); + L1 = - L0; + p448_addw ( &b->y, L1 ); + p448_weak_reduce( &b->y ); + L1 = p448_is_zero( &a->y ); + L0 = L1 + 1; + p448_set_ui( &b->z, L0 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +mask_t +is_square ( + const struct p448_t* x +) { + mask_t L0, L1; + struct p448_t L2, L3; + p448_isr ( &L2, x ); + p448_sqr ( &L3, &L2 ); + p448_mul ( &L2, x, &L3 ); + p448_subw ( &L2, 1 ); + p448_bias ( &L2, 1 ); + L1 = p448_is_zero( &L2 ); + L0 = p448_is_zero( x ); + return L1 | L0; +} + +mask_t +is_even_pt ( + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} 
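/*
 * Reader's note on two idioms that recur in the formula code above and below
 * (a hedged summary inferred from the surrounding code, not a statement of the
 * library's documented contract):
 *
 *  - p448_sub()/p448_neg() can leave limbs conceptually negative; p448_bias(x, n)
 *    adds a small multiple of p so they become nonnegative again, and
 *    p448_weak_reduce() is a carry pass that brings limbs back near the 28-bit
 *    radix without producing the canonical form (p448_strong_reduce() does that,
 *    e.g. before serialization).
 *
 *  - is_square(x) above multiplies x by the square of p448_isr(x), an inverse-
 *    square-root candidate, and accepts iff the product is 1 or x is zero;
 *    is_even_pt()/is_even_tw() below feed it z^2 - x^2 and z^2 + x^2 (i.e.
 *    1 -/+ x^2 in affine terms) to implement the evenness tests their names suggest.
 */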
+ +mask_t +is_even_tw ( + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_add ( &L0, &L1, &L2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +deserialize_affine ( + struct affine_t* a, + const struct p448_t* sz +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L1, sz ); + p448_copy ( &L3, &L1 ); + p448_addw ( &L3, 1 ); + p448_sqr ( &a->x, &L3 ); + p448_mulw ( &L3, &a->x, 39082 ); + p448_neg ( &a->x, &L3 ); + p448_add ( &L3, &L1, &L1 ); + p448_bias ( &L3, 1 ); + p448_add ( &a->y, &L3, &L3 ); + p448_add ( &L3, &a->y, &a->x ); + p448_weak_reduce( &L3 ); + p448_copy ( &a->y, &L1 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &a->x, &a->y ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_mul ( &a->y, &a->x, &L3 ); + p448_sqr ( &L2, &a->x ); + p448_mul ( &L0, &L2, &a->y ); + p448_mul ( &a->y, &a->x, &L0 ); + p448_isr ( &L3, &a->y ); + p448_mul ( &a->y, &L2, &L3 ); + p448_sqr ( &L2, &L3 ); + p448_mul ( &L3, &L0, &L2 ); + p448_mul ( &L0, &a->x, &L3 ); + p448_add ( &L2, &a->y, &a->y ); + p448_mul ( &a->x, sz, &L2 ); + p448_addw ( &L1, 1 ); + p448_mul ( &a->y, &L1, &L3 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + return p448_is_zero( &L0 ); +} + +mask_t +deserialize_and_twist_approx ( + struct tw_extensible_t* a, + const struct p448_t* sdm1, + const struct p448_t* sz +) { + struct p448_t L0, L1; + p448_sqr ( &a->z, sz ); + p448_copy ( &a->y, &a->z ); + p448_addw ( &a->y, 1 ); + p448_sqr ( &a->x, &a->y ); + p448_mulw ( &a->y, &a->x, 39082 ); + p448_neg ( &a->x, &a->y ); + p448_add ( &a->y, &a->z, &a->z ); + p448_bias ( &a->y, 1 ); + p448_add ( &a->u, &a->y, &a->y ); + p448_add ( &a->y, &a->u, &a->x ); + p448_weak_reduce( &a->y ); + p448_sqr ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &a->u, &a->x ); + p448_bias ( &a->u, 2 ); + p448_weak_reduce( &a->u ); + p448_mul ( &a->x, sdm1, &a->u ); + p448_mul ( &L0, &a->x, &a->y ); + p448_mul ( &a->t, &L0, &a->y ); + p448_mul ( &a->u, &a->x, &a->t ); + p448_mul ( &a->t, &a->u, &L0 ); + p448_mul ( &a->y, &a->x, &a->t ); + p448_isr ( &L0, &a->y ); + p448_mul ( &a->y, &a->u, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &a->u, &a->t, &L1 ); + p448_mul ( &a->t, &a->x, &a->u ); + p448_add ( &a->x, sz, sz ); + p448_mul ( &L0, &a->u, &a->x ); + p448_copy ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &L1, &a->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &a->x, &L1, &L0 ); + p448_mul ( &L0, &a->u, &a->y ); + p448_addw ( &a->z, 1 ); + p448_mul ( &a->y, &a->z, &L0 ); + p448_subw ( &a->t, 1 ); + p448_bias ( &a->t, 1 ); + mask_t ret = p448_is_zero( &a->t ); + p448_set_ui( &a->z, 1 ); + p448_copy ( &a->t, &a->x ); + p448_copy ( &a->u, &a->y ); + return ret; +} + +void +set_identity_extensible ( + struct extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_tw_extensible ( + struct tw_extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_affine ( + struct affine_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); +} + +mask_t +eq_affine ( + const struct affine_t* a, + const struct affine_t* b +) { + mask_t L0, L1; + struct p448_t L2; + p448_sub ( &L2, &a->x, &b->x ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_sub ( &L2, &a->y, &b->y 
); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +eq_extensible ( + const struct extensible_t* a, + const struct extensible_t* b +) { + mask_t L0, L1; + struct p448_t L2, L3, L4; + p448_mul ( &L4, &b->z, &a->x ); + p448_mul ( &L3, &a->z, &b->x ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_mul ( &L4, &b->z, &a->y ); + p448_mul ( &L3, &a->z, &b->y ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +eq_tw_extensible ( + const struct tw_extensible_t* a, + const struct tw_extensible_t* b +) { + mask_t L0, L1; + struct p448_t L2, L3, L4; + p448_mul ( &L4, &b->z, &a->x ); + p448_mul ( &L3, &a->z, &b->x ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_mul ( &L4, &b->z, &a->y ); + p448_mul ( &L3, &a->z, &b->y ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +void +elligator_2s_inject ( + struct affine_t* a, + const struct p448_t* r +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5, L6, L7, L8, L9; + p448_sqr ( &a->x, r ); + p448_sqr ( &L3, &a->x ); + p448_copy ( &a->y, &L3 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &L9, &a->y ); + p448_bias ( &L9, 2 ); + p448_weak_reduce( &L9 ); + p448_sqr ( &L2, &L9 ); + p448_mulw ( &L8, &L2, 1527402724 ); + p448_mulw ( &L7, &L3, 6108985600 ); + p448_add ( &a->y, &L7, &L8 ); + p448_weak_reduce( &a->y ); + p448_mulw ( &L8, &L2, 6109454568 ); + p448_sub ( &L7, &a->y, &L8 ); + p448_bias ( &L7, 2 ); + p448_weak_reduce( &L7 ); + p448_mulw ( &L4, &a->y, 78160 ); + p448_mul ( &L6, &L7, &L9 ); + p448_mul ( &L8, &L6, &L4 ); + p448_mul ( &L4, &L7, &L8 ); + p448_isr ( &L5, &L4 ); + p448_mul ( &L4, &L6, &L5 ); + p448_sqr ( &L6, &L5 ); + p448_mul ( &L5, &L8, &L6 ); + p448_mul ( &L8, &L7, &L5 ); + p448_mul ( &L7, &L8, &L5 ); + p448_copy ( &L5, &a->x ); + p448_subw ( &L5, 1 ); + p448_addw ( &a->x, 1 ); + p448_mul ( &L6, &a->x, &L8 ); + p448_sub ( &a->x, &L5, &L6 ); + p448_bias ( &a->x, 3 ); + p448_weak_reduce( &a->x ); + p448_mul ( &L5, &L4, &a->x ); + p448_mulw ( &L4, &L5, 78160 ); + p448_neg ( &a->x, &L4 ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_add ( &L4, &L3, &L3 ); + p448_add ( &L3, &L4, &L2 ); + p448_subw ( &L3, 2 ); + p448_bias ( &L3, 1 ); + p448_weak_reduce( &L3 ); + p448_mul ( &L2, &L3, &L8 ); + p448_mulw ( &L3, &L2, 3054649120 ); + p448_add ( &L2, &L3, &a->y ); + p448_mul ( &a->y, &L7, &L2 ); + L1 = p448_is_zero( &L9 ); + L0 = - L1; + p448_addw ( &a->y, L0 ); + p448_weak_reduce( &a->y ); +} + +mask_t +validate_affine ( + const struct affine_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L0, &a->y ); + p448_sqr ( &L2, &a->x ); + p448_add ( &L3, &L2, &L0 ); + p448_subw ( &L3, 1 ); + p448_mulw ( &L1, &L2, 39081 ); + p448_neg ( &L2, &L1 ); + p448_bias ( &L2, 2 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sub ( &L0, &L3, &L1 ); + p448_bias ( &L0, 3 ); + return p448_is_zero( &L0 ); +} + +mask_t +validate_tw_extensible ( + const struct tw_extensible_t* ext +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5; + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L2, &ext->t, &ext->u ); + p448_mul ( &L4, &ext->z, &L2 ); + p448_addw ( &L4, 0 ); + p448_mul ( &L3, &ext->x, &ext->y ); + p448_neg ( &L2, &L3 ); + p448_add ( &L3, &L2, &L4 ); + p448_bias ( &L3, 2 ); + L1 = p448_is_zero( &L3 ); + /* + * Check invariant: + * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 + */ + p448_sqr ( &L4, &ext->y ); + 
p448_neg ( &L2, &L4 ); + p448_addw ( &L2, 0 ); + p448_sqr ( &L3, &ext->x ); + p448_add ( &L4, &L3, &L2 ); + p448_sqr ( &L5, &ext->u ); + p448_sqr ( &L3, &ext->t ); + p448_mul ( &L2, &L3, &L5 ); + p448_mulw ( &L3, &L2, 39081 ); + p448_neg ( &L5, &L3 ); + p448_add ( &L3, &L5, &L4 ); + p448_neg ( &L5, &L2 ); + p448_add ( &L4, &L5, &L3 ); + p448_sqr ( &L3, &ext->z ); + p448_add ( &L2, &L3, &L4 ); + p448_bias ( &L2, 4 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +validate_extensible ( + const struct extensible_t* ext +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5; + /* + * Check invariant: + * 0 = d*t^2*u^2 - x^2 - y^2 + z^2 + */ + p448_sqr ( &L4, &ext->y ); + p448_neg ( &L3, &L4 ); + p448_addw ( &L3, 0 ); + p448_sqr ( &L2, &ext->z ); + p448_add ( &L4, &L2, &L3 ); + p448_sqr ( &L5, &ext->u ); + p448_sqr ( &L2, &ext->t ); + p448_mul ( &L3, &L2, &L5 ); + p448_mulw ( &L5, &L3, 39081 ); + p448_neg ( &L2, &L5 ); + p448_add ( &L3, &L2, &L4 ); + p448_sqr ( &L2, &ext->x ); + p448_neg ( &L4, &L2 ); + p448_add ( &L2, &L4, &L3 ); + p448_bias ( &L2, 4 ); + L1 = p448_is_zero( &L2 ); + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L3, &ext->t, &ext->u ); + p448_mul ( &L4, &ext->z, &L3 ); + p448_addw ( &L4, 0 ); + p448_mul ( &L2, &ext->x, &ext->y ); + p448_neg ( &L3, &L2 ); + p448_add ( &L2, &L3, &L4 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + + diff --git a/src/arch_neon/neon_emulation.h b/src/arch_neon/neon_emulation.h new file mode 100644 index 0000000..6fecbc7 --- /dev/null +++ b/src/arch_neon/neon_emulation.h @@ -0,0 +1,150 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +/** + * @file "neon_emulation.h" + * @brief NEON intrinsic emulation using clang's vector extensions. + * + * This lets you test and debug NEON code on x86. + */ +#ifndef __NEON_EMULATION_H__ +#define __NEON_EMULATION_H__ 1 + +#include "word.h" + +#include +#include + +static __inline__ int64x2_t vaddw_s32 (int64x2_t a, int32x2_t b) { + a.x += b.x; + a.y += b.y; + return a; +} + +static __inline__ int64x2_t __attribute__((gnu_inline,always_inline)) +xx_vaddup_s64(int64x2_t x) { + x.y += x.x; + return x; +} + +typedef struct { int32x2_t val[2]; } int32x2x2_t; +static inline int32x2x2_t vtrn_s32 (int32x2_t x, int32x2_t y) { + int32x2x2_t out = {{{ x.x, y.x }, {x.y, y.y}}}; + return out; +} + +static __inline__ void __attribute__((gnu_inline,always_inline)) +xx_vtrnq_s64 ( + int64x2_t *x, + int64x2_t *y +) { + int64_t tmp = (*x).y; + (*x).y = (*y).x; + (*y).x = tmp; +} + +int64x2_t vsraq_n_s64 ( + int64x2_t a, + int64x2_t v, + const int x +) { + return a + (v >> x); +} + +int64x2_t vshrq_n_s64 ( + int64x2_t v, + const int x +) { + return v >> x; +} + +static inline int64_t vgetq_lane_s64 ( + int64x2_t acc, + const int lane +) { + return lane ? acc.y : acc.x; +} + +static inline int32_t vget_lane_s32 ( + int32x2_t acc, + const int lane +) { + return lane ? 
acc.y : acc.x; +} + +static inline int64x2_t vmlal_lane_s32 ( + int64x2_t acc, + int32x2_t x, + int32x2_t y, + int lane +) { + int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; + return acc + xx*(lane?yy.yy:yy.xx); +} + +static inline int64x2_t vmlsl_lane_s32 ( + int64x2_t acc, + int32x2_t x, + int32x2_t y, + int lane +) { + int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; + return acc - xx*(lane?yy.yy:yy.xx); +} + +static inline int64x2_t vqdmlsl_lane_s32 ( + int64x2_t acc, + int32x2_t x, + int32x2_t y, + int lane +) { + int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; + int64x2_t tmp = xx*(lane?yy.yy:yy.xx); + assert(tmp.x >> 63 == tmp.x>>62); + assert(tmp.y >> 63 == tmp.y>>62); + return acc - 2*tmp; +} + +static inline int64x2_t vqdmlal_lane_s32 ( + int64x2_t acc, + int32x2_t x, + int32x2_t y, + int lane +) { + int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; + int64x2_t tmp = xx*(lane?yy.yy:yy.xx); + assert(tmp.x >> 63 == tmp.x>>62); + assert(tmp.y >> 63 == tmp.y>>62); + return acc + 2*tmp; +} + +static inline int64x2_t vqdmull_lane_s32 ( + int32x2_t x, + int32x2_t y, + int lane +) { + int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; + int64x2_t tmp = xx*(lane?yy.yy:yy.xx); + assert(tmp.x >> 63 == tmp.x>>62); + assert(tmp.y >> 63 == tmp.y>>62); + return 2*tmp; +} + +static inline int32x2_t vmovn_s64( + int64x2_t x +) { + int32x2_t y = {x.x,x.y}; + return y; +} + +static inline int64x2_t vmull_lane_s32 ( + int32x2_t x, + int32x2_t y, + int lane +) { + int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; + return xx*(lane?yy.yy:yy.xx); +} + +#endif /* __NEON_EMULATION_H__ */ diff --git a/src/arch_neon/p448.c b/src/arch_neon/p448.c new file mode 100644 index 0000000..6cd78aa --- /dev/null +++ b/src/arch_neon/p448.c @@ -0,0 +1,749 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ + +#include "word.h" +#include "p448.h" + +static inline mask_t __attribute__((always_inline)) +is_zero ( + word_t x +) { + dword_t xx = x; + xx--; + return xx >> WORD_BITS; +} + +static uint64_t widemul_32 ( + const uint32_t a, + const uint32_t b +) { + return ((uint64_t)a)* b; +} + +#ifdef __ARM_NEON__ +static __inline__ void __attribute__((gnu_inline,always_inline)) +xx_vtrnq_s64 ( + int64x2_t *x, + int64x2_t *y +) { + __asm__ __volatile__ ("vswp %f0, %e1" : "+w"(*x), "+w"(*y)); +} + +static __inline__ int64x2_t __attribute__((gnu_inline,always_inline)) +xx_vaddup_s64(int64x2_t x) { + __asm__ ("vadd.s64 %f0, %e0" : "+w"(x)); + return x; +} +#else +#include "neon_emulation.h" +#endif // ARM_NEON + +static inline void __attribute__((gnu_inline,always_inline)) +smlal ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b; +} + +static inline void __attribute__((gnu_inline,always_inline)) +smlal2 ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; +} + +static inline void __attribute__((gnu_inline,always_inline)) +smull ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b; +} + +static inline void __attribute__((gnu_inline,always_inline)) +smull2 ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; +} + +// static inline int64x2_t copy_now(int64x2_t x) { +// int64x2_t y; +// __asm__ ("vmov %0, %1" : "=w"(y) : "w"(x)); +// return y; +// } + +void +p448_mul ( + p448_t *__restrict__ cs, + const p448_t *as, + const p448_t *bs +) { + const uint32_t *a = as->limb, *b = bs->limb; + uint32_t *c = cs->limb; + + const int32x2_t + *val = (const int32x2_t *)a, + *vbl = (const int32x2_t *)b, + *vah = (const int32x2_t *)(&a[8]), + *vbh = (const int32x2_t *)(&b[8]); + + int32x2_t + *vcl = (int32x2_t *)c, + *vch = (int32x2_t *)(&c[8]), + vmask = {(1<<28) - 1, (1<<28)-1}; + + int64x2_t accumx0a, accumx0b; + int64x2_t accumx1a, accumx1b; + int64x2_t accumx2a, accumx2b; + int64x2_t accumx3a, accumx3b; + int64x2_t accumx4a, accumx4b; + int64x2_t accumx5a, accumx5b; + int64x2_t accumx6a, accumx6b; + int64x2_t accumx7a, accumx7b; + int64x2_t carry; + int32x2x2_t trn_res; + int32x2_t delta; + + accumx0a = vmull_lane_s32( delta = val[1] + vah[1], vbh[3], 0); + accumx1a = vmull_lane_s32( delta, vbh[3], 1); + accumx2a = vmull_lane_s32( delta = val[2] + vah[2], vbh[3], 0); + accumx3a = vmull_lane_s32( delta, vbh[3], 1); + accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[2], 0); + accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[2], 1); + accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0); + accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1); + accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[1], 0); + accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[1], 1); + accumx2b = vmull_lane_s32( delta = val[0] + vah[0], vbh[1], 0); + accumx3b = vmull_lane_s32( delta, vbh[1], 1); + accumx0b = vmull_lane_s32( delta, vbh[0], 0); + accumx1b = vmull_lane_s32( delta, vbh[0], 1); + accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0); + accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1); + accumx0b = vmlal_lane_s32(accumx0b, vah[1], vbl[3], 0); + accumx1b = vmlal_lane_s32(accumx1b, vah[1], vbl[3], 1); + accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0); + accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1); + accumx0b = 
vmlal_lane_s32(accumx0b, vah[2], vbl[2], 0); + accumx1b = vmlal_lane_s32(accumx1b, vah[2], vbl[2], 1); + accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0); + accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1); + accumx0b = vmlal_lane_s32(accumx0b, vah[3], vbl[1], 0); + accumx1b = vmlal_lane_s32(accumx1b, vah[3], vbl[1], 1); + accumx2b += accumx2a; + accumx3b += accumx3a; + accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0); + accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1); + accumx0b += accumx0a; + accumx1b += accumx1a; + accumx0a = vmlal_lane_s32(accumx0a, vah[0], vbl[0], 0); + accumx1a = vmlal_lane_s32(accumx1a, vah[0], vbl[0], 1); + accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0); + accumx3a = vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1); + accumx0a = vmlal_lane_s32(accumx0a, val[1], delta = vbl[3] - vbh[3], 0); + accumx1a = vmlal_lane_s32(accumx1a, val[1], delta, 1); + accumx2a = vmlal_lane_s32(accumx2a, val[2], delta, 0); + accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1); + accumx0a = vmlal_lane_s32(accumx0a, val[2], delta = vbl[2] - vbh[2], 0); + accumx1a = vmlal_lane_s32(accumx1a, val[2], delta, 1); + accumx2a = vmlal_lane_s32(accumx2a, val[3], delta, 0); + accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1); + accumx0a = vmlal_lane_s32(accumx0a, val[3], delta = vbl[1] - vbh[1], 0); + accumx1a = vmlal_lane_s32(accumx1a, val[3], delta, 1); + accumx2a += accumx2b; + accumx3a += accumx3b; + accumx2b = vmlal_lane_s32(accumx2b, val[0], delta, 0); + accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1); + accumx0a += accumx0b; + accumx1a += accumx1b; + accumx0b = vmlal_lane_s32(accumx0b, val[0], delta = vbl[0] - vbh[0], 0); + accumx1b = vmlal_lane_s32(accumx1b, val[0], delta, 1); + accumx2b = vmlal_lane_s32(accumx2b, val[1], delta, 0); + accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1); + xx_vtrnq_s64(&accumx0a, &accumx0b); + xx_vtrnq_s64(&accumx1a, &accumx1b); + xx_vtrnq_s64(&accumx2a, &accumx2b); + xx_vtrnq_s64(&accumx3a, &accumx3b); + accumx0b += accumx1a; + accumx0b = vsraq_n_s64(accumx0b,accumx0a,28); + accumx1b = vsraq_n_s64(accumx1b,accumx0b,28); + accumx2a += accumx1b; + accumx2b += accumx3a; + accumx2b = vsraq_n_s64(accumx2b,accumx2a,28); + accumx3b = vsraq_n_s64(accumx3b,accumx2b,28); + trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b)); + vcl[0] = trn_res.val[1] & vmask; + vch[0] = trn_res.val[0] & vmask; + trn_res = vtrn_s32(vmovn_s64(accumx2a), vmovn_s64(accumx2b)); + vcl[1] = trn_res.val[1] & vmask; + vch[1] = trn_res.val[0] & vmask; + carry = accumx3b; + + accumx4a = vmull_lane_s32( delta = val[3] + vah[3], vbh[3], 0); + accumx5a = vmull_lane_s32( delta, vbh[3], 1); + accumx6b = vmull_lane_s32( delta = val[0] + vah[0], vbh[3], 0); + accumx7b = vmull_lane_s32( delta, vbh[3], 1); + accumx4b = accumx4a; + accumx5b = accumx5a; + accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[2], 0); + accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[2], 1); + accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0); + accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1); + accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[1], 0); + accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[1], 1); + accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0); + accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1); + accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[0], 0); + accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[0], 1); + accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0); + accumx7b = 
vmlal_lane_s32(accumx7b, delta, vbh[0], 1); + accumx4b = vmlal_lane_s32(accumx4b, vah[3], vbl[3], 0); + accumx5b = vmlal_lane_s32(accumx5b, vah[3], vbl[3], 1); + accumx6a = accumx6b; + accumx7a = accumx7b; + accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0); + accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1); + accumx4a += accumx4b; + accumx5a += accumx5b; + accumx4a = vmlal_lane_s32(accumx4a, vah[0], vbl[2], 0); + accumx5a = vmlal_lane_s32(accumx5a, vah[0], vbl[2], 1); + accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0); + accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1); + accumx4a = vmlal_lane_s32(accumx4a, vah[1], vbl[1], 0); + accumx5a = vmlal_lane_s32(accumx5a, vah[1], vbl[1], 1); + accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0); + accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1); + accumx4a = vmlal_lane_s32(accumx4a, vah[2], vbl[0], 0); + accumx5a = vmlal_lane_s32(accumx5a, vah[2], vbl[0], 1); + accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0); + accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1); + accumx4a = vmlal_lane_s32(accumx4a, val[3], delta = vbl[3] - vbh[3], 0); + accumx5a = vmlal_lane_s32(accumx5a, val[3], delta, 1); + /**/ + accumx6b = vmlal_lane_s32(accumx6b, val[0], delta, 0); + accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1); + accumx4b = vmlal_lane_s32(accumx4b, val[0], delta = vbl[2] - vbh[2], 0); + accumx5b = vmlal_lane_s32(accumx5b, val[0], delta, 1); + accumx6b = vmlal_lane_s32(accumx6b, val[1], delta, 0); + accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1); + accumx4b = vmlal_lane_s32(accumx4b, val[1], delta = vbl[1] - vbh[1], 0); + accumx5b = vmlal_lane_s32(accumx5b, val[1], delta, 1); + accumx6b = vmlal_lane_s32(accumx6b, val[2], delta, 0); + accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1); + accumx4b = vmlal_lane_s32(accumx4b, val[2], delta = vbl[0] - vbh[0], 0); + accumx5b = vmlal_lane_s32(accumx5b, val[2], delta, 1); + accumx6b = vmlal_lane_s32(accumx6b, val[3], delta, 0); + accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1); + + xx_vtrnq_s64(&accumx4a, &accumx4b); + xx_vtrnq_s64(&accumx5a, &accumx5b); + xx_vtrnq_s64(&accumx6a, &accumx6b); + xx_vtrnq_s64(&accumx7a, &accumx7b); + accumx4a += carry; + accumx4b += accumx5a; + accumx4b = vsraq_n_s64(accumx4b,accumx4a,28); + accumx5b = vsraq_n_s64(accumx5b,accumx4b,28); + accumx6a += accumx5b; + accumx6b += accumx7a; + + trn_res = vtrn_s32(vmovn_s64(accumx4a), vmovn_s64(accumx4b)); + vcl[2] = trn_res.val[1] & vmask; + vch[2] = trn_res.val[0] & vmask; + accumx6b = vsraq_n_s64(accumx6b,accumx6a,28); + accumx7b = vsraq_n_s64(accumx7b,accumx6b,28); + trn_res = vtrn_s32(vmovn_s64(accumx6a), vmovn_s64(accumx6b)); + vcl[3] = trn_res.val[1] & vmask; + vch[3] = trn_res.val[0] & vmask; + + accumx7b = xx_vaddup_s64(accumx7b); + + int32x2_t t0 = vcl[0], t1 = vch[0]; + trn_res = vtrn_s32(t0,t1); + t0 = trn_res.val[0]; t1 = trn_res.val[1]; + + accumx7b = vaddw_s32(accumx7b, t0); + t0 = vmovn_s64(accumx7b) & vmask; + + accumx7b = vshrq_n_s64(accumx7b,28); + accumx7b = vaddw_s32(accumx7b, t1); + t1 = vmovn_s64(accumx7b) & vmask; + trn_res = vtrn_s32(t0,t1); + vcl[0] = trn_res.val[0]; + vch[0] = trn_res.val[1]; + accumx7b = vshrq_n_s64(accumx7b,28); + + t0 = vmovn_s64(accumx7b); + + uint32_t + c0 = vget_lane_s32(t0,0), + c1 = vget_lane_s32(t0,1); + c[2] += c0; + c[10] += c1; +} + +void +p448_sqr ( + p448_t *__restrict__ cs, + const p448_t *as +) { + /* FUTURE possible improvements: + * don't use nega-phi algorithm, so as to avoid extra phi-twiddle at end + * or use 
phi/nega-phi for everything, montgomery style + * or find some sort of phi algorithm which doesn't have this problem + * break up lanemuls so that only diags get 1mul'd instead of diag 2x2 blocks + * + * These improvements are all pretty minor, but I guess together they might matter? + */ + + const uint32_t *b = as->limb; + uint32_t *c = cs->limb; + + int32x2_t vbm[4]; + + const int32x2_t + *vbl = (const int32x2_t *)b, + *vbh = (const int32x2_t *)(&b[8]); + + int i; + for (i=0; i<4; i++) { + vbm[i] = vbl[i] - vbh[i]; + } + + int32x2_t + *vcl = (int32x2_t *)c, + *vch = (int32x2_t *)(&c[8]), + vmask = {(1<<28) - 1, (1<<28)-1}; + + int64x2_t accumx0a, accumx0b; + int64x2_t accumx1a, accumx1b; + int64x2_t accumx2a, accumx2b; + int64x2_t accumx3a, accumx3b; + int64x2_t accumx4a, accumx4b; + int64x2_t accumx5a, accumx5b; + int64x2_t accumx6a, accumx6b; + int64x2_t accumx7a, accumx7b; + int64x2_t carry; + int32x2x2_t trn_res; + + accumx0a = vqdmull_lane_s32( vbh[1], vbh[3], 0); + accumx1a = vqdmull_lane_s32( vbh[1], vbh[3], 1); + accumx2a = vqdmull_lane_s32( vbh[2], vbh[3], 0); + accumx3a = vqdmull_lane_s32( vbh[2], vbh[3], 1); + accumx0a = vmlal_lane_s32(accumx0a, vbh[2], vbh[2], 0); + accumx1a = vmlal_lane_s32(accumx1a, vbh[2], vbh[2], 1); + accumx2b = accumx2a; + accumx3b = accumx3a; + accumx2b = vqdmlal_lane_s32(accumx2b, vbh[0], vbh[1], 0); + accumx3b = vqdmlal_lane_s32(accumx3b, vbh[0], vbh[1], 1); + accumx0b = accumx0a; + accumx1b = accumx1a; + accumx0b = vmlal_lane_s32(accumx0b, vbh[0], vbh[0], 0); + accumx1b = vmlal_lane_s32(accumx1b, vbh[0], vbh[0], 1); + accumx0b = vqdmlal_lane_s32(accumx0b, vbl[1], vbl[3], 0); + accumx1b = vqdmlal_lane_s32(accumx1b, vbl[1], vbl[3], 1); + accumx2b = vqdmlal_lane_s32(accumx2b, vbl[2], vbl[3], 0); + accumx3b = vqdmlal_lane_s32(accumx3b, vbl[2], vbl[3], 1); + accumx0b = vmlal_lane_s32(accumx0b, vbl[2], vbl[2], 0); + accumx1b = vmlal_lane_s32(accumx1b, vbl[2], vbl[2], 1); + accumx2a += accumx2b; + accumx3a += accumx3b; + accumx2a = vqdmlal_lane_s32(accumx2a, vbl[0], vbl[1], 0); + accumx3a = vqdmlal_lane_s32(accumx3a, vbl[0], vbl[1], 1); + accumx0a += accumx0b; + accumx1a += accumx1b; + accumx0a = vmlal_lane_s32(accumx0a, vbl[0], vbl[0], 0); + accumx1a = vmlal_lane_s32(accumx1a, vbl[0], vbl[0], 1); + accumx0a = vqdmlsl_lane_s32(accumx0a, vbm[1], vbm[3], 0); + accumx1a = vqdmlsl_lane_s32(accumx1a, vbm[1], vbm[3], 1); + accumx0a = vmlsl_lane_s32(accumx0a, vbm[2], vbm[2], 0); + accumx1a = vmlsl_lane_s32(accumx1a, vbm[2], vbm[2], 1); + accumx2a = vqdmlsl_lane_s32(accumx2a, vbm[2], vbm[3], 0); + accumx3a = vqdmlsl_lane_s32(accumx3a, vbm[2], vbm[3], 1); + accumx0b += accumx0a; + accumx1b += accumx1a; + accumx0b = vmlsl_lane_s32(accumx0b, vbm[0], vbm[0], 0); + accumx1b = vmlsl_lane_s32(accumx1b, vbm[0], vbm[0], 1); + accumx2b += accumx2a; + accumx3b += accumx3a; + accumx2b = vqdmlsl_lane_s32(accumx2b, vbm[0], vbm[1], 0); + accumx3b = vqdmlsl_lane_s32(accumx3b, vbm[0], vbm[1], 1); + xx_vtrnq_s64(&accumx0b, &accumx0a); + xx_vtrnq_s64(&accumx1b, &accumx1a); + xx_vtrnq_s64(&accumx2b, &accumx2a); + xx_vtrnq_s64(&accumx3b, &accumx3a); + accumx0a += accumx1b; + accumx0a = vsraq_n_s64(accumx0a,accumx0b,28); + accumx1a = vsraq_n_s64(accumx1a,accumx0a,28); + accumx2b += accumx1a; + accumx2a += accumx3b; + accumx2a = vsraq_n_s64(accumx2a,accumx2b,28); + accumx3a = vsraq_n_s64(accumx3a,accumx2a,28); + trn_res = vtrn_s32(vmovn_s64(accumx0b), vmovn_s64(accumx0a)); + vcl[0] = trn_res.val[1] & vmask; + vch[0] = trn_res.val[0] & vmask; + trn_res = vtrn_s32(vmovn_s64(accumx2b), 
vmovn_s64(accumx2a)); + vcl[1] = trn_res.val[1] & vmask; + vch[1] = trn_res.val[0] & vmask; + carry = accumx3a; + + accumx4a = vmull_lane_s32( vbh[3], vbh[3], 0); + accumx5a = vmull_lane_s32( vbh[3], vbh[3], 1); + accumx6b = vqdmull_lane_s32( vbh[0], vbh[3], 0); + accumx7b = vqdmull_lane_s32( vbh[0], vbh[3], 1); + accumx4b = accumx4a; + accumx5b = accumx5a; + accumx4b = vqdmlal_lane_s32(accumx4b, vbh[0], vbh[2], 0); + accumx5b = vqdmlal_lane_s32(accumx5b, vbh[0], vbh[2], 1); + accumx6b = vqdmlal_lane_s32(accumx6b, vbh[1], vbh[2], 0); + accumx7b = vqdmlal_lane_s32(accumx7b, vbh[1], vbh[2], 1); + accumx4b = vmlal_lane_s32(accumx4b, vbh[1], vbh[1], 0); + accumx5b = vmlal_lane_s32(accumx5b, vbh[1], vbh[1], 1); + accumx4b = vmlal_lane_s32(accumx4b, vbl[3], vbl[3], 0); + accumx5b = vmlal_lane_s32(accumx5b, vbl[3], vbl[3], 1); + accumx6a = accumx6b; + accumx7a = accumx7b; + accumx6a = vqdmlal_lane_s32(accumx6a, vbl[0], vbl[3], 0); + accumx7a = vqdmlal_lane_s32(accumx7a, vbl[0], vbl[3], 1); + accumx4a += accumx4b; + accumx5a += accumx5b; + accumx4a = vqdmlal_lane_s32(accumx4a, vbl[0], vbl[2], 0); + accumx5a = vqdmlal_lane_s32(accumx5a, vbl[0], vbl[2], 1); + accumx6a = vqdmlal_lane_s32(accumx6a, vbl[1], vbl[2], 0); + accumx7a = vqdmlal_lane_s32(accumx7a, vbl[1], vbl[2], 1); + accumx4a = vmlal_lane_s32(accumx4a, vbl[1], vbl[1], 0); + accumx5a = vmlal_lane_s32(accumx5a, vbl[1], vbl[1], 1); + accumx4a = vmlsl_lane_s32(accumx4a, vbm[3], vbm[3], 0); + accumx5a = vmlsl_lane_s32(accumx5a, vbm[3], vbm[3], 1); + accumx6b += accumx6a; + accumx7b += accumx7a; + accumx6b = vqdmlsl_lane_s32(accumx6b, vbm[0], vbm[3], 0); + accumx7b = vqdmlsl_lane_s32(accumx7b, vbm[0], vbm[3], 1); + accumx4b += accumx4a; + accumx5b += accumx5a; + accumx4b = vqdmlsl_lane_s32(accumx4b, vbm[0], vbm[2], 0); + accumx5b = vqdmlsl_lane_s32(accumx5b, vbm[0], vbm[2], 1); + accumx4b = vmlsl_lane_s32(accumx4b, vbm[1], vbm[1], 0); + accumx5b = vmlsl_lane_s32(accumx5b, vbm[1], vbm[1], 1); + accumx6b = vqdmlsl_lane_s32(accumx6b, vbm[1], vbm[2], 0); + accumx7b = vqdmlsl_lane_s32(accumx7b, vbm[1], vbm[2], 1); + + xx_vtrnq_s64(&accumx4b, &accumx4a); + xx_vtrnq_s64(&accumx5b, &accumx5a); + xx_vtrnq_s64(&accumx6b, &accumx6a); + xx_vtrnq_s64(&accumx7b, &accumx7a); + accumx4b += carry; + accumx4a += accumx5b; + accumx4a = vsraq_n_s64(accumx4a,accumx4b,28); + accumx5a = vsraq_n_s64(accumx5a,accumx4a,28); + accumx6b += accumx5a; + accumx6a += accumx7b; + + trn_res = vtrn_s32(vmovn_s64(accumx4b), vmovn_s64(accumx4a)); + vcl[2] = trn_res.val[1] & vmask; + vch[2] = trn_res.val[0] & vmask; + accumx6a = vsraq_n_s64(accumx6a,accumx6b,28); + accumx7a = vsraq_n_s64(accumx7a,accumx6a,28); + trn_res = vtrn_s32(vmovn_s64(accumx6b), vmovn_s64(accumx6a)); + vcl[3] = trn_res.val[1] & vmask; + vch[3] = trn_res.val[0] & vmask; + + accumx7a = xx_vaddup_s64(accumx7a); + + int32x2_t t0 = vcl[0], t1 = vch[0]; + trn_res = vtrn_s32(t0,t1); + t0 = trn_res.val[0]; t1 = trn_res.val[1]; + + accumx7a = vaddw_s32(accumx7a, t0); + t0 = vmovn_s64(accumx7a) & vmask; + + accumx7a = vshrq_n_s64(accumx7a,28); + accumx7a = vaddw_s32(accumx7a, t1); + t1 = vmovn_s64(accumx7a) & vmask; + trn_res = vtrn_s32(t0,t1); + vcl[0] = trn_res.val[0]; + vch[0] = trn_res.val[1]; + accumx7a = vshrq_n_s64(accumx7a,28); + + t0 = vmovn_s64(accumx7a); + + uint32_t + c0 = vget_lane_s32(t0,0), + c1 = vget_lane_s32(t0,1); + c[2] += c0; + c[10] += c1; +} + +void +p448_mulw ( + p448_t *__restrict__ cs, + const p448_t *as, + uint64_t b +) { + const uint32_t bhi = b>>28, blo = b & ((1<<28)-1); + + const uint32_t 
*a = as->limb; + uint32_t *c = cs->limb; + + uint64_t accum0, accum8; + uint32_t mask = (1ull<<28)-1; + + int i; + + uint32_t c0, c8, n0, n8; + accum0 = widemul_32(bhi, a[15]); + accum8 = widemul_32(bhi, a[15] + a[7]); + c0 = a[0]; c8 = a[8]; + smlal(&accum0, blo, c0); + smlal(&accum8, blo, c8); + + c[0] = accum0 & mask; accum0 >>= 28; + c[8] = accum8 & mask; accum8 >>= 28; + + i=1; + { + n0 = a[i]; n8 = a[i+8]; + smlal(&accum0, bhi, c0); + smlal(&accum8, bhi, c8); + smlal(&accum0, blo, n0); + smlal(&accum8, blo, n8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + c0 = a[i]; c8 = a[i+8]; + smlal(&accum0, bhi, n0); + smlal(&accum8, bhi, n8); + smlal(&accum0, blo, c0); + smlal(&accum8, blo, c8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + n0 = a[i]; n8 = a[i+8]; + smlal(&accum0, bhi, c0); + smlal(&accum8, bhi, c8); + smlal(&accum0, blo, n0); + smlal(&accum8, blo, n8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + c0 = a[i]; c8 = a[i+8]; + smlal(&accum0, bhi, n0); + smlal(&accum8, bhi, n8); + smlal(&accum0, blo, c0); + smlal(&accum8, blo, c8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + n0 = a[i]; n8 = a[i+8]; + smlal(&accum0, bhi, c0); + smlal(&accum8, bhi, c8); + smlal(&accum0, blo, n0); + smlal(&accum8, blo, n8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + c0 = a[i]; c8 = a[i+8]; + smlal(&accum0, bhi, n0); + smlal(&accum8, bhi, n8); + smlal(&accum0, blo, c0); + smlal(&accum8, blo, c8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + n0 = a[i]; n8 = a[i+8]; + smlal(&accum0, bhi, c0); + smlal(&accum8, bhi, c8); + smlal(&accum0, blo, n0); + smlal(&accum8, blo, n8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + + accum0 += accum8 + c[8]; + c[8] = accum0 & mask; + c[9] += accum0 >> 28; + + accum8 += c[0]; + c[0] = accum8 & mask; + c[1] += accum8 >> 28; +} + +void +p448_strong_reduce ( + p448_t *a +) { + word_t mask = (1ull<<28)-1; + + /* first, clear high */ + a->limb[8] += a->limb[15]>>28; + a->limb[0] += a->limb[15]>>28; + a->limb[15] &= mask; + + /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */ + + /* compute total_value - p. No need to reduce mod p. */ + + dsword_t scarry = 0; + int i; + for (i=0; i<16; i++) { + scarry = scarry + a->limb[i] - ((i==8)?mask-1:mask); + a->limb[i] = scarry & mask; + scarry >>= 28; + } + + /* uncommon case: it was >= p, so now scarry = 0 and this = x + * common case: it was < p, so now scarry = -1 and this = x - p + 2^448 + * so let's add back in p. will carry back off the top for 2^448. 
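+     * concretely, with 28-bit limbs p = 2^448 - 2^224 - 1 is (mask, mask, ...,
+     * mask-1, ..., mask) with the mask-1 sitting in limb 8, which is why the
+     * subtraction above used (i==8)?mask-1:mask and why the masked add-back
+     * below adds scarry_mask to every limb except limb 8, which gets
+     * scarry_mask & ~1.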
+ */ + + assert(is_zero(scarry) | is_zero(scarry+1)); + + word_t scarry_mask = scarry & mask; + dword_t carry = 0; + + /* add it back */ + for (i=0; i<16; i++) { + carry = carry + a->limb[i] + ((i==8)?(scarry_mask&~1):scarry_mask); + a->limb[i] = carry & mask; + carry >>= 28; + } + + assert(is_zero(carry + scarry)); +} + +mask_t +p448_is_zero ( + const struct p448_t *a +) { + struct p448_t b; + p448_copy(&b,a); + p448_strong_reduce(&b); + + uint32_t any = 0; + int i; + for (i=0; i<16; i++) { + any |= b.limb[i]; + } + return is_zero(any); +} + +void +p448_serialize ( + uint8_t *serial, + const struct p448_t *x +) { + int i,j; + p448_t red; + p448_copy(&red, x); + p448_strong_reduce(&red); + for (i=0; i<8; i++) { + uint64_t limb = red.limb[2*i] + (((uint64_t)red.limb[2*i+1])<<28); + for (j=0; j<7; j++) { + serial[7*i+j] = limb; + limb >>= 8; + } + assert(limb == 0); + } +} + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +) { + int i,j; + for (i=0; i<8; i++) { + uint64_t out = 0; + for (j=0; j<7; j++) { + out |= ((uint64_t)serial[7*i+j])<<(8*j); + } + x->limb[2*i] = out & ((1ull<<28)-1); + x->limb[2*i+1] = out >> 28; + } + + /* Check for reduction. + * + * The idea is to create a variable ge which is all ones (rather, 56 ones) + * if and only if the low $i$ words of $x$ are >= those of p. + * + * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) + */ + uint32_t ge = -1, mask = (1ull<<28)-1; + for (i=0; i<8; i++) { + ge &= x->limb[i]; + } + + /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ + ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask); + + /* Propagate the rest */ + for (i=9; i<16; i++) { + ge &= x->limb[i]; + } + + return ~is_zero(ge ^ mask); +} + +void +simultaneous_invert_p448( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +) { + if (n==0) { + return; + } else if (n==1) { + p448_inverse(out,in); + return; + } + + p448_copy(&out[1], &in[0]); + int i; + for (i=1; i<(int) (n-1); i++) { + p448_mul(&out[i+1], &out[i], &in[i]); + } + p448_mul(&out[0], &out[n-1], &in[n-1]); + + struct p448_t tmp; + p448_inverse(&tmp, &out[0]); + p448_copy(&out[0], &tmp); + + /* at this point, out[0] = product(in[i]) ^ -1 + * out[i] = product(in[0]..in[i-1]) if i != 0 + */ + for (i=n-1; i>0; i--) { + p448_mul(&tmp, &out[i], &out[0]); + p448_copy(&out[i], &tmp); + + p448_mul(&tmp, &out[0], &in[i]); + p448_copy(&out[0], &tmp); + } +} diff --git a/src/arch_neon/p448.h b/src/arch_neon/p448.h new file mode 100644 index 0000000..94dacd7 --- /dev/null +++ b/src/arch_neon/p448.h @@ -0,0 +1,378 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ +#ifndef __P448_H__ +#define __P448_H__ 1 + +#include "word.h" + +#include +#include + +typedef struct p448_t { + uint32_t limb[16]; +} __attribute__((aligned(32))) p448_t; + +#ifdef __cplusplus +extern "C" { +#endif + +static __inline__ void +p448_set_ui ( + p448_t *out, + uint64_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t do_swap +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_add ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_neg ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_neg ( + p448_t *a, + mask_t doNegate +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_addw ( + p448_t *a, + uint32_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_subw ( + p448_t *a, + uint32_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_copy ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_weak_reduce ( + p448_t *inout +) __attribute__((unused,always_inline)); + +void +p448_strong_reduce ( + p448_t *inout +); + +mask_t +p448_is_zero ( + const p448_t *in +); + +static __inline__ void +p448_bias ( + p448_t *inout, + int amount +) __attribute__((unused,always_inline)); + +void +p448_mul ( + p448_t *__restrict__ out, + const p448_t *a, + const p448_t *b +); + +void +p448_mulw ( + p448_t *__restrict__ out, + const p448_t *a, + uint64_t b +); + +void +p448_sqr ( + p448_t *__restrict__ out, + const p448_t *a +); + +static __inline__ void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) __attribute__((unused,always_inline)); + +void +p448_serialize ( + uint8_t *serial, + const struct p448_t *x +); + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +); + +static __inline__ void +p448_mask( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) __attribute__((unused,always_inline)); + +/** +* Returns 1/x. +* +* If x=0, returns 0. 
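+*
+* (Implementation note, not part of the contract: field inversion here is
+* typically a constant-time exponentiation chain, e.g. x^(p-2) mod p or an
+* inverse-square-root ladder built from p448_sqrn and p448_mul; either way
+* 0^k = 0, which is why the x=0 convention costs nothing extra.)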
+*/ +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +); + +void +simultaneous_invert_p448 ( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +); + +static inline mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) __attribute__((always_inline,unused)); + +/* -------------- Inline functions begin here -------------- */ + +void +p448_set_ui ( + p448_t *out, + uint64_t x +) { + int i; + out->limb[0] = x & ((1<<28)-1); + out->limb[1] = x>>28; + for (i=2; i<16; i++) { + out->limb[i] = 0; + } +} + +void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t doswap +) { + big_register_t *aa = (big_register_t*)a; + big_register_t *bb = (big_register_t*)b; + big_register_t m = br_set_to_mask(doswap); + + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] + b->limb[i]; + } + */ +} + +void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] - b->limb[i]; + } + */ +} + +void +p448_neg ( + p448_t *out, + const p448_t *a +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = -a->limb[i]; + } + */ +} + +void +p448_cond_neg( + p448_t *a, + mask_t doNegate +) { + unsigned int i; + struct p448_t negated; + big_register_t *aa = (big_register_t *)a; + big_register_t *nn = (big_register_t*)&negated; + big_register_t m = br_set_to_mask(doNegate); + + p448_neg(&negated, a); + p448_bias(&negated, 2); + + for (i=0; ilimb[0] += x; +} + +void +p448_subw ( + p448_t *a, + uint32_t x +) { + a->limb[0] -= x; +} + +void +p448_copy ( + p448_t *out, + const p448_t *a +) { + *out = *a; +} + +void +p448_bias ( + p448_t *a, + int amt +) { + uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt; + uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1}; + uint32x4_t *aa = (uint32x4_t*) a; + aa[0] += lo; + aa[1] += lo; + aa[2] += hi; + aa[3] += lo; +} + +void +p448_weak_reduce ( + p448_t *a +) { + uint64_t mask = (1ull<<28) - 1; + uint64_t tmp = a->limb[15] >> 28; + int i; + a->limb[8] += tmp; + for (i=15; i>0; i--) { + a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); + } + a->limb[0] = (a->limb[0] & mask) + tmp; +} + +void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) { + p448_t tmp; + assert(n>0); + if (n&1) { + p448_sqr(y,x); + n--; + } else { + p448_sqr(&tmp,x); + p448_sqr(y,&tmp); + n-=2; + } + for (; n; n-=2) { + p448_sqr(&tmp,y); + p448_sqr(y,&tmp); + } +} + +mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) { + struct p448_t ra, rb; + p448_copy(&ra, a); + p448_copy(&rb, b); + p448_weak_reduce(&ra); + p448_weak_reduce(&rb); + p448_sub(&ra, &ra, &rb); + p448_bias(&ra, 2); + return p448_is_zero(&ra); +} + +void +p448_mask ( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + a->limb[i] = b->limb[i] & mask; + } +} + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __P448_H__ */ diff --git a/src/arch_x86_64/p448.c b/src/arch_x86_64/p448.c index 7a37195..4abc788 100644 --- a/src/arch_x86_64/p448.c +++ b/src/arch_x86_64/p448.c @@ -17,13 +17,14 @@ p448_mul ( __uint128_t accum0 = 0, accum1 = 0, accum2; uint64_t mask = (1ull<<56) - 1; - uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))); + uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))), bbb[4] __attribute__((aligned(32))); /* For some reason clang doesn't vectorize this without prompting? 
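      * (the loop below just precomputes the folded operand arrays aa[], bb[]
      * and, with this change, bbb[], which the Karatsuba-style schedule for
      * multiplying mod 2^448 - 2^224 - 1 keeps reusing; keeping them as small
      * fixed-size aligned arrays is what lets clang emit plain vector adds)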
*/ unsigned int i; for (i=0; i>= 56; accum1 >>= 56; - accum2 = widemul(&aa[2],&bb[3]); - msb(&accum0, &a[2], &b[3]); - mac(&accum1, &a[6], &b[7]); + accum2 = widemul(&a[2],&b[7]); + mac(&accum0, &a[6], &bb[3]); + mac(&accum1, &aa[2], &bbb[3]); - mac(&accum2, &aa[3], &bb[2]); - msb(&accum0, &a[3], &b[2]); - mac(&accum1, &a[7], &b[6]); + mac(&accum2, &a[3], &b[6]); + mac(&accum0, &a[7], &bb[2]); + mac(&accum1, &aa[3], &bbb[2]); - accum1 += accum2; - accum0 += accum2; - - accum2 = widemul(&a[0],&b[1]); + mac(&accum2, &a[0],&b[1]); mac(&accum1, &aa[0], &bb[1]); mac(&accum0, &a[4], &b[5]); @@ -109,14 +107,11 @@ p448_mul ( accum0 >>= 56; accum1 >>= 56; - accum2 = widemul(&aa[3],&bb[3]); - msb(&accum0, &a[3], &b[3]); - mac(&accum1, &a[7], &b[7]); - - accum1 += accum2; - accum0 += accum2; + accum2 = widemul(&a[3],&b[7]); + mac(&accum0, &a[7], &bb[3]); + mac(&accum1, &aa[3], &bbb[3]); - accum2 = widemul(&a[0],&b[2]); + mac(&accum2, &a[0],&b[2]); mac(&accum1, &aa[0], &bb[2]); mac(&accum0, &a[4], &b[6]); @@ -186,11 +181,9 @@ p448_mulw ( c[3] = accum0 & mask; accum0 >>= 56; c[7] = accum4 & mask; accum4 >>= 56; - c[4] += accum0 + accum4; - c[0] += accum4; + // c[4] += accum0 + accum4; + // c[0] += accum4; - /* - * TODO: double-check that this is not necessary. accum0 += accum4 + c[4]; c[4] = accum0 & mask; c[5] += accum0 >> 56; @@ -198,7 +191,6 @@ p448_mulw ( accum4 += c[0]; c[0] = accum4 & mask; c[1] += accum4 >> 56; - */ } void diff --git a/src/arch_x86_64/p448.h b/src/arch_x86_64/p448.h index b0b4dc0..2e2fbed 100644 --- a/src/arch_x86_64/p448.h +++ b/src/arch_x86_64/p448.h @@ -290,7 +290,10 @@ p448_copy ( p448_t *out, const p448_t *a ) { - *out = *a; + unsigned int i; + for (i=0; ilimb[i] += (i==4) ? co2 : co1; + } +#endif } void diff --git a/src/exported.sym b/src/exported.sym deleted file mode 100644 index e26f3db..0000000 --- a/src/exported.sym +++ /dev/null @@ -1,6 +0,0 @@ -_goldilocks_init -_goldilocks_keygen -_goldilocks_shared_secret -_goldilocks_sign -_goldilocks_verify -_goldilocks_private_to_public diff --git a/src/goldilocks.c b/src/goldilocks.c index f178d7a..4314e46 100644 --- a/src/goldilocks.c +++ b/src/goldilocks.c @@ -32,7 +32,10 @@ #define GOLDILOCKS_RANDOM_RESEEDS_MANDATORY 0 #endif -/* FUTURE: auto */ +#define GOLDI_FIELD_WORDS ((GOLDI_FIELD_BITS+WORD_BITS-1)/(WORD_BITS)) +#define GOLDI_DIVERSIFY_BYTES 8 + +/* FUTURE: auto. MAGIC */ const struct affine_t goldilocks_base_point = { {{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7), U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa), @@ -42,11 +45,12 @@ const struct affine_t goldilocks_base_point = { {{ 19 }} }; +/* These are just unique identifiers */ static const char *G_INITING = "initializing"; static const char *G_INITED = "initialized"; static const char *G_FAILED = "failed to initialize"; -/* FUTURE: auto */ +/* FUTURE: auto. 
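That is, presumably, generate this constant (and the base point above) from
the curve definition at build time instead of hand-maintaining it; until then
it is tagged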
MAGIC */ static const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = { U64LE(0xdc873d6d54a7bb0d), U64LE(0xde933d8d723a70aa), @@ -54,19 +58,45 @@ static const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = { 0x8335dc16 }; const struct barrett_prime_t goldi_q448 = { - 448/WORD_BITS, + GOLDI_FIELD_WORDS, 62 % WORD_BITS, sizeof(goldi_q448_lo)/sizeof(goldi_q448_lo[0]), goldi_q448_lo }; -/* FUTURE: auto */ +/* MAGIC */ +static const struct p448_t +sqrt_d_minus_1 = {{ + U58LE(0xd2e21836749f46), + U58LE(0x888db42b4f0179), + U58LE(0x5a189aabdeea38), + U58LE(0x51e65ca6f14c06), + U58LE(0xa49f7b424d9770), + U58LE(0xdcac4628c5f656), + U58LE(0x49443b8748734a), + U58LE(0x12fec0c0b25b7a) +}}; + +struct goldilocks_precomputed_public_key_t { + struct goldilocks_public_key_t pub; + struct fixed_base_table_t table; +}; + +#ifndef USE_BIG_TABLES +#if __ARM_NEON__ +#define USE_BIG_TABLES 1 +#else +#define USE_BIG_TABLES (WORD_BITS==64) +#endif +#endif + +/* FUTURE: auto. MAGIC */ struct { const char * volatile state; #if GOLDILOCKS_USE_PTHREAD pthread_mutex_t mutex; #endif - struct tw_niels_t combs[(WORD_BITS==64) ? 80 : 64]; + struct tw_niels_t combs[USE_BIG_TABLES ? 80 : 64]; struct fixed_base_table_t fixed_base; struct tw_niels_t wnafs[32]; struct crandom_state_t rand; @@ -107,7 +137,7 @@ goldilocks_init () { /* Precompute the tables. */ mask_t succ; - int big = (WORD_BITS==64); + int big = USE_BIG_TABLES; uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14; succ = precompute_fixed_base(&goldilocks_global.fixed_base, &text, n, t, s, goldilocks_global.combs); @@ -135,55 +165,77 @@ fail: return -1; } -static const struct p448_t -sqrt_d_minus_1 = {{ - U58LE(0xd2e21836749f46), - U58LE(0x888db42b4f0179), - U58LE(0x5a189aabdeea38), - U58LE(0x51e65ca6f14c06), - U58LE(0xa49f7b424d9770), - U58LE(0xdcac4628c5f656), - U58LE(0x49443b8748734a), - U58LE(0x12fec0c0b25b7a) -}}; - int -goldilocks_keygen ( +goldilocks_derive_private_key ( struct goldilocks_private_key_t *privkey, - struct goldilocks_public_key_t *pubkey + const unsigned char proto[GOLDI_SYMKEY_BYTES] ) { if (!goldilocks_check_init()) { return GOLDI_EUNINIT; } - word_t sk[448*2/WORD_BITS]; + memcpy(&privkey->opaque[2*GOLDI_FIELD_BYTES], proto, GOLDI_SYMKEY_BYTES); + + unsigned char skb[SHA512_OUTPUT_BYTES]; + word_t sk[GOLDI_FIELD_WORDS]; + assert(sizeof(skb) >= sizeof(sk)); + struct sha512_ctx_t ctx; struct tw_extensible_t exta; struct p448_t pk; + + sha512_init(&ctx); + sha512_update(&ctx, (const unsigned char *)"derivepk", GOLDI_DIVERSIFY_BYTES); + sha512_update(&ctx, proto, GOLDI_SYMKEY_BYTES); + sha512_final(&ctx, (unsigned char *)skb); + + barrett_deserialize_and_reduce(sk, skb, SHA512_OUTPUT_BYTES, &goldi_q448); + barrett_serialize(privkey->opaque, sk, GOLDI_FIELD_BYTES); + + scalarmul_fixed_base(&exta, sk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base); + untwist_and_double_and_serialize(&pk, &exta); + + p448_serialize(&privkey->opaque[GOLDI_FIELD_BYTES], &pk); + + return GOLDI_EOK; +} + +void +goldilocks_underive_private_key ( + unsigned char proto[GOLDI_SYMKEY_BYTES], + const struct goldilocks_private_key_t *privkey +) { + memcpy(proto, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES); +} + +int +goldilocks_keygen ( + struct goldilocks_private_key_t *privkey, + struct goldilocks_public_key_t *pubkey +) { + if (!goldilocks_check_init()) { + return GOLDI_EUNINIT; + } + + unsigned char proto[GOLDI_SYMKEY_BYTES]; #if GOLDILOCKS_USE_PTHREAD int ml_ret = pthread_mutex_lock(&goldilocks_global.mutex); if (ml_ret) return 
ml_ret; #endif - int ret = crandom_generate(&goldilocks_global.rand, (unsigned char *)sk, sizeof(sk)); - int ret2 = crandom_generate(&goldilocks_global.rand, &privkey->opaque[112], 32); - if (!ret) ret = ret2; + int ret = crandom_generate(&goldilocks_global.rand, proto, sizeof(proto)); #if GOLDILOCKS_USE_PTHREAD ml_ret = pthread_mutex_unlock(&goldilocks_global.mutex); if (ml_ret) abort(); #endif - barrett_reduce(sk,sizeof(sk)/sizeof(sk[0]),0,&goldi_q448); - barrett_serialize(privkey->opaque, sk, 448/8); - - scalarmul_fixed_base(&exta, sk, 448, &goldilocks_global.fixed_base); - //transfer_and_serialize_qtor(&pk, &sqrt_d_minus_1, &exta); - untwist_and_double_and_serialize(&pk, &exta); + int ret2 = goldilocks_derive_private_key(privkey, proto); + if (!ret) ret = ret2; - p448_serialize(pubkey->opaque, &pk); - memcpy(&privkey->opaque[56], pubkey->opaque, 56); + ret2 = goldilocks_private_to_public(pubkey, privkey); + if (!ret) ret = ret2; return ret ? GOLDI_ENODICE : GOLDI_EOK; } @@ -194,7 +246,7 @@ goldilocks_private_to_public ( const struct goldilocks_private_key_t *privkey ) { struct p448_t pk; - mask_t msucc = p448_deserialize(&pk,&privkey->opaque[56]); + mask_t msucc = p448_deserialize(&pk,&privkey->opaque[GOLDI_FIELD_BYTES]); if (msucc) { p448_serialize(pubkey->opaque, &pk); @@ -204,30 +256,46 @@ goldilocks_private_to_public ( } } -int -goldilocks_shared_secret ( - uint8_t shared[64], +static int +goldilocks_shared_secret_core ( + uint8_t shared[GOLDI_SHARED_SECRET_BYTES], const struct goldilocks_private_key_t *my_privkey, - const struct goldilocks_public_key_t *your_pubkey + const struct goldilocks_public_key_t *your_pubkey, + const struct goldilocks_precomputed_public_key_t *pre ) { /* This function doesn't actually need anything in goldilocks_global, * so it doesn't check init. */ - word_t sk[448/WORD_BITS]; + assert(GOLDI_SHARED_SECRET_BYTES == SHA512_OUTPUT_BYTES); + + word_t sk[GOLDI_FIELD_WORDS]; struct p448_t pk; mask_t succ = p448_deserialize(&pk,your_pubkey->opaque), msucc = -1; #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS struct p448_t sum, prod; - msucc &= p448_deserialize(&sum,&my_privkey->opaque[56]); + msucc &= p448_deserialize(&sum,&my_privkey->opaque[GOLDI_FIELD_BYTES]); p448_mul(&prod,&pk,&sum); p448_add(&sum,&pk,&sum); #endif msucc &= barrett_deserialize(sk,my_privkey->opaque,&goldi_q448); - succ &= montgomery_ladder(&pk,&pk,sk,446,2); + +#if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS + if (pre) { + struct tw_extensible_t tw; + succ &= scalarmul_fixed_base(&tw, sk, GOLDI_SCALAR_BITS, &pre->table); + untwist_and_double_and_serialize(&pk, &tw); + } else { + succ &= montgomery_ladder(&pk,&pk,sk,GOLDI_SCALAR_BITS,1); + } +#else + (void)pre; + succ &= montgomery_ladder(&pk,&pk,sk,GOLDI_SCALAR_BITS,1); +#endif + p448_serialize(shared,&pk); @@ -236,28 +304,28 @@ goldilocks_shared_secret ( sha512_init(&ctx); #ifdef EXPERIMENT_ECDH_OBLITERATE_CT - uint8_t oblit[40]; + uint8_t oblit[GOLDI_DIVERSIFY_BYTES + GOLDI_SYMKEY_BYTES]; unsigned i; - for (i=0; i<8; i++) { + for (i=0; iopaque[112+i] & ~(succ&msucc); + for (i=0; iopaque[2*GOLDI_FIELD_BYTES+i] & ~(succ&msucc); } - sha512_update(&ctx, oblit, 40); + sha512_update(&ctx, oblit, sizeof(oblit)); #endif #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS /* stir in the sum and product of the pubkeys. 
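      * Both quantities are symmetric in the two public keys, so the two sides
      * of the exchange hash exactly the same bytes without having to agree on
      * a key ordering, and the derived secret ends up bound to both public
      * keys.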
*/ - uint8_t a_pk[56]; + uint8_t a_pk[GOLDI_FIELD_BYTES]; p448_serialize(a_pk, &sum); - sha512_update(&ctx, a_pk, 56); + sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES); p448_serialize(a_pk, &prod); - sha512_update(&ctx, a_pk, 56); + sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES); #endif /* stir in the shared key and finish */ - sha512_update(&ctx, shared, 56); + sha512_update(&ctx, shared, GOLDI_FIELD_BYTES); sha512_final(&ctx, shared); return (GOLDI_ECORRUPT & ~msucc) @@ -265,9 +333,42 @@ goldilocks_shared_secret ( | (GOLDI_EOK & msucc & succ); } +int +goldilocks_shared_secret ( + uint8_t shared[GOLDI_SHARED_SECRET_BYTES], + const struct goldilocks_private_key_t *my_privkey, + const struct goldilocks_public_key_t *your_pubkey +) { + return goldilocks_shared_secret_core( + shared, + my_privkey, + your_pubkey, + NULL + ); +} + +static void +goldilocks_derive_challenge( + word_t challenge[GOLDI_FIELD_WORDS], + const unsigned char pubkey[GOLDI_FIELD_BYTES], + const unsigned char gnonce[GOLDI_FIELD_BYTES], + const unsigned char *message, + uint64_t message_len +) { + /* challenge = H(pk, [nonceG], message). */ + unsigned char sha_out[SHA512_OUTPUT_BYTES]; + struct sha512_ctx_t ctx; + sha512_init(&ctx); + sha512_update(&ctx, pubkey, GOLDI_FIELD_BYTES); + sha512_update(&ctx, gnonce, GOLDI_FIELD_BYTES); + sha512_update(&ctx, message, message_len); + sha512_final(&ctx, sha_out); + barrett_deserialize_and_reduce(challenge, sha_out, sizeof(sha_out), &goldi_q448); +} + int goldilocks_sign ( - uint8_t signature_out[56*2], + uint8_t signature_out[GOLDI_SIGNATURE_BYTES], const uint8_t *message, uint64_t message_len, const struct goldilocks_private_key_t *privkey @@ -277,7 +378,7 @@ goldilocks_sign ( } /* challenge = H(pk, [nonceG], message). */ - word_t skw[448/WORD_BITS]; + word_t skw[GOLDI_FIELD_WORDS]; mask_t succ = barrett_deserialize(skw,privkey->opaque,&goldi_q448); if (!succ) { memset(skw,0,sizeof(skw)); @@ -285,48 +386,50 @@ goldilocks_sign ( } /* Derive a nonce. TODO: use HMAC. FUTURE: factor. 
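      * (As committed, the nonce is SHA-512("signonce" || symkey || message || symkey)
      * reduced mod q; the TODO presumably means moving to the more standard
      * derandomized shape, roughly
      *     nonce = HMAC-SHA512(key = symkey, msg = message) mod q,
      * which this file does not implement yet.)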
*/ - unsigned char sha_out[512/8]; - word_t tk[448/WORD_BITS]; + unsigned char sha_out[SHA512_OUTPUT_BYTES]; + word_t tk[GOLDI_FIELD_WORDS]; struct sha512_ctx_t ctx; sha512_init(&ctx); sha512_update(&ctx, (const unsigned char *)"signonce", 8); - sha512_update(&ctx, &privkey->opaque[112], 32); + sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES); sha512_update(&ctx, message, message_len); - sha512_update(&ctx, &privkey->opaque[112], 32); + sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES); sha512_final(&ctx, sha_out); - barrett_deserialize_and_reduce(tk, sha_out, 512/8, &goldi_q448); + barrett_deserialize_and_reduce(tk, sha_out, SHA512_OUTPUT_BYTES, &goldi_q448); /* 4[nonce]G */ - uint8_t signature_tmp[56]; + uint8_t signature_tmp[GOLDI_FIELD_BYTES]; struct tw_extensible_t exta; struct p448_t gsk; - scalarmul_fixed_base(&exta, tk, 448, &goldilocks_global.fixed_base); + scalarmul_fixed_base(&exta, tk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base); double_tw_extensible(&exta); untwist_and_double_and_serialize(&gsk, &exta); p448_serialize(signature_tmp, &gsk); - word_t challenge[448/WORD_BITS]; - sha512_update(&ctx, &privkey->opaque[56], 56); - sha512_update(&ctx, signature_tmp, 56); - sha512_update(&ctx, message, message_len); - sha512_final(&ctx, sha_out); - barrett_deserialize_and_reduce(challenge, sha_out, 512/8, &goldi_q448); + word_t challenge[GOLDI_FIELD_WORDS]; + goldilocks_derive_challenge ( + challenge, + &privkey->opaque[GOLDI_FIELD_BYTES], + signature_tmp, + message, + message_len + ); // reduce challenge and sub. - barrett_negate(challenge,448/WORD_BITS,&goldi_q448); + barrett_negate(challenge,GOLDI_FIELD_WORDS,&goldi_q448); barrett_mac( - tk,448/WORD_BITS, - challenge,448/WORD_BITS, - skw,448/WORD_BITS, + tk,GOLDI_FIELD_WORDS, + challenge,GOLDI_FIELD_WORDS, + skw,GOLDI_FIELD_WORDS, &goldi_q448 ); - word_t carry = add_nr_ext_packed(tk,tk,448/WORD_BITS,tk,448/WORD_BITS,-1); - barrett_reduce(tk,448/WORD_BITS,carry,&goldi_q448); + word_t carry = add_nr_ext_packed(tk,tk,GOLDI_FIELD_WORDS,tk,GOLDI_FIELD_WORDS,-1); + barrett_reduce(tk,GOLDI_FIELD_WORDS,carry,&goldi_q448); - memcpy(signature_out, signature_tmp, 56); - barrett_serialize(signature_out+56, tk, 448/8); + memcpy(signature_out, signature_tmp, GOLDI_FIELD_BYTES); + barrett_serialize(signature_out+GOLDI_FIELD_BYTES, tk, GOLDI_FIELD_BYTES); memset((unsigned char *)tk,0,sizeof(tk)); memset((unsigned char *)skw,0,sizeof(skw)); memset((unsigned char *)challenge,0,sizeof(challenge)); @@ -342,7 +445,7 @@ goldilocks_sign ( int goldilocks_verify ( - const uint8_t signature[56*2], + const uint8_t signature[GOLDI_SIGNATURE_BYTES], const uint8_t *message, uint64_t message_len, const struct goldilocks_public_key_t *pubkey @@ -352,24 +455,16 @@ goldilocks_verify ( } struct p448_t pk; - word_t s[448/WORD_BITS]; + word_t s[GOLDI_FIELD_WORDS]; mask_t succ = p448_deserialize(&pk,pubkey->opaque); if (!succ) return GOLDI_EINVAL; - succ = barrett_deserialize(s, &signature[56], &goldi_q448); + succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &goldi_q448); if (!succ) return GOLDI_EINVAL; - /* challenge = H(pk, [nonceG], message). 
*/ - unsigned char sha_out[512/8]; - word_t challenge[448/WORD_BITS]; - struct sha512_ctx_t ctx; - sha512_init(&ctx); - sha512_update(&ctx, pubkey->opaque, 56); - sha512_update(&ctx, signature, 56); - sha512_update(&ctx, message, message_len); - sha512_final(&ctx, sha_out); - barrett_deserialize_and_reduce(challenge, sha_out, 512/8, &goldi_q448); + word_t challenge[GOLDI_FIELD_WORDS]; + goldilocks_derive_challenge(challenge, pubkey->opaque, signature, message, message_len); struct p448_t eph; struct tw_extensible_t pk_text; @@ -381,7 +476,102 @@ goldilocks_verify ( succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk); if (!succ) return GOLDI_EINVAL; - linear_combo_var_fixed_vt( &pk_text, challenge, 446, s, 446, goldilocks_global.wnafs, 5 ); + linear_combo_var_fixed_vt( &pk_text, + challenge, GOLDI_SCALAR_BITS, + s, GOLDI_SCALAR_BITS, + goldilocks_global.wnafs, 5 ); + + untwist_and_double_and_serialize( &pk, &pk_text ); + p448_sub(&eph, &eph, &pk); + p448_bias(&eph, 2); + + succ = p448_is_zero(&eph); + + return succ ? 0 : GOLDI_EINVAL; +} + +#if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS + +struct goldilocks_precomputed_public_key_t * +goldilocks_precompute_public_key ( + const struct goldilocks_public_key_t *pub +) { + struct goldilocks_precomputed_public_key_t *precom; + precom = (struct goldilocks_precomputed_public_key_t *) + malloc(sizeof(*precom)); + + if (!precom) return NULL; + + struct tw_extensible_t pk_text; + + struct p448_t pk; + mask_t succ = p448_deserialize(&pk, pub->opaque); + if (!succ) { + free(precom); + return NULL; + } + + succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk); + if (!succ) { + free(precom); + return NULL; + } + + int big = USE_BIG_TABLES; + uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14; + + succ = precompute_fixed_base(&precom->table, &pk_text, n, t, s, NULL); + if (!succ) { + free(precom); + return NULL; + } + + memcpy(&precom->pub,pub,sizeof(*pub)); + + return precom; +} + +void +goldilocks_destroy_precomputed_public_key ( + struct goldilocks_precomputed_public_key_t *precom +) { + if (!precom) return; + destroy_fixed_base(&precom->table); + memset(&precom->pub.opaque, 0, sizeof(precom->pub)); + free(precom); +} + +int +goldilocks_verify_precomputed ( + const uint8_t signature[GOLDI_SIGNATURE_BYTES], + const uint8_t *message, + uint64_t message_len, + const struct goldilocks_precomputed_public_key_t *pubkey +) { + if (!goldilocks_check_init()) { + return GOLDI_EUNINIT; + } + + word_t s[GOLDI_FIELD_WORDS]; + mask_t succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &goldi_q448); + if (!succ) return GOLDI_EINVAL; + + word_t challenge[GOLDI_FIELD_WORDS]; + goldilocks_derive_challenge(challenge, pubkey->pub.opaque, signature, message, message_len); + + struct p448_t eph, pk; + struct tw_extensible_t pk_text; + + /* deserialize [nonce]G */ + succ = p448_deserialize(&eph, signature); + if (!succ) return GOLDI_EINVAL; + + succ = linear_combo_combs_vt ( + &pk_text, + challenge, GOLDI_SCALAR_BITS, &pubkey->table, + s, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base + ); + if (!succ) return GOLDI_EINVAL; untwist_and_double_and_serialize( &pk, &pk_text ); p448_sub(&eph, &eph, &pk); @@ -391,3 +581,20 @@ goldilocks_verify ( return succ ? 
0 : GOLDI_EINVAL; } + +int +goldilocks_shared_secret_precomputed ( + uint8_t shared[GOLDI_SHARED_SECRET_BYTES], + const struct goldilocks_private_key_t *my_privkey, + const struct goldilocks_precomputed_public_key_t *your_pubkey +) { + return goldilocks_shared_secret_core( + shared, + my_privkey, + &your_pubkey->pub, + your_pubkey + ); +} + +#endif // GOLDI_IMPLEMENT_PRECOMPUTED_KEYS + diff --git a/src/include/intrinsics.h b/src/include/intrinsics.h index 02a8a1e..1dac686 100644 --- a/src/include/intrinsics.h +++ b/src/include/intrinsics.h @@ -12,7 +12,9 @@ #include +#if __i386__ || __x86_64__ #include +#endif #define INTRINSIC \ static __inline__ __attribute__((__gnu_inline__, __always_inline__, unused)) diff --git a/src/include/scalarmul.h b/src/include/scalarmul.h index 122fccc..8b42fd7 100644 --- a/src/include/scalarmul.h +++ b/src/include/scalarmul.h @@ -26,7 +26,7 @@ struct fixed_base_table_t { struct tw_niels_t *table; /** Adjustments to the scalar in even and odd cases, respectively. */ - word_t scalar_adjustments[2*(448/WORD_BITS)]; + word_t scalar_adjustments[2*(448/WORD_BITS)]; /* MAGIC */ /** The number of combs in the table. */ unsigned int n; @@ -103,7 +103,7 @@ montgomery_ladder ( void scalarmul ( struct tw_extensible_t *working, - const word_t scalar[448/WORD_BITS] + const word_t scalar[448/WORD_BITS] /* MAGIC */ /* TODO? int nbits */ ); @@ -124,7 +124,7 @@ scalarmul ( void scalarmul_vlook ( struct tw_extensible_t *working, - const word_t scalar[448/WORD_BITS] + const word_t scalar[448/WORD_BITS] /* MAGIC */ /* TODO? int nbits */ ); @@ -209,7 +209,7 @@ scalarmul_fixed_base ( void scalarmul_vt ( struct tw_extensible_t *working, - const word_t scalar[448/WORD_BITS] + const word_t scalar[448/WORD_BITS] /* MAGIC */ ); @@ -274,14 +274,42 @@ scalarmul_fixed_base_wnaf_vt ( void linear_combo_var_fixed_vt ( struct tw_extensible_t *working, - const word_t scalar_var[448/WORD_BITS], + const word_t scalar_var[448/WORD_BITS], /* MAGIC */ unsigned int nbits_var, - const word_t scalar_pre[448/WORD_BITS], + const word_t scalar_pre[448/WORD_BITS], /* MAGIC */ unsigned int nbits_pre, const struct tw_niels_t *precmp, unsigned int table_bits_pre ); +/** + * Variable-time scalar linear combination of two fixed points. + * + * @warning This function takes variable time. It is intended for + * signature verification. + * + * @param [out] working The output point. + * @param [in] scalar1 The first scalar. + * @param [in] nbits1 The number of bits in the first scalar. + * @param [in] table1 The first precomputed table. + * @param [in] scalar2 The second scalar. + * @param [in] nbits1 The number of bits in the second scalar. + * @param [in] table1 The second precomputed table. + * + * @retval MASK_SUCCESS Success. + * @retval MASK_FAILURE Failure, because eg the tables are too small. + */ +mask_t +linear_combo_combs_vt ( + struct tw_extensible_t *out, + const word_t scalar1[448/WORD_BITS], + unsigned int nbits1, + const struct fixed_base_table_t *table1, + const word_t scalar2[448/WORD_BITS], + unsigned int nbits2, + const struct fixed_base_table_t *table2 +); + #ifdef __cplusplus }; #endif diff --git a/src/include/sha512.h b/src/include/sha512.h index cad1588..760e31e 100644 --- a/src/include/sha512.h +++ b/src/include/sha512.h @@ -10,6 +10,8 @@ extern "C" { #endif +#define SHA512_OUTPUT_BYTES 64 + /** * SHA512 hashing context. 
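 *
 * Typical use, matching how goldilocks.c drives it (a sketch; msg/msg_len
 * stand in for whatever is being hashed):
 *
 *   struct sha512_ctx_t ctx;
 *   uint8_t digest[SHA512_OUTPUT_BYTES];
 *   sha512_init(&ctx);
 *   sha512_update(&ctx, msg, msg_len);
 *   sha512_final(&ctx, digest);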
* @@ -37,7 +39,7 @@ sha512_update ( void sha512_final ( struct sha512_ctx_t *ctx, - uint8_t result[64] + uint8_t result[SHA512_OUTPUT_BYTES] ); #ifdef __cplusplus diff --git a/src/include/word.h b/src/include/word.h index 0fc7427..c638785 100644 --- a/src/include/word.h +++ b/src/include/word.h @@ -8,11 +8,22 @@ /* for posix_memalign */ #define _XOPEN_SOURCE 600 +#ifndef __APPLE__ +#define _BSD_SOURCE +#include +#endif + #include #include #include #include +#if __ARM_NEON__ +#include +#elif __SSE2__ +#include +#endif + #if (__SIZEOF_INT128__ == 16 && __SIZEOF_SIZE_T__ == 8 && (__SIZEOF_LONG__==8 || __POINTER_WIDTH__==64) && !GOLDI_FORCE_32_BIT) /* It's a 64-bit machine if: * // limits.h thinks so @@ -33,6 +44,7 @@ typedef __int128_t dsword_t; #define PRIxWORD58 "%014" PRIx64 #define U64LE(x) x##ull #define U58LE(x) x##ull +#define letohWORD letoh64 #else typedef uint16_t hword_t; typedef uint32_t word_t; @@ -45,15 +57,19 @@ typedef int64_t dsword_t; #define PRIxWORD58 "%07" PRIx32 #define U64LE(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32 #define U58LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28 +#define letohWORD letoh32 #endif #define WORD_BITS (sizeof(word_t) * 8) -/* TODO: vector width for procs like ARM; gcc support */ -typedef word_t mask_t, vecmask_t __attribute__((ext_vector_type(4))); - +typedef word_t mask_t; static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -1; + + +#ifdef __ARM_NEON__ +typedef uint32x4_t vecmask_t; +#else /* FIXME this only works on clang */ typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2))); typedef int64_t int64x2_t __attribute__((ext_vector_type(2))); @@ -61,8 +77,13 @@ typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4))); typedef int64_t int64x4_t __attribute__((ext_vector_type(4))); typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4))); typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); +typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2))); +typedef int32_t int32x2_t __attribute__((ext_vector_type(2))); typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8))); typedef int32_t int32x8_t __attribute__((ext_vector_type(8))); +/* TODO: vector width for procs like ARM; gcc support */ +typedef word_t vecmask_t __attribute__((ext_vector_type(4))); +#endif #if __AVX2__ typedef uint32x8_t big_register_t; @@ -82,11 +103,28 @@ typedef uint32_t big_register_t; #endif -#if __AVX2__ || __SSE2__ || __ARM_NEON__ +#ifdef __ARM_NEON__ +static __inline__ big_register_t +br_set_to_mask(mask_t x) { + return vdupq_n_u32(x); +} +#else +static __inline__ big_register_t +br_set_to_mask(mask_t x) { + return (big_register_t)x; +} +#endif + +#if __AVX2__ || __SSE2__ static __inline__ big_register_t br_is_zero(big_register_t x) { return (big_register_t)(x == (big_register_t)0); } +#elif __ARM_NEON__ +static __inline__ big_register_t +br_is_zero(big_register_t x) { + return vceqq_u32(x,x^x); +} #else static __inline__ mask_t br_is_zero(word_t x) { @@ -96,6 +134,22 @@ br_is_zero(word_t x) { + +#ifdef __APPLE__ +static inline uint64_t +htobe64 (uint64_t x) { + __asm__ ("bswapq %0" : "+r"(x)); + return x; +} +static inline uint64_t +htole64 (uint64_t x) { return x; } + +static inline uint64_t +letoh64 (uint64_t x) { return x; } +#endif + + + /** * Allocate memory which is sufficiently aligned to be used for the * largest vector on the system (for now that's a big_register_t). 
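 * A minimal sketch of the shape such a helper can take (names here are
 * illustrative, not the library's; it leans on the posix_memalign that the
 * _XOPEN_SOURCE define at the top of this header asks for):
 *
 *   static inline void *vector_aligned_malloc(size_t size) {
 *       size_t align = sizeof(big_register_t);
 *       if (align < sizeof(void *)) align = sizeof(void *); /* posix_memalign minimum */
 *       void *out = NULL;
 *       if (posix_memalign(&out, align, size)) return NULL;
 *       return out;
 *   }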
diff --git a/src/scalarmul.c b/src/scalarmul.c index 1ad856c..89891db 100644 --- a/src/scalarmul.c +++ b/src/scalarmul.c @@ -63,14 +63,14 @@ cond_negate_tw_pniels ( cond_negate_tw_niels(&n->n, doNegate); } -void +static __inline__ void constant_time_lookup_tw_pniels ( struct tw_pniels_t *out, const struct tw_pniels_t *in, int nin, int idx ) { - big_register_t big_one = 1, big_i = idx; + big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx); big_register_t *o = (big_register_t *)out; const big_register_t *i = (const big_register_t *)in; int j; @@ -85,14 +85,14 @@ constant_time_lookup_tw_pniels ( } } -static __inline__ void +static __inline__ void constant_time_lookup_tw_niels ( struct tw_niels_t *out, const struct tw_niels_t *in, int nin, int idx ) { - big_register_t big_one = 1, big_i = idx; + big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx); big_register_t *o = (big_register_t *)out; const big_register_t *i = (const big_register_t *)in; int j; @@ -139,64 +139,73 @@ scalarmul ( struct tw_extensible_t *working, const word_t scalar[448/WORD_BITS] ) { - - const int nbits=448; /* HACK? */ + const int nbits=450; /* MAGIC */ word_t prepared_data[448*2/WORD_BITS] = { - U64LE(0x9595b847fdf73126), - U64LE(0x9bb9b8a856af5200), - U64LE(0xb3136e22f37d5c4f), - U64LE(0x0000000189a19442), + + U64LE(0xebec9967f5d3f5c2), + U64LE(0x0aa09b49b16c9a02), + U64LE(0x7f6126aec172cd8e), + U64LE(0x00000007b027e54d), U64LE(0x0000000000000000), U64LE(0x0000000000000000), U64LE(0x4000000000000000), - - U64LE(0x721cf5b5529eec33), - U64LE(0x7a4cf635c8e9c2ab), - U64LE(0xeec492d944a725bf), - U64LE(0x000000020cd77058), + + U64LE(0xc873d6d54a7bb0cf), + U64LE(0xe933d8d723a70aad), + U64LE(0xbb124b65129c96fd), + U64LE(0x00000008335dc163), U64LE(0x0000000000000000), U64LE(0x0000000000000000), U64LE(0x0000000000000000) - }; /* TODO: split off */ + }; /* MAGIC */ word_t scalar2[448/WORD_BITS]; convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS); + + const int WINDOW = 5, /* MAGIC */ + WINDOW_MASK = (1<> 1, + NTABLE = 1<<(WINDOW-1); struct tw_extensible_t tabulator; copy_tw_extensible(&tabulator, working); double_tw_extensible(&tabulator); - struct tw_pniels_t pn, multiples[8]; + struct tw_pniels_t pn, multiples[NTABLE]; convert_tw_extensible_to_tw_pniels(&pn, &tabulator); convert_tw_extensible_to_tw_pniels(&multiples[0], working); - int i; - for (i=1; i<8; i++) { + int i,j; + for (i=1; i> (i%WORD_BITS) & 0xF, - inv = (bits>>3)-1; + i = nbits - WINDOW; + int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & WINDOW_MASK, + inv = (bits>>(WINDOW-1))-1; bits ^= inv; - constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7); + constant_time_lookup_tw_pniels(&pn, multiples, NTABLE, bits & WINDOW_T_MASK); cond_negate_tw_pniels(&pn, inv); convert_tw_pniels_to_tw_extensible(working, &pn); - for (i-=4; i>=0; i-=4) { - double_tw_extensible(working); - double_tw_extensible(working); - double_tw_extensible(working); - double_tw_extensible(working); + for (i-=WINDOW; i>=0; i-=WINDOW) { + for (j=0; j> (i%WORD_BITS) & 0xF; - inv = (bits>>3)-1; + bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS); + + if (i/WORD_BITS < 448/WORD_BITS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) { + bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS)); + } + + bits &= WINDOW_MASK; + inv = (bits>>(WINDOW-1))-1; bits ^= inv; - constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7); + constant_time_lookup_tw_pniels(&pn, multiples, NTABLE, bits & WINDOW_T_MASK); cond_negate_tw_pniels(&pn, inv); 
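+        /* pn now holds the signed window digit times the base point: the top
+         * window bit was the sign (inv is all-ones when it is clear, which
+         * complemented the index bits and drove the branch-free negation
+         * above); accumulate the digit into the running point. */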
add_tw_pniels_to_tw_extensible(working, &pn); } @@ -207,81 +216,89 @@ scalarmul_vlook ( struct tw_extensible_t *working, const word_t scalar[448/WORD_BITS] ) { - - const int nbits=448; /* HACK? */ + const int nbits=450; /* HACK? */ word_t prepared_data[448*2/WORD_BITS] = { - U64LE(0x9595b847fdf73126), - U64LE(0x9bb9b8a856af5200), - U64LE(0xb3136e22f37d5c4f), - U64LE(0x0000000189a19442), + + U64LE(0xebec9967f5d3f5c2), + U64LE(0x0aa09b49b16c9a02), + U64LE(0x7f6126aec172cd8e), + U64LE(0x00000007b027e54d), U64LE(0x0000000000000000), U64LE(0x0000000000000000), U64LE(0x4000000000000000), - - U64LE(0x721cf5b5529eec33), - U64LE(0x7a4cf635c8e9c2ab), - U64LE(0xeec492d944a725bf), - U64LE(0x000000020cd77058), + + U64LE(0xc873d6d54a7bb0cf), + U64LE(0xe933d8d723a70aad), + U64LE(0xbb124b65129c96fd), + U64LE(0x00000008335dc163), U64LE(0x0000000000000000), U64LE(0x0000000000000000), U64LE(0x0000000000000000) - }; /* TODO: split off */ + }; /* MAGIC: split off */ word_t scalar2[448/WORD_BITS]; convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS); + + const int WINDOW = 5, /* MAGIC */ + WINDOW_MASK = (1<> 1, + NTABLE = 1<<(WINDOW-1); struct tw_extensible_t tabulator; copy_tw_extensible(&tabulator, working); double_tw_extensible(&tabulator); - struct tw_pniels_t pn, multiples[8]; + struct tw_pniels_t pn, multiples[NTABLE]; convert_tw_extensible_to_tw_pniels(&pn, &tabulator); convert_tw_extensible_to_tw_pniels(&multiples[0], working); - int i; - for (i=1; i<8; i++) { + int i,j; + for (i=1; i> (i%WORD_BITS) & 0xF, - inv = (bits>>3)-1; + i = nbits - WINDOW; + int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & WINDOW_MASK, + inv = (bits>>(WINDOW-1))-1; bits ^= inv; - copy_tw_pniels(&pn, &multiples[bits&7]); + copy_tw_pniels(&pn, &multiples[bits & WINDOW_T_MASK]); cond_negate_tw_pniels(&pn, inv); convert_tw_pniels_to_tw_extensible(working, &pn); - for (i-=4; i>=0; i-=4) { - double_tw_extensible(working); - double_tw_extensible(working); - double_tw_extensible(working); - double_tw_extensible(working); + for (i-=WINDOW; i>=0; i-=WINDOW) { + for (j=0; j> (i%WORD_BITS) & 0xF; - inv = (bits>>3)-1; + bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS); + + if (i/WORD_BITS < 448/WORD_BITS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) { + bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS)); + } + + bits &= WINDOW_MASK; + inv = (bits>>(WINDOW-1))-1; bits ^= inv; - copy_tw_pniels(&pn, &multiples[bits&7]); + copy_tw_pniels(&pn, &multiples[bits & WINDOW_T_MASK]); cond_negate_tw_pniels(&pn, inv); add_tw_pniels_to_tw_extensible(working, &pn); } } - -mask_t -scalarmul_fixed_base ( - struct tw_extensible_t *out, - const word_t scalar[448/WORD_BITS], +static mask_t +schedule_scalar_for_combs ( + word_t *scalar2, + const word_t *scalar, unsigned int nbits, const struct fixed_base_table_t *table ) { + unsigned int i; unsigned int n = table->n, t = table->t, s = table->s; - assert(n >= 1 && t >= 1 && s >= 1); - if (n*t*s < nbits) { + if (n*t*s < nbits || n < 1 || t < 1 || s < 1) { return MASK_FAILURE; } @@ -289,10 +306,9 @@ scalarmul_fixed_base ( scalar2_words = scalar_words; if (scalar2_words < 448 / WORD_BITS) scalar2_words = 448 / WORD_BITS; - word_t scalar2[scalar2_words], scalar3[scalar2_words]; + word_t scalar3[scalar2_words]; /* Copy scalar to scalar3, but clear its high bits (if there are any) */ - unsigned int i,j,k; for (i=0; iscalar_adjustments , 448 / WORD_BITS ); + return MASK_SUCCESS; +} + +mask_t +scalarmul_fixed_base ( + struct tw_extensible_t *out, + const word_t scalar[448/WORD_BITS], 
+ unsigned int nbits, + const struct fixed_base_table_t *table +) { + unsigned int i,j,k; + unsigned int n = table->n, t = table->t, s = table->s; + + unsigned int scalar2_words = (nbits + WORD_BITS - 1)/WORD_BITS; + if (scalar2_words < 448 / WORD_BITS) scalar2_words = 448 / WORD_BITS; + + word_t scalar2[scalar2_words]; + + mask_t succ = schedule_scalar_for_combs(scalar2, scalar, nbits, table); + if (!succ) return MASK_FAILURE; + +#ifdef __clang_analyzer__ + assert(t >= 1); +#endif + struct tw_niels_t ni; for (i=0; is, s2 = table2->s, smax = (s1 > s2) ? s1 : s2; + + unsigned int scalar1b_words = (nbits1 + WORD_BITS - 1)/WORD_BITS; + if (scalar1b_words < 448 / WORD_BITS) scalar1b_words = 448 / WORD_BITS; + + unsigned int scalar2b_words = (nbits2 + WORD_BITS - 1)/WORD_BITS; + if (scalar2b_words < 448 / WORD_BITS) scalar2b_words = 448 / WORD_BITS; + + word_t scalar1b[scalar1b_words], scalar2b[scalar2b_words]; + + /* Schedule the scalars */ + mask_t succ; + succ = schedule_scalar_for_combs(scalar1b, scalar1, nbits1, table1); + if (!succ) return MASK_FAILURE; + + succ = schedule_scalar_for_combs(scalar2b, scalar2, nbits2, table2); + if (!succ) return MASK_FAILURE; + +#ifdef __clang_analyzer__ + assert(table1->t >= 1); + assert(table2->t >= 1); +#endif + + struct tw_niels_t ni; + + unsigned int swords[2] = {scalar1b_words, scalar2b_words}, started = 0; + word_t *scalars[2] = {scalar1b,scalar2b}; + + for (i=0; is; + if (ii < 0) continue; + assert(ii < (int)table->s); + + for (j=0; jn; j++) { + + int tab = 0; + + for (k=0; kt; k++) { + unsigned int bit = (table->s-1-ii) + k*table->s + j*(table->s*table->t); + if (bit < swords[sc] * WORD_BITS) { + tab |= (scalars[sc][bit/WORD_BITS] >> (bit%WORD_BITS) & 1) << k; + } + } + + mask_t invert = (tab>>(table->t-1))-1; + tab ^= invert; + tab &= (1<<(table->t-1)) - 1; + + copy_tw_niels(&ni, &table->table[tab + (j<<(table->t-1))]); + cond_negate_tw_niels(&ni,invert); + + if (started) { + add_tw_niels_to_tw_extensible(out, &ni); + } else { + convert_tw_niels_to_tw_extensible(out, &ni); + started = 1; + } + + } + } + + assert(started); + } + + return MASK_SUCCESS; +} + + mask_t precompute_fixed_base ( struct fixed_base_table_t *out, @@ -354,7 +479,7 @@ precompute_fixed_base ( unsigned int s, struct tw_niels_t *prealloc ) { - if (s < 1 || t < 1 || n < 1 || n*t*s < 446) { + if (s < 1 || t < 1 || n < 1 || n*t*s < 446) { /* MAGIC */ memset(out, 0, sizeof(*out)); return 0; } @@ -402,7 +527,7 @@ precompute_fixed_base ( adjustment[(n*t*s) / WORD_BITS] += ((word_t)1) << ((n*t*s) % WORD_BITS); - /* FIXME: factor out somehow */ + /* MAGIC: factor out somehow */ const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = { U64LE(0xdc873d6d54a7bb0d), U64LE(0xde933d8d723a70aa), @@ -462,13 +587,13 @@ precompute_fixed_base ( /* Gray-code phase */ for (j=0;; j++) { int gray = j ^ (j>>1); - int idx = ((i+1)<<(t-1))-1 ^ gray; + int idx = (((i+1)<<(t-1))-1) ^ gray; convert_tw_extensible_to_tw_pniels(&pn_tmp, &start); copy_tw_niels(&table[idx], &pn_tmp.n); p448_copy(&zs[idx], &pn_tmp.z); - if (j >= (1<<(t-1)) - 1) break; + if (j >= (1u<<(t-1)) - 1) break; int delta = (j+1) ^ ((j+1)>>1) ^ gray; for (k=0; delta>1; k++) @@ -777,7 +902,7 @@ linear_combo_var_fixed_vt( const struct tw_niels_t *precmp, unsigned int table_bits_pre ) { - const int table_bits_var = 3; + const int table_bits_var = 4; struct smvt_control control_var[nbits_var/(table_bits_var+1)+3]; struct smvt_control control_pre[nbits_pre/(table_bits_pre+1)+3]; diff --git a/src/sha512.c b/src/sha512.c index dd1468b..3e46287 
100644 --- a/src/sha512.c +++ b/src/sha512.c @@ -2,12 +2,8 @@ * Copyright (c) 2014 Cryptography Research, Inc. * Released under the MIT License. See LICENSE.txt for license information. */ -#ifndef __APPLE__ -#define _BSD_SOURCE -#include -#endif - #include "sha512.h" +#include "word.h" #include #include @@ -20,14 +16,6 @@ rotate_r ( return (x >> d) | (x << (64-d)); } -#ifdef __APPLE__ -static inline uint64_t -htobe64 (uint64_t x) { - __asm__ ("bswapq %0" : "+r"(x)); - return x; -} -#endif - static const uint64_t sha512_init_state[8] = { 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, diff --git a/test/bench.c b/test/bench.c index b54488f..2e90e9a 100644 --- a/test/bench.c +++ b/test/bench.c @@ -17,23 +17,28 @@ #include "goldilocks.h" #include "sha512.h" -double now() { +static __inline__ void +ignore_result ( int result ) { + (void)result; +} + +static double now() { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec + tv.tv_usec/1000000.0; } -void p448_randomize( struct crandom_state_t *crand, struct p448_t *a ) { +static void p448_randomize( struct crandom_state_t *crand, struct p448_t *a ) { crandom_generate(crand, (unsigned char *)a, sizeof(*a)); p448_strong_reduce(a); } -void q448_randomize( struct crandom_state_t *crand, word_t sk[448/WORD_BITS] ) { +static void q448_randomize( struct crandom_state_t *crand, word_t sk[448/WORD_BITS] ) { crandom_generate(crand, (unsigned char *)sk, 448/8); } -void p448_print( const char *descr, const struct p448_t *a ) { +static void p448_print( const char *descr, const struct p448_t *a ) { p448_t b; p448_copy(&b, a); p448_strong_reduce(&b); @@ -45,17 +50,21 @@ void p448_print( const char *descr, const struct p448_t *a ) { printf("\n"); } -void p448_print_full( const char *descr, const struct p448_t *a ) { +static void __attribute__((unused)) +p448_print_full ( + const char *descr, + const struct p448_t *a +) { int j; printf("%s = 0x", descr); for (j=15; j>=0; j--) { printf("%02" PRIxWORD "_" PRIxWORD58 " ", - a->limb[j]>>28, a->limb[j]&(1<<28)-1); + a->limb[j]>>28, a->limb[j]&((1<<28)-1)); } printf("\n"); } -void q448_print( const char *descr, const word_t secret[448/WORD_BITS] ) { +static void q448_print( const char *descr, const word_t secret[448/WORD_BITS] ) { int j; printf("%s = 0x", descr); for (j=448/WORD_BITS-1; j>=0; j--) { @@ -295,7 +304,7 @@ int main(int argc, char **argv) { when = now(); for (i=0; i +#include +#include + +mpz_t mp_p448; + +static mask_t mpz_to_p448 ( + struct p448_t *out, + const mpz_t in +) { + uint8_t ser[56]; + mpz_t modded; + memset(ser,0,sizeof(ser)); + mpz_init(modded); + mpz_mod(modded, in, mp_p448); + mpz_export(ser, NULL, -1, 1, -1, 0, modded); + mask_t succ = p448_deserialize(out, ser); + return succ; +} + +static mask_t p448_assert_eq_gmp( + const char *descr, + const struct p448_t *x, + const mpz_t y, + float lowBound, + float highBound +) { + uint8_t xser[56], yser[56]; + mpz_t modded; + + memset(yser,0,sizeof(yser)); + + p448_serialize(xser, x); + + mpz_init(modded); + mpz_mod(modded, y, mp_p448); + mpz_export(yser, NULL, -1, 1, -1, 0, modded); + + unsigned int i; + for (i=0; ilimb[0]); i++) { + int bits = sizeof(x->limb[0]) * 448 / sizeof(*x); + word_t yardstick = (i==sizeof(*x)/sizeof(x->limb[0])/2) ? 
+ (1ull<limb[i] < yardstick * lowBound || x->limb[i] > yardstick * highBound) { + youfail(); + printf(" P448 limb %d -> " PRIxWORDfull " is out of bounds (%0.2f, %0.2f) for test %s (yardstick = " PRIxWORDfull ")\n", + i, x->limb[i], lowBound, highBound, descr, yardstick); + break; + } + } + + if (memcmp(xser,yser,56)) { + youfail(); + printf(" Failed arithmetic test %s\n", descr); + p448_print(" p448", x); + printf(" gmp = 0x"); + int j; + for (j=55; j>=0; j--) { + printf("%02x", yser[j]); + } + printf("\n"); + return MASK_FAILURE; + } + + mpz_clear(modded); + return MASK_SUCCESS; +} + +static mask_t test_add_sub ( + const mpz_t x, + const mpz_t y, + word_t word +) { + struct p448_t xx,yy,tt; + mpz_t t; + mask_t succ = MASK_SUCCESS; + succ = mpz_to_p448(&xx,x); + succ &= mpz_to_p448(&yy,y); + mpz_init(t); + + p448_add(&tt,&xx,&yy); + mpz_add(t,x,y); + succ &= p448_assert_eq_gmp("add",&tt,t,0,2.1); + + p448_sub(&tt,&xx,&yy); + p448_bias(&tt,2); + mpz_sub(t,x,y); + succ &= p448_assert_eq_gmp("sub",&tt,t,0,3.1); + + p448_copy(&tt,&xx); + p448_addw(&tt,word); + mpz_add_ui(t,x,word); + succ &= p448_assert_eq_gmp("addw",&tt,t,0,2.1); + + p448_copy(&tt,&xx); + p448_subw(&tt,word); + p448_bias(&tt,1); + mpz_sub_ui(t,x,word); + succ &= p448_assert_eq_gmp("subw",&tt,t,0,2.1); + + if (!succ) { + p448_print(" x", &xx); + p448_print(" y", &yy); + } + + mpz_clear(t); + + return succ; +} + +static mask_t test_mul_sqr ( + const mpz_t x, + const mpz_t y, + word_t word +) { + struct p448_t xx,yy,tt; + mpz_t t; + mask_t succ = MASK_SUCCESS; + succ = mpz_to_p448(&xx,x); + succ &= mpz_to_p448(&yy,y); + mpz_init(t); + + p448_mul(&tt,&xx,&yy); + mpz_mul(t,x,y); + succ &= p448_assert_eq_gmp("mul",&tt,t,0,1.1); + + p448_mulw(&tt,&xx,word); + mpz_mul_ui(t,x,word); + succ &= p448_assert_eq_gmp("mulw",&tt,t,0,1.1); + + p448_sqr(&tt,&xx); + mpz_mul(t,x,x); + succ &= p448_assert_eq_gmp("sqrx",&tt,t,0,1.1); + + p448_sqr(&tt,&yy); + mpz_mul(t,y,y); + succ &= p448_assert_eq_gmp("sqy",&tt,t,0,1.1); + + if (!succ) { + p448_print(" x", &xx); + p448_print(" y", &yy); + } + + mpz_clear(t); + + return succ; +} + +int test_arithmetic () { + int j, ntests = 100000; + + gmp_randstate_t state; + gmp_randinit_mt(state); + + uint8_t pser[56]; + for (j=0; j<56; j++) { + pser[j] = (j==28) ? 
0xFE : 0xFF; + } + mpz_init(mp_p448); + mpz_import(mp_p448, 56, -1, 1, -1, 0, pser); + + mpz_t x,y; + mpz_init(x); + mpz_init(y); + + mask_t succ = MASK_SUCCESS; + + int bits = sizeof(word_t) * 448 / sizeof(p448_t); + + for (j=0; j +#include +#include + +int test_goldilocks () { + const char *message1 = "hello world"; + const char *message2 = "Jello world"; + + unsigned char signature[GOLDI_SIGNATURE_BYTES]; + + unsigned char + ss12[GOLDI_SHARED_SECRET_BYTES], + ss21[GOLDI_SHARED_SECRET_BYTES], + ss21p[GOLDI_SHARED_SECRET_BYTES], + proto[GOLDI_SYMKEY_BYTES]; + + struct goldilocks_public_key_t pub, pub2; + struct goldilocks_private_key_t priv, priv2; + struct goldilocks_precomputed_public_key_t *pre = NULL; + + int i, ret, good = 1; + + ret = goldilocks_init(); + if (ret) { + youfail(); + printf(" Failed init.\n"); + } + + for (i=0; i<1000 && good; i++) { + + ret = goldilocks_keygen(&priv, &pub); + if (ret) { + youfail(); + printf(" Failed keygen trial %d.\n", i); + good = 0; + } + + goldilocks_destroy_precomputed_public_key( pre ); + pre = goldilocks_precompute_public_key ( &pub ); + if (!pre) { + youfail(); + printf(" Failed precomp-public trial %d.\n", i); + return -1; + } + + ret = goldilocks_sign( + signature, + (const unsigned char *)message1, + strlen(message1), + &priv + ); + if (ret) { + youfail(); + printf(" Failed sign trial %d.\n", i); + good = 0; + } + + ret = goldilocks_verify( + signature, + (const unsigned char *)message1, + strlen(message1), + &pub + ); + if (ret) { + youfail(); + printf(" Failed verify trial %d.\n", i); + good = 0; + } + + ret = goldilocks_verify_precomputed ( + signature, + (const unsigned char *)message1, + strlen(message1), + pre + ); + if (ret) { + youfail(); + printf(" Failed verify-pre trial %d.\n", i); + good = 0; + } + + /* terrible negative test */ + ret = goldilocks_verify( + signature, + (const unsigned char *)message2, + strlen(message1), + &pub + ); + if (ret != GOLDI_EINVAL) { + youfail(); + printf(" Failed nega-verify trial %d.\n", i); + good = 0; + } + ret = goldilocks_verify_precomputed( + signature, + (const unsigned char *)message2, + strlen(message1), + pre + ); + if (ret != GOLDI_EINVAL) { + youfail(); + printf(" Failed nega-verify-pre trial %d.\n", i); + good = 0; + } + + /* honestly a slightly better negative test */ + memset(signature,0,sizeof(signature)); + ret = goldilocks_verify( + signature, + (const unsigned char *)message1, + strlen(message1), + &pub + ); + if (ret != GOLDI_EINVAL) { + youfail(); + printf(" Failed nega-verify-0 trial %d.\n", i); + good = 0; + } + ret = goldilocks_verify_precomputed( + signature, + (const unsigned char *)message1, + strlen(message1), + pre + ); + if (ret != GOLDI_EINVAL) { + youfail(); + printf(" Failed nega-verify-pre-0 trial %d.\n", i); + good = 0; + } + + /* ecdh */ + ret = goldilocks_keygen(&priv2, &pub2); + if (ret) { + youfail(); + printf(" Failed keygen2 trial %d.\n", i); + good = 0; + } + + ret = goldilocks_shared_secret ( ss12, &priv, &pub2 ); + if (ret) { + youfail(); + printf(" Failed ss12 trial %d.\n", i); + good = 0; + } + + ret = goldilocks_shared_secret ( ss21, &priv2, &pub ); + if (ret) { + youfail(); + printf(" Failed ss21 trial %d.\n", i); + good = 0; + } + + ret = goldilocks_shared_secret_precomputed ( ss21p, &priv2, pre ); + if (ret) { + youfail(); + printf(" Failed ss21p trial %d.\n", i); + good = 0; + } + + if (memcmp(ss12,ss21,sizeof(ss12))) { + youfail(); + printf(" Failed shared-secret trial %d.\n", i); + good = 0; + } + + if (memcmp(ss21,ss21p,sizeof(ss21))) { + 
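+            /* the precomputed-key ECDH path and the Montgomery-ladder path
+             * disagreed; they are required to produce identical secrets */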
youfail(); + printf(" Failed shared-secret precomp trial %d.\n", i); + good = 0; + } + + /* test derive / underive / priv to pub */ + goldilocks_underive_private_key ( proto, &priv ); + ret = goldilocks_derive_private_key ( &priv2, proto ); + if (ret || memcmp(&priv,&priv2,sizeof(priv))) { + youfail(); + printf(" Failed derive round-trip trial %d.\n", i); + good = 0; + } + + ret = goldilocks_private_to_public ( &pub2, &priv ); + if (ret || memcmp(&pub,&pub2,sizeof(pub))) { + youfail(); + printf(" Failed private-to-public trial %d.\n", i); + good = 0; + } + + } + + goldilocks_destroy_precomputed_public_key( pre ); + + return good ? 0 : -1; +} diff --git a/test/test_scalarmul.c b/test/test_scalarmul.c index d98cfd8..d6f16c7 100644 --- a/test/test_scalarmul.c +++ b/test/test_scalarmul.c @@ -159,6 +159,92 @@ single_scalarmul_compatibility_test ( return ret; } +static int +single_linear_combo_test ( + const struct p448_t *base1, + const word_t *scalar1, + int nbits1, + const struct p448_t *base2, + const word_t *scalar2, + int nbits2 +) { + /* MAGIC */ + const struct p448_t + sqrt_d_minus_1 = {{ + U58LE(0xd2e21836749f46), + U58LE(0x888db42b4f0179), + U58LE(0x5a189aabdeea38), + U58LE(0x51e65ca6f14c06), + U58LE(0xa49f7b424d9770), + U58LE(0xdcac4628c5f656), + U58LE(0x49443b8748734a), + U58LE(0x12fec0c0b25b7a) + }}; + + struct tw_extensible_t text1, text2, working; + struct tw_pniels_t pn; + struct p448_t result_comb, result_combo, result_wnaf; + + mask_t succ = + deserialize_and_twist_approx(&text1, &sqrt_d_minus_1, base1) + & deserialize_and_twist_approx(&text2, &sqrt_d_minus_1, base2); + if (!succ) return 1; + + struct fixed_base_table_t t1, t2; + struct tw_niels_t wnaf[32]; + memset(&t1,0,sizeof(t1)); + memset(&t2,0,sizeof(t2)); + + succ = precompute_fixed_base(&t1, &text1, 5, 5, 18, NULL); + succ &= precompute_fixed_base(&t2, &text2, 6, 3, 25, NULL); + succ &= precompute_fixed_base_wnaf(wnaf, &text2, 5); + + if (!succ) { + destroy_fixed_base(&t1); + destroy_fixed_base(&t2); + return -1; + } + + /* use the dedicated wNAF linear combo algorithm */ + copy_tw_extensible(&working, &text1); + linear_combo_var_fixed_vt(&working, scalar1, nbits1, scalar2, nbits2, wnaf, 5); + untwist_and_double_and_serialize(&result_wnaf, &working); + + /* use the dedicated combs algorithm */ + succ &= linear_combo_combs_vt(&working, scalar1, nbits1, &t1, scalar2, nbits2, &t2); + untwist_and_double_and_serialize(&result_combo, &working); + + /* use two combs */ + succ &= scalarmul_fixed_base(&working, scalar1, nbits1, &t1); + convert_tw_extensible_to_tw_pniels(&pn, &working); + succ &= scalarmul_fixed_base(&working, scalar2, nbits2, &t2); + add_tw_pniels_to_tw_extensible(&working, &pn); + untwist_and_double_and_serialize(&result_comb, &working); + + mask_t consistent = MASK_SUCCESS; + consistent &= p448_eq(&result_combo, &result_wnaf); + consistent &= p448_eq(&result_comb, &result_wnaf); + + if (!succ || !consistent) { + youfail(); + printf(" Failed linear combo consistency test with nbits=%d,%d.\n",nbits1,nbits2); + + p448_print(" base1", base1); + scalar_print(" scal1", scalar1, (nbits1+WORD_BITS-1)/WORD_BITS); + p448_print(" base2", base2); + scalar_print(" scal2", scalar2, (nbits1+WORD_BITS-1)/WORD_BITS); + p448_print(" combs", &result_comb); + p448_print(" combo", &result_combo); + p448_print(" wNAFs", &result_wnaf); + return -1; + } + + destroy_fixed_base(&t1); + destroy_fixed_base(&t2); + + return 0; +} + /* 0 = succeed, 1 = inval, -1 = fail */ static int single_scalarmul_commutativity_test ( @@ -251,6 +337,49 
@@ int test_scalarmul_commutativity () { return 0; } +int test_linear_combo () { + int i,j,k,got; + + struct crandom_state_t crand; + crandom_init_from_buffer(&crand, "scalarmul_linear_combos_test RNG"); + + for (i=0; i<=448; i+=7) { + for (j=0; j<=448; j+=7) { + got = 0; + + for (k=0; k<128 && !got; k++) { + uint8_t ser[56]; + word_t scalar1[7], scalar2[7]; + crandom_generate(&crand, (uint8_t *)scalar1, sizeof(scalar1)); + crandom_generate(&crand, (uint8_t *)scalar2, sizeof(scalar2)); + + p448_t base1; + crandom_generate(&crand, ser, sizeof(ser)); + mask_t succ = p448_deserialize(&base1, ser); + if (!succ) continue; + + p448_t base2; + crandom_generate(&crand, ser, sizeof(ser)); + succ = p448_deserialize(&base2, ser); + if (!succ) continue; + + int ret = single_linear_combo_test (&base1, scalar1, i, &base2, scalar2, j); + got = !ret; + if (ret == -1) return -1; + } + + if (!got) { + youfail(); + printf(" Unlikely: rejected 128 scalars in a row.\n"); + return -1; + } + + } + } + + return 0; +} + int test_scalarmul_compatibility () { int i,j,k,got;