diff --git a/Doxyfile b/Doxyfile index 3be58a0..55b844f 100644 --- a/Doxyfile +++ b/Doxyfile @@ -508,7 +508,7 @@ HIDE_SCOPE_NAMES = NO # the files that are included by a file in the documentation of that file. # The default value is: YES. -SHOW_INCLUDE_FILES = YES +SHOW_INCLUDE_FILES = NO # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader @@ -777,7 +777,7 @@ FILE_PATTERNS = # be searched for input files as well. # The default value is: NO. -RECURSIVE = NO +RECURSIVE = YES # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a diff --git a/HISTORY.txt b/HISTORY.txt index 4d0ae13..3e5f946 100644 --- a/HISTORY.txt +++ b/HISTORY.txt @@ -1,3 +1,51 @@ +March 29, 2014: + Added a test directory with various tests. Currently testing SHA512 Monte + Carlo, compatibility of the different scalarmul functions, and some + identities on EC point ops. Began moving these tests out of benchmarker. + + Added scan-build support. + + Improved some internal interfaces. Made a structure for Barrett primes + instead of passing parameters individually. Moved some field operations + to places that make more sense, eg Barrett serialize and deserialize. The + deserialize operation now checks that its argument is in [0,q). + + Added more documentation. + + Changed the names of a bunch of functions. Still not entirely consistent, + but getting more so. + + Some minor speed improvements. For example, multiply is now a couple cycles + faster. + + Added a hackish attempt at thread-safety and initialization sanity checking + in the Goldilocks top-level routines. + + Fixed some vector alignment bugs. Compiling with -O0 should now work. + + Slightly simplified recode_wnaf. + + Add a config.h file for future configuration. EXPERIMENT flags moved here. 
+ + I've decided against major changes to SHA512 for the moment. They add speed + but also significantly bloat the code, which is going to hurt L1 cache + performance. Perhaps we should link to OpenSSL if a faster SHA512 is desired. + + Reorganize the source tree into src, test; factor arch stuff into src/arch_*. + + Make most of the code 32-bit clean. There's now a 32-bit generic and 32-bit + vectorless ARM version. No NEON version yet because I don't have a test + machine (could use my phone in a pinch I guess?). The 32-bit version still + isn't heavily optimized, but on ARM it's using a nicely reworked signed/phi-adic + multiplier. The squaring is also based on this, but could really stand some + improvement. + + When passed an even exponent (or extra doubles), the Montgomery ladder should + now accept points if and only if they lie on the curve. This needs + additional testing, but it passes the zero bit exponent test. + + On 32-bit, use 8x4x14 instead of 5x5x18 table organization. Probably there's + a better heuristic. March 5, 2014: First revision. diff --git a/Makefile b/Makefile index a1c6d6e..3e03193 100644 --- a/Makefile +++ b/Makefile @@ -2,61 +2,101 @@ # Released under the MIT License. See LICENSE.txt for license information. 
CC = clang -CFLAGS = -O3 -std=c99 -pedantic -Wall -Wextra -Werror \ - -mssse3 -maes -mavx2 -DMUST_HAVE_AVX2 -mbmi2 \ - -ffunction-sections -fdata-sections -fomit-frame-pointer -fPIC \ - -DEXPERIMENT_ECDH_OBLITERATE_CT=1 -DEXPERIMENT_ECDH_STIR_IN_PUBKEYS=1 +LD = clang -.PHONY: clean all runbench todo doc +ARCH = arch_x86_64 + +WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ + -Wgcc-compat -Wmissing-declarations +INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH) +LANGFLAGS = -std=c99 +GENFLAGS = -ffunction-sections -fdata-sections -fomit-frame-pointer -fPIC +OFLAGS = -O3 +#XFLAGS = -DN_TESTS_BASE=1000 +ARCHFLAGS = -mssse3 -maes -mavx2 -DMUST_HAVE_AVX2 -mbmi2 +#ARCHFLAGS = -m32 -mcpu=cortex-a9 -mfpu=vfpv3-d16 + +CFLAGS = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XFLAGS) +LDFLAGS = $(ARCHFLAGS) +ASFLAGS = $(ARCHFLAGS) + +.PHONY: clean all test bench todo doc lib .PRECIOUS: build/%.s - + HEADERS= Makefile $(shell find . -name "*.h") build/timestamp LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \ build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o -all: bench +TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \ + build/test_pointops.o + +BENCHCOMPONENTS=build/bench.o + +all: lib build/test build/bench + +scan: clean + scan-build --use-analyzer=`which clang` \ + -enable-checker deadcode -enable-checker llvm \ + -enable-checker osx -enable-checker security -enable-checker unix \ + make build/bench build/test build/goldilocks.so + +build/bench: $(LIBCOMPONENTS) $(BENCHCOMPONENTS) + $(LD) $(LDFLAGS) -o $@ $^ + +build/test: $(LIBCOMPONENTS) $(TESTCOMPONENTS) + $(LD) $(LDFLAGS) -o $@ $^ + +lib: build/goldilocks.so + +build/goldilocks.so: $(LIBCOMPONENTS) + rm -f $@ + libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \ + -exported_symbols_list src/exported.sym \ + $(LIBCOMPONENTS) -bench: *.h *.c - $(CC) $(CFLAGS) -o $@ *.c - build/timestamp: mkdir -p 
build touch $@ build/%.o: build/%.s - $(CC) -c -o $@ $< + $(CC) $(ASFLAGS) -c -o $@ $< -build/%.s: %.c $(HEADERS) +build/%.s: src/%.c $(HEADERS) $(CC) $(CFLAGS) -S -c -o $@ $< -build/goldilocks.so: $(LIBCOMPONENTS) - rm -f $@ - libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \ - -exported_symbols_list exported.sym \ - $(LIBCOMPONENTS) +build/%.s: test/%.c $(HEADERS) + $(CC) $(CFLAGS) -S -c -o $@ $< + +build/%.s: src/$(ARCH)/%.c $(HEADERS) + $(CC) $(CFLAGS) -S -c -o $@ $< doc/timestamp: mkdir -p doc touch $@ -doc: Doxyfile doc/timestamp *.c *.h +doc: Doxyfile doc/timestamp src/*.c src/include/*.h src/$(ARCH)/*.c src/$(ARCH)/*.h doxygen todo:: - @egrep --color=auto -w -i 'hack|todo|fixme|bug|xxx|perf|future|remove' *.h *.c + @(find * -name '*.h'; find * -name '*.c') | xargs egrep --color=auto -w \ + 'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE' @echo '=============================' @(for i in FIXME BUG XXX TODO HACK PERF FUTURE REMOVE; do \ - egrep -w -i $$i *.h *.c > /dev/null || continue; \ + (find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i > /dev/null || continue; \ /bin/echo -n $$i' ' | head -c 10; \ - egrep -w -i $$i *.h *.c | wc -l; \ + (find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i| wc -l; \ done) @echo '=============================' @echo -n 'Total ' - @egrep -w -i 'hack|todo|fixme|bug|xxx|perf|future|remove' *.h *.c | wc -l + @(find * -name '*.h'; find * -name '*.c') | xargs egrep -w \ + 'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE' | wc -l + +bench: build/bench + ./$< -runbench: bench +test: build/test ./$< clean: - rm -fr build bench *.o *.s + rm -fr build doc diff --git a/TODO.txt b/TODO.txt index fb0fc96..e1d05f2 100644 --- a/TODO.txt +++ b/TODO.txt @@ -23,7 +23,7 @@ Important work items for Ed448-Goldilocks: * Word_t, mask_t, bigregister_t, etc. * Generate asm intrinsics with a script? -* Bugfix: make sure that init() and randomization are thread-safe. 
+* [DONE] Bugfix: make sure that init() and randomization are thread-safe. * Security: check on deserialization that points are < p. * Check also that they're nonzero or otherwise non-pathological? @@ -80,30 +80,29 @@ Important work items for Ed448-Goldilocks: * Portability: make the inner layers of the code 32-bit clean. * Write new versions of the field code. * 28-bit limbs give less headroom for carries. - * NEON and vectorless ARM. + * Now have a vectorless ARM version; need NEON. + * Improve speed of 32-bit field code. * Run through the SAGE tool to generate new bias & bound. -* Portability: make the outer layers of the code 32-bit clean. - * There are endian bugs in the signing algorithm. - * NEON and vectorless constant-time comparison. +* [DONE] Portability: make the outer layers of the code 32-bit clean. -* Performance: write and incorporate some extra routines - * Deserialize_and_isogeny - * Unconditional negate (or just plain subtract) - -* Performance: fixed parameters? +* Performance/flexibility: decide which parameters should be hard-coded. * Perhaps useful for comb precomputation. * Performance: Improve SHA512. - * Improve portability. + * [DONE?] Improve portability. * Improve speed. + * Except not, because this adds too much code size. + * Link OpenSSL if a fast SHA is desired. + +* Protocol: * Decide what things to stir into hashes for various functions. * Performance: improve the Barrett field code. * Support other primes? * Capture prime shape into a struct instead of passing 3 params. - * Make 32-bit clean. (SAGE?) + * [DONE] Make 32-bit clean. * Automation: * Improve the SAGE tool to cover more cases @@ -111,6 +110,10 @@ Important work items for Ed448-Goldilocks: * Constant-time selection * Intrinsics code * Field code? + + * SAGE tool is impossibly slow on 32-bit + * Currently stuck on Elligator after 19 hours. + * [FIXED] at least for now. 
 * Vector-mul-chains * Negation "bubble pushing" optimization diff --git a/include/goldilocks.h b/include/goldilocks.h new file mode 100644 index 0000000..7476a6c --- /dev/null +++ b/include/goldilocks.h @@ -0,0 +1,210 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +/** + * @file goldilocks.h + * @author Mike Hamburg + * @brief Goldilocks high-level functions. + */ +#ifndef __GOLDILOCKS_H__ +#define __GOLDILOCKS_H__ 1 + +#include <stdint.h> + +/** + * @brief Serialized form of a Goldilocks public key. + * + * @warning This isn't even my final form! + */ +struct goldilocks_public_key_t { + uint8_t opaque[56]; /**< Serialized data. */ +}; + +/** + * @brief Serialized form of a Goldilocks private key. + * + * Contains 56 bytes of actual private key, 56 bytes of + * public key, and 32 bytes of symmetric key for randomization. + * + * @warning This isn't even my final form! + */ +struct goldilocks_private_key_t { + uint8_t opaque[144]; /**< Serialized data. */ +}; + +#ifdef __cplusplus +extern "C" { +#endif + +/** @brief No error. */ +static const int GOLDI_EOK = 0; + +/** @brief Error: your key or other state is corrupt. */ +static const int GOLDI_ECORRUPT = 44801; + +/** @brief Error: other party's key is corrupt. */ +static const int GOLDI_EINVAL = 44802; + +/** @brief Error: not enough entropy. */ +static const int GOLDI_ENODICE = 44804; + +/** @brief Error: you need to initialize the library first. */ +static const int GOLDI_EUNINIT = 44805; + +/** @brief Error: called init() but we are already initialized. */ +static const int GOLDI_EALREADYINIT = 44805; + +/** + * @brief Initialize Goldilocks' precomputed tables and + * random number generator. This function must be called before + * any of the other Goldilocks routines (except + * goldilocks_shared_secret in the current version) and should be + * called only once per process. + * + * There is currently no way to tear down this state.
It is possible + * that a future version of this library will not require this function. + * + * @retval GOLDI_EOK Success. + * @retval GOLDI_EALREADYINIT Already initialized. + * @retval GOLDI_ECORRUPT Memory is corrupted, or another thread is already init'ing. + * @retval Nonzero An error occurred. + */ +int +goldilocks_init () +__attribute__((warn_unused_result)); + + +/** + * @brief Generate a new random keypair. + * @param [out] privkey The generated private key. + * @param [out] pubkey The generated public key. + * + * @warning This isn't even my final form! + * + * @retval GOLDI_EOK Success. + * @retval GOLDI_ENODICE Insufficient entropy. + * @retval GOLDI_EUNINIT You must call goldilocks_init() first. + */ +int +goldilocks_keygen ( + struct goldilocks_private_key_t *privkey, + struct goldilocks_public_key_t *pubkey +) __attribute__((warn_unused_result,nonnull(1,2))); + +/** + * @brief Extract the public key from a private key. + * + * This is essentially a memcpy from the public part of the privkey. + * + * @param [out] pubkey The extracted public key. + * @param [in] privkey The private key. + * + * @retval GOLDI_EOK Success. + * @retval GOLDI_ECORRUPT The private key is corrupt. + */ +int +goldilocks_private_to_public ( + struct goldilocks_public_key_t *pubkey, + const struct goldilocks_private_key_t *privkey +) __attribute__((nonnull(1,2))); + +/** + * @brief Generate a Diffie-Hellman shared secret in constant time. + * + * This function uses some compile-time flags whose merit remains to + * be decided. + * + * If the flag EXPERIMENT_ECDH_OBLITERATE_CT is set, prepend 40 bytes + * of zeros to the secret before hashing. In the case that the other + * party's key is detectably corrupt, instead the symmetric part + * of the secret key is used to produce a pseudorandom value. + * + * If EXPERIMENT_ECDH_STIR_IN_PUBKEYS is set, the sum and product of + * the two parties' public keys is prepended to the hash.
+ * + * In the current version, this function can safely be run even without + * goldilocks_init(). But this property is not guaranteed for future + * versions, so call it anyway. + * + * @warning This isn't even my final form! + * + * @param [out] shared The shared secret established with the other party. + * @param [in] my_privkey My private key. + * @param [in] your_pubkey The other party's public key. + * + * @retval GOLDI_EOK Success. + * @retval GOLDI_ECORRUPT My key is corrupt. + * @retval GOLDI_EINVAL The other party's key is corrupt. + * @retval GOLDI_EUNINIT You must call goldilocks_init() first. + */ +int +goldilocks_shared_secret ( + uint8_t shared[64], + const struct goldilocks_private_key_t *my_privkey, + const struct goldilocks_public_key_t *your_pubkey +) __attribute__((warn_unused_result,nonnull(1,2,3))); + +/** + * @brief Sign a message. + * + * The signature is deterministic, using the symmetric secret found in the + * secret key to form a nonce. + * + * The technique used in signing is a modified Schnorr system, like EdDSA. + * + * @warning This isn't even my final form! + * + * @param [out] signature_out Space for the output signature. + * @param [in] message The message to be signed. + * @param [in] message_len The length of the message to be signed. + * @param [in] privkey My private key. + * + * @retval GOLDI_EOK Success. + * @retval GOLDI_ECORRUPT My key is corrupt. + * @retval GOLDI_EUNINIT You must call goldilocks_init() first. + */ +int +goldilocks_sign ( + uint8_t signature_out[56*2], + const uint8_t *message, + uint64_t message_len, + const struct goldilocks_private_key_t *privkey +) __attribute__((nonnull(1,2,4))); + +/** + * @brief Verify a signature. + * + * This function is fairly strict. It will correctly detect when + * the signature has the wrong cofactor component, or when the sig + * values aren't less than p or q. + * + * Currently this function does not detect when the public key is weird, + * eg 0, has cofactor, etc. 
As a result, a party with a bogus public + * key could create signatures that succeed on some systems and fail on + * others. + * + * @warning This isn't even my final form! + * + * @param [in] signature The signature. + * @param [in] message The message to be verified. + * @param [in] message_len The length of the message to be verified. + * @param [in] pubkey The signer's public key. + * + * @retval GOLDI_EOK Success. + * @retval GOLDI_EINVAL The public key or signature is corrupt. + * @retval GOLDI_EUNINIT You must call goldilocks_init() first. + */ +int +goldilocks_verify ( + const uint8_t signature[56*2], + const uint8_t *message, + uint64_t message_len, + const struct goldilocks_public_key_t *pubkey +) __attribute__((warn_unused_result,nonnull(1,2,4))); + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __GOLDILOCKS_H__ */ diff --git a/src/arch_32/ec_point.c b/src/arch_32/ec_point.c new file mode 100644 index 0000000..823e43d --- /dev/null +++ b/src/arch_32/ec_point.c @@ -0,0 +1,959 @@ +/** + * @cond internal + * @file ec_point.c + * @copyright + * Copyright (c) 2014 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. + * @author Mike Hamburg + * @warning This file was automatically generated. 
+ */ + +#include "ec_point.h" + + +void +p448_isr ( + struct p448_t* a, + const struct p448_t* x +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L1, x ); + p448_mul ( &L2, x, &L1 ); + p448_sqr ( &L1, &L2 ); + p448_mul ( &L2, x, &L1 ); + p448_sqrn ( &L1, &L2, 3 ); + p448_mul ( &L0, &L2, &L1 ); + p448_sqrn ( &L1, &L0, 3 ); + p448_mul ( &L0, &L2, &L1 ); + p448_sqrn ( &L2, &L0, 9 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sqr ( &L0, &L1 ); + p448_mul ( &L2, x, &L0 ); + p448_sqrn ( &L0, &L2, 18 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqrn ( &L0, &L2, 37 ); + p448_mul ( &L1, &L2, &L0 ); + p448_sqrn ( &L0, &L1, 37 ); + p448_mul ( &L1, &L2, &L0 ); + p448_sqrn ( &L0, &L1, 111 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqr ( &L0, &L2 ); + p448_mul ( &L1, x, &L0 ); + p448_sqrn ( &L0, &L1, 223 ); + p448_mul ( a, &L2, &L0 ); +} + +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +) { + struct p448_t L0, L1; + p448_isr ( &L0, x ); + p448_sqr ( &L1, &L0 ); + p448_sqr ( &L0, &L1 ); + p448_mul ( a, x, &L0 ); +} + +void +add_tw_niels_to_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &e->a, &L1 ); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->b, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_weak_reduce( &d->t ); + p448_sub ( &d->y, &d->z, &d->x ); + p448_bias ( &d->y, 2 ); + p448_weak_reduce( &d->y ); + p448_add ( &L0, &d->x, &d->z ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +sub_tw_niels_from_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &e->b, &L1 
); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->a, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_weak_reduce( &d->t ); + p448_add ( &d->y, &d->x, &d->z ); + p448_sub ( &L0, &d->z, &d->x ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +add_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + add_tw_niels_to_tw_extensible( e, &a->n ); +} + +void +sub_tw_pniels_from_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + sub_tw_niels_from_tw_extensible( e, &a->n ); +} + +void +double_tw_extensible ( + struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &a->u, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &L1, &a->t ); + p448_sub ( &a->t, &L1, &a->u ); + p448_bias ( &a->t, 3 ); + p448_weak_reduce( &a->t ); + p448_sub ( &L1, &L0, &L2 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 1 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->z, &L1, &L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +double_extensible ( + struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &L1, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &a->u, &a->t ); + p448_sub ( &a->t, &a->u, &L1 ); + p448_bias ( &a->t, 3 ); + p448_weak_reduce( &a->t ); + p448_sub ( &a->u, &L0, &L2 ); + p448_bias 
( &a->u, 2 ); + p448_weak_reduce( &a->u ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 2 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->z, &L1, &L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +twist_and_double ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &b->u, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &L0, &b->t ); + p448_sub ( &b->t, &L0, &b->u ); + p448_bias ( &b->t, 3 ); + p448_weak_reduce( &b->t ); + p448_sub ( &L0, &b->z, &b->x ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 2 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +untwist_and_double ( + struct extensible_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &L0, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &b->u, &b->t ); + p448_sub ( &b->t, &b->u, &L0 ); + p448_bias ( &b->t, 3 ); + p448_weak_reduce( &b->t ); + p448_sub ( &b->u, &b->z, &b->x ); + p448_bias ( &b->u, 2 ); + p448_weak_reduce( &b->u ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 1 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +convert_tw_affine_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_affine_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( 
&b->n.c, &a->y, &a->x ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_set_ui( &b->z, 2 ); +} + +void +convert_tw_affine_to_tw_extensible ( + struct tw_extensible_t* b, + const struct tw_affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_affine_to_extensible ( + struct extensible_t* b, + const struct affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_tw_extensible_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_extensible_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( &b->n.c, &a->u, &a->t ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_add ( &b->z, &a->z, &a->z ); + p448_weak_reduce( &b->z ); +} + +void +convert_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* d +) { + p448_add ( &e->u, &d->n.b, &d->n.a ); + p448_sub ( &e->t, &d->n.b, &d->n.a ); + p448_bias ( &e->t, 2 ); + p448_weak_reduce( &e->t ); + p448_mul ( &e->x, &d->z, &e->t ); + p448_mul ( &e->y, &d->z, &e->u ); + p448_sqr ( &e->z, &d->z ); +} + +void +convert_tw_niels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_niels_t* d +) { + p448_add ( &e->y, &d->b, &d->a ); + p448_weak_reduce( &e->y ); + p448_sub ( &e->x, &d->b, &d->a ); + p448_bias ( &e->x, 2 ); + p448_weak_reduce( &e->x ); + p448_set_ui( &e->z, 1 ); + p448_copy ( &e->t, &e->x ); + p448_copy ( &e->u, &e->y ); +} + +void +montgomery_step ( + struct montgomery_t* a +) { + struct p448_t L0, L1; + p448_add ( 
&L0, &a->zd, &a->xd ); + p448_sub ( &L1, &a->xd, &a->zd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_sub ( &a->zd, &a->xa, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_weak_reduce( &a->zd ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_add ( &a->zd, &a->za, &a->xa ); + p448_mul ( &a->za, &L1, &a->zd ); + p448_add ( &a->xa, &a->za, &a->xd ); + p448_sqr ( &a->zd, &a->xa ); + p448_mul ( &a->xa, &a->z0, &a->zd ); + p448_sub ( &a->zd, &a->xd, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_weak_reduce( &a->zd ); + p448_sqr ( &a->za, &a->zd ); + p448_sqr ( &a->xd, &L0 ); + p448_sqr ( &L0, &L1 ); + p448_mulw ( &a->zd, &a->xd, 39082 ); + p448_sub ( &L1, &a->xd, &L0 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_sub ( &L0, &a->zd, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->zd, &L0, &L1 ); +} + +void +deserialize_montgomery ( + struct montgomery_t* a, + const struct p448_t* sbz +) { + p448_sqr ( &a->z0, sbz ); + p448_set_ui( &a->xd, 1 ); + p448_set_ui( &a->zd, 0 ); + p448_set_ui( &a->xa, 1 ); + p448_copy ( &a->za, &a->z0 ); +} + +mask_t +serialize_montgomery ( + struct p448_t* b, + const struct montgomery_t* a, + const struct p448_t* sbz +) { + mask_t L0, L1, L2; + struct p448_t L3, L4, L5, L6; + p448_mul ( &L6, &a->z0, &a->zd ); + p448_sub ( &L4, &L6, &a->xd ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L6, &a->za, &L4 ); + p448_mul ( &L5, &a->z0, &a->xd ); + p448_sub ( &L4, &L5, &a->zd ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L3, &a->xa, &L4 ); + p448_add ( &L5, &L3, &L6 ); + p448_sub ( &L4, &L6, &L3 ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L6, &L4, &L5 ); + p448_copy ( &L5, &a->z0 ); + p448_addw ( &L5, 1 ); + p448_sqr ( &L4, &L5 ); + p448_mulw ( &L5, &L4, 39082 ); + p448_neg ( &L4, &L5 ); + p448_add ( &L5, &a->z0, &a->z0 ); + p448_bias ( &L5, 1 ); + p448_add ( &L3, &L5, &L5 ); + p448_add ( &L5, &L3, &L4 ); + 
p448_weak_reduce( &L5 ); + p448_mul ( &L3, &a->xd, &L5 ); + L1 = p448_is_zero( &a->zd ); + L2 = - L1; + p448_mask ( &L4, &L3, L1 ); + p448_add ( &L5, &L4, &a->zd ); + L0 = ~ L1; + p448_mul ( &L4, sbz, &L6 ); + p448_addw ( &L4, L2 ); + p448_mul ( &L6, &L5, &L4 ); + p448_mul ( &L4, &L6, &L5 ); + p448_mul ( &L5, &L6, &a->xd ); + p448_mul ( &L6, &L4, &L5 ); + p448_isr ( &L3, &L6 ); + p448_mul ( &L5, &L4, &L3 ); + p448_sqr ( &L4, &L3 ); + p448_mul ( &L3, &L6, &L4 ); + p448_mask ( b, &L5, L0 ); + p448_subw ( &L3, 1 ); + p448_bias ( &L3, 1 ); + L1 = p448_is_zero( &L3 ); + L0 = p448_is_zero( sbz ); + return L1 | L0; +} + +void +serialize_extensible ( + struct p448_t* b, + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sub ( &L0, &a->y, &a->z ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_add ( b, &a->z, &a->y ); + p448_mul ( &L1, &a->z, &a->x ); + p448_mul ( &L2, &L0, &L1 ); + p448_mul ( &L1, &L2, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( &L2, &L1, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( b, &L1, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, &L2, &L1 ); +} + +void +untwist_and_double_and_serialize ( + struct p448_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_mul ( &L3, &a->y, &a->x ); + p448_add ( b, &a->y, &a->x ); + p448_sqr ( &L1, b ); + p448_add ( &L2, &L3, &L3 ); + p448_sub ( b, &L1, &L2 ); + p448_bias ( b, 3 ); + p448_weak_reduce( b ); + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &L2 ); + p448_add ( &L2, b, b ); + p448_mulw ( b, &L2, 39082 ); + p448_neg ( &L2, b ); + p448_bias ( &L2, 2 ); + p448_mulw ( &L0, &L2, 39082 ); + p448_neg ( b, &L0 ); + p448_bias ( b, 2 ); + p448_mul ( &L0, &L2, &L1 ); + p448_mul ( &L2, b, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( &L1, b, &L0 ); + p448_sqr ( b, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( b, &L1, &L3 ); +} + +void +twist_even ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L0, L1; + p448_sqr ( &b->y, 
&a->z ); + p448_sqr ( &b->z, &a->x ); + p448_sub ( &b->u, &b->y, &b->z ); + p448_bias ( &b->u, 2 ); + p448_weak_reduce( &b->u ); + p448_sub ( &b->z, &a->z, &a->x ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->y, &b->z, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->x, &b->z, &b->y ); + p448_mul ( &b->t, &b->x, &b->u ); + p448_mul ( &b->y, &b->x, &b->t ); + p448_isr ( &b->t, &b->y ); + p448_mul ( &b->u, &b->x, &b->t ); + p448_sqr ( &b->x, &b->t ); + p448_mul ( &b->t, &b->y, &b->x ); + p448_mul ( &b->x, &a->x, &b->u ); + p448_mul ( &b->y, &a->y, &b->u ); + L1 = p448_is_zero( &b->z ); + L0 = - L1; + p448_addw ( &b->y, L0 ); + p448_weak_reduce( &b->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +void +test_only_twist ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L0, L1; + struct p448_t L2, L3; + p448_sqr ( &b->u, &a->z ); + p448_sqr ( &b->y, &a->x ); + p448_sub ( &b->z, &b->u, &b->y ); + p448_bias ( &b->z, 2 ); + p448_add ( &b->y, &b->z, &b->z ); + p448_add ( &b->u, &b->y, &b->y ); + p448_weak_reduce( &b->u ); + p448_sub ( &b->y, &a->z, &a->x ); + p448_bias ( &b->y, 2 ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->x, &b->y, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->t, &b->z, &b->x ); + p448_mul ( &L3, &b->t, &b->u ); + p448_mul ( &b->x, &b->t, &L3 ); + p448_isr ( &L2, &b->x ); + p448_mul ( &b->u, &b->t, &L2 ); + p448_sqr ( &L3, &L2 ); + p448_mul ( &b->t, &b->x, &L3 ); + p448_add ( &b->x, &a->y, &a->x ); + p448_weak_reduce( &b->x ); + p448_sub ( &L2, &a->x, &a->y ); + p448_bias ( &L2, 2 ); + p448_weak_reduce( &L2 ); + p448_mul ( &L3, &b->t, &L2 ); + p448_add ( &L2, &L3, &b->x ); + p448_sub ( &b->t, &b->x, &L3 ); + p448_bias ( &b->t, 2 ); + p448_weak_reduce( &b->t ); + p448_mul ( &b->x, &L2, &b->u ); + L0 = p448_is_zero( 
&b->y ); + L1 = - L0; + p448_addw ( &b->x, L1 ); + p448_weak_reduce( &b->x ); + p448_mul ( &b->y, &b->t, &b->u ); + L0 = p448_is_zero( &b->z ); + L1 = - L0; + p448_addw ( &b->y, L1 ); + p448_weak_reduce( &b->y ); + L1 = p448_is_zero( &a->y ); + L0 = L1 + 1; + p448_set_ui( &b->z, L0 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +mask_t +is_square ( + const struct p448_t* x +) { + mask_t L0, L1; + struct p448_t L2, L3; + p448_isr ( &L2, x ); + p448_sqr ( &L3, &L2 ); + p448_mul ( &L2, x, &L3 ); + p448_subw ( &L2, 1 ); + p448_bias ( &L2, 1 ); + L1 = p448_is_zero( &L2 ); + L0 = p448_is_zero( x ); + return L1 | L0; +} + +mask_t +is_even_pt ( + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +is_even_tw ( + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_add ( &L0, &L1, &L2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +deserialize_affine ( + struct affine_t* a, + const struct p448_t* sz +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L1, sz ); + p448_copy ( &L3, &L1 ); + p448_addw ( &L3, 1 ); + p448_sqr ( &a->x, &L3 ); + p448_mulw ( &L3, &a->x, 39082 ); + p448_neg ( &a->x, &L3 ); + p448_add ( &L3, &L1, &L1 ); + p448_bias ( &L3, 1 ); + p448_add ( &a->y, &L3, &L3 ); + p448_add ( &L3, &a->y, &a->x ); + p448_weak_reduce( &L3 ); + p448_copy ( &a->y, &L1 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &a->x, &a->y ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_mul ( &a->y, &a->x, &L3 ); + p448_sqr ( &L2, &a->x ); + p448_mul ( &L0, &L2, &a->y ); + p448_mul ( &a->y, &a->x, &L0 ); + p448_isr ( &L3, &a->y ); + p448_mul ( &a->y, &L2, &L3 ); + p448_sqr ( &L2, &L3 ); + p448_mul ( &L3, &L0, &L2 ); + p448_mul ( &L0, &a->x, &L3 ); + p448_add ( &L2, &a->y, &a->y 
); + p448_mul ( &a->x, sz, &L2 ); + p448_addw ( &L1, 1 ); + p448_mul ( &a->y, &L1, &L3 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + return p448_is_zero( &L0 ); +} + +mask_t +deserialize_and_twist_approx ( + struct tw_extensible_t* a, + const struct p448_t* sdm1, + const struct p448_t* sz +) { + struct p448_t L0, L1; + p448_sqr ( &a->z, sz ); + p448_copy ( &a->y, &a->z ); + p448_addw ( &a->y, 1 ); + p448_sqr ( &a->x, &a->y ); + p448_mulw ( &a->y, &a->x, 39082 ); + p448_neg ( &a->x, &a->y ); + p448_add ( &a->y, &a->z, &a->z ); + p448_bias ( &a->y, 1 ); + p448_add ( &a->u, &a->y, &a->y ); + p448_add ( &a->y, &a->u, &a->x ); + p448_weak_reduce( &a->y ); + p448_sqr ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &a->u, &a->x ); + p448_bias ( &a->u, 2 ); + p448_weak_reduce( &a->u ); + p448_mul ( &a->x, sdm1, &a->u ); + p448_mul ( &L0, &a->x, &a->y ); + p448_mul ( &a->t, &L0, &a->y ); + p448_mul ( &a->u, &a->x, &a->t ); + p448_mul ( &a->t, &a->u, &L0 ); + p448_mul ( &a->y, &a->x, &a->t ); + p448_isr ( &L0, &a->y ); + p448_mul ( &a->y, &a->u, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &a->u, &a->t, &L1 ); + p448_mul ( &a->t, &a->x, &a->u ); + p448_add ( &a->x, sz, sz ); + p448_mul ( &L0, &a->u, &a->x ); + p448_copy ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &L1, &a->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &a->x, &L1, &L0 ); + p448_mul ( &L0, &a->u, &a->y ); + p448_addw ( &a->z, 1 ); + p448_mul ( &a->y, &a->z, &L0 ); + p448_subw ( &a->t, 1 ); + p448_bias ( &a->t, 1 ); + mask_t ret = p448_is_zero( &a->t ); + p448_set_ui( &a->z, 1 ); + p448_copy ( &a->t, &a->x ); + p448_copy ( &a->u, &a->y ); + return ret; +} + +void +set_identity_extensible ( + struct extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_tw_extensible ( + struct tw_extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + 
p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_affine ( + struct affine_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); +} + +mask_t +eq_affine ( + const struct affine_t* a, + const struct affine_t* b +) { + mask_t L0, L1; + struct p448_t L2; + p448_sub ( &L2, &a->x, &b->x ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_sub ( &L2, &a->y, &b->y ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +eq_extensible ( + const struct extensible_t* a, + const struct extensible_t* b +) { + mask_t L0, L1; + struct p448_t L2, L3, L4; + p448_mul ( &L4, &b->z, &a->x ); + p448_mul ( &L3, &a->z, &b->x ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_mul ( &L4, &b->z, &a->y ); + p448_mul ( &L3, &a->z, &b->y ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +eq_tw_extensible ( + const struct tw_extensible_t* a, + const struct tw_extensible_t* b +) { + mask_t L0, L1; + struct p448_t L2, L3, L4; + p448_mul ( &L4, &b->z, &a->x ); + p448_mul ( &L3, &a->z, &b->x ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_mul ( &L4, &b->z, &a->y ); + p448_mul ( &L3, &a->z, &b->y ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +void +elligator_2s_inject ( + struct affine_t* a, + const struct p448_t* r +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5, L6, L7, L8, L9; + p448_sqr ( &a->x, r ); + p448_sqr ( &L3, &a->x ); + p448_copy ( &a->y, &L3 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &L9, &a->y ); + p448_bias ( &L9, 2 ); + p448_weak_reduce( &L9 ); + p448_sqr ( &L2, &L9 ); + p448_mulw ( &L8, &L2, 1527402724 ); + p448_mulw ( &L7, &L3, 6108985600 ); + p448_add ( &a->y, &L7, &L8 ); + p448_weak_reduce( &a->y ); + p448_mulw ( &L8, &L2, 6109454568 ); 
+ p448_sub ( &L7, &a->y, &L8 ); + p448_bias ( &L7, 2 ); + p448_weak_reduce( &L7 ); + p448_mulw ( &L4, &a->y, 78160 ); + p448_mul ( &L6, &L7, &L9 ); + p448_mul ( &L8, &L6, &L4 ); + p448_mul ( &L4, &L7, &L8 ); + p448_isr ( &L5, &L4 ); + p448_mul ( &L4, &L6, &L5 ); + p448_sqr ( &L6, &L5 ); + p448_mul ( &L5, &L8, &L6 ); + p448_mul ( &L8, &L7, &L5 ); + p448_mul ( &L7, &L8, &L5 ); + p448_copy ( &L5, &a->x ); + p448_subw ( &L5, 1 ); + p448_addw ( &a->x, 1 ); + p448_mul ( &L6, &a->x, &L8 ); + p448_sub ( &a->x, &L5, &L6 ); + p448_bias ( &a->x, 3 ); + p448_weak_reduce( &a->x ); + p448_mul ( &L5, &L4, &a->x ); + p448_mulw ( &L4, &L5, 78160 ); + p448_neg ( &a->x, &L4 ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_add ( &L4, &L3, &L3 ); + p448_add ( &L3, &L4, &L2 ); + p448_subw ( &L3, 2 ); + p448_bias ( &L3, 1 ); + p448_weak_reduce( &L3 ); + p448_mul ( &L2, &L3, &L8 ); + p448_mulw ( &L3, &L2, 3054649120 ); + p448_add ( &L2, &L3, &a->y ); + p448_mul ( &a->y, &L7, &L2 ); + L1 = p448_is_zero( &L9 ); + L0 = - L1; + p448_addw ( &a->y, L0 ); + p448_weak_reduce( &a->y ); +} + +mask_t +validate_affine ( + const struct affine_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L0, &a->y ); + p448_sqr ( &L2, &a->x ); + p448_add ( &L3, &L2, &L0 ); + p448_subw ( &L3, 1 ); + p448_mulw ( &L1, &L2, 39081 ); + p448_neg ( &L2, &L1 ); + p448_bias ( &L2, 2 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sub ( &L0, &L3, &L1 ); + p448_bias ( &L0, 3 ); + return p448_is_zero( &L0 ); +} + +mask_t +validate_tw_extensible ( + const struct tw_extensible_t* ext +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5; + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L2, &ext->t, &ext->u ); + p448_mul ( &L4, &ext->z, &L2 ); + p448_addw ( &L4, 0 ); + p448_mul ( &L3, &ext->x, &ext->y ); + p448_neg ( &L2, &L3 ); + p448_add ( &L3, &L2, &L4 ); + p448_bias ( &L3, 2 ); + L1 = p448_is_zero( &L3 ); + /* + * Check invariant: + * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 + */ + p448_sqr ( 
&L4, &ext->y ); + p448_neg ( &L2, &L4 ); + p448_addw ( &L2, 0 ); + p448_sqr ( &L3, &ext->x ); + p448_add ( &L4, &L3, &L2 ); + p448_sqr ( &L5, &ext->u ); + p448_sqr ( &L3, &ext->t ); + p448_mul ( &L2, &L3, &L5 ); + p448_mulw ( &L3, &L2, 39081 ); + p448_neg ( &L5, &L3 ); + p448_add ( &L3, &L5, &L4 ); + p448_neg ( &L5, &L2 ); + p448_add ( &L4, &L5, &L3 ); + p448_sqr ( &L3, &ext->z ); + p448_add ( &L2, &L3, &L4 ); + p448_bias ( &L2, 4 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +validate_extensible ( + const struct extensible_t* ext +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5; + /* + * Check invariant: + * 0 = d*t^2*u^2 - x^2 - y^2 + z^2 + */ + p448_sqr ( &L4, &ext->y ); + p448_neg ( &L3, &L4 ); + p448_addw ( &L3, 0 ); + p448_sqr ( &L2, &ext->z ); + p448_add ( &L4, &L2, &L3 ); + p448_sqr ( &L5, &ext->u ); + p448_sqr ( &L2, &ext->t ); + p448_mul ( &L3, &L2, &L5 ); + p448_mulw ( &L5, &L3, 39081 ); + p448_neg ( &L2, &L5 ); + p448_add ( &L3, &L2, &L4 ); + p448_sqr ( &L2, &ext->x ); + p448_neg ( &L4, &L2 ); + p448_add ( &L2, &L4, &L3 ); + p448_bias ( &L2, 4 ); + L1 = p448_is_zero( &L2 ); + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L3, &ext->t, &ext->u ); + p448_mul ( &L4, &ext->z, &L3 ); + p448_addw ( &L4, 0 ); + p448_mul ( &L2, &ext->x, &ext->y ); + p448_neg ( &L3, &L2 ); + p448_add ( &L2, &L3, &L4 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + + diff --git a/src/arch_32/p448.c b/src/arch_32/p448.c new file mode 100644 index 0000000..d3b2956 --- /dev/null +++ b/src/arch_32/p448.c @@ -0,0 +1,300 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ + +#include "word.h" +#include "p448.h" +//#include "x86-64-arith.h" + +static inline mask_t __attribute__((always_inline)) +is_zero ( + word_t x +) { + dword_t xx = x; + xx--; + return xx >> WORD_BITS; +} + +static uint64_t widemul_32 ( + const uint32_t a, + const uint32_t b +) { + return ((uint64_t)a)* b; +} + +void +p448_mul ( + p448_t *__restrict__ cs, + const p448_t *as, + const p448_t *bs +) { + // p448_t ar, br; +// p448_copy(&ar,as); +// p448_copy(&br,bs); +// p448_weak_reduce(&ar); +// p448_weak_reduce(&br); + + const uint32_t *a = as->limb, *b = bs->limb; + uint32_t *c = cs->limb; + + uint64_t accum0 = 0, accum1 = 0, accum2 = 0; + uint32_t mask = (1<<28) - 1; + + uint32_t aa[8], bb[8]; + + /* For some reason clang doesn't vectorize this without prompting? */ + // unsigned int i; + // for (i=0; i>= 28; + accum1 >>= 28; + } + + accum0 += accum1; + accum0 += c[8]; + accum1 += c[0]; + c[8] = ((uint32_t)(accum0)) & mask; + c[0] = ((uint32_t)(accum1)) & mask; + + accum0 >>= 28; + accum1 >>= 28; + c[9] += ((uint32_t)(accum0)); + c[1] += ((uint32_t)(accum1)); +} + +void +p448_mulw ( + p448_t *__restrict__ cs, + const p448_t *as, + uint64_t b +) { + const uint32_t bhi = b>>28, blo = b & (1<<28)-1; + + const uint32_t *a = as->limb; + uint32_t *c = cs->limb; + + uint64_t accum0, accum8; + uint32_t mask = (1ull<<28)-1; + + int i; + + accum0 = widemul_32(blo, a[0]); + accum8 = widemul_32(blo, a[8]); + accum0 += widemul_32(bhi, a[15]); + accum8 += widemul_32(bhi, a[15] + a[7]); + + c[0] = accum0 & mask; accum0 >>= 28; + c[8] = accum8 & mask; accum8 >>= 28; + + for (i=1; i<8; i++) { + accum0 += widemul_32(blo, a[i]); + accum8 += widemul_32(blo, a[i+8]); + + accum0 += widemul_32(bhi, a[i-1]); + accum8 += widemul_32(bhi, a[i+7]); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + } + + accum0 += accum8 + c[8]; + c[8] = accum0 & mask; + c[9] += accum0 >> 28; + + accum8 += c[0]; + c[0] = accum8 & mask; + c[1] += accum8 >> 28; +} + +void 
+p448_sqr ( + p448_t *__restrict__ cs, + const p448_t *as +) { + p448_mul(cs,as,as); // PERF +} + +void +p448_strong_reduce ( + p448_t *a +) { + word_t mask = (1ull<<28)-1; + + /* first, clear high */ + a->limb[8] += a->limb[15]>>28; + a->limb[0] += a->limb[15]>>28; + a->limb[15] &= mask; + + /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */ + + /* compute total_value - p. No need to reduce mod p. */ + + dsword_t scarry = 0; + int i; + for (i=0; i<16; i++) { + scarry = scarry + a->limb[i] - ((i==8)?mask-1:mask); + a->limb[i] = scarry & mask; + scarry >>= 28; + } + + /* uncommon case: it was >= p, so now scarry = 0 and this = x + * common case: it was < p, so now scarry = -1 and this = x - p + 2^448 + * so let's add back in p. will carry back off the top for 2^448. + */ + + assert(is_zero(scarry) | is_zero(scarry+1)); + + word_t scarry_mask = scarry & mask; + dword_t carry = 0; + + /* add it back */ + for (i=0; i<16; i++) { + carry = carry + a->limb[i] + ((i==8)?(scarry_mask&~1):scarry_mask); + a->limb[i] = carry & mask; + carry >>= 28; + } + + assert(is_zero(carry + scarry)); +} + +mask_t +p448_is_zero ( + const struct p448_t *a +) { + struct p448_t b; + p448_copy(&b,a); + p448_strong_reduce(&b); + + uint32_t any = 0; + int i; + for (i=0; i<16; i++) { + any |= b.limb[i]; + } + return is_zero(any); +} + +void +p448_serialize ( + uint8_t *serial, + const struct p448_t *x +) { + int i,j; + p448_t red; + p448_copy(&red, x); + p448_strong_reduce(&red); + for (i=0; i<8; i++) { + uint64_t limb = red.limb[2*i] + (((uint64_t)red.limb[2*i+1])<<28); + for (j=0; j<7; j++) { + serial[7*i+j] = limb; + limb >>= 8; + } + assert(limb == 0); + } +} + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +) { + int i,j; + for (i=0; i<8; i++) { + uint64_t out = 0; + for (j=0; j<7; j++) { + out |= ((uint64_t)serial[7*i+j])<<(8*j); + } + x->limb[2*i] = out & (1ull<<28)-1; + x->limb[2*i+1] = out >> 28; + } + + /* Check for reduction. 
+ * + * The idea is to create a variable ge which is all ones (rather, 56 ones) + * if and only if the low $i$ words of $x$ are >= those of p. + * + * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) + */ + uint32_t ge = -1, mask = (1ull<<28)-1; + for (i=0; i<8; i++) { + ge &= x->limb[i]; + } + + /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ + ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask); + + /* Propagate the rest */ + for (i=9; i<16; i++) { + ge &= x->limb[i]; + } + + return ~is_zero(ge ^ mask); +} + +void +simultaneous_invert_p448( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +) { + if (n==0) { + return; + } else if (n==1) { + p448_inverse(out,in); + return; + } + + p448_copy(&out[1], &in[0]); + int i; + for (i=1; i<(int) (n-1); i++) { + p448_mul(&out[i+1], &out[i], &in[i]); + } + p448_mul(&out[0], &out[n-1], &in[n-1]); + + struct p448_t tmp; + p448_inverse(&tmp, &out[0]); + p448_copy(&out[0], &tmp); + + /* at this point, out[0] = product(in[i]) ^ -1 + * out[i] = product(in[0]..in[i-1]) if i != 0 + */ + for (i=n-1; i>0; i--) { + p448_mul(&tmp, &out[i], &out[0]); + p448_copy(&out[i], &tmp); + + p448_mul(&tmp, &out[0], &in[i]); + p448_copy(&out[0], &tmp); + } +} diff --git a/src/arch_32/p448.h b/src/arch_32/p448.h new file mode 100644 index 0000000..4628a89 --- /dev/null +++ b/src/arch_32/p448.h @@ -0,0 +1,378 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ +#ifndef __P448_H__ +#define __P448_H__ 1 + +#include "word.h" + +#include +#include + +typedef struct p448_t { + uint32_t limb[16]; +} __attribute__((aligned(32))) p448_t; + +#ifdef __cplusplus +extern "C" { +#endif + +static __inline__ void +p448_set_ui ( + p448_t *out, + uint64_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t do_swap +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_add ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_neg ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_neg ( + p448_t *a, + mask_t doNegate +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_addw ( + p448_t *a, + uint32_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_subw ( + p448_t *a, + uint32_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_copy ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_weak_reduce ( + p448_t *inout +) __attribute__((unused,always_inline)); + +void +p448_strong_reduce ( + p448_t *inout +); + +mask_t +p448_is_zero ( + const p448_t *in +); + +static __inline__ void +p448_bias ( + p448_t *inout, + int amount +) __attribute__((unused,always_inline)); + +void +p448_mul ( + p448_t *__restrict__ out, + const p448_t *a, + const p448_t *b +); + +void +p448_mulw ( + p448_t *__restrict__ out, + const p448_t *a, + uint64_t b +); + +void +p448_sqr ( + p448_t *__restrict__ out, + const p448_t *a +); + +static __inline__ void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) __attribute__((unused,always_inline)); + +void +p448_serialize 
( + uint8_t *serial, + const struct p448_t *x +); + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +); + +static __inline__ void +p448_mask( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) __attribute__((unused,always_inline)); + +/** +* Returns 1/x. +* +* If x=0, returns 0. +*/ +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +); + +void +simultaneous_invert_p448 ( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +); + +static inline mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) __attribute__((always_inline,unused)); + +/* -------------- Inline functions begin here -------------- */ + +void +p448_set_ui ( + p448_t *out, + uint64_t x +) { + int i; + out->limb[0] = x & (1<<28)-1; + out->limb[1] = x>>28; + for (i=2; i<16; i++) { + out->limb[i] = 0; + } +} + +void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t doswap +) { + big_register_t *aa = (big_register_t*)a; + big_register_t *bb = (big_register_t*)b; + big_register_t m = doswap; + + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] + b->limb[i]; + } + */ +} + +void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] - b->limb[i]; + } + */ +} + +void +p448_neg ( + p448_t *out, + const p448_t *a +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = -a->limb[i]; + } + */ +} + +void +p448_cond_neg( + p448_t *a, + mask_t doNegate +) { + unsigned int i; + struct p448_t negated; + big_register_t *aa = (big_register_t *)a; + big_register_t *nn = (big_register_t*)&negated; + big_register_t m = doNegate; + + p448_neg(&negated, a); + p448_bias(&negated, 2); + + for (i=0; ilimb[0] += x; +} + +void +p448_subw ( + p448_t *a, + uint32_t x +) { + a->limb[0] -= x; +} + +void +p448_copy ( + p448_t *out, + const p448_t *a +) { + *out = *a; +} + +void +p448_bias ( + p448_t *a, + int amt 
+) { + uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt; + uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1}; + uint32x4_t *aa = (uint32x4_t*) a; + aa[0] += lo; + aa[1] += lo; + aa[2] += hi; + aa[3] += lo; +} + +void +p448_weak_reduce ( + p448_t *a +) { + uint64_t mask = (1ull<<28) - 1; + uint64_t tmp = a->limb[15] >> 28; + int i; + a->limb[8] += tmp; + for (i=15; i>0; i--) { + a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); + } + a->limb[0] = (a->limb[0] & mask) + tmp; +} + +void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) { + p448_t tmp; + assert(n>0); + if (n&1) { + p448_sqr(y,x); + n--; + } else { + p448_sqr(&tmp,x); + p448_sqr(y,&tmp); + n-=2; + } + for (; n; n-=2) { + p448_sqr(&tmp,y); + p448_sqr(y,&tmp); + } +} + +mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) { + struct p448_t ra, rb; + p448_copy(&ra, a); + p448_copy(&rb, b); + p448_weak_reduce(&ra); + p448_weak_reduce(&rb); + p448_sub(&ra, &ra, &rb); + p448_bias(&ra, 2); + return p448_is_zero(&ra); +} + +void +p448_mask ( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + a->limb[i] = b->limb[i] & mask; + } +} + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __P448_H__ */ diff --git a/src/arch_arm_32/ec_point.c b/src/arch_arm_32/ec_point.c new file mode 100644 index 0000000..823e43d --- /dev/null +++ b/src/arch_arm_32/ec_point.c @@ -0,0 +1,959 @@ +/** + * @cond internal + * @file ec_point.c + * @copyright + * Copyright (c) 2014 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. + * @author Mike Hamburg + * @warning This file was automatically generated. 
+ */ + +#include "ec_point.h" + + +void +p448_isr ( + struct p448_t* a, + const struct p448_t* x +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L1, x ); + p448_mul ( &L2, x, &L1 ); + p448_sqr ( &L1, &L2 ); + p448_mul ( &L2, x, &L1 ); + p448_sqrn ( &L1, &L2, 3 ); + p448_mul ( &L0, &L2, &L1 ); + p448_sqrn ( &L1, &L0, 3 ); + p448_mul ( &L0, &L2, &L1 ); + p448_sqrn ( &L2, &L0, 9 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sqr ( &L0, &L1 ); + p448_mul ( &L2, x, &L0 ); + p448_sqrn ( &L0, &L2, 18 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqrn ( &L0, &L2, 37 ); + p448_mul ( &L1, &L2, &L0 ); + p448_sqrn ( &L0, &L1, 37 ); + p448_mul ( &L1, &L2, &L0 ); + p448_sqrn ( &L0, &L1, 111 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqr ( &L0, &L2 ); + p448_mul ( &L1, x, &L0 ); + p448_sqrn ( &L0, &L1, 223 ); + p448_mul ( a, &L2, &L0 ); +} + +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +) { + struct p448_t L0, L1; + p448_isr ( &L0, x ); + p448_sqr ( &L1, &L0 ); + p448_sqr ( &L0, &L1 ); + p448_mul ( a, x, &L0 ); +} + +void +add_tw_niels_to_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &e->a, &L1 ); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->b, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_weak_reduce( &d->t ); + p448_sub ( &d->y, &d->z, &d->x ); + p448_bias ( &d->y, 2 ); + p448_weak_reduce( &d->y ); + p448_add ( &L0, &d->x, &d->z ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +sub_tw_niels_from_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &e->b, &L1 
); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->a, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_weak_reduce( &d->t ); + p448_add ( &d->y, &d->x, &d->z ); + p448_sub ( &L0, &d->z, &d->x ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +add_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + add_tw_niels_to_tw_extensible( e, &a->n ); +} + +void +sub_tw_pniels_from_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + sub_tw_niels_from_tw_extensible( e, &a->n ); +} + +void +double_tw_extensible ( + struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &a->u, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &L1, &a->t ); + p448_sub ( &a->t, &L1, &a->u ); + p448_bias ( &a->t, 3 ); + p448_weak_reduce( &a->t ); + p448_sub ( &L1, &L0, &L2 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 1 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->z, &L1, &L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +double_extensible ( + struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &L1, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &a->u, &a->t ); + p448_sub ( &a->t, &a->u, &L1 ); + p448_bias ( &a->t, 3 ); + p448_weak_reduce( &a->t ); + p448_sub ( &a->u, &L0, &L2 ); + p448_bias 
( &a->u, 2 ); + p448_weak_reduce( &a->u ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 2 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->z, &L1, &L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +twist_and_double ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &b->u, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &L0, &b->t ); + p448_sub ( &b->t, &L0, &b->u ); + p448_bias ( &b->t, 3 ); + p448_weak_reduce( &b->t ); + p448_sub ( &L0, &b->z, &b->x ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 2 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +untwist_and_double ( + struct extensible_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &L0, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &b->u, &b->t ); + p448_sub ( &b->t, &b->u, &L0 ); + p448_bias ( &b->t, 3 ); + p448_weak_reduce( &b->t ); + p448_sub ( &b->u, &b->z, &b->x ); + p448_bias ( &b->u, 2 ); + p448_weak_reduce( &b->u ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 1 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +convert_tw_affine_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_affine_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( 
&b->n.c, &a->y, &a->x ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_set_ui( &b->z, 2 ); +} + +void +convert_tw_affine_to_tw_extensible ( + struct tw_extensible_t* b, + const struct tw_affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_affine_to_extensible ( + struct extensible_t* b, + const struct affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_tw_extensible_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_extensible_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( &b->n.c, &a->u, &a->t ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_add ( &b->z, &a->z, &a->z ); + p448_weak_reduce( &b->z ); +} + +void +convert_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* d +) { + p448_add ( &e->u, &d->n.b, &d->n.a ); + p448_sub ( &e->t, &d->n.b, &d->n.a ); + p448_bias ( &e->t, 2 ); + p448_weak_reduce( &e->t ); + p448_mul ( &e->x, &d->z, &e->t ); + p448_mul ( &e->y, &d->z, &e->u ); + p448_sqr ( &e->z, &d->z ); +} + +void +convert_tw_niels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_niels_t* d +) { + p448_add ( &e->y, &d->b, &d->a ); + p448_weak_reduce( &e->y ); + p448_sub ( &e->x, &d->b, &d->a ); + p448_bias ( &e->x, 2 ); + p448_weak_reduce( &e->x ); + p448_set_ui( &e->z, 1 ); + p448_copy ( &e->t, &e->x ); + p448_copy ( &e->u, &e->y ); +} + +void +montgomery_step ( + struct montgomery_t* a +) { + struct p448_t L0, L1; + p448_add ( 
&L0, &a->zd, &a->xd ); + p448_sub ( &L1, &a->xd, &a->zd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_sub ( &a->zd, &a->xa, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_weak_reduce( &a->zd ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_add ( &a->zd, &a->za, &a->xa ); + p448_mul ( &a->za, &L1, &a->zd ); + p448_add ( &a->xa, &a->za, &a->xd ); + p448_sqr ( &a->zd, &a->xa ); + p448_mul ( &a->xa, &a->z0, &a->zd ); + p448_sub ( &a->zd, &a->xd, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_weak_reduce( &a->zd ); + p448_sqr ( &a->za, &a->zd ); + p448_sqr ( &a->xd, &L0 ); + p448_sqr ( &L0, &L1 ); + p448_mulw ( &a->zd, &a->xd, 39082 ); + p448_sub ( &L1, &a->xd, &L0 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_sub ( &L0, &a->zd, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->zd, &L0, &L1 ); +} + +void +deserialize_montgomery ( + struct montgomery_t* a, + const struct p448_t* sbz +) { + p448_sqr ( &a->z0, sbz ); + p448_set_ui( &a->xd, 1 ); + p448_set_ui( &a->zd, 0 ); + p448_set_ui( &a->xa, 1 ); + p448_copy ( &a->za, &a->z0 ); +} + +mask_t +serialize_montgomery ( + struct p448_t* b, + const struct montgomery_t* a, + const struct p448_t* sbz +) { + mask_t L0, L1, L2; + struct p448_t L3, L4, L5, L6; + p448_mul ( &L6, &a->z0, &a->zd ); + p448_sub ( &L4, &L6, &a->xd ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L6, &a->za, &L4 ); + p448_mul ( &L5, &a->z0, &a->xd ); + p448_sub ( &L4, &L5, &a->zd ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L3, &a->xa, &L4 ); + p448_add ( &L5, &L3, &L6 ); + p448_sub ( &L4, &L6, &L3 ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L6, &L4, &L5 ); + p448_copy ( &L5, &a->z0 ); + p448_addw ( &L5, 1 ); + p448_sqr ( &L4, &L5 ); + p448_mulw ( &L5, &L4, 39082 ); + p448_neg ( &L4, &L5 ); + p448_add ( &L5, &a->z0, &a->z0 ); + p448_bias ( &L5, 1 ); + p448_add ( &L3, &L5, &L5 ); + p448_add ( &L5, &L3, &L4 ); + 
p448_weak_reduce( &L5 ); + p448_mul ( &L3, &a->xd, &L5 ); + L1 = p448_is_zero( &a->zd ); + L2 = - L1; + p448_mask ( &L4, &L3, L1 ); + p448_add ( &L5, &L4, &a->zd ); + L0 = ~ L1; + p448_mul ( &L4, sbz, &L6 ); + p448_addw ( &L4, L2 ); + p448_mul ( &L6, &L5, &L4 ); + p448_mul ( &L4, &L6, &L5 ); + p448_mul ( &L5, &L6, &a->xd ); + p448_mul ( &L6, &L4, &L5 ); + p448_isr ( &L3, &L6 ); + p448_mul ( &L5, &L4, &L3 ); + p448_sqr ( &L4, &L3 ); + p448_mul ( &L3, &L6, &L4 ); + p448_mask ( b, &L5, L0 ); + p448_subw ( &L3, 1 ); + p448_bias ( &L3, 1 ); + L1 = p448_is_zero( &L3 ); + L0 = p448_is_zero( sbz ); + return L1 | L0; +} + +void +serialize_extensible ( + struct p448_t* b, + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sub ( &L0, &a->y, &a->z ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_add ( b, &a->z, &a->y ); + p448_mul ( &L1, &a->z, &a->x ); + p448_mul ( &L2, &L0, &L1 ); + p448_mul ( &L1, &L2, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( &L2, &L1, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( b, &L1, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, &L2, &L1 ); +} + +void +untwist_and_double_and_serialize ( + struct p448_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_mul ( &L3, &a->y, &a->x ); + p448_add ( b, &a->y, &a->x ); + p448_sqr ( &L1, b ); + p448_add ( &L2, &L3, &L3 ); + p448_sub ( b, &L1, &L2 ); + p448_bias ( b, 3 ); + p448_weak_reduce( b ); + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &L2 ); + p448_add ( &L2, b, b ); + p448_mulw ( b, &L2, 39082 ); + p448_neg ( &L2, b ); + p448_bias ( &L2, 2 ); + p448_mulw ( &L0, &L2, 39082 ); + p448_neg ( b, &L0 ); + p448_bias ( b, 2 ); + p448_mul ( &L0, &L2, &L1 ); + p448_mul ( &L2, b, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( &L1, b, &L0 ); + p448_sqr ( b, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( b, &L1, &L3 ); +} + +void +twist_even ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L0, L1; + p448_sqr ( &b->y, 
&a->z ); + p448_sqr ( &b->z, &a->x ); + p448_sub ( &b->u, &b->y, &b->z ); + p448_bias ( &b->u, 2 ); + p448_weak_reduce( &b->u ); + p448_sub ( &b->z, &a->z, &a->x ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->y, &b->z, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->x, &b->z, &b->y ); + p448_mul ( &b->t, &b->x, &b->u ); + p448_mul ( &b->y, &b->x, &b->t ); + p448_isr ( &b->t, &b->y ); + p448_mul ( &b->u, &b->x, &b->t ); + p448_sqr ( &b->x, &b->t ); + p448_mul ( &b->t, &b->y, &b->x ); + p448_mul ( &b->x, &a->x, &b->u ); + p448_mul ( &b->y, &a->y, &b->u ); + L1 = p448_is_zero( &b->z ); + L0 = - L1; + p448_addw ( &b->y, L0 ); + p448_weak_reduce( &b->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +void +test_only_twist ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L0, L1; + struct p448_t L2, L3; + p448_sqr ( &b->u, &a->z ); + p448_sqr ( &b->y, &a->x ); + p448_sub ( &b->z, &b->u, &b->y ); + p448_bias ( &b->z, 2 ); + p448_add ( &b->y, &b->z, &b->z ); + p448_add ( &b->u, &b->y, &b->y ); + p448_weak_reduce( &b->u ); + p448_sub ( &b->y, &a->z, &a->x ); + p448_bias ( &b->y, 2 ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->x, &b->y, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->t, &b->z, &b->x ); + p448_mul ( &L3, &b->t, &b->u ); + p448_mul ( &b->x, &b->t, &L3 ); + p448_isr ( &L2, &b->x ); + p448_mul ( &b->u, &b->t, &L2 ); + p448_sqr ( &L3, &L2 ); + p448_mul ( &b->t, &b->x, &L3 ); + p448_add ( &b->x, &a->y, &a->x ); + p448_weak_reduce( &b->x ); + p448_sub ( &L2, &a->x, &a->y ); + p448_bias ( &L2, 2 ); + p448_weak_reduce( &L2 ); + p448_mul ( &L3, &b->t, &L2 ); + p448_add ( &L2, &L3, &b->x ); + p448_sub ( &b->t, &b->x, &L3 ); + p448_bias ( &b->t, 2 ); + p448_weak_reduce( &b->t ); + p448_mul ( &b->x, &L2, &b->u ); + L0 = p448_is_zero( 
&b->y ); + L1 = - L0; + p448_addw ( &b->x, L1 ); + p448_weak_reduce( &b->x ); + p448_mul ( &b->y, &b->t, &b->u ); + L0 = p448_is_zero( &b->z ); + L1 = - L0; + p448_addw ( &b->y, L1 ); + p448_weak_reduce( &b->y ); + L1 = p448_is_zero( &a->y ); + L0 = L1 + 1; + p448_set_ui( &b->z, L0 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +mask_t +is_square ( + const struct p448_t* x +) { + mask_t L0, L1; + struct p448_t L2, L3; + p448_isr ( &L2, x ); + p448_sqr ( &L3, &L2 ); + p448_mul ( &L2, x, &L3 ); + p448_subw ( &L2, 1 ); + p448_bias ( &L2, 1 ); + L1 = p448_is_zero( &L2 ); + L0 = p448_is_zero( x ); + return L1 | L0; +} + +mask_t +is_even_pt ( + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +is_even_tw ( + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_add ( &L0, &L1, &L2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +deserialize_affine ( + struct affine_t* a, + const struct p448_t* sz +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L1, sz ); + p448_copy ( &L3, &L1 ); + p448_addw ( &L3, 1 ); + p448_sqr ( &a->x, &L3 ); + p448_mulw ( &L3, &a->x, 39082 ); + p448_neg ( &a->x, &L3 ); + p448_add ( &L3, &L1, &L1 ); + p448_bias ( &L3, 1 ); + p448_add ( &a->y, &L3, &L3 ); + p448_add ( &L3, &a->y, &a->x ); + p448_weak_reduce( &L3 ); + p448_copy ( &a->y, &L1 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &a->x, &a->y ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_mul ( &a->y, &a->x, &L3 ); + p448_sqr ( &L2, &a->x ); + p448_mul ( &L0, &L2, &a->y ); + p448_mul ( &a->y, &a->x, &L0 ); + p448_isr ( &L3, &a->y ); + p448_mul ( &a->y, &L2, &L3 ); + p448_sqr ( &L2, &L3 ); + p448_mul ( &L3, &L0, &L2 ); + p448_mul ( &L0, &a->x, &L3 ); + p448_add ( &L2, &a->y, &a->y 
); + p448_mul ( &a->x, sz, &L2 ); + p448_addw ( &L1, 1 ); + p448_mul ( &a->y, &L1, &L3 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + return p448_is_zero( &L0 ); +} + +mask_t +deserialize_and_twist_approx ( + struct tw_extensible_t* a, + const struct p448_t* sdm1, + const struct p448_t* sz +) { + struct p448_t L0, L1; + p448_sqr ( &a->z, sz ); + p448_copy ( &a->y, &a->z ); + p448_addw ( &a->y, 1 ); + p448_sqr ( &a->x, &a->y ); + p448_mulw ( &a->y, &a->x, 39082 ); + p448_neg ( &a->x, &a->y ); + p448_add ( &a->y, &a->z, &a->z ); + p448_bias ( &a->y, 1 ); + p448_add ( &a->u, &a->y, &a->y ); + p448_add ( &a->y, &a->u, &a->x ); + p448_weak_reduce( &a->y ); + p448_sqr ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &a->u, &a->x ); + p448_bias ( &a->u, 2 ); + p448_weak_reduce( &a->u ); + p448_mul ( &a->x, sdm1, &a->u ); + p448_mul ( &L0, &a->x, &a->y ); + p448_mul ( &a->t, &L0, &a->y ); + p448_mul ( &a->u, &a->x, &a->t ); + p448_mul ( &a->t, &a->u, &L0 ); + p448_mul ( &a->y, &a->x, &a->t ); + p448_isr ( &L0, &a->y ); + p448_mul ( &a->y, &a->u, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &a->u, &a->t, &L1 ); + p448_mul ( &a->t, &a->x, &a->u ); + p448_add ( &a->x, sz, sz ); + p448_mul ( &L0, &a->u, &a->x ); + p448_copy ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &L1, &a->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &a->x, &L1, &L0 ); + p448_mul ( &L0, &a->u, &a->y ); + p448_addw ( &a->z, 1 ); + p448_mul ( &a->y, &a->z, &L0 ); + p448_subw ( &a->t, 1 ); + p448_bias ( &a->t, 1 ); + mask_t ret = p448_is_zero( &a->t ); + p448_set_ui( &a->z, 1 ); + p448_copy ( &a->t, &a->x ); + p448_copy ( &a->u, &a->y ); + return ret; +} + +void +set_identity_extensible ( + struct extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_tw_extensible ( + struct tw_extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + 
p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_affine ( + struct affine_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); +} + +mask_t +eq_affine ( + const struct affine_t* a, + const struct affine_t* b +) { + mask_t L0, L1; + struct p448_t L2; + p448_sub ( &L2, &a->x, &b->x ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_sub ( &L2, &a->y, &b->y ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +eq_extensible ( + const struct extensible_t* a, + const struct extensible_t* b +) { + mask_t L0, L1; + struct p448_t L2, L3, L4; + p448_mul ( &L4, &b->z, &a->x ); + p448_mul ( &L3, &a->z, &b->x ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_mul ( &L4, &b->z, &a->y ); + p448_mul ( &L3, &a->z, &b->y ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +eq_tw_extensible ( + const struct tw_extensible_t* a, + const struct tw_extensible_t* b +) { + mask_t L0, L1; + struct p448_t L2, L3, L4; + p448_mul ( &L4, &b->z, &a->x ); + p448_mul ( &L3, &a->z, &b->x ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_mul ( &L4, &b->z, &a->y ); + p448_mul ( &L3, &a->z, &b->y ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +void +elligator_2s_inject ( + struct affine_t* a, + const struct p448_t* r +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5, L6, L7, L8, L9; + p448_sqr ( &a->x, r ); + p448_sqr ( &L3, &a->x ); + p448_copy ( &a->y, &L3 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &L9, &a->y ); + p448_bias ( &L9, 2 ); + p448_weak_reduce( &L9 ); + p448_sqr ( &L2, &L9 ); + p448_mulw ( &L8, &L2, 1527402724 ); + p448_mulw ( &L7, &L3, 6108985600 ); + p448_add ( &a->y, &L7, &L8 ); + p448_weak_reduce( &a->y ); + p448_mulw ( &L8, &L2, 6109454568 ); 
+ p448_sub ( &L7, &a->y, &L8 ); + p448_bias ( &L7, 2 ); + p448_weak_reduce( &L7 ); + p448_mulw ( &L4, &a->y, 78160 ); + p448_mul ( &L6, &L7, &L9 ); + p448_mul ( &L8, &L6, &L4 ); + p448_mul ( &L4, &L7, &L8 ); + p448_isr ( &L5, &L4 ); + p448_mul ( &L4, &L6, &L5 ); + p448_sqr ( &L6, &L5 ); + p448_mul ( &L5, &L8, &L6 ); + p448_mul ( &L8, &L7, &L5 ); + p448_mul ( &L7, &L8, &L5 ); + p448_copy ( &L5, &a->x ); + p448_subw ( &L5, 1 ); + p448_addw ( &a->x, 1 ); + p448_mul ( &L6, &a->x, &L8 ); + p448_sub ( &a->x, &L5, &L6 ); + p448_bias ( &a->x, 3 ); + p448_weak_reduce( &a->x ); + p448_mul ( &L5, &L4, &a->x ); + p448_mulw ( &L4, &L5, 78160 ); + p448_neg ( &a->x, &L4 ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_add ( &L4, &L3, &L3 ); + p448_add ( &L3, &L4, &L2 ); + p448_subw ( &L3, 2 ); + p448_bias ( &L3, 1 ); + p448_weak_reduce( &L3 ); + p448_mul ( &L2, &L3, &L8 ); + p448_mulw ( &L3, &L2, 3054649120 ); + p448_add ( &L2, &L3, &a->y ); + p448_mul ( &a->y, &L7, &L2 ); + L1 = p448_is_zero( &L9 ); + L0 = - L1; + p448_addw ( &a->y, L0 ); + p448_weak_reduce( &a->y ); +} + +mask_t +validate_affine ( + const struct affine_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L0, &a->y ); + p448_sqr ( &L2, &a->x ); + p448_add ( &L3, &L2, &L0 ); + p448_subw ( &L3, 1 ); + p448_mulw ( &L1, &L2, 39081 ); + p448_neg ( &L2, &L1 ); + p448_bias ( &L2, 2 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sub ( &L0, &L3, &L1 ); + p448_bias ( &L0, 3 ); + return p448_is_zero( &L0 ); +} + +mask_t +validate_tw_extensible ( + const struct tw_extensible_t* ext +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5; + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L2, &ext->t, &ext->u ); + p448_mul ( &L4, &ext->z, &L2 ); + p448_addw ( &L4, 0 ); + p448_mul ( &L3, &ext->x, &ext->y ); + p448_neg ( &L2, &L3 ); + p448_add ( &L3, &L2, &L4 ); + p448_bias ( &L3, 2 ); + L1 = p448_is_zero( &L3 ); + /* + * Check invariant: + * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 + */ + p448_sqr ( 
&L4, &ext->y ); + p448_neg ( &L2, &L4 ); + p448_addw ( &L2, 0 ); + p448_sqr ( &L3, &ext->x ); + p448_add ( &L4, &L3, &L2 ); + p448_sqr ( &L5, &ext->u ); + p448_sqr ( &L3, &ext->t ); + p448_mul ( &L2, &L3, &L5 ); + p448_mulw ( &L3, &L2, 39081 ); + p448_neg ( &L5, &L3 ); + p448_add ( &L3, &L5, &L4 ); + p448_neg ( &L5, &L2 ); + p448_add ( &L4, &L5, &L3 ); + p448_sqr ( &L3, &ext->z ); + p448_add ( &L2, &L3, &L4 ); + p448_bias ( &L2, 4 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +validate_extensible ( + const struct extensible_t* ext +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5; + /* + * Check invariant: + * 0 = d*t^2*u^2 - x^2 - y^2 + z^2 + */ + p448_sqr ( &L4, &ext->y ); + p448_neg ( &L3, &L4 ); + p448_addw ( &L3, 0 ); + p448_sqr ( &L2, &ext->z ); + p448_add ( &L4, &L2, &L3 ); + p448_sqr ( &L5, &ext->u ); + p448_sqr ( &L2, &ext->t ); + p448_mul ( &L3, &L2, &L5 ); + p448_mulw ( &L5, &L3, 39081 ); + p448_neg ( &L2, &L5 ); + p448_add ( &L3, &L2, &L4 ); + p448_sqr ( &L2, &ext->x ); + p448_neg ( &L4, &L2 ); + p448_add ( &L2, &L4, &L3 ); + p448_bias ( &L2, 4 ); + L1 = p448_is_zero( &L2 ); + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L3, &ext->t, &ext->u ); + p448_mul ( &L4, &ext->z, &L3 ); + p448_addw ( &L4, 0 ); + p448_mul ( &L2, &ext->x, &ext->y ); + p448_neg ( &L3, &L2 ); + p448_add ( &L2, &L3, &L4 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + + diff --git a/src/arch_arm_32/p448.c b/src/arch_arm_32/p448.c new file mode 100644 index 0000000..c764955 --- /dev/null +++ b/src/arch_arm_32/p448.c @@ -0,0 +1,1021 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ + +#include "word.h" +#include "p448.h" +//#include "x86-64-arith.h" + +static inline mask_t __attribute__((always_inline)) +is_zero ( + word_t x +) { + dword_t xx = x; + xx--; + return xx >> WORD_BITS; +} + +static uint64_t widemul_32 ( + const uint32_t a, + const uint32_t b +) { + return ((uint64_t)a)* b; +} + +static inline void __attribute__((gnu_inline,always_inline)) +smlal ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + uint32_t lo = *acc, hi = (*acc)>>32; + + __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" + : [lo]"+&r"(lo), [hi]"+&r"(hi) + : [a]"r"(a), [b]"r"(b)); + + *acc = lo + (((uint64_t)hi)<<32); +} + +static inline void __attribute__((gnu_inline,always_inline)) +smlal2 ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + uint32_t lo = *acc, hi = (*acc)>>32; + + __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" + : [lo]"+&r"(lo), [hi]"+&r"(hi) + : [a]"r"(a), [b]"r"(2*b)); + + *acc = lo + (((uint64_t)hi)<<32); +} + +static inline void __attribute__((gnu_inline,always_inline)) +smull ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + uint32_t lo, hi; + + __asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]" + : [lo]"=&r"(lo), [hi]"=&r"(hi) + : [a]"r"(a), [b]"r"(b)); + + *acc = lo + (((uint64_t)hi)<<32); +} + +static inline void __attribute__((gnu_inline,always_inline)) +smull2 ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + uint32_t lo, hi; + + __asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]" + : [lo]"=&r"(lo), [hi]"=&r"(hi) + : [a]"r"(a), [b]"r"(2*b)); + + *acc = lo + (((uint64_t)hi)<<32); +} + +void +p448_mul ( + p448_t *__restrict__ cs, + const p448_t *as, + const p448_t *bs +) { + // p448_t ar, br; +// p448_copy(&ar,as); +// p448_copy(&br,bs); +// p448_weak_reduce(&ar); +// p448_weak_reduce(&br); + + const uint32_t *a = as->limb, *b = bs->limb; + uint32_t *c = cs->limb; + + uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1; + uint32_t mask = 
(1<<28) - 1; + + uint32_t aa[8], bm[8]; + + /* For some reason clang doesn't vectorize this without prompting? */ + // unsigned int i; + // for (i=0; i> 28; + accum3 += accum1 >> 28; + + c[0] = ((uint32_t)(accum0)) & mask; + c[1] = ((uint32_t)(accum2)) & mask; + c[8] = ((uint32_t)(accum1)) & mask; + c[9] = ((uint32_t)(accum3)) & mask; + + accumC0 = accum2 >> 28; + accumC1 = accum3 >> 28; + } + { + /* t^3 terms */ + smull(&accum1, ax = aa[3], bx = b[15]); + smull(&accum3, ax = aa[4], bx); + smlal(&accum1, ax, bx = b[14]); + smlal(&accum3, ax = aa[5], bx); + smlal(&accum1, ax, bx = b[13]); + smlal(&accum3, ax = aa[6], bx); + smlal(&accum1, ax, bx = b[12]); + smlal(&accum3, ax = aa[7], bx); + smlal(&accum1, ax, bx = b[11]); + + accum0 = accum1; + accum2 = accum3; + + /* t^2 terms */ + smlal(&accum2, ax = aa[0], bx); + smlal(&accum0, ax, bx = b[10]); + smlal(&accum2, ax = aa[1], bx); + smlal(&accum0, ax, bx = b[9]); + smlal(&accum2, ax = aa[2], bx); + smlal(&accum0, ax, bx = b[8]); + smlal(&accum2, ax = aa[3], bx); + + smlal(&accum0, ax = a[11], bx = b[7]); + smlal(&accum2, ax = a[12], bx); + smlal(&accum0, ax, bx = b[6]); + smlal(&accum2, ax = a[13], bx); + smlal(&accum0, ax, bx = b[5]); + smlal(&accum2, ax = a[14], bx); + smlal(&accum0, ax, bx = b[4]); + smlal(&accum2, ax = a[15], bx); + smlal(&accum0, ax, bx = b[3]); + + /* t terms */ + accum1 += accum0; + accum3 += accum2; + smlal(&accum3, ax = a[8], bx); + smlal(&accum1, ax, bx = b[2]); + smlal(&accum3, ax = a[9], bx); + smlal(&accum1, ax, bx = b[1]); + smlal(&accum3, ax = a[10], bx); + smlal(&accum1, ax, bx = b[0]); + smlal(&accum3, ax = a[11], bx); + + smlal(&accum1, ax = a[3], bx = bm[7]); + smlal(&accum3, ax = a[4], bx); + smlal(&accum1, ax, bx = bm[6]); + smlal(&accum3, ax = a[5], bx); + smlal(&accum1, ax, bx = bm[5]); + smlal(&accum3, ax = a[6], bx); + smlal(&accum1, ax, bx = bm[4]); + smlal(&accum3, ax = a[7], bx); + smlal(&accum1, ax, bx = bm[3]); + + /* 1 terms */ + smlal(&accum2, ax = a[0], bx); + 
smlal(&accum0, ax, bx = bm[2]); + smlal(&accum2, ax = a[1], bx); + smlal(&accum0, ax, bx = bm[1]); + smlal(&accum2, ax = a[2], bx); + smlal(&accum0, ax, bx = bm[0]); + smlal(&accum2, ax = a[3], bx); + + accum0 += accumC0; + accum1 += accumC1; + accum2 += accum0 >> 28; + accum3 += accum1 >> 28; + + c[2] = ((uint32_t)(accum0)) & mask; + c[3] = ((uint32_t)(accum2)) & mask; + c[10] = ((uint32_t)(accum1)) & mask; + c[11] = ((uint32_t)(accum3)) & mask; + + accumC0 = accum2 >> 28; + accumC1 = accum3 >> 28; + } + { + + /* t^3 terms */ + smull(&accum1, ax = aa[5], bx = b[15]); + smull(&accum3, ax = aa[6], bx); + smlal(&accum1, ax, bx = b[14]); + smlal(&accum3, ax = aa[7], bx); + smlal(&accum1, ax, bx = b[13]); + + accum0 = accum1; + accum2 = accum3; + + /* t^2 terms */ + + smlal(&accum2, ax = aa[0], bx); + smlal(&accum0, ax, bx = b[12]); + smlal(&accum2, ax = aa[1], bx); + smlal(&accum0, ax, bx = b[11]); + smlal(&accum2, ax = aa[2], bx); + smlal(&accum0, ax, bx = b[10]); + smlal(&accum2, ax = aa[3], bx); + smlal(&accum0, ax, bx = b[9]); + smlal(&accum2, ax = aa[4], bx); + smlal(&accum0, ax, bx = b[8]); + smlal(&accum2, ax = aa[5], bx); + + + smlal(&accum0, ax = a[13], bx = b[7]); + smlal(&accum2, ax = a[14], bx); + smlal(&accum0, ax, bx = b[6]); + smlal(&accum2, ax = a[15], bx); + smlal(&accum0, ax, bx = b[5]); + + /* t terms */ + accum1 += accum0; + accum3 += accum2; + + smlal(&accum3, ax = a[8], bx); + smlal(&accum1, ax, bx = b[4]); + smlal(&accum3, ax = a[9], bx); + smlal(&accum1, ax, bx = b[3]); + smlal(&accum3, ax = a[10], bx); + smlal(&accum1, ax, bx = b[2]); + smlal(&accum3, ax = a[11], bx); + smlal(&accum1, ax, bx = b[1]); + smlal(&accum3, ax = a[12], bx); + smlal(&accum1, ax, bx = b[0]); + smlal(&accum3, ax = a[13], bx); + + + smlal(&accum1, ax = a[5], bx = bm[7]); + smlal(&accum3, ax = a[6], bx); + smlal(&accum1, ax, bx = bm[6]); + smlal(&accum3, ax = a[7], bx); + smlal(&accum1, ax, bx = bm[5]); + + /* 1 terms */ + + smlal(&accum2, ax = a[0], bx); + smlal(&accum0, 
ax, bx = bm[4]); + smlal(&accum2, ax = a[1], bx); + smlal(&accum0, ax, bx = bm[3]); + smlal(&accum2, ax = a[2], bx); + smlal(&accum0, ax, bx = bm[2]); + smlal(&accum2, ax = a[3], bx); + smlal(&accum0, ax, bx = bm[1]); + smlal(&accum2, ax = a[4], bx); + smlal(&accum0, ax, bx = bm[0]); + smlal(&accum2, ax = a[5], bx); + + accum0 += accumC0; + accum1 += accumC1; + accum2 += accum0 >> 28; + accum3 += accum1 >> 28; + + c[4] = ((uint32_t)(accum0)) & mask; + c[5] = ((uint32_t)(accum2)) & mask; + c[12] = ((uint32_t)(accum1)) & mask; + c[13] = ((uint32_t)(accum3)) & mask; + + accumC0 = accum2 >> 28; + accumC1 = accum3 >> 28; + } + { + + /* t^3 terms */ + smull(&accum1, ax = aa[7], bx = b[15]); + accum0 = accum1; + + /* t^2 terms */ + + smull(&accum2, ax = aa[0], bx); + smlal(&accum0, ax, bx = b[14]); + smlal(&accum2, ax = aa[1], bx); + smlal(&accum0, ax, bx = b[13]); + smlal(&accum2, ax = aa[2], bx); + smlal(&accum0, ax, bx = b[12]); + smlal(&accum2, ax = aa[3], bx); + smlal(&accum0, ax, bx = b[11]); + smlal(&accum2, ax = aa[4], bx); + smlal(&accum0, ax, bx = b[10]); + smlal(&accum2, ax = aa[5], bx); + smlal(&accum0, ax, bx = b[9]); + smlal(&accum2, ax = aa[6], bx); + smlal(&accum0, ax, bx = b[8]); + smlal(&accum2, ax = aa[7], bx); + + + smlal(&accum0, ax = a[15], bx = b[7]); + + /* t terms */ + accum1 += accum0; + accum3 = accum2; + + smlal(&accum3, ax = a[8], bx); + smlal(&accum1, ax, bx = b[6]); + smlal(&accum3, ax = a[9], bx); + smlal(&accum1, ax, bx = b[5]); + smlal(&accum3, ax = a[10], bx); + smlal(&accum1, ax, bx = b[4]); + smlal(&accum3, ax = a[11], bx); + smlal(&accum1, ax, bx = b[3]); + smlal(&accum3, ax = a[12], bx); + smlal(&accum1, ax, bx = b[2]); + smlal(&accum3, ax = a[13], bx); + smlal(&accum1, ax, bx = b[1]); + smlal(&accum3, ax = a[14], bx); + smlal(&accum1, ax, bx = b[0]); + smlal(&accum3, ax = a[15], bx); + + + smlal(&accum1, ax = a[7], bx = bm[7]); + + /* 1 terms */ + + smlal(&accum2, ax = a[0], bx); + smlal(&accum0, ax, bx = bm[6]); + smlal(&accum2, ax 
= a[1], bx); + smlal(&accum0, ax, bx = bm[5]); + smlal(&accum2, ax = a[2], bx); + smlal(&accum0, ax, bx = bm[4]); + smlal(&accum2, ax = a[3], bx); + smlal(&accum0, ax, bx = bm[3]); + smlal(&accum2, ax = a[4], bx); + smlal(&accum0, ax, bx = bm[2]); + smlal(&accum2, ax = a[5], bx); + smlal(&accum0, ax, bx = bm[1]); + smlal(&accum2, ax = a[6], bx); + smlal(&accum0, ax, bx = bm[0]); + smlal(&accum2, ax = a[7], bx); + + accum0 += accumC0; + accum1 += accumC1; + accum2 += accum0 >> 28; + accum3 += accum1 >> 28; + + c[6] = ((uint32_t)(accum0)) & mask; + c[7] = ((uint32_t)(accum2)) & mask; + c[14] = ((uint32_t)(accum1)) & mask; + c[15] = ((uint32_t)(accum3)) & mask; + + accum0 = accum2 >> 28; + accum1 = accum3 >> 28; + } + + accum0 += accum1; + accum0 += c[8]; + accum1 += c[0]; + c[8] = ((uint32_t)(accum0)) & mask; + c[0] = ((uint32_t)(accum1)) & mask; + + accum0 >>= 28; + accum1 >>= 28; + c[9] += ((uint32_t)(accum0)); + c[1] += ((uint32_t)(accum1)); +} + +void +p448_sqr ( + p448_t *__restrict__ cs, + const p448_t *as +) { + // p448_t ar, br; +// p448_copy(&ar,as); +// p448_copy(&br,bs); +// p448_weak_reduce(&ar); +// p448_weak_reduce(&br); + + const uint32_t *a = as->limb; + uint32_t *c = cs->limb; + + uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1, tmp; + uint32_t mask = (1<<28) - 1; + + uint32_t bm[8]; + + /* For some reason clang doesn't vectorize this without prompting? 
*/ + // unsigned int i; + // for (i=0; i> 28; + accum3 += accum1 >> 28; + + c[0] = ((uint32_t)(accum0)) & mask; + c[1] = ((uint32_t)(accum2)) & mask; + c[8] = ((uint32_t)(accum1)) & mask; + c[9] = ((uint32_t)(accum3)) & mask; + + accumC0 = accum2 >> 28; + accumC1 = accum3 >> 28; + } + { + /* t^3 terms */ + smull2(&accum1, ax = a[11], bx = a[15]); + smull2(&accum3, ax = a[12], bx); + smlal2(&accum1, ax, bx = a[14]); + smlal2(&accum3, ax = a[13], bx); + smlal(&accum1, ax, ax); + + accum0 = accum1; + accum2 = accum3; + + /* t^2 terms */ + smlal2(&accum2, ax = a[8], bx = a[11]); + smlal2(&accum0, ax, bx = a[10]); + smlal2(&accum2, ax = a[9], bx); + smlal(&accum0, ax, ax); + + smlal2(&accum0, ax = a[3], bx = a[7]); + smlal2(&accum2, ax = a[4], bx); + smlal2(&accum0, ax, bx = a[6]); + smlal2(&accum2, ax = a[5], bx); + smlal(&accum0, ax, ax); + + /* t terms */ + accum1 += accum0; + accum3 += accum2; + smlal2(&accum3, ax = a[0], bx = a[3]); + smlal2(&accum1, ax, bx = a[2]); + smlal2(&accum3, ax = a[1], bx); + smlal(&accum1, ax, ax); + + accum1 = -accum1; + accum3 = -accum3; + accum2 = -accum2; + accum0 = -accum0; + + smlal2(&accum1, ax = bm[3], bx = bm[7]); + smlal2(&accum3, ax = bm[4], bx); + smlal2(&accum1, ax, bx = bm[6]); + smlal2(&accum3, ax = bm[5], bx); + smlal(&accum1, ax, ax); + + /* 1 terms */ + smlal2(&accum2, ax = bm[0], bx = bm[3]); + smlal2(&accum0, ax, bx = bm[2]); + smlal2(&accum2, ax = bm[1], bx); + smlal(&accum0, ax, ax); + + + tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; + tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; + + accum0 += accumC0; + accum1 += accumC1; + accum2 += accum0 >> 28; + accum3 += accum1 >> 28; + + c[2] = ((uint32_t)(accum0)) & mask; + c[3] = ((uint32_t)(accum2)) & mask; + c[10] = ((uint32_t)(accum1)) & mask; + c[11] = ((uint32_t)(accum3)) & mask; + + accumC0 = accum2 >> 28; + accumC1 = accum3 >> 28; + } + { + + /* t^3 terms */ + smull2(&accum1, ax = a[13], bx = a[15]); + smull2(&accum3, ax = a[14], bx); + smlal(&accum1, ax, 
ax); + + accum0 = accum1; + accum2 = accum3; + + /* t^2 terms */ + + smlal2(&accum2, ax = a[8], bx = a[13]); + smlal2(&accum0, ax, bx = a[12]); + smlal2(&accum2, ax = a[9], bx); + smlal2(&accum0, ax, bx = a[11]); + smlal2(&accum2, ax = a[10], bx); + smlal(&accum0, ax, ax); + + + smlal2(&accum0, ax = a[5], bx = a[7]); + smlal2(&accum2, ax = a[6], bx); + smlal(&accum0, ax, ax); + + /* t terms */ + accum1 += accum0; + accum3 += accum2; + + smlal2(&accum3, ax = a[0], bx = a[5]); + smlal2(&accum1, ax, bx = a[4]); + smlal2(&accum3, ax = a[1], bx); + smlal2(&accum1, ax, bx = a[3]); + smlal2(&accum3, ax = a[2], bx); + smlal(&accum1, ax, ax); + + accum1 = -accum1; + accum3 = -accum3; + accum2 = -accum2; + accum0 = -accum0; + + smlal2(&accum1, ax = bm[5], bx = bm[7]); + smlal2(&accum3, ax = bm[6], bx); + smlal(&accum1, ax, ax); + + /* 1 terms */ + + smlal2(&accum2, ax = bm[0], bx = bm[5]); + smlal2(&accum0, ax, bx = bm[4]); + smlal2(&accum2, ax = bm[1], bx); + smlal2(&accum0, ax, bx = bm[3]); + smlal2(&accum2, ax = bm[2], bx); + smlal(&accum0, ax, ax); + + + tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; + tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; + + accum0 += accumC0; + accum1 += accumC1; + accum2 += accum0 >> 28; + accum3 += accum1 >> 28; + + c[4] = ((uint32_t)(accum0)) & mask; + c[5] = ((uint32_t)(accum2)) & mask; + c[12] = ((uint32_t)(accum1)) & mask; + c[13] = ((uint32_t)(accum3)) & mask; + + accumC0 = accum2 >> 28; + accumC1 = accum3 >> 28; + } + { + + /* t^3 terms */ + smull(&accum1, ax = a[15], bx = a[15]); + accum0 = accum1; + + /* t^2 terms */ + + smull2(&accum2, ax = a[8], bx); + smlal2(&accum0, ax, bx = a[14]); + smlal2(&accum2, ax = a[9], bx); + smlal2(&accum0, ax, bx = a[13]); + smlal2(&accum2, ax = a[10], bx); + smlal2(&accum0, ax, bx = a[12]); + smlal2(&accum2, ax = a[11], bx); + smlal(&accum0, ax, ax); + + + smlal(&accum0, ax = a[7], bx = a[7]); + + /* t terms */ + accum1 += accum0; + accum3 = accum2; + + smlal2(&accum3, ax = a[0], bx); + 
smlal2(&accum1, ax, bx = a[6]); + smlal2(&accum3, ax = a[1], bx); + smlal2(&accum1, ax, bx = a[5]); + smlal2(&accum3, ax = a[2], bx); + smlal2(&accum1, ax, bx = a[4]); + smlal2(&accum3, ax = a[3], bx); + smlal(&accum1, ax, ax); + + accum1 = -accum1; + accum3 = -accum3; + accum2 = -accum2; + accum0 = -accum0; + + bx = bm[7]; + smlal(&accum1, bx, bx); + + /* 1 terms */ + + smlal2(&accum2, ax = bm[0], bx); + smlal2(&accum0, ax, bx = bm[6]); + smlal2(&accum2, ax = bm[1], bx); + smlal2(&accum0, ax, bx = bm[5]); + smlal2(&accum2, ax = bm[2], bx); + smlal2(&accum0, ax, bx = bm[4]); + smlal2(&accum2, ax = bm[3], bx); + smlal(&accum0, ax, ax); + + tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; + tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; + + + accum0 += accumC0; + accum1 += accumC1; + accum2 += accum0 >> 28; + accum3 += accum1 >> 28; + + c[6] = ((uint32_t)(accum0)) & mask; + c[7] = ((uint32_t)(accum2)) & mask; + c[14] = ((uint32_t)(accum1)) & mask; + c[15] = ((uint32_t)(accum3)) & mask; + + accum0 = accum2 >> 28; + accum1 = accum3 >> 28; + } + + accum0 += accum1; + accum0 += c[8]; + accum1 += c[0]; + c[8] = ((uint32_t)(accum0)) & mask; + c[0] = ((uint32_t)(accum1)) & mask; + + accum0 >>= 28; + accum1 >>= 28; + c[9] += ((uint32_t)(accum0)); + c[1] += ((uint32_t)(accum1)); +} + +void +p448_mulw ( + p448_t *__restrict__ cs, + const p448_t *as, + uint64_t b +) { + const uint32_t bhi = b>>28, blo = b & (1<<28)-1; + + const uint32_t *a = as->limb; + uint32_t *c = cs->limb; + + uint64_t accum0, accum8; + uint32_t mask = (1ull<<28)-1; + + int i; + + uint32_t c0, c8, n0, n8; + accum0 = widemul_32(bhi, a[15]); + accum8 = widemul_32(bhi, a[15] + a[7]); + c0 = a[0]; c8 = a[8]; + smlal(&accum0, blo, c0); + smlal(&accum8, blo, c8); + + c[0] = accum0 & mask; accum0 >>= 28; + c[8] = accum8 & mask; accum8 >>= 28; + + i=1; + { + n0 = a[i]; n8 = a[i+8]; + smlal(&accum0, bhi, c0); + smlal(&accum8, bhi, c8); + smlal(&accum0, blo, n0); + smlal(&accum8, blo, n8); + + c[i] = accum0 & 
mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + c0 = a[i]; c8 = a[i+8]; + smlal(&accum0, bhi, n0); + smlal(&accum8, bhi, n8); + smlal(&accum0, blo, c0); + smlal(&accum8, blo, c8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + n0 = a[i]; n8 = a[i+8]; + smlal(&accum0, bhi, c0); + smlal(&accum8, bhi, c8); + smlal(&accum0, blo, n0); + smlal(&accum8, blo, n8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + c0 = a[i]; c8 = a[i+8]; + smlal(&accum0, bhi, n0); + smlal(&accum8, bhi, n8); + smlal(&accum0, blo, c0); + smlal(&accum8, blo, c8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + n0 = a[i]; n8 = a[i+8]; + smlal(&accum0, bhi, c0); + smlal(&accum8, bhi, c8); + smlal(&accum0, blo, n0); + smlal(&accum8, blo, n8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + c0 = a[i]; c8 = a[i+8]; + smlal(&accum0, bhi, n0); + smlal(&accum8, bhi, n8); + smlal(&accum0, blo, c0); + smlal(&accum8, blo, c8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + n0 = a[i]; n8 = a[i+8]; + smlal(&accum0, bhi, c0); + smlal(&accum8, bhi, c8); + smlal(&accum0, blo, n0); + smlal(&accum8, blo, n8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + + accum0 += accum8 + c[8]; + c[8] = accum0 & mask; + c[9] += accum0 >> 28; + + accum8 += c[0]; + c[0] = accum8 & mask; + c[1] += accum8 >> 28; +} + +void +p448_strong_reduce ( + p448_t *a +) { + word_t mask = (1ull<<28)-1; + + /* first, clear high */ + a->limb[8] += a->limb[15]>>28; + a->limb[0] += a->limb[15]>>28; + a->limb[15] &= mask; + + /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */ + + /* compute total_value - p. No need to reduce mod p. 
*/ + + dsword_t scarry = 0; + int i; + for (i=0; i<16; i++) { + scarry = scarry + a->limb[i] - ((i==8)?mask-1:mask); + a->limb[i] = scarry & mask; + scarry >>= 28; + } + + /* uncommon case: it was >= p, so now scarry = 0 and this = x + * common case: it was < p, so now scarry = -1 and this = x - p + 2^448 + * so let's add back in p. will carry back off the top for 2^448. + */ + + assert(is_zero(scarry) | is_zero(scarry+1)); + + word_t scarry_mask = scarry & mask; + dword_t carry = 0; + + /* add it back */ + for (i=0; i<16; i++) { + carry = carry + a->limb[i] + ((i==8)?(scarry_mask&~1):scarry_mask); + a->limb[i] = carry & mask; + carry >>= 28; + } + + assert(is_zero(carry + scarry)); +} + +mask_t +p448_is_zero ( + const struct p448_t *a +) { + struct p448_t b; + p448_copy(&b,a); + p448_strong_reduce(&b); + + uint32_t any = 0; + int i; + for (i=0; i<16; i++) { + any |= b.limb[i]; + } + return is_zero(any); +} + +void +p448_serialize ( + uint8_t *serial, + const struct p448_t *x +) { + int i,j; + p448_t red; + p448_copy(&red, x); + p448_strong_reduce(&red); + for (i=0; i<8; i++) { + uint64_t limb = red.limb[2*i] + (((uint64_t)red.limb[2*i+1])<<28); + for (j=0; j<7; j++) { + serial[7*i+j] = limb; + limb >>= 8; + } + assert(limb == 0); + } +} + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +) { + int i,j; + for (i=0; i<8; i++) { + uint64_t out = 0; + for (j=0; j<7; j++) { + out |= ((uint64_t)serial[7*i+j])<<(8*j); + } + x->limb[2*i] = out & (1ull<<28)-1; + x->limb[2*i+1] = out >> 28; + } + + /* Check for reduction. + * + * The idea is to create a variable ge which is all ones (rather, 56 ones) + * if and only if the low $i$ words of $x$ are >= those of p. + * + * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) + */ + uint32_t ge = -1, mask = (1ull<<28)-1; + for (i=0; i<8; i++) { + ge &= x->limb[i]; + } + + /* At this point, ge = 1111 iff bottom are all 1111. 
Now propagate if 1110, or set if 1111 */ + ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask); + + /* Propagate the rest */ + for (i=9; i<16; i++) { + ge &= x->limb[i]; + } + + return ~is_zero(ge ^ mask); +} + +void +simultaneous_invert_p448( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +) { + if (n==0) { + return; + } else if (n==1) { + p448_inverse(out,in); + return; + } + + p448_copy(&out[1], &in[0]); + int i; + for (i=1; i<(int) (n-1); i++) { + p448_mul(&out[i+1], &out[i], &in[i]); + } + p448_mul(&out[0], &out[n-1], &in[n-1]); + + struct p448_t tmp; + p448_inverse(&tmp, &out[0]); + p448_copy(&out[0], &tmp); + + /* at this point, out[0] = product(in[i]) ^ -1 + * out[i] = product(in[0]..in[i-1]) if i != 0 + */ + for (i=n-1; i>0; i--) { + p448_mul(&tmp, &out[i], &out[0]); + p448_copy(&out[i], &tmp); + + p448_mul(&tmp, &out[0], &in[i]); + p448_copy(&out[0], &tmp); + } +} diff --git a/src/arch_arm_32/p448.h b/src/arch_arm_32/p448.h new file mode 100644 index 0000000..4628a89 --- /dev/null +++ b/src/arch_arm_32/p448.h @@ -0,0 +1,378 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ +#ifndef __P448_H__ +#define __P448_H__ 1 + +#include "word.h" + +#include +#include + +typedef struct p448_t { + uint32_t limb[16]; +} __attribute__((aligned(32))) p448_t; + +#ifdef __cplusplus +extern "C" { +#endif + +static __inline__ void +p448_set_ui ( + p448_t *out, + uint64_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t do_swap +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_add ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_neg ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_neg ( + p448_t *a, + mask_t doNegate +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_addw ( + p448_t *a, + uint32_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_subw ( + p448_t *a, + uint32_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_copy ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_weak_reduce ( + p448_t *inout +) __attribute__((unused,always_inline)); + +void +p448_strong_reduce ( + p448_t *inout +); + +mask_t +p448_is_zero ( + const p448_t *in +); + +static __inline__ void +p448_bias ( + p448_t *inout, + int amount +) __attribute__((unused,always_inline)); + +void +p448_mul ( + p448_t *__restrict__ out, + const p448_t *a, + const p448_t *b +); + +void +p448_mulw ( + p448_t *__restrict__ out, + const p448_t *a, + uint64_t b +); + +void +p448_sqr ( + p448_t *__restrict__ out, + const p448_t *a +); + +static __inline__ void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) __attribute__((unused,always_inline)); + +void +p448_serialize 
( + uint8_t *serial, + const struct p448_t *x +); + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +); + +static __inline__ void +p448_mask( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) __attribute__((unused,always_inline)); + +/** +* Returns 1/x. +* +* If x=0, returns 0. +*/ +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +); + +void +simultaneous_invert_p448 ( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +); + +static inline mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) __attribute__((always_inline,unused)); + +/* -------------- Inline functions begin here -------------- */ + +void +p448_set_ui ( + p448_t *out, + uint64_t x +) { + int i; + out->limb[0] = x & (1<<28)-1; + out->limb[1] = x>>28; + for (i=2; i<16; i++) { + out->limb[i] = 0; + } +} + +void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t doswap +) { + big_register_t *aa = (big_register_t*)a; + big_register_t *bb = (big_register_t*)b; + big_register_t m = doswap; + + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] + b->limb[i]; + } + */ +} + +void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] - b->limb[i]; + } + */ +} + +void +p448_neg ( + p448_t *out, + const p448_t *a +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = -a->limb[i]; + } + */ +} + +void +p448_cond_neg( + p448_t *a, + mask_t doNegate +) { + unsigned int i; + struct p448_t negated; + big_register_t *aa = (big_register_t *)a; + big_register_t *nn = (big_register_t*)&negated; + big_register_t m = doNegate; + + p448_neg(&negated, a); + p448_bias(&negated, 2); + + for (i=0; ilimb[0] += x; +} + +void +p448_subw ( + p448_t *a, + uint32_t x +) { + a->limb[0] -= x; +} + +void +p448_copy ( + p448_t *out, + const p448_t *a +) { + *out = *a; +} + +void +p448_bias ( + p448_t *a, + int amt 
+) { + uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt; + uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1}; + uint32x4_t *aa = (uint32x4_t*) a; + aa[0] += lo; + aa[1] += lo; + aa[2] += hi; + aa[3] += lo; +} + +void +p448_weak_reduce ( + p448_t *a +) { + uint64_t mask = (1ull<<28) - 1; + uint64_t tmp = a->limb[15] >> 28; + int i; + a->limb[8] += tmp; + for (i=15; i>0; i--) { + a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); + } + a->limb[0] = (a->limb[0] & mask) + tmp; +} + +void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) { + p448_t tmp; + assert(n>0); + if (n&1) { + p448_sqr(y,x); + n--; + } else { + p448_sqr(&tmp,x); + p448_sqr(y,&tmp); + n-=2; + } + for (; n; n-=2) { + p448_sqr(&tmp,y); + p448_sqr(y,&tmp); + } +} + +mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) { + struct p448_t ra, rb; + p448_copy(&ra, a); + p448_copy(&rb, b); + p448_weak_reduce(&ra); + p448_weak_reduce(&rb); + p448_sub(&ra, &ra, &rb); + p448_bias(&ra, 2); + return p448_is_zero(&ra); +} + +void +p448_mask ( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + a->limb[i] = b->limb[i] & mask; + } +} + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __P448_H__ */ diff --git a/src/arch_x86_64/ec_point.c b/src/arch_x86_64/ec_point.c new file mode 100644 index 0000000..87df79f --- /dev/null +++ b/src/arch_x86_64/ec_point.c @@ -0,0 +1,910 @@ +/** + * @cond internal + * @file ec_point.c + * @copyright + * Copyright (c) 2014 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. + * @author Mike Hamburg + * @warning This file was automatically generated. 
+ */ + +#include "ec_point.h" + + +void +p448_isr ( + struct p448_t* a, + const struct p448_t* x +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L1, x ); + p448_mul ( &L2, x, &L1 ); + p448_sqr ( &L1, &L2 ); + p448_mul ( &L2, x, &L1 ); + p448_sqrn ( &L1, &L2, 3 ); + p448_mul ( &L0, &L2, &L1 ); + p448_sqrn ( &L1, &L0, 3 ); + p448_mul ( &L0, &L2, &L1 ); + p448_sqrn ( &L2, &L0, 9 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sqr ( &L0, &L1 ); + p448_mul ( &L2, x, &L0 ); + p448_sqrn ( &L0, &L2, 18 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqrn ( &L0, &L2, 37 ); + p448_mul ( &L1, &L2, &L0 ); + p448_sqrn ( &L0, &L1, 37 ); + p448_mul ( &L1, &L2, &L0 ); + p448_sqrn ( &L0, &L1, 111 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqr ( &L0, &L2 ); + p448_mul ( &L1, x, &L0 ); + p448_sqrn ( &L0, &L1, 223 ); + p448_mul ( a, &L2, &L0 ); +} + +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +) { + struct p448_t L0, L1; + p448_isr ( &L0, x ); + p448_sqr ( &L1, &L0 ); + p448_sqr ( &L0, &L1 ); + p448_mul ( a, x, &L0 ); +} + +void +add_tw_niels_to_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_mul ( &L0, &e->a, &L1 ); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->b, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_sub ( &d->y, &d->z, &d->x ); + p448_bias ( &d->y, 2 ); + p448_add ( &L0, &d->x, &d->z ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +sub_tw_niels_from_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_mul ( &L0, &e->b, &L1 ); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->a, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + 
p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_add ( &d->y, &d->x, &d->z ); + p448_sub ( &L0, &d->z, &d->x ); + p448_bias ( &L0, 2 ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +add_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + add_tw_niels_to_tw_extensible( e, &a->n ); +} + +void +sub_tw_pniels_from_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + sub_tw_niels_from_tw_extensible( e, &a->n ); +} + +void +double_tw_extensible ( + struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &a->u, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &L1, &a->t ); + p448_sub ( &a->t, &L1, &a->u ); + p448_bias ( &a->t, 3 ); + p448_sub ( &L1, &L0, &L2 ); + p448_bias ( &L1, 2 ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 2 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_mul ( &a->z, &L1, &L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +double_extensible ( + struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &L1, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &a->u, &a->t ); + p448_sub ( &a->t, &a->u, &L1 ); + p448_bias ( &a->t, 3 ); + p448_sub ( &a->u, &L0, &L2 ); + p448_bias ( &a->u, 2 ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 2 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_mul ( &a->z, &L1, &L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +twist_and_double ( + 
struct tw_extensible_t* b, + const struct extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &b->u, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &L0, &b->t ); + p448_sub ( &b->t, &L0, &b->u ); + p448_bias ( &b->t, 3 ); + p448_sub ( &L0, &b->z, &b->x ); + p448_bias ( &L0, 2 ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 2 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +untwist_and_double ( + struct extensible_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &L0, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &b->u, &b->t ); + p448_sub ( &b->t, &b->u, &L0 ); + p448_bias ( &b->t, 3 ); + p448_sub ( &b->u, &b->z, &b->x ); + p448_bias ( &b->u, 2 ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 2 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +convert_tw_affine_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_affine_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( &b->n.c, &a->y, &a->x ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_set_ui( &b->z, 2 ); +} + +void +convert_tw_affine_to_tw_extensible ( + struct tw_extensible_t* b, + const struct tw_affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_affine_to_extensible ( + struct 
extensible_t* b, + const struct affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_tw_extensible_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_extensible_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( &b->n.c, &a->u, &a->t ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_add ( &b->z, &a->z, &a->z ); + p448_weak_reduce( &b->z ); +} + +void +convert_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* d +) { + p448_add ( &e->u, &d->n.b, &d->n.a ); + p448_sub ( &e->t, &d->n.b, &d->n.a ); + p448_bias ( &e->t, 2 ); + p448_mul ( &e->x, &d->z, &e->t ); + p448_mul ( &e->y, &d->z, &e->u ); + p448_sqr ( &e->z, &d->z ); +} + +void +convert_tw_niels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_niels_t* d +) { + p448_add ( &e->y, &d->b, &d->a ); + p448_weak_reduce( &e->y ); + p448_sub ( &e->x, &d->b, &d->a ); + p448_bias ( &e->x, 2 ); + p448_weak_reduce( &e->x ); + p448_set_ui( &e->z, 1 ); + p448_copy ( &e->t, &e->x ); + p448_copy ( &e->u, &e->y ); +} + +void +montgomery_step ( + struct montgomery_t* a +) { + struct p448_t L0, L1; + p448_add ( &L0, &a->zd, &a->xd ); + p448_sub ( &L1, &a->xd, &a->zd ); + p448_bias ( &L1, 2 ); + p448_sub ( &a->zd, &a->xa, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_add ( &a->zd, &a->za, &a->xa ); + p448_mul ( &a->za, &L1, &a->zd ); + p448_add ( &a->xa, &a->za, &a->xd ); + p448_sqr ( &a->zd, &a->xa ); + p448_mul ( &a->xa, &a->z0, &a->zd ); + p448_sub ( &a->zd, &a->xd, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_sqr ( &a->za, &a->zd ); + p448_sqr ( &a->xd, &L0 ); + p448_sqr ( &L0, &L1 ); + 
p448_mulw ( &a->zd, &a->xd, 39082 ); + p448_sub ( &L1, &a->xd, &L0 ); + p448_bias ( &L1, 2 ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_sub ( &L0, &a->zd, &L1 ); + p448_bias ( &L0, 4 ); + p448_mul ( &a->zd, &L0, &L1 ); +} + +void +deserialize_montgomery ( + struct montgomery_t* a, + const struct p448_t* sbz +) { + p448_sqr ( &a->z0, sbz ); + p448_set_ui( &a->xd, 1 ); + p448_set_ui( &a->zd, 0 ); + p448_set_ui( &a->xa, 1 ); + p448_copy ( &a->za, &a->z0 ); +} + +mask_t +serialize_montgomery ( + struct p448_t* b, + const struct montgomery_t* a, + const struct p448_t* sbz +) { + struct p448_t L0, L1, L2, L3; + mask_t L4, L5, L6; + p448_mul ( &L3, &a->z0, &a->zd ); + p448_sub ( &L1, &L3, &a->xd ); + p448_bias ( &L1, 2 ); + p448_mul ( &L3, &a->za, &L1 ); + p448_mul ( &L2, &a->z0, &a->xd ); + p448_sub ( &L1, &L2, &a->zd ); + p448_bias ( &L1, 2 ); + p448_mul ( &L2, &a->xa, &L1 ); + p448_add ( &L1, &L2, &L3 ); + p448_sub ( &L0, &L3, &L2 ); + p448_bias ( &L0, 2 ); + p448_mul ( &L3, &L0, &L1 ); + p448_copy ( &L2, &a->z0 ); + p448_addw ( &L2, 1 ); + p448_sqr ( &L1, &L2 ); + p448_mulw ( &L2, &L1, 39082 ); + p448_neg ( &L1, &L2 ); + p448_add ( &L0, &a->z0, &a->z0 ); + p448_bias ( &L0, 1 ); + p448_add ( &L2, &L0, &L0 ); + p448_add ( &L0, &L2, &L1 ); + p448_mul ( &L2, &a->xd, &L0 ); + L5 = p448_is_zero( &a->zd ); + L6 = - L5; + p448_mask ( &L1, &L2, L5 ); + p448_add ( &L2, &L1, &a->zd ); + L4 = ~ L5; + p448_mul ( &L1, sbz, &L3 ); + p448_addw ( &L1, L6 ); + p448_mul ( &L3, &L2, &L1 ); + p448_mul ( &L1, &L3, &L2 ); + p448_mul ( &L2, &L3, &a->xd ); + p448_mul ( &L3, &L1, &L2 ); + p448_isr ( &L0, &L3 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, &L3, &L1 ); + p448_mask ( b, &L2, L4 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + L5 = p448_is_zero( &L0 ); + L4 = p448_is_zero( sbz ); + return L5 | L4; +} + +void +serialize_extensible ( + struct p448_t* b, + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sub ( &L0, &a->y, &a->z ); 
+ p448_bias ( &L0, 2 ); + p448_add ( b, &a->z, &a->y ); + p448_mul ( &L1, &a->z, &a->x ); + p448_mul ( &L2, &L0, &L1 ); + p448_mul ( &L1, &L2, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( &L2, &L1, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( b, &L1, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, &L2, &L1 ); +} + +void +untwist_and_double_and_serialize ( + struct p448_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_mul ( &L3, &a->y, &a->x ); + p448_add ( b, &a->y, &a->x ); + p448_sqr ( &L1, b ); + p448_add ( &L2, &L3, &L3 ); + p448_sub ( b, &L1, &L2 ); + p448_bias ( b, 3 ); + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &L2 ); + p448_add ( &L2, b, b ); + p448_mulw ( b, &L2, 39082 ); + p448_neg ( &L2, b ); + p448_bias ( &L2, 2 ); + p448_mulw ( &L0, &L2, 39082 ); + p448_neg ( b, &L0 ); + p448_bias ( b, 2 ); + p448_mul ( &L0, &L2, &L1 ); + p448_mul ( &L2, b, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( &L1, b, &L0 ); + p448_sqr ( b, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( b, &L1, &L3 ); +} + +void +twist_even ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L0, L1; + p448_sqr ( &b->y, &a->z ); + p448_sqr ( &b->z, &a->x ); + p448_sub ( &b->u, &b->y, &b->z ); + p448_bias ( &b->u, 2 ); + p448_sub ( &b->z, &a->z, &a->x ); + p448_bias ( &b->z, 2 ); + p448_mul ( &b->y, &b->z, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_mul ( &b->x, &b->z, &b->y ); + p448_mul ( &b->t, &b->x, &b->u ); + p448_mul ( &b->y, &b->x, &b->t ); + p448_isr ( &b->t, &b->y ); + p448_mul ( &b->u, &b->x, &b->t ); + p448_sqr ( &b->x, &b->t ); + p448_mul ( &b->t, &b->y, &b->x ); + p448_mul ( &b->x, &a->x, &b->u ); + p448_mul ( &b->y, &a->y, &b->u ); + L1 = p448_is_zero( &b->z ); + L0 = - L1; + p448_addw ( &b->y, L0 ); + p448_weak_reduce( &b->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +void +test_only_twist ( + struct tw_extensible_t* b, + const 
struct extensible_t* a +) { + struct p448_t L0, L1; + mask_t L2, L3; + p448_sqr ( &b->u, &a->z ); + p448_sqr ( &b->y, &a->x ); + p448_sub ( &b->z, &b->u, &b->y ); + p448_bias ( &b->z, 2 ); + p448_add ( &b->y, &b->z, &b->z ); + p448_add ( &b->u, &b->y, &b->y ); + p448_sub ( &b->y, &a->z, &a->x ); + p448_bias ( &b->y, 2 ); + p448_mul ( &b->t, &b->y, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_mul ( &b->x, &b->z, &b->t ); + p448_mul ( &b->t, &b->x, &b->u ); + p448_mul ( &L1, &b->x, &b->t ); + p448_isr ( &b->t, &L1 ); + p448_mul ( &b->u, &b->x, &b->t ); + p448_sqr ( &b->x, &b->t ); + p448_mul ( &b->t, &L1, &b->x ); + p448_add ( &L1, &a->y, &a->x ); + p448_sub ( &L0, &a->x, &a->y ); + p448_bias ( &L0, 2 ); + p448_mul ( &b->x, &b->t, &L0 ); + p448_add ( &L0, &b->x, &L1 ); + p448_sub ( &b->t, &L1, &b->x ); + p448_bias ( &b->t, 2 ); + p448_mul ( &b->x, &L0, &b->u ); + L2 = p448_is_zero( &b->y ); + L3 = - L2; + p448_addw ( &b->x, L3 ); + p448_weak_reduce( &b->x ); + p448_mul ( &b->y, &b->t, &b->u ); + L2 = p448_is_zero( &b->z ); + L3 = - L2; + p448_addw ( &b->y, L3 ); + p448_weak_reduce( &b->y ); + L3 = p448_is_zero( &a->y ); + L2 = L3 + 1; + p448_set_ui( &b->z, L2 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +mask_t +is_square ( + const struct p448_t* x +) { + struct p448_t L0, L1; + mask_t L2, L3; + p448_isr ( &L0, x ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, x, &L1 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + L3 = p448_is_zero( &L0 ); + L2 = p448_is_zero( x ); + return L3 | L2; +} + +mask_t +is_even_pt ( + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +is_even_tw ( + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_add ( &L0, &L1, &L2 ); + 
p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +deserialize_affine ( + struct affine_t* a, + const struct p448_t* sz +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L1, sz ); + p448_copy ( &L3, &L1 ); + p448_addw ( &L3, 1 ); + p448_sqr ( &a->x, &L3 ); + p448_mulw ( &L3, &a->x, 39082 ); + p448_neg ( &a->x, &L3 ); + p448_add ( &L3, &L1, &L1 ); + p448_bias ( &L3, 1 ); + p448_add ( &a->y, &L3, &L3 ); + p448_add ( &L3, &a->y, &a->x ); + p448_copy ( &a->y, &L1 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &a->x, &a->y ); + p448_bias ( &a->x, 2 ); + p448_mul ( &a->y, &a->x, &L3 ); + p448_sqr ( &L2, &a->x ); + p448_mul ( &L0, &L2, &a->y ); + p448_mul ( &a->y, &a->x, &L0 ); + p448_isr ( &L3, &a->y ); + p448_mul ( &a->y, &L2, &L3 ); + p448_sqr ( &L2, &L3 ); + p448_mul ( &L3, &L0, &L2 ); + p448_mul ( &L0, &a->x, &L3 ); + p448_add ( &L2, &a->y, &a->y ); + p448_mul ( &a->x, sz, &L2 ); + p448_addw ( &L1, 1 ); + p448_mul ( &a->y, &L1, &L3 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + return p448_is_zero( &L0 ); +} + +mask_t +deserialize_and_twist_approx ( + struct tw_extensible_t* a, + const struct p448_t* sdm1, + const struct p448_t* sz +) { + struct p448_t L0, L1; + p448_sqr ( &a->z, sz ); + p448_copy ( &a->y, &a->z ); + p448_addw ( &a->y, 1 ); + p448_sqr ( &a->x, &a->y ); + p448_mulw ( &a->y, &a->x, 39082 ); + p448_neg ( &a->x, &a->y ); + p448_add ( &a->y, &a->z, &a->z ); + p448_bias ( &a->y, 1 ); + p448_add ( &a->u, &a->y, &a->y ); + p448_add ( &a->y, &a->u, &a->x ); + p448_sqr ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &a->u, &a->x ); + p448_bias ( &a->u, 2 ); + p448_mul ( &a->x, sdm1, &a->u ); + p448_mul ( &L0, &a->x, &a->y ); + p448_mul ( &a->t, &L0, &a->y ); + p448_mul ( &a->u, &a->x, &a->t ); + p448_mul ( &a->t, &a->u, &L0 ); + p448_mul ( &a->y, &a->x, &a->t ); + p448_isr ( &L0, &a->y ); + p448_mul ( &a->y, &a->u, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &a->u, &a->t, &L1 ); + p448_mul ( &a->t, &a->x, &a->u ); + p448_add ( 
&a->x, sz, sz ); + p448_mul ( &L0, &a->u, &a->x ); + p448_copy ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &L1, &a->x ); + p448_bias ( &L1, 2 ); + p448_mul ( &a->x, &L1, &L0 ); + p448_mul ( &L0, &a->u, &a->y ); + p448_addw ( &a->z, 1 ); + p448_mul ( &a->y, &a->z, &L0 ); + p448_subw ( &a->t, 1 ); + p448_bias ( &a->t, 1 ); + mask_t ret = p448_is_zero( &a->t ); + p448_set_ui( &a->z, 1 ); + p448_copy ( &a->t, &a->x ); + p448_copy ( &a->u, &a->y ); + return ret; +} + +void +set_identity_extensible ( + struct extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_tw_extensible ( + struct tw_extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_affine ( + struct affine_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); +} + +mask_t +eq_affine ( + const struct affine_t* a, + const struct affine_t* b +) { + struct p448_t L0; + mask_t L1, L2; + p448_sub ( &L0, &a->x, &b->x ); + p448_bias ( &L0, 2 ); + L2 = p448_is_zero( &L0 ); + p448_sub ( &L0, &a->y, &b->y ); + p448_bias ( &L0, 2 ); + L1 = p448_is_zero( &L0 ); + return L2 & L1; +} + +mask_t +eq_extensible ( + const struct extensible_t* a, + const struct extensible_t* b +) { + struct p448_t L0, L1, L2; + mask_t L3, L4; + p448_mul ( &L2, &b->z, &a->x ); + p448_mul ( &L1, &a->z, &b->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + p448_mul ( &L2, &b->z, &a->y ); + p448_mul ( &L1, &a->z, &b->y ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L3 = p448_is_zero( &L0 ); + return L4 & L3; +} + +mask_t +eq_tw_extensible ( + const struct tw_extensible_t* a, + const struct tw_extensible_t* b +) { + struct p448_t L0, L1, L2; + mask_t L3, L4; + p448_mul ( &L2, &b->z, &a->x ); + p448_mul ( &L1, &a->z, &b->x ); + 
p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + p448_mul ( &L2, &b->z, &a->y ); + p448_mul ( &L1, &a->z, &b->y ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L3 = p448_is_zero( &L0 ); + return L4 & L3; +} + +void +elligator_2s_inject ( + struct affine_t* a, + const struct p448_t* r +) { + struct p448_t L0, L1, L2, L3, L4, L5, L6, L7; + mask_t L8, L9; + p448_sqr ( &a->x, r ); + p448_sqr ( &L1, &a->x ); + p448_copy ( &a->y, &L1 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &L7, &a->y ); + p448_bias ( &L7, 2 ); + p448_sqr ( &L0, &L7 ); + p448_mulw ( &L6, &L0, 1527402724 ); + p448_mulw ( &L5, &L1, 6108985600 ); + p448_add ( &a->y, &L5, &L6 ); + p448_mulw ( &L6, &L0, 6109454568 ); + p448_sub ( &L5, &a->y, &L6 ); + p448_bias ( &L5, 2 ); + p448_mulw ( &L2, &a->y, 78160 ); + p448_mul ( &L4, &L5, &L7 ); + p448_mul ( &L6, &L4, &L2 ); + p448_mul ( &L2, &L5, &L6 ); + p448_isr ( &L3, &L2 ); + p448_mul ( &L2, &L4, &L3 ); + p448_sqr ( &L4, &L3 ); + p448_mul ( &L3, &L6, &L4 ); + p448_mul ( &L6, &L5, &L3 ); + p448_mul ( &L5, &L6, &L3 ); + p448_copy ( &L4, &a->x ); + p448_subw ( &L4, 1 ); + p448_addw ( &a->x, 1 ); + p448_mul ( &L3, &a->x, &L6 ); + p448_sub ( &a->x, &L4, &L3 ); + p448_bias ( &a->x, 3 ); + p448_mul ( &L3, &L2, &a->x ); + p448_mulw ( &L2, &L3, 78160 ); + p448_neg ( &a->x, &L2 ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_add ( &L2, &L1, &L1 ); + p448_add ( &L1, &L2, &L0 ); + p448_subw ( &L1, 2 ); + p448_bias ( &L1, 1 ); + p448_mul ( &L0, &L1, &L6 ); + p448_mulw ( &L1, &L0, 3054649120 ); + p448_add ( &L0, &L1, &a->y ); + p448_mul ( &a->y, &L5, &L0 ); + L9 = p448_is_zero( &L7 ); + L8 = - L9; + p448_addw ( &a->y, L8 ); + p448_weak_reduce( &a->y ); +} + +mask_t +validate_affine ( + const struct affine_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L0, &a->y ); + p448_sqr ( &L2, &a->x ); + p448_add ( &L3, &L2, &L0 ); + p448_subw ( &L3, 1 ); + p448_mulw ( &L1, &L2, 39081 ); + p448_neg ( &L2, &L1 ); + 
p448_bias ( &L2, 2 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sub ( &L0, &L3, &L1 ); + p448_bias ( &L0, 3 ); + return p448_is_zero( &L0 ); +} + +mask_t +validate_tw_extensible ( + const struct tw_extensible_t* ext +) { + struct p448_t L0, L1, L2, L3; + mask_t L4, L5; + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L0, &ext->t, &ext->u ); + p448_mul ( &L2, &ext->z, &L0 ); + p448_addw ( &L2, 0 ); + p448_mul ( &L1, &ext->x, &ext->y ); + p448_neg ( &L0, &L1 ); + p448_add ( &L1, &L0, &L2 ); + p448_bias ( &L1, 2 ); + L5 = p448_is_zero( &L1 ); + /* + * Check invariant: + * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 + */ + p448_sqr ( &L2, &ext->y ); + p448_neg ( &L0, &L2 ); + p448_addw ( &L0, 0 ); + p448_sqr ( &L1, &ext->x ); + p448_add ( &L2, &L1, &L0 ); + p448_sqr ( &L3, &ext->u ); + p448_sqr ( &L1, &ext->t ); + p448_mul ( &L0, &L1, &L3 ); + p448_mulw ( &L1, &L0, 39081 ); + p448_neg ( &L3, &L1 ); + p448_add ( &L1, &L3, &L2 ); + p448_neg ( &L3, &L0 ); + p448_add ( &L2, &L3, &L1 ); + p448_sqr ( &L1, &ext->z ); + p448_add ( &L0, &L1, &L2 ); + p448_bias ( &L0, 4 ); + L4 = p448_is_zero( &L0 ); + return L5 & L4; +} + +mask_t +validate_extensible ( + const struct extensible_t* ext +) { + struct p448_t L0, L1, L2, L3; + mask_t L4, L5; + /* + * Check invariant: + * 0 = d*t^2*u^2 - x^2 - y^2 + z^2 + */ + p448_sqr ( &L2, &ext->y ); + p448_neg ( &L1, &L2 ); + p448_addw ( &L1, 0 ); + p448_sqr ( &L0, &ext->z ); + p448_add ( &L2, &L0, &L1 ); + p448_sqr ( &L3, &ext->u ); + p448_sqr ( &L0, &ext->t ); + p448_mul ( &L1, &L0, &L3 ); + p448_mulw ( &L3, &L1, 39081 ); + p448_neg ( &L0, &L3 ); + p448_add ( &L1, &L0, &L2 ); + p448_sqr ( &L0, &ext->x ); + p448_neg ( &L2, &L0 ); + p448_add ( &L0, &L2, &L1 ); + p448_bias ( &L0, 4 ); + L5 = p448_is_zero( &L0 ); + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L1, &ext->t, &ext->u ); + p448_mul ( &L2, &ext->z, &L1 ); + p448_addw ( &L2, 0 ); + p448_mul ( &L0, &ext->x, &ext->y ); + p448_neg ( &L1, &L0 ); + p448_add ( &L0, 
&L1, &L2 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + return L5 & L4; +} + + diff --git a/src/arch_x86_64/p448.c b/src/arch_x86_64/p448.c new file mode 100644 index 0000000..7a37195 --- /dev/null +++ b/src/arch_x86_64/p448.c @@ -0,0 +1,467 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#include "p448.h" +#include "x86-64-arith.h" + +void +p448_mul ( + p448_t *__restrict__ cs, + const p448_t *as, + const p448_t *bs +) { + const uint64_t *a = as->limb, *b = bs->limb; + uint64_t *c = cs->limb; + + __uint128_t accum0 = 0, accum1 = 0, accum2; + uint64_t mask = (1ull<<56) - 1; + + uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))); + + /* For some reason clang doesn't vectorize this without prompting? */ + unsigned int i; + for (i=0; i>= 56; + accum1 >>= 56; + + mac(&accum0, &aa[1],&bb[3]); + mac(&accum1, &a[5], &b[7]); + mac(&accum0, &aa[2], &bb[2]); + mac(&accum1, &a[6], &b[6]); + mac(&accum0, &aa[3], &bb[1]); + accum1 += accum0; + + accum2 = widemul(&a[0],&b[0]); + accum1 -= accum2; + accum0 += accum2; + + msb(&accum0, &a[1], &b[3]); + msb(&accum0, &a[2], &b[2]); + mac(&accum1, &a[7], &b[5]); + msb(&accum0, &a[3], &b[1]); + mac(&accum1, &aa[0], &bb[0]); + mac(&accum0, &a[4], &b[4]); + + c[0] = ((uint64_t)(accum0)) & mask; + c[4] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + accum2 = widemul(&aa[2],&bb[3]); + msb(&accum0, &a[2], &b[3]); + mac(&accum1, &a[6], &b[7]); + + mac(&accum2, &aa[3], &bb[2]); + msb(&accum0, &a[3], &b[2]); + mac(&accum1, &a[7], &b[6]); + + accum1 += accum2; + accum0 += accum2; + + accum2 = widemul(&a[0],&b[1]); + mac(&accum1, &aa[0], &bb[1]); + mac(&accum0, &a[4], &b[5]); + + mac(&accum2, &a[1], &b[0]); + mac(&accum1, &aa[1], &bb[0]); + mac(&accum0, &a[5], &b[4]); + + accum1 -= accum2; + accum0 += accum2; + + c[1] = ((uint64_t)(accum0)) & mask; + c[5] = ((uint64_t)(accum1)) & mask; + + 
accum0 >>= 56; + accum1 >>= 56; + + accum2 = widemul(&aa[3],&bb[3]); + msb(&accum0, &a[3], &b[3]); + mac(&accum1, &a[7], &b[7]); + + accum1 += accum2; + accum0 += accum2; + + accum2 = widemul(&a[0],&b[2]); + mac(&accum1, &aa[0], &bb[2]); + mac(&accum0, &a[4], &b[6]); + + mac(&accum2, &a[1], &b[1]); + mac(&accum1, &aa[1], &bb[1]); + mac(&accum0, &a[5], &b[5]); + + mac(&accum2, &a[2], &b[0]); + mac(&accum1, &aa[2], &bb[0]); + mac(&accum0, &a[6], &b[4]); + + accum1 -= accum2; + accum0 += accum2; + + c[2] = ((uint64_t)(accum0)) & mask; + c[6] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + accum0 += c[3]; + accum1 += c[7]; + c[3] = ((uint64_t)(accum0)) & mask; + c[7] = ((uint64_t)(accum1)) & mask; + + /* we could almost stop here, but it wouldn't be stable, so... */ + + accum0 >>= 56; + accum1 >>= 56; + c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); + c[0] += ((uint64_t)(accum1)); +} + +void +p448_mulw ( + p448_t *__restrict__ cs, + const p448_t *as, + uint64_t b +) { + const uint64_t *a = as->limb; + uint64_t *c = cs->limb; + + __uint128_t accum0, accum4; + uint64_t mask = (1ull<<56) - 1; + + accum0 = widemul_rm(b, &a[0]); + accum4 = widemul_rm(b, &a[4]); + + c[0] = accum0 & mask; accum0 >>= 56; + c[4] = accum4 & mask; accum4 >>= 56; + + mac_rm(&accum0, b, &a[1]); + mac_rm(&accum4, b, &a[5]); + + c[1] = accum0 & mask; accum0 >>= 56; + c[5] = accum4 & mask; accum4 >>= 56; + + mac_rm(&accum0, b, &a[2]); + mac_rm(&accum4, b, &a[6]); + + c[2] = accum0 & mask; accum0 >>= 56; + c[6] = accum4 & mask; accum4 >>= 56; + + mac_rm(&accum0, b, &a[3]); + mac_rm(&accum4, b, &a[7]); + + c[3] = accum0 & mask; accum0 >>= 56; + c[7] = accum4 & mask; accum4 >>= 56; + + c[4] += accum0 + accum4; + c[0] += accum4; + + /* + * TODO: double-check that this is not necessary. 
+ accum0 += accum4 + c[4]; + c[4] = accum0 & mask; + c[5] += accum0 >> 56; + + accum4 += c[0]; + c[0] = accum4 & mask; + c[1] += accum4 >> 56; + */ +} + +void +p448_sqr ( + p448_t *__restrict__ cs, + const p448_t *as +) { + const uint64_t *a = as->limb; + uint64_t *c = cs->limb; + + __uint128_t accum0 = 0, accum1 = 0, accum2; + uint64_t mask = (1ull<<56) - 1; + + uint64_t aa[4] __attribute__((aligned(32))); + + /* For some reason clang doesn't vectorize this without prompting? */ + unsigned int i; + for (i=0; i>= 55; + accum1 >>= 55; + + mac2(&accum0, &aa[1],&aa[3]); + mac2(&accum1, &a[5], &a[7]); + mac(&accum0, &aa[2], &aa[2]); + accum1 += accum0; + + msb2(&accum0, &a[1], &a[3]); + mac(&accum1, &a[6], &a[6]); + + accum2 = widemul(&a[0],&a[0]); + accum1 -= accum2; + accum0 += accum2; + + msb(&accum0, &a[2], &a[2]); + mac(&accum1, &aa[0], &aa[0]); + mac(&accum0, &a[4], &a[4]); + + c[0] = ((uint64_t)(accum0)) & mask; + c[4] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + accum2 = widemul2(&aa[2],&aa[3]); + msb2(&accum0, &a[2], &a[3]); + mac2(&accum1, &a[6], &a[7]); + + accum1 += accum2; + accum0 += accum2; + + accum2 = widemul2(&a[0],&a[1]); + mac2(&accum1, &aa[0], &aa[1]); + mac2(&accum0, &a[4], &a[5]); + + accum1 -= accum2; + accum0 += accum2; + + c[1] = ((uint64_t)(accum0)) & mask; + c[5] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + accum2 = widemul(&aa[3],&aa[3]); + msb(&accum0, &a[3], &a[3]); + mac(&accum1, &a[7], &a[7]); + + accum1 += accum2; + accum0 += accum2; + + accum2 = widemul2(&a[0],&a[2]); + mac2(&accum1, &aa[0], &aa[2]); + mac2(&accum0, &a[4], &a[6]); + + mac(&accum2, &a[1], &a[1]); + mac(&accum1, &aa[1], &aa[1]); + mac(&accum0, &a[5], &a[5]); + + accum1 -= accum2; + accum0 += accum2; + + c[2] = ((uint64_t)(accum0)) & mask; + c[6] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + accum0 += c[3]; + accum1 += c[7]; + c[3] = ((uint64_t)(accum0)) & mask; + c[7] = ((uint64_t)(accum1)) & 
mask; + + /* we could almost stop here, but it wouldn't be stable, so... */ + + accum0 >>= 56; + accum1 >>= 56; + c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); + c[0] += ((uint64_t)(accum1)); +} + +void +p448_strong_reduce ( + p448_t *a +) { + uint64_t mask = (1ull<<56)-1; + + /* first, clear high */ + a->limb[4] += a->limb[7]>>56; + a->limb[0] += a->limb[7]>>56; + a->limb[7] &= mask; + + /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */ + + /* compute total_value - p. No need to reduce mod p. */ + + __int128_t scarry = 0; + int i; + for (i=0; i<8; i++) { + scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask); + a->limb[i] = scarry & mask; + scarry >>= 56; + } + + /* uncommon case: it was >= p, so now scarry = 0 and this = x + * common case: it was < p, so now scarry = -1 and this = x - p + 2^448 + * so let's add back in p. will carry back off the top for 2^448. + */ + + assert(is_zero(scarry) | is_zero(scarry+1)); + + uint64_t scarry_mask = scarry & mask; + __uint128_t carry = 0; + + /* add it back */ + for (i=0; i<8; i++) { + carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask); + a->limb[i] = carry & mask; + carry >>= 56; + } + + assert(is_zero(carry + scarry)); +} + +mask_t +p448_is_zero ( + const struct p448_t *a +) { + struct p448_t b; + p448_copy(&b,a); + p448_strong_reduce(&b); + + uint64_t any = 0; + int i; + for (i=0; i<8; i++) { + any |= b.limb[i]; + } + return is_zero(any); +} + +void +p448_serialize ( + uint8_t *serial, + const struct p448_t *x +) { + int i,j; + p448_t red; + p448_copy(&red, x); + p448_strong_reduce(&red); + for (i=0; i<8; i++) { + for (j=0; j<7; j++) { + serial[7*i+j] = red.limb[i]; + red.limb[i] >>= 8; + } + assert(red.limb[i] == 0); + } +} + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +) { + int i,j; + for (i=0; i<8; i++) { + word_t out = 0; + for (j=0; j<7; j++) { + out |= ((word_t)serial[7*i+j])<<(8*j); + } + x->limb[i] = out; + } + + /* Check for reduction. 
+ * + * The idea is to create a variable ge which is all ones (rather, 56 ones) + * if and only if the low $i$ words of $x$ are >= those of p. + * + * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) + */ + word_t ge = -1, mask = (1ull<<56)-1; + for (i=0; i<4; i++) { + ge &= x->limb[i]; + } + + /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ + ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask); + + /* Propagate the rest */ + for (i=5; i<8; i++) { + ge &= x->limb[i]; + } + + return ~is_zero(ge ^ mask); +} + +void +simultaneous_invert_p448( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +) { + if (n==0) { + return; + } else if (n==1) { + p448_inverse(out,in); + return; + } + + p448_copy(&out[1], &in[0]); + int i; + for (i=1; i<(int) (n-1); i++) { + p448_mul(&out[i+1], &out[i], &in[i]); + } + p448_mul(&out[0], &out[n-1], &in[n-1]); + + struct p448_t tmp; + p448_inverse(&tmp, &out[0]); + p448_copy(&out[0], &tmp); + + /* at this point, out[0] = product(in[i]) ^ -1 + * out[i] = product(in[0]..in[i-1]) if i != 0 + */ + for (i=n-1; i>0; i--) { + p448_mul(&tmp, &out[i], &out[0]); + p448_copy(&out[i], &tmp); + + p448_mul(&tmp, &out[0], &in[i]); + p448_copy(&out[0], &tmp); + } +} diff --git a/src/arch_x86_64/p448.h b/src/arch_x86_64/p448.h new file mode 100644 index 0000000..b0b4dc0 --- /dev/null +++ b/src/arch_x86_64/p448.h @@ -0,0 +1,376 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ +#ifndef __P448_H__ +#define __P448_H__ 1 + +#include +#include + +#include "word.h" + +typedef struct p448_t { + uint64_t limb[8]; +} __attribute__((aligned(32))) p448_t; + +#ifdef __cplusplus +extern "C" { +#endif + +static __inline__ void +p448_set_ui ( + p448_t *out, + uint64_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t do_swap +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_add ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_neg ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_neg ( + p448_t *a, + mask_t doNegate +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_addw ( + p448_t *a, + uint64_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_subw ( + p448_t *a, + uint64_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_copy ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_weak_reduce ( + p448_t *inout +) __attribute__((unused,always_inline)); + +void +p448_strong_reduce ( + p448_t *inout +); + +mask_t +p448_is_zero ( + const p448_t *in +); + +static __inline__ void +p448_bias ( + p448_t *inout, + int amount +) __attribute__((unused,always_inline)); + +void +p448_mul ( + p448_t *__restrict__ out, + const p448_t *a, + const p448_t *b +); + +void +p448_mulw ( + p448_t *__restrict__ out, + const p448_t *a, + uint64_t b +); + +void +p448_sqr ( + p448_t *__restrict__ out, + const p448_t *a +); + +static __inline__ void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) __attribute__((unused,always_inline)); + +void +p448_serialize ( 
+ uint8_t *serial, + const struct p448_t *x +); + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +); + +static __inline__ void +p448_mask( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) __attribute__((unused,always_inline)); + +/** +* Returns 1/x. +* +* If x=0, returns 0. +*/ +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +); + +void +simultaneous_invert_p448 ( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +); + +static inline mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) __attribute__((always_inline,unused)); + +/* -------------- Inline functions begin here -------------- */ + +void +p448_set_ui ( + p448_t *out, + uint64_t x +) { + int i; + out->limb[0] = x; + for (i=1; i<8; i++) { + out->limb[i] = 0; + } +} + +void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t doswap +) { + big_register_t *aa = (big_register_t*)a; + big_register_t *bb = (big_register_t*)b; + big_register_t m = doswap; + + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] + b->limb[i]; + } + */ +} + +void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] - b->limb[i]; + } + */ +} + +void +p448_neg ( + struct p448_t *out, + const p448_t *a +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = -a->limb[i]; + } + */ +} + +void +p448_cond_neg( + struct p448_t *a, + mask_t doNegate +) { + unsigned int i; + struct p448_t negated; + big_register_t *aa = (big_register_t *)a; + big_register_t *nn = (big_register_t*)&negated; + big_register_t m = doNegate; + + p448_neg(&negated, a); + p448_bias(&negated, 2); + + for (i=0; ilimb[0] += x; +} + +void +p448_subw ( + p448_t *a, + uint64_t x +) { + a->limb[0] -= x; +} + +void +p448_copy ( + p448_t *out, + const p448_t *a +) { + *out = *a; +} + +void +p448_bias ( + p448_t *a, + int amt +) { + uint64_t co1 = 
((1ull<<56)-1)*amt, co2 = co1-amt; + uint64x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1}; + uint64x4_t *aa = (uint64x4_t*) a; + aa[0] += lo; + aa[1] += hi; +} + +void +p448_weak_reduce ( + p448_t *a +) { + /* PERF: use pshufb/palignr if anyone cares about speed of this */ + uint64_t mask = (1ull<<56) - 1; + uint64_t tmp = a->limb[7] >> 56; + int i; + a->limb[4] += tmp; + for (i=7; i>0; i--) { + a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>56); + } + a->limb[0] = (a->limb[0] & mask) + tmp; +} + +void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) { + p448_t tmp; + assert(n>0); + if (n&1) { + p448_sqr(y,x); + n--; + } else { + p448_sqr(&tmp,x); + p448_sqr(y,&tmp); + n-=2; + } + for (; n; n-=2) { + p448_sqr(&tmp,y); + p448_sqr(y,&tmp); + } +} + +mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) { + struct p448_t ra, rb; + p448_copy(&ra, a); + p448_copy(&rb, b); + p448_weak_reduce(&ra); + p448_weak_reduce(&rb); + p448_sub(&ra, &ra, &rb); + p448_bias(&ra, 2); + return p448_is_zero(&ra); +} + +void +p448_mask ( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + a->limb[i] = b->limb[i] & mask; + } +} + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __P448_H__ */ diff --git a/src/arch_x86_64/x86-64-arith.h b/src/arch_x86_64/x86-64-arith.h new file mode 100644 index 0000000..32ee832 --- /dev/null +++ b/src/arch_x86_64/x86-64-arith.h @@ -0,0 +1,279 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#ifndef __X86_64_ARITH_H__ +#define __X86_64_ARITH_H__ + +#include + +/* TODO: non x86-64 versions of these. 
+ * FUTURE: autogenerate + */ + +static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) { + #ifndef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rax;" + "mulq %[b];" + : [c]"=a"(c), [d]"=d"(d) + : [b]"m"(*b), [a]"m"(*a) + : "cc"); + return (((__uint128_t)(d))<<64) | c; + #else + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx;" + "mulx %[b], %[c], %[d];" + : [c]"=r"(c), [d]"=r"(d) + : [b]"m"(*b), [a]"m"(*a) + : "rdx"); + return (((__uint128_t)(d))<<64) | c; + #endif +} + +static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) { + #ifndef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rax;" + "mulq %[b];" + : [c]"=a"(c), [d]"=d"(d) + : [b]"m"(*b), [a]"r"(a) + : "cc"); + return (((__uint128_t)(d))<<64) | c; + #else + uint64_t c,d; + __asm__ volatile + ("mulx %[b], %[c], %[d];" + : [c]"=r"(c), [d]"=r"(d) + : [b]"m"(*b), [a]"d"(a)); + return (((__uint128_t)(d))<<64) | c; + #endif +} + +static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) { + #ifndef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rax; " + "addq %%rax, %%rax; " + "mulq %[b];" + : [c]"=a"(c), [d]"=d"(d) + : [b]"m"(*b), [a]"m"(*a) + : "cc"); + return (((__uint128_t)(d))<<64) | c; + #else + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx;" + "leaq (,%%rdx,2), %%rdx;" + "mulx %[b], %[c], %[d];" + : [c]"=r"(c), [d]"=r"(d) + : [b]"m"(*b), [a]"m"(*a) + : "rdx"); + return (((__uint128_t)(d))<<64) | c; + #endif +} + +static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx; " + "mulx %[b], %[c], %[d]; " + "addq %[c], %[lo]; " + "adcq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "mulq %[b]; " + "addq %%rax, %[lo]; " + "adcq %%rdx, %[hi]; " 
+ : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rax", "rdx", "cc"); + #endif + + *acc = (((__uint128_t)(hi))<<64) | lo; +} + +static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + uint64_t lo2 = *acc2, hi2 = *acc2>>64; + + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx; " + "mulx %[b], %[c], %[d]; " + "addq %[c], %[lo]; " + "adcq %[d], %[hi]; " + "addq %[c], %[lo2]; " + "adcq %[d], %[hi2]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "mulq %[b]; " + "addq %%rax, %[lo]; " + "adcq %%rdx, %[hi]; " + "addq %%rax, %[lo2]; " + "adcq %%rdx, %[hi2]; " + : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) + : [b]"m"(*b), [a]"m"(*a) + : "rax", "rdx", "cc"); + #endif + + *acc = (((__uint128_t)(hi))<<64) | lo; + *acc2 = (((__uint128_t)(hi2))<<64) | lo2; +} + +static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("mulx %[b], %[c], %[d]; " + "addq %[c], %[lo]; " + "adcq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"d"(a) + : "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "mulq %[b]; " + "addq %%rax, %[lo]; " + "adcq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"r"(a) + : "rax", "rdx", "cc"); + #endif + + *acc = (((__uint128_t)(hi))<<64) | lo; +} + +static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx; " + "addq %%rdx, %%rdx; " + "mulx %[b], %[c], %[d]; " + "addq %[c], %[lo]; " + "adcq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : 
[b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "addq %%rax, %%rax; " + "mulq %[b]; " + "addq %%rax, %[lo]; " + "adcq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rax", "rdx", "cc"); + #endif + + *acc = (((__uint128_t)(hi))<<64) | lo; +} + +static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx; " + "mulx %[b], %[c], %[d]; " + "subq %[c], %[lo]; " + "sbbq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "mulq %[b]; " + "subq %%rax, %[lo]; " + "sbbq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rax", "rdx", "cc"); + #endif + *acc = (((__uint128_t)(hi))<<64) | lo; +} + +static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx; " + "addq %%rdx, %%rdx; " + "mulx %[b], %[c], %[d]; " + "subq %[c], %[lo]; " + "sbbq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "addq %%rax, %%rax; " + "mulq %[b]; " + "subq %%rax, %[lo]; " + "sbbq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rax", "rdx", "cc"); + #endif + *acc = (((__uint128_t)(hi))<<64) | lo; + +} + +static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t c,d, lo = *acc, hi = *acc>>64; + __asm__ volatile + ("movq %[a], %%rdx; " + "mulx %[b], %[c], %[d]; " + "subq %[lo], %[c]; " + "sbbq %[hi], %[d]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + *acc = 
(((__uint128_t)(d))<<64) | c; +} + +static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) { + return ((__uint128_t)(a)) * b; +} + +static __inline__ __int128_t widemuls(int64_t a, int64_t b) { + return ((__int128_t)(a)) * b; +} + +static __inline__ uint64_t opacify(uint64_t x) { + __asm__ volatile("" : "+r"(x)); + return x; +} + +static __inline__ mask_t is_zero(uint64_t x) { + __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x)); + return ~x; +} + +#endif /* __X86_64_ARITH_H__ */ diff --git a/src/barrett_field.c b/src/barrett_field.c new file mode 100644 index 0000000..55afe7d --- /dev/null +++ b/src/barrett_field.c @@ -0,0 +1,349 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#include "barrett_field.h" +#include +#include + +word_t +add_nr_ext_packed( + word_t *out, + const word_t *a, + uint32_t nwords_a, + const word_t *c, + uint32_t nwords_c, + word_t mask +) { + uint32_t i; + dword_t carry = 0; + for (i=0; i>= WORD_BITS; + } + for (; i>= WORD_BITS; + } + return carry; +} + +static __inline__ word_t +add_nr_packed( + word_t *a, + const word_t *c, + uint32_t nwords +) { + uint32_t i; + dword_t carry = 0; + for (i=0; i>= WORD_BITS; + } + return carry; +} + +word_t +sub_nr_ext_packed( + word_t *out, + const word_t *a, + uint32_t nwords_a, + const word_t *c, + uint32_t nwords_c, + word_t mask +) { + uint32_t i; + dsword_t carry = 0; + for (i=0; i>= WORD_BITS; + } + for (; i>= WORD_BITS; + } + return carry; +} + +static word_t +widemac( + word_t *accum, + uint32_t nwords_accum, + const word_t *mier, + uint32_t nwords_mier, + word_t mand, + word_t carry +) { + uint32_t i; + assert(nwords_mier <= nwords_accum); + + for (i=0; i> WORD_BITS; + } + + for (; i> WORD_BITS; + } + + return carry; +} + +void +barrett_negate ( + word_t *a, + uint32_t nwords_a, + const struct barrett_prime_t *prime +) { + uint32_t i; + dsword_t carry = 0; + + barrett_reduce(a,nwords_a,0,prime); + + 
/* Have p = 2^big - p_lo. Want p - a = 2^big - p_lo - a */ + + for (i=0; inwords_lo; i++) { + a[i] = carry = carry - prime->p_lo[i] - a[i]; + carry >>= WORD_BITS; + } + for (; inwords_p; i++) { + a[i] = carry = carry - a[i]; + if (inwords_p-1) { + carry >>= WORD_BITS; + } + } + + a[prime->nwords_p-1] = carry = carry + (((word_t)1) << prime->p_shift); + + for (; i>WORD_BITS)); +} + +void +barrett_reduce( + word_t *a, + uint32_t nwords_a, + word_t a_carry, + const struct barrett_prime_t *prime +) { + uint32_t repeat, nwords_left_in_a=nwords_a; + + /* Is there a point to this a_carry business? */ + assert(a_carry < ((word_t)1) << prime->p_shift); + assert(nwords_a >= prime->nwords_p); + assert(prime->nwords_p > 0); /* scan-build: prevent underflow */ + + for (; nwords_left_in_a >= prime->nwords_p; nwords_left_in_a--) { + for (repeat=0; repeat<2; repeat++) { + /* PERF: surely a more careful implementation could + * avoid this double round + */ + word_t mand = a[nwords_left_in_a-1] >> prime->p_shift; + a[nwords_left_in_a-1] &= (((word_t)1)<p_shift)-1; + if (prime->p_shift && !repeat) { + /* collect high bits when there are any */ + if (nwords_left_in_a < nwords_a) { + mand |= a[nwords_left_in_a] << (WORD_BITS-prime->p_shift); + a[nwords_left_in_a] = 0; + } else { + mand |= a_carry << (WORD_BITS-prime->p_shift); + } + } + + word_t carry = widemac( + a+nwords_left_in_a-prime->nwords_p, + prime->nwords_p, + prime->p_lo, + prime->nwords_lo, + mand, + 0 + ); + assert(!carry); + (void)carry; + } + } + + assert(nwords_left_in_a == prime->nwords_p-1); + + /* OK, but it still isn't reduced. Add and subtract p_lo. 
*/ + word_t cout = add_nr_ext_packed(a,a,prime->nwords_p,prime->p_lo,prime->nwords_lo,-1); + if (prime->p_shift) { + cout = (cout<<(WORD_BITS-prime->p_shift)) + (a[prime->nwords_p-1]>>prime->p_shift); + a[prime->nwords_p-1] &= (((word_t)1)<p_shift)-1; + } + + /* mask = carry-1: if no carry then do sub, otherwise don't */ + sub_nr_ext_packed(a,a,prime->nwords_p,prime->p_lo,prime->nwords_lo,cout-1); +} + +/* PERF: This function is horribly slow. Enough to break 1%. */ +void +barrett_mul_or_mac( + word_t *accum, + uint32_t nwords_accum, + + const word_t *a, + uint32_t nwords_a, + + const word_t *b, + uint32_t nwords_b, + + const struct barrett_prime_t *prime, + + mask_t doMac +) { + assert(nwords_accum >= prime->nwords_p); + + /* nwords_tmp = max(nwords_a + 1, nwords_p + 1, nwords_accum if doMac); */ + uint32_t nwords_tmp = (nwords_a > prime->nwords_p) ? nwords_a : prime->nwords_p; + nwords_tmp++; + assert(nwords_tmp > 0); /* scan-build: prevent underflow. */ + if (nwords_tmp < nwords_accum && doMac) + nwords_tmp = nwords_accum; + + word_t tmp[nwords_tmp]; + int bpos, idown; + uint32_t i; + + for (i=0; i= 0; bpos--) { + /* Invariant at the beginning of the loop: the high word is unused. 
*/ + assert(tmp[nwords_tmp-1] == 0); + + /* shift up */ + for (idown=nwords_tmp-2; idown>=0; idown--) { + tmp[idown+1] = tmp[idown]; + } + tmp[0] = 0; + + /* mac and reduce */ + word_t carry = widemac(tmp, nwords_tmp, a, nwords_a, b[bpos], 0); + + /* the mac can't carry, because nwords_tmp >= nwords_a+1 and its high word is clear */ + assert(!carry); + barrett_reduce(tmp, nwords_tmp, carry, prime); + + /* at this point, the number of words used is nwords_p <= nwords_tmp-1, + * so the high word is again clear */ + } + + if (doMac) { + word_t cout = add_nr_packed(tmp, accum, nwords_accum); + barrett_reduce(tmp, nwords_tmp, cout, prime); + } + + for (i=0; inwords_p * sizeof(word_t); + if (prime->p_shift) { + nserial -= (WORD_BITS - prime->p_shift) / 8; + } + + + /* Track x < p, p = 2^k - p_lo <==> x + p_lo < 2^k */ + dword_t carry = 0; + + for (i=0; i*sizeof(word_t)>= WORD_BITS; + + word_t the = 0; + for (j=0; jnwords_lo) carry += prime->p_lo[i]; + } + + /* check for reduction */ + if (prime->p_shift) { + carry >>= prime->p_shift; + } else { + carry >>= WORD_BITS; + } + + /* at this point, carry > 0 indicates failure */ + dsword_t scarry = carry; + scarry = -scarry; + scarry >>= WORD_BITS; + scarry >>= WORD_BITS; + + return (mask_t) ~scarry; +} + +void +barrett_deserialize_and_reduce ( + word_t *x, + const uint8_t *serial, + uint32_t nserial, + const struct barrett_prime_t *prime +) { + unsigned int size = (nserial + sizeof(word_t) - 1)/sizeof(word_t); + if (size < prime->nwords_p) { + size = prime->nwords_p; + } + word_t tmp[size]; + memset(tmp,0,sizeof(tmp)); + + unsigned int i,j; + for (i=0; i*sizeof(word_t)nwords_p; i++) { + x[i] = tmp[i]; + } + for (; i>(8*j); + } + } +} diff --git a/src/crandom.c b/src/crandom.c new file mode 100644 index 0000000..e4a71d0 --- /dev/null +++ b/src/crandom.c @@ -0,0 +1,442 @@ +/* Copyright (c) 2011 Stanford University. + * Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. 
See LICENSE.txt for license information. + */ + +/* Chacha random number generator code copied from crandom */ + +#include "intrinsics.h" +#include "crandom.h" +#include + +volatile unsigned int crandom_features = 0; + +unsigned int crandom_detect_features() { + unsigned int out = GEN; + +# if (defined(__i386__) || defined(__x86_64__)) + u_int32_t a,b,c,d; + + a=1; __asm__("cpuid" : "+a"(a), "=b"(b), "=c"(c), "=d"(d)); + out |= GEN; + if (d & 1<<26) out |= SSE2; + if (d & 1<< 9) out |= SSSE3; + if (c & 1<<25) out |= AESNI; + if (c & 1<<28) out |= AVX; + if (b & 1<<5) out |= AVX2; + + a=0x80000001; __asm__("cpuid" : "+a"(a), "=b"(b), "=c"(c), "=d"(d)); + if (c & 1<<11) out |= XOP; + if (c & 1<<30) out |= RDRAND; +# endif + + return out; +} + + + +INTRINSIC u_int64_t rdrand(int abort_on_fail) { + uint64_t out = 0; + int tries = 1000; + + if (HAVE(RDRAND)) { + # if defined(__x86_64__) + u_int64_t out, a=0; + for (; tries && !a; tries--) { + __asm__ __volatile__ ( + "rdrand %0\n\tsetc %%al" + : "=r"(out), "+a"(a) :: "cc" + ); + } + # elif (defined(__i386__)) + u_int32_t reg, a=0; + uint64_t out; + for (; tries && !a; tries--) { + __asm__ __volatile__ ( + "rdrand %0\n\tsetc %%al" + : "=r"(reg), "+a"(a) :: "cc" + ); + } + out = reg; a = 0; + for (; tries && !a; tries--) { + __asm__ __volatile__ ( + "rdrand %0\n\tsetc %%al" + : "=r"(reg), "+a"(a) :: "cc" + ); + } + out = out << 32 | reg; + return out; + # else + abort(); // whut + # endif + } else { + tries = 0; + } + + if (abort_on_fail && !tries) { + abort(); + } + + return out; +} + + +/* ------------------------------- Vectorized code ------------------------------- */ +#define shuffle(x,i) _mm_shuffle_epi32(x, \ + i + ((i+1)&3)*4 + ((i+2)&3)*16 + ((i+3)&3)*64) + +#define add _mm_add_epi32 +#define add64 _mm_add_epi64 + +#define NEED_XOP (MIGHT_HAVE(XOP)) +#define NEED_SSSE3 (MIGHT_HAVE(SSSE3) && !MUST_HAVE(XOP)) +#define NEED_SSE2 (MIGHT_HAVE(SSE2) && !MUST_HAVE(SSSE3)) +#define NEED_CONV (!MUST_HAVE(SSE2)) + +#if 
NEED_XOP +static __inline__ void +quarter_round_xop( + ssereg *a, + ssereg *b, + ssereg *c, + ssereg *d +) { + *a = add(*a,*b); *d = xop_rotate(16, *d ^ *a); + *c = add(*c,*d); *b = xop_rotate(12, *b ^ *c); + *a = add(*a,*b); *d = xop_rotate(8, *d ^ *a); + *c = add(*c,*d); *b = xop_rotate(7, *b ^ *c); +} +#endif + +#if NEED_SSSE3 +static const ssereg shuffle8 = { 0x0605040702010003ull, 0x0E0D0C0F0A09080Bull }; +static const ssereg shuffle16 = { 0x0504070601000302ull, 0x0D0C0F0E09080B0Aull }; + +INTRINSIC ssereg ssse3_rotate_8(ssereg a) { + return _mm_shuffle_epi8(a, shuffle8); +} + +INTRINSIC ssereg ssse3_rotate_16(ssereg a) { + return _mm_shuffle_epi8(a, shuffle16); +} + +static __inline__ void +quarter_round_ssse3( + ssereg *a, + ssereg *b, + ssereg *c, + ssereg *d +) { + *a = add(*a,*b); *d = ssse3_rotate_16(*d ^ *a); + *c = add(*c,*d); *b = sse2_rotate(12, *b ^ *c); + *a = add(*a,*b); *d = ssse3_rotate_8( *d ^ *a); + *c = add(*c,*d); *b = sse2_rotate(7, *b ^ *c); +} +#endif /* MIGHT_HAVE(SSSE3) && !MUST_HAVE(XOP) */ + +#if NEED_SSE2 +static __inline__ void +quarter_round_sse2( + ssereg *a, + ssereg *b, + ssereg *c, + ssereg *d +) { + *a = add(*a,*b); *d = sse2_rotate(16, *d ^ *a); + *c = add(*c,*d); *b = sse2_rotate(12, *b ^ *c); + *a = add(*a,*b); *d = sse2_rotate(8, *d ^ *a); + *c = add(*c,*d); *b = sse2_rotate(7, *b ^ *c); +} +#endif + +#define DOUBLE_ROUND(qrf) { \ + qrf(&a1,&b1,&c1,&d1); \ + qrf(&a2,&b2,&c2,&d2); \ + b1 = shuffle(b1,1); \ + c1 = shuffle(c1,2); \ + d1 = shuffle(d1,3); \ + b2 = shuffle(b2,1); \ + c2 = shuffle(c2,2); \ + d2 = shuffle(d2,3); \ + \ + qrf(&a1,&b1,&c1,&d1); \ + qrf(&a2,&b2,&c2,&d2); \ + b1 = shuffle(b1,3); \ + c1 = shuffle(c1,2); \ + d1 = shuffle(d1,1); \ + b2 = shuffle(b2,3); \ + c2 = shuffle(c2,2); \ + d2 = shuffle(d2,1); \ + } + +#define OUTPUT_FUNCTION { \ + output[0] = add(a1,aa); \ + output[1] = add(b1,bb); \ + output[2] = add(c1,cc); \ + output[3] = add(d1,dd); \ + output[4] = add(a2,aa); \ + output[5] = add(b2,bb); \ + 
output[6] = add(c2,add(cc,p)); \ + output[7] = add(d2,dd); \ + \ + output += 8; \ + \ + cc = add64(add64(cc,p), p); \ + a1 = a2 = aa; \ + b1 = b2 = bb; \ + c1 = cc; c2 = add64(cc,p);\ + d1 = d2 = dd; \ + } +/* ------------------------------------------------------------------------------- */ + +INTRINSIC u_int32_t rotate(int r, u_int32_t a) { + return a<>(32-r); +} + +static __inline__ __attribute__((unused)) void +quarter_round(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d) { + *a = *a + *b; *d = rotate(16, *d^*a); + *c = *c + *d; *b = rotate(12, *b^*c); + *a = *a + *b; *d = rotate(8, *d^*a); + *c = *c + *d; *b = rotate(7, *b^*c); +} + +static void +crandom_chacha_expand(u_int64_t iv, + u_int64_t ctr, + int nr, + int output_size, + const unsigned char *key_, + unsigned char *output_) { +# if MIGHT_HAVE_SSE2 + if (HAVE(SSE2)) { + ssereg *key = (ssereg *)key_; + ssereg *output = (ssereg *)output_; + + ssereg a1 = key[0], a2 = a1, aa = a1, + b1 = key[1], b2 = b1, bb = b1, + c1 = {iv, ctr}, c2 = {iv, ctr+1}, cc = c1, + d1 = {0x3320646e61707865ull, 0x6b20657479622d32ull}, + d2 = d1, dd = d1, + p = {0, 1}; + + int i,r; +# if (NEED_XOP) + if (HAVE(XOP)) { + for (i=0; i0; r-=2) + DOUBLE_ROUND(quarter_round_xop); + OUTPUT_FUNCTION; + } + return; + } +# endif +# if (NEED_SSSE3) + if (HAVE(SSSE3)) { + for (i=0; i0; r-=2) + DOUBLE_ROUND(quarter_round_ssse3); + OUTPUT_FUNCTION; + } + return; + } +# endif +# if (NEED_SSE2) + if (HAVE(SSE2)) { + for (i=0; i0; r-=2) + DOUBLE_ROUND(quarter_round_sse2); + OUTPUT_FUNCTION; + } + return; + } +# endif + } +# endif + +# if NEED_CONV + { + const u_int32_t *key = (const u_int32_t *)key_; + u_int32_t + x[16], + input[16] = { + key[0], key[1], key[2], key[3], + key[4], key[5], key[6], key[7], + iv, iv>>32, ctr, ctr>>32, + 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 + }, + *output = (u_int32_t *)output_; + int i, r; + + for (i=0; i0; r-=2) { + quarter_round(&x[0], &x[4], &x[8], &x[12]); + quarter_round(&x[1], &x[5], &x[9], 
&x[13]); + quarter_round(&x[2], &x[6], &x[10], &x[14]); + quarter_round(&x[3], &x[7], &x[11], &x[15]); + + quarter_round(&x[0], &x[5], &x[10], &x[15]); + quarter_round(&x[1], &x[6], &x[11], &x[12]); + quarter_round(&x[2], &x[7], &x[8], &x[13]); + quarter_round(&x[3], &x[4], &x[9], &x[14]); + } + for (r=0; r<16; r++) { + output[r] = x[r] + input[r]; + } + + output += 16; + input[11] ++; + if (!input[11]) input[12]++; + } + } + +#endif /* NEED_CONV */ +} + +/* "return 4", cf xkcd #221 */ +#define CRANDOM_MAGIC 0x72657475726e2034ull + +int +crandom_init_from_file( + struct crandom_state_t *state, + const char *filename, + int reseed_interval, + int reseeds_mandatory +) { + state->fill = 0; + state->reseed_countdown = reseed_interval; + state->reseed_interval = reseed_interval; + state->ctr = 0; + + state->randomfd = open(filename, O_RDONLY); + if (state->randomfd == -1) { + int err = errno; + return err ? err : -1; + } + + ssize_t offset = 0, red; + do { + red = read(state->randomfd, state->seed + offset, 32 - offset); + if (red > 0) offset += red; + } while (red > 0 && offset < 32); + + if (offset < 32) { + int err = errno; + return err ? 
err : -1; + } + + memset(state->buffer, 0, 96); + + state->magic = CRANDOM_MAGIC; + state->reseeds_mandatory = reseeds_mandatory; + + return 0; +} + +void +crandom_init_from_buffer( + struct crandom_state_t *state, + const char initial_seed[32] +) { + memcpy(state->seed, initial_seed, 32); + memset(state->buffer, 0, 96); + state->reseed_countdown = state->reseed_interval = state->fill = state->ctr = state->reseeds_mandatory = 0; + state->randomfd = -1; + state->magic = CRANDOM_MAGIC; +} + +int +crandom_generate( + struct crandom_state_t *state, + unsigned char *output, + unsigned long long length +) { + /* the generator isn't seeded; maybe they ignored the return value of init_from_file */ + if (unlikely(state->magic != CRANDOM_MAGIC)) { + abort(); + } + + int ret = 0; + + while (length) { + if (unlikely(state->fill <= 0)) { + uint64_t iv = 0; + if (state->reseed_interval) { + /* it's nondeterministic, stir in some rdrand() or rdtsc() */ + if (HAVE(RDRAND)) { + iv = rdrand(0); + if (!iv) iv = rdtsc(); + } else { + iv = rdtsc(); + } + + state->reseed_countdown--; + if (unlikely(state->reseed_countdown <= 0)) { + /* reseed by xoring in random state */ + state->reseed_countdown = state->reseed_interval; + ssize_t offset = 0, red; + do { + red = read(state->randomfd, state->buffer + offset, 32 - offset); + if (red > 0) offset += red; + } while (red > 0 && offset < 32); + + if (offset < 32) { + /* The read failed. Signal an error with the return code. + * + * If reseeds are mandatory, crash. + * + * If not, the generator is still probably safe to use, because reseeding + * is basically over-engineering for caution. Also, the user might ignore + * the return code, so we still need to fill the request. + * + * Set reseed_countdown = 1 so we'll try again later. If the user's + * performance sucks as a result of ignoring the error code while calling + * us in a loop, well, that's life. 
+ */ + if (state->reseeds_mandatory) { + abort(); + } + + ret = errno; + if (ret == 0) ret = -1; + state->reseed_countdown = 1; + } + + int i; + for (i=0; i<32; i++) { + /* Stir in the buffer. If somehow the read failed, it'll be zeros. */ + state->seed[i] ^= state->buffer[i]; + } + } + } + crandom_chacha_expand(iv,state->ctr,20,128,state->seed,state->seed); + state->ctr++; + state->fill = sizeof(state->buffer); + } + + unsigned long long copy = (length > state->fill) ? state->fill : length; + state->fill -= copy; + memcpy(output, state->buffer + state->fill, copy); + memset(state->buffer + state->fill, 0, copy); + output += copy; length -= copy; + } + + return ret; +} + +void +crandom_destroy( + struct crandom_state_t *state +) { + if (state->magic == CRANDOM_MAGIC && state->randomfd) { + (void) close(state->randomfd); + /* Ignore the return value from close(), because what would it mean? + * "Your random device, which you were reading over NFS, lost some data"? + */ + } + + memset(state, 0, sizeof(*state)); +} diff --git a/src/exported.sym b/src/exported.sym new file mode 100644 index 0000000..e26f3db --- /dev/null +++ b/src/exported.sym @@ -0,0 +1,6 @@ +_goldilocks_init +_goldilocks_keygen +_goldilocks_shared_secret +_goldilocks_sign +_goldilocks_verify +_goldilocks_private_to_public diff --git a/src/goldilocks.c b/src/goldilocks.c new file mode 100644 index 0000000..f178d7a --- /dev/null +++ b/src/goldilocks.c @@ -0,0 +1,393 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ + +#include "config.h" +#include "word.h" + +#include + +#if GOLDILOCKS_USE_PTHREAD +#include +#endif + +#include "goldilocks.h" +#include "ec_point.h" +#include "scalarmul.h" +#include "barrett_field.h" +#include "crandom.h" +#include "sha512.h" +#include "intrinsics.h" + +#ifndef GOLDILOCKS_RANDOM_INIT_FILE +#define GOLDILOCKS_RANDOM_INIT_FILE "/dev/urandom" +#endif + +#ifndef GOLDILOCKS_RANDOM_RESEED_INTERVAL +#define GOLDILOCKS_RANDOM_RESEED_INTERVAL 10000 +#endif + +/* We'll check it ourselves */ +#ifndef GOLDILOCKS_RANDOM_RESEEDS_MANDATORY +#define GOLDILOCKS_RANDOM_RESEEDS_MANDATORY 0 +#endif + +/* FUTURE: auto */ +const struct affine_t goldilocks_base_point = { + {{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7), + U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa), + U58LE(0x5f065c3c59d070), U58LE(0x3a6a26adf73324), + U58LE(0x1b4faff4609845), U58LE(0x297ea0ea2692ff) + }}, + {{ 19 }} +}; + +static const char *G_INITING = "initializing"; +static const char *G_INITED = "initialized"; +static const char *G_FAILED = "failed to initialize"; + +/* FUTURE: auto */ +static const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = { + U64LE(0xdc873d6d54a7bb0d), + U64LE(0xde933d8d723a70aa), + U64LE(0x3bb124b65129c96f), + 0x8335dc16 +}; +const struct barrett_prime_t goldi_q448 = { + 448/WORD_BITS, + 62 % WORD_BITS, + sizeof(goldi_q448_lo)/sizeof(goldi_q448_lo[0]), + goldi_q448_lo +}; + +/* FUTURE: auto */ +struct { + const char * volatile state; +#if GOLDILOCKS_USE_PTHREAD + pthread_mutex_t mutex; +#endif + struct tw_niels_t combs[(WORD_BITS==64) ? 
80 : 64]; + struct fixed_base_table_t fixed_base; + struct tw_niels_t wnafs[32]; + struct crandom_state_t rand; +} goldilocks_global; + +static inline mask_t +goldilocks_check_init() { + if (likely(goldilocks_global.state == G_INITED)) { + return MASK_SUCCESS; + } else { + return MASK_FAILURE; + } +} + +int +goldilocks_init () { + const char *res = compare_and_swap(&goldilocks_global.state, NULL, G_INITING); + if (res == G_INITED) return GOLDI_EALREADYINIT; + else if (res) { + return GOLDI_ECORRUPT; + } + +#if GOLDILOCKS_USE_PTHREAD + int ret = pthread_mutex_init(&goldilocks_global.mutex, NULL); + if (ret) goto fail; +#endif + + struct extensible_t ext; + struct tw_extensible_t text; + + /* Sanity check: the base point is on the curve. */ + assert(validate_affine(&goldilocks_base_point)); + + /* Convert it to twisted Edwards. */ + convert_affine_to_extensible(&ext, &goldilocks_base_point); + twist_even(&text, &ext); + + /* Precompute the tables. */ + mask_t succ; + + int big = (WORD_BITS==64); + uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14; + + succ = precompute_fixed_base(&goldilocks_global.fixed_base, &text, n, t, s, goldilocks_global.combs); + succ &= precompute_fixed_base_wnaf(goldilocks_global.wnafs, &text, 5); + + int criff_res = crandom_init_from_file(&goldilocks_global.rand, + GOLDILOCKS_RANDOM_INIT_FILE, + GOLDILOCKS_RANDOM_RESEED_INTERVAL, + GOLDILOCKS_RANDOM_RESEEDS_MANDATORY); + + if (succ & !criff_res) { + if (!bool_compare_and_swap(&goldilocks_global.state, G_INITING, G_INITED)) { + abort(); + } + return 0; + } + + /* it failed! fall though... 
*/ + +fail: + if (!bool_compare_and_swap(&goldilocks_global.state, G_INITING, G_FAILED)) { + /* ok something is seriously wrong */ + abort(); + } + return -1; +} + +static const struct p448_t +sqrt_d_minus_1 = {{ + U58LE(0xd2e21836749f46), + U58LE(0x888db42b4f0179), + U58LE(0x5a189aabdeea38), + U58LE(0x51e65ca6f14c06), + U58LE(0xa49f7b424d9770), + U58LE(0xdcac4628c5f656), + U58LE(0x49443b8748734a), + U58LE(0x12fec0c0b25b7a) +}}; + +int +goldilocks_keygen ( + struct goldilocks_private_key_t *privkey, + struct goldilocks_public_key_t *pubkey +) { + if (!goldilocks_check_init()) { + return GOLDI_EUNINIT; + } + + word_t sk[448*2/WORD_BITS]; + + struct tw_extensible_t exta; + struct p448_t pk; + +#if GOLDILOCKS_USE_PTHREAD + int ml_ret = pthread_mutex_lock(&goldilocks_global.mutex); + if (ml_ret) return ml_ret; +#endif + + int ret = crandom_generate(&goldilocks_global.rand, (unsigned char *)sk, sizeof(sk)); + int ret2 = crandom_generate(&goldilocks_global.rand, &privkey->opaque[112], 32); + if (!ret) ret = ret2; + +#if GOLDILOCKS_USE_PTHREAD + ml_ret = pthread_mutex_unlock(&goldilocks_global.mutex); + if (ml_ret) abort(); +#endif + + barrett_reduce(sk,sizeof(sk)/sizeof(sk[0]),0,&goldi_q448); + barrett_serialize(privkey->opaque, sk, 448/8); + + scalarmul_fixed_base(&exta, sk, 448, &goldilocks_global.fixed_base); + //transfer_and_serialize_qtor(&pk, &sqrt_d_minus_1, &exta); + untwist_and_double_and_serialize(&pk, &exta); + + p448_serialize(pubkey->opaque, &pk); + memcpy(&privkey->opaque[56], pubkey->opaque, 56); + + return ret ? 
GOLDI_ENODICE : GOLDI_EOK; +} + +int +goldilocks_private_to_public ( + struct goldilocks_public_key_t *pubkey, + const struct goldilocks_private_key_t *privkey +) { + struct p448_t pk; + mask_t msucc = p448_deserialize(&pk,&privkey->opaque[56]); + + if (msucc) { + p448_serialize(pubkey->opaque, &pk); + return GOLDI_EOK; + } else { + return GOLDI_ECORRUPT; + } +} + +int +goldilocks_shared_secret ( + uint8_t shared[64], + const struct goldilocks_private_key_t *my_privkey, + const struct goldilocks_public_key_t *your_pubkey +) { + /* This function doesn't actually need anything in goldilocks_global, + * so it doesn't check init. + */ + + word_t sk[448/WORD_BITS]; + struct p448_t pk; + + mask_t succ = p448_deserialize(&pk,your_pubkey->opaque), msucc = -1; + +#ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS + struct p448_t sum, prod; + msucc &= p448_deserialize(&sum,&my_privkey->opaque[56]); + p448_mul(&prod,&pk,&sum); + p448_add(&sum,&pk,&sum); +#endif + + msucc &= barrett_deserialize(sk,my_privkey->opaque,&goldi_q448); + succ &= montgomery_ladder(&pk,&pk,sk,446,2); + + p448_serialize(shared,&pk); + + /* obliterate records of our failure by adjusting with obliteration key */ + struct sha512_ctx_t ctx; + sha512_init(&ctx); + +#ifdef EXPERIMENT_ECDH_OBLITERATE_CT + uint8_t oblit[40]; + unsigned i; + for (i=0; i<8; i++) { + oblit[i] = "noshared"[i] & ~(succ&msucc); + } + for (i=0; i<32; i++) { + oblit[8+i] = my_privkey->opaque[112+i] & ~(succ&msucc); + } + sha512_update(&ctx, oblit, 40); +#endif + +#ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS + /* stir in the sum and product of the pubkeys. 
*/ + uint8_t a_pk[56]; + p448_serialize(a_pk, &sum); + sha512_update(&ctx, a_pk, 56); + p448_serialize(a_pk, &prod); + sha512_update(&ctx, a_pk, 56); +#endif + + /* stir in the shared key and finish */ + sha512_update(&ctx, shared, 56); + sha512_final(&ctx, shared); + + return (GOLDI_ECORRUPT & ~msucc) + | (GOLDI_EINVAL & msucc &~ succ) + | (GOLDI_EOK & msucc & succ); +} + +int +goldilocks_sign ( + uint8_t signature_out[56*2], + const uint8_t *message, + uint64_t message_len, + const struct goldilocks_private_key_t *privkey +) { + if (!goldilocks_check_init()) { + return GOLDI_EUNINIT; + } + + /* challenge = H(pk, [nonceG], message). */ + word_t skw[448/WORD_BITS]; + mask_t succ = barrett_deserialize(skw,privkey->opaque,&goldi_q448); + if (!succ) { + memset(skw,0,sizeof(skw)); + return GOLDI_ECORRUPT; + } + + /* Derive a nonce. TODO: use HMAC. FUTURE: factor. */ + unsigned char sha_out[512/8]; + word_t tk[448/WORD_BITS]; + struct sha512_ctx_t ctx; + sha512_init(&ctx); + sha512_update(&ctx, (const unsigned char *)"signonce", 8); + sha512_update(&ctx, &privkey->opaque[112], 32); + sha512_update(&ctx, message, message_len); + sha512_update(&ctx, &privkey->opaque[112], 32); + sha512_final(&ctx, sha_out); + barrett_deserialize_and_reduce(tk, sha_out, 512/8, &goldi_q448); + + /* 4[nonce]G */ + uint8_t signature_tmp[56]; + struct tw_extensible_t exta; + struct p448_t gsk; + scalarmul_fixed_base(&exta, tk, 448, &goldilocks_global.fixed_base); + double_tw_extensible(&exta); + untwist_and_double_and_serialize(&gsk, &exta); + p448_serialize(signature_tmp, &gsk); + + word_t challenge[448/WORD_BITS]; + sha512_update(&ctx, &privkey->opaque[56], 56); + sha512_update(&ctx, signature_tmp, 56); + sha512_update(&ctx, message, message_len); + sha512_final(&ctx, sha_out); + barrett_deserialize_and_reduce(challenge, sha_out, 512/8, &goldi_q448); + + // reduce challenge and sub. 
+ barrett_negate(challenge,448/WORD_BITS,&goldi_q448); + + barrett_mac( + tk,448/WORD_BITS, + challenge,448/WORD_BITS, + skw,448/WORD_BITS, + &goldi_q448 + ); + + word_t carry = add_nr_ext_packed(tk,tk,448/WORD_BITS,tk,448/WORD_BITS,-1); + barrett_reduce(tk,448/WORD_BITS,carry,&goldi_q448); + + memcpy(signature_out, signature_tmp, 56); + barrett_serialize(signature_out+56, tk, 448/8); + memset((unsigned char *)tk,0,sizeof(tk)); + memset((unsigned char *)skw,0,sizeof(skw)); + memset((unsigned char *)challenge,0,sizeof(challenge)); + + /* response = 2(nonce_secret - sk*challenge) + * Nonce = 8[nonce_secret]*G + * PK = 2[sk]*G, except doubled (TODO) + * so [2] ( [response]G + 2[challenge]PK ) = Nonce + */ + + return 0; +} + +int +goldilocks_verify ( + const uint8_t signature[56*2], + const uint8_t *message, + uint64_t message_len, + const struct goldilocks_public_key_t *pubkey +) { + if (!goldilocks_check_init()) { + return GOLDI_EUNINIT; + } + + struct p448_t pk; + word_t s[448/WORD_BITS]; + + mask_t succ = p448_deserialize(&pk,pubkey->opaque); + if (!succ) return GOLDI_EINVAL; + + succ = barrett_deserialize(s, &signature[56], &goldi_q448); + if (!succ) return GOLDI_EINVAL; + + /* challenge = H(pk, [nonceG], message). 
*/ + unsigned char sha_out[512/8]; + word_t challenge[448/WORD_BITS]; + struct sha512_ctx_t ctx; + sha512_init(&ctx); + sha512_update(&ctx, pubkey->opaque, 56); + sha512_update(&ctx, signature, 56); + sha512_update(&ctx, message, message_len); + sha512_final(&ctx, sha_out); + barrett_deserialize_and_reduce(challenge, sha_out, 512/8, &goldi_q448); + + struct p448_t eph; + struct tw_extensible_t pk_text; + + /* deserialize [nonce]G */ + succ = p448_deserialize(&eph, signature); + if (!succ) return GOLDI_EINVAL; + + succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk); + if (!succ) return GOLDI_EINVAL; + + linear_combo_var_fixed_vt( &pk_text, challenge, 446, s, 446, goldilocks_global.wnafs, 5 ); + + untwist_and_double_and_serialize( &pk, &pk_text ); + p448_sub(&eph, &eph, &pk); + p448_bias(&eph, 2); + + succ = p448_is_zero(&eph); + + return succ ? 0 : GOLDI_EINVAL; +} diff --git a/src/include/barrett_field.h b/src/include/barrett_field.h new file mode 100644 index 0000000..9d8f930 --- /dev/null +++ b/src/include/barrett_field.h @@ -0,0 +1,190 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ +#ifndef __BARRETT_FIELD_H__ +#define __BARRETT_FIELD_H__ 1 + +/** + * @file barrett_field.h + * @brief Slow routines for generic primes in Barrett form. + * + * @warning These routines are very slow, roughly implemented, and should be made more + * flexible in the future. I might even outright switch to Montgomery form. + */ + +#include "word.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief A Barrett-form prime, 2^k - c. + * @todo Support primes of other forms. + */ +struct barrett_prime_t { + uint32_t nwords_p; /**< The number of bits in p, i.e. ceiling((k-1) / WORD_BITS) */ + uint32_t p_shift; /**< c mod WORD_BITS. */ + uint32_t nwords_lo; /**< The number of nonzero low words. */ + const word_t *p_lo; /**< The low words. */ +}; + +/** + * The Goldilocks prime. 
I'm not sure this is the right place for it, but oh well. + */ +extern const struct barrett_prime_t goldi_q448; + +/** + * Reduce a number (with optional high carry word) mod p. + * + * @param [inout] a The value to be reduced. + * @param [in] nwords_a The number of words in a. + * @param [in] a_carry A high word to be carried into the computation. + * @param [in] prime The Barrett prime. + */ +void +barrett_reduce( + word_t *a, + uint32_t nwords_a, + word_t a_carry, + const struct barrett_prime_t *prime +); + +/** + * out = a+(c&mask), returning a carry. + * + * @param [out] out The output, of length nwords_a. + * @param [in] a The "always" addend. + * @param [in] nwords_a The number of words in a. + * @param [in] c The "sometimes" addend. + * @param [in] nwords_c The number of words in c. + * @param [in] mask A mask of whether to add or not. + * @return A carry word. + */ +word_t +add_nr_ext_packed( + word_t *out, + const word_t *a, + uint32_t nwords_a, + const word_t *c, + uint32_t nwords_c, + word_t mask +); + +/** + * out = a-(c&mask), returning a borrow. + * + * @param [out] out The output, of length nwords_a. + * @param [in] a The "always" minuend. + * @param [in] nwords_a The number of words in a. + * @param [in] c The "sometimes" subtrahend. + * @param [in] nwords_c The number of words in c. + * @param [in] mask A mask of whether to add or not. + * @return A borrow word. + */ +word_t +sub_nr_ext_packed( + word_t *out, + const word_t *a, + uint32_t nwords_a, + const word_t *c, + uint32_t nwords_c, + word_t mask +); + +/** + * a -> reduce(-a) mod p + * + * @param [in] a The value to be reduced and negated. + * @param [in] nwords_a The number of words in a. Must be >= nwords_p. + * @param [in] prime The prime. + */ +void +barrett_negate ( + word_t *a, + uint32_t nwords_a, + const struct barrett_prime_t *prime +); + +/* + * If doMac, accum = accum + a*b mod p. + * Otherwise, accum = a*b mod p. 
+ * + * This function is not __restrict__; you may pass accum, + * a, b, etc all from the same location. + */ +void +barrett_mul_or_mac( + word_t *accum, + uint32_t nwords_accum, + + const word_t *a, + uint32_t nwords_a, + + const word_t *b, + uint32_t nwords_b, + + const struct barrett_prime_t *prime, + + mask_t doMac +); + +static inline void +barrett_mul( + word_t *out, + int nwords_out, + + const word_t *a, + uint32_t nwords_a, + + const word_t *b, + uint32_t nwords_b, + + const struct barrett_prime_t *prime +) { + barrett_mul_or_mac(out,nwords_out,a,nwords_a,b,nwords_b,prime,0); +} + +static inline void +barrett_mac( + word_t *out, + uint32_t nwords_out, + + const word_t *a, + uint32_t nwords_a, + + const word_t *b, + uint32_t nwords_b, + + const struct barrett_prime_t *prime +) { + barrett_mul_or_mac(out,nwords_out,a,nwords_a,b,nwords_b,prime,-1); +} + +mask_t +barrett_deserialize ( + word_t *x, + const uint8_t *serial, + const struct barrett_prime_t *prime +); + +void +barrett_serialize ( + uint8_t *serial, + const word_t *x, + uint32_t nserial +); + +void +barrett_deserialize_and_reduce ( + word_t *x, + const uint8_t *serial, + uint32_t nserial, + const struct barrett_prime_t *prime +); + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __BARRETT_FIELD_H__ */ diff --git a/src/include/config.h b/src/include/config.h new file mode 100644 index 0000000..dbd785d --- /dev/null +++ b/src/include/config.h @@ -0,0 +1,8 @@ +#ifndef __GOLDILOCKS_CONFIG_H__ +#define __GOLDILOCKS_CONFIG_H__ 1 + +#define GOLDILOCKS_USE_PTHREAD 1 +#define EXPERIMENT_ECDH_OBLITERATE_CT 1 +#define EXPERIMENT_ECDH_STIR_IN_PUBKEYS 1 + +#endif // __GOLDILOCKS_CONFIG_H__ diff --git a/src/include/crandom.h b/src/include/crandom.h new file mode 100644 index 0000000..f603f13 --- /dev/null +++ b/src/include/crandom.h @@ -0,0 +1,140 @@ +/* Copyright (c) 2011 Stanford University. + * Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. 
See LICENSE.txt for license information. + */ + +/** + * @file crandom.h + * @author Mike Hamburg + * @brief A miniature version of the (as of yet incomplete) crandom project. + */ + +#ifndef __GOLDI_CRANDOM_H__ +#define __GOLDI_CRANDOM_H__ 1 + +#include /* for uint64_t */ +#include /* for open */ +#include /* for returning errors after open */ +#include /* for abort */ +#include /* for memcpy */ +#include /* for bzero */ +#include /* for read */ + +/** + * @brief The state of a crandom generator. + * + * This object is opaque. It is not protected by a lock, and so must + * not be accessed by multiple threads at the same time. + */ +struct crandom_state_t { + /** @privatesection */ + unsigned char seed[32]; + unsigned char buffer[96]; + uint64_t ctr; + uint64_t magic; + unsigned int fill; + int reseed_countdown; + int reseed_interval; + int reseeds_mandatory; + int randomfd; +} __attribute__((aligned(16))) ; + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize a crandom state from the chosen file. + * + * This function initializes a state from a given state file, or + * from a random device (eg. /dev/random or /dev/urandom). + * + * You must check the return value of this function. + * + * @param [out] state The crandom state variable to initalize. + * @param [in] filename The name of the seed file or random device. + * @param [in] reseed_interval The number of 96-byte blocks which can be + * generated without reseeding. Suggest 10000. + * @param [in] reseeds_mandatory If nonzero, call abort() if a reseed fails. + * Suggest 1. + * + * @retval 0 Success. + * @retval Nonzero An error to be interpreted by strerror(). + */ +int +crandom_init_from_file ( + struct crandom_state_t *state, + const char *filename, + int reseed_interval, + int reseeds_mandatory +) __attribute__((warn_unused_result)); + + +/** + * Initialize a crandom state from a buffer, for deterministic operation. 
+ * + * This function is used to initialize a crandom state deterministically, + * mainly for testing purposes. It can also be used to expand a secret + * random value deterministically. + * + * @warning The crandom implementation is not guaranteed to be stable. + * That is, a later release might produce a different random stream from + * the same seed. + * + * @param [out] state The crandom state variable to initalize. + * @param [in] initial_seed The seed value. + */ +void +crandom_init_from_buffer ( + struct crandom_state_t *state, + const char initial_seed[32] +); + +/** + * Fill the output buffer with random data. + * + * This function uses the given crandom state to produce pseudorandom data + * in the output buffer. + * + * This function may perform reads from the state's random device if it needs + * to reseed. This could block if that file is a blocking source, such as + * a pipe or /dev/random on Linux. If reseeding fails and the state has + * reseeds_mandatory set, this function will call abort(). Otherwise, it will + * return an error code, but it will still randomize the buffer. + * + * If called on a corrupted, uninitialized or destroyed state, this function + * will abort(). + * + * @warning This function is not thread-safe with respect to the state. Don't + * call it from multiple threads with the same state at the same time. + * + * @param [inout] state The crandom state to use for generation. + * @param [out] output The buffer to fill with random data. + * @param [in] length The length of the buffer. + * + * @retval 0 Success. + * @retval Nonezero A non-mandatory reseed operation failed. + */ +int +crandom_generate ( + struct crandom_state_t *state, + unsigned char *output, + unsigned long long length +); + +/** + * Destroy the random state. Further calls to crandom_generate() on that state + * will abort(). + * + * @param [inout] state The state to be destroyed. 
+ */ +void +crandom_destroy ( + struct crandom_state_t *state +); + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __GOLDI_CRANDOM_H__ */ diff --git a/src/include/ec_point.h b/src/include/ec_point.h new file mode 100644 index 0000000..456cd3d --- /dev/null +++ b/src/include/ec_point.h @@ -0,0 +1,552 @@ +/** + * @file ec_point.h + * @copyright + * Copyright (c) 2014 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. + * @author Mike Hamburg + * @warning This file was automatically generated. + */ + +#ifndef __CC_INCLUDED_EC_POINT_H__ +#define __CC_INCLUDED_EC_POINT_H__ + +#include "p448.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Affine point on an Edwards curve. + */ +struct affine_t { + struct p448_t x, y; +}; + +/** + * Affine point on a twisted Edwards curve. + */ +struct tw_affine_t { + struct p448_t x, y; +}; + +/** + * Montgomery buffer. + */ +struct montgomery_t { + struct p448_t z0, xd, zd, xa, za; +}; + +/** + * Extensible coordinates for Edwards curves, suitable for + * accumulators. + * + * Represents the point (x/z, y/z). The extra coordinates + * t,u satisfy xy = tuz, allowing for conversion to Extended + * form by multiplying t and u. + * + * The idea is that you don't have to do this multiplication + * when doubling the accumulator, because the t-coordinate + * isn't used there. At the same time, as long as you only + * have one point in extensible form, additions don't cost + * extra. + * + * This is essentially a lazier version of Hisil et al's + * lookahead trick. It might be worth considering that trick + * instead. + */ +struct extensible_t { + struct p448_t x, y, z, t, u; +}; + +/** + * Extensible coordinates for twisted Edwards curves, + * suitable for accumulators. + */ +struct tw_extensible_t { + struct p448_t x, y, z, t, u; +}; + +/** + * Niels coordinates for twisted Edwards curves. + * + * Good for mixed readdition; suitable for fixed tables. 
+ */ +struct tw_niels_t { + struct p448_t a, b, c; +}; + +/** + * Projective niels coordinates for twisted Edwards curves. + * + * Good for readdition; suitable for temporary tables. + */ +struct tw_pniels_t { + struct tw_niels_t n; + struct p448_t z; +}; + + +/** + * Auto-generated copy method. + */ +static __inline__ void +copy_affine ( + struct affine_t* a, + const struct affine_t* ds +) __attribute__((unused,always_inline)); + +/** + * Auto-generated copy method. + */ +static __inline__ void +copy_tw_affine ( + struct tw_affine_t* a, + const struct tw_affine_t* ds +) __attribute__((unused,always_inline)); + +/** + * Auto-generated copy method. + */ +static __inline__ void +copy_montgomery ( + struct montgomery_t* a, + const struct montgomery_t* ds +) __attribute__((unused,always_inline)); + +/** + * Auto-generated copy method. + */ +static __inline__ void +copy_extensible ( + struct extensible_t* a, + const struct extensible_t* ds +) __attribute__((unused,always_inline)); + +/** + * Auto-generated copy method. + */ +static __inline__ void +copy_tw_extensible ( + struct tw_extensible_t* a, + const struct tw_extensible_t* ds +) __attribute__((unused,always_inline)); + +/** + * Auto-generated copy method. + */ +static __inline__ void +copy_tw_niels ( + struct tw_niels_t* a, + const struct tw_niels_t* ds +) __attribute__((unused,always_inline)); + +/** + * Auto-generated copy method. + */ +static __inline__ void +copy_tw_pniels ( + struct tw_pniels_t* a, + const struct tw_pniels_t* ds +) __attribute__((unused,always_inline)); + +/** + * Returns 1/sqrt(+- x). + * + * The Legendre symbol of the result is the same as that of the + * input. + * + * If x=0, returns 0. + */ +void +p448_isr ( + struct p448_t* a, + const struct p448_t* x +); + +/** + * Returns 1/x. + * + * If x=0, returns 0. 
+ */ +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +); + +/** + * Add two points on a twisted Edwards curve, one in Extensible form + * and the other in half-Niels form. + */ +void +add_tw_niels_to_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +); + +/** + * Add two points on a twisted Edwards curve, one in Extensible form + * and the other in half-Niels form. + */ +void +sub_tw_niels_from_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +); + +/** + * Add two points on a twisted Edwards curve, one in Extensible form + * and the other in projective Niels form. + */ +void +add_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +); + +/** + * Add two points on a twisted Edwards curve, one in Extensible form + * and the other in projective Niels form. + */ +void +sub_tw_pniels_from_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +); + +/** + * Double a point on a twisted Edwards curve, in "extensible" coordinates. + */ +void +double_tw_extensible ( + struct tw_extensible_t* a +); + +/** + * Double a point on an Edwards curve, in "extensible" coordinates. + */ +void +double_extensible ( + struct extensible_t* a +); + +/** + * Double a point, and transfer it to the twisted curve. + * + * That is, apply the 4-isogeny. + */ +void +twist_and_double ( + struct tw_extensible_t* b, + const struct extensible_t* a +); + +/** + * Double a point, and transfer it to the untwisted curve. + * + * That is, apply the dual isogeny. 
+ */ +void +untwist_and_double ( + struct extensible_t* b, + const struct tw_extensible_t* a +); + +void +convert_tw_affine_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_affine_t* a +); + +void +convert_tw_affine_to_tw_extensible ( + struct tw_extensible_t* b, + const struct tw_affine_t* a +); + +void +convert_affine_to_extensible ( + struct extensible_t* b, + const struct affine_t* a +); + +void +convert_tw_extensible_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_extensible_t* a +); + +void +convert_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* d +); + +void +convert_tw_niels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_niels_t* d +); + +void +montgomery_step ( + struct montgomery_t* a +); + +void +deserialize_montgomery ( + struct montgomery_t* a, + const struct p448_t* sbz +); + +mask_t +serialize_montgomery ( + struct p448_t* b, + const struct montgomery_t* a, + const struct p448_t* sbz +); + +/** + * Serialize a point on an Edwards curve. + * + * The serialized form would be sqrt((z-y)/(z+y)) with sign of xz. + * + * It would be on 4y^2/(1-d) = x^3 + 2(1+d)/(1-d) * x^2 + x. + * + * But 4/(1-d) isn't square, so we need to twist it: + * + * -x is on 4y^2/(d-1) = x^3 + 2(d+1)/(d-1) * x^2 + x + */ +void +serialize_extensible ( + struct p448_t* b, + const struct extensible_t* a +); + +/** + * + */ +void +untwist_and_double_and_serialize ( + struct p448_t* b, + const struct tw_extensible_t* a +); + +/** + * Expensive transfer from untwisted to twisted. Roughly equivalent to halve and isogeny. + * Correctly transfers point of order 2. + * + * Can't have x=+1 (it's not even). There is code to fix the exception that would otherwise + * occur at (0,1). + * + * Input point must be even. + */ +void +twist_even ( + struct tw_extensible_t* b, + const struct extensible_t* a +); + +/** + * Expensive transfer from untwisted to twisted. Roughly equivalent to halve and isogeny. 
+ * + * This function is for testing purposes only, because it can return odd points on the + * twist. This can cause exceptions in the point addition formula. What's more, this + * function should be able to return points of order 4, which are at infinity. + * + * This function probably doesn't properly handle special cases, such as the point at + * infinity (FUTURE). + * + * This function probably isn't a homomorphism, in that it probably doesn't consistently + * handle adjustments by the point of order 2 when the input is odd. (FUTURE) + */ +void +test_only_twist ( + struct tw_extensible_t* b, + const struct extensible_t* a +); + +mask_t +is_square ( + const struct p448_t* x +); + +mask_t +is_even_pt ( + const struct extensible_t* a +); + +mask_t +is_even_tw ( + const struct tw_extensible_t* a +); + +/** + * Deserialize a point to an untwisted affine curve. + */ +mask_t +deserialize_affine ( + struct affine_t* a, + const struct p448_t* sz +); + +/** + * Deserialize a point and transfer it to the twist. + * + * Not guaranteed to preserve the 4-torsion component. + * + * Refuses to deserialize +-1, which are the points of order 2. + */ +mask_t +deserialize_and_twist_approx ( + struct tw_extensible_t* a, + const struct p448_t* sdm1, + const struct p448_t* sz +); + +void +set_identity_extensible ( + struct extensible_t* a +); + +void +set_identity_tw_extensible ( + struct tw_extensible_t* a +); + +void +set_identity_affine ( + struct affine_t* a +); + +mask_t +eq_affine ( + const struct affine_t* a, + const struct affine_t* b +); + +mask_t +eq_extensible ( + const struct extensible_t* a, + const struct extensible_t* b +); + +mask_t +eq_tw_extensible ( + const struct tw_extensible_t* a, + const struct tw_extensible_t* b +); + +void +elligator_2s_inject ( + struct affine_t* a, + const struct p448_t* r +); + +mask_t +validate_affine ( + const struct affine_t* a +); + +/** + * Check the invariants for struct tw_extensible_t. 
+ * NOTE: This function was automatically generated + * with no regard for speed. + */ +mask_t +validate_tw_extensible ( + const struct tw_extensible_t* ext +); + +/** + * Check the invariants for struct extensible_t. + * NOTE: This function was automatically generated + * with no regard for speed. + */ +mask_t +validate_extensible ( + const struct extensible_t* ext +); + + +void +copy_affine ( + struct affine_t* a, + const struct affine_t* ds +) { + p448_copy ( &a->x, &ds->x ); + p448_copy ( &a->y, &ds->y ); +} + +void +copy_tw_affine ( + struct tw_affine_t* a, + const struct tw_affine_t* ds +) { + p448_copy ( &a->x, &ds->x ); + p448_copy ( &a->y, &ds->y ); +} + +void +copy_montgomery ( + struct montgomery_t* a, + const struct montgomery_t* ds +) { + p448_copy ( &a->z0, &ds->z0 ); + p448_copy ( &a->xd, &ds->xd ); + p448_copy ( &a->zd, &ds->zd ); + p448_copy ( &a->xa, &ds->xa ); + p448_copy ( &a->za, &ds->za ); +} + +void +copy_extensible ( + struct extensible_t* a, + const struct extensible_t* ds +) { + p448_copy ( &a->x, &ds->x ); + p448_copy ( &a->y, &ds->y ); + p448_copy ( &a->z, &ds->z ); + p448_copy ( &a->t, &ds->t ); + p448_copy ( &a->u, &ds->u ); +} + +void +copy_tw_extensible ( + struct tw_extensible_t* a, + const struct tw_extensible_t* ds +) { + p448_copy ( &a->x, &ds->x ); + p448_copy ( &a->y, &ds->y ); + p448_copy ( &a->z, &ds->z ); + p448_copy ( &a->t, &ds->t ); + p448_copy ( &a->u, &ds->u ); +} + +void +copy_tw_niels ( + struct tw_niels_t* a, + const struct tw_niels_t* ds +) { + p448_copy ( &a->a, &ds->a ); + p448_copy ( &a->b, &ds->b ); + p448_copy ( &a->c, &ds->c ); +} + +void +copy_tw_pniels ( + struct tw_pniels_t* a, + const struct tw_pniels_t* ds +) { + copy_tw_niels( &a->n, &ds->n ); + p448_copy ( &a->z, &ds->z ); +} + + + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __CC_INCLUDED_EC_POINT_H__ */ diff --git a/src/include/intrinsics.h b/src/include/intrinsics.h new file mode 100644 index 0000000..02a8a1e --- /dev/null +++ 
b/src/include/intrinsics.h @@ -0,0 +1,244 @@ +/* Copyright (c) 2011 Stanford University. + * Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +/** @file intrinsics.h + * @brief cRandom intrinsics header. + */ + +#ifndef __CRANDOM_INTRINSICS_H__ +#define __CRANDOM_INTRINSICS_H__ 1 + +#include + +#include + +#define INTRINSIC \ + static __inline__ __attribute__((__gnu_inline__, __always_inline__, unused)) + +#define GEN 1 +#define SSE2 2 +#define SSSE3 4 +#define AESNI 8 +#define XOP 16 +#define AVX 32 +#define AVX2 64 +#define RDRAND 128 + +/** + * If on x86, read the timestamp counter. Otherwise, return 0. + */ +INTRINSIC u_int64_t rdtsc() { + u_int64_t out = 0; +# if (defined(__i386__) || defined(__x86_64__)) + __asm__ __volatile__ ("rdtsc" : "=A"(out)); +# endif + return out; +} + +/** + * Return x unchanged, but confuse the compiler. + * + * This is mainly for use in test scripts, to prevent the value from + * being constant-folded or removed by dead code elimination. + * + * @param x A 64-bit number. + * @return The same number in a register. 
+ */ +INTRINSIC u_int64_t opacify(u_int64_t x) { + __asm__ volatile("mov %0, %0" : "+r"(x)); + return x; +} + +#ifdef __AVX2__ +# define MIGHT_HAVE_AVX2 1 +# ifndef MUST_HAVE_AVX2 +# define MUST_HAVE_AVX2 0 +# endif +#else +# define MIGHT_HAVE_AVX2 0 +# define MUST_HAVE_AVX2 0 +#endif + +#ifdef __AVX__ +# define MIGHT_HAVE_AVX 1 +# ifndef MUST_HAVE_AVX +# define MUST_HAVE_AVX MUST_HAVE_AVX2 +# endif +#else +# define MIGHT_HAVE_AVX 0 +# define MUST_HAVE_AVX 0 +#endif + +#ifdef __SSSE3__ +# define MIGHT_HAVE_SSSE3 1 +# ifndef MUST_HAVE_SSSE3 +# define MUST_HAVE_SSSE3 MUST_HAVE_AVX +# endif +#else +# define MIGHT_HAVE_SSSE3 0 +# define MUST_HAVE_SSSE3 0 +#endif + +#ifdef __SSE2__ +# define MIGHT_HAVE_SSE2 1 +# ifndef MUST_HAVE_SSE2 +# define MUST_HAVE_SSE2 MUST_HAVE_SSSE3 +# endif + typedef __m128i ssereg; +# define pslldq _mm_slli_epi32 +# define pshufd _mm_shuffle_epi32 + +INTRINSIC ssereg sse2_rotate(int r, ssereg a) { + return _mm_slli_epi32(a, r) ^ _mm_srli_epi32(a, 32-r); +} + +#else +# define MIGHT_HAVE_SSE2 0 +# define MUST_HAVE_SSE2 0 +#endif + +#ifdef __AES__ +/* don't include intrinsics file, because not all platforms have it */ +# define MIGHT_HAVE_AESNI 1 +# ifndef MIGHT_HAVE_RDRAND +# define MIGHT_HAVE_RDRAND 1 +# endif +# ifndef MUST_HAVE_RDRAND +# define MUST_HAVE_RDRAND 0 +# endif +# ifndef MUST_HAVE_AESNI +# define MUST_HAVE_AESNI 0 +# endif + +#else +# define MIGHT_HAVE_AESNI 0 +# define MUST_HAVE_AESNI 0 +# define MIGHT_HAVE_RDRAND 0 +# define MUST_HAVE_RDRAND 0 +#endif + +#ifdef __XOP__ +/* don't include intrinsics file, because not all platforms have it */ +# define MIGHT_HAVE_XOP 1 +# ifndef MUST_HAVE_XOP +# define MUST_HAVE_XOP 0 +# endif +INTRINSIC ssereg xop_rotate(int amount, ssereg x) { + ssereg out; + __asm__ ("vprotd %1, %2, %0" : "=x"(out) : "x"(x), "g"(amount)); + return out; +} +#else +# define MIGHT_HAVE_XOP 0 +# define MUST_HAVE_XOP 0 +#endif + +#define MIGHT_MASK \ + ( SSE2 * MIGHT_HAVE_SSE2 \ + | SSSE3 * MIGHT_HAVE_SSSE3 \ + | 
AESNI * MIGHT_HAVE_AESNI \ + | XOP * MIGHT_HAVE_XOP \ + | AVX * MIGHT_HAVE_AVX \ + | RDRAND * MIGHT_HAVE_RDRAND \ + | AVX2 * MIGHT_HAVE_AVX2) + +#define MUST_MASK \ + ( SSE2 * MUST_HAVE_SSE2 \ + | SSSE3 * MUST_HAVE_SSSE3 \ + | AESNI * MUST_HAVE_AESNI \ + | XOP * MUST_HAVE_XOP \ + | AVX * MUST_HAVE_AVX \ + | RDRAND * MUST_HAVE_RDRAND \ + | AVX2 * MUST_HAVE_AVX2 ) + +#define MIGHT_HAVE(feature) ((MIGHT_MASK & feature) == feature) +#define MUST_HAVE(feature) ((MUST_MASK & feature) == feature) + +#ifdef __cplusplus +# define extern_c extern "C" +#else +# define extern_c +#endif + +extern_c +unsigned int crandom_detect_features(); + +#ifndef likely +# define likely(x) __builtin_expect((x),1) +# define unlikely(x) __builtin_expect((x),0) +#endif + +/** + * Atomic compare and swap, return by fetching. + * + * Equivalent to: + * ret = *target; if (*target == old) *target = new; return ret; + * + * @param [inout] target The volatile memory area to be CAS'd + * @param [in] old The expected old value of the target. + * @param [in] new A value to replace the target on success. + */ +INTRINSIC const char * +compare_and_swap ( + const char *volatile* target, + const char *old, + const char *new +); + +const char *compare_and_swap ( + const char *volatile* target, + const char *old, + const char *new +) { + return __sync_val_compare_and_swap(target,old,new); +} + +/** + * Atomic compare and swap. Return whether successful. + * + * Equivalent to: + * if (*target == old) { *target = new; return nonzero; } else { return 0; } + * + * @param [inout] target The volatile memory area to be CAS'd + * @param [in] old The expected old value of the target. + * @param [in] new A value to replace the target on success. 
+ */ +INTRINSIC int +bool_compare_and_swap ( + const char *volatile* target, + const char *old, + const char *new +); + +int +bool_compare_and_swap ( + const char *volatile* target, + const char *old, + const char *new +) { + return __sync_bool_compare_and_swap(target,old,new); +} + +/** + * Determine whether the current processor supports the given feature. + * + * This function is designed so that it should only have runtime overhead + * if the feature is not known at compile time -- that is, if + * MIGHT_HAVE(feature) is set, but MUST_HAVE(feature) is not. + */ +extern volatile unsigned int crandom_features; +INTRINSIC int HAVE(unsigned int feature); + +int HAVE(unsigned int feature) { + unsigned int features; + if (!MIGHT_HAVE(feature)) return 0; + if (MUST_HAVE(feature)) return 1; + features = crandom_features; + if (unlikely(!features)) + crandom_features = features = crandom_detect_features(); + return likely((features & feature) == feature); +} + +#endif /* __CRANDOM_INTRINSICS_H__ */ diff --git a/src/include/scalarmul.h b/src/include/scalarmul.h new file mode 100644 index 0000000..122fccc --- /dev/null +++ b/src/include/scalarmul.h @@ -0,0 +1,289 @@ +/** + * @file scalarmul.h + * @copyright + * Copyright (c) 2014 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. + * @author Mike Hamburg + */ + +#ifndef __P448_ALGO_H__ +#define __P448_ALGO_H__ 1 + +#include "ec_point.h" +#include "intrinsics.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * A precomputed table for fixed-base scalar multiplication. + * + * This uses a signed combs format. + */ +struct fixed_base_table_t { + /** Comb tables containing multiples of the base point. */ + struct tw_niels_t *table; + + /** Adjustments to the scalar in even and odd cases, respectively. */ + word_t scalar_adjustments[2*(448/WORD_BITS)]; + + /** The number of combs in the table. */ + unsigned int n; + + /** The number of teeth in each comb. 
*/ + unsigned int t; + + /** The spacing between the teeth. */ + unsigned int s; + + /** If nonzero, the table was malloc'd by precompute_for_combs. */ + unsigned int own_table; +}; + +/** + * Full Montgomery ladder in inverse square root format. + * + * Out = [2^n_extra_doubles * scalar] * in, where + * scalar is little-endian and has length $nbits$ bits. + * + * If the scalar is even and/or n_extra_doubles >= 1, + * then this function will reject points which are not + * on the curve by returning MASK_FAILURE. + * + * This function will also reject multiplies which output + * the identity or the point of order 2. It may be worth + * revisiting this decision in the FUTURE. The idea is that + * this can only happen when: the input is the identity or the + * point of order 2; or the input is the point of order 4 on + * the twist; or the scalar is 0 or a multiple of the curve + * order; or the scalar is a multiple of the twist order and + * the input point is on the twist. + * + * This function takes constant time with respect to $*in$ + * and $*scalar$, but not of course with respect to nbits or + * n_extra_doubles. + * + * For security, we recommend setting n_extra_doubles = 1. + * Because the cofactor of Goldilocks is 4 and input points + * are always even (when on the curve), this will cancel the + * cofactor. + * + * @param [out] out The output point. + * @param [in] in The base point. + * @param [in] scalar The scalar's little-endian representation. + * @param [in] nbits The number of bits in the scalar. Note that + * unlike in Curve25519, we do not require the top bit to be set. + * @param [in] n_extra_doubles The number of extra doubles to do at + * the end. + * + * @retval MASK_SUCCESS The operation was successful. + * @retval MASK_FAILURE The input point was invalid, or the output + * would be the identity or the point of order 2. 
+ */ +mask_t +montgomery_ladder ( + struct p448_t *out, + const struct p448_t *in, + const word_t *scalar, + unsigned int nbits, + unsigned int n_extra_doubles +) __attribute__((warn_unused_result)); + +/** + * Scalar multiply a twisted Edwards-form point. + * + * This function takes constant time. + * + * Currently the scalar is always exactly 448 bits long. + * + * @param [inout] working The point to multply. + * @param [in] scalar The scalar, in little-endian form. + */ +void +scalarmul ( + struct tw_extensible_t *working, + const word_t scalar[448/WORD_BITS] + /* TODO? int nbits */ +); + +/** + * Scalar multiply a twisted Edwards-form point. Use the same + * algorithm as scalarmul(), but uses variable array indices. + * + * Currently the scalar is always exactly 448 bits long. + * + * @warning This function uses variable array indices, + * so it is insecure against cache-timing attacks. It is intended + * for microbenchmarking, to see how much constant-time arithmetic + * costs us. + * + * @param [inout] working The point to multply. + * @param [in] scalar The scalar, in little-endian form. + */ +void +scalarmul_vlook ( + struct tw_extensible_t *working, + const word_t scalar[448/WORD_BITS] + /* TODO? int nbits */ +); + +/** + * Precompute a table to accelerate fixed-point scalar + * multiplication using the "multiple signed combs" approach. + * + * This function computes $n$ "comb" tables, each containing + * 2^(t-1) points in tw_niels_t format. You must have + * n * t * s >= 446 for complete coverage. + * + * The scalar multiplication algorithm may adjust the scalar by + * a multiple of q. Therefore, we strongly recommend to use base + * points in the q-torsion group (i.e. doubly even points). + * + * @param [out] out The table to compute. + * @param [in] base The base point. + * @param [in] n The number of combs in the table. + * @param [in] t The number of teeth in each comb. + * @param [in] s The spacing between the teeth. 
+ * @param [out] prealloc An optional preallocated array containing + * space for n<<(t-1) values of type tw_niels_t. + * + * @retval MASK_SUCCESS Success. + * @retval MASK_FAILURE Failure, most likely because we are out + * of memory. + */ +mask_t +precompute_fixed_base ( + struct fixed_base_table_t *out, + const struct tw_extensible_t *base, + unsigned int n, + unsigned int t, + unsigned int s, + struct tw_niels_t *prealloc +) __attribute__((warn_unused_result)); + + /** + * Destroy a fixed-base table. Frees any memory that we allocated + * for the combs. + * + * @param [in] table The table to destroy. + */ +void +destroy_fixed_base ( + struct fixed_base_table_t *table +); + +/** + * Scalar multiplication with precomputation. Set working to + * to [scalar] * Base, where Base is the base point passed to + * precompute_for_combs(). + * + * The scalar may be adjusted by a multiple of q, so this routine + * can be wrong by a cofactor if the base has cofactor components. + * + * @param [out] out The output point. + * @param [in] scalar The scalar. + * @param [in] nbits The number of bits in the scalar. Must be <= n*t*s. + * @param [in] table The precomputed table. + * + * @retval MASK_SUCCESS Success. + * @retval MASK_FAILURE Failure, because n*t*s < nbits + */ +mask_t +scalarmul_fixed_base ( + struct tw_extensible_t *out, + const word_t *scalar, + unsigned int nbits, + const struct fixed_base_table_t *table +); + +/** + * Variable-time scalar multiplication. + * + * @warning This function takes variable time. It is intended for + * microbenchmarking. + * + * @param [inout] working The input and output point. + * @param [in] scalar The scalar. + */ +void +scalarmul_vt ( + struct tw_extensible_t *working, + const word_t scalar[448/WORD_BITS] +); + + +/** + * Precompute a table to accelerate fixed-point scalar + * multiplication (and, more importantly, linear combos) + * using the "windowed non-adjacent form" approach. + * + * @param [out] out The output table. 
Must have room for 1< + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * SHA512 hashing context. + * + * This structure is opaque. + */ +struct sha512_ctx_t { + /** @privatesection */ + uint64_t chain[8]; + uint8_t block[128]; + uint64_t nbytes; +}; + +void +sha512_init ( + struct sha512_ctx_t *ctx +); + +void +sha512_update ( + struct sha512_ctx_t *ctx, + const unsigned char *data, + uint64_t bytes +); + +void +sha512_final ( + struct sha512_ctx_t *ctx, + uint8_t result[64] +); + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __GOLDI_SHA512_H__ */ diff --git a/src/include/word.h b/src/include/word.h new file mode 100644 index 0000000..0fc7427 --- /dev/null +++ b/src/include/word.h @@ -0,0 +1,128 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#ifndef __WORD_H__ +#define __WORD_H__ + +/* for posix_memalign */ +#define _XOPEN_SOURCE 600 + +#include +#include +#include +#include + +#if (__SIZEOF_INT128__ == 16 && __SIZEOF_SIZE_T__ == 8 && (__SIZEOF_LONG__==8 || __POINTER_WIDTH__==64) && !GOLDI_FORCE_32_BIT) +/* It's a 64-bit machine if: + * // limits.h thinks so + * __uint128_t exists + * size_t is 64 bits + * Either longs are 64-bits (doesn't happen on Windows) + * or pointers are 64-bits (doesn't happen on 32/64 arches) + * FUTURE: validate this hack on more architectures. 
+ */ +typedef uint32_t hword_t; +typedef uint64_t word_t; +typedef __uint128_t dword_t; +typedef int32_t hsword_t; +typedef int64_t sword_t; +typedef __int128_t dsword_t; +#define PRIxWORD PRIx64 +#define PRIxWORDfull "%016" PRIx64 +#define PRIxWORD58 "%014" PRIx64 +#define U64LE(x) x##ull +#define U58LE(x) x##ull +#else +typedef uint16_t hword_t; +typedef uint32_t word_t; +typedef uint64_t dword_t; +typedef int16_t hsword_t; +typedef int32_t sword_t; +typedef int64_t dsword_t; +#define PRIxWORD PRIx32 +#define PRIxWORDfull "%08" PRIx32 +#define PRIxWORD58 "%07" PRIx32 +#define U64LE(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32 +#define U58LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28 +#endif + +#define WORD_BITS (sizeof(word_t) * 8) + +/* TODO: vector width for procs like ARM; gcc support */ +typedef word_t mask_t, vecmask_t __attribute__((ext_vector_type(4))); + +static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -1; + +/* FIXME this only works on clang */ +typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2))); +typedef int64_t int64x2_t __attribute__((ext_vector_type(2))); +typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4))); +typedef int64_t int64x4_t __attribute__((ext_vector_type(4))); +typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4))); +typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); +typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8))); +typedef int32_t int32x8_t __attribute__((ext_vector_type(8))); + +#if __AVX2__ +typedef uint32x8_t big_register_t; +typedef uint64x4_t uint64xn_t; +typedef uint32x8_t uint32xn_t; +#elif __SSE2__ || __ARM_NEON__ +typedef uint32x4_t big_register_t; +typedef uint64x2_t uint64xn_t; +typedef uint32x4_t uint32xn_t; +#elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__ +typedef uint64_t big_register_t, uint64xn_t; +typedef uint32_t uint32xn_t; +#else +typedef uint64_t uint64xn_t; +typedef uint32_t uint32xn_t; +typedef uint32_t big_register_t; +#endif + + +#if __AVX2__ || 
__SSE2__ || __ARM_NEON__ +static __inline__ big_register_t +br_is_zero(big_register_t x) { + return (big_register_t)(x == (big_register_t)0); +} +#else +static __inline__ mask_t +br_is_zero(word_t x) { + return (((dword_t)x) - 1)>>WORD_BITS; +} +#endif + + + +/** + * Allocate memory which is sufficiently aligned to be used for the + * largest vector on the system (for now that's a big_register_t). + * + * Man malloc says that it does this, but at least for AVX2 on MacOS X, + * it's lying. + * + * @param size The size of the region to allocate. + * @return A suitable pointer, which can be free'd with free(), + * or NULL if no memory can be allocated. + */ +static __inline__ void * +malloc_vector ( + size_t size +) __attribute__((always_inline, unused)); + +void * +malloc_vector(size_t size) { + void *out = NULL; + + int ret = posix_memalign(&out, sizeof(big_register_t), size); + + if (ret) { + return NULL; + } else { + return out; + } +} + +#endif /* __WORD_H__ */ diff --git a/src/scalarmul.c b/src/scalarmul.c new file mode 100644 index 0000000..1ad856c --- /dev/null +++ b/src/scalarmul.c @@ -0,0 +1,844 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ +#include "word.h" + +#include +#include +#include + +#include "intrinsics.h" +#include "scalarmul.h" +#include "barrett_field.h" + +mask_t +montgomery_ladder ( + struct p448_t *out, + const struct p448_t *in, + const word_t *scalar, + unsigned int nbits, + unsigned int n_extra_doubles +) { + struct montgomery_t mont; + deserialize_montgomery(&mont, in); + + int i,j,n=(nbits-1)%WORD_BITS; + mask_t pflip = 0; + for (j=(nbits+WORD_BITS-1)/WORD_BITS-1; j>=0; j--) { + word_t w = scalar[j]; + for (i=n; i>=0; i--) { + mask_t flip = -((w>>i)&1); + p448_cond_swap(&mont.xa,&mont.xd,flip^pflip); + p448_cond_swap(&mont.za,&mont.zd,flip^pflip); + montgomery_step(&mont); + pflip = flip; + } + n = WORD_BITS-1; + } + p448_cond_swap(&mont.xa,&mont.xd,pflip); + p448_cond_swap(&mont.za,&mont.zd,pflip); + + assert(n_extra_doubles < INT_MAX); + for (j=0; j<(int)n_extra_doubles; j++) { + montgomery_step(&mont); + } + + return serialize_montgomery(out, &mont, in); +} + +static __inline__ void +cond_negate_tw_niels ( + struct tw_niels_t *n, + mask_t doNegate +) { + p448_cond_swap(&n->a, &n->b, doNegate); + p448_cond_neg(&n->c, doNegate); +} + +static __inline__ void +cond_negate_tw_pniels ( + struct tw_pniels_t *n, + mask_t doNegate +) { + cond_negate_tw_niels(&n->n, doNegate); +} + +void +constant_time_lookup_tw_pniels ( + struct tw_pniels_t *out, + const struct tw_pniels_t *in, + int nin, + int idx +) { + big_register_t big_one = 1, big_i = idx; + big_register_t *o = (big_register_t *)out; + const big_register_t *i = (const big_register_t *)in; + int j; + unsigned int k; + + memset(out, 0, sizeof(*out)); + for (j=0; j>= 1; + if (i> (i%WORD_BITS) & 0xF, + inv = (bits>>3)-1; + bits ^= inv; + + constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7); + cond_negate_tw_pniels(&pn, inv); + convert_tw_pniels_to_tw_extensible(working, &pn); + + + for (i-=4; i>=0; i-=4) { + double_tw_extensible(working); + double_tw_extensible(working); + double_tw_extensible(working); + 
double_tw_extensible(working); + + bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF; + inv = (bits>>3)-1; + bits ^= inv; + + constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7); + cond_negate_tw_pniels(&pn, inv); + add_tw_pniels_to_tw_extensible(working, &pn); + } +} + +void +scalarmul_vlook ( + struct tw_extensible_t *working, + const word_t scalar[448/WORD_BITS] +) { + + const int nbits=448; /* HACK? */ + word_t prepared_data[448*2/WORD_BITS] = { + U64LE(0x9595b847fdf73126), + U64LE(0x9bb9b8a856af5200), + U64LE(0xb3136e22f37d5c4f), + U64LE(0x0000000189a19442), + U64LE(0x0000000000000000), + U64LE(0x0000000000000000), + U64LE(0x4000000000000000), + + U64LE(0x721cf5b5529eec33), + U64LE(0x7a4cf635c8e9c2ab), + U64LE(0xeec492d944a725bf), + U64LE(0x000000020cd77058), + U64LE(0x0000000000000000), + U64LE(0x0000000000000000), + U64LE(0x0000000000000000) + }; /* TODO: split off */ + + word_t scalar2[448/WORD_BITS]; + convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS); + + struct tw_extensible_t tabulator; + copy_tw_extensible(&tabulator, working); + double_tw_extensible(&tabulator); + + struct tw_pniels_t pn, multiples[8]; + convert_tw_extensible_to_tw_pniels(&pn, &tabulator); + convert_tw_extensible_to_tw_pniels(&multiples[0], working); + + int i; + for (i=1; i<8; i++) { + add_tw_pniels_to_tw_extensible(working, &pn); + convert_tw_extensible_to_tw_pniels(&multiples[i], working); + } + + i = nbits - 4; + int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF, + inv = (bits>>3)-1; + bits ^= inv; + + copy_tw_pniels(&pn, &multiples[bits&7]); + cond_negate_tw_pniels(&pn, inv); + convert_tw_pniels_to_tw_extensible(working, &pn); + + + for (i-=4; i>=0; i-=4) { + double_tw_extensible(working); + double_tw_extensible(working); + double_tw_extensible(working); + double_tw_extensible(working); + + bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF; + inv = (bits>>3)-1; + bits ^= inv; + + copy_tw_pniels(&pn, &multiples[bits&7]); + 
cond_negate_tw_pniels(&pn, inv); + add_tw_pniels_to_tw_extensible(working, &pn); + } +} + + +mask_t +scalarmul_fixed_base ( + struct tw_extensible_t *out, + const word_t scalar[448/WORD_BITS], + unsigned int nbits, + const struct fixed_base_table_t *table +) { + unsigned int n = table->n, t = table->t, s = table->s; + assert(n >= 1 && t >= 1 && s >= 1); + + if (n*t*s < nbits) { + return MASK_FAILURE; + } + + unsigned int scalar_words = (nbits + WORD_BITS - 1)/WORD_BITS, + scalar2_words = scalar_words; + if (scalar2_words < 448 / WORD_BITS) + scalar2_words = 448 / WORD_BITS; + word_t scalar2[scalar2_words], scalar3[scalar2_words]; + + /* Copy scalar to scalar3, but clear its high bits (if there are any) */ + unsigned int i,j,k; + for (i=0; iscalar_adjustments , 448 / WORD_BITS + ); + + struct tw_niels_t ni; + + for (i=0; i> (bit%WORD_BITS) & 1) << k; + } + } + + mask_t invert = (tab>>(t-1))-1; + tab ^= invert; + tab &= (1<<(t-1)) - 1; + + constant_time_lookup_tw_niels(&ni, table->table + (j<<(t-1)), 1<<(t-1), tab); + cond_negate_tw_niels(&ni, invert); + if (i||j) { + add_tw_niels_to_tw_extensible(out, &ni); + } else { + convert_tw_niels_to_tw_extensible(out, &ni); + } + } + } + + return MASK_SUCCESS; +} + +mask_t +precompute_fixed_base ( + struct fixed_base_table_t *out, + const struct tw_extensible_t *base, + unsigned int n, + unsigned int t, + unsigned int s, + struct tw_niels_t *prealloc +) { + if (s < 1 || t < 1 || n < 1 || n*t*s < 446) { + memset(out, 0, sizeof(*out)); + return 0; + } + + out->n = n; + out->t = t; + out->s = s; + + struct tw_extensible_t working, start; + copy_tw_extensible(&working, base); + struct tw_pniels_t pn_tmp; + + struct tw_pniels_t *doubles = (struct tw_pniels_t *) malloc_vector(sizeof(*doubles) * (t-1)); + struct p448_t *zs = (struct p448_t *) malloc_vector(sizeof(*zs) * (n<<(t-1))); + struct p448_t *zis = (struct p448_t *) malloc_vector(sizeof(*zis) * (n<<(t-1))); + + struct tw_niels_t *table = prealloc; + if (prealloc) { + 
out->own_table = 0; + } else { + table = (struct tw_niels_t *) malloc_vector(sizeof(*table) * (n<<(t-1))); + out->own_table = 1; + } + out->table = table; + + if (!doubles || !zs || !zis || !table) { + free(doubles); + free(zs); + free(zis); + memset(out, 0, sizeof(*out)); + memset(table, 0, sizeof(*table) * (n<<(t-1))); + if (!prealloc) free(table); + return 0; + } + + unsigned int i,j,k; + + /* Compute the scalar adjustments, equal to 2^nbits-1 mod q */ + unsigned int adjustment_size = (n*t*s)/WORD_BITS + 1; + assert(adjustment_size >= 448/WORD_BITS); + word_t adjustment[adjustment_size]; + for (i=0; iscalar_adjustments[(448/WORD_BITS)*(adjustment[0] & 1)], + *high_adjustment = &out->scalar_adjustments[(448/WORD_BITS)*((~adjustment[0]) & 1)]; + for (i=0; i<448/WORD_BITS; i++) { + low_adjustment[i] = adjustment[i]; + } + + /* The high adjustment is low + q = low - q_lo + 2^big */ + (void) + sub_nr_ext_packed( + high_adjustment, + adjustment, 448/WORD_BITS, + goldi_q448.p_lo, goldi_q448.nwords_lo, + -1 + ); + if (goldi_q448.p_shift) { + high_adjustment[goldi_q448.nwords_p - 1] += ((word_t)1)<>1); + int idx = ((i+1)<<(t-1))-1 ^ gray; + + convert_tw_extensible_to_tw_pniels(&pn_tmp, &start); + copy_tw_niels(&table[idx], &pn_tmp.n); + p448_copy(&zs[idx], &pn_tmp.z); + + if (j >= (1<<(t-1)) - 1) break; + int delta = (j+1) ^ ((j+1)>>1) ^ gray; + + for (k=0; delta>1; k++) + delta >>=1; + + if (gray & (1<table) { + memset(table->table,0,sizeof(*table->table)*(table->n<<(table->t-1))); + } + if (table->own_table) { + free(table->table); + } + memset(table,0,sizeof(*table)); +} + +mask_t +precompute_fixed_base_wnaf ( + struct tw_niels_t *out, + const struct tw_extensible_t *const_base, + unsigned int tbits +) { + int i; + struct p448_t *zs = (struct p448_t *) malloc_vector(sizeof(*zs)< 0) { + double_tw_extensible(&base); + convert_tw_extensible_to_tw_pniels(&twop, &base); + add_tw_pniels_to_tw_extensible(&base, &tmp); + + convert_tw_extensible_to_tw_pniels(&tmp, &base); + 
p448_copy(&zs[1], &tmp.z); + copy_tw_niels(&out[1], &tmp.n); + + for (i=2; i < 1<= 0; i--) { + int bit = (scalar[i/WORD_BITS] >> (i%WORD_BITS)) & 1; + current = 2*current + bit; + + /* + * Sizing: |current| >= 2^(tableBits+1) -> |current| = 2^0 + * So current loses (tableBits+1) bits every time. It otherwise gains + * 1 bit per iteration. The number of iterations is + * (nbits + 2 + tableBits), and an additional control word is added at + * the end. So the total number of control words is at most + * ceil((nbits+1) / (tableBits+1)) + 2 = floor((nbits)/(tableBits+1)) + 2. + * There's also the stopper with power -1, for a total of +3. + */ + if (current >= (2<> 1; // |delta| < 2^tablebits + current = -(current & 1); + + for (j=i; (delta & 1) == 0; j++) { + delta >>= 1; + } + control[position].power = j+1; + control[position].addend = delta; + position++; + assert(position <= nbits/(tableBits+1) + 2); + } + } + + if (current) { + for (j=0; (current & 1) == 0; j++) { + current >>= 1; + } + control[position].power = j; + control[position].addend = current; + position++; + assert(position <= nbits/(tableBits+1) + 2); + } + + + control[position].power = -1; + control[position].addend = 0; + return position; +} + + +static void +prepare_wnaf_table( + struct tw_pniels_t *output, + struct tw_extensible_t *working, + unsigned int tbits +) { + convert_tw_extensible_to_tw_pniels(&output[0], working); + + if (tbits == 0) return; + + double_tw_extensible(working); + struct tw_pniels_t twop; + convert_tw_extensible_to_tw_pniels(&twop, working); + + add_tw_pniels_to_tw_extensible(working, &output[0]); + convert_tw_extensible_to_tw_pniels(&output[1], working); + + for (int i=2; i < 1< 0) { + assert(control[0].addend > 0); + assert(control[0].power >= 0); + convert_tw_pniels_to_tw_extensible(working, &precmp[control[0].addend >> 1]); + } else { + set_identity_tw_extensible(working); + return; + } + + int conti = 1, i; + for (i = control[0].power - 1; i >= 0; i--) { + 
double_tw_extensible(working); + + if (i == control[conti].power) { + assert(control[conti].addend); + + if (control[conti].addend > 0) { + add_tw_pniels_to_tw_extensible(working, &precmp[control[conti].addend >> 1]); + } else { + sub_tw_pniels_from_tw_extensible(working, &precmp[(-control[conti].addend) >> 1]); + } + conti++; + assert(conti <= control_bits); + } + } +} + +void +scalarmul_fixed_base_wnaf_vt ( + struct tw_extensible_t *working, + const word_t scalar[448/WORD_BITS], + unsigned int nbits, + const struct tw_niels_t *precmp, + unsigned int table_bits +) { + struct smvt_control control[nbits/(table_bits+1)+3]; + + int control_bits = recode_wnaf(control, scalar, nbits, table_bits); + + if (control_bits > 0) { + assert(control[0].addend > 0); + assert(control[0].power >= 0); + convert_tw_niels_to_tw_extensible(working, &precmp[control[0].addend >> 1]); + } else { + set_identity_tw_extensible(working); + return; + } + + int conti = 1, i; + for (; control[conti].power >= 0; conti++) { + assert(conti <= control_bits); + for (i = control[conti-1].power - control[conti].power; i; i--) { + double_tw_extensible(working); + } + + assert(control[conti].addend); + if (control[conti].addend > 0) { + add_tw_niels_to_tw_extensible(working, &precmp[control[conti].addend >> 1]); + } else { + sub_tw_niels_from_tw_extensible(working, &precmp[(-control[conti].addend) >> 1]); + } + } + + for (i = control[conti-1].power; i; i--) { + double_tw_extensible(working); + } +} + +void +linear_combo_var_fixed_vt( + struct tw_extensible_t *working, + const word_t scalar_var[448/WORD_BITS], + unsigned int nbits_var, + const word_t scalar_pre[448/WORD_BITS], + unsigned int nbits_pre, + const struct tw_niels_t *precmp, + unsigned int table_bits_pre +) { + const int table_bits_var = 3; + struct smvt_control control_var[nbits_var/(table_bits_var+1)+3]; + struct smvt_control control_pre[nbits_pre/(table_bits_pre+1)+3]; + + int ncb_var = recode_wnaf(control_var, scalar_var, nbits_var, 
table_bits_var); + int ncb_pre = recode_wnaf(control_pre, scalar_pre, nbits_pre, table_bits_pre); + (void)ncb_var; + (void)ncb_pre; + + struct tw_pniels_t precmp_var[1< control_pre[0].power) { + convert_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[0].addend >> 1]); + contv++; + } else if (i == control_pre[0].power && i >=0 ) { + convert_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[0].addend >> 1]); + add_tw_niels_to_tw_extensible(working, &precmp[control_pre[0].addend >> 1]); + contv++; contp++; + } else { + i = control_pre[0].power; + convert_tw_niels_to_tw_extensible(working, &precmp[control_pre[0].addend >> 1]); + contp++; + } + + if (i < 0) { + set_identity_tw_extensible(working); + return; + } + + for (i--; i >= 0; i--) { + double_tw_extensible(working); + + if (i == control_var[contv].power) { + assert(control_var[contv].addend); + + if (control_var[contv].addend > 0) { + add_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[contv].addend >> 1]); + } else { + sub_tw_pniels_from_tw_extensible(working, &precmp_var[(-control_var[contv].addend) >> 1]); + } + contv++; + } + + if (i == control_pre[contp].power) { + assert(control_pre[contp].addend); + + if (control_pre[contp].addend > 0) { + add_tw_niels_to_tw_extensible(working, &precmp[control_pre[contp].addend >> 1]); + } else { + sub_tw_niels_from_tw_extensible(working, &precmp[(-control_pre[contp].addend) >> 1]); + } + contp++; + } + } + + assert(contv == ncb_var); + assert(contp == ncb_pre); +} + + + diff --git a/src/sha512.c b/src/sha512.c new file mode 100644 index 0000000..dd1468b --- /dev/null +++ b/src/sha512.c @@ -0,0 +1,187 @@ +/* Copyright (c) 2011 Stanford University. + * Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ +#ifndef __APPLE__ +#define _BSD_SOURCE +#include +#endif + +#include "sha512.h" + +#include +#include + +static inline uint64_t +rotate_r ( + uint64_t x, + int d +) { + return (x >> d) | (x << (64-d)); +} + +#ifdef __APPLE__ +static inline uint64_t +htobe64 (uint64_t x) { + __asm__ ("bswapq %0" : "+r"(x)); + return x; +} +#endif + +static const uint64_t +sha512_init_state[8] = { + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 +}; + +static const uint64_t +sha512_k[80] = { + 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, + 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, + 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, + 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694, + 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, + 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, + 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4, + 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70, + 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df, + 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b, + 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30, + 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8, + 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8, + 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, + 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec, + 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b, + 0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, + 
0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b, + 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c, + 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 +}; + +static inline uint64_t S0 (uint64_t h1) { + return rotate_r(h1, 28) ^ rotate_r(h1, 34) ^ rotate_r(h1, 39); +} + +static inline uint64_t S1 (uint64_t h4) { + return rotate_r(h4,14) ^ rotate_r(h4,18) ^ rotate_r(h4,41); +} + +static inline uint64_t s0 (uint64_t a) { + return rotate_r(a,1) ^ rotate_r(a,8) ^ a>>7; +} + +static inline uint64_t s1 (uint64_t b) { + return rotate_r(b,19) ^ rotate_r(b,61) ^ b>>6; +} + +static inline uint64_t ch (uint64_t h4, uint64_t h5, uint64_t h6) { + return h6^(h4 & (h6^h5)); +} + +static inline uint64_t maj(uint64_t h1, uint64_t h2, uint64_t h3) { + return (h1&h2) ^ (h3&(h1^h2)); +} + +static void +sha512_process_block ( + struct sha512_ctx_t *ctx +) { + uint64_t i, tmp, a, b, + *w = (uint64_t *) ctx->block, + *state = ctx->chain, + h0 = state[0], h1 = state[1], h2 = state[2], h3 = state[3], + h4 = state[4], h5 = state[5], h6 = state[6], h7 = state[7]; + + /* Clang doesn't unswitch this automatically */ + for (i=0; i<16; i++) { + /* load up the input word for this round */ + tmp = w[i] = htobe64(w[i]); + tmp = tmp + h7 + S1(h4) + ch(h4,h5,h6) + sha512_k[i]; + + /* shift register */ + h7 = h6; h6 = h5; h5 = h4; + h4 = h3 + tmp; + h3 = h2; h2 = h1; h1 = h0; + h0 = tmp + maj(h1,h2,h3) + S0(h1); + } + + for (; i<80; i++) { + /* load up the input word for this round */ + a = w[(i+1 ) & 15]; + b = w[(i+14) & 15]; + tmp = w[i&15] = s0(a) + s1(b) + w[i&15] + w[(i+9) & 15]; + tmp = tmp + h7 + S1(h4) + ch(h4,h5,h6) + sha512_k[i]; + + /* shift register */ + h7 = h6; h6 = h5; h5 = h4; + h4 = h3 + tmp; + h3 = h2; h2 = h1; h1 = h0; + h0 = tmp + maj(h1,h2,h3) + S0(h1); + } + + state[0] += h0; + state[1] += h1; + state[2] += h2; + state[3] += h3; + state[4] += h4; + state[5] += h5; + state[6] += h6; + 
state[7] += h7; +} + +void +sha512_init ( + struct sha512_ctx_t *ctx +) { + ctx->nbytes = 0; + memcpy(ctx->chain, sha512_init_state, sizeof(sha512_init_state)); + memset(ctx->block, 0, sizeof(ctx->block)); +} + +void +sha512_update ( + struct sha512_ctx_t *ctx, + const unsigned char *data, + uint64_t bytes +) { + assert(ctx->nbytes < 1ull<<56); + assert(bytes < 1ull<<56); + + while (bytes) { + uint64_t fill = ctx->nbytes % 128, accept = 128 - fill; + if (accept > bytes) accept = bytes; + ctx->nbytes += accept; + memcpy(ctx->block + fill, data, accept); + + if (fill+accept == 128) + sha512_process_block(ctx); + + bytes -= accept; + data += accept; + } + + assert(ctx->nbytes < 1ull<<56); +} + +void +sha512_final ( + struct sha512_ctx_t *ctx, + uint8_t result[64] +) { + uint64_t fill = ctx->nbytes % 128, i; + ctx->block[fill++] = 0x80; + if (fill > 112) { + memset(ctx->block + fill, 0, 128-fill); + sha512_process_block(ctx); + fill = 0; + } + memset(ctx->block + fill, 0, 112-fill); + *((uint64_t *)&ctx->block[112]) = 0; + *((uint64_t *)&ctx->block[120]) = htobe64((ctx->nbytes * 8)); + sha512_process_block(ctx); + for (i=0; i<8; i++) { + ctx->chain[i] = htobe64(ctx->chain[i]); + } + memcpy(result, ctx->chain, sizeof(ctx->chain)); + sha512_init(ctx); +} diff --git a/test/bench.c b/test/bench.c new file mode 100644 index 0000000..b54488f --- /dev/null +++ b/test/bench.c @@ -0,0 +1,684 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ + +#include "word.h" + +#include +#include +#include +#include + +#include "p448.h" +#include "ec_point.h" +#include "scalarmul.h" +#include "barrett_field.h" +#include "crandom.h" +#include "goldilocks.h" +#include "sha512.h" + +double now() { + struct timeval tv; + gettimeofday(&tv, NULL); + + return tv.tv_sec + tv.tv_usec/1000000.0; +} + +void p448_randomize( struct crandom_state_t *crand, struct p448_t *a ) { + crandom_generate(crand, (unsigned char *)a, sizeof(*a)); + p448_strong_reduce(a); +} + +void q448_randomize( struct crandom_state_t *crand, word_t sk[448/WORD_BITS] ) { + crandom_generate(crand, (unsigned char *)sk, 448/8); +} + +void p448_print( const char *descr, const struct p448_t *a ) { + p448_t b; + p448_copy(&b, a); + p448_strong_reduce(&b); + int j; + printf("%s = 0x", descr); + for (j=sizeof(*a)/sizeof(a->limb[0])-1; j>=0; j--) { + printf(PRIxWORD58, b.limb[j]); + } + printf("\n"); +} + +void p448_print_full( const char *descr, const struct p448_t *a ) { + int j; + printf("%s = 0x", descr); + for (j=15; j>=0; j--) { + printf("%02" PRIxWORD "_" PRIxWORD58 " ", + a->limb[j]>>28, a->limb[j]&(1<<28)-1); + } + printf("\n"); +} + +void q448_print( const char *descr, const word_t secret[448/WORD_BITS] ) { + int j; + printf("%s = 0x", descr); + for (j=448/WORD_BITS-1; j>=0; j--) { + printf(PRIxWORDfull, secret[j]); + } + printf("\n"); +} + +#ifndef N_TESTS_BASE +#define N_TESTS_BASE 10000 +#endif + +int main(int argc, char **argv) { + (void)argc; + (void)argv; + + struct tw_extensible_t ext; + struct extensible_t exta; + struct tw_niels_t niels; + struct tw_pniels_t pniels; + struct affine_t affine; + struct montgomery_t mb; + struct p448_t a,b,c,d; + + + double when; + int i; + + int nbase = N_TESTS_BASE; + + /* Bad randomness so we can debug. 
*/ + char initial_seed[32]; + for (i=0; i<32; i++) initial_seed[i] = i; + struct crandom_state_t crand; + crandom_init_from_buffer(&crand, initial_seed); + + word_t sk[448/WORD_BITS],tk[448/WORD_BITS]; + q448_randomize(&crand, sk); + + when = now(); + for (i=0; ia isog: %5.1fns\n", when * 1e9 / i); + + when = now(); + for (i=0; ii isog: %5.1fns\n", when * 1e9 / i); + + when = now(); + for (i=0; i +#include + + +int failed_tests, n_tests, failed_this_test, running_a_test; + +void end_test() { + if (!failed_this_test) { + printf("[PASS]\n"); + } + n_tests ++; + running_a_test = 0; +} + +void begin_test(const char *name) { + if (running_a_test) end_test(); + printf("%s...%*s",name,(int)(30-strlen(name)),""); + fflush(stdout); + failed_this_test = 0; + running_a_test = 1; +} + +void youfail() { + if (failed_this_test) return; + failed_this_test = 1; + failed_tests ++; + printf("[FAIL]\n"); +} + +static int +hexchar (char c) { + if (c >= '0' && c <= '9') { + return c - '0'; + } else if (c >= 'a' && c <= 'f') { + return 10 + c - 'a'; + } else if (c >= 'A' && c <= 'F') { + return 10 + c - 'A'; + } else { + return -1; + } +} + +int +hexdecode ( + unsigned char *bytes, + const char *hex, + unsigned int nbytes +) { + if (strlen(hex) != nbytes*2) { + return -1; + } + + unsigned int i; + for (i=0; i=0; j--) { + printf(PRIxWORD58, b.limb[j]); + } + printf("\n"); +} + +void scalar_print ( + const char *descr, + const word_t *scalar, + int nwords +) { + int j; + printf("%s = 0x", descr); + for (j=nwords-1; j>=0; j--) { + printf(PRIxWORDfull, scalar[j]); + } + printf("\n"); +} + +int main(int argc, char **argv) { + (void) argc; + (void) argv; + + n_tests = running_a_test = failed_tests = 0; + begin_test("SHA-512 NIST Monte Carlo"); + test_sha512_monte_carlo(); + + begin_test("EC point operations"); + test_pointops(); + + begin_test("Scalarmul compatibility"); + test_scalarmul_compatibility(); + + begin_test("Scalarmul commutativity"); + test_scalarmul_commutativity(); + + if 
(running_a_test) end_test(); + printf("\n"); + if (failed_tests) { + printf("Failed %d / %d tests.\n", failed_tests, n_tests); + } else { + printf("Passed all %d tests.\n", n_tests); + } + + return failed_tests ? 1 : 0; +} diff --git a/test/test.h b/test/test.h new file mode 100644 index 0000000..5bbdc48 --- /dev/null +++ b/test/test.h @@ -0,0 +1,42 @@ +#ifndef __GOLDILOCKS_TEST_H__ +#define __GOLDILOCKS_TEST_H__ 1 + +#include "word.h" +#include "p448.h" + +int +hexdecode ( + unsigned char *bytes, + const char *hex, + unsigned int nbytes +); + +void +hexprint ( + const char *descr, + const unsigned char *bytes, + unsigned int nbytes +); + +void p448_print ( + const char *descr, + const struct p448_t *a +); + +void scalar_print ( + const char *descr, + const word_t *scalar, + int nwords +); + +void youfail(); + +int test_sha512_monte_carlo(); + +int test_scalarmul_compatibility (); + +int test_scalarmul_commutativity (); + +int test_pointops (); + +#endif // __GOLDILOCKS_TEST_H__ diff --git a/test/test_pointops.c b/test/test_pointops.c new file mode 100644 index 0000000..6dfdab7 --- /dev/null +++ b/test/test_pointops.c @@ -0,0 +1,287 @@ +#include "test.h" + +#include + +#include "ec_point.h" +#include "p448.h" +#include "crandom.h" + + +static void +failprint_ext ( + const struct extensible_t *a +) { + struct p448_t zi, scaled; + p448_print(" x", &a->x); + p448_print(" y", &a->y); + p448_print(" z", &a->z); + p448_inverse(&zi, &a->z); + p448_mul(&scaled, &zi, &a->x); + p448_print(" X", &scaled); + p448_mul(&scaled, &zi, &a->y); + p448_print(" Y", &scaled); + printf("\n"); +} + +static void +failprint_tw_ext ( + const struct tw_extensible_t *a +) { + failprint_ext((const struct extensible_t *)a); +} + +static mask_t +fail_if_different ( + const struct extensible_t *a, + const struct extensible_t *b, + const char *faildescr, + const char *adescr, + const char *bdescr +) { + mask_t succ = eq_extensible(a, b); + + if (!succ) { + youfail(); + printf(" %s\n", faildescr); 
+ + printf("\n %s:\n", adescr); + failprint_ext(a); + + printf("\n %s:\n", bdescr); + failprint_ext(b); + } + + return succ; +} + +static mask_t +validate_ext( + const struct extensible_t *ext, + int evenness, + const char *description +) { + mask_t succ = validate_extensible(ext), succ2; + const char *error = "Point isn't on the curve."; + if (evenness > 0) { + succ2 = is_even_pt(ext); + if (succ &~ succ2) error = "Point isn't even."; + succ &= succ2; + } else if (evenness < 0) { + succ2 = is_even_pt(ext); + if (succ &~ succ2) error = "Point is even but shouldn't be."; + succ &= succ2; + } /* FUTURE: quadness */ + + if (~succ) { + youfail(); + printf(" %s\n", error); + printf(" %s\n", description); + failprint_ext(ext); + } + + return succ; +} + +static mask_t +validate_tw_ext( + const struct tw_extensible_t *ext, + int evenness, + const char *description +) { + mask_t succ = validate_tw_extensible(ext), succ2; + const char *error = "Point isn't on the twisted curve."; + if (evenness > 0) { + succ2 = is_even_tw(ext); + if (succ &~ succ2) error = "Point isn't even."; + succ &= succ2; + } else if (evenness < 0) { + succ2 = is_even_tw(ext); + if (succ &~ succ2) error = "Point is even but shouldn't be."; + succ &= succ2; + } /* FUTURE: quadness */ + + if (~succ) { + youfail(); + printf(" %s\n", error); + printf(" %s\n", description); + failprint_tw_ext(ext); + } + + return succ; +} + +static mask_t +fail_if_different_tw ( + const struct tw_extensible_t *a, + const struct tw_extensible_t *b, + const char *faildescr, + const char *adescr, + const char *bdescr +) { + return fail_if_different( + (const struct extensible_t *)a, (const struct extensible_t *)b, + faildescr,adescr,bdescr + ); +} + +static int +add_double_test ( + const struct affine_t *base1, + const struct affine_t *base2 +) { + mask_t succ = MASK_SUCCESS; + struct extensible_t exb; + struct tw_extensible_t text1, text2, texta, textb; + struct tw_pniels_t pn; + + /* Convert to ext */ + 
convert_affine_to_extensible(&exb, base1); + succ &= validate_ext(&exb,0,"base1"); + twist_and_double(&text1, &exb); + succ &= validate_tw_ext(&text1,2,"iso1"); + convert_affine_to_extensible(&exb, base2); + succ &= validate_ext(&exb,0,"base2"); + twist_and_double(&text2, &exb); + succ &= validate_tw_ext(&text2,2,"iso2"); + + /* a + b == b + a? */ + convert_tw_extensible_to_tw_pniels(&pn, &text1); + copy_tw_extensible(&texta, &text2); + add_tw_pniels_to_tw_extensible(&texta, &pn); + + convert_tw_extensible_to_tw_pniels(&pn, &text2); + copy_tw_extensible(&textb, &text1); + add_tw_pniels_to_tw_extensible(&textb, &pn); + + succ &= fail_if_different_tw(&texta,&textb,"Addition commutativity","a+b","b+a"); + + copy_tw_extensible(&textb, &text2); + add_tw_pniels_to_tw_extensible(&textb, &pn); + copy_tw_extensible(&texta, &text2); + double_tw_extensible(&texta); + + succ &= fail_if_different_tw(&texta,&textb,"Doubling test","2b","b+b"); + + if (~succ) { + printf(" Bases were:\n"); + p448_print(" x1", &base1->x); + p448_print(" y1", &base1->y); + p448_print(" x2", &base2->x); + p448_print(" y2", &base2->y); + } + + return succ ? 0 : -1; +} + +static int +single_twisting_test ( + const struct affine_t *base +) { + struct extensible_t exb, ext, tmpext; + struct tw_extensible_t text, text2; + mask_t succ = MASK_SUCCESS; + + convert_affine_to_extensible(&exb, base); + succ &= validate_ext(&exb,0,"base"); + + /* check: dual . iso = 4 */ + twist_and_double(&text, &exb); + succ &= validate_tw_ext(&text,2,"iso"); + untwist_and_double(&ext, &text); + succ &= validate_ext(&ext,2,"dual.iso"); + + copy_extensible(&tmpext,&exb); + double_extensible(&tmpext); + succ &= validate_ext(&tmpext,1,"2*base"); + + double_extensible(&tmpext); + succ &= validate_ext(&tmpext,2,"4*base"); + + succ &= fail_if_different(&ext,&tmpext,"Isogeny and dual","Dual . 
iso","4*base"); + + /* check: twist and serialize */ + test_only_twist(&text, &exb); + succ &= validate_tw_ext(&text,0,"tot"); + mask_t evt = is_even_tw(&text), evb = is_even_pt(&exb); + if (evt != evb) { + youfail(); + printf(" Different evenness from twist base: %d, twist: %d\n", (int)-evt, (int)-evb); + + succ = 0; + } /* FUTURE: quadness */ + + p448_t sera,serb; + untwist_and_double_and_serialize(&sera,&text); + copy_extensible(&tmpext,&exb); + double_extensible(&tmpext); + serialize_extensible(&serb,&tmpext); + + /* check that their (doubled; FUTURE?) serializations are equal */ + if (~p448_eq(&sera,&serb)) { + youfail(); + printf(" Different serialization from twist + double ()\n"); + p448_print(" t", &sera); + p448_print(" b", &serb); + succ = 0; + } + + untwist_and_double(&ext, &text); + succ &= validate_ext(&tmpext,1,"dual.tot"); + + twist_and_double(&text2, &ext); + succ &= validate_tw_ext(&text2,2,"iso.dual.tot"); + + double_tw_extensible(&text); + succ &= validate_tw_ext(&text,1,"2*tot"); + + double_tw_extensible(&text); + succ &= validate_tw_ext(&text,2,"4*tot"); + + succ &= fail_if_different_tw(&text,&text2,"Dual and isogeny","4*tot","iso.dual.tot"); + + if (~succ) { + printf(" Base was:\n"); + p448_print(" x", &base->x); + p448_print(" y", &base->y); + } + + + return succ ? 0 : -1; +} + +int test_pointops () { + struct affine_t base, pbase; + struct p448_t ser448; + + struct crandom_state_t crand; + crandom_init_from_buffer(&crand, "test_pointops random initializer"); + + int i, ret; + for (i=0; i<1000; i++) { + uint8_t ser[56]; + crandom_generate(&crand, ser, sizeof(ser)); + + /* TODO: we need a p448 generate, which can return random or pathological. 
*/ + mask_t succ = p448_deserialize(&ser448, ser); + if (!succ) { + youfail(); + printf(" Unlikely: fail at p448_deserialize\n"); + return -1; + } + + if (i) { + copy_affine(&pbase, &base); + } + elligator_2s_inject(&base, &ser448); + + if (i) { + ret = add_double_test(&base, &pbase); + if (ret) return ret; + } + + ret = single_twisting_test(&base); + if (ret) return ret; + } + + return 0; +} diff --git a/test/test_scalarmul.c b/test/test_scalarmul.c new file mode 100644 index 0000000..d98cfd8 --- /dev/null +++ b/test/test_scalarmul.c @@ -0,0 +1,289 @@ +#include "test.h" + +#include + +#include "scalarmul.h" +#include "ec_point.h" +#include "p448.h" +#include "crandom.h" + +/* 0 = succeed, 1 = inval, -1 = fail */ +static int +single_scalarmul_compatibility_test ( + const struct p448_t *base, + const word_t *scalar, + int nbits +) { + struct tw_extensible_t text, work; + struct p448_t mont, ct, vl, vt; + + int ret = 0, i; + mask_t succ, succm; + + const struct p448_t + sqrt_d_minus_1 = {{ + U58LE(0xd2e21836749f46), + U58LE(0x888db42b4f0179), + U58LE(0x5a189aabdeea38), + U58LE(0x51e65ca6f14c06), + U58LE(0xa49f7b424d9770), + U58LE(0xdcac4628c5f656), + U58LE(0x49443b8748734a), + U58LE(0x12fec0c0b25b7a) + }}; + + succ = deserialize_and_twist_approx(&text, &sqrt_d_minus_1, base); + + succm = montgomery_ladder(&mont,base,scalar,nbits,1); + + if (succ != succm) { + youfail(); + printf(" Deserialize_and_twist_approx succ=%d, montgomery_ladder succ=%d\n", + (int)-succ, (int)-succm); + printf(" nbits = %d\n", nbits); + p448_print(" base", base); + scalar_print(" scal", scalar, (nbits+WORD_BITS-1)/WORD_BITS); + return -1; + } + + if (!succ) { + return 1; + } + + struct { int n,t,s; } params[] = {{5,5,18},{3,5,30},{4,4,28},{1,2,224}}; + const int nparams = sizeof(params)/sizeof(params[0]); + struct fixed_base_table_t fbt; + struct p448_t fbout[nparams], wout[6]; + memset(&fbt, 0, sizeof(fbt)); + memset(&fbout, 0, sizeof(fbout)); + memset(&wout, 0, sizeof(wout)); + + /* compute 
using combs */ + for (i=0; i +#include + +#include "sha512.h" + + + +static int sha512_monte_carlo_core ( + const char *seed, + const char *checks[100] +) { + struct sha512_ctx_t sha; + sha512_init(&sha); + + unsigned char md0[64],md1[64],md2[64]; + + int ret = hexdecode(md0,seed,64); + if (ret) { + youfail(); + printf(" SHA-512 NIST Monte Carlo validation seed hex decode failure.\n"); + return -1; + } + + int i,j; + + memcpy(md1,md0,sizeof(md1)); + memcpy(md2,md0,sizeof(md1)); + + for (j=0; j<100; j++) { + + for (i=3; i<1003; i++) { + sha512_update(&sha,md0,sizeof(md0)); + sha512_update(&sha,md1,sizeof(md1)); + sha512_update(&sha,md2,sizeof(md2)); + memcpy(md0,md1,sizeof(md1)); + memcpy(md1,md2,sizeof(md1)); + sha512_final(&sha,md2); + } + + ret = hexdecode(md0,checks[j],64); + if (ret) { + youfail(); + printf(" SHA-512 NIST Monte Carlo validation hex decode failure at iteration %d\n", j); + return -1; + } else if (memcmp(md0,md2,sizeof(md2))) { + youfail(); + printf(" SHA-512 NIST Monte Carlo validation failure at iteration %d\n", j); + hexprint(" Expected", md0, 64); + hexprint(" But got ", md2, 64); + return j+1; + } + + memcpy(md0,md2,sizeof(md1)); + memcpy(md1,md2,sizeof(md1)); + } + + return 0; +} + +int test_sha512_monte_carlo() { + const char *seed = + "5c337de5caf35d18ed90b5cddfce001ca1b8ee8602f367e7c24ccca6f893802f" + "b1aca7a3dae32dcd60800a59959bc540d63237876b799229ae71a2526fbc52cd"; + const char *checks[100] = { + "ada69add0071b794463c8806a177326735fa624b68ab7bcab2388b9276c036e4" + "eaaff87333e83c81c0bca0359d4aeebcbcfd314c0630e0c2af68c1fb19cc470e", + "ef219b37c24ae507a2b2b26d1add51b31fb5327eb8c3b19b882fe38049433dbe" + "ccd63b3d5b99ba2398920bcefb8aca98cd28a1ee5d2aaf139ce58a15d71b06b4", + "c3d5087a62db0e5c6f5755c417f69037308cbce0e54519ea5be8171496cc6d18" + "023ba15768153cfd74c7e7dc103227e9eed4b0f82233362b2a7b1a2cbcda9daf", + "bb3a58f71148116e377505461d65d6c89906481fedfbcfe481b7aa8ceb977d25" + 
"2b3fe21bfff6e7fbf7575ceecf5936bd635e1cf52698c36ef6908ddbd5b6ae05", + "b68f0cd2d63566b3934a50666dec6d62ca1db98e49d7733084c1f86d91a8a08c" + "756fa7ece815e20930dd7cb66351bad8c087c2f94e8757cb98e7f4b86b21a8a8", + "937d7856a82a84c163c79417d0540c47daaf9ffe662c843737dbbcbe5f865bf6" + "f47a9d2bd10129a4f498073094653c324a2519a1c71ac1279b1623ff7d24647a", + "f8fbc058c2b9f84131c9decfa543a35ade41581f670398efd61b3abfced9c1cf" + "cb5324f2370487f9c59a65bc668ea596c8d22ce8a33014dfad28357fa7d05f04", + "4ab0c9484ff5c30fa64ae6e81510c5fea566eafb88f175f8bc19109f40fe8001" + "4c8b77fff10b8750778429bf3c5497e4cb92d9b30014f4cb975dff2a45244c28", + "685179397554d276513d630234a03419808c698abf2600d7490aabb8e455c6ab" + "6ea412c7729dc140a79dff66533c6946cbe90f9da9ed16e2e629db1651bea870", + "335e6e941ab7dadfecdb74ea6cb4e8584b6e3408841a33a6cf7fd6a63294b193" + "0a60983240311672acac3840a90e64cc366ce75081b2252627e9c31197ebad03", + "e3217f6af6e279e9445dc3738cbf9ba0e9edba0455844a73648139777afdea2c" + "4d8032e214f541bf92675fb23f24df8e4fe98e0003aadfb6d8f9cc2cd799bbf7", + "ee2fdfb3ae630613b7d890977cf2515deac272a37f27e4a01961ecf103d4ff5b" + "45cc8aef53b635dd75aa51aabf71c0642555ccd3281e0388f8ca09d83258cf30", + "6a30d97cc98af6a25b673dce7aeab8d762bf2e55ea0c6dc899179281f84dd02a" + "2896f77e9c106b472f55f7adbef7b1157be567ee1236ebdac2a3c5d8cb133eb5", + "ac1176abdc5f71170183d92ae55856221b0d95590af11d9d72ba605ec026bbec" + "52d6974bc43a1efb125ff2b161fbdc616fda00f04193a0bc26aacdfa052a5741", + "59fa909480620ecc08d34531a6da1b55158b74fc93ddf68e1d242615b6f3843a" + "7952e63e798c6445cde1b07e0be09d0d711cb7b42a0e7760a593b08acfceb63d", + "9eb253319efa61b864f27bd334d7dd78b38d3265fb544e0c8edee950a547e1d8" + "db921a285774ab94d66beae933298d20f2a5aa87c62fe1e383cc3b18e7af18ac", + "81735324005671f7bdad9e685ee8257f5e0622b9fcb5d38dbdfb2df27258c3e1" + "d46d76e24c0c92c744e1b50a2b4b0d31525b3af83cc80a75722d921bdeef59c4", + "17498cdff4323bb8021e44eca6559e05d8ff9a0ef2ee9d4ba0ac6e73f83972a0" + 
"dfbb6d47728fa70311d7c82e154966e1b7678263b0f65133e9116969193d429b", + "228c4574d7c45eb9ba9240722133fce74abe00c7328ab30b4bde373dc79afdd6" + "e0569d36268cd5eaa2f27205fc00512577bcbb6699e1d66ed85eafaba7548afb", + "3d40ccd9cc445bbecca9227c67fe455d89e0b7c1c858d32f30e2b544ca9a5a60" + "6535aea2e59fec6ec4d1ba898cc4338c6eadef9c0884bcf56aca2f481a2d7d3e", + "e1e577aeac92e3a2b7f8a262bf2ac9c037d2274ca6618fbe4cc21db7c699e994" + "6b6671ae45ea433a1e392a5bc9eec96fd641ba8f4a047f022a04a337227004df", + "5e4424c0bcb2f0f7a2428821a9d5840a82401f4440ae6bed25c53cd9e71cf9d3" + "9904d6a375bd721f4332ab0202529c91feb9c094c3e6d34ca4f66649ee6fa212", + "56b199d63ca37189d5ca0d40006ac7bcb9f39cbdc00ef7b8a5697caa7d81d05b" + "645a146995b1151d01958f1589337e14afc6e7dd10a815170e527a398e6ce8c3", + "d2d498ff93fb03013a64f295b5bc68e57d2fb5600da578aa011d43ff432eae3e" + "0c800f9e2a53155e56fdbf5e068fe2b4beb3e42b2585531b8b16c4d8ca3356c6", + "3d3875489903710f17cf4247b5842ace6f017b1a3b99e9ee5fbc04fc7898e78b" + "12693879878028ca40c63cd0f6925fb7d0ca0412e4f06619e3ace223690f03b8", + "a013e21cd1234483c95c2ea2757be949bc79401ba39b09c316a1612d594642be" + "65ca106e12695ac3808c57c6f2980e895fd1fe188946562afc238414e1e43649", + "c5f6367d7195489e16242f912fbe0d8002e947de3a7e9c53f77b1e5e90e05bd7" + "ca395e787e34cb5f500c02da59c9d83de35601de7ae80dae74a0d6b4a292d43b", + "7c28c44c6aaba83c122f24d68273e28a5afd65b4071d02b7ea3300478d511897" + "1e1356ae57cbc70d2a177ea464a1c2c50d4297b933e789c63b1481797ae8f08c", + "af7cb42b1c70a85ac1ae1c2991b25b657c19f4fcf83af7f7dc0ae1028c1452a6" + "a17dc98929634fe6ed3855b70b96bc2caa93d82037b94ebeddc77e4c1a7cc563", + "bd56ad4c0cbd162706053da929d667253aadcf417affb483fff4f2699bf406d1" + "28cfdf5196dfbb05bb89ccbf04c5147bd2ebb3156b0bc1768ca6faa171c91c01", + "004d7b0fff9bcddf4b3913ae190a76728705a3d23874d92a8b7ff246c8fcad46" + "623cb04723c8aded0cba4968d1a8cc1375b99005786c1bcb7ae4bf13325c3ae0", + "8299a5bf5ed64f525c4eebbeca969fc1b91a81adb58c584bdd2d7676386a31fa" + 
"546643a3cf505007584f02fb712d708cab645bf078a1b9339f5a76aee985d017", + "ce7100f3455db1a9776a9f40d562ea998afca1f9fee7e0d81c8db34cf68ad23a" + "8bfa6fc04774703e1e56d5196b66966158fcf2a8335a58c6ba7ba1af756ba1dc", + "90aaabcb655ee921b8350229efe6064a60051cf0cac858fa3d43afd5b97cc823" + "01bd1b8cc1f874022e5af948185638783a13ca1bbd5049ace7fbf4f6d90c201f", + "3cf0a25b33ded3e0806dfe603b9987f1d6f2b3fdcb1ec7f8566828c00e17e8f5" + "9e38b3bca302396c7525ca194e6cc8501369059e2e34ae21e3141215876847c4", + "bdc5266aee339a1ff13fcf5229773cd3d14b47101e83076927c160bb71bf7445" + "590525a2012d52af008e118e16df1b6bfcaf8f22b4e45f9e749f3c20625a2bc8", + "ef8d2ba885381ab97756d59dbbbf53a1ea35d152b2d8f82c3518430aa34e7083" + "59194ea43950d032e151f576d343a5c3cfe6b71d4ed0ead9d3a107402589bad0", + "194ea5324c4179998dd7057755f255fdea04dadf533f7851e3e9718b610948e3" + "2fd28323077d9421142ac808978adfa325b668c8599a2e01c757a5a14ed2dd37", + "106984d2f0087e621dae760552bc6279072267883c204079481af6034354f1a2" + "b77c17e6c039a1063e479342aa3ccd90330dd3fb5a7d5e976619497e2d3326cd", + "a1347216f1a6db47b90c4ded3c5c75440f54c22c87d538314d1340f86f88acba" + "01378acb933ddad0adc6b75d55bfb7e8efc9c4a531b2a410610b7515b6dac66a", + "b76e4db147e0eaa4f04880654088b9d0fce518c8c377d92c846345604dc6b2b1" + "8d377fdb8e30f06d9bcfe6d7dacc07d6adff73d98d49f8f132b80f3084390830", + "acd4e527763dfd4513f0def0b1edf8ea12dc78d336b7b796f3dcc32e10687254" + "43a2f55ab4f666b27d6bf2ab39669c98293f0a9108051fd3144d31a1ed171ddd", + "10128c15494bc87a87374f676ef9fe2df20b36ffcca41a80bd40b216637b3de7" + "10efd070e277827820a7bba3cceb7b21f8fe7f9775d6c4df4d3da5349434ec49", + "2632dd5c188c6ed3a4610405fdda704add752f5424d9de65a51400fe478e26cd" + "0412e5f91ca4b744c34f4954f40a3a4254431d21954623208b527b7b4daa687e", + "45707f5b6fc5ccd1f78d77f177d10fb8b462c74cc821518cd5cfa4b5d6b40b41" + "8044900693c37abbb82367d340fec67f800d74072935da1706b4d90ae26099c7", + "56c37f31220b5b3040373d91b2c5e42fe9e601a12f7f8dc4534459bf28e484b8" + 
"713db243c5782c031e674003a3c14c42fd152e7188789065e82795e10f87d54b", + "5da94c899d48bd8299fee3d81662f8d6c5f8f8bc54d18cb0368b13cebaee7ad7" + "1e74ea80f34974ad166f04f9a0602809166fe4085a475a8ca86cade12b6754c4", + "0664363f97ba910760b0922e31ca880ca97469506cb007e3108c36c3ce3ce180" + "1fb4197609479339e8820632b6a38bffffee05a9adc11cc544b9aa6f5b95cc6f", + "732c41a1edaa727c04f627ff158aaff67c18efd667216132b99ab84d108996a1" + "0bb008b5d803b22ed1aa78bb0d10f8a762fd34777d7dccce8e84827ba88d4193", + "fc9c21d67e393a2b05a23a17d8db630cbaebaa3def211181749f1bcad1815606" + "27fb60ee20fae2e5980cbf50fce0a19dce807e7fb75c4da0ef008bc75d413a65", + "0453b765afc1edffa595efe345177f5805ed3abc1297ceab757ae7161723a614" + "4cb543299f418049276d16b7896662631634fab9549127c10f27505b7dee8665", + "3853f3bf024e0668e8d1ea53733a97537f97d9307c5f3a19864ab4eeb1654710" + "693bb961a344dec8a758f5e64b26fcb6dd423419c4a114fa749211a9de06c281", + "240137f0dd57beb3f7fc283bb3ead423c67883fd46f4e27471d7be57ad469a49" + "bad03a3658418bd55614678f3a463bceff85291314b90ef43ccbcb028f0a7a07", + "f9050a5271edbe4cfdb9520ec05bbdc3cbcb9bce36fd212338d3e7028a39b9ab" + "30793e561d75a2e424193264c7f0775e65599ef0c94e0ad24dbfe18252364267", + "47caa7a5862fad837aaa409a4a9df2575e645528c35159115911b7c4e2f08ae4" + "9d68de97249b31b83ce2c163f649cad4559dc6e6a7191f2922d79a5fd6af167b", + "13f5825c41fa49edf6104e3e35c9c224eba93e37374f730004c39c54e7391e4a" + "847fd61865235a3fe32224c96fbe86f7e14c3d5df496e83ec989a71b4f293a44", + "e5b55e05efe1ca6b9a96a57e3a1523d610d70f837e93b31fa98c2736d3e114d2" + "38d46ec6b6e3d19e774b253f6b0c7a2ebe69b7e60fc0874444806b2a2278df45", + "f14a586ac30f0af255f597a9aef9abba5e99c04d17b01f24427c4ee2c196b52a" + "cb1ceefc9b15cb822b3ecffdc2f7c49e11d3fc0769acee33361537d379c62e0c", + "7e2d3398807195c48e6ec52d20710bbf8b21ea8de4d1abc197897ccc58aeff40" + "259edc67270cdae0edcc686c0d0dccc5760c1495ab1cf48482dc2000ae2d42ad", + "2f3d5c5f990bf615d5e8b396ccbd0337da39fad09b059f955a431db76a9dc720" + 
"dffc4e02c0be397c7e0463799cd75fd6ab7c52bec66c8df5ef0d47e14a4c5927", + "483a1764d308cc494a2b543d29ba616483aefdf91c7769fd084eedaac1add189" + "1df95d317a47430b2bf73e4081f86597020e28afe2d34a22b77ea62b6112d09a", + "bfa88691ec951511651c6f14af100eeb26d87729e18ac3ef49a80d73ffeaeea5" + "3e97c4a7277a7ee9f2fba070b1c9720d6cdba407dd82267019e3f0f5662b2f2b", + "4c17c8e2e7132dbf82afebc40efc77926d16f4d2c082d846dac28733aa767e28" + "40ebf04f2563df75933466a36e11968d342e4157827605d04d9627ce9b5216c8", + "70bbfc29a2a765220af84e7bb10d759a3152ad4b5643ef6b89966950ec7ef950" + "3d57bc0a28c4ee789a60bf9dcac59139e15241d73b990410cf92eff213da9eca", + "8d1d56f37fc19b84984a6fa33aa9c2dbdbf79a29c04ad0b4cf20333e6bec9434" + "47be2416242f8cd2f9732e79bb925cc5a61a80c5fc9c079961243fd1c1f5900e", + "492fd0171f4dcd5d20ea6c0d34b5576c8894664ae5955e6737f5e3b711c2804d" + "99ccca065b7ec18c82da98b18a3029b765c51ebc7c433b36492e0ed6b8511bb6", + "7f49e8e54db7e5b4323cae2db71f3e8b8eba172dcad3602e9b7b058007a55893" + "58732d5afffa56072a46e89b1ea27ef8d556deb86b569c635d394f15d99d8a15", + "56884a6a9210d5f371e25823efb2511a9c410c26a441e07c1bdffe8605084267" + "d49c315baf6a692d7d97844b2714b4930877a5d7f52cf6fa151700fcb6980546", + "6aaef8284eef221ecb17ea3c9596f075b5155fe7b925d737ed3c6543c761c28c" + "7cd9d9d4b5e2a37b2f183a2a367bbd34b633497bc7a1737d61c8c1f3ef295062", + "38ef178f5688e59d47c375252db7b39f40c0c84169878ee7ba5086e4b25fea81" + "076b9c37847e9e6bf24ae0b343689c265ec5ca7469e619acd61b0276721efb1b", + "e3fe1aabad120777cf24eaae289b486632ca46ceb89afae73dbae5fa87c76787" + "9369355a9cc5c21ca604ed91d0f2f58c466573f3e6d88e52c62c0d3cb188e141", + "82f5bd920457bb2763a0da031a7fed47b236951b1ea420c20fd2b6de1dbfbb9c" + "4600ea7092788493e2d4be6ee24b6dba04e57af3e8f2f14d9837295420ac7631", + "6d0b26208ba9b1615067bb3ff97b292fe67e4c02d240d649c32370e0a4cd22d0" + "3bdf864be4d24a3f5f51aeccfd1afd5191e590edeb5f7bec323b0506c3104b89", + "d081083158054d08371ec84f4d3aa5aa761734ac6091a30330a861fda056f835" + 
"c750bf4f7981af1693ff28545366bd05cec47bccd77a7d237befb0135c534138", + "6ba8b52780b8a07a2a2015dd8f0c5e7437b8e024c4ee428f7ba91dfea118cb72" + "a939872550983317132b841b7cbc29a22b8f1cfea0c55203cafc69b55ed6244a", + "312692b0a51f002b7f06d05b39d15a5637dbddd2f4f1a73e6c88a4c841cdba5c" + "d8e69c0939ab39bb1a9c54fa35402143c97edb9704a0e9e1a98701710f6a5dad", + "aaee960de201a8dcccff95b834fccf0dafc03fe6cffc0429162bf4aff01165ab" + "07a0c9435e9cb412121b7ba010657ccc3152118602b665072136317d92fd4262", + "21fdff552e08c86c07f080cefacaaaf31846eb893bfe2e4f88c3c3cd8cbf592a" + "84500942695a5e5ae971ab343ce2695dd1baeb1f94dd4b53d678e14265e421ae", + "ca8f1a5b2172f6adb474da53b35e3f73ffd88263d3eecde72e48b16e1a065801" + "5b555ee319005a1d82802e91431ee777610f9b1028d819921e1044ad426b0270", + "ce5ab25eff9c1ddc569a1eaaa66b689109ee269db7066e0b02d39b3564fd14ca" + "6249987b7791e203d3d7c2ebf18558d2f23f94c03dd1d03aa63849e4d2889a76", + "a6f8b0561000dd4ae8b828c5f676e8c1a6474c4a042a645f1815bd52e9ff53c9" + "7dc36d5d8997f8ce332185feead76267f5b2e63f597fb3345ca0046e58fc0f24", + "fec86794bad4106c5ad1c1a2d9a1b7aae480396ec231eb5cac21c4077d17a0b6" + "52da0037363399a5a1dababa4a40e4c54b9124167580dee9108c4dbb24c57512", + "594f5dd3f4c87bdc0d81309386e9163a9718e34c7b0dcb4613f8487aa786f9d2" + "11cfb61bb247fa9f5ecef042e710f192850f5571807294bfd8a54397850e5773", + "d81ad866f25ef6a0a6431d267114da564513e5ebdcf48db7e95db8cf32a89f0a" + "b107874d796035db97420ffcf1db5f04dc1a52ddbbb960fc63b7f3f835cc8be6", + "431d537e098e9949f6a68108d55d20952e3bfcdeb7273bac3917e37790a84fa5" + "db04c33a79c113a06cf333e831d7702a00853a93fd0aa5146d934f4f71242a6a", + "4ed95636c6885ae4e63d042e82f4da830c702dbf3b9746d64770a64dd666b332" + "08315f3a947c4dff790771ef283788a9c74da83e22b97f750286a820ee46698c", + "a9bcb60b4d7724cdddddbc232b4ac70b94d0d7e9f0724b1222d918930cbb9bdb" + "b04b3ad43e3c8caf3bf8b004ee4aec6bd527ff8eb6189b44827f7ba7057f6a90", + "d6d5e44d5bb07fc4144ab6ab309f048968f73f7992beb326047e9e2cd7af6240" + 
"bc8abf46703c32fdb58fb2a8672594a660ef855be74f24cec09d4fb00219de82", + "dfda9ac0c7147530da97715ccf47814182255f2f2cf40287db97a4c63b43fcd3" + "9e6d41e560921492badb253a7dea0aba863c7c33b912bb59d1ff4de03a4f03bb", + "0395faaaf2e907f27779d6f1cc9c9db68ec390a38fbb0702c6475b46f7a39949" + "8d46fd8014f834b131e1e83abba0359b1f16d8fc0a393580615def2ad0caba73", + "41cb98f09029abe85d24a0f131f116c7f69f54f7e91c250642606512bf3da4ca" + "89ba70a4714a5f66d9ae81ff09317dadaff12a02057074c970f0f02a52bfafd2", + "8e8f161d48e306c5533ed614b8ef3a1979df6db7e13d0780a73c4a3980ddf0a9" + "5f93941d412c93683e39915a660c3fbec0dbb1bb6beea2e2099cd968011535c0", + "789593f0b8fb83ef9b3ec50ab8f6e1e47344f763d4f7ceab5600989e7b6fd5fe" + "f6ee5e487975f64474af6cd71ae4d9ecce8f009edea0227c7ebe73080b8f961b", + "f37e1449e0b313d9537a6177f7a31158d353e5b79c781facf02526ec94e0c6cf" + "da37105bac67098b194ea82efb307c2929a9ab8aca0e76c53e829e3f901cd245", + "2e74e745caaf2d449ab3b031dd214b48616853a512cf2e95c40cb8e7594fe5e4" + "879ac8a26d02eb35b3b96a5c9e7dcae3e15fd050a0bcc1fb3b9cb9c4df0fad3e", + "6eac7069c26082e52574ca6a58abb9b1b9faf452e8cca9f1c7023679ce192ca5" + "54892f30e38104d39088a24df35612444a0fc90084af7535fd9344fa51dded84", + "ada6caf30c4f6e3644d952366e01519af6771b406e2c447552f0c597b8dd10e9" + "e9b4e699c9a835de03f422be8980538d9786172dfd2fe511db272a1543d5aa35", + "4d4b0086b2cb05d713f2805caa7e6605c8f7dbbb2e0f92aa159aebdcd6306030" + "5f47b748f1bca6e0b6e11cf8f9697fcccb6584b878c4b54a699290728a40aa1b", + "97420b8a0ad102aeb92139da2c052d2748dd7d2dbb93a9ea79dc15b520d0ca7c" + "ab8cb7a00f5b5aebcb49d7e7f52a27180935ce617aeecdecba04064c668edd37", + "4aa7dad74eb51d09a6ae7735c4b795b078f51c314f14f42a0d63071e13bdc5fd" + "9f51612e77b36d44567502a3b5eb66c609ec017e51d8df93e58d1a44f3c1e375" + }; + + return sha512_monte_carlo_core(seed, checks); +}