New release.

Rework the directory structure into something saner, with src/, test/, include/, and build/. Beginning some tests. Also, now support scan-build. Now support 32-bit, including vectorless ARM. NEON is not yet supported, because I don't have a test machine. Many internal changes, improvements, and bug fixes.
11 years ago · 1eab9a3a08
--- a/+ 2
+++ b/+ 2
@@ -508,7 +508,7 @@ HIDE_SCOPE_NAMES       = NO
 # the files that are included by a file in the documentation of that file.
 # The default value is: YES.

 SHOW_INCLUDE_FILES     = YES
 SHOW_INCLUDE_FILES     = NO

 # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
 # grouped member an include statement to the documentation, telling the reader
@@ -777,7 +777,7 @@ FILE_PATTERNS          =
 # be searched for input files as well.
 # The default value is: NO.

 RECURSIVE              = NO
 RECURSIVE              = YES

 # The EXCLUDE tag can be used to specify files and/or directories that should be
 # excluded from the INPUT source files. This way you can easily exclude a
--- a/HISTORY.txt
+++ b/HISTORY.txt
@@ -1,3 +1,51 @@
 March 29, 2014:
    Added a test directory with various tests.  Currently testing SHA512 Monte
    Carlo, compatibility of the different scalarmul functions, and some
    identities on EC point ops.  Began moving these tests out of benchmarker.
    
    Added scan-build support.
    
    Improved some internal interfaces.  Made a structure for Barrett primes
    instead of passing parameters individually.  Moved some field operations
    to places that make more sense, eg Barrett serialize and deserialize.  The
    deserialize operation now checks that its argument is in [0,q).
    
    Added more documentation.
    
    Changed the names of a bunch of functions.  Still not entirely consistent,
    but getting more so.
    
    Some minor speed improvements.  For example, multiply is now a couple cycles
    faster.
    
    Added a hackish attempt at thread-safety and initialization sanity checking
    in the Goldilocks top-level routines.
    
    Fixed some vector alignment bugs.  Compiling with -O0 should now work.
    
    Slightly simplified recode_wnaf.

    Add a config.h file for future configuration.  EXPERIMENT flags moved here.
    
    I've decided against major changes to SHA512 for the moment.  They add speed
    but also significantly bloat the code, which is going to hurt L1 cache
    performance.  Perhaps we should link to OpenSSL if a faster SHA512 is desired.
    
    Reorganize the source tree into src, test; factor arch stuff into src/arch_*.
    
    Make most of the code 32-bit clean.  There's now a 32-bit generic and 32-bit
    vectorless ARM version.  No NEON version yet because I don't have a test
    machine (could use my phone in a pinch I guess?).  The 32-bit version still
    isn't heavily optimized, but on ARM it's using a nicely reworked signed/phi-adic
    multiplier.  The squaring is also based on this, but could really stand some
    improvement.
    
    When passed an even exponent (or extra doubles), the Montgomery ladder should
    now accept points if and only if they lie on the curve.  This needs
    additional testing, but it passes the zero bit exponent test.
    
    On 32-bit, use 8x4x14 instead of 5x5x18 table organization.  Probably there's
    a better heuristic.

 March 5, 2014:
    First revision.
--- a/+ 64
+++ b/+ 64
@@ -2,61 +2,101 @@
 # Released under the MIT License.  See LICENSE.txt for license information.

 CC = clang
 CFLAGS = -O3 -std=c99 -pedantic -Wall -Wextra -Werror  \
  -mssse3 -maes -mavx2 -DMUST_HAVE_AVX2 -mbmi2 \
  -ffunction-sections -fdata-sections -fomit-frame-pointer -fPIC \
  -DEXPERIMENT_ECDH_OBLITERATE_CT=1 -DEXPERIMENT_ECDH_STIR_IN_PUBKEYS=1
 LD = clang

 .PHONY: clean all runbench todo doc
 # Architecture-specific source directory; used below as src/$(ARCH) for both
 # the include path and the arch-specific compile rule.
 ARCH = arch_x86_64

 # Flag groups.  They are kept separate so that CFLAGS, LDFLAGS and ASFLAGS
 # below can each pick up only the groups they need.
 WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
 	-Wgcc-compat -Wmissing-declarations
 INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH)
 LANGFLAGS = -std=c99
 GENFLAGS = -ffunction-sections -fdata-sections -fomit-frame-pointer -fPIC
 OFLAGS = -O3
 # XFLAGS is an optional extra-flags hook; it is empty unless the line below is
 # uncommented or XFLAGS is supplied on the make command line.
 #XFLAGS = -DN_TESTS_BASE=1000
 # Exactly one ARCHFLAGS line should be active; the commented one is the
 # alternative 32-bit ARM setting.
 ARCHFLAGS = -mssse3 -maes -mavx2 -DMUST_HAVE_AVX2 -mbmi2
 #ARCHFLAGS = -m32 -mcpu=cortex-a9 -mfpu=vfpv3-d16

 # Compile flags are the union of all groups; the linker and the assembler
 # step only receive the architecture flags.
 CFLAGS = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XFLAGS)
 LDFLAGS = $(ARCHFLAGS)
 ASFLAGS = $(ARCHFLAGS)

 .PHONY: clean all test bench todo doc lib
 .PRECIOUS: build/%.s
 	

 # Prerequisites shared by every generated build/%.s file: the Makefile itself,
 # every header under the tree, and the stamp that guarantees build/ exists.
 # ':=' expands the $(shell find ...) once, when this line is parsed; with '='
 # the find would be re-run every time $(HEADERS) is expanded (once per rule
 # that lists it).
 HEADERS := Makefile $(shell find . -name "*.h") build/timestamp

 LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \
  build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o

 all: bench
 TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \
 	build/test_pointops.o

 BENCHCOMPONENTS=build/bench.o

 all: lib build/test build/bench

 # Run the clang static analyzer over a from-scratch build of the benchmark,
 # the tests and the shared library.
 #
 # 'scan' names a command, not a file, so it is declared phony here; otherwise
 # a stray file called "scan" would make this target look up to date.
 # The nested build is started through $(MAKE) rather than a literal 'make' so
 # that the same make binary and the command-line flags/variables of the outer
 # invocation are propagated to the inner one.
 .PHONY: scan
 scan: clean
 	scan-build --use-analyzer=`which clang` \
 		 -enable-checker deadcode -enable-checker llvm \
 		 -enable-checker osx -enable-checker security -enable-checker unix \
 		$(MAKE) build/bench build/test build/goldilocks.so

 # Link the benchmark executable from the library objects plus the benchmark
 # objects.  $^ is the full prerequisite list, $@ is build/bench.
 build/bench: $(LIBCOMPONENTS) $(BENCHCOMPONENTS)
 	$(LD) $(LDFLAGS) -o $@ $^

 # Link the test executable from the library objects plus the test objects.
 # $^ is the full prerequisite list, $@ is build/test.
 build/test: $(LIBCOMPONENTS) $(TESTCOMPONENTS)
 	$(LD) $(LDFLAGS) -o $@ $^

 lib: build/goldilocks.so

 # Shared library: remove any stale output first, then let libtool build a
 # dynamic library from the library objects, exporting only the symbols named
 # in src/exported.sym.  The libtool options used here are the macOS ones.
 build/goldilocks.so: $(LIBCOMPONENTS)
 	$(RM) $@
 	libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \
 		  -exported_symbols_list src/exported.sym \
 		  $^

 bench: *.h *.c
 	$(CC) $(CFLAGS) -o $@ *.c
 	
 # Stamp file whose only job is to make sure the build/ directory exists.
 # It is listed in $(HEADERS), so every compile rule depends on it.
 build/timestamp:
 	mkdir -p build
 	touch $@

 build/%.o: build/%.s
 	$(CC) -c -o $@ $<
 	$(CC) $(ASFLAGS) -c -o $@ $<

 build/%.s: %.c $(HEADERS)
 build/%.s: src/%.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $<

 build/goldilocks.so: $(LIBCOMPONENTS)
 	rm -f $@
 	libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \
 		  -exported_symbols_list exported.sym \
 		  $(LIBCOMPONENTS)
 # Compile a test source (test/NAME.c) to assembly in build/NAME.s.
 build/%.s: test/%.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $<

 # Compile an architecture-specific source (src/$(ARCH)/NAME.c) to assembly in
 # build/NAME.s.  All sources land in the same flat build/ directory, so file
 # names must be unique across src/, test/ and src/$(ARCH)/.
 build/%.s: src/$(ARCH)/%.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $<

 # Stamp file whose only job is to make sure the doc/ output directory exists
 # before the documentation is generated.
 doc/timestamp:
 	mkdir -p doc
 	touch $@

 doc: Doxyfile doc/timestamp *.c *.h
 doc: Doxyfile doc/timestamp src/*.c src/include/*.h src/$(ARCH)/*.c src/$(ARCH)/*.h
 	doxygen

 todo::
 	@egrep --color=auto -w -i 'hack|todo|fixme|bug|xxx|perf|future|remove' *.h *.c
 	@(find * -name '*.h'; find * -name '*.c') | xargs egrep --color=auto -w \
 		'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE'
 	@echo '============================='
 	@(for i in FIXME BUG XXX TODO HACK PERF FUTURE REMOVE; do \
 	  egrep -w -i $$i *.h *.c > /dev/null || continue; \
 	  (find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i > /dev/null || continue; \
 	  /bin/echo -n $$i'       ' | head -c 10; \
 	  egrep -w -i $$i *.h *.c | wc -l; \
 	  (find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i| wc -l; \
 	done)
 	@echo '============================='
 	@echo -n 'Total     '
 	@egrep -w -i 'hack|todo|fixme|bug|xxx|perf|future|remove' *.h *.c | wc -l
 	@(find * -name '*.h'; find * -name '*.c') | xargs egrep -w \
 		'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE' | wc -l

 bench: build/bench
 	./$<

 runbench: bench
 test: build/test
 	./$<

 clean:
 	rm -fr build bench *.o *.s
 	rm -fr build doc
--- a/TODO.txt
+++ b/TODO.txt
@@ -23,7 +23,7 @@ Important work items for Ed448-Goldilocks:
    * Word_t, mask_t, bigregister_t, etc.
    * Generate asm intrinsics with a script?

 * Bugfix: make sure that init() and randomization are thread-safe.
 * [DONE] Bugfix: make sure that init() and randomization are thread-safe.

 * Security: check on deserialization that points are < p.
    * Check also that they're nonzero or otherwise non-pathological?
@@ -80,30 +80,29 @@ Important work items for Ed448-Goldilocks:
 * Portability: make the inner layers of the code 32-bit clean.
    * Write new versions of the field code.
        * 28-bit limbs give less headroom for carries.
        * NEON and vectorless ARM.
        * Now have a vectorless ARM version; need NEON.
        * Improve speed of 32-bit field code.
    
    * Run through the SAGE tool to generate new bias & bound.

 * Portability: make the outer layers of the code 32-bit clean.
    * There are endian bugs in the signing algorithm.
    * NEON and vectorless constant-time comparison.
 * [DONE] Portability: make the outer layers of the code 32-bit clean.

 * Performance: write and incorporate some extra routines
    * Deserialize_and_isogeny
    * Unconditional negate (or just plain subtract)

 * Performance: fixed parameters?
 * Performance/flexibility: decide which parameters should be hard-coded.
    * Perhaps useful for comb precomputation.

 * Performance: Improve SHA512.
    * Improve portability.
    * [DONE?] Improve portability.
    * Improve speed.
        * Except not, because this adds too much code size.
        * Link OpenSSL if a fast SHA is desired.

 * Protocol:
    * Decide what things to stir into hashes for various functions.
    
 * Performance: improve the Barrett field code.
    * Support other primes?
    * Capture prime shape into a struct instead of passing 3 params.
    * Make 32-bit clean.  (SAGE?)
    * [DONE] Make 32-bit clean.

 * Automation:
    * Improve the SAGE tool to cover more cases
@@ -111,6 +110,10 @@ Important work items for Ed448-Goldilocks:
        * Constant-time selection
        * Intrinsics code
        * Field code?
    
    * SAGE tool is impossibly slow on 32-bit
         * Currently stuck on Elligator after 19 hours.
         * [FIXED] at least for now.
        
    * Vector-mul-chains
    * Negation "bubble pushing" optimization
--- a/include/goldilocks.h
+++ b/include/goldilocks.h
@@ -0,0 +1,210 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 /**
 * @file goldilocks.h
 * @author Mike Hamburg
 * @brief Goldilocks high-level functions.
 */
 #ifndef __GOLDILOCKS_H__
 #define __GOLDILOCKS_H__ 1

 #include <stdint.h>

 /**
 * @brief Serialized form of a Goldilocks public key.
 *
 * The key is carried as a fixed-size buffer of 56 bytes.  The field is named
 * "opaque": callers are expected to pass the structure around as-is rather
 * than interpret individual bytes.
 *
 * @warning This isn't even my final form!
 */
 struct goldilocks_public_key_t {
    uint8_t opaque[56]; /**< Serialized data. */
 };

 /**
 * @brief Serialized form of a Goldilocks private key.
 *
 * Contains 56 bytes of actual private key, 56 bytes of
 * public key, and 32 bytes of symmetric key for randomization
 * (56 + 56 + 32 = 144 bytes, the size of the buffer below).
 *
 * @warning This isn't even my final form!
 */
 struct goldilocks_private_key_t {
    uint8_t opaque[144]; /**< Serialized data. */
 };

 #ifdef __cplusplus
 extern "C" {
 #endif

 /*
 * Return codes of the goldilocks_* functions.  GOLDI_EOK is zero; every error
 * has its own distinct nonzero value so that callers can tell them apart.
 */

 /** @brief No error. */
 static const int GOLDI_EOK      = 0;

 /** @brief Error: your key or other state is corrupt. */
 static const int GOLDI_ECORRUPT = 44801;

 /** @brief Error: other party's key is corrupt. */
 static const int GOLDI_EINVAL   = 44802;

 /** @brief Error: not enough entropy. */
 static const int GOLDI_ENODICE  = 44804;

 /** @brief Error: you need to initialize the library first. */
 static const int GOLDI_EUNINIT  = 44805;

 /**
 * @brief Error: called init() but we are already initialized.
 *
 * This used to share the value 44805 with GOLDI_EUNINIT, which made
 * "not initialized" and "already initialized" indistinguishable to callers.
 */
 static const int GOLDI_EALREADYINIT  = 44806;

 /**
 * @brief Initialize Goldilocks' precomputed tables and
 * random number generator.  This function must be called before
 * any of the other Goldilocks routines (except
 * goldilocks_shared_secret in the current version) and should be
 * called only once per process.
 *
 * There is currently no way to tear down this state.  It is possible
 * that a future version of this library will not require this function.
 *
 * The function takes no arguments.  It is declared with an explicit
 * (void) parameter list so that, in C, this is a real prototype and a call
 * that passes arguments is rejected at compile time.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_EALREADYINIT Already initialized.
 * @retval GOLDI_ECORRUPT Memory is corrupted, or another thread is already init'ing.
 * @retval Nonzero An error occurred.
 */
 int
 goldilocks_init (void)
 __attribute__((warn_unused_result));


 /**
 * @brief Generate a new random keypair.
 * @param [out] privkey The generated private key.  Must not be NULL.
 * @param [out] pubkey The generated public key.  Must not be NULL.
 *
 * @warning This isn't even my final form!
 *
 * The return value must be checked (the function is declared
 * warn_unused_result).
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_ENODICE Insufficient entropy.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 */
 int
 goldilocks_keygen (
    struct goldilocks_private_key_t *privkey,
    struct goldilocks_public_key_t *pubkey
 ) __attribute__((warn_unused_result,nonnull(1,2)));

 /**
 * @brief Extract the public key from a private key.
 *
 * This is essentially a memcpy from the public part of the privkey.
 *
 * @param [out] pubkey The extracted public key.  Must not be NULL.
 * @param [in] privkey The private key.  Must not be NULL.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_ECORRUPT The private key is corrupt.
 */
 int
 goldilocks_private_to_public (
    struct goldilocks_public_key_t *pubkey,
    const struct goldilocks_private_key_t *privkey
 ) __attribute__((nonnull(1,2)));

 /**
 * @brief Generate a Diffie-Hellman shared secret in constant time.
 *
 * This function uses some compile-time flags whose merit remains to
 * be decided.
 *
 * If the flag EXPERIMENT_ECDH_OBLITERATE_CT is set, prepend 40 bytes
 * of zeros to the secret before hashing.  In the case that the other
 * party's key is detectably corrupt, instead the symmetric part
 * of the secret key is used to produce a pseudorandom value.
 *
 * If EXPERIMENT_ECDH_STIR_IN_PUBKEYS is set, the sum and product of
 * the two parties' public keys is prepended to the hash.
 *
 * In the current version, this function can safely be run even without
 * goldilocks_init().  But this property is not guaranteed for future
 * versions, so call it anyway.
 *
 * NOTE(review): the paragraph above and the GOLDI_EUNINIT return value
 * listed below appear to disagree for the current version -- confirm
 * against the implementation which one holds.
 *
 * @warning This isn't even my final form!
 *
 * @param [out] shared The shared secret established with the other party
 * (a 64-byte buffer).  Must not be NULL.
 * @param [in] my_privkey My private key.  Must not be NULL.
 * @param [in] your_pubkey The other party's public key.  Must not be NULL.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_ECORRUPT My key is corrupt.
 * @retval GOLDI_EINVAL   The other party's key is corrupt.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 */
 int
 goldilocks_shared_secret (
    uint8_t shared[64],
    const struct goldilocks_private_key_t *my_privkey,
    const struct goldilocks_public_key_t *your_pubkey
 ) __attribute__((warn_unused_result,nonnull(1,2,3)));
    
 /**
 * @brief Sign a message.
 *
 * The signature is deterministic, using the symmetric secret found in the
 * secret key to form a nonce.
 *
 * The technique used in signing is a modified Schnorr system, like EdDSA.
 *
 * @warning This isn't even my final form!
 *
 * @param [out] signature_out Space for the output signature
 * (56*2 = 112 bytes).  Must not be NULL.
 * @param [in] message The message to be signed.  Must not be NULL.
 * @param [in] message_len The length of the message to be signed
 * (presumably in bytes -- confirm against the implementation).
 * @param [in] privkey My private key.  Must not be NULL.
 *
 * NOTE(review): unlike goldilocks_keygen, goldilocks_shared_secret and
 * goldilocks_verify, this declaration does not carry warn_unused_result,
 * so an ignored error return is not diagnosed by the compiler.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_ECORRUPT My key is corrupt.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 */
 int
 goldilocks_sign (
    uint8_t signature_out[56*2],
    const uint8_t *message,
    uint64_t message_len,
    const struct goldilocks_private_key_t *privkey
 ) __attribute__((nonnull(1,2,4)));

 /**
 * @brief Verify a signature.
 *
 * This function is fairly strict.  It will correctly detect when
 * the signature has the wrong cofactor component, or when the sig
 * values aren't less than p or q.
 *
 * Currently this function does not detect when the public key is weird,
 * eg 0, has cofactor, etc.  As a result, a party with a bogus public
 * key could create signatures that succeed on some systems and fail on
 * others.
 *
 * @warning This isn't even my final form!
 *
 * @param [in] signature The signature (56*2 = 112 bytes).  Must not be NULL.
 * @param [in] message The message to be verified.  Must not be NULL.
 * @param [in] message_len The length of the message to be verified
 * (presumably in bytes -- confirm against the implementation).
 * @param [in] pubkey The signer's public key.  Must not be NULL.
 *
 * The return value must be checked (the function is declared
 * warn_unused_result); GOLDI_EOK is the only value that means "valid".
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_EINVAL The public key or signature is corrupt.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 */
 int
 goldilocks_verify (
    const uint8_t signature[56*2],
    const uint8_t *message,
    uint64_t message_len,
    const struct goldilocks_public_key_t *pubkey
 ) __attribute__((warn_unused_result,nonnull(1,2,4)));

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __GOLDILOCKS_H__ */
--- a/src/arch_32/ec_point.c
+++ b/src/arch_32/ec_point.c
@@ -0,0 +1,959 @@
 /**
 * @cond internal
 * @file ec_point.c
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @warning This file was automatically generated.
 */

 #include "ec_point.h"


 /**
 * Raise x to a fixed power with a square-and-multiply addition chain.
 *
 * Assuming p448_sqrn(out, in, n) squares its input n times, the chain
 * computes a = x^(2^446 - 2^222 - 1).  For p = 2^448 - 2^224 - 1
 * (presumably the p448 field prime -- confirm) that exponent is (p-3)/4,
 * so for a nonzero square x the result is a square root of 1/x, which is
 * what the "isr" in the name suggests.
 *
 * The trailing comments give the exponent of x held by the destination
 * after each multiply.
 *
 * @param [out] a Receives the power; written only by the last multiply.
 * @param [in] x The base; read throughout the chain.
 */
 void
 p448_isr (
    struct p448_t*       a,
    const struct p448_t* x
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L1,     x );
    p448_mul  (   &L2,     x,   &L1 ); /* 2^2 - 1 */
    p448_sqr  (   &L1,   &L2 );
    p448_mul  (   &L2,     x,   &L1 ); /* 2^3 - 1 */
    p448_sqrn (   &L1,   &L2,     3 );
    p448_mul  (   &L0,   &L2,   &L1 ); /* 2^6 - 1 */
    p448_sqrn (   &L1,   &L0,     3 );
    p448_mul  (   &L0,   &L2,   &L1 ); /* 2^9 - 1 */
    p448_sqrn (   &L2,   &L0,     9 );
    p448_mul  (   &L1,   &L0,   &L2 ); /* 2^18 - 1 */
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (   &L2,     x,   &L0 ); /* 2^19 - 1 */
    p448_sqrn (   &L0,   &L2,    18 );
    p448_mul  (   &L2,   &L1,   &L0 ); /* 2^37 - 1 */
    p448_sqrn (   &L0,   &L2,    37 );
    p448_mul  (   &L1,   &L2,   &L0 ); /* 2^74 - 1 */
    p448_sqrn (   &L0,   &L1,    37 );
    p448_mul  (   &L1,   &L2,   &L0 ); /* 2^111 - 1 */
    p448_sqrn (   &L0,   &L1,   111 );
    p448_mul  (   &L2,   &L1,   &L0 ); /* 2^222 - 1 */
    p448_sqr  (   &L0,   &L2 );
    p448_mul  (   &L1,     x,   &L0 ); /* 2^223 - 1 */
    p448_sqrn (   &L0,   &L1,   223 );
    p448_mul  (     a,   &L2,   &L0 ); /* 2^446 - 2^222 - 1 */
 }

 void
 p448_inverse (
    struct p448_t*       a,
    const struct p448_t* x
 ) {
    struct p448_t L0, L1;
    p448_isr  (   &L0,     x );
    p448_sqr  (   &L1,   &L0 );
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (     a,     x,   &L0 );
 }

 /**
 * Update d in place from the three field elements (a, b, c) of e.
 *
 * Leaving aside the bias/weak-reduce calls, and writing x, y, z, t, u for
 * the fields of d on entry, the sequence computes
 *   A = e->a * (y - x),  B = e->b * (x + y),  C = e->c * (u * t),
 *   u' = A + B,  t' = B - A,  E = z - C,  F = z + C,
 *   z' = F * E,  x' = E * t',  y' = F * u'.
 *
 * Each p448_sub is followed by p448_bias and p448_weak_reduce; presumably
 * this keeps the limbs non-negative and bounded for the next multiply
 * (confirm against the p448 implementation).
 *
 * @param [in,out] d The point that is updated; every field is overwritten.
 * @param [in] e The operand; only read.
 */
 void
 add_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 ) {
    struct p448_t L0, L1;
    p448_sub  (   &L1, &d->y, &d->x );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L0, &e->a,   &L1 );
    p448_add  (   &L1, &d->x, &d->y );
    p448_mul  ( &d->y, &e->b,   &L1 );
    p448_mul  (   &L1, &d->u, &d->t );
    p448_mul  ( &d->x, &e->c,   &L1 );
    p448_add  ( &d->u,   &L0, &d->y );
    p448_sub  ( &d->t, &d->y,   &L0 );
    p448_bias ( &d->t,     2 );
    p448_weak_reduce( &d->t );
    p448_sub  ( &d->y, &d->z, &d->x );
    p448_bias ( &d->y,     2 );
    p448_weak_reduce( &d->y );
    p448_add  (   &L0, &d->x, &d->z );
    p448_mul  ( &d->z,   &L0, &d->y );
    p448_mul  ( &d->x, &d->y, &d->t );
    p448_mul  ( &d->y,   &L0, &d->u );
 }

 /**
 * Update d in place from the three field elements (a, b, c) of e; the
 * counterpart of add_tw_niels_to_tw_extensible() with the roles of e->a and
 * e->b exchanged and the final sum/difference exchanged.
 *
 * Leaving aside the bias/weak-reduce calls, and writing x, y, z, t, u for
 * the fields of d on entry, the sequence computes
 *   A = e->b * (y - x),  B = e->a * (x + y),  C = e->c * (u * t),
 *   u' = A + B,  t' = B - A,  F = C + z,  E = z - C,
 *   z' = E * F,  x' = F * t',  y' = E * u'.
 *
 * Each p448_sub is followed by p448_bias and p448_weak_reduce; presumably
 * this keeps the limbs non-negative and bounded for the next multiply
 * (confirm against the p448 implementation).
 *
 * @param [in,out] d The point that is updated; every field is overwritten.
 * @param [in] e The operand; only read.
 */
 void
 sub_tw_niels_from_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 ) {
    struct p448_t L0, L1;
    p448_sub  (   &L1, &d->y, &d->x );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L0, &e->b,   &L1 );
    p448_add  (   &L1, &d->x, &d->y );
    p448_mul  ( &d->y, &e->a,   &L1 );
    p448_mul  (   &L1, &d->u, &d->t );
    p448_mul  ( &d->x, &e->c,   &L1 );
    p448_add  ( &d->u,   &L0, &d->y );
    p448_sub  ( &d->t, &d->y,   &L0 );
    p448_bias ( &d->t,     2 );
    p448_weak_reduce( &d->t );
    p448_add  ( &d->y, &d->x, &d->z );
    p448_sub  (   &L0, &d->z, &d->x );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    p448_mul  ( &d->z,   &L0, &d->y );
    p448_mul  ( &d->x, &d->y, &d->t );
    p448_mul  ( &d->y,   &L0, &d->u );
 }

 void
 add_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 ) {
    struct p448_t L0;
    p448_mul  (   &L0, &e->z, &a->z );
    p448_copy ( &e->z,   &L0 );
    add_tw_niels_to_tw_extensible(     e, &a->n );
 }

 void
 sub_tw_pniels_from_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 ) {
    struct p448_t L0;
    p448_mul  (   &L0, &e->z, &a->z );
    p448_copy ( &e->z,   &L0 );
    sub_tw_niels_from_tw_extensible(     e, &a->n );
 }

 /**
 * Update the point a in place (a doubling, per the function name).
 *
 * Leaving aside the bias/weak-reduce calls, and writing x, y, z for the
 * fields of a on entry, the sequence computes
 *   u' = x^2 + y^2,  t' = (x + y)^2 - u',
 *   G = y^2 - x^2,   H = 2*z^2 - G,
 *   z' = G * H,  x' = H * t',  y' = G * u'.
 *
 * The bias amounts (3, 2, 1) and the placement of p448_weak_reduce come from
 * the code generator; presumably they are the minimum needed to keep each
 * subtraction non-negative (confirm against the p448 implementation).
 *
 * @param [in,out] a The point to update; every field is overwritten.
 */
 void
 double_tw_extensible (
    struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->x );
    p448_sqr  (   &L0, &a->y );
    p448_add  ( &a->u,   &L2,   &L0 );
    p448_add  ( &a->t, &a->y, &a->x );
    p448_sqr  (   &L1, &a->t );
    p448_sub  ( &a->t,   &L1, &a->u );
    p448_bias ( &a->t,     3 );
    p448_weak_reduce( &a->t );
    p448_sub  (   &L1,   &L0,   &L2 );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_sqr  ( &a->x, &a->z );
    p448_bias ( &a->x,     1 );
    p448_add  ( &a->z, &a->x, &a->x );
    p448_sub  (   &L0, &a->z,   &L1 );
    p448_weak_reduce(   &L0 );
    p448_mul  ( &a->z,   &L1,   &L0 );
    p448_mul  ( &a->x,   &L0, &a->t );
    p448_mul  ( &a->y,   &L1, &a->u );
 }

 /**
 * Update the point a in place (a doubling, per the function name).
 *
 * Leaving aside the bias/weak-reduce calls, and writing x, y, z for the
 * fields of a on entry, the sequence computes
 *   G = x^2 + y^2,   t' = (x + y)^2 - G,
 *   u' = y^2 - x^2,  H = 2*z^2 - G,
 *   z' = G * H,  x' = H * t',  y' = G * u'.
 *
 * Compared with double_tw_extensible(), the sum and the difference of the
 * two squares trade places between u' and the scratch value G.
 *
 * The bias amounts and the placement of p448_weak_reduce come from the code
 * generator; presumably they keep each subtraction non-negative (confirm
 * against the p448 implementation).
 *
 * @param [in,out] a The point to update; every field is overwritten.
 */
 void
 double_extensible (
    struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->x );
    p448_sqr  (   &L0, &a->y );
    p448_add  (   &L1,   &L2,   &L0 );
    p448_add  ( &a->t, &a->y, &a->x );
    p448_sqr  ( &a->u, &a->t );
    p448_sub  ( &a->t, &a->u,   &L1 );
    p448_bias ( &a->t,     3 );
    p448_weak_reduce( &a->t );
    p448_sub  ( &a->u,   &L0,   &L2 );
    p448_bias ( &a->u,     2 );
    p448_weak_reduce( &a->u );
    p448_sqr  ( &a->x, &a->z );
    p448_bias ( &a->x,     2 );
    p448_add  ( &a->z, &a->x, &a->x );
    p448_sub  (   &L0, &a->z,   &L1 );
    p448_weak_reduce(   &L0 );
    p448_mul  ( &a->z,   &L1,   &L0 );
    p448_mul  ( &a->x,   &L0, &a->t );
    p448_mul  ( &a->y,   &L1, &a->u );
 }

 /**
 * Compute b from a; a is only read and b is fully overwritten.
 *
 * Leaving aside the bias/weak-reduce calls, and writing x, y, z for the
 * fields of a, the sequence computes
 *   u' = x^2 + y^2,  t' = (x + y)^2 - u',
 *   G = y^2 - x^2,   H = 2*z^2 - u',
 *   z' = G * H,  x' = H * t',  y' = G * u'.
 *
 * b's own fields serve as scratch space along the way, so b must not alias
 * a unless that has been checked against the statement order below.
 *
 * @param [out] b The result.
 * @param [in] a The input point.
 */
 void
 twist_and_double (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    struct p448_t L0;
    p448_sqr  ( &b->x, &a->x );
    p448_sqr  ( &b->z, &a->y );
    p448_add  ( &b->u, &b->x, &b->z );
    p448_add  ( &b->t, &a->y, &a->x );
    p448_sqr  (   &L0, &b->t );
    p448_sub  ( &b->t,   &L0, &b->u );
    p448_bias ( &b->t,     3 );
    p448_weak_reduce( &b->t );
    p448_sub  (   &L0, &b->z, &b->x );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    p448_sqr  ( &b->x, &a->z );
    p448_bias ( &b->x,     2 );
    p448_add  ( &b->z, &b->x, &b->x );
    p448_sub  ( &b->y, &b->z, &b->u );
    p448_weak_reduce( &b->y );
    p448_mul  ( &b->z,   &L0, &b->y );
    p448_mul  ( &b->x, &b->y, &b->t );
    p448_mul  ( &b->y,   &L0, &b->u );
 }

 /**
 * Compute b from a; a is only read and b is fully overwritten.
 *
 * Leaving aside the bias/weak-reduce calls, and writing x, y, z for the
 * fields of a, the sequence computes
 *   G = x^2 + y^2,   t' = (x + y)^2 - G,
 *   u' = y^2 - x^2,  H = 2*z^2 - u',
 *   z' = G * H,  x' = H * t',  y' = G * u'.
 *
 * b's own fields serve as scratch space along the way, so b must not alias
 * a unless that has been checked against the statement order below.
 *
 * @param [out] b The result.
 * @param [in] a The input point.
 */
 void
 untwist_and_double (
    struct extensible_t*          b,
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0;
    p448_sqr  ( &b->x, &a->x );
    p448_sqr  ( &b->z, &a->y );
    p448_add  (   &L0, &b->x, &b->z );
    p448_add  ( &b->t, &a->y, &a->x );
    p448_sqr  ( &b->u, &b->t );
    p448_sub  ( &b->t, &b->u,   &L0 );
    p448_bias ( &b->t,     3 );
    p448_weak_reduce( &b->t );
    p448_sub  ( &b->u, &b->z, &b->x );
    p448_bias ( &b->u,     2 );
    p448_weak_reduce( &b->u );
    p448_sqr  ( &b->x, &a->z );
    p448_bias ( &b->x,     1 );
    p448_add  ( &b->z, &b->x, &b->x );
    p448_sub  ( &b->y, &b->z, &b->u );
    p448_weak_reduce( &b->y );
    p448_mul  ( &b->z,   &L0, &b->y );
    p448_mul  ( &b->x, &b->y, &b->t );
    p448_mul  ( &b->y,   &L0, &b->u );
 }

 /**
 * Fill b from the affine coordinates (x, y) of a.
 *
 * Leaving aside the bias/weak-reduce calls, the result is
 *   b->n.a = y - x,  b->n.b = x + y,  b->n.c = -(78164 * x * y),  b->z = 2.
 *
 * b->z is used as scratch for the scaled product before being set to 2.
 * The constant 78164 is twice the 39082 used elsewhere in this file;
 * presumably both derive from the curve constant (confirm).
 *
 * @param [out] b The result; every field is overwritten.
 * @param [in] a The input point; only read.
 */
 void
 convert_tw_affine_to_tw_pniels (
    struct tw_pniels_t*       b,
    const struct tw_affine_t* a
 ) {
    p448_sub  ( &b->n.a, &a->y, &a->x );
    p448_bias ( &b->n.a,     2 );
    p448_weak_reduce( &b->n.a );
    p448_add  ( &b->n.b, &a->x, &a->y );
    p448_weak_reduce( &b->n.b );
    p448_mul  ( &b->n.c, &a->y, &a->x );
    p448_mulw ( &b->z, &b->n.c, 78164 );
    p448_neg  ( &b->n.c, &b->z );
    p448_bias ( &b->n.c,     2 );
    p448_weak_reduce( &b->n.c );
    p448_set_ui( &b->z,     2 );
 }

 /**
 * Fill b from the affine coordinates of a: x and y are copied, z is set to
 * 1, and the auxiliary fields are set to t = a->x and u = a->y.
 *
 * @param [out] b The result; every field is overwritten.
 * @param [in] a The input point; only read.
 */
 void
 convert_tw_affine_to_tw_extensible (
    struct tw_extensible_t*   b,
    const struct tw_affine_t* a
 ) {
    p448_copy ( &b->x, &a->x );
    p448_copy ( &b->y, &a->y );
    p448_set_ui( &b->z,     1 );
    p448_copy ( &b->t, &a->x );
    p448_copy ( &b->u, &a->y );
 }

 /**
 * Fill b from the affine coordinates of a: x and y are copied, z is set to
 * 1, and the auxiliary fields are set to t = a->x and u = a->y.
 *
 * @param [out] b The result; every field is overwritten.
 * @param [in] a The input point; only read.
 */
 void
 convert_affine_to_extensible (
    struct extensible_t*   b,
    const struct affine_t* a
 ) {
    p448_copy ( &b->x, &a->x );
    p448_copy ( &b->y, &a->y );
    p448_set_ui( &b->z,     1 );
    p448_copy ( &b->t, &a->x );
    p448_copy ( &b->u, &a->y );
 }

 /**
 * Fill b from the fields of a.
 *
 * Leaving aside the bias/weak-reduce calls, the result is
 *   b->n.a = y - x,  b->n.b = x + y,  b->n.c = -(78164 * u * t),
 *   b->z = 2 * z.
 *
 * b->z is used as scratch for the scaled product before receiving 2*z.
 *
 * @param [out] b The result; every field is overwritten.
 * @param [in] a The input point; only read.
 */
 void
 convert_tw_extensible_to_tw_pniels (
    struct tw_pniels_t*           b,
    const struct tw_extensible_t* a
 ) {
    p448_sub  ( &b->n.a, &a->y, &a->x );
    p448_bias ( &b->n.a,     2 );
    p448_weak_reduce( &b->n.a );
    p448_add  ( &b->n.b, &a->x, &a->y );
    p448_weak_reduce( &b->n.b );
    p448_mul  ( &b->n.c, &a->u, &a->t );
    p448_mulw ( &b->z, &b->n.c, 78164 );
    p448_neg  ( &b->n.c, &b->z );
    p448_bias ( &b->n.c,     2 );
    p448_weak_reduce( &b->n.c );
    p448_add  ( &b->z, &a->z, &a->z );
    p448_weak_reduce( &b->z );
 }

 /**
 * Fill e from the fields of d.
 *
 * Leaving aside the bias/weak-reduce calls, the result is
 *   u = n.b + n.a,  t = n.b - n.a,
 *   x = z_d * t,    y = z_d * u,   z = z_d^2,
 * where z_d is d->z.  d->n.c is not used.
 *
 * @param [out] e The result; every field is overwritten.
 * @param [in] d The input; only read.
 */
 void
 convert_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* d
 ) {
    p448_add  ( &e->u, &d->n.b, &d->n.a );
    p448_sub  ( &e->t, &d->n.b, &d->n.a );
    p448_bias ( &e->t,     2 );
    p448_weak_reduce( &e->t );
    p448_mul  ( &e->x, &d->z, &e->t );
    p448_mul  ( &e->y, &d->z, &e->u );
    p448_sqr  ( &e->z, &d->z );
 }

 /**
 * Fill e from the fields of d.
 *
 * Leaving aside the bias/weak-reduce calls, the result is
 *   y = b + a,  x = b - a,  z = 1,  t = x,  u = y.
 * d->c is not used.
 *
 * @param [out] e The result; every field is overwritten.
 * @param [in] d The input; only read.
 */
 void
 convert_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  e,
    const struct tw_niels_t* d
 ) {
    p448_add  ( &e->y, &d->b, &d->a );
    p448_weak_reduce( &e->y );
    p448_sub  ( &e->x, &d->b, &d->a );
    p448_bias ( &e->x,     2 );
    p448_weak_reduce( &e->x );
    p448_set_ui( &e->z,     1 );
    p448_copy ( &e->t, &e->x );
    p448_copy ( &e->u, &e->y );
 }

 /**
 * Advance the ladder state a by one step, in place.  a->z0 is only read;
 * xa, za, xd and zd are all overwritten.
 *
 * Leaving aside the bias/weak-reduce calls, and writing xa, za, xd, zd for
 * the values on entry with S = xd + zd and D = xd - zd, the sequence
 * computes
 *   P = S * (xa - za),  Q = D * (xa + za),
 *   xa' = z0 * (Q + P)^2,       za' = (P - Q)^2,
 *   xd' = 39082 * S^2 * D^2,    zd' = (39082*S^2 - (S^2 - D^2)) * (S^2 - D^2).
 *
 * The fields of a are reused as scratch between these steps, so the
 * statement order below is significant.
 *
 * @param [in,out] a The ladder state.
 */
 void
 montgomery_step (
    struct montgomery_t* a
 ) {
    struct p448_t L0, L1;
    p448_add  (   &L0, &a->zd, &a->xd );
    p448_sub  (   &L1, &a->xd, &a->zd );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_sub  ( &a->zd, &a->xa, &a->za );
    p448_bias ( &a->zd,     2 );
    p448_weak_reduce( &a->zd );
    p448_mul  ( &a->xd,   &L0, &a->zd );
    p448_add  ( &a->zd, &a->za, &a->xa );
    p448_mul  ( &a->za,   &L1, &a->zd );
    p448_add  ( &a->xa, &a->za, &a->xd );
    p448_sqr  ( &a->zd, &a->xa );
    p448_mul  ( &a->xa, &a->z0, &a->zd );
    p448_sub  ( &a->zd, &a->xd, &a->za );
    p448_bias ( &a->zd,     2 );
    p448_weak_reduce( &a->zd );
    p448_sqr  ( &a->za, &a->zd );
    p448_sqr  ( &a->xd,   &L0 );
    p448_sqr  (   &L0,   &L1 );
    p448_mulw ( &a->zd, &a->xd, 39082 );
    p448_sub  (   &L1, &a->xd,   &L0 );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  ( &a->xd,   &L0, &a->zd );
    p448_sub  (   &L0, &a->zd,   &L1 );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    p448_mul  ( &a->zd,   &L0,   &L1 );
 }

 /**
 * Initialize the ladder state a from the field element sbz:
 *   z0 = sbz^2,  xd = 1,  zd = 0,  xa = 1,  za = z0.
 *
 * @param [out] a The ladder state; every field is overwritten.
 * @param [in] sbz The input element; only read.
 */
 void
 deserialize_montgomery (
    struct montgomery_t* a,
    const struct p448_t* sbz
 ) {
    p448_sqr  ( &a->z0,   sbz );
    p448_set_ui( &a->xd,     1 );
    p448_set_ui( &a->zd,     0 );
    p448_set_ui( &a->xa,     1 );
    p448_copy ( &a->za, &a->z0 );
 }

 /**
 * Turn the ladder state a, together with the original input sbz, into a
 * single field element b, and report a validity mask.
 *
 * What the code visibly does:
 *  - builds intermediate products from a->z0, a->xd, a->zd, a->xa, a->za
 *    and sbz (the constant 39082 appears again here);
 *  - tests a->zd for zero and uses that mask twice: the negated mask
 *    (presumably 0 or 1, assuming a true mask is all-ones -- confirm) is
 *    added to one factor, and the complemented mask is applied to the
 *    value written to b through p448_mask;
 *  - runs p448_isr on a product L6 and then checks whether
 *    L6 * isr(L6)^2 equals 1.
 *
 * @param [out] b Receives the masked result.
 * @param [in] a The ladder state; only read.
 * @param [in] sbz The element the ladder was started from; only read.
 *
 * @return The bitwise OR of two p448_is_zero() masks: the "equals 1" check
 * described above, and a test of sbz itself for zero.
 */
 mask_t
 serialize_montgomery (
    struct p448_t*             b,
    const struct montgomery_t* a,
    const struct p448_t*       sbz
 ) {
    mask_t L0, L1, L2;
    struct p448_t L3, L4, L5, L6;
    p448_mul  (   &L6, &a->z0, &a->zd );
    p448_sub  (   &L4,   &L6, &a->xd );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L6, &a->za,   &L4 );
    p448_mul  (   &L5, &a->z0, &a->xd );
    p448_sub  (   &L4,   &L5, &a->zd );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L3, &a->xa,   &L4 );
    p448_add  (   &L5,   &L3,   &L6 );
    p448_sub  (   &L4,   &L6,   &L3 );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L6,   &L4,   &L5 );
    p448_copy (   &L5, &a->z0 );
    p448_addw (   &L5,     1 );
    p448_sqr  (   &L4,   &L5 );
    p448_mulw (   &L5,   &L4, 39082 );
    p448_neg  (   &L4,   &L5 );
    p448_add  (   &L5, &a->z0, &a->z0 );
    p448_bias (   &L5,     1 );
    p448_add  (   &L3,   &L5,   &L5 );
    p448_add  (   &L5,   &L3,   &L4 );
    p448_weak_reduce(   &L5 );
    p448_mul  (   &L3, &a->xd,   &L5 );
       L1 = p448_is_zero( &a->zd );
       L2 = -   L1;
    p448_mask (   &L4,   &L3,    L1 );
    p448_add  (   &L5,   &L4, &a->zd );
       L0 = ~   L1;
    p448_mul  (   &L4,   sbz,   &L6 );
    p448_addw (   &L4,    L2 );
    p448_mul  (   &L6,   &L5,   &L4 );
    p448_mul  (   &L4,   &L6,   &L5 );
    p448_mul  (   &L5,   &L6, &a->xd );
    p448_mul  (   &L6,   &L4,   &L5 );
    p448_isr  (   &L3,   &L6 );
    p448_mul  (   &L5,   &L4,   &L3 );
    p448_sqr  (   &L4,   &L3 );
    p448_mul  (   &L3,   &L6,   &L4 );
    p448_mask (     b,   &L5,    L0 );
    p448_subw (   &L3,     1 );
    p448_bias (   &L3,     1 );
       L1 = p448_is_zero(   &L3 );
       L0 = p448_is_zero(   sbz );
    return    L1 |    L0;
 }

 /**
 * Compress the point a into the single field element b.
 *
 * Leaving aside the bias/weak-reduce calls, and writing x, y, z for the
 * fields of a, the sequence computes
 *   M = (y - z) * (z * x),  P = M * (y - z),  Q = M * (z + y),
 *   b = P * isr(P * Q),
 * where isr is p448_isr().  b itself is used as scratch for z + y before it
 * receives the final value.
 *
 * NOTE(review): the last two statements compute (P*Q) * isr(P*Q)^2 into the
 * local L0, which is never read afterwards.  This looks like the operand of
 * a squareness check whose result is not used here -- confirm whether that
 * is intentional.
 *
 * @param [out] b The serialized result.
 * @param [in] a The input point; only read.
 */
 void
 serialize_extensible (
    struct p448_t*             b,
    const struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sub  (   &L0, &a->y, &a->z );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    p448_add  (     b, &a->z, &a->y );
    p448_mul  (   &L1, &a->z, &a->x );
    p448_mul  (   &L2,   &L0,   &L1 );
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_mul  (   &L0,   &L2,     b );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_isr  (   &L0,   &L2 );
    p448_mul  (     b,   &L1,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,   &L2,   &L1 );
 }

 /**
 * Compress the point a into the single field element b in one pass (per the
 * name, this fuses an untwist-and-double with a serialization).
 *
 * What the code visibly does, writing x, y, z for the fields of a:
 *  - keeps x*y in L3 for the whole function;
 *  - forms (x + y)^2 - 2*x*y, i.e. x^2 + y^2, and z^4;
 *  - scales by the constant 39082 twice, negating after each scaling;
 *  - takes p448_isr() of a product of those values and multiplies the
 *    outcome by x*y to produce b.
 * b is used as scratch throughout and only holds the result after the last
 * statement.
 *
 * NOTE(review): the second-to-last statement writes L0, which is never read
 * afterwards -- confirm whether that is intentional.
 *
 * @param [out] b The serialized result.
 * @param [in] a The input point; only read.
 */
 void
 untwist_and_double_and_serialize (
    struct p448_t*                b,
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_mul  (   &L3, &a->y, &a->x );
    p448_add  (     b, &a->y, &a->x );
    p448_sqr  (   &L1,     b );
    p448_add  (   &L2,   &L3,   &L3 );
    p448_sub  (     b,   &L1,   &L2 );
    p448_bias (     b,     3 );
    p448_weak_reduce(     b );
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1,   &L2 );
    p448_add  (   &L2,     b,     b );
    p448_mulw (     b,   &L2, 39082 );
    p448_neg  (   &L2,     b );
    p448_bias (   &L2,     2 );
    p448_mulw (   &L0,   &L2, 39082 );
    p448_neg  (     b,   &L0 );
    p448_bias (     b,     2 );
    p448_mul  (   &L0,   &L2,   &L1 );
    p448_mul  (   &L2,     b,   &L0 );
    p448_isr  (   &L0,   &L2 );
    p448_mul  (   &L1,     b,   &L0 );
    p448_sqr  (     b,   &L0 );
    p448_mul  (   &L0,   &L2,     b );
    p448_mul  (     b,   &L1,   &L3 );
 }

 /**
 * Compute b from a; a is only read and b is fully overwritten.
 *
 * What the code visibly does, writing x, y, z for the fields of a:
 *  - builds products from z^2 - x^2, (z - x)*y and z - y, and takes
 *    p448_isr() of one of them;
 *  - sets b->x = x * s and b->y = y * s for a common factor s derived from
 *    that inverse square root;
 *  - tests z - y for zero and adds the negated mask to b->y (presumably
 *    this adds 1 when z - y is zero, assuming a true mask is all-ones --
 *    confirm);
 *  - finishes with z = 1, t = b->x, u = b->y.
 * The fields of b are used as scratch along the way, so the statement order
 * is significant.
 *
 * @param [out] b The result.
 * @param [in] a The input point.
 */
 void
 twist_even (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    mask_t L0, L1;
    p448_sqr  ( &b->y, &a->z );
    p448_sqr  ( &b->z, &a->x );
    p448_sub  ( &b->u, &b->y, &b->z );
    p448_bias ( &b->u,     2 );
    p448_weak_reduce( &b->u );
    p448_sub  ( &b->z, &a->z, &a->x );
    p448_bias ( &b->z,     2 );
    p448_weak_reduce( &b->z );
    p448_mul  ( &b->y, &b->z, &a->y );
    p448_sub  ( &b->z, &a->z, &a->y );
    p448_bias ( &b->z,     2 );
    p448_weak_reduce( &b->z );
    p448_mul  ( &b->x, &b->z, &b->y );
    p448_mul  ( &b->t, &b->x, &b->u );
    p448_mul  ( &b->y, &b->x, &b->t );
    p448_isr  ( &b->t, &b->y );
    p448_mul  ( &b->u, &b->x, &b->t );
    p448_sqr  ( &b->x, &b->t );
    p448_mul  ( &b->t, &b->y, &b->x );
    p448_mul  ( &b->x, &a->x, &b->u );
    p448_mul  ( &b->y, &a->y, &b->u );
       L1 = p448_is_zero( &b->z );
       L0 = -   L1;
    p448_addw ( &b->y,    L0 );
    p448_weak_reduce( &b->y );
    p448_set_ui( &b->z,     1 );
    p448_copy ( &b->t, &b->x );
    p448_copy ( &b->u, &b->y );
 }

 /**
 * Compute b from a; a is only read and b is fully overwritten.  The name
 * marks this as a routine intended for tests only.
 *
 * What the code visibly does, writing x, y, z for the fields of a:
 *  - builds products from 4*(z^2 - x^2), (z - x)*y and z - y, and takes
 *    p448_isr() of one of them;
 *  - derives b->x and b->y from x + y, x - y and that inverse square root;
 *  - applies two zero-tests: the negated mask of "z - x is zero" is added
 *    to b->x, and the negated mask of "z - y is zero" is added to b->y
 *    (presumably adding 1 in each special case, assuming a true mask is
 *    all-ones -- confirm);
 *  - sets b->z to p448_is_zero(a->y) + 1, i.e. presumably 0 when a->y is
 *    zero and 1 otherwise (same assumption about the mask);
 *  - finishes with t = b->x, u = b->y.
 * The fields of b are used as scratch along the way, so the statement order
 * is significant.
 *
 * @param [out] b The result.
 * @param [in] a The input point.
 */
 void
 test_only_twist (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3;
    p448_sqr  ( &b->u, &a->z );
    p448_sqr  ( &b->y, &a->x );
    p448_sub  ( &b->z, &b->u, &b->y );
    p448_bias ( &b->z,     2 );
    p448_add  ( &b->y, &b->z, &b->z );
    p448_add  ( &b->u, &b->y, &b->y );
    p448_weak_reduce( &b->u );
    p448_sub  ( &b->y, &a->z, &a->x );
    p448_bias ( &b->y,     2 );
    p448_weak_reduce( &b->y );
    p448_mul  ( &b->x, &b->y, &a->y );
    p448_sub  ( &b->z, &a->z, &a->y );
    p448_bias ( &b->z,     2 );
    p448_weak_reduce( &b->z );
    p448_mul  ( &b->t, &b->z, &b->x );
    p448_mul  (   &L3, &b->t, &b->u );
    p448_mul  ( &b->x, &b->t,   &L3 );
    p448_isr  (   &L2, &b->x );
    p448_mul  ( &b->u, &b->t,   &L2 );
    p448_sqr  (   &L3,   &L2 );
    p448_mul  ( &b->t, &b->x,   &L3 );
    p448_add  ( &b->x, &a->y, &a->x );
    p448_weak_reduce( &b->x );
    p448_sub  (   &L2, &a->x, &a->y );
    p448_bias (   &L2,     2 );
    p448_weak_reduce(   &L2 );
    p448_mul  (   &L3, &b->t,   &L2 );
    p448_add  (   &L2,   &L3, &b->x );
    p448_sub  ( &b->t, &b->x,   &L3 );
    p448_bias ( &b->t,     2 );
    p448_weak_reduce( &b->t );
    p448_mul  ( &b->x,   &L2, &b->u );
       L0 = p448_is_zero( &b->y );
       L1 = -   L0;
    p448_addw ( &b->x,    L1 );
    p448_weak_reduce( &b->x );
    p448_mul  ( &b->y, &b->t, &b->u );
       L0 = p448_is_zero( &b->z );
       L1 = -   L0;
    p448_addw ( &b->y,    L1 );
    p448_weak_reduce( &b->y );
       L1 = p448_is_zero( &a->y );
       L0 =    L1 +     1;
    p448_set_ui( &b->z,    L0 );
    p448_copy ( &b->t, &b->x );
    p448_copy ( &b->u, &b->y );
 }

 mask_t
 is_square (
    const struct p448_t* x
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3;
    p448_isr  (   &L2,     x );
    p448_sqr  (   &L3,   &L2 );
    p448_mul  (   &L2,     x,   &L3 );
    p448_subw (   &L2,     1 );
    p448_bias (   &L2,     1 );
       L1 = p448_is_zero(   &L2 );
       L0 = p448_is_zero(     x );
    return    L1 |    L0;
 }

 mask_t
 is_even_pt (
    const struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1, &a->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    return is_square (   &L0 );
 }

 mask_t
 is_even_tw (
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1, &a->x );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_weak_reduce(   &L0 );
    return is_square (   &L0 );
 }

 /**
  * Build the affine point a from the single field element sz.
  *
  * @param a  Output point; a->x and a->y are used as scratch during the
  *           computation and hold the result on return.
  * @param sz Input field element.
  * @return Mask from a final check that an intermediate product equals 1
  *         (it is decremented and tested with p448_is_zero).
  *         NOTE(review): presumably "all ones on success" -- confirm
  *         against callers.
  */
 mask_t
 deserialize_affine (
    struct affine_t*     a,
    const struct p448_t* sz
 ) {
    struct p448_t L0, L1, L2, L3;
    /* L1 = sz^2, kept for the final y computation */
    p448_sqr  (   &L1,    sz );
    /* a->x = -39082 * (sz^2 + 1)^2 */
    p448_copy (   &L3,   &L1 );
    p448_addw (   &L3,     1 );
    p448_sqr  ( &a->x,   &L3 );
    p448_mulw (   &L3, &a->x, 39082 );
    p448_neg  ( &a->x,   &L3 );
    /* L3 = 4*sz^2 + a->x */
    p448_add  (   &L3,   &L1,   &L1 );
    p448_bias (   &L3,     1 );
    p448_add  ( &a->y,   &L3,   &L3 );
    p448_add  (   &L3, &a->y, &a->x );
    p448_weak_reduce(   &L3 );
    /* a->x = 1 - sz^2 */
    p448_copy ( &a->y,   &L1 );
    p448_subw ( &a->y,     1 );
    p448_neg  ( &a->x, &a->y );
    p448_bias ( &a->x,     2 );
    p448_weak_reduce( &a->x );
    p448_mul  ( &a->y, &a->x,   &L3 );
    p448_sqr  (   &L2, &a->x );
    p448_mul  (   &L0,   &L2, &a->y );
    p448_mul  ( &a->y, &a->x,   &L0 );
    /* a single p448_isr call feeds the products that follow */
    p448_isr  (   &L3, &a->y );
    p448_mul  ( &a->y,   &L2,   &L3 );
    p448_sqr  (   &L2,   &L3 );
    p448_mul  (   &L3,   &L0,   &L2 );
    p448_mul  (   &L0, &a->x,   &L3 );
    /* a->x = sz * 2 * a->y */
    p448_add  (   &L2, &a->y, &a->y );
    p448_mul  ( &a->x,    sz,   &L2 );
    /* a->y = (sz^2 + 1) * L3 */
    p448_addw (   &L1,     1 );
    p448_mul  ( &a->y,   &L1,   &L3 );
    /* result mask: is L0 equal to 1? */
    p448_subw (   &L0,     1 );
    p448_bias (   &L0,     1 );
    return p448_is_zero(   &L0 );
 }

 /**
  * Build the tw_extensible point a from the field element sz, using the
  * extra field element sdm1 as a multiplier.
  *
  * @param a    Output point; all of its fields are used as scratch.  On
  *             return a->z is 1 and a->t, a->u are copies of a->x, a->y.
  * @param sdm1 Field element multiplied into (1 - sz^4).
  *             NOTE(review): its meaning is not visible here -- see callers.
  * @param sz   Input field element.
  * @return Mask from checking that an intermediate product equals 1,
  *         computed before the output is normalized.
  *         NOTE(review): presumably "all ones on success" -- confirm.
  */
 mask_t
 deserialize_and_twist_approx (
    struct tw_extensible_t* a,
    const struct p448_t*    sdm1,
    const struct p448_t*    sz
 ) {
    struct p448_t L0, L1;
    /* a->z = sz^2, kept until the end */
    p448_sqr  ( &a->z,    sz );
    /* a->x = -39082 * (sz^2 + 1)^2 */
    p448_copy ( &a->y, &a->z );
    p448_addw ( &a->y,     1 );
    p448_sqr  ( &a->x, &a->y );
    p448_mulw ( &a->y, &a->x, 39082 );
    p448_neg  ( &a->x, &a->y );
    /* a->y = 4*sz^2 + a->x */
    p448_add  ( &a->y, &a->z, &a->z );
    p448_bias ( &a->y,     1 );
    p448_add  ( &a->u, &a->y, &a->y );
    p448_add  ( &a->y, &a->u, &a->x );
    p448_weak_reduce( &a->y );
    /* a->u = 1 - sz^4 */
    p448_sqr  ( &a->x, &a->z );
    p448_subw ( &a->x,     1 );
    p448_neg  ( &a->u, &a->x );
    p448_bias ( &a->u,     2 );
    p448_weak_reduce( &a->u );
    p448_mul  ( &a->x,  sdm1, &a->u );
    p448_mul  (   &L0, &a->x, &a->y );
    p448_mul  ( &a->t,   &L0, &a->y );
    p448_mul  ( &a->u, &a->x, &a->t );
    p448_mul  ( &a->t, &a->u,   &L0 );
    p448_mul  ( &a->y, &a->x, &a->t );
    /* a single p448_isr call feeds the products that follow */
    p448_isr  (   &L0, &a->y );
    p448_mul  ( &a->y, &a->u,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  ( &a->u, &a->t,   &L1 );
    p448_mul  ( &a->t, &a->x, &a->u );
    /* L0 = a->u * 2*sz */
    p448_add  ( &a->x,    sz,    sz );
    p448_mul  (   &L0, &a->u, &a->x );
    /* L1 = 1 - sz^2 */
    p448_copy ( &a->x, &a->z );
    p448_subw ( &a->x,     1 );
    p448_neg  (   &L1, &a->x );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  ( &a->x,   &L1,   &L0 );
    p448_mul  (   &L0, &a->u, &a->y );
    /* a->y = (sz^2 + 1) * L0 */
    p448_addw ( &a->z,     1 );
    p448_mul  ( &a->y, &a->z,   &L0 );
    /* result mask: is a->t equal to 1?  Taken before a->t is overwritten. */
    p448_subw ( &a->t,     1 );
    p448_bias ( &a->t,     1 );
    mask_t ret = p448_is_zero( &a->t );
    /* normalize: z = 1, t = x, u = y */
    p448_set_ui( &a->z,     1 );
    p448_copy ( &a->t, &a->x );
    p448_copy ( &a->u, &a->y );
    return ret;
 }

 /**
  * Overwrite a with the coordinates x=0, y=1, z=1, t=0, u=0.
  *
  * @param a Point to overwrite.
  */
 void
 set_identity_extensible (
    struct extensible_t* a
 ) {
    /* coordinates that are one */
    p448_set_ui(&a->y, 1);
    p448_set_ui(&a->z, 1);

    /* coordinates that are zero */
    p448_set_ui(&a->x, 0);
    p448_set_ui(&a->t, 0);
    p448_set_ui(&a->u, 0);
 }

 /**
  * Overwrite a with the coordinates x=0, y=1, z=1, t=0, u=0.
  *
  * @param a Twisted-curve point to overwrite.
  */
 void
 set_identity_tw_extensible (
    struct tw_extensible_t* a
 ) {
    /* coordinates that are one */
    p448_set_ui(&a->y, 1);
    p448_set_ui(&a->z, 1);

    /* coordinates that are zero */
    p448_set_ui(&a->x, 0);
    p448_set_ui(&a->t, 0);
    p448_set_ui(&a->u, 0);
 }

 /**
  * Overwrite a with the affine coordinates (x, y) = (0, 1).
  *
  * @param a Affine point to overwrite.
  */
 void
 set_identity_affine (
    struct affine_t* a
 ) {
    p448_set_ui(&a->y, 1);
    p448_set_ui(&a->x, 0);
 }

 mask_t
 eq_affine (
    const struct affine_t* a,
    const struct affine_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2;
    p448_sub  (   &L2, &a->x, &b->x );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_sub  (   &L2, &a->y, &b->y );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 mask_t
 eq_extensible (
    const struct extensible_t* a,
    const struct extensible_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 mask_t
 eq_tw_extensible (
    const struct tw_extensible_t* a,
    const struct tw_extensible_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 /**
  * Compute the affine point a from the field element r.
  *
  * NOTE(review): by its name this implements an Elligator-2-style map from
  * field elements to curve points; the constants below are taken as given
  * by the generator -- verify against the generating script.
  *
  * @param a Output point; a->x and a->y are also used as scratch.
  * @param r Input field element.
  */
 void
 elligator_2s_inject (
    struct affine_t*     a,
    const struct p448_t* r
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5, L6, L7, L8, L9;
    /* a->x = r^2, L3 = r^4 */
    p448_sqr  ( &a->x,     r );
    p448_sqr  (   &L3, &a->x );
    /* L9 = 1 - r^4; also used for the zero test at the end */
    p448_copy ( &a->y,   &L3 );
    p448_subw ( &a->y,     1 );
    p448_neg  (   &L9, &a->y );
    p448_bias (   &L9,     2 );
    p448_weak_reduce(   &L9 );
    /* L2 = (1 - r^4)^2 */
    p448_sqr  (   &L2,   &L9 );
    p448_mulw (   &L8,   &L2, 1527402724 );
    p448_mulw (   &L7,   &L3, 6108985600 );
    p448_add  ( &a->y,   &L7,   &L8 );
    p448_weak_reduce( &a->y );
    p448_mulw (   &L8,   &L2, 6109454568 );
    p448_sub  (   &L7, &a->y,   &L8 );
    p448_bias (   &L7,     2 );
    p448_weak_reduce(   &L7 );
    p448_mulw (   &L4, &a->y, 78160 );
    p448_mul  (   &L6,   &L7,   &L9 );
    p448_mul  (   &L8,   &L6,   &L4 );
    p448_mul  (   &L4,   &L7,   &L8 );
    /* a single p448_isr call feeds the products that follow */
    p448_isr  (   &L5,   &L4 );
    p448_mul  (   &L4,   &L6,   &L5 );
    p448_sqr  (   &L6,   &L5 );
    p448_mul  (   &L5,   &L8,   &L6 );
    p448_mul  (   &L8,   &L7,   &L5 );
    p448_mul  (   &L7,   &L8,   &L5 );
    /* L5 = r^2 - 1, a->x = r^2 + 1 */
    p448_copy (   &L5, &a->x );
    p448_subw (   &L5,     1 );
    p448_addw ( &a->x,     1 );
    p448_mul  (   &L6, &a->x,   &L8 );
    p448_sub  ( &a->x,   &L5,   &L6 );
    p448_bias ( &a->x,     3 );
    p448_weak_reduce( &a->x );
    p448_mul  (   &L5,   &L4, &a->x );
    p448_mulw (   &L4,   &L5, 78160 );
    /* final x coordinate: negated product */
    p448_neg  ( &a->x,   &L4 );
    p448_bias ( &a->x,     2 );
    p448_weak_reduce( &a->x );
    /* L3 = 2*r^4 + (1 - r^4)^2 - 2 */
    p448_add  (   &L4,   &L3,   &L3 );
    p448_add  (   &L3,   &L4,   &L2 );
    p448_subw (   &L3,     2 );
    p448_bias (   &L3,     1 );
    p448_weak_reduce(   &L3 );
    p448_mul  (   &L2,   &L3,   &L8 );
    p448_mulw (   &L3,   &L2, 3054649120 );
    p448_add  (   &L2,   &L3, &a->y );
    p448_mul  ( &a->y,   &L7,   &L2 );
    /* add 1 to a->y exactly when 1 - r^4 is zero */
       L1 = p448_is_zero(   &L9 );
       L0 = -   L1;
    p448_addw ( &a->y,    L0 );
    p448_weak_reduce( &a->y );
 }

 /**
  * Check that the affine point a satisfies
  *     x^2 + y^2 - 1 + 39081 * x^2 * y^2 == 0.
  *
  * @param a Point to check.
  * @return The p448_is_zero mask of the left-hand side.
  */
 mask_t
 validate_affine (
    const struct affine_t* a
 ) {
    struct p448_t L0, L1, L2, L3;
    /* L0 = y^2, L2 = x^2 */
    p448_sqr  (   &L0, &a->y );
    p448_sqr  (   &L2, &a->x );
    /* L3 = x^2 + y^2 - 1 */
    p448_add  (   &L3,   &L2,   &L0 );
    p448_subw (   &L3,     1 );
    /* L1 = y^2 * (-39081 * x^2) */
    p448_mulw (   &L1,   &L2, 39081 );
    p448_neg  (   &L2,   &L1 );
    p448_bias (   &L2,     2 );
    p448_mul  (   &L1,   &L0,   &L2 );
    /* L0 = L3 - L1; biased so limbs stay non-negative before the test */
    p448_sub  (   &L0,   &L3,   &L1 );
    p448_bias (   &L0,     3 );
    return p448_is_zero(   &L0 );
 }

 /**
  * Check the two invariants of a tw_extensible point that are spelled out
  * in the comments inside the body.
  *
  * @param ext Point to check.
  * @return AND of the p448_is_zero masks of both invariant expressions.
  */
 mask_t
 validate_tw_extensible (
    const struct tw_extensible_t* ext
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5;
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L2, &ext->t, &ext->u );
    p448_mul  (   &L4, &ext->z,   &L2 );
    p448_addw (   &L4,     0 );  /* adds zero: no effect on the value */
    p448_mul  (   &L3, &ext->x, &ext->y );
    p448_neg  (   &L2,   &L3 );
    p448_add  (   &L3,   &L2,   &L4 );
    p448_bias (   &L3,     2 );
       L1 = p448_is_zero(   &L3 );
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
     */
    p448_sqr  (   &L4, &ext->y );
    p448_neg  (   &L2,   &L4 );
    p448_addw (   &L2,     0 );  /* adds zero: no effect on the value */
    p448_sqr  (   &L3, &ext->x );
    p448_add  (   &L4,   &L3,   &L2 );
    p448_sqr  (   &L5, &ext->u );
    p448_sqr  (   &L3, &ext->t );
    p448_mul  (   &L2,   &L3,   &L5 );
    /* the d term is formed as -(39081 * t^2*u^2) */
    p448_mulw (   &L3,   &L2, 39081 );
    p448_neg  (   &L5,   &L3 );
    p448_add  (   &L3,   &L5,   &L4 );
    p448_neg  (   &L5,   &L2 );
    p448_add  (   &L4,   &L5,   &L3 );
    p448_sqr  (   &L3, &ext->z );
    p448_add  (   &L2,   &L3,   &L4 );
    p448_bias (   &L2,     4 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 /**
  * Check the two invariants of an extensible point that are spelled out
  * in the comments inside the body.
  *
  * @param ext Point to check.
  * @return AND of the p448_is_zero masks of both invariant expressions.
  */
 mask_t
 validate_extensible (
    const struct extensible_t* ext
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5;
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 - x^2 - y^2 + z^2
     */
    p448_sqr  (   &L4, &ext->y );
    p448_neg  (   &L3,   &L4 );
    p448_addw (   &L3,     0 );  /* adds zero: no effect on the value */
    p448_sqr  (   &L2, &ext->z );
    p448_add  (   &L4,   &L2,   &L3 );
    p448_sqr  (   &L5, &ext->u );
    p448_sqr  (   &L2, &ext->t );
    p448_mul  (   &L3,   &L2,   &L5 );
    /* the d term is formed as -(39081 * t^2*u^2) */
    p448_mulw (   &L5,   &L3, 39081 );
    p448_neg  (   &L2,   &L5 );
    p448_add  (   &L3,   &L2,   &L4 );
    p448_sqr  (   &L2, &ext->x );
    p448_neg  (   &L4,   &L2 );
    p448_add  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     4 );
       L1 = p448_is_zero(   &L2 );
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L3, &ext->t, &ext->u );
    p448_mul  (   &L4, &ext->z,   &L3 );
    p448_addw (   &L4,     0 );  /* adds zero: no effect on the value */
    p448_mul  (   &L2, &ext->x, &ext->y );
    p448_neg  (   &L3,   &L2 );
    p448_add  (   &L2,   &L3,   &L4 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }


--- a/src/arch_32/p448.c
+++ b/src/arch_32/p448.c
@@ -0,0 +1,300 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "word.h"
 #include "p448.h"
 //#include "x86-64-arith.h"

 /**
  * Word-level zero test without branching on the value.
  *
  * The word is widened to a double word and decremented; only x == 0
  * borrows into the upper half, which is then shifted down and returned.
  *
  * @param x Word to test.
  * @return The upper half of (x - 1) computed in double-word width.
  */
 static inline mask_t __attribute__((always_inline))
 is_zero (
    word_t x
 ) {
    dword_t widened = x;
    widened -= 1;
    return widened >> WORD_BITS;
 }

 /**
  * Full-width product of two 32-bit values.
  *
  * @param a First factor.
  * @param b Second factor.
  * @return The 64-bit product a * b (no truncation).
  */
 static uint64_t widemul_32 (
    const uint32_t a,
    const uint32_t b
 ) {
    /* widen one operand first so the multiply happens in 64 bits */
    const uint64_t wide_a = a;
    return wide_a * b;
 }

 /**
  * Field multiplication: cs = as * bs.
  *
  * Operands are 16 limbs in radix 2^28.  The limbs are treated as a low
  * half (0..7) and a high half (8..15); three families of 32x32->64
  * products are accumulated (low*low, high*high and
  * (low+high)*(low+high)), which looks like a Karatsuba-style arrangement.
  * Output limbs are masked to 28 bits as they are produced; the final
  * carries are folded into limbs 0 and 8 and then added, unmasked, into
  * limbs 1 and 9.
  *
  * @param cs Output.  Declared __restrict__, so it must not alias as or bs.
  * @param as First factor.
  * @param bs Second factor (may be the same object as as; see p448_sqr).
  */
 void
 p448_mul (
    p448_t *__restrict__ cs,
    const p448_t *as,
    const p448_t *bs
 ) {
    // p448_t ar, br;
 //     p448_copy(&ar,as);
 //     p448_copy(&br,bs);
 //     p448_weak_reduce(&ar);
 //     p448_weak_reduce(&br);
    
    const uint32_t *a = as->limb, *b = bs->limb;
    uint32_t *c = cs->limb;

    uint64_t accum0 = 0, accum1 = 0, accum2 = 0;
    uint32_t mask = (1<<28) - 1;  

    /* aa/bb hold low half + high half, limb by limb */
    uint32_t aa[8], bb[8];

    /* For some reason clang doesn't vectorize this without prompting? */
    // unsigned int i;
    // for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
    //     ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
    //     ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];     
    // }
    int i,j;
    for (i=0; i<8; i++) {
        aa[i] = a[i] + a[i+8];
        bb[i] = b[i] + b[i+8];
    }
    
    /* one pass per output column j; accum0/accum1 carry over between passes */
    for (j=0; j<8; j++) {
        accum2 = 0;
    
        /* index pairs that sum to j */
        for (i=0; i<=j; i++) {      
            accum2 += widemul_32(a[j-i],b[i]);
            accum1 += widemul_32(aa[j-i],bb[i]);
            accum0 += widemul_32(a[8+j-i], b[8+i]);
        }
        
        /* unsigned subtraction: relies on wrap-around modulo 2^64 */
        accum1 -= accum2;
        accum0 += accum2;
        accum2 = 0;
        
        /* index pairs that sum to j+8 */
        for (; i<8; i++) {
            accum0 -= widemul_32(a[8+j-i], b[i]);
            accum2 += widemul_32(aa[8+j-i], bb[i]);
            accum1 += widemul_32(a[16+j-i], b[8+i]);
        }

        accum1 += accum2;
        accum0 += accum2;

        /* emit 28 bits of each accumulator, keep the rest as carry */
        c[j] = ((uint32_t)(accum0)) & mask;
        c[j+8] = ((uint32_t)(accum1)) & mask;

        accum0 >>= 28;
        accum1 >>= 28;
    }
    
    /* fold the two leftover carries back into limbs 8 and 0 */
    accum0 += accum1;
    accum0 += c[8];
    accum1 += c[0];
    c[8] = ((uint32_t)(accum0)) & mask;
    c[0] = ((uint32_t)(accum1)) & mask;
    
    /* limbs 1 and 9 absorb the last carry and may exceed 28 bits */
    accum0 >>= 28;
    accum1 >>= 28;
    c[9] += ((uint32_t)(accum0));
    c[1] += ((uint32_t)(accum1));
 }

 /**
  * Multiply a field element by a small integer: cs = as * b.
  *
  * b is split into a low 28-bit part and a high part (b >> 28) so that it
  * lines up with the radix-2^28 limbs.  The high part is stored in a
  * uint32_t, so bits of b at position 60 and above are dropped.
  *
  * @param cs Output.  Declared __restrict__, so it must not alias as.
  * @param as Field element to scale.
  * @param b  Integer multiplier.
  */
 void
 p448_mulw (
    p448_t *__restrict__ cs,
    const p448_t *as,
    uint64_t b
 ) {
    const uint32_t bhi = b>>28, blo = b & (1<<28)-1;
    
    const uint32_t *a = as->limb;
    uint32_t *c = cs->limb;

    /* accum0 runs over limbs 0..7, accum8 over limbs 8..15 */
    uint64_t accum0, accum8;
    uint32_t mask = (1ull<<28)-1;  

    int i;

    /* limb 0 and limb 8: the bhi*a[15] term wraps around into both,
     * and bhi*a[7] lands in limb 8 */
    accum0 = widemul_32(blo, a[0]);
    accum8 = widemul_32(blo, a[8]);
    accum0 += widemul_32(bhi, a[15]);
    accum8 += widemul_32(bhi, a[15] + a[7]);

    c[0] = accum0 & mask; accum0 >>= 28;
    c[8] = accum8 & mask; accum8 >>= 28;
    
    for (i=1; i<8; i++) {
        accum0 += widemul_32(blo, a[i]);
        accum8 += widemul_32(blo, a[i+8]);
        
        /* high part of b multiplies the limb one position lower */
        accum0 += widemul_32(bhi, a[i-1]);
        accum8 += widemul_32(bhi, a[i+7]);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
    }

    /* carry out of limb 7 goes to limb 8; carry out of limb 15 goes to
     * both limb 8 and limb 0 */
    accum0 += accum8 + c[8];
    c[8] = accum0 & mask;
    c[9] += accum0 >> 28;

    accum8 += c[0];
    c[0] = accum8 & mask;
    c[1] += accum8 >> 28;
 }

 /**
  * Field squaring: cs = as * as.
  *
  * Currently just forwards to p448_mul with both factors equal; the PERF
  * marker below flags this as a known optimization opportunity.
  *
  * @param cs Output.  Declared __restrict__, so it must not alias as.
  * @param as Element to square.
  */
 void
 p448_sqr (
    p448_t *__restrict__ cs,
    const p448_t *as
 ) {
    p448_mul(cs,as,as); // PERF
 }

 /**
  * Reduce a in place to 28-bit limbs by subtracting p once and adding it
  * back if that subtraction went negative.
  *
  * The per-limb constants used below (mask everywhere, mask-1 at limb 8)
  * correspond to p = 2^448 - 2^224 - 1.
  *
  * NOTE(review): only a single conditional subtraction is performed, so the
  * input presumably has to be weakly reduced already -- confirm with callers.
  *
  * @param a Element to reduce in place.
  */
 void
 p448_strong_reduce (
    p448_t *a
 ) {
    word_t mask = (1ull<<28)-1;

    /* first, clear high */
    a->limb[8] += a->limb[15]>>28;
    a->limb[0] += a->limb[15]>>28;
    a->limb[15] &= mask;

    /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p
     * NOTE(review): these exponents look inherited from the 56-bit-limb
     * implementation; limbs here are 28 bits wide -- re-derive the bound. */

    /* compute total_value - p.  No need to reduce mod p. */

    /* signed carry; the >>= below relies on arithmetic right shift */
    dsword_t scarry = 0;
    int i;
    for (i=0; i<16; i++) {
        scarry = scarry + a->limb[i] - ((i==8)?mask-1:mask);
        a->limb[i] = scarry & mask;
        scarry >>= 28;
    }

    /* uncommon case: it was >= p, so now scarry = 0 and this = x
    * common case: it was < p, so now scarry = -1 and this = x - p + 2^448
    * so let's add back in p.  will carry back off the top for 2^448.
    */

    assert(is_zero(scarry) | is_zero(scarry+1));

    /* 28 one-bits when scarry is -1, zero when scarry is 0 */
    word_t scarry_mask = scarry & mask;
    dword_t carry = 0;

    /* add it back */
    for (i=0; i<16; i++) {
        carry = carry + a->limb[i] + ((i==8)?(scarry_mask&~1):scarry_mask);
        a->limb[i] = carry & mask;
        carry >>= 28;
    }

    /* the carry out of the top limb must cancel the borrow */
    assert(is_zero(carry + scarry));
 }

 mask_t
 p448_is_zero (
    const struct p448_t *a
 ) {
    struct p448_t b;
    p448_copy(&b,a);
    p448_strong_reduce(&b);

    uint32_t any = 0;
    int i;
    for (i=0; i<16; i++) {
        any |= b.limb[i];
    }
    return is_zero(any);
 }

 /**
  * Write the strongly reduced value of x as 56 little-endian bytes.
  *
  * Each pair of 28-bit limbs is joined into one 56-bit quantity and
  * emitted as 7 bytes, least significant byte first.
  *
  * @param serial Output buffer; 56 bytes are written.
  * @param x      Element to serialize (not modified).
  */
 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
 ) {
    p448_t red;
    int pair;

    p448_copy(&red, x);
    p448_strong_reduce(&red);

    for (pair = 0; pair < 8; pair++) {
        uint8_t *dst = &serial[7*pair];
        uint64_t limb = red.limb[2*pair] + (((uint64_t)red.limb[2*pair+1])<<28);
        int byte = 0;

        while (byte < 7) {
            dst[byte] = limb;
            limb >>= 8;
            byte++;
        }

        /* after strong reduction nothing may be left above 56 bits */
        assert(limb == 0);
    }
 }

 /**
  * Read 56 little-endian bytes into x and report whether the encoded value
  * is below p.
  *
  * Each group of 7 bytes becomes two 28-bit limbs.  x is always written,
  * whatever the result of the range check.
  *
  * @param x      Output element.
  * @param serial 56 input bytes.
  * @return All ones if the value is less than p, zero otherwise.
  */
 mask_t
 p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 ) {
    int i,j;
    for (i=0; i<8; i++) {
        uint64_t out = 0;
        for (j=0; j<7; j++) {
            out |= ((uint64_t)serial[7*i+j])<<(8*j);
        }
        /* split the 56-bit group into two 28-bit limbs */
        x->limb[2*i] = out & (1ull<<28)-1;
        x->limb[2*i+1] = out >> 28;
    }
    
    /* Check for reduction.
     *
     * The idea is to create a variable ge which is all ones (rather, 28 ones)
     * if and only if the low $i$ words of $x$ are >= those of p.
     *
     * Remember that every 28-bit limb of p is all ones, except limb 8 whose
     * lowest bit is clear (limb 8 = 1110, all others = 1111).
     */
    uint32_t ge = -1, mask = (1ull<<28)-1;
    for (i=0; i<8; i++) {
        ge &= x->limb[i];
    }
    
    /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
    ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask);
    
    /* Propagate the rest */
    for (i=9; i<16; i++) {
        ge &= x->limb[i];
    }
    
    /* ge == mask means x >= p; invert so that "in range" is all ones */
    return ~is_zero(ge ^ mask);
 }

 void
 simultaneous_invert_p448(
    struct p448_t *__restrict__ out,
    const struct p448_t *in,
    unsigned int n
 ) {
  if (n==0) {
      return;
  } else if (n==1) {
      p448_inverse(out,in);
      return;
  }
  
  p448_copy(&out[1], &in[0]);
  int i;
  for (i=1; i<(int) (n-1); i++) {
      p448_mul(&out[i+1], &out[i], &in[i]);
  }
  p448_mul(&out[0], &out[n-1], &in[n-1]);
  
  struct p448_t tmp;
  p448_inverse(&tmp, &out[0]);
  p448_copy(&out[0], &tmp);
  
  /* at this point, out[0] = product(in[i]) ^ -1
   * out[i] = product(in[0]..in[i-1]) if i != 0
   */
  for (i=n-1; i>0; i--) {
      p448_mul(&tmp, &out[i], &out[0]);
      p448_copy(&out[i], &tmp);
      
      p448_mul(&tmp, &out[0], &in[i]);
      p448_copy(&out[0], &tmp);
  }
 }
--- a/src/arch_32/p448.h
+++ b/src/arch_32/p448.h
@@ -0,0 +1,378 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __P448_H__
 #define __P448_H__ 1

 #include "word.h"

 #include <stdint.h>
 #include <assert.h>

 /**
  * A field element stored as 16 limbs of 32 bits each.  The routines in
  * this implementation treat the limbs as radix-2^28 digits (they mask with
  * 2^28-1 and carry with >>28), leaving headroom in every limb.
  * The struct is 32-byte aligned; the inline helpers access it through
  * wider register/vector types.
  */
 typedef struct p448_t {
  uint32_t limb[16];
 } __attribute__((aligned(32))) p448_t;

 #ifdef __cplusplus
 extern "C" {
 #endif

 static __inline__ void
 p448_set_ui (
    p448_t *out,
    uint64_t x
 ) __attribute__((unused,always_inline));
           
 static __inline__ void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t do_swap
 ) __attribute__((unused,always_inline));

 static __inline__ void
 p448_add (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_sub (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_neg (
    p448_t *out,
    const p448_t *a
 ) __attribute__((unused,always_inline));
            
 static __inline__ void
 p448_cond_neg (
    p448_t *a,
    mask_t doNegate
 ) __attribute__((unused,always_inline));

 static __inline__ void
 p448_addw (
    p448_t *a,
    uint32_t x
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_subw (
    p448_t *a,
    uint32_t x
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_copy (
    p448_t *out,
    const p448_t *a
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_weak_reduce (
    p448_t *inout
 ) __attribute__((unused,always_inline));
             
 /**
  * Fully reduce inout in place so that every limb fits in 28 bits.
  */
 void
 p448_strong_reduce (
    p448_t *inout
 );

 /**
  * Test whether in is zero; the test is performed on a strongly reduced
  * copy.  Returns a mask rather than a boolean.
  */
 mask_t
 p448_is_zero (
    const p448_t *in
 );
             
 static __inline__ void
 p448_bias (
    p448_t *inout,
    int amount
 ) __attribute__((unused,always_inline));

 /**
  * Field multiplication: out = a * b.  out must not alias a or b.
  */
 void
 p448_mul (
    p448_t *__restrict__ out,
    const p448_t *a,
    const p448_t *b
 );

 /**
  * Multiply a field element by a small integer: out = a * b.
  * out must not alias a.
  */
 void
 p448_mulw (
    p448_t *__restrict__ out,
    const p448_t *a,
    uint64_t b
 );

 /**
  * Field squaring: out = a * a.  out must not alias a.
  */
 void
 p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
 );
         
 static __inline__ void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) __attribute__((unused,always_inline));

 /**
  * Write the strongly reduced value of x to serial as 56 little-endian
  * bytes.
  */
 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
 );

 /**
  * Read 56 little-endian bytes into x.  Returns a mask that is all ones
  * when the encoded value is less than p and zero otherwise.
  */
 mask_t
 p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 );
    
 static __inline__ void
 p448_mask(
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) __attribute__((unused,always_inline));

 /**
 * Returns 1/x.
 * 
 * If x=0, returns 0.
 */
 void
 p448_inverse (
   struct p448_t*       a,
   const struct p448_t* x
 );
       
 /**
  * Invert the n elements of in into out using one p448_inverse call.
  * out must not overlap in.  n == 0 does nothing.
  */
 void
 simultaneous_invert_p448 (
    struct p448_t *__restrict__ out,
    const struct p448_t *in,
    unsigned int n
 );

 static inline mask_t
 p448_eq (
    const struct p448_t *a,
    const struct p448_t *b
 ) __attribute__((always_inline,unused));

 /* -------------- Inline functions begin here -------------- */

 /**
  * Set out to the integer x.
  *
  * x is spread over the three lowest radix-2^28 limbs (28 + 28 + 8 bits) so
  * that all 64 bits are represented.  Storing x>>28 directly into the
  * 32-bit limb 1 would silently drop bits 60..63 of x.  For x < 2^56 the
  * limb contents are the same as with the two-limb split.
  *
  * @param out Element to overwrite.
  * @param x   Value to store.
  */
 void
 p448_set_ui (
    p448_t *out,
    uint64_t x
 ) {
    const uint32_t mask = (1u<<28)-1;
    int i;
    out->limb[0] = x & mask;
    out->limb[1] = (x>>28) & mask;
    out->limb[2] = x>>56;   /* at most 8 bits remain */
    for (i=3; i<16; i++) {
      out->limb[i] = 0;
    }
 }
            
 /**
  * Conditionally exchange a and b without branching on the condition.
  *
  * The elements are processed in big_register_t sized chunks; each chunk
  * is swapped via XOR with the masked difference.
  *
  * @param a      First element.
  * @param b      Second element.
  * @param doswap Mask: bits that are set select the swap, clear bits leave
  *               both operands unchanged.
  */
 void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t doswap
 ) {
    big_register_t *lhs = (big_register_t*)a;
    big_register_t *rhs = (big_register_t*)b;
    big_register_t select = doswap;
    unsigned int k = 0;

    while (k < sizeof(*a)/sizeof(*lhs)) {
        /* delta is zero wherever the mask is clear */
        big_register_t delta = (lhs[k] ^ rhs[k]) & select;
        lhs[k] ^= delta;
        rhs[k] ^= delta;
        k++;
    }
 }

 /**
  * Limb-wise addition: out = a + b, with no carry propagation.
  *
  * The limbs are processed through the uint32xn_t type, several at a time.
  *
  * @param out Result (may be the same object as a or b).
  * @param a   First operand.
  * @param b   Second operand.
  */
 void
 p448_add (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) {
    const uint32xn_t *va = (const uint32xn_t*)a;
    const uint32xn_t *vb = (const uint32xn_t*)b;
    uint32xn_t *vout = (uint32xn_t*)out;
    unsigned int k;

    for (k = 0; k < sizeof(*out)/sizeof(uint32xn_t); k++) {
        vout[k] = va[k] + vb[k];
    }
 }

 /**
  * Limb-wise subtraction: out = a - b, with no borrow propagation.
  *
  * Individual limbs may wrap around; elsewhere in this code the result is
  * followed by p448_bias before it is used.
  *
  * @param out Result (may be the same object as a or b).
  * @param a   Minuend.
  * @param b   Subtrahend.
  */
 void
 p448_sub (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) {
    const uint32xn_t *va = (const uint32xn_t*)a;
    const uint32xn_t *vb = (const uint32xn_t*)b;
    uint32xn_t *vout = (uint32xn_t*)out;
    unsigned int k;

    for (k = 0; k < sizeof(*out)/sizeof(uint32xn_t); k++) {
        vout[k] = va[k] - vb[k];
    }
 }

 /**
  * Limb-wise negation: out = -a, with no borrow propagation.
  *
  * Individual limbs wrap around; elsewhere in this code the result is
  * followed by p448_bias before it is used.
  *
  * @param out Result (may be the same object as a).
  * @param a   Operand.
  */
 void
 p448_neg (
    p448_t *out,
    const p448_t *a
 ) {
    const uint32xn_t *va = (const uint32xn_t*)a;
    uint32xn_t *vout = (uint32xn_t*)out;
    unsigned int k;

    for (k = 0; k < sizeof(*out)/sizeof(uint32xn_t); k++) {
        vout[k] = -va[k];
    }
 }

 /**
  * Conditionally negate a in place without branching on the condition.
  *
  * The negated (and biased) value is always computed; the mask then picks,
  * chunk by chunk, between the original and the negated contents.
  *
  * @param a        Element to update.
  * @param doNegate Mask: set bits select the negated value, clear bits keep
  *                 the original.
  */
 void
 p448_cond_neg(
    p448_t *a,
    mask_t doNegate
 ) {
    struct p448_t negated;
    big_register_t *dst = (big_register_t *)a;
    big_register_t *neg = (big_register_t*)&negated;
    big_register_t select = doNegate;
    unsigned int k = 0;

    p448_neg(&negated, a);
    p448_bias(&negated, 2);

    while (k < sizeof(*a)/sizeof(*dst)) {
        dst[k] = (neg[k] & select) | (dst[k] & ~select);
        k++;
    }
 }

 /**
  * Add the integer x to a by adding it to the lowest limb only; no carry
  * is propagated.
  */
 void
 p448_addw (
    p448_t *a,
    uint32_t x
 ) {
  a->limb[0] += x;
 }
             
 /**
  * Subtract the integer x from a by subtracting it from the lowest limb
  * only; no borrow is propagated, so the limb may wrap around.
  */
 void
 p448_subw (
    p448_t *a,
    uint32_t x
 ) {
  a->limb[0] -= x;
 }

 /**
  * Copy a into out (plain struct assignment).
  */
 void
 p448_copy (
    p448_t *out,
    const p448_t *a
 ) {
  *out = *a;
 }

 /**
  * Add amt * p to a, limb by limb, without carry propagation.
  *
  * Every limb receives amt*(2^28-1), except limb 8 which receives
  * amt*(2^28-2); these are the limbs of p scaled by amt, so the value
  * modulo p is unchanged.
  * NOTE(review): presumably used after p448_sub / p448_neg / p448_subw so
  * that limbs which wrapped below zero become non-negative again --
  * confirm the required amounts with the generator.
  *
  * @param a   Element to bias in place.
  * @param amt Multiple of p to add.
  */
 void
 p448_bias (
    p448_t *a,
    int amt
 ) {
    uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt;
    /* hi is applied to limbs 8..11, so co2 lands on limb 8 */
    uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1};
    uint32x4_t *aa = (uint32x4_t*) a;
    aa[0] += lo;
    aa[1] += lo;
    aa[2] += hi;
    aa[3] += lo;
 }

 /**
  * Perform one round of carry propagation on a, in place.
  *
  * Each limb keeps its low 28 bits and receives the bits above 28 of the
  * limb below it.  The overflow of the top limb is added to both limb 0
  * and limb 8.  Limbs may still slightly exceed 28 bits afterwards, since
  * incoming carries are not propagated further.
  *
  * @param a Element to reduce in place.
  */
 void
 p448_weak_reduce (
    p448_t *a
 ) {
    uint64_t mask = (1ull<<28) - 1;
    /* overflow of the top limb, taken before the loop changes limb 15 */
    uint64_t tmp = a->limb[15] >> 28;
    int i;
    a->limb[8] += tmp;
    /* descending order: limb[i-1] is read before it is itself masked */
    for (i=15; i>0; i--) {
        a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28);
    }
    a->limb[0] = (a->limb[0] & mask) + tmp;
 }

 /**
  * Square x repeatedly: y = x^(2^n).
  *
  * Squarings alternate between y and a scratch element because p448_sqr
  * does not allow its output to alias its input.
  *
  * @param y Output (must not alias x).
  * @param x Element to square.
  * @param n Number of squarings; must be positive (asserted).
  */
 void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) {
    p448_t scratch;
    assert(n>0);

    /* first step: one squaring for odd n, two for even n, so that the
     * remaining count is even and the result always ends up in y */
    if ((n&1) == 0) {
        p448_sqr(&scratch,x);
        p448_sqr(y,&scratch);
        n -= 2;
    } else {
        p448_sqr(y,x);
        n -= 1;
    }

    /* remaining squarings, two at a time */
    while (n) {
        p448_sqr(&scratch,y);
        p448_sqr(y,&scratch);
        n -= 2;
    }
 }

 mask_t
 p448_eq (
    const struct p448_t *a,
    const struct p448_t *b
 ) {
    struct p448_t ra, rb;
    p448_copy(&ra, a);
    p448_copy(&rb, b);
    p448_weak_reduce(&ra);
    p448_weak_reduce(&rb);
    p448_sub(&ra, &ra, &rb);
    p448_bias(&ra, 2);
    return p448_is_zero(&ra);
 }

 /**
  * Set a to b with every limb ANDed with mask.
  *
  * @param a    Output element.
  * @param b    Input element.
  * @param mask Value ANDed into every limb.
  */
 void
 p448_mask (
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) {
    unsigned int k = 0;
    while (k < sizeof(*a)/sizeof(a->limb[0])) {
        a->limb[k] = mask & b->limb[k];
        k++;
    }
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __P448_H__ */
--- a/src/arch_arm_32/ec_point.c
+++ b/src/arch_arm_32/ec_point.c
@@ -0,0 +1,959 @@
 /**
 * @cond internal
 * @file ec_point.c
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @warning This file was automatically generated.
 */

 #include "ec_point.h"


 /**
 * Raise x to the fixed exponent 2^446 - 2^222 - 1 and store the result in a.
 *
 * The exponent is reached with an addition chain of squarings and
 * multiplications.  Along the way the temporaries hold x^(2^3-1), x^(2^9-1),
 * x^(2^18-1), x^(2^37-1), x^(2^111-1), x^(2^222-1) and x^(2^223-1); the
 * last 223 squarings and the final multiply combine the latter two.
 *
 * NOTE(review): is_square() in this file checks x * a^2 == 1, so this is
 * presumably an inverse square root of x when x is a nonzero square.
 */
 void
 p448_isr (
    struct p448_t*       a,
    const struct p448_t* x
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L1,     x );
    p448_mul  (   &L2,     x,   &L1 );
    p448_sqr  (   &L1,   &L2 );
    p448_mul  (   &L2,     x,   &L1 );
    p448_sqrn (   &L1,   &L2,     3 );
    p448_mul  (   &L0,   &L2,   &L1 );
    p448_sqrn (   &L1,   &L0,     3 );
    p448_mul  (   &L0,   &L2,   &L1 );
    p448_sqrn (   &L2,   &L0,     9 );
    p448_mul  (   &L1,   &L0,   &L2 );
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (   &L2,     x,   &L0 );
    p448_sqrn (   &L0,   &L2,    18 );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_sqrn (   &L0,   &L2,    37 );
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_sqrn (   &L0,   &L1,    37 );
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_sqrn (   &L0,   &L1,   111 );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_sqr  (   &L0,   &L2 );
    p448_mul  (   &L1,     x,   &L0 );
    p448_sqrn (   &L0,   &L1,   223 );
    p448_mul  (     a,   &L2,   &L0 );
 }

 void
 p448_inverse (
    struct p448_t*       a,
    const struct p448_t* x
 ) {
    struct p448_t L0, L1;
    p448_isr  (   &L0,     x );
    p448_sqr  (   &L1,   &L0 );
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (     a,     x,   &L0 );
 }

 /**
 * Add e (fields a, b, c) into d, in place.
 *
 * Writing x, y, z, t, u for the incoming coordinates of d and
 *   A = e->a*(y-x),  B = e->b*(x+y),  C = e->c*(u*t),
 * the outgoing coordinates are
 *   u = A+B,  t = B-A,  x = (z-C)*t,  y = (z+C)*u,  z = (z+C)*(z-C).
 *
 * Every p448_sub result is biased by 2 and weakly reduced before it is used
 * as a multiplication input.
 */
 void
 add_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 ) {
    struct p448_t L0, L1;
    p448_sub  (   &L1, &d->y, &d->x );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L0, &e->a,   &L1 );
    p448_add  (   &L1, &d->x, &d->y );
    p448_mul  ( &d->y, &e->b,   &L1 );
    p448_mul  (   &L1, &d->u, &d->t );
    p448_mul  ( &d->x, &e->c,   &L1 );
    p448_add  ( &d->u,   &L0, &d->y );
    p448_sub  ( &d->t, &d->y,   &L0 );
    p448_bias ( &d->t,     2 );
    p448_weak_reduce( &d->t );
    p448_sub  ( &d->y, &d->z, &d->x );
    p448_bias ( &d->y,     2 );
    p448_weak_reduce( &d->y );
    p448_add  (   &L0, &d->x, &d->z );
    p448_mul  ( &d->z,   &L0, &d->y );
    p448_mul  ( &d->x, &d->y, &d->t );
    p448_mul  ( &d->y,   &L0, &d->u );
 }

 /**
 * Subtract e (fields a, b, c) from d, in place.
 *
 * Same shape as add_tw_niels_to_tw_extensible with the roles of e->a and e->b
 * exchanged and the sign of the e->c term flipped.  Writing x, y, z, t, u for
 * the incoming coordinates of d and
 *   A = e->b*(y-x),  B = e->a*(x+y),  C = e->c*(u*t),
 * the outgoing coordinates are
 *   u = A+B,  t = B-A,  x = (z+C)*t,  y = (z-C)*u,  z = (z-C)*(z+C).
 */
 void
 sub_tw_niels_from_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 ) {
    struct p448_t L0, L1;
    p448_sub  (   &L1, &d->y, &d->x );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L0, &e->b,   &L1 );
    p448_add  (   &L1, &d->x, &d->y );
    p448_mul  ( &d->y, &e->a,   &L1 );
    p448_mul  (   &L1, &d->u, &d->t );
    p448_mul  ( &d->x, &e->c,   &L1 );
    p448_add  ( &d->u,   &L0, &d->y );
    p448_sub  ( &d->t, &d->y,   &L0 );
    p448_bias ( &d->t,     2 );
    p448_weak_reduce( &d->t );
    p448_add  ( &d->y, &d->x, &d->z );
    p448_sub  (   &L0, &d->z, &d->x );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    p448_mul  ( &d->z,   &L0, &d->y );
    p448_mul  ( &d->x, &d->y, &d->t );
    p448_mul  ( &d->y,   &L0, &d->u );
 }

 void
 add_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 ) {
    struct p448_t L0;
    p448_mul  (   &L0, &e->z, &a->z );
    p448_copy ( &e->z,   &L0 );
    add_tw_niels_to_tw_extensible(     e, &a->n );
 }

 void
 sub_tw_pniels_from_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 ) {
    struct p448_t L0;
    p448_mul  (   &L0, &e->z, &a->z );
    p448_copy ( &e->z,   &L0 );
    sub_tw_niels_from_tw_extensible(     e, &a->n );
 }

 /**
 * Double a in place.
 *
 * Writing x, y, z for the incoming coordinates and
 *   S = x^2 + y^2,  D = y^2 - x^2,  P = (x+y)^2 - S,  E = 2*z^2 - D,
 * the outgoing coordinates are
 *   u = S,  t = P,  x = E*P,  y = D*S,  z = D*E.
 *
 * z^2 is biased by 1 before being doubled, so 2*z^2 carries a bias of 2
 * ahead of the subtraction of D.
 */
 void
 double_tw_extensible (
    struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->x );
    p448_sqr  (   &L0, &a->y );
    p448_add  ( &a->u,   &L2,   &L0 );
    p448_add  ( &a->t, &a->y, &a->x );
    p448_sqr  (   &L1, &a->t );
    p448_sub  ( &a->t,   &L1, &a->u );
    p448_bias ( &a->t,     3 );
    p448_weak_reduce( &a->t );
    p448_sub  (   &L1,   &L0,   &L2 );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_sqr  ( &a->x, &a->z );
    p448_bias ( &a->x,     1 );
    p448_add  ( &a->z, &a->x, &a->x );
    p448_sub  (   &L0, &a->z,   &L1 );
    p448_weak_reduce(   &L0 );
    p448_mul  ( &a->z,   &L1,   &L0 );
    p448_mul  ( &a->x,   &L0, &a->t );
    p448_mul  ( &a->y,   &L1, &a->u );
 }

 /**
 * Double a in place.
 *
 * Writing x, y, z for the incoming coordinates and
 *   S = x^2 + y^2,  D = y^2 - x^2,  P = (x+y)^2 - S,  E = 2*z^2 - S,
 * the outgoing coordinates are
 *   u = D,  t = P,  x = E*P,  y = S*D,  z = S*E.
 *
 * z^2 is biased by 2 before being doubled, so 2*z^2 carries a bias of 4
 * ahead of the subtraction of the unreduced sum S.
 */
 void
 double_extensible (
    struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->x );
    p448_sqr  (   &L0, &a->y );
    p448_add  (   &L1,   &L2,   &L0 );
    p448_add  ( &a->t, &a->y, &a->x );
    p448_sqr  ( &a->u, &a->t );
    p448_sub  ( &a->t, &a->u,   &L1 );
    p448_bias ( &a->t,     3 );
    p448_weak_reduce( &a->t );
    p448_sub  ( &a->u,   &L0,   &L2 );
    p448_bias ( &a->u,     2 );
    p448_weak_reduce( &a->u );
    p448_sqr  ( &a->x, &a->z );
    p448_bias ( &a->x,     2 );
    p448_add  ( &a->z, &a->x, &a->x );
    p448_sub  (   &L0, &a->z,   &L1 );
    p448_weak_reduce(   &L0 );
    p448_mul  ( &a->z,   &L1,   &L0 );
    p448_mul  ( &a->x,   &L0, &a->t );
    p448_mul  ( &a->y,   &L1, &a->u );
 }

 /**
 * Compute b from a (an extensible_t in, a tw_extensible_t out).
 *
 * Writing x, y, z for the coordinates of a and
 *   S = x^2 + y^2,  D = y^2 - x^2,  P = (x+y)^2 - S,  E = 2*z^2 - S,
 * the coordinates of b are
 *   u = S,  t = P,  x = E*P,  y = D*S,  z = D*E.
 *
 * The fields of b double as scratch space until the final three multiplies.
 */
 void
 twist_and_double (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    struct p448_t L0;
    p448_sqr  ( &b->x, &a->x );
    p448_sqr  ( &b->z, &a->y );
    p448_add  ( &b->u, &b->x, &b->z );
    p448_add  ( &b->t, &a->y, &a->x );
    p448_sqr  (   &L0, &b->t );
    p448_sub  ( &b->t,   &L0, &b->u );
    p448_bias ( &b->t,     3 );
    p448_weak_reduce( &b->t );
    p448_sub  (   &L0, &b->z, &b->x );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    p448_sqr  ( &b->x, &a->z );
    p448_bias ( &b->x,     2 );
    p448_add  ( &b->z, &b->x, &b->x );
    p448_sub  ( &b->y, &b->z, &b->u );
    p448_weak_reduce( &b->y );
    p448_mul  ( &b->z,   &L0, &b->y );
    p448_mul  ( &b->x, &b->y, &b->t );
    p448_mul  ( &b->y,   &L0, &b->u );
 }

 /**
 * Compute b from a (a tw_extensible_t in, an extensible_t out).
 *
 * Writing x, y, z for the coordinates of a and
 *   S = x^2 + y^2,  D = y^2 - x^2,  P = (x+y)^2 - S,  E = 2*z^2 - D,
 * the coordinates of b are
 *   u = D,  t = P,  x = E*P,  y = S*D,  z = S*E.
 *
 * The fields of b double as scratch space until the final three multiplies.
 */
 void
 untwist_and_double (
    struct extensible_t*          b,
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0;
    p448_sqr  ( &b->x, &a->x );
    p448_sqr  ( &b->z, &a->y );
    p448_add  (   &L0, &b->x, &b->z );
    p448_add  ( &b->t, &a->y, &a->x );
    p448_sqr  ( &b->u, &b->t );
    p448_sub  ( &b->t, &b->u,   &L0 );
    p448_bias ( &b->t,     3 );
    p448_weak_reduce( &b->t );
    p448_sub  ( &b->u, &b->z, &b->x );
    p448_bias ( &b->u,     2 );
    p448_weak_reduce( &b->u );
    p448_sqr  ( &b->x, &a->z );
    p448_bias ( &b->x,     1 );
    p448_add  ( &b->z, &b->x, &b->x );
    p448_sub  ( &b->y, &b->z, &b->u );
    p448_weak_reduce( &b->y );
    p448_mul  ( &b->z,   &L0, &b->y );
    p448_mul  ( &b->x, &b->y, &b->t );
    p448_mul  ( &b->y,   &L0, &b->u );
 }

 /**
 * Fill b from the affine point a:
 *   b->n.a = y - x,  b->n.b = x + y,  b->n.c = -78164 * x * y,  b->z = 2.
 *
 * b->z is used as scratch for the 78164 multiple before it is set to 2.
 */
 void
 convert_tw_affine_to_tw_pniels (
    struct tw_pniels_t*       b,
    const struct tw_affine_t* a
 ) {
    p448_sub  ( &b->n.a, &a->y, &a->x );
    p448_bias ( &b->n.a,     2 );
    p448_weak_reduce( &b->n.a );
    p448_add  ( &b->n.b, &a->x, &a->y );
    p448_weak_reduce( &b->n.b );
    p448_mul  ( &b->n.c, &a->y, &a->x );
    p448_mulw ( &b->z, &b->n.c, 78164 );
    p448_neg  ( &b->n.c, &b->z );
    p448_bias ( &b->n.c,     2 );
    p448_weak_reduce( &b->n.c );
    p448_set_ui( &b->z,     2 );
 }

 /**
 * Fill b from the affine point a: x and t both receive a->x, y and u both
 * receive a->y, and z is set to 1.
 */
 void
 convert_tw_affine_to_tw_extensible (
    struct tw_extensible_t*   b,
    const struct tw_affine_t* a
 ) {
    /* x-derived fields */
    p448_copy ( &b->x, &a->x );
    p448_copy ( &b->t, &a->x );
    /* y-derived fields */
    p448_copy ( &b->y, &a->y );
    p448_copy ( &b->u, &a->y );
    p448_set_ui ( &b->z, 1 );
 }

 /**
 * Fill b from the affine point a: x and t both receive a->x, y and u both
 * receive a->y, and z is set to 1.
 */
 void
 convert_affine_to_extensible (
    struct extensible_t*   b,
    const struct affine_t* a
 ) {
    /* x-derived fields */
    p448_copy ( &b->x, &a->x );
    p448_copy ( &b->t, &a->x );
    /* y-derived fields */
    p448_copy ( &b->y, &a->y );
    p448_copy ( &b->u, &a->y );
    p448_set_ui ( &b->z, 1 );
 }

 /**
 * Fill b from the extensible point a:
 *   b->n.a = y - x,  b->n.b = x + y,  b->n.c = -78164 * u * t,  b->z = 2*z.
 *
 * b->z is used as scratch for the 78164 multiple before it receives 2*z.
 */
 void
 convert_tw_extensible_to_tw_pniels (
    struct tw_pniels_t*           b,
    const struct tw_extensible_t* a
 ) {
    p448_sub  ( &b->n.a, &a->y, &a->x );
    p448_bias ( &b->n.a,     2 );
    p448_weak_reduce( &b->n.a );
    p448_add  ( &b->n.b, &a->x, &a->y );
    p448_weak_reduce( &b->n.b );
    p448_mul  ( &b->n.c, &a->u, &a->t );
    p448_mulw ( &b->z, &b->n.c, 78164 );
    p448_neg  ( &b->n.c, &b->z );
    p448_bias ( &b->n.c,     2 );
    p448_weak_reduce( &b->n.c );
    p448_add  ( &b->z, &a->z, &a->z );
    p448_weak_reduce( &b->z );
 }

 /**
 * Fill e from d:
 *   u = n.b + n.a,  t = n.b - n.a,  x = z*t,  y = z*u,  z = d->z^2.
 *
 * Only t is biased and weakly reduced; u is passed to p448_mul as the
 * unreduced sum.
 */
 void
 convert_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* d
 ) {
    p448_add  ( &e->u, &d->n.b, &d->n.a );
    p448_sub  ( &e->t, &d->n.b, &d->n.a );
    p448_bias ( &e->t,     2 );
    p448_weak_reduce( &e->t );
    p448_mul  ( &e->x, &d->z, &e->t );
    p448_mul  ( &e->y, &d->z, &e->u );
    p448_sqr  ( &e->z, &d->z );
 }

 /**
 * Fill e from d:
 *   y = b + a,  x = b - a,  z = 1,  t = x,  u = y.
 *
 * Both x and y are weakly reduced before being copied into t and u.
 */
 void
 convert_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  e,
    const struct tw_niels_t* d
 ) {
    p448_add  ( &e->y, &d->b, &d->a );
    p448_weak_reduce( &e->y );
    p448_sub  ( &e->x, &d->b, &d->a );
    p448_bias ( &e->x,     2 );
    p448_weak_reduce( &e->x );
    p448_set_ui( &e->z,     1 );
    p448_copy ( &e->t, &e->x );
    p448_copy ( &e->u, &e->y );
 }

 /**
 * Advance the ladder state a by one step, in place.  a->z0 is only read.
 *
 * Writing xa, za, xd, zd for the incoming values and
 *   A = xd + zd,  B = xd - zd,  C = xa - za,  D = xa + za,
 *   E = A^2 - B^2,
 * the outgoing values are
 *   xa = z0 * (B*D + A*C)^2
 *   za = (A*C - B*D)^2
 *   xd = 39082 * A^2 * B^2
 *   zd = E * (39082*A^2 - E)
 *
 * The fields of a are reused as scratch, so the statement order matters.
 */
 void
 montgomery_step (
    struct montgomery_t* a
 ) {
    struct p448_t L0, L1;
    p448_add  (   &L0, &a->zd, &a->xd );
    p448_sub  (   &L1, &a->xd, &a->zd );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_sub  ( &a->zd, &a->xa, &a->za );
    p448_bias ( &a->zd,     2 );
    p448_weak_reduce( &a->zd );
    p448_mul  ( &a->xd,   &L0, &a->zd );
    p448_add  ( &a->zd, &a->za, &a->xa );
    p448_mul  ( &a->za,   &L1, &a->zd );
    p448_add  ( &a->xa, &a->za, &a->xd );
    p448_sqr  ( &a->zd, &a->xa );
    p448_mul  ( &a->xa, &a->z0, &a->zd );
    p448_sub  ( &a->zd, &a->xd, &a->za );
    p448_bias ( &a->zd,     2 );
    p448_weak_reduce( &a->zd );
    p448_sqr  ( &a->za, &a->zd );
    p448_sqr  ( &a->xd,   &L0 );
    p448_sqr  (   &L0,   &L1 );
    p448_mulw ( &a->zd, &a->xd, 39082 );
    p448_sub  (   &L1, &a->xd,   &L0 );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  ( &a->xd,   &L0, &a->zd );
    p448_sub  (   &L0, &a->zd,   &L1 );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    p448_mul  ( &a->zd,   &L0,   &L1 );
 }

 /**
 * Initialise the ladder state a from sbz:
 *   z0 = sbz^2,  za = z0,  xa = 1,  xd = 1,  zd = 0.
 */
 void
 deserialize_montgomery (
    struct montgomery_t* a,
    const struct p448_t* sbz
 ) {
    p448_sqr ( &a->z0, sbz );
    p448_copy ( &a->za, &a->z0 );
    p448_set_ui ( &a->xa, 1 );
    p448_set_ui ( &a->xd, 1 );
    p448_set_ui ( &a->zd, 0 );
 }

 /**
 * Produce the field element b from the ladder state a and the original input
 * sbz, and return a mask.
 *
 * Visible structure:
 *  - L6 is first built as (P - Q)*(P + Q) with P = za*(z0*zd - xd) and
 *    Q = xa*(z0*xd - zd).
 *  - L3 = xd * (4*z0 - 39082*(z0+1)^2) is substituted (via p448_mask and
 *    p448_add) for a->zd when a->zd is zero; in that same case 1 (= -L1) is
 *    added to sbz*L6.
 *  - A single p448_isr call is applied to the combined product; b receives
 *    L4 * isr(L6), masked with ~L1 so it becomes zero when a->zd is zero.
 *  - The return value is p448_is_zero(L6 * isr(L6)^2 - 1) OR'ed with
 *    p448_is_zero(sbz).
 *
 * NOTE(review): the mask arithmetic (-L1 used as 0/1, ~L1 used as a select)
 * presumes p448_is_zero returns all-ones for zero and 0 otherwise; confirm.
 */
 mask_t
 serialize_montgomery (
    struct p448_t*             b,
    const struct montgomery_t* a,
    const struct p448_t*       sbz
 ) {
    mask_t L0, L1, L2;
    struct p448_t L3, L4, L5, L6;
    p448_mul  (   &L6, &a->z0, &a->zd );
    p448_sub  (   &L4,   &L6, &a->xd );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L6, &a->za,   &L4 );
    p448_mul  (   &L5, &a->z0, &a->xd );
    p448_sub  (   &L4,   &L5, &a->zd );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L3, &a->xa,   &L4 );
    p448_add  (   &L5,   &L3,   &L6 );
    p448_sub  (   &L4,   &L6,   &L3 );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L6,   &L4,   &L5 );
    p448_copy (   &L5, &a->z0 );
    p448_addw (   &L5,     1 );
    p448_sqr  (   &L4,   &L5 );
    p448_mulw (   &L5,   &L4, 39082 );
    p448_neg  (   &L4,   &L5 );
    p448_add  (   &L5, &a->z0, &a->z0 );
    p448_bias (   &L5,     1 );
    p448_add  (   &L3,   &L5,   &L5 );
    p448_add  (   &L5,   &L3,   &L4 );
    p448_weak_reduce(   &L5 );
    p448_mul  (   &L3, &a->xd,   &L5 );
       L1 = p448_is_zero( &a->zd );
       L2 = -   L1;
    p448_mask (   &L4,   &L3,    L1 );
    p448_add  (   &L5,   &L4, &a->zd );
       L0 = ~   L1;
    p448_mul  (   &L4,   sbz,   &L6 );
    p448_addw (   &L4,    L2 );
    p448_mul  (   &L6,   &L5,   &L4 );
    p448_mul  (   &L4,   &L6,   &L5 );
    p448_mul  (   &L5,   &L6, &a->xd );
    p448_mul  (   &L6,   &L4,   &L5 );
    p448_isr  (   &L3,   &L6 );
    p448_mul  (   &L5,   &L4,   &L3 );
    p448_sqr  (   &L4,   &L3 );
    p448_mul  (   &L3,   &L6,   &L4 );
    p448_mask (     b,   &L5,    L0 );
    p448_subw (   &L3,     1 );
    p448_bias (   &L3,     1 );
       L1 = p448_is_zero(   &L3 );
       L0 = p448_is_zero(   sbz );
    return    L1 |    L0;
 }

 /**
 * Produce the field element b from the extensible point a.
 *
 * With M = (y - z)*z*x, the value passed to p448_isr is
 *   R = M*(y - z) * M*(z + y),
 * and the output is b = M*(y - z) * isr(R).
 *
 * The last two statements compute R * isr(R)^2 into the local L0, which is
 * never read afterwards; this function returns void and performs no check.
 */
 void
 serialize_extensible (
    struct p448_t*             b,
    const struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sub  (   &L0, &a->y, &a->z );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    p448_add  (     b, &a->z, &a->y );
    p448_mul  (   &L1, &a->z, &a->x );
    p448_mul  (   &L2,   &L0,   &L1 );
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_mul  (   &L0,   &L2,     b );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_isr  (   &L0,   &L2 );
    p448_mul  (     b,   &L1,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,   &L2,   &L1 );
 }

 /**
 * Produce the field element b from the twisted extensible point a.
 *
 * Visible structure: L3 = x*y is kept for the final multiply.  The quantity
 * (x+y)^2 - 2*x*y is doubled and scaled twice by 39082 (with a negation
 * after each scaling), combined with z^4, and passed through one p448_isr
 * call.  The output is b = L1 * L3, where L1 is the scaled value times the
 * isr result.
 *
 * The p448_sqr into b and the following p448_mul into L0 near the end
 * produce values that are overwritten or never read before the function
 * returns.
 */
 void
 untwist_and_double_and_serialize (
    struct p448_t*                b,
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_mul  (   &L3, &a->y, &a->x );
    p448_add  (     b, &a->y, &a->x );
    p448_sqr  (   &L1,     b );
    p448_add  (   &L2,   &L3,   &L3 );
    p448_sub  (     b,   &L1,   &L2 );
    p448_bias (     b,     3 );
    p448_weak_reduce(     b );
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1,   &L2 );
    p448_add  (   &L2,     b,     b );
    p448_mulw (     b,   &L2, 39082 );
    p448_neg  (   &L2,     b );
    p448_bias (   &L2,     2 );
    p448_mulw (   &L0,   &L2, 39082 );
    p448_neg  (     b,   &L0 );
    p448_bias (     b,     2 );
    p448_mul  (   &L0,   &L2,   &L1 );
    p448_mul  (   &L2,     b,   &L0 );
    p448_isr  (   &L0,   &L2 );
    p448_mul  (   &L1,     b,   &L0 );
    p448_sqr  (     b,   &L0 );
    p448_mul  (   &L0,   &L2,     b );
    p448_mul  (     b,   &L1,   &L3 );
 }

 /**
 * Compute the tw_extensible_t b from the extensible_t a.
 *
 * Writing x, y, z for the coordinates of a,
 *   W = (z - y)*(z - x)*y  and  s = p448_isr( W^2 * (z^2 - x^2) ),
 * the outputs are
 *   b->x = x*W*s,  b->y = y*W*s,  b->z = 1,  b->t = b->x,  b->u = b->y.
 * When z - y is zero, -L1 (L1 being the p448_is_zero mask) is additionally
 * added to b->y.
 *
 * The fields of b are used as scratch throughout; b->t and b->x briefly hold
 * isr-derived values that are overwritten before the function returns.
 *
 * NOTE(review): -L1 is presumably 1 when the mask is all-ones; confirm
 * against p448_is_zero.
 */
 void
 twist_even (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    mask_t L0, L1;
    p448_sqr  ( &b->y, &a->z );
    p448_sqr  ( &b->z, &a->x );
    p448_sub  ( &b->u, &b->y, &b->z );
    p448_bias ( &b->u,     2 );
    p448_weak_reduce( &b->u );
    p448_sub  ( &b->z, &a->z, &a->x );
    p448_bias ( &b->z,     2 );
    p448_weak_reduce( &b->z );
    p448_mul  ( &b->y, &b->z, &a->y );
    p448_sub  ( &b->z, &a->z, &a->y );
    p448_bias ( &b->z,     2 );
    p448_weak_reduce( &b->z );
    p448_mul  ( &b->x, &b->z, &b->y );
    p448_mul  ( &b->t, &b->x, &b->u );
    p448_mul  ( &b->y, &b->x, &b->t );
    p448_isr  ( &b->t, &b->y );
    p448_mul  ( &b->u, &b->x, &b->t );
    p448_sqr  ( &b->x, &b->t );
    p448_mul  ( &b->t, &b->y, &b->x );
    p448_mul  ( &b->x, &a->x, &b->u );
    p448_mul  ( &b->y, &a->y, &b->u );
       L1 = p448_is_zero( &b->z );
       L0 = -   L1;
    p448_addw ( &b->y,    L0 );
    p448_weak_reduce( &b->y );
    p448_set_ui( &b->z,     1 );
    p448_copy ( &b->t, &b->x );
    p448_copy ( &b->u, &b->y );
 }

 /**
 * Compute the tw_extensible_t b from the extensible_t a.
 *
 * Visible structure:
 *  - b->u = 4*(z^2 - x^2), b->y = z - x and b->z = z - y are formed from a.
 *  - One p448_isr call is applied to a product built from those values and
 *    a->y.  Unlike twist_even, the value (input * isr^2) is kept in b->t
 *    and used to mix (x + y) and (x - y) into the outputs.
 *  - -mask corrections are added to b->x when z - x is zero and to b->y
 *    when z - y is zero.
 *  - b->z is set to p448_is_zero(&a->y) + 1, and finally t = x, u = y.
 *
 * NOTE(review): the mask arithmetic presumes p448_is_zero returns all-ones
 * or 0, making "mask + 1" either 0 or 1; confirm.  Going by the name, this
 * routine is intended for tests only.
 */
 void
 test_only_twist (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3;
    p448_sqr  ( &b->u, &a->z );
    p448_sqr  ( &b->y, &a->x );
    p448_sub  ( &b->z, &b->u, &b->y );
    p448_bias ( &b->z,     2 );
    p448_add  ( &b->y, &b->z, &b->z );
    p448_add  ( &b->u, &b->y, &b->y );
    p448_weak_reduce( &b->u );
    p448_sub  ( &b->y, &a->z, &a->x );
    p448_bias ( &b->y,     2 );
    p448_weak_reduce( &b->y );
    p448_mul  ( &b->x, &b->y, &a->y );
    p448_sub  ( &b->z, &a->z, &a->y );
    p448_bias ( &b->z,     2 );
    p448_weak_reduce( &b->z );
    p448_mul  ( &b->t, &b->z, &b->x );
    p448_mul  (   &L3, &b->t, &b->u );
    p448_mul  ( &b->x, &b->t,   &L3 );
    p448_isr  (   &L2, &b->x );
    p448_mul  ( &b->u, &b->t,   &L2 );
    p448_sqr  (   &L3,   &L2 );
    p448_mul  ( &b->t, &b->x,   &L3 );
    p448_add  ( &b->x, &a->y, &a->x );
    p448_weak_reduce( &b->x );
    p448_sub  (   &L2, &a->x, &a->y );
    p448_bias (   &L2,     2 );
    p448_weak_reduce(   &L2 );
    p448_mul  (   &L3, &b->t,   &L2 );
    p448_add  (   &L2,   &L3, &b->x );
    p448_sub  ( &b->t, &b->x,   &L3 );
    p448_bias ( &b->t,     2 );
    p448_weak_reduce( &b->t );
    p448_mul  ( &b->x,   &L2, &b->u );
       L0 = p448_is_zero( &b->y );
       L1 = -   L0;
    p448_addw ( &b->x,    L1 );
    p448_weak_reduce( &b->x );
    p448_mul  ( &b->y, &b->t, &b->u );
       L0 = p448_is_zero( &b->z );
       L1 = -   L0;
    p448_addw ( &b->y,    L1 );
    p448_weak_reduce( &b->y );
       L1 = p448_is_zero( &a->y );
       L0 =    L1 +     1;
    p448_set_ui( &b->z,    L0 );
    p448_copy ( &b->t, &b->x );
    p448_copy ( &b->u, &b->y );
 }

 mask_t
 is_square (
    const struct p448_t* x
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3;
    p448_isr  (   &L2,     x );
    p448_sqr  (   &L3,   &L2 );
    p448_mul  (   &L2,     x,   &L3 );
    p448_subw (   &L2,     1 );
    p448_bias (   &L2,     1 );
       L1 = p448_is_zero(   &L2 );
       L0 = p448_is_zero(     x );
    return    L1 |    L0;
 }

 mask_t
 is_even_pt (
    const struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1, &a->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    return is_square (   &L0 );
 }

 mask_t
 is_even_tw (
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1, &a->x );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_weak_reduce(   &L0 );
    return is_square (   &L0 );
 }

 /**
 * Fill the affine point a from the field element sz and return a mask.
 *
 * Visible structure, with s2 = sz^2:
 *  - W = 4*s2 - 39082*(s2 + 1)^2 and N = 1 - s2 are formed.
 *  - One p448_isr call is applied to a product built from N and W.
 *  - a->x = sz * 2 * (N^2 * isr),  a->y = (s2 + 1) * L3, where L3 is the
 *    isr-derived factor computed just before.
 *  - The return value is p448_is_zero(N * L3 - 1).
 *
 * a->x and a->y serve as scratch until their final assignments.
 */
 mask_t
 deserialize_affine (
    struct affine_t*     a,
    const struct p448_t* sz
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_sqr  (   &L1,    sz );
    p448_copy (   &L3,   &L1 );
    p448_addw (   &L3,     1 );
    p448_sqr  ( &a->x,   &L3 );
    p448_mulw (   &L3, &a->x, 39082 );
    p448_neg  ( &a->x,   &L3 );
    p448_add  (   &L3,   &L1,   &L1 );
    p448_bias (   &L3,     1 );
    p448_add  ( &a->y,   &L3,   &L3 );
    p448_add  (   &L3, &a->y, &a->x );
    p448_weak_reduce(   &L3 );
    p448_copy ( &a->y,   &L1 );
    p448_subw ( &a->y,     1 );
    p448_neg  ( &a->x, &a->y );
    p448_bias ( &a->x,     2 );
    p448_weak_reduce( &a->x );
    p448_mul  ( &a->y, &a->x,   &L3 );
    p448_sqr  (   &L2, &a->x );
    p448_mul  (   &L0,   &L2, &a->y );
    p448_mul  ( &a->y, &a->x,   &L0 );
    p448_isr  (   &L3, &a->y );
    p448_mul  ( &a->y,   &L2,   &L3 );
    p448_sqr  (   &L2,   &L3 );
    p448_mul  (   &L3,   &L0,   &L2 );
    p448_mul  (   &L0, &a->x,   &L3 );
    p448_add  (   &L2, &a->y, &a->y );
    p448_mul  ( &a->x,    sz,   &L2 );
    p448_addw (   &L1,     1 );
    p448_mul  ( &a->y,   &L1,   &L3 );
    p448_subw (   &L0,     1 );
    p448_bias (   &L0,     1 );
    return p448_is_zero(   &L0 );
 }

 /**
 * Fill the tw_extensible_t a from the field elements sz and sdm1 and return
 * a mask.
 *
 * Visible structure, with s2 = sz^2 (held in a->z for most of the routine):
 *  - 4*s2 - 39082*(s2 + 1)^2 and sdm1*(1 - s2^2) are formed and combined.
 *  - One p448_isr call is applied to the combined product.
 *  - a->x is (1 - s2) times an isr-derived factor times 2*sz; a->y is
 *    (s2 + 1) times an isr-derived factor.
 *  - The mask p448_is_zero(check - 1) is captured in ret before a->t is
 *    overwritten.
 *  - Finally z = 1, t = x, u = y.
 *
 * The fields of a are used as scratch throughout, so statement order matters.
 */
 mask_t
 deserialize_and_twist_approx (
    struct tw_extensible_t* a,
    const struct p448_t*    sdm1,
    const struct p448_t*    sz
 ) {
    struct p448_t L0, L1;
    p448_sqr  ( &a->z,    sz );
    p448_copy ( &a->y, &a->z );
    p448_addw ( &a->y,     1 );
    p448_sqr  ( &a->x, &a->y );
    p448_mulw ( &a->y, &a->x, 39082 );
    p448_neg  ( &a->x, &a->y );
    p448_add  ( &a->y, &a->z, &a->z );
    p448_bias ( &a->y,     1 );
    p448_add  ( &a->u, &a->y, &a->y );
    p448_add  ( &a->y, &a->u, &a->x );
    p448_weak_reduce( &a->y );
    p448_sqr  ( &a->x, &a->z );
    p448_subw ( &a->x,     1 );
    p448_neg  ( &a->u, &a->x );
    p448_bias ( &a->u,     2 );
    p448_weak_reduce( &a->u );
    p448_mul  ( &a->x,  sdm1, &a->u );
    p448_mul  (   &L0, &a->x, &a->y );
    p448_mul  ( &a->t,   &L0, &a->y );
    p448_mul  ( &a->u, &a->x, &a->t );
    p448_mul  ( &a->t, &a->u,   &L0 );
    p448_mul  ( &a->y, &a->x, &a->t );
    p448_isr  (   &L0, &a->y );
    p448_mul  ( &a->y, &a->u,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  ( &a->u, &a->t,   &L1 );
    p448_mul  ( &a->t, &a->x, &a->u );
    p448_add  ( &a->x,    sz,    sz );
    p448_mul  (   &L0, &a->u, &a->x );
    p448_copy ( &a->x, &a->z );
    p448_subw ( &a->x,     1 );
    p448_neg  (   &L1, &a->x );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  ( &a->x,   &L1,   &L0 );
    p448_mul  (   &L0, &a->u, &a->y );
    p448_addw ( &a->z,     1 );
    p448_mul  ( &a->y, &a->z,   &L0 );
    p448_subw ( &a->t,     1 );
    p448_bias ( &a->t,     1 );
    mask_t ret = p448_is_zero( &a->t );
    p448_set_ui( &a->z,     1 );
    p448_copy ( &a->t, &a->x );
    p448_copy ( &a->u, &a->y );
    return ret;
 }

 /**
 * Set a to (x, y, z, t, u) = (0, 1, 1, 0, 0).
 */
 void
 set_identity_extensible (
    struct extensible_t* a
 ) {
    /* fields that become one */
    p448_set_ui ( &a->y, 1 );
    p448_set_ui ( &a->z, 1 );
    /* fields that become zero */
    p448_set_ui ( &a->x, 0 );
    p448_set_ui ( &a->t, 0 );
    p448_set_ui ( &a->u, 0 );
 }

 /**
 * Set a to (x, y, z, t, u) = (0, 1, 1, 0, 0).
 */
 void
 set_identity_tw_extensible (
    struct tw_extensible_t* a
 ) {
    /* fields that become one */
    p448_set_ui ( &a->y, 1 );
    p448_set_ui ( &a->z, 1 );
    /* fields that become zero */
    p448_set_ui ( &a->x, 0 );
    p448_set_ui ( &a->t, 0 );
    p448_set_ui ( &a->u, 0 );
 }

 /**
 * Set a to (x, y) = (0, 1).
 */
 void
 set_identity_affine (
    struct affine_t* a
 ) {
    p448_set_ui ( &a->y, 1 );
    p448_set_ui ( &a->x, 0 );
 }

 mask_t
 eq_affine (
    const struct affine_t* a,
    const struct affine_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2;
    p448_sub  (   &L2, &a->x, &b->x );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_sub  (   &L2, &a->y, &b->y );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 mask_t
 eq_extensible (
    const struct extensible_t* a,
    const struct extensible_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 mask_t
 eq_tw_extensible (
    const struct tw_extensible_t* a,
    const struct tw_extensible_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 /**
 * Fill the affine point a from the field element r.
 *
 * Visible structure:
 *  - r^2 (in a->x) and r^4 (in L3) are formed first; L9 = 1 - r^4.
 *  - Several fixed multiples of L9^2 and r^4 are combined using p448_mulw
 *    with the constants 1527402724, 6108985600, 6109454568, 78160 and
 *    3054649120.
 *  - One p448_isr call is applied to a product of those combinations.
 *  - a->x and a->y are assembled from the isr-derived factors; a->x ends
 *    with a multiplication by 78160 followed by a negation.
 *  - Finally -L1 is added to a->y, where L1 = p448_is_zero(L9).
 *
 * NOTE(review): 6108985600 and 6109454568 do not fit in 32 bits; they rely
 * on the second argument of p448_mulw being uint64_t.  Whether the 32-bit
 * implementation of p448_mulw handles such operands is not visible here;
 * confirm.
 */
 void
 elligator_2s_inject (
    struct affine_t*     a,
    const struct p448_t* r
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5, L6, L7, L8, L9;
    p448_sqr  ( &a->x,     r );
    p448_sqr  (   &L3, &a->x );
    p448_copy ( &a->y,   &L3 );
    p448_subw ( &a->y,     1 );
    p448_neg  (   &L9, &a->y );
    p448_bias (   &L9,     2 );
    p448_weak_reduce(   &L9 );
    p448_sqr  (   &L2,   &L9 );
    p448_mulw (   &L8,   &L2, 1527402724 );
    p448_mulw (   &L7,   &L3, 6108985600 );
    p448_add  ( &a->y,   &L7,   &L8 );
    p448_weak_reduce( &a->y );
    p448_mulw (   &L8,   &L2, 6109454568 );
    p448_sub  (   &L7, &a->y,   &L8 );
    p448_bias (   &L7,     2 );
    p448_weak_reduce(   &L7 );
    p448_mulw (   &L4, &a->y, 78160 );
    p448_mul  (   &L6,   &L7,   &L9 );
    p448_mul  (   &L8,   &L6,   &L4 );
    p448_mul  (   &L4,   &L7,   &L8 );
    p448_isr  (   &L5,   &L4 );
    p448_mul  (   &L4,   &L6,   &L5 );
    p448_sqr  (   &L6,   &L5 );
    p448_mul  (   &L5,   &L8,   &L6 );
    p448_mul  (   &L8,   &L7,   &L5 );
    p448_mul  (   &L7,   &L8,   &L5 );
    p448_copy (   &L5, &a->x );
    p448_subw (   &L5,     1 );
    p448_addw ( &a->x,     1 );
    p448_mul  (   &L6, &a->x,   &L8 );
    p448_sub  ( &a->x,   &L5,   &L6 );
    p448_bias ( &a->x,     3 );
    p448_weak_reduce( &a->x );
    p448_mul  (   &L5,   &L4, &a->x );
    p448_mulw (   &L4,   &L5, 78160 );
    p448_neg  ( &a->x,   &L4 );
    p448_bias ( &a->x,     2 );
    p448_weak_reduce( &a->x );
    p448_add  (   &L4,   &L3,   &L3 );
    p448_add  (   &L3,   &L4,   &L2 );
    p448_subw (   &L3,     2 );
    p448_bias (   &L3,     1 );
    p448_weak_reduce(   &L3 );
    p448_mul  (   &L2,   &L3,   &L8 );
    p448_mulw (   &L3,   &L2, 3054649120 );
    p448_add  (   &L2,   &L3, &a->y );
    p448_mul  ( &a->y,   &L7,   &L2 );
       L1 = p448_is_zero(   &L9 );
       L0 = -   L1;
    p448_addw ( &a->y,    L0 );
    p448_weak_reduce( &a->y );
 }

 /**
 * Check the affine point a against the relation
 *   0 = x^2 + y^2 - 1 + 39081 * x^2 * y^2
 * and return the p448_is_zero mask of the right-hand side.
 *
 * The 39081 term is built as y^2 * (-(39081 * x^2)) and then subtracted.
 */
 mask_t
 validate_affine (
    const struct affine_t* a
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_sqr  (   &L0, &a->y );
    p448_sqr  (   &L2, &a->x );
    p448_add  (   &L3,   &L2,   &L0 );
    p448_subw (   &L3,     1 );
    p448_mulw (   &L1,   &L2, 39081 );
    p448_neg  (   &L2,   &L1 );
    p448_bias (   &L2,     2 );
    p448_mul  (   &L1,   &L0,   &L2 );
    p448_sub  (   &L0,   &L3,   &L1 );
    p448_bias (   &L0,     3 );
    return p448_is_zero(   &L0 );
 }

 /**
 * Check the two invariants spelled out in the comments below on the twisted
 * extensible point ext and return the AND of the two p448_is_zero masks.
 *
 * In the second invariant the d*t^2*u^2 term is computed as
 * -(39081 * t^2 * u^2), i.e. d is -39081 here.  The p448_addw(..., 0) calls
 * add zero and leave their operand unchanged.
 */
 mask_t
 validate_tw_extensible (
    const struct tw_extensible_t* ext
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5;
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L2, &ext->t, &ext->u );
    p448_mul  (   &L4, &ext->z,   &L2 );
    p448_addw (   &L4,     0 );
    p448_mul  (   &L3, &ext->x, &ext->y );
    p448_neg  (   &L2,   &L3 );
    p448_add  (   &L3,   &L2,   &L4 );
    p448_bias (   &L3,     2 );
       L1 = p448_is_zero(   &L3 );
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
     */
    p448_sqr  (   &L4, &ext->y );
    p448_neg  (   &L2,   &L4 );
    p448_addw (   &L2,     0 );
    p448_sqr  (   &L3, &ext->x );
    p448_add  (   &L4,   &L3,   &L2 );
    p448_sqr  (   &L5, &ext->u );
    p448_sqr  (   &L3, &ext->t );
    p448_mul  (   &L2,   &L3,   &L5 );
    p448_mulw (   &L3,   &L2, 39081 );
    p448_neg  (   &L5,   &L3 );
    p448_add  (   &L3,   &L5,   &L4 );
    p448_neg  (   &L5,   &L2 );
    p448_add  (   &L4,   &L5,   &L3 );
    p448_sqr  (   &L3, &ext->z );
    p448_add  (   &L2,   &L3,   &L4 );
    p448_bias (   &L2,     4 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 /**
 * Check the two invariants spelled out in the comments below on the
 * extensible point ext and return the AND of the two p448_is_zero masks.
 *
 * In the first invariant the d*t^2*u^2 term is computed as
 * -(39081 * t^2 * u^2), i.e. d is -39081 here.  The p448_addw(..., 0) calls
 * add zero and leave their operand unchanged.
 */
 mask_t
 validate_extensible (
    const struct extensible_t* ext
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5;
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 - x^2 - y^2 + z^2
     */
    p448_sqr  (   &L4, &ext->y );
    p448_neg  (   &L3,   &L4 );
    p448_addw (   &L3,     0 );
    p448_sqr  (   &L2, &ext->z );
    p448_add  (   &L4,   &L2,   &L3 );
    p448_sqr  (   &L5, &ext->u );
    p448_sqr  (   &L2, &ext->t );
    p448_mul  (   &L3,   &L2,   &L5 );
    p448_mulw (   &L5,   &L3, 39081 );
    p448_neg  (   &L2,   &L5 );
    p448_add  (   &L3,   &L2,   &L4 );
    p448_sqr  (   &L2, &ext->x );
    p448_neg  (   &L4,   &L2 );
    p448_add  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     4 );
       L1 = p448_is_zero(   &L2 );
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L3, &ext->t, &ext->u );
    p448_mul  (   &L4, &ext->z,   &L3 );
    p448_addw (   &L4,     0 );
    p448_mul  (   &L2, &ext->x, &ext->y );
    p448_neg  (   &L3,   &L2 );
    p448_add  (   &L2,   &L3,   &L4 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }


--- a/src/arch_arm_32/p448.c
+++ b/src/arch_arm_32/p448.c
--- a/src/arch_arm_32/p448.h
+++ b/src/arch_arm_32/p448.h
@@ -0,0 +1,378 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __P448_H__
 #define __P448_H__ 1

 #include "word.h"

 #include <stdint.h>
 #include <assert.h>

 /**
 * Field element stored as 16 unsigned 32-bit limbs, least-significant limb
 * first.  The inline helpers in this header treat a limb as nominally 28 bits
 * wide (see the 28-bit masks and shifts in p448_set_ui and p448_weak_reduce);
 * additions and subtractions are performed limb-wise without carrying, and
 * p448_weak_reduce brings limbs back towards 28 bits.
 * The struct is aligned to 32 bytes.
 */
 typedef struct p448_t {
  uint32_t limb[16];
 } __attribute__((aligned(32))) p448_t;

 #ifdef __cplusplus
 extern "C" {
 #endif

 static __inline__ void
 p448_set_ui (
    p448_t *out,
    uint64_t x
 ) __attribute__((unused,always_inline));
           
 static __inline__ void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t do_swap
 ) __attribute__((unused,always_inline));

 static __inline__ void
 p448_add (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_sub (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_neg (
    p448_t *out,
    const p448_t *a
 ) __attribute__((unused,always_inline));
            
 static __inline__ void
 p448_cond_neg (
    p448_t *a,
    mask_t doNegate
 ) __attribute__((unused,always_inline));

 static __inline__ void
 p448_addw (
    p448_t *a,
    uint32_t x
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_subw (
    p448_t *a,
    uint32_t x
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_copy (
    p448_t *out,
    const p448_t *a
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_weak_reduce (
    p448_t *inout
 ) __attribute__((unused,always_inline));
             
 void
 p448_strong_reduce (
    p448_t *inout
 );

 mask_t
 p448_is_zero (
    const p448_t *in
 );
             
 static __inline__ void
 p448_bias (
    p448_t *inout,
    int amount
 ) __attribute__((unused,always_inline));

 void
 p448_mul (
    p448_t *__restrict__ out,
    const p448_t *a,
    const p448_t *b
 );

 void
 p448_mulw (
    p448_t *__restrict__ out,
    const p448_t *a,
    uint64_t b
 );

 void
 p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
 );
         
 static __inline__ void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) __attribute__((unused,always_inline));

 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
 );

 mask_t
 p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 );
    
 static __inline__ void
 p448_mask(
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) __attribute__((unused,always_inline));

 /**
 * Returns 1/x.
 * 
 * If x=0, returns 0.
 */
 void
 p448_inverse (
   struct p448_t*       a,
   const struct p448_t* x
 );
       
 void
 simultaneous_invert_p448 (
    struct p448_t *__restrict__ out,
    const struct p448_t *in,
    unsigned int n
 );

 static inline mask_t
 p448_eq (
    const struct p448_t *a,
    const struct p448_t *b
 ) __attribute__((always_inline,unused));

 /* -------------- Inline functions begin here -------------- */

 /**
 * Set out to the integer x.
 *
 * x is split into 28-bit pieces across limbs 0..2 (bits 0-27, 28-55 and
 * 56-63); all higher limbs are cleared.
 *
 * The previous form stored x>>28 directly into the 32-bit limb[1], which
 * silently dropped bits 60..63 of x and could leave limb[1] wider than 28
 * bits.  For x < 2^56 the stored limbs are exactly what they were before.
 */
 void
 p448_set_ui (
    p448_t *out,
    uint64_t x
 ) {
    const uint32_t mask = ((uint32_t)1<<28) - 1;
    int i;
    out->limb[0] = (uint32_t)(x & mask);
    out->limb[1] = (uint32_t)((x>>28) & mask);
    out->limb[2] = (uint32_t)(x>>56);
    for (i=3; i<16; i++) {
      out->limb[i] = 0;
    }
 }
            
 /**
 * Exchange a and b when doswap has its bits set, and leave both unchanged
 * when doswap is 0, using only XOR/AND over big_register_t words (no branch
 * on doswap).
 *
 * NOTE(review): presumably doswap is either all-ones or 0; a partial mask
 * would swap only the selected bits.
 */
 void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t doswap
 ) {
    big_register_t *lhs = (big_register_t*)a;
    big_register_t *rhs = (big_register_t*)b;
    big_register_t select = doswap;
    unsigned int k;

    for (k=0; k<sizeof(*a)/sizeof(*lhs); k++) {
        /* delta is the XOR difference where selected, zero elsewhere */
        big_register_t delta = (lhs[k]^rhs[k]) & select;
        lhs[k] ^= delta;
        rhs[k] ^= delta;
    }
 }

 /**
 * out = a + b, computed one uint32xn_t chunk at a time with no carry between
 * limbs.  out may be the same object as a or b.
 *
 * NOTE(review): assumes uint32xn_t is a vector of 32-bit lanes, making this
 * equivalent to out->limb[i] = a->limb[i] + b->limb[i] for every limb.
 */
 void
 p448_add (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) {
    uint32xn_t *dst = (uint32xn_t*)out;
    const uint32xn_t *lhs = (const uint32xn_t*)a;
    const uint32xn_t *rhs = (const uint32xn_t*)b;
    unsigned int k;

    for (k=0; k<sizeof(*out)/sizeof(uint32xn_t); k++) {
        dst[k] = lhs[k] + rhs[k];
    }
 }

 /**
 * out = a - b, computed one uint32xn_t chunk at a time with no borrow between
 * limbs.  out may be the same object as a or b.
 *
 * NOTE(review): assumes uint32xn_t is a vector of 32-bit lanes, making this
 * equivalent to out->limb[i] = a->limb[i] - b->limb[i] for every limb.
 */
 void
 p448_sub (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) {
    uint32xn_t *dst = (uint32xn_t*)out;
    const uint32xn_t *lhs = (const uint32xn_t*)a;
    const uint32xn_t *rhs = (const uint32xn_t*)b;
    unsigned int k;

    for (k=0; k<sizeof(*out)/sizeof(uint32xn_t); k++) {
        dst[k] = lhs[k] - rhs[k];
    }
 }

 /**
 * out = -a, computed one uint32xn_t chunk at a time.  out may be the same
 * object as a.
 *
 * NOTE(review): assumes uint32xn_t is a vector of 32-bit lanes, making this
 * equivalent to out->limb[i] = -a->limb[i] for every limb.
 */
 void
 p448_neg (
    p448_t *out,
    const p448_t *a
 ) {
    uint32xn_t *dst = (uint32xn_t*)out;
    const uint32xn_t *src = (const uint32xn_t*)a;
    unsigned int k;

    for (k=0; k<sizeof(*out)/sizeof(uint32xn_t); k++) {
        dst[k] = -src[k];
    }
 }

 /**
 * Replace a with its negation (biased by 2) where the bits of doNegate are
 * set, and keep a where they are clear.  The negation is always computed and
 * the choice is made with AND/OR masking, without branching on doNegate.
 *
 * NOTE(review): presumably doNegate is either all-ones or 0.
 */
 void
 p448_cond_neg(
    p448_t *a,
    mask_t doNegate
 ) {
    struct p448_t flipped;
    big_register_t *dst = (big_register_t *)a;
    big_register_t *alt = (big_register_t*)&flipped;
    big_register_t pick = doNegate;
    unsigned int k;

    p448_neg(&flipped, a);
    p448_bias(&flipped, 2);

    for (k=0; k<sizeof(*a)/sizeof(*dst); k++) {
        dst[k] = (alt[k] & pick) | (dst[k] & ~pick);
    }
 }

 /**
 * Add the word x to the lowest limb of a, in place.
 * No carry is propagated into the higher limbs.
 */
 void
 p448_addw (
    p448_t *a,
    uint32_t x
 ) {
    a->limb[0] = a->limb[0] + x;
 }
             
 /**
 * Subtract the word x from the lowest limb of a, in place.
 * No borrow is propagated into the higher limbs.
 */
 void
 p448_subw (
    p448_t *a,
    uint32_t x
 ) {
    a->limb[0] = a->limb[0] - x;
 }

 /**
 * Copy the whole field element a into out by struct assignment.
 */
 void
 p448_copy (
    p448_t *out,
    const p448_t *a
 ) {
    out[0] = a[0];
 }

 /**
 * Add amt*(2^28-1) to every limb of a, except limb 8 (the first lane of the
 * third 4-limb group), which receives amt*(2^28-2).
 *
 * NOTE(review): this looks like adding amt times the field modulus so that a
 * preceding subtraction or negation cannot leave a limb wrapped around;
 * confirm against the callers.
 */
 void
 p448_bias (
    p448_t *a,
    int amt
 ) {
    uint32_t plain = ((1ull<<28)-1)*amt;
    uint32_t special = plain-amt;
    uint32x4_t common = {plain,plain,plain,plain};
    uint32x4_t with_special = {special,plain,plain,plain};
    uint32x4_t *quads = (uint32x4_t*) a;

    quads[0] += common;
    quads[1] += common;
    quads[3] += common;
    quads[2] += with_special; /* limb 8 lives in lane 0 of this group */
 }

 /**
 * Carry-propagate a in place: every limb keeps its low 28 bits and receives
 * the bits above bit 28 of the limb below it.  The overflow of the top limb
 * (limb 15) is folded back into both limb 0 and limb 8.
 */
 void
 p448_weak_reduce (
    p448_t *a
 ) {
    uint64_t low28 = (1ull<<28) - 1;
    uint64_t top_carry = a->limb[15] >> 28;
    int idx = 15;

    /* Fold the top carry into limb 8 first, so the sweep below sees it. */
    a->limb[8] += top_carry;
    while (idx > 0) {
        a->limb[idx] = (a->limb[idx] & low28) + (a->limb[idx-1]>>28);
        idx--;
    }
    a->limb[0] = (a->limb[0] & low28) + top_carry;
 }

 /**
 * Square x a total of n times and store the result in y (y = x^(2^n)).
 * n must be positive; this is enforced with assert().
 *
 * A scratch element is used so that p448_sqr never receives the same object
 * as both output and input.
 */
 void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) {
    p448_t scratch;
    assert(n>0);
    if ((n % 2) == 0) {
        /* Even count: two squarings land the value in y. */
        p448_sqr(&scratch,x);
        p448_sqr(y,&scratch);
        n -= 2;
    } else {
        /* Odd count: one squaring goes straight into y. */
        p448_sqr(y,x);
        n -= 1;
    }
    /* The remaining count is even; consume it two squarings at a time. */
    while (n != 0) {
        p448_sqr(&scratch,y);
        p448_sqr(y,&scratch);
        n -= 2;
    }
 }

 mask_t
 p448_eq (
    const struct p448_t *a,
    const struct p448_t *b
 ) {
    struct p448_t ra, rb;
    p448_copy(&ra, a);
    p448_copy(&rb, b);
    p448_weak_reduce(&ra);
    p448_weak_reduce(&rb);
    p448_sub(&ra, &ra, &rb);
    p448_bias(&ra, 2);
    return p448_is_zero(&ra);
 }

 /**
 * Set a to b with every limb ANDed against mask.
 */
 void
 p448_mask (
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) {
    const unsigned int count = sizeof(*a)/sizeof(a->limb[0]);
    unsigned int k = 0;
    while (k < count) {
        a->limb[k] = mask & b->limb[k];
        k++;
    }
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __P448_H__ */
--- a/src/arch_x86_64/ec_point.c
+++ b/src/arch_x86_64/ec_point.c
@@ -0,0 +1,910 @@
 /**
 * @cond internal
 * @file ec_point.c
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @warning This file was automatically generated.
 */

 #include "ec_point.h"


 /*
  * Raise x to the power 2^446 - 2^222 - 1 using a fixed chain of
  * p448_sqr / p448_sqrn / p448_mul calls, and store the result in a.
  * The per-line exponents below assume p448_sqr and p448_mul are field
  * squaring and multiplication, and that p448_sqrn(y,x,n) is x^(2^n).
  *
  * NOTE(review): for p = 2^448 - 2^224 - 1 this exponent equals (p-3)/4,
  * so the result is presumably an inverse square root of x when x is a
  * nonzero square -- confirm against the declaration in ec_point.h.
  *
  * a  - output element.
  * x  - input element; it is read again late in the chain.
  */
 void
 p448_isr (
    struct p448_t*       a,
    const struct p448_t* x
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L1,     x );         /* x^2 */
    p448_mul  (   &L2,     x,   &L1 );  /* x^3 */
    p448_sqr  (   &L1,   &L2 );         /* x^6 */
    p448_mul  (   &L2,     x,   &L1 );  /* x^(2^3 - 1) */
    p448_sqrn (   &L1,   &L2,     3 );
    p448_mul  (   &L0,   &L2,   &L1 );  /* x^(2^6 - 1) */
    p448_sqrn (   &L1,   &L0,     3 );
    p448_mul  (   &L0,   &L2,   &L1 );  /* x^(2^9 - 1) */
    p448_sqrn (   &L2,   &L0,     9 );
    p448_mul  (   &L1,   &L0,   &L2 );  /* x^(2^18 - 1) */
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (   &L2,     x,   &L0 );  /* x^(2^19 - 1) */
    p448_sqrn (   &L0,   &L2,    18 );
    p448_mul  (   &L2,   &L1,   &L0 );  /* x^(2^37 - 1) */
    p448_sqrn (   &L0,   &L2,    37 );
    p448_mul  (   &L1,   &L2,   &L0 );  /* x^(2^74 - 1) */
    p448_sqrn (   &L0,   &L1,    37 );
    p448_mul  (   &L1,   &L2,   &L0 );  /* x^(2^111 - 1) */
    p448_sqrn (   &L0,   &L1,   111 );
    p448_mul  (   &L2,   &L1,   &L0 );  /* x^(2^222 - 1) */
    p448_sqr  (   &L0,   &L2 );
    p448_mul  (   &L1,     x,   &L0 );  /* x^(2^223 - 1) */
    p448_sqrn (   &L0,   &L1,   223 );  /* x^(2^446 - 2^223) */
    p448_mul  (     a,   &L2,   &L0 );  /* x^(2^446 - 2^222 - 1) */
 }

 void
 p448_inverse (
    struct p448_t*       a,
    const struct p448_t* x
 ) {
    struct p448_t L0, L1;
    p448_isr  (   &L0,     x );
    p448_sqr  (   &L1,   &L0 );
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (     a,     x,   &L0 );
 }

 /*
  * Combine the point e (fields a, b, c) into the point d, in place.
  *
  * Every coordinate of d is overwritten, and several of them are reused
  * as temporaries, so the statement order matters.  Each p448_sub is
  * followed by p448_bias on its result.
  *
  * NOTE(review): the bias presumably keeps limbs non-negative after a
  * subtraction; confirm against this architecture's p448_sub/p448_bias.
  */
 void
 add_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 ) {
    struct p448_t L0, L1;
    p448_sub  (   &L1, &d->y, &d->x );
    p448_bias (   &L1,     2 );
    p448_mul  (   &L0, &e->a,   &L1 );  /* L0  = e.a * (d.y - d.x) */
    p448_add  (   &L1, &d->x, &d->y );
    p448_mul  ( &d->y, &e->b,   &L1 );  /* d.y = e.b * (d.x + d.y) */
    p448_mul  (   &L1, &d->u, &d->t );
    p448_mul  ( &d->x, &e->c,   &L1 );  /* d.x = e.c * d.u * d.t */
    p448_add  ( &d->u,   &L0, &d->y );
    p448_sub  ( &d->t, &d->y,   &L0 );
    p448_bias ( &d->t,     2 );
    p448_sub  ( &d->y, &d->z, &d->x );  /* d.y = d.z - d.x (temporary) */
    p448_bias ( &d->y,     2 );
    p448_add  (   &L0, &d->x, &d->z );  /* L0  = d.x + d.z */
    p448_mul  ( &d->z,   &L0, &d->y );
    p448_mul  ( &d->x, &d->y, &d->t );
    p448_mul  ( &d->y,   &L0, &d->u );
 }

 /*
  * Counterpart of add_tw_niels_to_tw_extensible: the same sequence with
  * the roles of e->a and e->b exchanged, and with (z + x) and (z - x)
  * exchanged in the final products.  d is updated in place and its
  * coordinates are reused as temporaries, so the statement order matters.
  */
 void
 sub_tw_niels_from_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 ) {
    struct p448_t L0, L1;
    p448_sub  (   &L1, &d->y, &d->x );
    p448_bias (   &L1,     2 );
    p448_mul  (   &L0, &e->b,   &L1 );  /* L0  = e.b * (d.y - d.x) */
    p448_add  (   &L1, &d->x, &d->y );
    p448_mul  ( &d->y, &e->a,   &L1 );  /* d.y = e.a * (d.x + d.y) */
    p448_mul  (   &L1, &d->u, &d->t );
    p448_mul  ( &d->x, &e->c,   &L1 );  /* d.x = e.c * d.u * d.t */
    p448_add  ( &d->u,   &L0, &d->y );
    p448_sub  ( &d->t, &d->y,   &L0 );
    p448_bias ( &d->t,     2 );
    p448_add  ( &d->y, &d->x, &d->z );  /* d.y = d.x + d.z (temporary) */
    p448_sub  (   &L0, &d->z, &d->x );  /* L0  = d.z - d.x */
    p448_bias (   &L0,     2 );
    p448_mul  ( &d->z,   &L0, &d->y );
    p448_mul  ( &d->x, &d->y, &d->t );
    p448_mul  ( &d->y,   &L0, &d->u );
 }

 void
 add_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 ) {
    struct p448_t L0;
    p448_mul  (   &L0, &e->z, &a->z );
    p448_copy ( &e->z,   &L0 );
    add_tw_niels_to_tw_extensible(     e, &a->n );
 }

 void
 sub_tw_pniels_from_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 ) {
    struct p448_t L0;
    p448_mul  (   &L0, &e->z, &a->z );
    p448_copy ( &e->z,   &L0 );
    sub_tw_niels_from_tw_extensible(     e, &a->n );
 }

 /*
  * In-place update of the point a (name suggests point doubling).
  *
  * With x, y, z the input coordinates, the intermediate values are:
  *   u  = x^2 + y^2
  *   t  = (x + y)^2 - u
  *   L1 = y^2 - x^2
  *   L0 = 2*z^2 - L1
  * and the outputs are z = L1*L0, x = L0*t, y = L1*u (t and u are kept).
  * p448_bias is applied after subtractions and after the z^2 squaring.
  */
 void
 double_tw_extensible (
    struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->x );
    p448_sqr  (   &L0, &a->y );
    p448_add  ( &a->u,   &L2,   &L0 );
    p448_add  ( &a->t, &a->y, &a->x );
    p448_sqr  (   &L1, &a->t );
    p448_sub  ( &a->t,   &L1, &a->u );
    p448_bias ( &a->t,     3 );
    p448_sub  (   &L1,   &L0,   &L2 );
    p448_bias (   &L1,     2 );
    p448_sqr  ( &a->x, &a->z );
    p448_bias ( &a->x,     2 );
    p448_add  ( &a->z, &a->x, &a->x );
    p448_sub  (   &L0, &a->z,   &L1 );
    p448_mul  ( &a->z,   &L1,   &L0 );
    p448_mul  ( &a->x,   &L0, &a->t );
    p448_mul  ( &a->y,   &L1, &a->u );
 }

 /*
  * In-place update of the point a (name suggests point doubling).
  *
  * With x, y, z the input coordinates, the intermediate values are:
  *   L1 = x^2 + y^2
  *   t  = (x + y)^2 - L1
  *   u  = y^2 - x^2
  *   L0 = 2*z^2 - L1
  * and the outputs are z = L1*L0, x = L0*t, y = L1*u (t and u are kept).
  * Compared with double_tw_extensible, the sum and the difference of the
  * squares swap roles.
  */
 void
 double_extensible (
    struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->x );
    p448_sqr  (   &L0, &a->y );
    p448_add  (   &L1,   &L2,   &L0 );
    p448_add  ( &a->t, &a->y, &a->x );
    p448_sqr  ( &a->u, &a->t );
    p448_sub  ( &a->t, &a->u,   &L1 );
    p448_bias ( &a->t,     3 );
    p448_sub  ( &a->u,   &L0,   &L2 );
    p448_bias ( &a->u,     2 );
    p448_sqr  ( &a->x, &a->z );
    p448_bias ( &a->x,     2 );
    p448_add  ( &a->z, &a->x, &a->x );
    p448_sub  (   &L0, &a->z,   &L1 );
    p448_mul  ( &a->z,   &L1,   &L0 );
    p448_mul  ( &a->x,   &L0, &a->t );
    p448_mul  ( &a->y,   &L1, &a->u );
 }

 /*
  * Compute b from a; a is only read and every coordinate of b is written.
  *
  * With x, y, z the coordinates of a:
  *   b.u = x^2 + y^2
  *   b.t = (x + y)^2 - b.u
  *   L0  = y^2 - x^2
  *   tmp = 2*z^2 - b.u          (held in b.y)
  * and finally b.z = L0*tmp, b.x = tmp*b.t, b.y = L0*b.u.
  * The fields of b double as temporaries, so the statement order matters.
  */
 void
 twist_and_double (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    struct p448_t L0;
    p448_sqr  ( &b->x, &a->x );
    p448_sqr  ( &b->z, &a->y );
    p448_add  ( &b->u, &b->x, &b->z );
    p448_add  ( &b->t, &a->y, &a->x );
    p448_sqr  (   &L0, &b->t );
    p448_sub  ( &b->t,   &L0, &b->u );
    p448_bias ( &b->t,     3 );
    p448_sub  (   &L0, &b->z, &b->x );
    p448_bias (   &L0,     2 );
    p448_sqr  ( &b->x, &a->z );
    p448_bias ( &b->x,     2 );
    p448_add  ( &b->z, &b->x, &b->x );
    p448_sub  ( &b->y, &b->z, &b->u );
    p448_mul  ( &b->z,   &L0, &b->y );
    p448_mul  ( &b->x, &b->y, &b->t );
    p448_mul  ( &b->y,   &L0, &b->u );
 }

 /*
  * Compute b from a; a is only read and every coordinate of b is written.
  *
  * With x, y, z the coordinates of a:
  *   L0  = x^2 + y^2
  *   b.t = (x + y)^2 - L0
  *   b.u = y^2 - x^2
  *   tmp = 2*z^2 - b.u          (held in b.y)
  * and finally b.z = L0*tmp, b.x = tmp*b.t, b.y = L0*b.u.
  * The fields of b double as temporaries, so the statement order matters.
  */
 void
 untwist_and_double (
    struct extensible_t*          b,
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0;
    p448_sqr  ( &b->x, &a->x );
    p448_sqr  ( &b->z, &a->y );
    p448_add  (   &L0, &b->x, &b->z );
    p448_add  ( &b->t, &a->y, &a->x );
    p448_sqr  ( &b->u, &b->t );
    p448_sub  ( &b->t, &b->u,   &L0 );
    p448_bias ( &b->t,     3 );
    p448_sub  ( &b->u, &b->z, &b->x );
    p448_bias ( &b->u,     2 );
    p448_sqr  ( &b->x, &a->z );
    p448_bias ( &b->x,     2 );
    p448_add  ( &b->z, &b->x, &b->x );
    p448_sub  ( &b->y, &b->z, &b->u );
    p448_mul  ( &b->z,   &L0, &b->y );
    p448_mul  ( &b->x, &b->y, &b->t );
    p448_mul  ( &b->y,   &L0, &b->u );
 }

 /*
  * Fill b from the affine point a:
  *   b.n.a = a.y - a.x
  *   b.n.b = a.x + a.y
  *   b.n.c = -(78164 * a.x * a.y)
  *   b.z   = 2
  * b.n.a, b.n.b and b.n.c are each weakly reduced before returning.
  * b->z is used as scratch for the scaled product before it is set to 2.
  */
 void
 convert_tw_affine_to_tw_pniels (
    struct tw_pniels_t*       b,
    const struct tw_affine_t* a
 ) {
    p448_sub  ( &b->n.a, &a->y, &a->x );
    p448_bias ( &b->n.a,     2 );
    p448_weak_reduce( &b->n.a );
    p448_add  ( &b->n.b, &a->x, &a->y );
    p448_weak_reduce( &b->n.b );
    p448_mul  ( &b->n.c, &a->y, &a->x );
    p448_mulw ( &b->z, &b->n.c, 78164 );
    p448_neg  ( &b->n.c, &b->z );
    p448_bias ( &b->n.c,     2 );
    p448_weak_reduce( &b->n.c );
    p448_set_ui( &b->z,     2 );
 }

 /*
  * Build b from the affine point a: x and y are copied, z is set to 1,
  * and the pair (t, u) is set to (a.x, a.y).
  */
 void
 convert_tw_affine_to_tw_extensible (
    struct tw_extensible_t*   b,
    const struct tw_affine_t* a
 ) {
    /* Coordinates, with z = 1. */
    p448_copy(&b->x, &a->x);
    p448_copy(&b->y, &a->y);
    p448_set_ui(&b->z, 1);

    /* (t, u) start out as copies of (x, y). */
    p448_copy(&b->t, &a->x);
    p448_copy(&b->u, &a->y);
 }

 /*
  * Build b from the affine point a: x and y are copied, z is set to 1,
  * and the pair (t, u) is set to (a.x, a.y).
  */
 void
 convert_affine_to_extensible (
    struct extensible_t*   b,
    const struct affine_t* a
 ) {
    /* Coordinates, with z = 1. */
    p448_copy(&b->x, &a->x);
    p448_copy(&b->y, &a->y);
    p448_set_ui(&b->z, 1);

    /* (t, u) start out as copies of (x, y). */
    p448_copy(&b->t, &a->x);
    p448_copy(&b->u, &a->y);
 }

 /*
  * Fill b from the extensible point a:
  *   b.n.a = a.y - a.x
  *   b.n.b = a.x + a.y
  *   b.n.c = -(78164 * a.u * a.t)
  *   b.z   = a.z + a.z
  * All four outputs are weakly reduced.  b->z is used as scratch for the
  * scaled product before it receives its final value.
  */
 void
 convert_tw_extensible_to_tw_pniels (
    struct tw_pniels_t*           b,
    const struct tw_extensible_t* a
 ) {
    p448_sub  ( &b->n.a, &a->y, &a->x );
    p448_bias ( &b->n.a,     2 );
    p448_weak_reduce( &b->n.a );
    p448_add  ( &b->n.b, &a->x, &a->y );
    p448_weak_reduce( &b->n.b );
    p448_mul  ( &b->n.c, &a->u, &a->t );
    p448_mulw ( &b->z, &b->n.c, 78164 );
    p448_neg  ( &b->n.c, &b->z );
    p448_bias ( &b->n.c,     2 );
    p448_weak_reduce( &b->n.c );
    p448_add  ( &b->z, &a->z, &a->z );
    p448_weak_reduce( &b->z );
 }

 /*
  * Fill e from the pniels point d:
  *   e.u = d.n.b + d.n.a
  *   e.t = d.n.b - d.n.a
  *   e.x = d.z * e.t
  *   e.y = d.z * e.u
  *   e.z = d.z^2
  */
 void
 convert_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* d
 ) {
    p448_add  ( &e->u, &d->n.b, &d->n.a );
    p448_sub  ( &e->t, &d->n.b, &d->n.a );
    p448_bias ( &e->t,     2 );
    p448_mul  ( &e->x, &d->z, &e->t );
    p448_mul  ( &e->y, &d->z, &e->u );
    p448_sqr  ( &e->z, &d->z );
 }

 /*
  * Fill e from the niels point d:
  *   e.y = d.b + d.a   (weakly reduced)
  *   e.x = d.b - d.a   (biased, then weakly reduced)
  *   e.z = 1
  *   e.t = e.x, e.u = e.y
  */
 void
 convert_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  e,
    const struct tw_niels_t* d
 ) {
    p448_add  ( &e->y, &d->b, &d->a );
    p448_weak_reduce( &e->y );
    p448_sub  ( &e->x, &d->b, &d->a );
    p448_bias ( &e->x,     2 );
    p448_weak_reduce( &e->x );
    p448_set_ui( &e->z,     1 );
    p448_copy ( &e->t, &e->x );
    p448_copy ( &e->u, &e->y );
 }

 /*
  * Advance the state a by one step, in place.  The fields xa, za, xd and
  * zd are all rewritten; z0 is only read.
  *
  * NOTE(review): from the names this is presumably one differential
  * add-and-double step of a Montgomery ladder, with (xa, za) the "add"
  * pair and (xd, zd) the "double" pair -- confirm against ec_point.h.
  *
  * The fields of a are reused as temporaries throughout, so the statement
  * order matters.
  */
 void
 montgomery_step (
    struct montgomery_t* a
 ) {
    struct p448_t L0, L1;
    p448_add  (   &L0, &a->zd, &a->xd );  /* L0 = xd + zd */
    p448_sub  (   &L1, &a->xd, &a->zd );  /* L1 = xd - zd */
    p448_bias (   &L1,     2 );
    p448_sub  ( &a->zd, &a->xa, &a->za );
    p448_bias ( &a->zd,     2 );
    p448_mul  ( &a->xd,   &L0, &a->zd );  /* (xd + zd) * (xa - za) */
    p448_add  ( &a->zd, &a->za, &a->xa );
    p448_mul  ( &a->za,   &L1, &a->zd );  /* (xd - zd) * (xa + za) */
    p448_add  ( &a->xa, &a->za, &a->xd );
    p448_sqr  ( &a->zd, &a->xa );
    p448_mul  ( &a->xa, &a->z0, &a->zd ); /* new xa = z0 * (sum of products)^2 */
    p448_sub  ( &a->zd, &a->xd, &a->za );
    p448_bias ( &a->zd,     2 );
    p448_sqr  ( &a->za, &a->zd );         /* new za = (difference of products)^2 */
    p448_sqr  ( &a->xd,   &L0 );          /* (xd + zd)^2 */
    p448_sqr  (   &L0,   &L1 );           /* (xd - zd)^2 */
    p448_mulw ( &a->zd, &a->xd, 39082 );
    p448_sub  (   &L1, &a->xd,   &L0 );
    p448_bias (   &L1,     2 );
    p448_mul  ( &a->xd,   &L0, &a->zd );  /* new xd */
    p448_sub  (   &L0, &a->zd,   &L1 );
    p448_bias (   &L0,     4 );
    p448_mul  ( &a->zd,   &L0,   &L1 );   /* new zd */
 }

 /*
  * Initialise the state a from sbz:
  *   z0 = sbz^2, (xd, zd) = (1, 0), (xa, za) = (1, z0).
  */
 void
 deserialize_montgomery (
    struct montgomery_t* a,
    const struct p448_t* sbz
 ) {
    p448_sqr(&a->z0, sbz);

    /* (xd, zd) pair. */
    p448_set_ui(&a->xd, 1);
    p448_set_ui(&a->zd, 0);

    /* (xa, za) pair; za needs z0, which was computed above. */
    p448_set_ui(&a->xa, 1);
    p448_copy(&a->za, &a->z0);
 }

 /*
  * Compute the output element b from the state a and the element sbz, and
  * return a mask.
  *
  * The returned mask is the OR of two p448_is_zero results:
  *   - whether L3 * p448_isr(L3)^2 - 1 is zero, where L3 is the value fed
  *     to p448_isr, and
  *   - whether sbz is zero.
  * b is the candidate output L1 * p448_isr(L3), masked with the
  * complement of p448_is_zero(a->zd).
  *
  * NOTE(review): mask_t values are presumably all-ones / all-zeros, so
  * that "-mask" is 1 or 0; confirm against the mask_t definition.
  */
 mask_t
 serialize_montgomery (
    struct p448_t*             b,
    const struct montgomery_t* a,
    const struct p448_t*       sbz
 ) {
    struct p448_t L0, L1, L2, L3;
    mask_t L4, L5, L6;
    p448_mul  (   &L3, &a->z0, &a->zd );
    p448_sub  (   &L1,   &L3, &a->xd );
    p448_bias (   &L1,     2 );
    p448_mul  (   &L3, &a->za,   &L1 );  /* za * (z0*zd - xd) */
    p448_mul  (   &L2, &a->z0, &a->xd );
    p448_sub  (   &L1,   &L2, &a->zd );
    p448_bias (   &L1,     2 );
    p448_mul  (   &L2, &a->xa,   &L1 );  /* xa * (z0*xd - zd) */
    p448_add  (   &L1,   &L2,   &L3 );
    p448_sub  (   &L0,   &L3,   &L2 );
    p448_bias (   &L0,     2 );
    p448_mul  (   &L3,   &L0,   &L1 );
    p448_copy (   &L2, &a->z0 );
    p448_addw (   &L2,     1 );
    p448_sqr  (   &L1,   &L2 );          /* (z0 + 1)^2 */
    p448_mulw (   &L2,   &L1, 39082 );
    p448_neg  (   &L1,   &L2 );          /* -39082 * (z0 + 1)^2 */
    p448_add  (   &L0, &a->z0, &a->z0 );
    p448_bias (   &L0,     1 );
    p448_add  (   &L2,   &L0,   &L0 );   /* 4 * z0 (plus bias) */
    p448_add  (   &L0,   &L2,   &L1 );
    p448_mul  (   &L2, &a->xd,   &L0 );
       L5 = p448_is_zero( &a->zd );      /* mask: zd == 0 */
       L6 = -   L5;
    p448_mask (   &L1,   &L2,    L5 );   /* keep L2 only when zd == 0 */
    p448_add  (   &L2,   &L1, &a->zd );
       L4 = ~   L5;
    p448_mul  (   &L1,   sbz,   &L3 );
    p448_addw (   &L1,    L6 );
    p448_mul  (   &L3,   &L2,   &L1 );
    p448_mul  (   &L1,   &L3,   &L2 );
    p448_mul  (   &L2,   &L3, &a->xd );
    p448_mul  (   &L3,   &L1,   &L2 );
    p448_isr  (   &L0,   &L3 );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,   &L3,   &L1 );   /* L3 * isr(L3)^2 */
    p448_mask (     b,   &L2,    L4 );   /* b = L2 AND (zd != 0) mask */
    p448_subw (   &L0,     1 );
    p448_bias (   &L0,     1 );
       L5 = p448_is_zero(   &L0 );
       L4 = p448_is_zero(   sbz );
    return    L5 |    L4;
 }

 /*
  * Compute the output element b from the point a.
  *
  * With x, y, z the coordinates of a:
  *   d   = y - z                  (biased)
  *   s   = z + y                  (held in b)
  *   L2  = d * (z * x)
  *   L1  = L2 * d
  *   L0  = L2 * s
  *   b   = L1 * p448_isr(L1 * L0)
  *
  * The former trailing p448_sqr/p448_mul pair only wrote the locals L1 and
  * L0, which were never read afterwards (this function returns nothing to
  * check), so it has been dropped; the value stored in b is unchanged.
  */
 void
 serialize_extensible (
    struct p448_t*             b,
    const struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sub  (   &L0, &a->y, &a->z );
    p448_bias (   &L0,     2 );
    p448_add  (     b, &a->z, &a->y );
    p448_mul  (   &L1, &a->z, &a->x );
    p448_mul  (   &L2,   &L0,   &L1 );
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_mul  (   &L0,   &L2,     b );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_isr  (   &L0,   &L2 );
    p448_mul  (     b,   &L1,   &L0 );
 }

 /*
  * Compute the output element b from the point a.  b is used as a scratch
  * element for most of the function and receives its final value only in
  * the last statement, so b must not alias any coordinate of a.
  *
  * With x, y, z the coordinates of a:
  *   L3 = x * y
  *   b  = (x + y)^2 - 2*x*y       (biased)
  *   L1 = z^4
  *   L2 = -(39082 * 2 * b),  b = -(39082 * L2)
  *   L0 = L2 * L1,  L2 = b * L0
  *   result = b * p448_isr(L2) * L3
  *
  * The former p448_sqr(b, &L0) / p448_mul(&L0, &L2, b) pair before the
  * final multiplication was dead: b was overwritten by the next statement
  * and L0 was never read again.  It has been removed; the value stored in
  * b is unchanged.
  */
 void
 untwist_and_double_and_serialize (
    struct p448_t*                b,
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_mul  (   &L3, &a->y, &a->x );
    p448_add  (     b, &a->y, &a->x );
    p448_sqr  (   &L1,     b );
    p448_add  (   &L2,   &L3,   &L3 );
    p448_sub  (     b,   &L1,   &L2 );
    p448_bias (     b,     3 );
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1,   &L2 );
    p448_add  (   &L2,     b,     b );
    p448_mulw (     b,   &L2, 39082 );
    p448_neg  (   &L2,     b );
    p448_bias (   &L2,     2 );
    p448_mulw (   &L0,   &L2, 39082 );
    p448_neg  (     b,   &L0 );
    p448_bias (     b,     2 );
    p448_mul  (   &L0,   &L2,   &L1 );
    p448_mul  (   &L2,     b,   &L0 );
    p448_isr  (   &L0,   &L2 );
    p448_mul  (   &L1,     b,   &L0 );
    p448_mul  (     b,   &L1,   &L3 );
 }

 /*
  * Compute b from a; a is only read.  On return b.z is 1 and (b.t, b.u)
  * are copies of (b.x, b.y).
  *
  * The fields of b are used as temporaries: the value passed to p448_isr
  * is built up in b.y, the p448_isr result lands in b.t, and b.u holds
  * the factor that scales a.x and a.y into b.x and b.y.  When
  * (a.z - a.y) is zero (mask L1), L0 = -L1 is added to b.y.
  *
  * NOTE(review): "-L1" is presumably 1 when the mask is all-ones and 0
  * otherwise; confirm against the mask_t definition.
  */
 void
 twist_even (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    mask_t L0, L1;
    p448_sqr  ( &b->y, &a->z );
    p448_sqr  ( &b->z, &a->x );
    p448_sub  ( &b->u, &b->y, &b->z );  /* z^2 - x^2 */
    p448_bias ( &b->u,     2 );
    p448_sub  ( &b->z, &a->z, &a->x );
    p448_bias ( &b->z,     2 );
    p448_mul  ( &b->y, &b->z, &a->y );  /* (z - x) * y */
    p448_sub  ( &b->z, &a->z, &a->y );  /* z - y; tested for zero below */
    p448_bias ( &b->z,     2 );
    p448_mul  ( &b->x, &b->z, &b->y );
    p448_mul  ( &b->t, &b->x, &b->u );
    p448_mul  ( &b->y, &b->x, &b->t );
    p448_isr  ( &b->t, &b->y );
    p448_mul  ( &b->u, &b->x, &b->t );
    p448_sqr  ( &b->x, &b->t );
    p448_mul  ( &b->t, &b->y, &b->x );
    p448_mul  ( &b->x, &a->x, &b->u );
    p448_mul  ( &b->y, &a->y, &b->u );
       L1 = p448_is_zero( &b->z );
       L0 = -   L1;
    p448_addw ( &b->y,    L0 );
    p448_weak_reduce( &b->y );
    p448_set_ui( &b->z,     1 );
    p448_copy ( &b->t, &b->x );
    p448_copy ( &b->u, &b->y );
 }

 /*
  * Compute b from a; a is only read.  The name indicates this routine is
  * intended for tests.
  *
  * The fields of b and the locals L0/L1 serve as temporaries; the value
  * passed to p448_isr is built in L1.  Three zero tests adjust the result:
  *   - if (a.z - a.x) is zero, -mask is added to b.x;
  *   - if (a.z - a.y) is zero, -mask is added to b.y;
  *   - b.z is set to p448_is_zero(a.y) + 1.
  * Finally (b.t, b.u) are set to copies of (b.x, b.y).
  *
  * NOTE(review): with an all-ones / all-zeros mask_t, "-mask" is 1 or 0
  * and "mask + 1" is 0 or 1; confirm against the mask_t definition.
  */
 void
 test_only_twist (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    struct p448_t L0, L1;
    mask_t L2, L3;
    p448_sqr  ( &b->u, &a->z );
    p448_sqr  ( &b->y, &a->x );
    p448_sub  ( &b->z, &b->u, &b->y );
    p448_bias ( &b->z,     2 );
    p448_add  ( &b->y, &b->z, &b->z );
    p448_add  ( &b->u, &b->y, &b->y );  /* 4 * (z^2 - x^2) */
    p448_sub  ( &b->y, &a->z, &a->x );  /* z - x; tested for zero below */
    p448_bias ( &b->y,     2 );
    p448_mul  ( &b->t, &b->y, &a->y );
    p448_sub  ( &b->z, &a->z, &a->y );  /* z - y; tested for zero below */
    p448_bias ( &b->z,     2 );
    p448_mul  ( &b->x, &b->z, &b->t );
    p448_mul  ( &b->t, &b->x, &b->u );
    p448_mul  (   &L1, &b->x, &b->t );
    p448_isr  ( &b->t,   &L1 );
    p448_mul  ( &b->u, &b->x, &b->t );
    p448_sqr  ( &b->x, &b->t );
    p448_mul  ( &b->t,   &L1, &b->x );
    p448_add  (   &L1, &a->y, &a->x );
    p448_sub  (   &L0, &a->x, &a->y );
    p448_bias (   &L0,     2 );
    p448_mul  ( &b->x, &b->t,   &L0 );
    p448_add  (   &L0, &b->x,   &L1 );
    p448_sub  ( &b->t,   &L1, &b->x );
    p448_bias ( &b->t,     2 );
    p448_mul  ( &b->x,   &L0, &b->u );
       L2 = p448_is_zero( &b->y );
       L3 = -   L2;
    p448_addw ( &b->x,    L3 );
    p448_weak_reduce( &b->x );
    p448_mul  ( &b->y, &b->t, &b->u );
       L2 = p448_is_zero( &b->z );
       L3 = -   L2;
    p448_addw ( &b->y,    L3 );
    p448_weak_reduce( &b->y );
       L3 = p448_is_zero( &a->y );
       L2 =    L3 +     1;
    p448_set_ui( &b->z,    L2 );
    p448_copy ( &b->t, &b->x );
    p448_copy ( &b->u, &b->y );
 }

 mask_t
 is_square (
    const struct p448_t* x
 ) {
    struct p448_t L0, L1;
    mask_t L2, L3;
    p448_isr  (   &L0,     x );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,     x,   &L1 );
    p448_subw (   &L0,     1 );
    p448_bias (   &L0,     1 );
       L3 = p448_is_zero(   &L0 );
       L2 = p448_is_zero(     x );
    return    L3 |    L2;
 }

 mask_t
 is_even_pt (
    const struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1, &a->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    return is_square (   &L0 );
 }

 mask_t
 is_even_tw (
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1, &a->x );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_weak_reduce(   &L0 );
    return is_square (   &L0 );
 }

 /*
  * Fill the affine point a from the element sz and return a mask.
  *
  * Both a->x and a->y are used as temporaries before they receive their
  * final values, so the statement order matters.  One p448_isr call is
  * made; the return value is p448_is_zero of (check - 1), where check is
  * the product computed into L0 from the p448_isr result.
  *
  * NOTE(review): the mask is presumably all-ones when decoding succeeded;
  * confirm against the declaration in ec_point.h.
  */
 mask_t
 deserialize_affine (
    struct affine_t*     a,
    const struct p448_t* sz
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_sqr  (   &L1,    sz );           /* L1 = sz^2, reused below */
    p448_copy (   &L3,   &L1 );
    p448_addw (   &L3,     1 );
    p448_sqr  ( &a->x,   &L3 );           /* (sz^2 + 1)^2 */
    p448_mulw (   &L3, &a->x, 39082 );
    p448_neg  ( &a->x,   &L3 );
    p448_add  (   &L3,   &L1,   &L1 );
    p448_bias (   &L3,     1 );
    p448_add  ( &a->y,   &L3,   &L3 );    /* 4 * sz^2 (plus bias) */
    p448_add  (   &L3, &a->y, &a->x );
    p448_copy ( &a->y,   &L1 );
    p448_subw ( &a->y,     1 );
    p448_neg  ( &a->x, &a->y );           /* 1 - sz^2 */
    p448_bias ( &a->x,     2 );
    p448_mul  ( &a->y, &a->x,   &L3 );
    p448_sqr  (   &L2, &a->x );
    p448_mul  (   &L0,   &L2, &a->y );
    p448_mul  ( &a->y, &a->x,   &L0 );
    p448_isr  (   &L3, &a->y );
    p448_mul  ( &a->y,   &L2,   &L3 );
    p448_sqr  (   &L2,   &L3 );
    p448_mul  (   &L3,   &L0,   &L2 );
    p448_mul  (   &L0, &a->x,   &L3 );    /* check value, compared with 1 below */
    p448_add  (   &L2, &a->y, &a->y );
    p448_mul  ( &a->x,    sz,   &L2 );    /* final a->x */
    p448_addw (   &L1,     1 );
    p448_mul  ( &a->y,   &L1,   &L3 );    /* final a->y */
    p448_subw (   &L0,     1 );
    p448_bias (   &L0,     1 );
    return p448_is_zero(   &L0 );
 }

 /*
  * Fill the point a from the elements sdm1 and sz and return a mask.
  *
  * All fields of a are used as temporaries before they receive their
  * final values, so the statement order matters.  One p448_isr call is
  * made.  The return value is p448_is_zero of (a->t - 1), taken just
  * before a->z is set to 1 and (a->t, a->u) are overwritten with copies
  * of (a->x, a->y).
  *
  * NOTE(review): the mask is presumably all-ones when decoding succeeded;
  * confirm against the declaration in ec_point.h.
  */
 mask_t
 deserialize_and_twist_approx (
    struct tw_extensible_t* a,
    const struct p448_t*    sdm1,
    const struct p448_t*    sz
 ) {
    struct p448_t L0, L1;
    p448_sqr  ( &a->z,    sz );           /* a->z = sz^2, reused below */
    p448_copy ( &a->y, &a->z );
    p448_addw ( &a->y,     1 );
    p448_sqr  ( &a->x, &a->y );           /* (sz^2 + 1)^2 */
    p448_mulw ( &a->y, &a->x, 39082 );
    p448_neg  ( &a->x, &a->y );
    p448_add  ( &a->y, &a->z, &a->z );
    p448_bias ( &a->y,     1 );
    p448_add  ( &a->u, &a->y, &a->y );    /* 4 * sz^2 (plus bias) */
    p448_add  ( &a->y, &a->u, &a->x );
    p448_sqr  ( &a->x, &a->z );
    p448_subw ( &a->x,     1 );
    p448_neg  ( &a->u, &a->x );           /* 1 - sz^4 */
    p448_bias ( &a->u,     2 );
    p448_mul  ( &a->x,  sdm1, &a->u );
    p448_mul  (   &L0, &a->x, &a->y );
    p448_mul  ( &a->t,   &L0, &a->y );
    p448_mul  ( &a->u, &a->x, &a->t );
    p448_mul  ( &a->t, &a->u,   &L0 );
    p448_mul  ( &a->y, &a->x, &a->t );
    p448_isr  (   &L0, &a->y );
    p448_mul  ( &a->y, &a->u,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  ( &a->u, &a->t,   &L1 );
    p448_mul  ( &a->t, &a->x, &a->u );    /* check value, compared with 1 below */
    p448_add  ( &a->x,    sz,    sz );
    p448_mul  (   &L0, &a->u, &a->x );
    p448_copy ( &a->x, &a->z );
    p448_subw ( &a->x,     1 );
    p448_neg  (   &L1, &a->x );           /* 1 - sz^2 */
    p448_bias (   &L1,     2 );
    p448_mul  ( &a->x,   &L1,   &L0 );    /* final a->x */
    p448_mul  (   &L0, &a->u, &a->y );
    p448_addw ( &a->z,     1 );
    p448_mul  ( &a->y, &a->z,   &L0 );    /* final a->y */
    p448_subw ( &a->t,     1 );
    p448_bias ( &a->t,     1 );
    mask_t ret = p448_is_zero( &a->t );
    p448_set_ui( &a->z,     1 );
    p448_copy ( &a->t, &a->x );
    p448_copy ( &a->u, &a->y );
    return ret;
 }

 /*
  * Set a to (x, y, z, t, u) = (0, 1, 1, 0, 0).
  */
 void
 set_identity_extensible (
    struct extensible_t* a
 ) {
    /* Coordinates. */
    p448_set_ui(&a->x, 0);
    p448_set_ui(&a->y, 1);
    p448_set_ui(&a->z, 1);

    /* Auxiliary pair. */
    p448_set_ui(&a->t, 0);
    p448_set_ui(&a->u, 0);
 }

 /*
  * Set a to (x, y, z, t, u) = (0, 1, 1, 0, 0).
  */
 void
 set_identity_tw_extensible (
    struct tw_extensible_t* a
 ) {
    /* Coordinates. */
    p448_set_ui(&a->x, 0);
    p448_set_ui(&a->y, 1);
    p448_set_ui(&a->z, 1);

    /* Auxiliary pair. */
    p448_set_ui(&a->t, 0);
    p448_set_ui(&a->u, 0);
 }

 /*
  * Set a to (x, y) = (0, 1).
  */
 void
 set_identity_affine (
    struct affine_t* a
 ) {
    p448_set_ui(&a->x, 0);
    p448_set_ui(&a->y, 1);
 }

 mask_t
 eq_affine (
    const struct affine_t* a,
    const struct affine_t* b
 ) {
    struct p448_t L0;
    mask_t L1, L2;
    p448_sub  (   &L0, &a->x, &b->x );
    p448_bias (   &L0,     2 );
       L2 = p448_is_zero(   &L0 );
    p448_sub  (   &L0, &a->y, &b->y );
    p448_bias (   &L0,     2 );
       L1 = p448_is_zero(   &L0 );
    return    L2 &    L1;
 }

 mask_t
 eq_extensible (
    const struct extensible_t* a,
    const struct extensible_t* b
 ) {
    struct p448_t L0, L1, L2;
    mask_t L3, L4;
    p448_mul  (   &L2, &b->z, &a->x );
    p448_mul  (   &L1, &a->z, &b->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    p448_mul  (   &L2, &b->z, &a->y );
    p448_mul  (   &L1, &a->z, &b->y );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L3 = p448_is_zero(   &L0 );
    return    L4 &    L3;
 }

 mask_t
 eq_tw_extensible (
    const struct tw_extensible_t* a,
    const struct tw_extensible_t* b
 ) {
    struct p448_t L0, L1, L2;
    mask_t L3, L4;
    p448_mul  (   &L2, &b->z, &a->x );
    p448_mul  (   &L1, &a->z, &b->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    p448_mul  (   &L2, &b->z, &a->y );
    p448_mul  (   &L1, &a->z, &b->y );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L3 = p448_is_zero(   &L0 );
    return    L4 &    L3;
 }

 /*
  * Compute the affine point a from the element r.
  *
  * a->x and a->y are used as temporaries before they receive their final
  * values, so the statement order matters.  One p448_isr call is made.
  * The last steps add -mask to a->y, where mask is p448_is_zero(1 - r^4)
  * (the value held in L7), and weakly reduce both coordinates.
  *
  * NOTE(review): the name suggests an Elligator-style map from a field
  * element to a curve point; the meaning of the numeric constants is not
  * visible here -- confirm against the generator that produced this file.
  * Several constants exceed 32 bits, so this relies on the 64-bit word
  * parameter of p448_mulw on this architecture.
  */
 void
 elligator_2s_inject (
    struct affine_t*     a,
    const struct p448_t* r
 ) {
    struct p448_t L0, L1, L2, L3, L4, L5, L6, L7;
    mask_t L8, L9;
    p448_sqr  ( &a->x,     r );           /* r^2 */
    p448_sqr  (   &L1, &a->x );           /* r^4 */
    p448_copy ( &a->y,   &L1 );
    p448_subw ( &a->y,     1 );
    p448_neg  (   &L7, &a->y );           /* 1 - r^4; tested for zero at the end */
    p448_bias (   &L7,     2 );
    p448_sqr  (   &L0,   &L7 );
    p448_mulw (   &L6,   &L0, 1527402724 );
    p448_mulw (   &L5,   &L1, 6108985600 );
    p448_add  ( &a->y,   &L5,   &L6 );
    p448_mulw (   &L6,   &L0, 6109454568 );
    p448_sub  (   &L5, &a->y,   &L6 );
    p448_bias (   &L5,     2 );
    p448_mulw (   &L2, &a->y, 78160 );
    p448_mul  (   &L4,   &L5,   &L7 );
    p448_mul  (   &L6,   &L4,   &L2 );
    p448_mul  (   &L2,   &L5,   &L6 );
    p448_isr  (   &L3,   &L2 );
    p448_mul  (   &L2,   &L4,   &L3 );
    p448_sqr  (   &L4,   &L3 );
    p448_mul  (   &L3,   &L6,   &L4 );
    p448_mul  (   &L6,   &L5,   &L3 );
    p448_mul  (   &L5,   &L6,   &L3 );
    p448_copy (   &L4, &a->x );
    p448_subw (   &L4,     1 );           /* r^2 - 1 */
    p448_addw ( &a->x,     1 );           /* r^2 + 1 */
    p448_mul  (   &L3, &a->x,   &L6 );
    p448_sub  ( &a->x,   &L4,   &L3 );
    p448_bias ( &a->x,     3 );
    p448_mul  (   &L3,   &L2, &a->x );
    p448_mulw (   &L2,   &L3, 78160 );
    p448_neg  ( &a->x,   &L2 );           /* final a->x (before reduction) */
    p448_bias ( &a->x,     2 );
    p448_weak_reduce( &a->x );
    p448_add  (   &L2,   &L1,   &L1 );
    p448_add  (   &L1,   &L2,   &L0 );
    p448_subw (   &L1,     2 );
    p448_bias (   &L1,     1 );
    p448_mul  (   &L0,   &L1,   &L6 );
    p448_mulw (   &L1,   &L0, 3054649120 );
    p448_add  (   &L0,   &L1, &a->y );
    p448_mul  ( &a->y,   &L5,   &L0 );
       L9 = p448_is_zero(   &L7 );
       L8 = -   L9;
    p448_addw ( &a->y,    L8 );
    p448_weak_reduce( &a->y );
 }

 /*
  * Return the p448_is_zero mask of
  *     x^2 + y^2 - 1 + 39081 * x^2 * y^2
  * for the affine point a, i.e. a test of the relation
  *     x^2 + y^2 = 1 - 39081 * x^2 * y^2.
  */
 mask_t
 validate_affine (
    const struct affine_t* a
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_sqr  (   &L0, &a->y );
    p448_sqr  (   &L2, &a->x );
    p448_add  (   &L3,   &L2,   &L0 );
    p448_subw (   &L3,     1 );           /* x^2 + y^2 - 1 */
    p448_mulw (   &L1,   &L2, 39081 );
    p448_neg  (   &L2,   &L1 );
    p448_bias (   &L2,     2 );
    p448_mul  (   &L1,   &L0,   &L2 );    /* -39081 * x^2 * y^2 */
    p448_sub  (   &L0,   &L3,   &L1 );
    p448_bias (   &L0,     3 );
    return p448_is_zero(   &L0 );
 }

 /*
  * Check the two invariants stated in the comments below for the point
  * ext and return the AND of the two p448_is_zero masks.  In the second
  * invariant the code uses 39081 negated in place of d, i.e. d = -39081.
  *
  * NOTE(review): the p448_addw(..., 0) calls add zero to a limb and look
  * like generator artefacts; confirm they can be dropped.
  */
 mask_t
 validate_tw_extensible (
    const struct tw_extensible_t* ext
 ) {
    struct p448_t L0, L1, L2, L3;
    mask_t L4, L5;
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L0, &ext->t, &ext->u );
    p448_mul  (   &L2, &ext->z,   &L0 );
    p448_addw (   &L2,     0 );
    p448_mul  (   &L1, &ext->x, &ext->y );
    p448_neg  (   &L0,   &L1 );
    p448_add  (   &L1,   &L0,   &L2 );
    p448_bias (   &L1,     2 );
       L5 = p448_is_zero(   &L1 );
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
     */
    p448_sqr  (   &L2, &ext->y );
    p448_neg  (   &L0,   &L2 );
    p448_addw (   &L0,     0 );
    p448_sqr  (   &L1, &ext->x );
    p448_add  (   &L2,   &L1,   &L0 );    /* x^2 - y^2 */
    p448_sqr  (   &L3, &ext->u );
    p448_sqr  (   &L1, &ext->t );
    p448_mul  (   &L0,   &L1,   &L3 );    /* t^2 * u^2 */
    p448_mulw (   &L1,   &L0, 39081 );
    p448_neg  (   &L3,   &L1 );
    p448_add  (   &L1,   &L3,   &L2 );
    p448_neg  (   &L3,   &L0 );
    p448_add  (   &L2,   &L3,   &L1 );
    p448_sqr  (   &L1, &ext->z );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_bias (   &L0,     4 );
       L4 = p448_is_zero(   &L0 );
    return    L5 &    L4;
 }

 /*
  * Check the two invariants stated in the comments below for the point
  * ext and return the AND of the two p448_is_zero masks.  In the first
  * invariant the code uses 39081 negated in place of d, i.e. d = -39081.
  *
  * NOTE(review): the p448_addw(..., 0) calls add zero to a limb and look
  * like generator artefacts; confirm they can be dropped.
  */
 mask_t
 validate_extensible (
    const struct extensible_t* ext
 ) {
    struct p448_t L0, L1, L2, L3;
    mask_t L4, L5;
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 - x^2 - y^2 + z^2
     */
    p448_sqr  (   &L2, &ext->y );
    p448_neg  (   &L1,   &L2 );
    p448_addw (   &L1,     0 );
    p448_sqr  (   &L0, &ext->z );
    p448_add  (   &L2,   &L0,   &L1 );    /* z^2 - y^2 */
    p448_sqr  (   &L3, &ext->u );
    p448_sqr  (   &L0, &ext->t );
    p448_mul  (   &L1,   &L0,   &L3 );    /* t^2 * u^2 */
    p448_mulw (   &L3,   &L1, 39081 );
    p448_neg  (   &L0,   &L3 );
    p448_add  (   &L1,   &L0,   &L2 );
    p448_sqr  (   &L0, &ext->x );
    p448_neg  (   &L2,   &L0 );
    p448_add  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     4 );
       L5 = p448_is_zero(   &L0 );
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L1, &ext->t, &ext->u );
    p448_mul  (   &L2, &ext->z,   &L1 );
    p448_addw (   &L2,     0 );
    p448_mul  (   &L0, &ext->x, &ext->y );
    p448_neg  (   &L1,   &L0 );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    return    L5 &    L4;
 }


--- a/src/arch_x86_64/p448.c
+++ b/src/arch_x86_64/p448.c
@@ -0,0 +1,467 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "p448.h"
 #include "x86-64-arith.h"

 /*
  * Multiply two elements: cs = as * bs, on 8 limbs masked to 56 bits.
  *
  * Each operand is split into a low half (limbs 0..3) and a high half
  * (limbs 4..7); aa and bb hold the limb-wise sums low + high.  Products
  * of the low halves, the high halves and the summed halves are gathered
  * in three 128-bit accumulators and combined by addition/subtraction
  * (Karatsuba-style).  Each round produces one limb of each output half
  * (c[k] and c[k+4]) and carries the rest of the accumulators forward by
  * a 56-bit shift.  The rounds run in the order k = 3, 0, 1, 2, after
  * which the carries are folded into c[3]/c[7] and then c[4]/c[0].
  *
  * cs is __restrict__-qualified, so it must not alias as or bs.
  *
  * NOTE(review): widemul/mac/msb come from x86-64-arith.h and are
  * presumably 64x64->128 multiply, multiply-accumulate and
  * multiply-subtract; uint64xn_t is presumably a vector of uint64_t.
  * Confirm against that header.
  */
 void
 p448_mul (
    p448_t *__restrict__ cs,
    const p448_t *as,
    const p448_t *bs
 ) {
    const uint64_t *a = as->limb, *b = bs->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<56) - 1;  

    uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32)));

    /* For some reason clang doesn't vectorize this without prompting? */
    unsigned int i;
    for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
        ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];     
    }
    /*
     * Scalar equivalent of the vector loop above, kept for reference:
     *
     *   for (int i=0; i<4; i++) {
     *       aa[i] = a[i] + a[i+4];
     *       bb[i] = b[i] + b[i+4];
     *   }
     */

    /* Round for c[3] / c[7]: index sums equal to 3. */
    accum2  = widemul(&a[0],&b[3]);
    accum0  = widemul(&aa[0],&bb[3]);
    accum1  = widemul(&a[4],&b[7]);

    mac(&accum2, &a[1], &b[2]);
    mac(&accum0, &aa[1], &bb[2]);
    mac(&accum1, &a[5], &b[6]);

    mac(&accum2, &a[2], &b[1]);
    mac(&accum0, &aa[2], &bb[1]);
    mac(&accum1, &a[6], &b[5]);

    mac(&accum2, &a[3], &b[0]);
    mac(&accum0, &aa[3], &bb[0]);
    mac(&accum1, &a[7], &b[4]);

    accum0 -= accum2;
    accum1 += accum2;

    c[3] = ((uint64_t)(accum1)) & mask;
    c[7] = ((uint64_t)(accum0)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;
    
    /* Round for c[0] / c[4]: index sums equal to 4 wrap around to 0. */
    mac(&accum0, &aa[1],&bb[3]);
    mac(&accum1, &a[5], &b[7]);
    mac(&accum0, &aa[2], &bb[2]);
    mac(&accum1, &a[6], &b[6]);
    mac(&accum0, &aa[3], &bb[1]);
    accum1 += accum0;

    accum2 = widemul(&a[0],&b[0]);
    accum1 -= accum2;
    accum0 += accum2;
    
    msb(&accum0, &a[1], &b[3]);
    msb(&accum0, &a[2], &b[2]);
    mac(&accum1, &a[7], &b[5]);
    msb(&accum0, &a[3], &b[1]);
    mac(&accum1, &aa[0], &bb[0]);
    mac(&accum0, &a[4], &b[4]);

    c[0] = ((uint64_t)(accum0)) & mask;
    c[4] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Round for c[1] / c[5]. */
    accum2  = widemul(&aa[2],&bb[3]);
    msb(&accum0, &a[2], &b[3]);
    mac(&accum1, &a[6], &b[7]);

    mac(&accum2, &aa[3], &bb[2]);
    msb(&accum0, &a[3], &b[2]);
    mac(&accum1, &a[7], &b[6]);

    accum1 += accum2;
    accum0 += accum2;

    accum2  = widemul(&a[0],&b[1]);
    mac(&accum1, &aa[0], &bb[1]);
    mac(&accum0, &a[4], &b[5]);

    mac(&accum2, &a[1], &b[0]);
    mac(&accum1, &aa[1], &bb[0]);
    mac(&accum0, &a[5], &b[4]);

    accum1 -= accum2;
    accum0 += accum2;

    c[1] = ((uint64_t)(accum0)) & mask;
    c[5] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Round for c[2] / c[6]. */
    accum2  = widemul(&aa[3],&bb[3]);
    msb(&accum0, &a[3], &b[3]);
    mac(&accum1, &a[7], &b[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2  = widemul(&a[0],&b[2]);
    mac(&accum1, &aa[0], &bb[2]);
    mac(&accum0, &a[4], &b[6]);

    mac(&accum2, &a[1], &b[1]);
    mac(&accum1, &aa[1], &bb[1]);
    mac(&accum0, &a[5], &b[5]);

    mac(&accum2, &a[2], &b[0]);
    mac(&accum1, &aa[2], &bb[0]);
    mac(&accum0, &a[6], &b[4]);

    accum1 -= accum2;
    accum0 += accum2;

    c[2] = ((uint64_t)(accum0)) & mask;
    c[6] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Fold the carries out of limbs 2 and 6 into the limbs stored first. */
    accum0 += c[3];
    accum1 += c[7];
    c[3] = ((uint64_t)(accum0)) & mask;
    c[7] = ((uint64_t)(accum1)) & mask;

    /* we could almost stop here, but it wouldn't be stable, so... */

    /* Carry out of limb 3 goes to limb 4; carry out of limb 7 wraps to
     * both limb 4 and limb 0. */
    accum0 >>= 56;
    accum1 >>= 56;
    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
    c[0] += ((uint64_t)(accum1));
 }

 /*
  * Multiply an element by a 64-bit word: cs = as * b.
  *
  * The low half (limbs 0..3) and the high half (limbs 4..7) are processed
  * as two independent carry chains, accum0 and accum4.  After each product
  * the low 56 bits are stored and the rest is carried to the next limb.
  * At the end the carry out of limb 3 is added to limb 4, and the carry
  * out of limb 7 is added to both limb 4 and limb 0.
  *
  * cs is __restrict__-qualified, so it must not alias as.
  *
  * NOTE(review): widemul_rm/mac_rm come from x86-64-arith.h and are
  * presumably word-by-memory multiply and multiply-accumulate into a
  * 128-bit value; confirm against that header.
  */
 void
 p448_mulw (
    p448_t *__restrict__ cs,
    const p448_t *as,
    uint64_t b
 ) {
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0, accum4;
    uint64_t mask = (1ull<<56) - 1;  

    accum0 = widemul_rm(b, &a[0]);
    accum4 = widemul_rm(b, &a[4]);

    c[0] = accum0 & mask; accum0 >>= 56;
    c[4] = accum4 & mask; accum4 >>= 56;

    mac_rm(&accum0, b, &a[1]);
    mac_rm(&accum4, b, &a[5]);

    c[1] = accum0 & mask; accum0 >>= 56;
    c[5] = accum4 & mask; accum4 >>= 56;

    mac_rm(&accum0, b, &a[2]);
    mac_rm(&accum4, b, &a[6]);

    c[2] = accum0 & mask; accum0 >>= 56;
    c[6] = accum4 & mask; accum4 >>= 56;

    mac_rm(&accum0, b, &a[3]);
    mac_rm(&accum4, b, &a[7]);

    c[3] = accum0 & mask; accum0 >>= 56;
    c[7] = accum4 & mask; accum4 >>= 56;

    /* Final carries are added without re-masking, so limbs 0 and 4 may
     * end up slightly above 56 bits. */
    c[4] += accum0 + accum4;
    c[0] += accum4;
    
    /*
     * TODO: double-check that this is not necessary.
     * (Disabled extra carry pass that would re-mask limbs 4 and 0 and
     * push their overflow into limbs 5 and 1.)
    accum0 += accum4 + c[4];
    c[4] = accum0 & mask;
    c[5] += accum0 >> 56;

    accum4 += c[0];
    c[0] = accum4 & mask;
    c[1] += accum4 >> 56;
    */
 }

 /*
  * Square an element: cs = as^2, on 8 limbs masked to 56 bits.
  *
  * Same structure as p448_mul with both operands equal: aa holds the
  * limb-wise sum of the low half (limbs 0..3) and the high half (limbs
  * 4..7), and three 128-bit accumulators gather the low, high and summed
  * products.  Symmetric cross terms are computed once and doubled.  In
  * the first round (c[3]/c[7]) every term is a cross term, so the
  * doubling is done by shifting the stored limb left by one and carrying
  * with a 55-bit shift instead of 56.
  *
  * cs is __restrict__-qualified, so it must not alias as.
  *
  * NOTE(review): widemul2/mac2/msb2 come from x86-64-arith.h and are
  * presumably the doubled variants of widemul/mac/msb; uint64xn_t is
  * presumably a vector of uint64_t.  Confirm against that header.
  */
 void
 p448_sqr (
    p448_t *__restrict__ cs,
    const p448_t *as
 ) {
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<56) - 1;  

    uint64_t aa[4] __attribute__((aligned(32)));

    /* For some reason clang doesn't vectorize this without prompting? */
    unsigned int i;
    for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
      ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
    }

    /* Round for c[3] / c[7]: only cross terms, accumulated undoubled. */
    accum2  = widemul(&a[0],&a[3]);
    accum0  = widemul(&aa[0],&aa[3]);
    accum1  = widemul(&a[4],&a[7]);

    mac(&accum2, &a[1], &a[2]);
    mac(&accum0, &aa[1], &aa[2]);
    mac(&accum1, &a[5], &a[6]);

    accum0 -= accum2;
    accum1 += accum2;

    /* Doubling applied here: shift left by one before masking ... */
    c[3] = ((uint64_t)(accum1))<<1 & mask;
    c[7] = ((uint64_t)(accum0))<<1 & mask;

    /* ... and carry with a 55-bit shift (56 minus the doubling). */
    accum0 >>= 55;
    accum1 >>= 55;

    /* Round for c[0] / c[4]. */
    mac2(&accum0, &aa[1],&aa[3]);
    mac2(&accum1, &a[5], &a[7]);
    mac(&accum0, &aa[2], &aa[2]);
    accum1 += accum0;

    msb2(&accum0, &a[1], &a[3]);
    mac(&accum1, &a[6], &a[6]);
    
    accum2 = widemul(&a[0],&a[0]);
    accum1 -= accum2;
    accum0 += accum2;

    msb(&accum0, &a[2], &a[2]);
    mac(&accum1, &aa[0], &aa[0]);
    mac(&accum0, &a[4], &a[4]);

    c[0] = ((uint64_t)(accum0)) & mask;
    c[4] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Round for c[1] / c[5]. */
    accum2  = widemul2(&aa[2],&aa[3]);
    msb2(&accum0, &a[2], &a[3]);
    mac2(&accum1, &a[6], &a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2  = widemul2(&a[0],&a[1]);
    mac2(&accum1, &aa[0], &aa[1]);
    mac2(&accum0, &a[4], &a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[1] = ((uint64_t)(accum0)) & mask;
    c[5] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Round for c[2] / c[6]. */
    accum2  = widemul(&aa[3],&aa[3]);
    msb(&accum0, &a[3], &a[3]);
    mac(&accum1, &a[7], &a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2  = widemul2(&a[0],&a[2]);
    mac2(&accum1, &aa[0], &aa[2]);
    mac2(&accum0, &a[4], &a[6]);

    mac(&accum2, &a[1], &a[1]);
    mac(&accum1, &aa[1], &aa[1]);
    mac(&accum0, &a[5], &a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[2] = ((uint64_t)(accum0)) & mask;
    c[6] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Fold the carries out of limbs 2 and 6 into the limbs stored first. */
    accum0 += c[3];
    accum1 += c[7];
    c[3] = ((uint64_t)(accum0)) & mask;
    c[7] = ((uint64_t)(accum1)) & mask;

    /* we could almost stop here, but it wouldn't be stable, so... */

    /* Carry out of limb 3 goes to limb 4; carry out of limb 7 wraps to
     * both limb 4 and limb 0. */
    accum0 >>= 56;
    accum1 >>= 56;
    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
    c[0] += ((uint64_t)(accum1));
 }

 /*
  * Bring *a to its canonical representative in [0, p), in place.
  *
  * Here p is the value whose 56-bit limbs are all ones except limb 4, which
  * is (2^56 - 1) - 1.  The work is branch-free: subtract p unconditionally,
  * then add p back under a mask derived from the final borrow.
  */
 void
 p448_strong_reduce (
    p448_t *a
 ) {
    const uint64_t mask = (1ull<<56)-1;
    uint64_t *limb = a->limb;
    const uint64_t overflow = limb[7]>>56;
    __int128_t scarry = 0;
    __uint128_t carry = 0;
    uint64_t scarry_mask;
    unsigned int k;

    /* Step 1: fold whatever sits above bit 56 of the top limb back into
     * limbs 4 and 0, and clear it from the top limb. */
    limb[7] &= mask;
    limb[4] += overflow;
    limb[0] += overflow;

    /* The total is now below 2^448 - 2^(448-56) + 2^(448-56+8), which is < 2p. */

    /* Step 2: compute total - p with a signed running borrow. */
    for (k=0; k<8; k++) {
        const uint64_t p_limb = (k==4) ? mask-1 : mask;
        scarry += limb[k];
        scarry -= p_limb;
        limb[k] = scarry & mask;
        scarry >>= 56;
    }

    /* Rare case: the input was >= p, so scarry == 0 and the limbs hold x.
     * Usual case: the input was < p, so scarry == -1 and the limbs hold
     * x - p + 2^448; adding p back carries off the top and cancels the 2^448.
     */

    assert(is_zero(scarry) | is_zero(scarry+1));

    /* Step 3: add (p AND mask-of-borrow); limb 4 of p has its low bit clear. */
    scarry_mask = scarry & mask;
    for (k=0; k<8; k++) {
        carry += limb[k];
        carry += (k==4) ? (scarry_mask&~1) : scarry_mask;
        limb[k] = carry & mask;
        carry >>= 56;
    }

    assert(is_zero(carry + scarry));
 }

 mask_t
 p448_is_zero (
    const struct p448_t *a
 ) {
    struct p448_t b;
    p448_copy(&b,a);
    p448_strong_reduce(&b);

    uint64_t any = 0;
    int i;
    for (i=0; i<8; i++) {
        any |= b.limb[i];
    }
    return is_zero(any);
 }

 /*
  * Write the canonical form of *x to serial[0..55].
  *
  * A strongly reduced copy is emitted limb by limb, seven bytes per limb,
  * least significant byte first.  *x is left untouched.
  */
 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
 ) {
    p448_t red;
    int i;

    p448_copy(&red, x);
    p448_strong_reduce(&red);

    for (i=0; i<8; i++) {
        uint8_t *dst = &serial[7*i];
        int shift;
        for (shift=0; shift<56; shift+=8) {
            *dst++ = (uint8_t)(red.limb[i] >> shift);
        }
        /* After reduction nothing may remain above the 56 emitted bits. */
        red.limb[i] >>= 56;
        assert(red.limb[i] == 0);
    }
 }

 /*
  * Load a field element from 56 bytes (seven bytes per limb, least
  * significant byte first) into *x.
  *
  * Returns an all-ones mask when the encoded value is below p and zero when
  * it is >= p.  *x is written in both cases.
  */
 mask_t
 p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 ) {
    const word_t mask = (1ull<<56)-1;
    word_t ge;
    int limb, byte;

    for (limb=0; limb<8; limb++) {
        const uint8_t *src = &serial[7*limb];
        word_t acc = 0;
        /* Assemble from the most significant byte downwards. */
        for (byte=6; byte>=0; byte--) {
            acc = (acc<<8) | src[byte];
        }
        x->limb[limb] = acc;
    }

    /* Range check.
     *
     * ge ends up equal to `mask` (56 ones) exactly when x >= p, where
     * p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111).
     */

    /* Limbs 0..3: ge stays all ones only if every one of them is all ones. */
    ge  = x->limb[0];
    ge &= x->limb[1];
    ge &= x->limb[2];
    ge &= x->limb[3];

    /* Limb 4: pass ge through if the limb is 1110, force it set if 1111. */
    ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);

    /* Limbs 5..7 must all be all ones as well. */
    ge &= x->limb[5];
    ge &= x->limb[6];
    ge &= x->limb[7];

    return ~is_zero(ge ^ mask);
 }

 /*
  * Invert n field elements with a single call to p448_inverse.
  *
  * On return out[i] = 1/in[i] for i in [0,n).  The method builds running
  * prefix products, inverts the full product once, then peels off one input
  * per step going backwards.  out is __restrict__ and must not overlap in.
  */
 void
 simultaneous_invert_p448(
    struct p448_t *__restrict__ out,
    const struct p448_t *in,
    unsigned int n
 ) {
  struct p448_t tmp;
  int i;

  if (n==0) {
      return;
  }
  if (n==1) {
      p448_inverse(out,in);
      return;
  }

  /* Prefix products: out[i] = in[0] * ... * in[i-1] for 1 <= i <= n-1. */
  p448_copy(&out[1], &in[0]);
  i = 1;
  while (i<(int) (n-1)) {
      p448_mul(&out[i+1], &out[i], &in[i]);
      i++;
  }

  /* out[0] = product of every input, then its inverse. */
  p448_mul(&out[0], &out[n-1], &in[n-1]);
  p448_inverse(&tmp, &out[0]);
  p448_copy(&out[0], &tmp);

  /* Invariant entering step i: out[0] = 1/(in[0]*...*in[i]).
   * Multiplying by the prefix product yields 1/in[i]; multiplying out[0]
   * by in[i] restores the invariant for i-1.
   */
  i = n-1;
  while (i>0) {
      p448_mul(&tmp, &out[i], &out[0]);
      p448_copy(&out[i], &tmp);

      p448_mul(&tmp, &out[0], &in[i]);
      p448_copy(&out[0], &tmp);
      i--;
  }
 }
--- a/src/arch_x86_64/p448.h
+++ b/src/arch_x86_64/p448.h
@@ -0,0 +1,376 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __P448_H__
 #define __P448_H__ 1

 #include <stdint.h>
 #include <assert.h>

 #include "word.h"

 /*
  * Field element for this backend: eight 64-bit limbs, least significant
  * limb first.  The routines in this header treat each limb as a radix-2^56
  * digit (they mask with 2^56-1), so a limb may hold a few excess bits
  * between reductions.  The 32-byte alignment allows the struct to be
  * accessed through vector types (see p448_bias, p448_add).
  */
 typedef struct p448_t {
  uint64_t limb[8];
 } __attribute__((aligned(32))) p448_t;

 #ifdef __cplusplus
 extern "C" {
 #endif

 static __inline__ void
 p448_set_ui (
    p448_t *out,
    uint64_t x
 ) __attribute__((unused,always_inline));
           
 static __inline__ void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t do_swap
 ) __attribute__((unused,always_inline));

 static __inline__ void
 p448_add (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_sub (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_neg (
    p448_t *out,
    const p448_t *a
 ) __attribute__((unused,always_inline));
            
 static __inline__ void
 p448_cond_neg (
    p448_t *a,
    mask_t doNegate
 ) __attribute__((unused,always_inline));

 static __inline__ void
 p448_addw (
    p448_t *a,
    uint64_t x
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_subw (
    p448_t *a,
    uint64_t x
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_copy (
    p448_t *out,
    const p448_t *a
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_weak_reduce (
    p448_t *inout
 ) __attribute__((unused,always_inline));
             
 /* Reduce *inout in place to its canonical representative in [0, p). */
 void
 p448_strong_reduce (
    p448_t *inout
 );

 /* Returns an all-ones mask if *in is zero modulo p, otherwise zero.
  * The test runs on a strongly reduced copy; *in is not modified. */
 mask_t
 p448_is_zero (
    const p448_t *in
 );
             
 static __inline__ void
 p448_bias (
    p448_t *inout,
    int amount
 ) __attribute__((unused,always_inline));
         
 /* Field multiplication: presumably out = a * b (definition not visible
  * from this header -- TODO confirm).  out is __restrict__, so it must not
  * alias either input. */
 void
 p448_mul (
    p448_t *__restrict__ out,
    const p448_t *a,
    const p448_t *b
 );

 /* Multiplication of a field element by a 64-bit word b.  out is
  * __restrict__ and must not alias a. */
 void
 p448_mulw (
    p448_t *__restrict__ out,
    const p448_t *a,
    uint64_t b
 );

 /* Field squaring.  out is __restrict__ and must not alias a; limbs 0 and 4
  * of the result may slightly exceed 56 bits. */
 void
 p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
 );
         
 static __inline__ void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) __attribute__((unused,always_inline));

 /* Write the strongly reduced value of *x to serial[0..55]: seven bytes per
  * limb, least significant byte first.  *x is not modified. */
 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
 );

 /* Read 56 bytes in the p448_serialize layout into *x.  Returns an all-ones
  * mask if the encoded value is below p, and zero if it is >= p; *x is
  * written either way. */
 mask_t
 p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 );
    
 static __inline__ void
 p448_mask(
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) __attribute__((unused,always_inline));

 /**
 * Returns 1/x.
 * 
 * If x=0, returns 0.
 */
 void
 p448_inverse (
   struct p448_t*       a,
   const struct p448_t* x
 );
       
 /* Batch inversion: sets out[i] = 1/in[i] for i in [0,n) using a single
  * p448_inverse call plus multiplications.  n == 0 is a no-op.  out is
  * __restrict__ and must not overlap in. */
 void
 simultaneous_invert_p448 (
    struct p448_t *__restrict__ out,
    const struct p448_t *in,
    unsigned int n
 );

 static inline mask_t
 p448_eq (
    const struct p448_t *a,
    const struct p448_t *b
 ) __attribute__((always_inline,unused));

 /* -------------- Inline functions begin here -------------- */

 /* Set *out to the small integer x: limb 0 receives x, limbs 1..7 are cleared. */
 void
 p448_set_ui (
    p448_t *out,
    uint64_t x
 ) {
    unsigned int k = sizeof(out->limb)/sizeof(out->limb[0]);
    /* Clear from the top limb down to limb 1. */
    while (--k) {
        out->limb[k] = 0;
    }
    out->limb[0] = x;
 }
            
 /*
  * Branch-free conditional swap.  With doswap all ones, *a and *b trade
  * contents; with doswap zero both are left as they were.  The structs are
  * processed in big_register_t sized chunks using the XOR-swap identity.
  */
 void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t doswap
 ) {
    big_register_t *lhs = (big_register_t*)a;
    big_register_t *rhs = (big_register_t*)b;
    big_register_t select = doswap;
    const unsigned int nregs = sizeof(*a)/sizeof(*lhs);
    unsigned int k;

    for (k=0; k<nregs; k++) {
        /* diff is lhs^rhs when swapping, zero otherwise. */
        big_register_t diff = (lhs[k]^rhs[k]) & select;
        lhs[k] ^= diff;
        rhs[k] ^= diff;
    }
 }

 /*
  * Limb-wise addition without carry propagation or reduction:
  * out->limb[i] = a->limb[i] + b->limb[i], performed on uint64xn_t vectors.
  * out may be the same object as a or b.
  */
 void
 p448_add (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) {
    uint64xn_t *dst = (uint64xn_t*)out;
    const uint64xn_t *lhs = (const uint64xn_t*)a;
    const uint64xn_t *rhs = (const uint64xn_t*)b;
    unsigned int k;

    for (k=0; k<sizeof(*out)/sizeof(uint64xn_t); k++) {
        dst[k] = lhs[k] + rhs[k];
    }
 }

 /*
  * Limb-wise subtraction without borrow propagation or reduction:
  * out->limb[i] = a->limb[i] - b->limb[i] (wrapping in 64 bits), performed
  * on uint64xn_t vectors.  out may be the same object as a or b.
  */
 void
 p448_sub (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) {
    uint64xn_t *dst = (uint64xn_t*)out;
    const uint64xn_t *lhs = (const uint64xn_t*)a;
    const uint64xn_t *rhs = (const uint64xn_t*)b;
    unsigned int k;

    for (k=0; k<sizeof(*out)/sizeof(uint64xn_t); k++) {
        dst[k] = lhs[k] - rhs[k];
    }
 }

 void
 p448_neg (
    struct p448_t *out,
    const p448_t *a
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)out)[i] = -((const uint64xn_t*)a)[i];
    }
    /*
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
        out->limb[i] = -a->limb[i];
    }
    */
 }

 /*
  * Branch-free conditional negation.  With doNegate all ones, *a is replaced
  * by its limb-wise negation biased by 2 (p448_bias); with doNegate zero it
  * is left unchanged.  The negated candidate is always computed.
  */
 void
 p448_cond_neg(
    struct p448_t *a,
    mask_t doNegate
 ) {
    struct p448_t negated;
    big_register_t *dst = (big_register_t *)a;
    const big_register_t *neg = (const big_register_t *)&negated;
    big_register_t select = doNegate;
    unsigned int k;

    p448_neg(&negated, a);
    p448_bias(&negated, 2);

    /* Per chunk: take the negated value where select is set, else keep *a. */
    for (k=0; k<sizeof(*a)/sizeof(*dst); k++) {
        dst[k] = (neg[k] & select) | (dst[k] & ~select);
    }
 }

 /* Add the word x to limb 0 of *a.  No carry propagation or reduction. */
 void
 p448_addw (
    p448_t *a,
    uint64_t x
 ) {
  uint64_t *low = &a->limb[0];
  *low = *low + x;
 }
             
 /* Subtract the word x from limb 0 of *a (wrapping in 64 bits).  No borrow
  * propagation or reduction. */
 void
 p448_subw (
    p448_t *a,
    uint64_t x
 ) {
  uint64_t *low = &a->limb[0];
  *low = *low - x;
 }

 /* Copy the whole element *a into *out by struct assignment. */
 void
 p448_copy (
    p448_t *out,
    const p448_t *a
 ) {
  out[0] = a[0];
 }

 /*
  * Add amt * p to *a limb by limb, with no carry propagation.
  *
  * Every limb receives amt*(2^56-1) except limb 4, which receives
  * amt*(2^56-1) - amt.  This matches amt copies of a p whose limbs are all
  * ones apart from limb 4 = 2^56-2.  It is used after subtraction or
  * negation, presumably so that limbs do not stay wrapped below zero.
  */
 void
 p448_bias (
    p448_t *a,
    int amt
 ) {
    const uint64_t limb_max = (1ull<<56)-1;
    uint64_t co1 = limb_max*amt, co2 = co1-amt;
    uint64x4_t *halves = (uint64x4_t*) a;
    uint64x4_t hi = {co2,co1,co1,co1};   /* limbs 4..7 */
    uint64x4_t lo = {co1,co1,co1,co1};   /* limbs 0..3 */

    halves[1] += hi;
    halves[0] += lo;
 }

 /*
  * One round of carry propagation, in place.
  *
  * Each limb keeps its low 56 bits and receives the bits above 56 of the
  * limb below it.  The excess of limb 7 (tmp) wraps around into limb 0 and
  * limb 4 -- presumably because 2^448 = 2^224 + 1 (mod p).  The result is
  * not guaranteed canonical: limbs may still carry one extra bit.
  */
 void
 p448_weak_reduce (
    p448_t *a
 ) {
    /* PERF: use pshufb/palignr if anyone cares about speed of this */
    uint64_t mask = (1ull<<56) - 1;
    uint64_t tmp = a->limb[7] >> 56;
    int i;
    /* Must happen before the loop so that limb 5 sees limb 4's new excess. */
    a->limb[4] += tmp;
    /* Descending order: limb[i-1]>>56 is read before limb[i-1] gets masked. */
    for (i=7; i>0; i--) {
        a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>56);
    }
    a->limb[0] = (a->limb[0] & mask) + tmp;
 }

 /*
  * Repeated squaring: y = x^(2^n), i.e. x squared n times, for n > 0.
  *
  * p448_sqr needs distinct input and output, so squarings are done in pairs
  * that bounce between a local temporary and y.  When n is even the first
  * pair starts from x via the temporary; when n is odd a single squaring
  * goes straight from x to y.
  */
 void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) {
    p448_t tmp;
    const p448_t *src = x;

    assert(n>0);

    if (!(n&1)) {
        /* Even count: spend one squaring getting into the temporary. */
        p448_sqr(&tmp,x);
        src = &tmp;
        n--;
    }
    p448_sqr(y,src);
    n--;

    /* n is even from here on. */
    while (n) {
        p448_sqr(&tmp,y);
        p448_sqr(y,&tmp);
        n -= 2;
    }
 }

 mask_t
 p448_eq (
    const struct p448_t *a,
    const struct p448_t *b
 ) {
    struct p448_t ra, rb;
    p448_copy(&ra, a);
    p448_copy(&rb, b);
    p448_weak_reduce(&ra);
    p448_weak_reduce(&rb);
    p448_sub(&ra, &ra, &rb);
    p448_bias(&ra, 2);
    return p448_is_zero(&ra);
 }

 /* Masked copy: a->limb[i] = b->limb[i] & mask for every limb.  With mask
  * all ones this copies b; with mask zero it clears a. */
 void
 p448_mask (
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) {
    const unsigned int nlimbs = sizeof(a->limb)/sizeof(a->limb[0]);
    unsigned int k = 0;

    while (k<nlimbs) {
        a->limb[k] = b->limb[k] & mask;
        k++;
    }
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __P448_H__ */
--- a/src/arch_x86_64/x86-64-arith.h
+++ b/src/arch_x86_64/x86-64-arith.h
@@ -0,0 +1,279 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #ifndef __X86_64_ARITH_H__
 #define __X86_64_ARITH_H__

 #include <stdint.h>

 /* TODO: non x86-64 versions of these.
 * FUTURE: autogenerate
 */

 /*
  * Full 64x64 -> 128-bit unsigned product of *a and *b.
  *
  * Non-BMI2 path: mulq with the result in rdx:rax.  The template writes rax
  * (movq) before mulq reads the memory operand %[b], so the rax output is
  * marked early-clobber ("=&a"); otherwise the compiler may place b's
  * address in rax and the multiply would read from a clobbered pointer.
  * BMI2 path: mulx, which takes its implicit operand in rdx and leaves the
  * flags untouched.
  */
 static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
  #ifndef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rax;"
       "mulq %[b];"
       : [c]"=&a"(c), [d]"=d"(d)
       : [b]"m"(*b), [a]"m"(*a)
       : "cc");
  return (((__uint128_t)(d))<<64) | c;
  #else
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx;"
       "mulx %[b], %[c], %[d];"
       : [c]"=r"(c), [d]"=r"(d)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx");
  return (((__uint128_t)(d))<<64) | c;
  #endif
 }

 /*
  * Full 64x64 -> 128-bit unsigned product of a (by value) and *b (memory).
  *
  * Non-BMI2 path: the template writes rax (movq) before mulq reads %[b], so
  * the rax output is early-clobber ("=&a") to stop the compiler from
  * placing b's address in rax.
  * BMI2 path: a is pinned to rdx by the "d" constraint, which is mulx's
  * implicit source operand, so %[a] does not appear in the template.
  */
 static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
  #ifndef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rax;"
       "mulq %[b];"
       : [c]"=&a"(c), [d]"=d"(d)
       : [b]"m"(*b), [a]"r"(a)
       : "cc");
  return (((__uint128_t)(d))<<64) | c;
  #else
  uint64_t c,d;
  __asm__ volatile
      ("mulx %[b], %[c], %[d];"
       : [c]"=r"(c), [d]"=r"(d)
       : [b]"m"(*b), [a]"d"(a));
  return (((__uint128_t)(d))<<64) | c;
  #endif
 }

 /*
  * 128-bit product of (2 * *a) and *b.  The doubling is done in a 64-bit
  * register, so the top bit of *a is discarded; callers are expected to
  * pass values with headroom.
  *
  * Non-BMI2 path: rax is written (movq/addq) before mulq reads %[b], so the
  * rax output is early-clobber ("=&a") to stop the compiler from placing
  * b's address in rax.
  * BMI2 path: doubles with leaq (flags untouched) and multiplies with mulx.
  */
 static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
  #ifndef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rax; "
       "addq %%rax, %%rax; "
       "mulq %[b];"
       : [c]"=&a"(c), [d]"=d"(d)
       : [b]"m"(*b), [a]"m"(*a)
       : "cc");
  return (((__uint128_t)(d))<<64) | c;
  #else
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx;"
       "leaq (,%%rdx,2), %%rdx;"
       "mulx %[b], %[c], %[d];"
       : [c]"=r"(c), [d]"=r"(d)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx");
  return (((__uint128_t)(d))<<64) | c;
  #endif
 }

 /*
  * Multiply-accumulate: *acc += (*a) * (*b), wrapping modulo 2^128.
  *
  * The accumulator is split into 64-bit halves lo/hi, the 128-bit product
  * is added with an add/adc pair, and the halves are recombined.
  */
 static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  
  #ifdef __BMI2__
  /* mulx takes its implicit multiplicand in rdx and does not touch flags. */
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "addq %[c], %[lo]; "
       "adcq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  /* mulq leaves the product in rdx:rax; both are declared as clobbers. */
  __asm__ volatile
      ("movq %[a], %%rax; "
       "mulq %[b]; "
       "addq %%rax, %[lo]; "
       "adcq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 /*
  * Double multiply-accumulate: computes (*a) * (*b) once and adds the
  * 128-bit product to both *acc and *acc2 (each wrapping modulo 2^128).
  */
 static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  uint64_t lo2 = *acc2, hi2 = *acc2>>64;
  
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "addq %[c], %[lo]; "
       "adcq %[d], %[hi]; "
       "addq %[c], %[lo2]; "
       "adcq %[d], %[hi2]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "mulq %[b]; "
       "addq %%rax, %[lo]; "
       "adcq %%rdx, %[hi]; "
       "addq %%rax, %[lo2]; "
       "adcq %%rdx, %[hi2]; "
       : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  
  *acc = (((__uint128_t)(hi))<<64) | lo;
  *acc2 = (((__uint128_t)(hi2))<<64) | lo2;
 }

 /*
  * Multiply-accumulate with a register operand: *acc += a * (*b), wrapping
  * modulo 2^128.  Same as mac() except that a is passed by value.
  */
 static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  
  #ifdef __BMI2__
  /* a is pinned to rdx via the "d" constraint: mulx reads it implicitly,
   * which is why %[a] does not appear in the template. */
  uint64_t c,d;
  __asm__ volatile
      ("mulx %[b], %[c], %[d]; "
       "addq %[c], %[lo]; "
       "adcq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"d"(a)
       : "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "mulq %[b]; "
       "addq %%rax, %[lo]; "
       "adcq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"r"(a)
       : "rax", "rdx", "cc");
  #endif
  
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 /*
  * Doubled multiply-accumulate: *acc += (2 * *a) * (*b), wrapping modulo
  * 2^128.  The doubling happens in a 64-bit register (addq reg, reg), so
  * the top bit of *a is discarded.
  */
 static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "addq %%rdx, %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "addq %[c], %[lo]; "
       "adcq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "addq %%rax, %%rax; "
       "mulq %[b]; "
       "addq %%rax, %[lo]; "
       "adcq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 /*
  * Multiply-subtract: *acc -= (*a) * (*b), wrapping modulo 2^128.
  * Mirror image of mac(), using a sub/sbb pair instead of add/adc.
  */
 static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "subq %[c], %[lo]; "
       "sbbq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "mulq %[b]; "
       "subq %%rax, %[lo]; "
       "sbbq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 /*
  * Doubled multiply-subtract: *acc -= (2 * *a) * (*b), wrapping modulo
  * 2^128.  The doubling happens in a 64-bit register, so the top bit of *a
  * is discarded.
  */
 static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "addq %%rdx, %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "subq %[c], %[lo]; "
       "sbbq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "addq %%rax, %%rax; "
       "mulq %[b]; "
       "subq %%rax, %[lo]; "
       "sbbq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  *acc = (((__uint128_t)(hi))<<64) | lo;
  
 }

 /*
  * Multiply, reverse-subtract: *acc = (*a) * (*b) - *acc, wrapping modulo
  * 2^128.
  *
  * The BMI2 path uses mulx.  lo/hi are declared "+r" there, although they
  * are only read, so that the register allocator cannot overlap them with
  * the c/d outputs, which mulx writes before lo/hi are consumed.
  * The fallback path uses mulq so that builds without BMI2 never execute
  * mulx; its rax/rdx outputs are early-clobber because they are written
  * before the remaining inputs are read.
  */
 static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t c,d, lo = *acc, hi = *acc>>64;
  #ifdef __BMI2__
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "subq %[lo], %[c]; "
       "sbbq %[hi], %[d]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "mulq %[b]; "
       "subq %[lo], %%rax; "
       "sbbq %[hi], %%rdx; "
       : [c]"=&a"(c), [d]"=&d"(d)
       : [b]"m"(*b), [a]"m"(*a), [lo]"r"(lo), [hi]"r"(hi)
       : "cc");
  #endif
  *acc = (((__uint128_t)(d))<<64) | c;
 }

 /* Unsigned 64x64 -> 128-bit product in plain C (no inline assembly). */
 static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) {
  __uint128_t wide = a;
  wide *= b;
  return wide;
 }

 /* Signed 64x64 -> 128-bit product in plain C (no inline assembly). */
 static __inline__ __int128_t widemuls(int64_t a, int64_t b) {
  __int128_t wide = a;
  return wide * b;
 }
 
 /*
  * Returns x unchanged, routed through an empty asm statement with a "+r"
  * constraint.  The compiler must treat the value as possibly modified, so
  * it cannot reason about the result from the input -- presumably used to
  * stop the optimizer from undoing constant-time constructions.
  */
 static __inline__ uint64_t opacify(uint64_t x) {
  __asm__ volatile("" : "+r"(x));
  return x;
 }

 /*
  * Branch-free zero test: returns an all-ones mask when x == 0 and zero
  * otherwise.
  *
  * neg sets the carry flag exactly when x != 0; sbb x,x then yields -CF,
  * i.e. all ones for non-zero x.  The final complement flips that into
  * "all ones iff zero".  The flags are modified, so "cc" is listed as a
  * clobber like in the other helpers of this header.
  */
 static __inline__ mask_t is_zero(uint64_t x) {
  __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x) : : "cc");
  return ~x;
 }

 #endif /* __X86_64_ARITH_H__ */
--- a/src/barrett_field.c
+++ b/src/barrett_field.c
@@ -0,0 +1,349 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "barrett_field.h"
 #include <string.h>
 #include <assert.h>

 /*
  * out = a + (c & mask) on packed words, without modular reduction.
  *
  * a has nwords_a words and c has nwords_c words; the loops assume
  * nwords_c <= nwords_a.  Every word of c is ANDed with mask, so mask = 0
  * turns the call into a copy and mask = all ones into a plain addition.
  * Returns the carry out of the top word.  out may be the same array as a.
  */
 word_t
 add_nr_ext_packed(
    word_t *out,
    const word_t *a,
    uint32_t nwords_a,
    const word_t *c,
    uint32_t nwords_c,
    word_t mask
 ) {
    dword_t carry = 0;
    uint32_t k = 0;

    /* Words where both operands contribute. */
    while (k<nwords_c) {
        carry += a[k];
        carry += c[k]&mask;
        out[k] = carry;
        carry >>= WORD_BITS;
        k++;
    }
    /* Remaining words of a: just ripple the carry. */
    while (k<nwords_a) {
        carry += a[k];
        out[k] = carry;
        carry >>= WORD_BITS;
        k++;
    }
    return carry;
 }

 /* In-place packed addition a += c over nwords words, without reduction.
  * Returns the carry out of the top word. */
 static __inline__ word_t
 add_nr_packed(
    word_t *a,
    const word_t *c,
    uint32_t nwords
 ) {
    dword_t carry = 0;
    uint32_t k = 0;

    while (k<nwords) {
        carry += a[k];
        carry += c[k];
        a[k] = carry;
        carry >>= WORD_BITS;
        k++;
    }
    return carry;
 }

 /*
  * out = a - (c & mask) on packed words, without modular reduction.
  *
  * a has nwords_a words and c has nwords_c words; the loops assume
  * nwords_c <= nwords_a.  The running value is a signed double word, so
  * after each arithmetic right shift it is 0 or -1 (the borrow).  The final
  * borrow is returned converted to word_t.  out may be the same array as a.
  */
 word_t
 sub_nr_ext_packed(
    word_t *out,
    const word_t *a,
    uint32_t nwords_a,
    const word_t *c,
    uint32_t nwords_c,
    word_t mask
 ) {
    dsword_t carry = 0;
    uint32_t k = 0;

    /* Words where both operands contribute. */
    while (k<nwords_c) {
        carry += a[k];
        carry -= c[k]&mask;
        out[k] = carry;
        carry >>= WORD_BITS;
        k++;
    }
    /* Remaining words of a: just ripple the borrow. */
    while (k<nwords_a) {
        carry += a[k];
        out[k] = carry;
        carry >>= WORD_BITS;
        k++;
    }
    return carry;
 }

 /*
  * accum += mier * mand + carry, where mier is a multi-word number and
  * mand a single word.
  *
  * accum has nwords_accum words and mier has nwords_mier words, with
  * nwords_mier <= nwords_accum required.  Returns the carry out of the top
  * word of accum.
  */
 static word_t
 widemac(
    word_t *accum,
    uint32_t nwords_accum,
    const word_t *mier,
    uint32_t nwords_mier,
    word_t mand,
    word_t carry
 ) {
    uint32_t i = 0;
    assert(nwords_mier <= nwords_accum);

    /* Multiply-accumulate over the words of mier. */
    while (i<nwords_mier) {
 #ifdef __clang_analyzer__
        /* always true, but this satisfies scan-build (bug in scan-build?) */
        assert(i<nwords_accum);
 #endif
        /* UMAAL-style step: word*word + word + word fits a double word. */
        dword_t product = ((dword_t)mand) * mier[i] + accum[i] + carry;
        accum[i] = product;
        carry = product >> WORD_BITS;
        i++;
    }

    /* Ripple the carry through the rest of accum. */
    while (i<nwords_accum) {
        dword_t sum = ((dword_t)carry) + accum[i];
        accum[i] = sum;
        carry = sum >> WORD_BITS;
        i++;
    }

    return carry;
 }

 /*
  * Negate a modulo the prime, in place: a is first reduced with
  * barrett_reduce and then replaced by p - a.
  *
  * a has nwords_a words.  The prime is described as 2^big - p_lo, with p_lo
  * stored in prime->p_lo (nwords_lo words) and the 2^big term expressed via
  * nwords_p and p_shift.  Words of a beyond nwords_p are asserted to be zero
  * after reduction.
  */
 void
 barrett_negate (
    word_t *a,
    uint32_t nwords_a,
    const struct barrett_prime_t *prime
 ) {
    uint32_t i;
    dsword_t carry = 0;
    
    barrett_reduce(a,nwords_a,0,prime);
    
    /* Have p = 2^big - p_lo.  Want p - a = 2^big - p_lo - a */
    
    /* Low words: -p_lo - a with a signed running borrow. */
    for (i=0; i<prime->nwords_lo; i++) {
        a[i] = carry = carry - prime->p_lo[i] - a[i];
        carry >>= WORD_BITS;
    }
    /* Remaining words of p: -a only.  The borrow is deliberately NOT shifted
     * out after the top word, so `carry` still holds that word's full signed
     * value for the fix-up below. */
    for (; i<prime->nwords_p; i++) {
        a[i] = carry = carry - a[i];
        if (i<prime->nwords_p-1) {
            carry >>= WORD_BITS;
        }
    }
    
    /* Add the 2^big term into the top word. */
    a[prime->nwords_p-1] = carry = carry + (((word_t)1) << prime->p_shift);
    
    for (; i<nwords_a; i++) {
        assert(!a[i]);
    }
    
    assert(!(carry>>WORD_BITS));
 }

 /*
  * Reduce the nwords_a-word number a (plus an extra carry-in a_carry above
  * its top word) modulo the prime, in place.
  *
  * The prime is handled as 2^big - p_lo (p_lo in prime->p_lo, nwords_lo
  * words; the 2^big term sits at bit p_shift of word nwords_p-1, or one
  * word higher when p_shift is 0).  Requires nwords_a >= nwords_p > 0 and
  * a_carry < 2^p_shift.  On return only the low nwords_p words are
  * expected to be non-zero.
  */
 void
 barrett_reduce(
    word_t *a,
    uint32_t nwords_a,
    word_t a_carry,
    const struct barrett_prime_t *prime
 ) {
    uint32_t repeat, nwords_left_in_a=nwords_a;
    
    /* Is there a point to this a_carry business? */
    assert(a_carry < ((word_t)1) << prime->p_shift);
    assert(nwords_a >= prime->nwords_p);
    assert(prime->nwords_p > 0); /* scan-build: prevent underflow */
    
    /* Fold from the top down: each pass strips the bits at or above
     * 2^big from the current top word (mand) and adds mand * p_lo back
     * into the nwords_p-word window below it, since 2^big = p_lo (mod p). */
    for (; nwords_left_in_a >= prime->nwords_p; nwords_left_in_a--) {
        for (repeat=0; repeat<2; repeat++) {
            /* PERF: surely a more careful implementation could
             * avoid this double round
             */
            word_t mand = a[nwords_left_in_a-1] >> prime->p_shift;
            a[nwords_left_in_a-1] &= (((word_t)1)<<prime->p_shift)-1;
            if (prime->p_shift && !repeat) {
                /* collect high bits when there are any */
                if (nwords_left_in_a < nwords_a) {
                    mand |= a[nwords_left_in_a] << (WORD_BITS-prime->p_shift);
                    a[nwords_left_in_a] = 0;
                } else {
                    mand |= a_carry << (WORD_BITS-prime->p_shift);
                }
            }
            
            word_t carry = widemac(
                a+nwords_left_in_a-prime->nwords_p,
                prime->nwords_p,
                prime->p_lo,
                prime->nwords_lo,
                mand,
                0
            );
            assert(!carry);
            (void)carry;
        }
    }
    
    assert(nwords_left_in_a == prime->nwords_p-1);
    
    /* OK, but it still isn't reduced.  Add and subtract p_lo. */
    /* a + p_lo reaches 2^big exactly when a >= p; cout captures that bit. */
    word_t cout = add_nr_ext_packed(a,a,prime->nwords_p,prime->p_lo,prime->nwords_lo,-1);
    if (prime->p_shift) {
        cout = (cout<<(WORD_BITS-prime->p_shift)) + (a[prime->nwords_p-1]>>prime->p_shift);
        a[prime->nwords_p-1] &= (((word_t)1)<<prime->p_shift)-1;
    }
    
    /* mask = carry-1: if no carry then do sub, otherwise don't */
    sub_nr_ext_packed(a,a,prime->nwords_p,prime->p_lo,prime->nwords_lo,cout-1);
 }

 /* PERF: This function is horribly slow.  Enough to break 1%. */
 /*
  * Modular multiply, or multiply-accumulate, on packed words.
  *
  * Computes a * b mod p into a temporary.  When doMac is non-zero the
  * previous contents of accum are added before the final reduction; when it
  * is zero accum is simply overwritten.  The product is built word by word
  * of b, most significant first, shifting the temporary up one word and
  * reducing after every step.
  *
  * Requires nwords_accum >= prime->nwords_p.  Words of accum beyond the
  * temporary's size are cleared.  The temporary is a variable-length array
  * on the stack.
  */
 void
 barrett_mul_or_mac(
    word_t *accum,
    uint32_t nwords_accum,
    
    const word_t *a,
    uint32_t nwords_a,
    
    const word_t *b,
    uint32_t nwords_b,
    
    const struct barrett_prime_t *prime,
    
    mask_t doMac
 ) {
    assert(nwords_accum >= prime->nwords_p);
    
    /* nwords_tmp = max(nwords_a + 1, nwords_p + 1, nwords_accum if doMac); */
    uint32_t nwords_tmp = (nwords_a > prime->nwords_p) ? nwords_a : prime->nwords_p;
    nwords_tmp++;
    assert(nwords_tmp > 0); /* scan-build: prevent underflow. */
    if (nwords_tmp < nwords_accum && doMac)
        nwords_tmp = nwords_accum;
    
    word_t tmp[nwords_tmp];
    int bpos, idown;
    uint32_t i;
    
    for (i=0; i<nwords_tmp; i++) {
        tmp[i] = 0;
    }
    
    /* Horner-style evaluation over the words of b, top word first. */
    for (bpos=nwords_b-1; bpos >= 0; bpos--) {
        /* Invariant at the beginning of the loop: the high word is unused. */
        assert(tmp[nwords_tmp-1] == 0);
        
        /* shift up */
        for (idown=nwords_tmp-2; idown>=0; idown--) {
            tmp[idown+1] = tmp[idown];
        }
        tmp[0] = 0;

        /* mac and reduce */
        word_t carry = widemac(tmp, nwords_tmp, a, nwords_a, b[bpos], 0);
        
        /* the mac can't carry, because nwords_tmp >= nwords_a+1 and its high word is clear */
        assert(!carry);
        barrett_reduce(tmp, nwords_tmp, carry, prime);
        
        /* at this point, the number of words used is nwords_p <= nwords_tmp-1,
         * so the high word is again clear */
    }
    
    if (doMac) {
        /* Safe: when doMac is set, nwords_tmp >= nwords_accum (see above). */
        word_t cout = add_nr_packed(tmp, accum, nwords_accum);
        barrett_reduce(tmp, nwords_tmp, cout, prime);
    }
    
    /* Copy the result out; any surplus words of tmp must be zero, and any
     * surplus words of accum are cleared. */
    for (i=0; i<nwords_tmp && i<nwords_accum; i++) {
        accum[i] = tmp[i];
    }
    for (; i<nwords_tmp; i++) {
        assert(tmp[i] == 0);
    }
    for (; i<nwords_accum; i++) {
        accum[i] = 0;
    }
 }
 /*
  * Read a little-endian byte string into the nwords_p words of x and check
  * that the value is below the prime.
  *
  * The number of bytes consumed is nwords_p * sizeof(word_t), minus the
  * whole bytes of the top word that lie above bit p_shift when p_shift is
  * non-zero.  Returns an all-ones mask when x < p and zero otherwise; x is
  * written in both cases.
  */
 mask_t
 barrett_deserialize (
    word_t *x,
    const uint8_t *serial,
    const struct barrett_prime_t *prime
 ) {
    unsigned int i,j,nserial = prime->nwords_p * sizeof(word_t);
    if (prime->p_shift) {
        nserial -= (WORD_BITS - prime->p_shift) / 8;
    }

    
    /* Track x < p, p = 2^k - p_lo <==> x + p_lo < 2^k */
    dword_t carry = 0;
    
    for (i=0; i*sizeof(word_t)<nserial; i++) {
        /* Shift at the top of the loop, so that after the last iteration
         * carry still holds the unshifted top-word sum. */
        carry >>= WORD_BITS;
        
        word_t the = 0;
        for (j=0; j<sizeof(word_t) && sizeof(word_t)*i+j < nserial; j++) {
            the |= ((word_t)serial[sizeof(word_t)*i+j]) << (8*j);
        }
        x[i] = the;
        
        carry += the;
        if (i < prime->nwords_lo) carry += prime->p_lo[i];
    }
    
    /* check for reduction */
    if (prime->p_shift) {
        carry >>= prime->p_shift;
    } else {
        carry >>= WORD_BITS;
    }
    
    /* at this point, carry > 0 indicates failure */
    /* Branch-free conversion: negate and shift the sign bit all the way
     * down, giving -1 for carry > 0 and 0 for carry == 0. */
    dsword_t scarry = carry;
    scarry = -scarry;
    scarry >>= WORD_BITS;
    scarry >>= WORD_BITS;
    
    return (mask_t) ~scarry;
 }
    
 /*
  * Read nserial little-endian bytes, reduce the value modulo the prime and
  * store the nwords_p-word result in x.
  *
  * Unlike barrett_deserialize, the input may be any length and any value.
  * The scratch buffer is a variable-length array on the stack, sized to the
  * larger of the input and the prime.
  */
 void
 barrett_deserialize_and_reduce (
    word_t *x,
    const uint8_t *serial,
    uint32_t nserial,
    const struct barrett_prime_t *prime
 ) {
    unsigned int i;
    unsigned int size = (nserial + sizeof(word_t) - 1)/sizeof(word_t);
    if (size < prime->nwords_p) {
        size = prime->nwords_p;
    }

    word_t tmp[size];
    memset(tmp,0,sizeof(tmp));

    /* Scatter every input byte into its word; tmp starts out all zero. */
    for (i=0; i<nserial; i++) {
        tmp[i/sizeof(word_t)] |= ((word_t)serial[i]) << (8*(i%sizeof(word_t)));
    }

    barrett_reduce(tmp,size,0,prime);

    for (i=0; i<prime->nwords_p; i++) {
        x[i] = tmp[i];
    }
    /* Anything above the prime's width must have been reduced away. */
    for (; i<size; i++) {
        assert(!tmp[i]);
    }
 }

 /*
  * Write the packed words of x to serial as nserial little-endian bytes.
  *
  * Exactly nserial bytes are written.  When nserial is not a multiple of
  * sizeof(word_t), only the low-order bytes of the last word are emitted;
  * the byte index is bounded the same way as in the deserializers, so the
  * function never writes past serial[nserial-1].
  */
 void
 barrett_serialize (
    uint8_t *serial,
    const word_t *x,
    uint32_t nserial
 ) {
    unsigned int i,j;
    for (i=0; i*sizeof(word_t)<nserial; i++) {
        for (j=0; j<sizeof(word_t) && sizeof(word_t)*i+j < nserial; j++) {
            serial[sizeof(word_t)*i+j] = x[i]>>(8*j);
        }
    }
 }
--- a/src/crandom.c
+++ b/src/crandom.c
@@ -0,0 +1,442 @@
 /* Copyright (c) 2011 Stanford University.
 * Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 /* Chacha random number generator code copied from crandom */

 #include "intrinsics.h"
 #include "crandom.h"
 #include <stdio.h>

 /* CPU feature bitmask for this module; starts out empty.
  * NOTE(review): presumably filled from crandom_detect_features() and read by
  * the HAVE() macro in intrinsics.h -- confirm there. */
 volatile unsigned int crandom_features = 0;

 /**
  * Probe the CPU for the optional instruction-set features this file can use.
  *
  * On x86 and x86-64 this runs CPUID and sets one flag per feature found.
  * On every other architecture only GEN is reported.
  *
  * NOTE(review): AVX/AVX2 are reported from the CPUID bits alone; operating
  * system support for the wider registers is not checked here.
  *
  * @return A bitwise OR of GEN and any of SSE2, SSSE3, AESNI, AVX, AVX2,
  * RDRAND and XOP.
  */
 unsigned int crandom_detect_features() {
   unsigned int out = GEN;
   
 # if (defined(__i386__) || defined(__x86_64__))
    u_int32_t a,b,c,d,max_basic;
    
    /* Leaf 0: EAX returns the highest basic leaf the CPU implements. */
    a=0; c=0; __asm__("cpuid" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
    max_basic = a;
    
    if (max_basic >= 1) {
        /* Leaf 1: SSE2 is reported in EDX; the rest are reported in ECX. */
        a=1; c=0; __asm__("cpuid" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
        if (d & 1<<26) out |= SSE2;
        if (c & 1<< 9) out |= SSSE3;
        if (c & 1<<25) out |= AESNI;
        if (c & 1<<28) out |= AVX;
        if (c & 1<<30) out |= RDRAND;
    }
    
    if (max_basic >= 7) {
        /* Leaf 7, subleaf 0 (ECX must be 0 on entry): AVX2 is EBX bit 5. */
        a=7; c=0; __asm__("cpuid" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
        if (b & 1<<5) out |= AVX2;
    }
    
    /* Extended leaf 0x80000000: EAX returns the highest extended leaf. */
    a=0x80000000; c=0; __asm__("cpuid" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
    if (a >= 0x80000001) {
        a=0x80000001; c=0; __asm__("cpuid" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
        if (c & 1<<11) out |= XOP;
    }
 # endif
   
   return out;
 }



 /**
  * Fetch 64 bits from the hardware random number generator.
  *
  * The RDRAND instruction is retried while it reports failure (carry clear),
  * up to 1000 attempts in total.  On i386 the value is assembled from two
  * 32-bit reads that share that attempt budget.
  *
  * @param [in] abort_on_fail If nonzero, abort() when no value was obtained,
  * including when the CPU has no RDRAND.
  * @return The random value, or 0 if none was obtained and abort_on_fail is 0.
  */
 INTRINSIC u_int64_t rdrand(int abort_on_fail) {
    uint64_t out = 0;
    int ok = 0;
    
    if (HAVE(RDRAND)) {
    # if defined(__x86_64__)
        /* "val" must not shadow "out", or the result never leaves this block. */
        u_int64_t val = 0, a = 0;
        int tries = 1000;
        for (; tries && !a; tries--) {
            __asm__ __volatile__ (
                "rdrand %0\n\tsetc %%al"
                    : "=r"(val), "+a"(a) :: "cc"
            );
        }
        /* Success is the carry flag captured in a, not the tries left. */
        if (a) {
            out = val;
            ok = 1;
        }
    # elif (defined(__i386__))
        u_int32_t hi = 0, lo = 0, a = 0;
        int tries = 1000;
        for (; tries && !a; tries--) {
            __asm__ __volatile__ (
                "rdrand %0\n\tsetc %%al"
                    : "=r"(hi), "+a"(a) :: "cc"
            );
        }
        if (a) {
            a = 0;
            for (; tries && !a; tries--) {
                __asm__ __volatile__ (
                    "rdrand %0\n\tsetc %%al"
                        : "=r"(lo), "+a"(a) :: "cc"
                );
            }
            if (a) {
                out = (uint64_t)hi << 32 | lo;
                ok = 1;
            }
        }
    # else
        abort(); // whut
    # endif
    }
    
    if (abort_on_fail && !ok) {
        abort();
    }
    
    return out;
 }


 /* ------------------------------- Vectorized code ------------------------------- */
 /*
  * shuffle(x,i): rotate the four 32-bit lanes of x, so that output lane k
  * takes input lane (k+i)&3.  The immediate is computed from i, so i must be
  * a compile-time constant in 0..3 (the first term is not masked).
  */
 #define shuffle(x,i) _mm_shuffle_epi32(x, \
  i + ((i+1)&3)*4 + ((i+2)&3)*16 + ((i+3)&3)*64)

 /* Lane-wise additions: add on 32-bit lanes, add64 on 64-bit lanes. */
 #define add _mm_add_epi32
 #define add64 _mm_add_epi64

 /*
  * Which quarter-round variants to compile.  Each vector variant is built
  * only if its feature might be present and no better one is guaranteed; the
  * scalar fallback (NEED_CONV) is built unless SSE2 is guaranteed.
  * NOTE(review): assumes MIGHT_HAVE(f) means "f may be available at run time"
  * and MUST_HAVE(f) means "f is guaranteed by the build" -- confirm in
  * intrinsics.h.
  */
 #define NEED_XOP   (MIGHT_HAVE(XOP))
 #define NEED_SSSE3 (MIGHT_HAVE(SSSE3) && !MUST_HAVE(XOP))
 #define NEED_SSE2  (MIGHT_HAVE(SSE2)  && !MUST_HAVE(SSSE3))
 #define NEED_CONV  (!MUST_HAVE(SSE2))

 #if NEED_XOP
 /*
  * One quarter-round over four vector rows, updated in place.  All four
  * rotations (16, 12, 8, 7) go through xop_rotate().
  */
 static __inline__ void
 quarter_round_xop(
    ssereg *a,
    ssereg *b,
    ssereg *c,
    ssereg *d
 ) {
    ssereg ra = *a, rb = *b, rc = *c, rd = *d;

    ra = add(ra,rb);
    rd = xop_rotate(16, rd ^ ra);

    rc = add(rc,rd);
    rb = xop_rotate(12, rb ^ rc);

    ra = add(ra,rb);
    rd = xop_rotate(8,  rd ^ ra);

    rc = add(rc,rd);
    rb = xop_rotate(7,  rb ^ rc);

    *a = ra; *b = rb; *c = rc; *d = rd;
 }
 #endif

 #if NEED_SSSE3
 /* Byte-shuffle masks that rotate every 32-bit lane left by 8 and by 16 bits. */
 static const ssereg shuffle8  = { 0x0605040702010003ull, 0x0E0D0C0F0A09080Bull };
 static const ssereg shuffle16 = { 0x0504070601000302ull, 0x0D0C0F0E09080B0Aull };
  
 /* Rotate each 32-bit lane of a left by 8 bits with a single byte shuffle. */
 INTRINSIC ssereg ssse3_rotate_8(ssereg a) {
    ssereg rotated = _mm_shuffle_epi8(a, shuffle8);
    return rotated;
 }
  
 /* Rotate each 32-bit lane of a left by 16 bits with a single byte shuffle. */
 INTRINSIC ssereg ssse3_rotate_16(ssereg a) {
    ssereg rotated = _mm_shuffle_epi8(a, shuffle16);
    return rotated;
 }
  
 /*
  * One quarter-round over four vector rows, updated in place.  The rotations
  * by 16 and 8 use byte shuffles; those by 12 and 7 use sse2_rotate().
  */
 static __inline__ void
 quarter_round_ssse3(
    ssereg *a,
    ssereg *b,
    ssereg *c,
    ssereg *d
 ) {
    ssereg ra = *a, rb = *b, rc = *c, rd = *d;

    ra = add(ra,rb);
    rd = ssse3_rotate_16(rd ^ ra);

    rc = add(rc,rd);
    rb = sse2_rotate(12, rb ^ rc);

    ra = add(ra,rb);
    rd = ssse3_rotate_8( rd ^ ra);

    rc = add(rc,rd);
    rb = sse2_rotate(7,  rb ^ rc);

    *a = ra; *b = rb; *c = rc; *d = rd;
 }
 #endif /* MIGHT_HAVE(SSSE3) && !MUST_HAVE(XOP) */

 #if NEED_SSE2
 /*
  * One quarter-round over four vector rows, updated in place.  All four
  * rotations (16, 12, 8, 7) go through sse2_rotate().
  */
 static __inline__ void
 quarter_round_sse2(
    ssereg *a,
    ssereg *b,
    ssereg *c,
    ssereg *d
 ) {
    ssereg ra = *a, rb = *b, rc = *c, rd = *d;

    ra = add(ra,rb);
    rd = sse2_rotate(16, rd ^ ra);

    rc = add(rc,rd);
    rb = sse2_rotate(12, rb ^ rc);

    ra = add(ra,rb);
    rd = sse2_rotate(8,  rd ^ ra);

    rc = add(rc,rd);
    rb = sse2_rotate(7,  rb ^ rc);

    *a = ra; *b = rb; *c = rc; *d = rd;
 }
 #endif

 /*
  * DOUBLE_ROUND(qrf): two rounds on two blocks at once.
  *
  * Expects ssereg locals a1,b1,c1,d1 and a2,b2,c2,d2 in the enclosing scope.
  * Applies the quarter-round function qrf to both blocks, rotates the lanes
  * of the b, c and d rows by 1, 2 and 3, applies qrf again, then rotates the
  * rows by 3, 2 and 1 to undo the lane rotation.
  */
 #define DOUBLE_ROUND(qrf) { \
  qrf(&a1,&b1,&c1,&d1);     \
  qrf(&a2,&b2,&c2,&d2);     \
  b1 = shuffle(b1,1);       \
  c1 = shuffle(c1,2);       \
  d1 = shuffle(d1,3);       \
  b2 = shuffle(b2,1);       \
  c2 = shuffle(c2,2);       \
  d2 = shuffle(d2,3);       \
                            \
  qrf(&a1,&b1,&c1,&d1);     \
  qrf(&a2,&b2,&c2,&d2);     \
  b1 = shuffle(b1,3);       \
  c1 = shuffle(c1,2);       \
  d1 = shuffle(d1,1);       \
  b2 = shuffle(b2,3);       \
  c2 = shuffle(c2,2);       \
  d2 = shuffle(d2,1);       \
                          }
                          
 /*
  * OUTPUT_FUNCTION: emit two finished blocks and set up the next two.
  *
  * Expects the working rows a1..d1 / a2..d2, the saved input rows aa,bb,cc,dd,
  * the increment p and the cursor "output" in the enclosing scope.  Stores
  * eight vectors (each working row plus its input row), advances output by
  * eight vectors, advances cc by 2*p, and reloads the working rows so that
  * block 1 starts from cc and block 2 from cc+p.
  *
  * NOTE(review): the feed-forward for block 2 adds cc+p with 32-bit lanes
  * (add), while c2 itself was formed with a 64-bit add (add64).  The two
  * agree unless the low 32 bits of the counter wrap -- confirm intended.
  */
 #define OUTPUT_FUNCTION   { \
  output[0] = add(a1,aa);   \
  output[1] = add(b1,bb);   \
  output[2] = add(c1,cc);   \
  output[3] = add(d1,dd);   \
  output[4] = add(a2,aa);   \
  output[5] = add(b2,bb);   \
  output[6] = add(c2,add(cc,p)); \
  output[7] = add(d2,dd);   \
                            \
  output += 8;              \
                            \
  cc = add64(add64(cc,p), p); \
  a1 = a2 = aa;             \
  b1 = b2 = bb;             \
  c1 = cc; c2 = add64(cc,p);\
  d1 = d2 = dd;             \
                          }
 /* ------------------------------------------------------------------------------- */

 /* Rotate the 32-bit word a left by r bits.  The callers in this file pass
  * r = 16, 12, 8 and 7. */
 INTRINSIC u_int32_t rotate(int r, u_int32_t a) {
    const u_int32_t shifted_up   = a << r;
    const u_int32_t wrapped_down = a >> (32-r);
    return shifted_up ^ wrapped_down;
 }

 /*
  * Scalar quarter-round on four 32-bit state words, updated in place:
  * add, xor and rotate by 16, 12, 8 and 7 in turn.
  */
 static __inline__ __attribute__((unused)) void
 quarter_round(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d) {
    u_int32_t wa = *a, wb = *b, wc = *c, wd = *d;

    wa += wb;  wd = rotate(16, wd^wa);
    wc += wd;  wb = rotate(12, wb^wc);
    wa += wb;  wd = rotate(8,  wd^wa);
    wc += wd;  wb = rotate(7,  wb^wc);

    *a = wa; *b = wb; *c = wc; *d = wd;
 }

 /**
  * Expand a 32-byte key, a 64-bit IV and a 64-bit counter into keystream.
  *
  * The state is laid out as: words 0-7 the key, words 8-9 the IV, words 10-11
  * the counter (low word first), words 12-15 the constants.  The vector paths
  * produce 128 bytes (two blocks) per iteration and the scalar path 64 bytes,
  * so the output buffer must hold output_size rounded up to that granularity.
  * The counter advances by one per 64-byte block on every path.
  *
  * The key is copied before any output is written, so key_ and output_ may
  * point at the same memory (crandom_generate relies on this).
  *
  * NOTE(review): the vector paths cast key_ and output_ to ssereg pointers,
  * which presumably requires 16-byte alignment -- confirm.
  *
  * @param [in] iv The 64-bit IV.
  * @param [in] ctr The starting block counter.
  * @param [in] nr The number of rounds; consumed two at a time.
  * @param [in] output_size The number of bytes requested.
  * @param [in] key_ The 32-byte key.
  * @param [out] output_ The keystream buffer.
  */
 static void
 crandom_chacha_expand(u_int64_t iv,
                         u_int64_t ctr,
                         int nr,
                         int output_size,
                         const unsigned char *key_,
                         unsigned char *output_) {
 # if MIGHT_HAVE_SSE2
    if (HAVE(SSE2)) {
        ssereg *key = (ssereg *)key_;
        ssereg *output = (ssereg *)output_;
                 
        ssereg a1 = key[0], a2 = a1, aa = a1,
               b1 = key[1], b2 = b1, bb = b1,
               c1 = {iv, ctr}, c2 = {iv, ctr+1}, cc = c1,
               d1 = {0x3320646e61707865ull, 0x6b20657479622d32ull},
               d2 = d1, dd = d1,
               p = {0, 1};
 
        int i,r;
 #   if (NEED_XOP)
        if (HAVE(XOP)) {
            for (i=0; i<output_size; i+=128) {
                for (r=nr; r>0; r-=2)
                    DOUBLE_ROUND(quarter_round_xop);
                OUTPUT_FUNCTION;
            }
            return;
        }
 #   endif
 #   if (NEED_SSSE3)
        if (HAVE(SSSE3)) {
            for (i=0; i<output_size; i+=128) {
                for (r=nr; r>0; r-=2)
                    DOUBLE_ROUND(quarter_round_ssse3);
                OUTPUT_FUNCTION;
            }
            return;
        }
 #   endif
 #   if (NEED_SSE2)
        if (HAVE(SSE2)) {
            for (i=0; i<output_size; i+=128) {
                for (r=nr; r>0; r-=2)
                    DOUBLE_ROUND(quarter_round_sse2);
                OUTPUT_FUNCTION;
            }
            return;
        }
 #   endif
    }
 # endif

 # if NEED_CONV
    {
        const u_int32_t *key = (const u_int32_t *)key_;
        u_int32_t
        x[16],
        input[16] = {
            key[0], key[1], key[2], key[3],
            key[4], key[5], key[6], key[7],
            iv, iv>>32, ctr, ctr>>32,
            0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
        },
        *output = (u_int32_t *)output_;
        int i, r;

        for (i=0; i<output_size; i+= 64) {
            for (r=0; r<16; r++) {
                x[r] = input[r];
            }
            for (r=nr; r>0; r-=2) {
                /* Columns ... */
                quarter_round(&x[0], &x[4],  &x[8], &x[12]);
                quarter_round(&x[1], &x[5],  &x[9], &x[13]);
                quarter_round(&x[2], &x[6], &x[10], &x[14]);
                quarter_round(&x[3], &x[7], &x[11], &x[15]);

                /* ... then diagonals. */
                quarter_round(&x[0], &x[5], &x[10], &x[15]);
                quarter_round(&x[1], &x[6], &x[11], &x[12]);
                quarter_round(&x[2], &x[7],  &x[8], &x[13]);
                quarter_round(&x[3], &x[4],  &x[9], &x[14]);
            }
            for (r=0; r<16; r++) {
                output[r] = x[r] + input[r];
            }

            output += 16;
            /* Step the 64-bit counter held in words 10 (low) and 11 (high),
             * as the vector paths do.  The carry must never reach word 12,
             * which is a constant. */
            input[10] ++;
            if (!input[10]) input[11]++;
        }
    }
  
 #endif /* NEED_CONV */
 }

 /* "return 4", cf xkcd #221 */
 #define CRANDOM_MAGIC 0x72657475726e2034ull

 /**
  * Seed a generator state with 32 bytes read from a file or random device.
  *
  * The descriptor stays open for later reseeding.  On any failure the state
  * is left unseeded (magic cleared, no descriptor held), so that a later
  * crandom_generate() on it aborts instead of running on a stale seed.
  *
  * @param [out] state The state to initialize.
  * @param [in] filename The seed file or random device to read.
  * @param [in] reseed_interval The number of refills between reseeds.
  * @param [in] reseeds_mandatory If nonzero, a failed reseed calls abort().
  *
  * @retval 0 Success.
  * @retval Nonzero The errno from open() or read(), or -1 if the file ended
  * before 32 bytes were read.
  */
 int
 crandom_init_from_file(
    struct crandom_state_t *state,
    const char *filename,
    int reseed_interval,
    int reseeds_mandatory
 ) {
    /* Not valid until the seed has actually been read. */
    state->magic = 0;

    state->fill = 0;
    state->reseed_countdown = reseed_interval;
    state->reseed_interval = reseed_interval;
    state->ctr = 0;

    state->randomfd = open(filename, O_RDONLY);
    if (state->randomfd == -1) {
        int err = errno;
        return err ? err : -1;
    }

    ssize_t offset = 0, red;
    do {
        red = read(state->randomfd, state->seed + offset, 32 - offset);
        if (red > 0) offset += red;
    } while (red > 0 && offset < 32);

    if (offset < 32) {
        /* errno is only meaningful when read() itself failed; a plain end
         * of file is reported as -1.  Capture it before close() can
         * overwrite it, and do not leak the descriptor. */
        int err = (red < 0) ? errno : 0;
        (void) close(state->randomfd);
        state->randomfd = -1;
        return err ? err : -1;
    }

    memset(state->buffer, 0, 96);

    state->magic = CRANDOM_MAGIC;
    state->reseeds_mandatory = reseeds_mandatory;

    return 0;
 }

 /**
  * Seed a generator state from a caller-supplied 32-byte buffer.
  *
  * The state gets no random device (randomfd is -1) and a reseed interval of
  * zero, so crandom_generate() never reseeds it.
  *
  * @param [out] state The state to initialize.
  * @param [in] initial_seed The 32-byte seed.
  */
 void
 crandom_init_from_buffer(
    struct crandom_state_t *state,
    const char initial_seed[32]
 ) {
    memcpy(state->seed, initial_seed, 32);
    memset(state->buffer, 0, 96);

    /* Every counter and option starts at zero. */
    state->reseeds_mandatory = 0;
    state->ctr = 0;
    state->fill = 0;
    state->reseed_interval = 0;
    state->reseed_countdown = 0;

    state->randomfd = -1;
    state->magic = CRANDOM_MAGIC;
 }

 /*
  * Fill output with length pseudorandom bytes drawn from state.
  *
  * Bytes are handed out from state->buffer, from the end towards the front,
  * and zeroed as they are consumed.  When the buffer runs dry the generator
  * expands 128 bytes over state->seed; that call overwrites the 32-byte seed
  * and, presumably, the 96-byte buffer directly after it in the struct.
  * For states with a nonzero reseed_interval, each refill mixes rdrand() or
  * rdtsc() into the IV, and every reseed_interval refills 32 fresh bytes from
  * randomfd are XORed into the seed first.
  *
  * Aborts if the state is not seeded, or if a reseed read fails while
  * reseeds_mandatory is set.  Returns 0 on success; otherwise the errno of a
  * failed non-mandatory reseed (or -1 if errno was 0), with the output still
  * filled.
  */
 int
 crandom_generate(
    struct crandom_state_t *state,
    unsigned char *output,
    unsigned long long length
 ) {
    /* the generator isn't seeded; maybe they ignored the return value of init_from_file */
    if (unlikely(state->magic != CRANDOM_MAGIC)) {
        abort();
    }

    int ret = 0;

    while (length) {
        /* Buffer exhausted: refill before copying anything out. */
        if (unlikely(state->fill <= 0)) {
            uint64_t iv = 0;
            if (state->reseed_interval) {
                /* it's nondeterministic, stir in some rdrand() or rdtsc() */
                if (HAVE(RDRAND)) {
                    iv = rdrand(0);
                    /* rdrand(0) yields 0 when it obtained nothing; fall back. */
                    if (!iv) iv = rdtsc();
                } else {
                    iv = rdtsc();
                }

                state->reseed_countdown--;
                if (unlikely(state->reseed_countdown <= 0)) {
                    /* reseed by xoring in random state */
                    state->reseed_countdown = state->reseed_interval;
                    ssize_t offset = 0, red;
                    /* The buffer is all zeros here (consumed bytes are wiped),
                     * so it doubles as scratch space for the fresh seed bytes. */
                    do {
                        red = read(state->randomfd, state->buffer + offset, 32 - offset);
                        if (red > 0) offset += red;
                    } while (red > 0 && offset < 32);

                    if (offset < 32) {
                        /* The read failed.  Signal an error with the return code.
                         *
                         * If reseeds are mandatory, crash.
                         *
                         * If not, the generator is still probably safe to use, because reseeding
                         * is basically over-engineering for caution.  Also, the user might ignore
                         * the return code, so we still need to fill the request.
                         *
                         * Set reseed_countdown = 1 so we'll try again later.  If the user's
                         * performance sucks as a result of ignoring the error code while calling
                         * us in a loop, well, that's life.
                         */
                        if (state->reseeds_mandatory) {
                            abort();
                        }

                        ret = errno;
                        if (ret == 0) ret = -1;
                        state->reseed_countdown = 1;
                    }

                    int i;
                    for (i=0; i<32; i++) {
                        /* Stir in the buffer.  If somehow the read failed, it'll be zeros. */
                        state->seed[i] ^= state->buffer[i];
                    }
                }
            }
            /* Key and output are both state->seed: 128 bytes are requested
             * there, replacing the seed and refilling the buffer. */
            crandom_chacha_expand(iv,state->ctr,20,128,state->seed,state->seed);
            state->ctr++;
            state->fill = sizeof(state->buffer);
        }

        /* Hand out bytes from the tail of the buffer and wipe them behind us. */
        unsigned long long copy = (length > state->fill) ? state->fill : length;
        state->fill -= copy;
        memcpy(output, state->buffer + state->fill, copy);
        memset(state->buffer + state->fill, 0, copy);
        output += copy; length -= copy;
    }

    return ret;
 }

 /**
  * Release a generator state: close its random device, if it holds one, and
  * wipe the whole structure so later crandom_generate() calls abort.
  *
  * @param [inout] state The state to destroy.
  */
 void
 crandom_destroy(
    struct crandom_state_t *state
 ) { 
    /* A state seeded from a buffer has randomfd == -1 and nothing to close;
     * descriptor 0 is a valid descriptor and must be closed like any other. */
    if (state->magic == CRANDOM_MAGIC && state->randomfd >= 0) {
        (void) close(state->randomfd);
        /* Ignore the return value from close(), because what would it mean?
         * "Your random device, which you were reading over NFS, lost some data"?
         */
    }

    memset(state, 0, sizeof(*state));
 }
--- a/src/exported.sym
+++ b/src/exported.sym
@@ -0,0 +1,6 @@
 _goldilocks_init
 _goldilocks_keygen
 _goldilocks_shared_secret
 _goldilocks_sign
 _goldilocks_verify
 _goldilocks_private_to_public
--- a/src/goldilocks.c
+++ b/src/goldilocks.c
@@ -0,0 +1,393 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "config.h"
 #include "word.h"

 #include <errno.h>

 #if GOLDILOCKS_USE_PTHREAD
 #include <pthread.h>
 #endif

 #include "goldilocks.h"
 #include "ec_point.h"
 #include "scalarmul.h"
 #include "barrett_field.h"
 #include "crandom.h"
 #include "sha512.h"
 #include "intrinsics.h"

 #ifndef GOLDILOCKS_RANDOM_INIT_FILE
 #define GOLDILOCKS_RANDOM_INIT_FILE "/dev/urandom"
 #endif

 #ifndef GOLDILOCKS_RANDOM_RESEED_INTERVAL
 #define GOLDILOCKS_RANDOM_RESEED_INTERVAL 10000
 #endif

 /* We'll check it ourselves */
 #ifndef GOLDILOCKS_RANDOM_RESEEDS_MANDATORY
 #define GOLDILOCKS_RANDOM_RESEEDS_MANDATORY 0
 #endif

 /* FUTURE: auto */
 /* The curve base point in affine form: the first initializer is x, written
  * as eight U58LE limbs, and the second is y, whose only explicit limb is 19.
  * goldilocks_init() asserts that it passes validate_affine().
  * NOTE(review): limb order and packing depend on U58LE in word.h -- confirm. */
 const struct affine_t goldilocks_base_point = {
    {{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7),
       U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa),
       U58LE(0x5f065c3c59d070), U58LE(0x3a6a26adf73324),
       U58LE(0x1b4faff4609845), U58LE(0x297ea0ea2692ff)
    }},
    {{ 19 }}
 };

 /* Values of goldilocks_global.state.  They are compared by pointer identity,
  * never by content; the text is only there to be readable.  NULL means
  * initialization has not been attempted. */
 static const char *G_INITING = "initializing";
 static const char *G_INITED = "initialized";
 static const char *G_FAILED = "failed to initialize";

 /* FUTURE: auto */
 /* The low words (the p_lo of "p = 2^k - p_lo") of the prime described by
  * goldi_q448: 224 bits, stored in as many words as WORD_BITS requires.
  * NOTE(review): U64LE presumably expands to one or two words depending on
  * WORD_BITS -- confirm in word.h. */
 static const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = {
    U64LE(0xdc873d6d54a7bb0d),
    U64LE(0xde933d8d723a70aa),
    U64LE(0x3bb124b65129c96f),
    0x8335dc16
 };
 /* Barrett description of that prime, in field order: nwords_p, p_shift,
  * nwords_lo, p_lo.  It is passed to every barrett_* call in this file. */
 const struct barrett_prime_t goldi_q448 = {
    448/WORD_BITS,
    62 % WORD_BITS,
    sizeof(goldi_q448_lo)/sizeof(goldi_q448_lo[0]),
    goldi_q448_lo
 };

 /* FUTURE: auto */
 /* Process-wide library state, filled in once by goldilocks_init(). */
 struct {
    /* NULL, G_INITING, G_INITED or G_FAILED; changed only by compare-and-swap. */
    const char * volatile state;
 #if GOLDILOCKS_USE_PTHREAD
    /* Held by goldilocks_keygen() while it draws from rand. */
    pthread_mutex_t mutex;
 #endif
    /* Storage handed to precompute_fixed_base() for the fixed_base table. */
    struct tw_niels_t combs[(WORD_BITS==64) ? 80 : 64];
    /* Fixed-base table used by keygen and sign. */
    struct fixed_base_table_t fixed_base;
    /* Table from precompute_fixed_base_wnaf(), used by verify. */
    struct tw_niels_t wnafs[32];
    /* Random generator seeded from GOLDILOCKS_RANDOM_INIT_FILE. */
    struct crandom_state_t rand;
 } goldilocks_global;

 /* Report whether goldilocks_init() has completed successfully:
  * MASK_SUCCESS if the global state is G_INITED, MASK_FAILURE otherwise. */
 static inline mask_t
 goldilocks_check_init() {
    if (!likely(goldilocks_global.state == G_INITED)) {
        return MASK_FAILURE;
    }
    return MASK_SUCCESS;
 }

 /**
  * One-time library initialization: create the mutex, precompute the
  * fixed-base and wNAF tables from the base point, and seed the random
  * generator.
  *
  * The global state moves from NULL to G_INITING by compare-and-swap, so only
  * one caller performs the work, and then to G_INITED or G_FAILED.  Whatever
  * was acquired is released again if initialization fails.
  *
  * @retval 0 Success.
  * @retval GOLDI_EALREADYINIT The library was already initialized.
  * @retval GOLDI_ECORRUPT The state was neither NULL nor G_INITED.
  * @retval -1 Initialization failed.
  */
 int
 goldilocks_init () {
    const char *res = compare_and_swap(&goldilocks_global.state, NULL, G_INITING);
    if (res == G_INITED) return GOLDI_EALREADYINIT;
    else if (res) {
        return GOLDI_ECORRUPT;
    }

 #if GOLDILOCKS_USE_PTHREAD
    int ret = pthread_mutex_init(&goldilocks_global.mutex, NULL);
    if (ret) goto fail;
 #endif
    
    struct extensible_t ext;
    struct tw_extensible_t text;
    
    /* Sanity check: the base point is on the curve. */
    assert(validate_affine(&goldilocks_base_point));
    
    /* Convert it to twisted Edwards. */
    convert_affine_to_extensible(&ext, &goldilocks_base_point);
    twist_even(&text, &ext);
    
    /* Precompute the tables. */
    mask_t succ;
    
    int big = (WORD_BITS==64);
    uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14;

    succ =  precompute_fixed_base(&goldilocks_global.fixed_base, &text, n, t, s, goldilocks_global.combs);
    succ &= precompute_fixed_base_wnaf(goldilocks_global.wnafs, &text, 5);
    
    int criff_res = crandom_init_from_file(&goldilocks_global.rand,
        GOLDILOCKS_RANDOM_INIT_FILE,
        GOLDILOCKS_RANDOM_RESEED_INTERVAL,
        GOLDILOCKS_RANDOM_RESEEDS_MANDATORY);
        
    /* succ is a mask and !criff_res is 0 or 1; nonzero only if both are good. */
    if (succ & !criff_res) {
        if (!bool_compare_and_swap(&goldilocks_global.state, G_INITING, G_INITED)) {
            abort();
        }
        return 0;
    }
    
    /* It failed!  Give back what was acquired: the random device if the
     * generator did get seeded, and the mutex.  Nothing else can be using
     * them, because the state never reached G_INITED. */
    if (!criff_res) {
        crandom_destroy(&goldilocks_global.rand);
    }
 #if GOLDILOCKS_USE_PTHREAD
    (void) pthread_mutex_destroy(&goldilocks_global.mutex);
 #endif

 fail:
    if (!bool_compare_and_swap(&goldilocks_global.state, G_INITING, G_FAILED)) {
        /* ok something is seriously wrong */
        abort();
    }
    return -1;
 }

 /* Field constant passed to deserialize_and_twist_approx() by
  * goldilocks_verify().
  * NOTE(review): by its name, a square root of d-1 for the curve constant d
  * -- confirm against the curve definition. */
 static const struct p448_t
 sqrt_d_minus_1 = {{
    U58LE(0xd2e21836749f46),
    U58LE(0x888db42b4f0179),
    U58LE(0x5a189aabdeea38),
    U58LE(0x51e65ca6f14c06),
    U58LE(0xa49f7b424d9770),
    U58LE(0xdcac4628c5f656),
    U58LE(0x49443b8748734a),
    U58LE(0x12fec0c0b25b7a)
 }};

 /**
  * Generate a fresh key pair.
  *
  * The private key buffer is laid out as: bytes 0-55 the secret scalar,
  * bytes 56-111 a copy of the public key, bytes 112-143 a random 32-byte key
  * that goldilocks_sign() uses to derive nonces.
  *
  * @param [out] privkey The new private key.
  * @param [out] pubkey The matching public key.
  *
  * @retval GOLDI_EOK Success.
  * @retval GOLDI_EUNINIT The library is not initialized.
  * @retval GOLDI_ENODICE The random generator reported an error; the keys
  * are still written.
  * @retval Other The error code from pthread_mutex_lock(), returned as is.
  */
 int
 goldilocks_keygen (
    struct goldilocks_private_key_t *privkey,
    struct goldilocks_public_key_t *pubkey
 ) {
    if (!goldilocks_check_init()) {
        return GOLDI_EUNINIT;
    }
    
    /* Twice the scalar width; reduced mod q below. */
    word_t sk[448*2/WORD_BITS];
    
    struct tw_extensible_t exta;
    struct p448_t pk;

 #if GOLDILOCKS_USE_PTHREAD
    /* The random state is not thread-safe; serialize access to it. */
    int ml_ret = pthread_mutex_lock(&goldilocks_global.mutex);
    if (ml_ret) return ml_ret;
 #endif

    int ret = crandom_generate(&goldilocks_global.rand, (unsigned char *)sk, sizeof(sk));
    int ret2 = crandom_generate(&goldilocks_global.rand, &privkey->opaque[112], 32);
    if (!ret) ret = ret2;

 #if GOLDILOCKS_USE_PTHREAD
    ml_ret = pthread_mutex_unlock(&goldilocks_global.mutex);
    if (ml_ret) abort();
 #endif
    
    barrett_reduce(sk,sizeof(sk)/sizeof(sk[0]),0,&goldi_q448);
    barrett_serialize(privkey->opaque, sk, 448/8);
    
    scalarmul_fixed_base(&exta, sk, 448, &goldilocks_global.fixed_base);
    //transfer_and_serialize_qtor(&pk, &sqrt_d_minus_1, &exta);
    untwist_and_double_and_serialize(&pk, &exta);
    
    /* The scalar now lives in privkey; don't leave a second copy on the
     * stack (goldilocks_sign() wipes its temporaries the same way). */
    memset((unsigned char *)sk,0,sizeof(sk));
    
    p448_serialize(pubkey->opaque, &pk);
    memcpy(&privkey->opaque[56], pubkey->opaque, 56);
    
    return ret ? GOLDI_ENODICE : GOLDI_EOK;
 }

 int
 goldilocks_private_to_public (
    struct goldilocks_public_key_t *pubkey,
    const struct goldilocks_private_key_t *privkey
 ) {
    struct p448_t pk;
    mask_t msucc = p448_deserialize(&pk,&privkey->opaque[56]);
    
    if (msucc) {
        p448_serialize(pubkey->opaque, &pk);
        return GOLDI_EOK;
    } else {
        return GOLDI_ECORRUPT;
    }
 }

 /**
  * Derive a 64-byte shared secret from our private key and their public key.
  *
  * The secret is a SHA-512 digest over the Montgomery-ladder result.  With
  * the experiment switches defined, the hash input is prefixed by an
  * "obliteration" block, which is all zeros on success and otherwise holds
  * "noshared" plus private key bytes 112-143, and by the sum and product of
  * the two public keys.  The output buffer is always written; the failure
  * masks only affect the hash input and the return code.
  *
  * @param [out] shared The 64-byte shared secret.
  * @param [in] my_privkey Our private key.
  * @param [in] your_pubkey Their public key.
  *
  * @retval GOLDI_EOK Success.
  * @retval GOLDI_ECORRUPT Our private key did not deserialize.
  * @retval GOLDI_EINVAL Their public key was rejected.
  */
 int
 goldilocks_shared_secret (
    uint8_t shared[64],
    const struct goldilocks_private_key_t *my_privkey,
    const struct goldilocks_public_key_t *your_pubkey
 ) {
    /* This function doesn't actually need anything in goldilocks_global,
     * so it doesn't check init.
     */
    
    word_t sk[448/WORD_BITS];
    struct p448_t pk;
    
    /* succ tracks their key, msucc tracks ours; both are all-ones masks on success. */
    mask_t succ = p448_deserialize(&pk,your_pubkey->opaque), msucc = -1;
    
 #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS
    struct p448_t sum, prod;
    msucc &= p448_deserialize(&sum,&my_privkey->opaque[56]);
    p448_mul(&prod,&pk,&sum);
    p448_add(&sum,&pk,&sum);
 #endif
    
    msucc &= barrett_deserialize(sk,my_privkey->opaque,&goldi_q448);
    succ &= montgomery_ladder(&pk,&pk,sk,446,2);
    
    /* The scalar has done its job; wipe it as goldilocks_sign() does. */
    memset((unsigned char *)sk,0,sizeof(sk));
    
    p448_serialize(shared,&pk);
    
    /* The raw shared element is now in "shared", which gets hashed below. */
    memset((unsigned char *)&pk,0,sizeof(pk));
    
    /* obliterate records of our failure by adjusting with obliteration key */
    struct sha512_ctx_t ctx;
    sha512_init(&ctx);

 #ifdef EXPERIMENT_ECDH_OBLITERATE_CT
    uint8_t oblit[40];
    unsigned i;
    for (i=0; i<8; i++) {
        oblit[i] = "noshared"[i] & ~(succ&msucc);
    }
    for (i=0; i<32; i++) {
        oblit[8+i] = my_privkey->opaque[112+i] & ~(succ&msucc);
    }
    sha512_update(&ctx, oblit, 40);
    /* On failure oblit holds private key bytes; don't leave them behind. */
    memset(oblit,0,sizeof(oblit));
 #endif
    
 #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS
    /* stir in the sum and product of the pubkeys. */
    uint8_t a_pk[56];
    p448_serialize(a_pk, &sum);
    sha512_update(&ctx, a_pk, 56);
    p448_serialize(a_pk, &prod);
    sha512_update(&ctx, a_pk, 56);
 #endif
       
    /* stir in the shared key and finish */
    sha512_update(&ctx, shared, 56);
    sha512_final(&ctx, shared);
    
    /* Select the return code with masks rather than branches. */
    return (GOLDI_ECORRUPT & ~msucc)
        | (GOLDI_EINVAL & msucc &~ succ)
        | (GOLDI_EOK & msucc & succ);
 }

 /*
  * Sign a message.
  *
  * signature_out receives 112 bytes: the serialized nonce point (56 bytes)
  * followed by the response scalar (56 bytes).  The nonce is derived from
  * private key bytes 112-143 and the message, so no randomness is drawn here.
  *
  * Returns GOLDI_EUNINIT if the library is not initialized, GOLDI_ECORRUPT if
  * the secret scalar in privkey does not deserialize, and 0 on success.
  */
 int
 goldilocks_sign (
    uint8_t signature_out[56*2],
    const uint8_t *message,
    uint64_t message_len,
    const struct goldilocks_private_key_t *privkey
 ) {
    if (!goldilocks_check_init()) {
        return GOLDI_EUNINIT;
    }
    
    /* challenge = H(pk, [nonceG], message). */
    word_t skw[448/WORD_BITS];
    mask_t succ = barrett_deserialize(skw,privkey->opaque,&goldi_q448);
    if (!succ) {
        memset(skw,0,sizeof(skw));
        return GOLDI_ECORRUPT;
    }
        
    /* Derive a nonce.  TODO: use HMAC. FUTURE: factor. */
    unsigned char sha_out[512/8];
    word_t tk[448/WORD_BITS];
    struct sha512_ctx_t ctx;
    sha512_init(&ctx);
    sha512_update(&ctx, (const unsigned char *)"signonce", 8);
    sha512_update(&ctx, &privkey->opaque[112], 32);
    sha512_update(&ctx, message, message_len);
    sha512_update(&ctx, &privkey->opaque[112], 32);
    sha512_final(&ctx, sha_out);
    barrett_deserialize_and_reduce(tk, sha_out, 512/8, &goldi_q448);
    
    /* 4[nonce]G */
    uint8_t signature_tmp[56];
    struct tw_extensible_t exta;
    struct p448_t gsk;
    scalarmul_fixed_base(&exta, tk, 448, &goldilocks_global.fixed_base);
    double_tw_extensible(&exta);
    untwist_and_double_and_serialize(&gsk, &exta);
    p448_serialize(signature_tmp, &gsk);
    
    /* NOTE(review): ctx is reused here without another sha512_init(); this
     * presumably relies on sha512_final() resetting the context -- confirm
     * in sha512.c. */
    word_t challenge[448/WORD_BITS];
    sha512_update(&ctx, &privkey->opaque[56], 56);
    sha512_update(&ctx, signature_tmp, 56);
    sha512_update(&ctx, message, message_len);
    sha512_final(&ctx, sha_out);
    barrett_deserialize_and_reduce(challenge, sha_out, 512/8, &goldi_q448);
    
    // reduce challenge and sub.
    barrett_negate(challenge,448/WORD_BITS,&goldi_q448);

    /* tk += (-challenge) * skw  (mod q) */
    barrett_mac(
        tk,448/WORD_BITS,
        challenge,448/WORD_BITS,
        skw,448/WORD_BITS,
        &goldi_q448
    );
        
    /* Double tk by adding it to itself, then reduce including the carry. */
    word_t carry = add_nr_ext_packed(tk,tk,448/WORD_BITS,tk,448/WORD_BITS,-1);
    barrett_reduce(tk,448/WORD_BITS,carry,&goldi_q448);
        
    memcpy(signature_out, signature_tmp, 56);
    barrett_serialize(signature_out+56, tk, 448/8);
    /* Wipe the secret-dependent temporaries. */
    memset((unsigned char *)tk,0,sizeof(tk));
    memset((unsigned char *)skw,0,sizeof(skw));
    memset((unsigned char *)challenge,0,sizeof(challenge));
    
    /* response = 2(nonce_secret - sk*challenge)
     * Nonce = 8[nonce_secret]*G
     * PK = 2[sk]*G, except doubled (TODO)
     * so [2] ( [response]G + 2[challenge]PK ) = Nonce
     */
    
    return 0;
 }

 int
 goldilocks_verify (
    const uint8_t signature[56*2],
    const uint8_t *message,
    uint64_t message_len,
    const struct goldilocks_public_key_t *pubkey
 ) {
    if (!goldilocks_check_init()) {
        return GOLDI_EUNINIT;
    }
    
    struct p448_t pk;
    word_t s[448/WORD_BITS];
    
    mask_t succ = p448_deserialize(&pk,pubkey->opaque);
    if (!succ) return GOLDI_EINVAL;
    
    succ = barrett_deserialize(s, &signature[56], &goldi_q448);
    if (!succ) return GOLDI_EINVAL;
    
    /* challenge = H(pk, [nonceG], message). */
    unsigned char sha_out[512/8];
    word_t challenge[448/WORD_BITS];
    struct sha512_ctx_t ctx;
    sha512_init(&ctx);
    sha512_update(&ctx, pubkey->opaque, 56);
    sha512_update(&ctx, signature, 56);
    sha512_update(&ctx, message, message_len);
    sha512_final(&ctx, sha_out);
    barrett_deserialize_and_reduce(challenge, sha_out, 512/8, &goldi_q448);
    
    struct p448_t eph;
    struct tw_extensible_t pk_text;
    
    /* deserialize [nonce]G */
    succ = p448_deserialize(&eph, signature);
    if (!succ) return GOLDI_EINVAL;
    
    succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk);
    if (!succ) return GOLDI_EINVAL;
    
    linear_combo_var_fixed_vt( &pk_text, challenge, 446, s, 446, goldilocks_global.wnafs, 5 );
    
    untwist_and_double_and_serialize( &pk, &pk_text );
    p448_sub(&eph, &eph, &pk);
    p448_bias(&eph, 2);
    
    succ = p448_is_zero(&eph);
    
    return succ ? 0 : GOLDI_EINVAL;
 }
--- a/src/include/barrett_field.h
+++ b/src/include/barrett_field.h
@@ -0,0 +1,190 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __BARRETT_FIELD_H__
 #define __BARRETT_FIELD_H__ 1

 /**
 * @file barrett_field.h
 * @brief Slow routines for generic primes in Barrett form.
 *
 * @warning These routines are very slow, roughly implemented, and should be made more
 * flexible in the future.  I might even outright switch to Montgomery form.
 */

 #include "word.h"

 #ifdef __cplusplus
 extern "C" {
 #endif
    
 /**
 * @brief A Barrett-form prime, p = 2^k - p_lo.
 * @todo Support primes of other forms.
 */
 struct barrett_prime_t {
    uint32_t nwords_p;   /**< The number of words in p (448/WORD_BITS for the Goldilocks prime). */
    uint32_t p_shift;    /**< The number of bits of p in its top word, or 0 if that word is full; presumably k mod WORD_BITS. */
    uint32_t nwords_lo;  /**< The number of words in p_lo. */
    const word_t *p_lo;  /**< The words of p_lo, least significant first. */
 };

 /**
 * The Goldilocks prime.  I'm not sure this is the right place for it, but oh well.
 */
 extern const struct barrett_prime_t goldi_q448;

 /**
 * Reduce a number (with optional high carry word) mod p.
 *
 * @param [inout] a The value to be reduced.
 * @param [in] nwords_a The number of words in a.
 * @param [in] a_carry A high word to be carried into the computation.
 * @param [in] prime The Barrett prime.
 */
 void
 barrett_reduce(
    word_t *a,
    uint32_t nwords_a,
    word_t a_carry,
    const struct barrett_prime_t *prime
 );
    
 /**
 * out = a+(c&mask), returning a carry.
 *
 * @param [out] out The output, of length nwords_a.
 * @param [in] a The "always" addend.
 * @param [in] nwords_a The number of words in a.
 * @param [in] c The "sometimes" addend.
 * @param [in] nwords_c The number of words in c.
 * @param [in] mask A mask of whether to add or not.
 * @return A carry word.
 */
 word_t
 add_nr_ext_packed(
    word_t *out,
    const word_t *a,
    uint32_t nwords_a,
    const word_t *c,
    uint32_t nwords_c,
    word_t mask
 );
  
 /**
 * out = a-(c&mask), returning a borrow.
 *
 * @param [out] out The output, of length nwords_a.
 * @param [in] a The "always" minuend.
 * @param [in] nwords_a The number of words in a.
 * @param [in] c The "sometimes" subtrahend.
 * @param [in] nwords_c The number of words in c.
 * @param [in] mask A mask of whether to add or not.
 * @return A borrow word.
 */  
 word_t
 sub_nr_ext_packed(
    word_t *out,
    const word_t *a,
    uint32_t nwords_a,
    const word_t *c,
    uint32_t nwords_c,
    word_t mask
 );

 /**
 * a -> reduce(-a) mod p
 *
 * @param [in] a The value to be reduced and negated.
 * @param [in] nwords_a The number of words in a.  Must be >= nwords_p.
 * @param [in] prime The prime.
 */   
 void
 barrett_negate (
    word_t *a,
    uint32_t nwords_a,
    const struct barrett_prime_t *prime
 );

 /**
 * Multiply, or multiply-accumulate, mod p.
 *
 * If doMac, accum = accum + a*b mod p.
 * Otherwise, accum = a*b mod p.
 *
 * This function is not __restrict__; you may pass accum,
 * a, b, etc all from the same location.
 *
 * @param [inout] accum The accumulator / output.
 * @param [in] nwords_accum The number of words in accum.
 * @param [in] a The first factor.
 * @param [in] nwords_a The number of words in a.
 * @param [in] b The second factor.
 * @param [in] nwords_b The number of words in b.
 * @param [in] prime The Barrett prime.
 * @param [in] doMac A mask: nonzero to accumulate, zero to overwrite
 *        (barrett_mac passes -1, barrett_mul passes 0).
 */
 void
 barrett_mul_or_mac(
    word_t *accum,
    uint32_t nwords_accum,

    const word_t *a,
    uint32_t nwords_a,

    const word_t *b,
    uint32_t nwords_b,

    const struct barrett_prime_t *prime,
    
    mask_t doMac
 );
    
 /**
 * out = a*b mod p.
 *
 * A wrapper around barrett_mul_or_mac() with accumulation switched off.
 *
 * @param [out] out The product.
 * @param [in] nwords_out The number of words in out.
 * @param [in] a The first factor.
 * @param [in] nwords_a The number of words in a.
 * @param [in] b The second factor.
 * @param [in] nwords_b The number of words in b.
 * @param [in] prime The Barrett prime.
 */
 static inline void
 barrett_mul(
    word_t *out,
    uint32_t nwords_out, /* unsigned, matching barrett_mac and barrett_mul_or_mac */

    const word_t *a,
    uint32_t nwords_a,

    const word_t *b,
    uint32_t nwords_b,

    const struct barrett_prime_t *prime
 ) {
    barrett_mul_or_mac(out,nwords_out,a,nwords_a,b,nwords_b,prime,0);
 }
    
 /**
 * out = out + a*b mod p.
 *
 * A wrapper around barrett_mul_or_mac() with accumulation switched on.
 *
 * @param [inout] out The accumulator.
 * @param [in] nwords_out The number of words in out.
 * @param [in] a The first factor.
 * @param [in] nwords_a The number of words in a.
 * @param [in] b The second factor.
 * @param [in] nwords_b The number of words in b.
 * @param [in] prime The Barrett prime.
 */
 static inline void
 barrett_mac(
    word_t *out,
    uint32_t nwords_out,

    const word_t *a,
    uint32_t nwords_a,

    const word_t *b,
    uint32_t nwords_b,

    const struct barrett_prime_t *prime
 ) {
    barrett_mul_or_mac(out,nwords_out,a,nwords_a,b,nwords_b,prime,-1);
 }

 /**
 * Read a little-endian value and check that it is fully reduced.
 *
 * @param [out] x The deserialized words.
 * @param [in] serial The input bytes, least significant first; the length
 *        is derived from the prime.
 * @param [in] prime The Barrett prime.
 * @return A mask: all ones if the value is less than p, zero otherwise.
 */
 mask_t
 barrett_deserialize (
    word_t *x,
    const uint8_t *serial,
    const struct barrett_prime_t *prime
 );

 /**
 * Write a word array as little-endian bytes.
 *
 * @param [out] serial The output buffer.
 * @param [in] x The words to serialize, least significant first.
 * @param [in] nserial The number of bytes to write.  Pass a multiple of
 *        sizeof(word_t), as all callers in this release do.
 */
 void
 barrett_serialize (
    uint8_t *serial,
    const word_t *x,
    uint32_t nserial
 );
    
 /**
 * Read a little-endian value of any length and reduce it mod p.
 *
 * @param [out] x The reduced value, prime->nwords_p words.
 * @param [in] serial The input bytes, least significant first.
 * @param [in] nserial The number of input bytes.
 * @param [in] prime The Barrett prime.
 */
 void
 barrett_deserialize_and_reduce (
    word_t *x,
    const uint8_t *serial,
    uint32_t nserial,
    const struct barrett_prime_t *prime
 );

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __BARRETT_FIELD_H__ */
--- a/src/include/config.h
+++ b/src/include/config.h
@@ -0,0 +1,8 @@
 #ifndef __GOLDILOCKS_CONFIG_H__
 #define __GOLDILOCKS_CONFIG_H__ 1

 /* Tested with #if: set to 0 to build without the pthread mutex around the
  * global random generator. */
 #define GOLDILOCKS_USE_PTHREAD          1
 /* The two switches below are tested with #ifdef in goldilocks.c, so
  * defining them as 0 does NOT turn them off; remove the line instead.
  * OBLITERATE_CT prefixes the shared-secret hash input with an obliteration
  * block; STIR_IN_PUBKEYS also hashes the sum and product of both public keys. */
 #define EXPERIMENT_ECDH_OBLITERATE_CT   1
 #define EXPERIMENT_ECDH_STIR_IN_PUBKEYS 1

 #endif // __GOLDILOCKS_CONFIG_H__
--- a/src/include/crandom.h
+++ b/src/include/crandom.h
@@ -0,0 +1,140 @@
 /* Copyright (c) 2011 Stanford University.
 * Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 /**
 * @file crandom.h
 * @author Mike Hamburg
 * @brief A miniature version of the (as of yet incomplete) crandom project.
 */

 #ifndef __GOLDI_CRANDOM_H__
 #define __GOLDI_CRANDOM_H__ 1

 #include <stdint.h>  /* for uint64_t */
 #include <fcntl.h>   /* for open */
 #include <errno.h>   /* for returning errors after open */
 #include <stdlib.h>  /* for abort */
 #include <string.h>  /* for memcpy */
 #include <strings.h> /* for bzero */
 #include <unistd.h>  /* for read */

 /**
 * @brief The state of a crandom generator.
 *
 * This object is opaque.  It is not protected by a lock, and so must
 * not be accessed by multiple threads at the same time.
 */
 struct crandom_state_t {
    /** @privatesection */
    /* Current key.  Each refill requests 128 bytes of output starting here,
     * so buffer must stay directly after seed with no padding between. */
    unsigned char seed[32];
    /* Generated bytes not yet handed out; consumed bytes are zeroed. */
    unsigned char buffer[96];
    /* Block counter; incremented once per refill. */
    uint64_t ctr;
    /* CRANDOM_MAGIC once the state has been seeded. */
    uint64_t magic;
    /* Number of unread bytes left in buffer. */
    unsigned int fill;
    /* Refills remaining until the next reseed. */
    int reseed_countdown;
    /* Refills between reseeds; 0 means never reseed. */
    int reseed_interval;
    /* If nonzero, a failed reseed calls abort(). */
    int reseeds_mandatory;
    /* Descriptor of the seed file or device; -1 when seeded from a buffer. */
    int randomfd;
 } __attribute__((aligned(16))) ;

 #ifdef __cplusplus
 extern "C" {
 #endif

 /**
 * Initialize a crandom state from the chosen file.
 * 
 * This function initializes a state from a given state file, or
 * from a random device (eg. /dev/random or /dev/urandom).
 *
 * You must check the return value of this function.
 *
 * @param [out] state The crandom state variable to initialize.
 * @param [in] filename The name of the seed file or random device.
 * @param [in] reseed_interval The number of 96-byte blocks which can be
 *        generated without reseeding.  Suggest 10000.
 * @param [in] reseeds_mandatory If nonzero, call abort() if a reseed fails.
 *        Suggest 1.
 *
 * @retval 0 Success.
 * @retval Nonzero An error to be interpreted by strerror().
 */
 int
 crandom_init_from_file (
    struct crandom_state_t *state,
    const char *filename,
    int reseed_interval,
    int reseeds_mandatory
 ) __attribute__((warn_unused_result));


 /**
 * Initialize a crandom state from a buffer, for deterministic operation.
 * 
 * This function is used to initialize a crandom state deterministically,
 * mainly for testing purposes.  It can also be used to expand a secret
 * random value deterministically.
 *
 * @warning The crandom implementation is not guaranteed to be stable.
 * That is, a later release might produce a different random stream from
 * the same seed.
 *
 * @param [out] state The crandom state variable to initialize.
 * @param [in] initial_seed The seed value.
 */
 void
 crandom_init_from_buffer (
    struct crandom_state_t *state,
    const char initial_seed[32]
 );

 /**
 * Fill the output buffer with random data.
 *
 * This function uses the given crandom state to produce pseudorandom data
 * in the output buffer.
 *
 * This function may perform reads from the state's random device if it needs
 * to reseed.  This could block if that file is a blocking source, such as
 * a pipe or /dev/random on Linux.  If reseeding fails and the state has
 * reseeds_mandatory set, this function will call abort().  Otherwise, it will
 * return an error code, but it will still randomize the buffer.
 *
 * If called on a corrupted, uninitialized or destroyed state, this function
 * will abort().
 *
 * @warning This function is not thread-safe with respect to the state.  Don't
 * call it from multiple threads with the same state at the same time.
 *
 * @param [inout] state The crandom state to use for generation.
 * @param [out] output The buffer to fill with random data.
 * @param [in] length The length of the buffer.
 *
 * @retval 0 Success.
 * @retval Nonzero A non-mandatory reseed operation failed.
 */
 int
 crandom_generate (
    struct crandom_state_t *state,
    unsigned char *output,
    unsigned long long length
 );

 /**
 * Destroy the random state.  Further calls to crandom_generate() on that state
 * will abort().
 *
 * @param [inout] state The state to be destroyed.
 */
 void
 crandom_destroy (
    struct crandom_state_t *state
 );

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __GOLDI_CRANDOM_H__ */
--- a/src/include/ec_point.h
+++ b/src/include/ec_point.h
@@ -0,0 +1,552 @@
 /**
 * @file ec_point.h
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @warning This file was automatically generated.
 */

 #ifndef __CC_INCLUDED_EC_POINT_H__
 #define __CC_INCLUDED_EC_POINT_H__

 #include "p448.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 /**
 * Affine point on an Edwards curve.
 */
 struct affine_t {
    struct p448_t x, y;
 };

 /**
 * Affine point on a twisted Edwards curve.
 */
 struct tw_affine_t {
    struct p448_t x, y;
 };

 /**
 * Montgomery buffer.
 */
 struct montgomery_t {
    struct p448_t z0, xd, zd, xa, za;
 };

 /**
 * Extensible coordinates for Edwards curves, suitable for
 * accumulators.
 * 
 * Represents the point (x/z, y/z).  The extra coordinates
 * t,u satisfy xy = tuz, allowing for conversion to Extended
 * form by multiplying t and u.
 * 
 * The idea is that you don't have to do this multiplication
 * when doubling the accumulator, because the t-coordinate
 * isn't used there.  At the same time, as long as you only
 * have one point in extensible form, additions don't cost
 * extra.
 * 
 * This is essentially a lazier version of Hisil et al's
 * lookahead trick.  It might be worth considering that trick
 * instead.
 */
 struct extensible_t {
    struct p448_t x, y, z, t, u;
 };

 /**
 * Extensible coordinates for twisted Edwards curves,
 * suitable for accumulators.
 */
 struct tw_extensible_t {
    struct p448_t x, y, z, t, u;
 };

 /**
 * Niels coordinates for twisted Edwards curves.
 * 
 * Good for mixed readdition; suitable for fixed tables.
 */
 struct tw_niels_t {
    struct p448_t a, b, c;
 };

 /**
 * Projective niels coordinates for twisted Edwards curves.
 * 
 * Good for readdition; suitable for temporary tables.
 */
 struct tw_pniels_t {
    struct tw_niels_t n;
    struct p448_t z;
 };


 /**
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_affine (
    struct affine_t*       a,
    const struct affine_t* ds
 ) __attribute__((unused,always_inline));

 /**
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_tw_affine (
    struct tw_affine_t*       a,
    const struct tw_affine_t* ds
 ) __attribute__((unused,always_inline));

 /**
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_montgomery (
    struct montgomery_t*       a,
    const struct montgomery_t* ds
 ) __attribute__((unused,always_inline));

 /**
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_extensible (
    struct extensible_t*       a,
    const struct extensible_t* ds
 ) __attribute__((unused,always_inline));

 /**
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_tw_extensible (
    struct tw_extensible_t*       a,
    const struct tw_extensible_t* ds
 ) __attribute__((unused,always_inline));

 /**
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_tw_niels (
    struct tw_niels_t*       a,
    const struct tw_niels_t* ds
 ) __attribute__((unused,always_inline));

 /**
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_tw_pniels (
    struct tw_pniels_t*       a,
    const struct tw_pniels_t* ds
 ) __attribute__((unused,always_inline));

 /**
 * Returns 1/sqrt(+- x).
 * 
 * The Legendre symbol of the result is the same as that of the
 * input.
 * 
 * If x=0, returns 0.
 */
 void
 p448_isr (
    struct p448_t*       a,
    const struct p448_t* x
 );

 /**
 * Returns 1/x.
 * 
 * If x=0, returns 0.
 */
 void
 p448_inverse (
    struct p448_t*       a,
    const struct p448_t* x
 );

 /**
 * Add two points on a twisted Edwards curve, one in Extensible form
 * and the other in half-Niels form.
 */
 void
 add_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 );

 /**
 * Subtract a point in half-Niels form from a point in Extensible form,
 * both on a twisted Edwards curve.
 */
 void
 sub_tw_niels_from_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 );

 /**
 * Add two points on a twisted Edwards curve, one in Extensible form
 * and the other in projective Niels form.
 */
 void
 add_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 );

 /**
 * Subtract a point in projective Niels form from a point in Extensible
 * form, both on a twisted Edwards curve.
 */
 void
 sub_tw_pniels_from_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 );

 /**
 * Double a point on a twisted Edwards curve, in "extensible" coordinates.
 */
 void
 double_tw_extensible (
    struct tw_extensible_t* a
 );

 /**
 * Double a point on an Edwards curve, in "extensible" coordinates.
 */
 void
 double_extensible (
    struct extensible_t* a
 );

 /**
 * Double a point, and transfer it to the twisted curve.
 * 
 * That is, apply the 4-isogeny.
 */
 void
 twist_and_double (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 );

 /**
 * Double a point, and transfer it to the untwisted curve.
 * 
 * That is, apply the dual isogeny.
 */
 void
 untwist_and_double (
    struct extensible_t*          b,
    const struct tw_extensible_t* a
 );

 void
 convert_tw_affine_to_tw_pniels (
    struct tw_pniels_t*       b,
    const struct tw_affine_t* a
 );

 void
 convert_tw_affine_to_tw_extensible (
    struct tw_extensible_t*   b,
    const struct tw_affine_t* a
 );

 void
 convert_affine_to_extensible (
    struct extensible_t*   b,
    const struct affine_t* a
 );

 void
 convert_tw_extensible_to_tw_pniels (
    struct tw_pniels_t*           b,
    const struct tw_extensible_t* a
 );

 void
 convert_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* d
 );

 void
 convert_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  e,
    const struct tw_niels_t* d
 );

 void
 montgomery_step (
    struct montgomery_t* a
 );

 void
 deserialize_montgomery (
    struct montgomery_t* a,
    const struct p448_t* sbz
 );

 mask_t
 serialize_montgomery (
    struct p448_t*             b,
    const struct montgomery_t* a,
    const struct p448_t*       sbz
 );

 /**
 * Serialize a point on an Edwards curve.
 * 
 * The serialized form would be sqrt((z-y)/(z+y)) with sign of xz.
 * 
 * It would be on 4y^2/(1-d) = x^3 + 2(1+d)/(1-d) * x^2 + x.
 * 
 * But 4/(1-d) isn't square, so we need to twist it:
 * 
 * -x is on 4y^2/(d-1) = x^3 + 2(d+1)/(d-1) * x^2 + x
 */
 void
 serialize_extensible (
    struct p448_t*             b,
    const struct extensible_t* a
 );

 /**
 * 
 */
 void
 untwist_and_double_and_serialize (
    struct p448_t*                b,
    const struct tw_extensible_t* a
 );

 /**
 * Expensive transfer from untwisted to twisted.  Roughly equivalent to halve and isogeny.
 * Correctly transfers point of order 2.
 * 
 * Can't have x=+1 (it's not even).  There is code to fix the exception that would otherwise
 * occur at (0,1).
 * 
 * Input point must be even.
 */
 void
 twist_even (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 );

 /**
 * Expensive transfer from untwisted to twisted.  Roughly equivalent to halve and isogeny.
 * 
 * This function is for testing purposes only, because it can return odd points on the
 * twist.  This can cause exceptions in the point addition formula.  What's more, this
 * function should be able to return points of order 4, which are at infinity.
 * 
 * This function probably doesn't properly handle special cases, such as the point at
 * infinity (FUTURE).
 * 
 * This function probably isn't a homomorphism, in that it probably doesn't consistently
 * handle adjustments by the point of order 2 when the input is odd.    (FUTURE)
 */
 void
 test_only_twist (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 );

 mask_t
 is_square (
    const struct p448_t* x
 );

 mask_t
 is_even_pt (
    const struct extensible_t* a
 );

 mask_t
 is_even_tw (
    const struct tw_extensible_t* a
 );

 /**
 * Deserialize a point to an untwisted affine curve.
 */
 mask_t
 deserialize_affine (
    struct affine_t*     a,
    const struct p448_t* sz
 );

 /**
 * Deserialize a point and transfer it to the twist.
 * 
 * Not guaranteed to preserve the 4-torsion component.
 * 
 * Refuses to deserialize +-1, which are the points of order 2.
 */
 mask_t
 deserialize_and_twist_approx (
    struct tw_extensible_t* a,
    const struct p448_t*    sdm1,
    const struct p448_t*    sz
 );

 void
 set_identity_extensible (
    struct extensible_t* a
 );

 void
 set_identity_tw_extensible (
    struct tw_extensible_t* a
 );

 void
 set_identity_affine (
    struct affine_t* a
 );

 mask_t
 eq_affine (
    const struct affine_t* a,
    const struct affine_t* b
 );

 mask_t
 eq_extensible (
    const struct extensible_t* a,
    const struct extensible_t* b
 );

 mask_t
 eq_tw_extensible (
    const struct tw_extensible_t* a,
    const struct tw_extensible_t* b
 );

 void
 elligator_2s_inject (
    struct affine_t*     a,
    const struct p448_t* r
 );

 mask_t
 validate_affine (
    const struct affine_t* a
 );

 /**
 * Check the invariants for struct tw_extensible_t.
 * NOTE: This function was automatically generated
 * with no regard for speed.
 */
 mask_t
 validate_tw_extensible (
    const struct tw_extensible_t* ext
 );

 /**
 * Check the invariants for struct extensible_t.
 * NOTE: This function was automatically generated
 * with no regard for speed.
 */
 mask_t
 validate_extensible (
    const struct extensible_t* ext
 );


 /* Copy the affine point *ds into *a, one field element at a time. */
 void copy_affine(struct affine_t* a, const struct affine_t* ds) {
    p448_copy(&a->x, &ds->x);
    p448_copy(&a->y, &ds->y);
 }

 /* Copy the twisted affine point *ds into *a, one field element at a time. */
 void copy_tw_affine(struct tw_affine_t* a, const struct tw_affine_t* ds) {
    p448_copy(&a->x, &ds->x);
    p448_copy(&a->y, &ds->y);
 }

 /* Copy the Montgomery buffer *ds into *a: all five field elements. */
 void copy_montgomery(struct montgomery_t* a, const struct montgomery_t* ds) {
    p448_copy(&a->z0, &ds->z0);
    p448_copy(&a->xd, &ds->xd);
    p448_copy(&a->zd, &ds->zd);
    p448_copy(&a->xa, &ds->xa);
    p448_copy(&a->za, &ds->za);
 }

 /* Copy the extensible point *ds into *a: x, y, z and the split t, u. */
 void copy_extensible(struct extensible_t* a, const struct extensible_t* ds) {
    p448_copy(&a->x, &ds->x);
    p448_copy(&a->y, &ds->y);
    p448_copy(&a->z, &ds->z);
    p448_copy(&a->t, &ds->t);
    p448_copy(&a->u, &ds->u);
 }

 /* Copy the twisted extensible point *ds into *a: x, y, z and the split t, u. */
 void copy_tw_extensible(struct tw_extensible_t* a, const struct tw_extensible_t* ds) {
    p448_copy(&a->x, &ds->x);
    p448_copy(&a->y, &ds->y);
    p448_copy(&a->z, &ds->z);
    p448_copy(&a->t, &ds->t);
    p448_copy(&a->u, &ds->u);
 }

 /* Copy the Niels-form point *ds into *a: fields a, b and c. */
 void copy_tw_niels(struct tw_niels_t* a, const struct tw_niels_t* ds) {
    p448_copy(&a->a, &ds->a);
    p448_copy(&a->b, &ds->b);
    p448_copy(&a->c, &ds->c);
 }

 /*
 * Copy the projective Niels point *ds into *a: the three fields of the
 * embedded Niels part, followed by the extra z coordinate.
 */
 void copy_tw_pniels(struct tw_pniels_t* a, const struct tw_pniels_t* ds) {
    /* Embedded Niels part (same copies copy_tw_niels() performs). */
    p448_copy(&a->n.a, &ds->n.a);
    p448_copy(&a->n.b, &ds->n.b);
    p448_copy(&a->n.c, &ds->n.c);
    /* Projective coordinate. */
    p448_copy(&a->z, &ds->z);
 }



 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __CC_INCLUDED_EC_POINT_H__ */
--- a/src/include/intrinsics.h
+++ b/src/include/intrinsics.h
@@ -0,0 +1,244 @@
 /* Copyright (c) 2011 Stanford University.
 * Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 /** @file intrinsics.h
 * @brief cRandom intrinsics header.
 */

 #ifndef __CRANDOM_INTRINSICS_H__
 #define __CRANDOM_INTRINSICS_H__ 1

 #include <sys/types.h>

 #include <immintrin.h>

 #define INTRINSIC \
  static __inline__ __attribute__((__gnu_inline__, __always_inline__, unused))

 #define GEN    1
 #define SSE2   2
 #define SSSE3  4
 #define AESNI  8
 #define XOP    16
 #define AVX    32
 #define AVX2   64
 #define RDRAND 128

 /**
 * If on x86, read the timestamp counter.  Otherwise, return 0.
 *
 * RDTSC delivers the counter split across EDX:EAX.  The two halves are
 * read through separate "a" and "d" outputs, because the "A" constraint
 * only denotes the EDX:EAX pair on 32-bit x86; on x86-64 it means
 * "either RAX or RDX", which would lose the high half of the counter.
 *
 * @return The 64-bit timestamp counter on i386/x86-64, or 0 elsewhere.
 */
 INTRINSIC u_int64_t rdtsc() {
  u_int64_t out = 0;
 # if (defined(__i386__) || defined(__x86_64__))
    unsigned int lo, hi;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    out = ((u_int64_t)hi << 32) | lo;
 # endif
  return out;
 }

 /**
 * Return x unchanged, but confuse the compiler.
 *
 * This is mainly for use in test scripts, to prevent the value from
 * being constant-folded or removed by dead code elimination.
 *
 * The asm template is intentionally empty: the "+r" read-write operand
 * alone forces the compiler to materialize x in a register and to treat
 * it as modified, and an empty template assembles on every target
 * (a literal "mov" mnemonic does not).
 *
 * @param x A 64-bit number.
 * @return The same number in a register.
 */
 INTRINSIC u_int64_t opacify(u_int64_t x) {
  __asm__ volatile("" : "+r"(x));
  return x;
 }

 #ifdef __AVX2__
 #  define MIGHT_HAVE_AVX2 1
 #  ifndef MUST_HAVE_AVX2
 #    define MUST_HAVE_AVX2 0
 #  endif
 #else
 #  define MIGHT_HAVE_AVX2 0
 #  define MUST_HAVE_AVX2  0
 #endif

 #ifdef __AVX__
 #  define MIGHT_HAVE_AVX 1
 #  ifndef MUST_HAVE_AVX
 #    define MUST_HAVE_AVX MUST_HAVE_AVX2
 #  endif
 #else
 #  define MIGHT_HAVE_AVX 0
 #  define MUST_HAVE_AVX 0
 #endif

 #ifdef __SSSE3__
 #  define MIGHT_HAVE_SSSE3 1
 #  ifndef MUST_HAVE_SSSE3
 #    define MUST_HAVE_SSSE3 MUST_HAVE_AVX
 #  endif
 #else
 #  define MIGHT_HAVE_SSSE3 0
 #  define MUST_HAVE_SSSE3 0
 #endif

 #ifdef __SSE2__
 #  define MIGHT_HAVE_SSE2 1
 #  ifndef MUST_HAVE_SSE2
 #    define MUST_HAVE_SSE2 MUST_HAVE_SSSE3
 #  endif
   typedef __m128i ssereg;
 #  define pslldq _mm_slli_epi32
 #  define pshufd _mm_shuffle_epi32

 /*
 * Rotate each 32-bit lane of a left by r bits: the left-shifted and
 * right-shifted halves occupy disjoint bit positions and are combined
 * with XOR.
 */
 INTRINSIC ssereg sse2_rotate(int r, ssereg a) {
  const ssereg shifted_up   = _mm_slli_epi32(a, r);
  const ssereg shifted_down = _mm_srli_epi32(a, 32-r);
  return _mm_xor_si128(shifted_up, shifted_down);
 }

 #else
 #  define MIGHT_HAVE_SSE2 0
 #  define MUST_HAVE_SSE2  0
 #endif

 #ifdef __AES__
 /* don't include intrinsics file, because not all platforms have it */
 #  define MIGHT_HAVE_AESNI 1
 #  ifndef MIGHT_HAVE_RDRAND
 #    define MIGHT_HAVE_RDRAND 1
 #  endif
 #  ifndef MUST_HAVE_RDRAND
 #    define MUST_HAVE_RDRAND 0
 #  endif
 #  ifndef MUST_HAVE_AESNI
 #    define MUST_HAVE_AESNI 0
 #  endif

 #else
 #  define MIGHT_HAVE_AESNI 0
 #  define MUST_HAVE_AESNI 0
 #  define MIGHT_HAVE_RDRAND 0
 #  define MUST_HAVE_RDRAND 0
 #endif

 #ifdef __XOP__
 /* don't include intrinsics file, because not all platforms have it */
 #  define MIGHT_HAVE_XOP 1
 #  ifndef MUST_HAVE_XOP
 #    define MUST_HAVE_XOP 0
 #  endif
 /*
 * Rotate x by `amount` using the XOP `vprotd` instruction, emitted via
 * inline assembly (no intrinsics header is included for XOP).
 *
 * NOTE(review): the count is passed with the generic "g" constraint and
 * appears as the middle operand of the template; confirm against the AMD
 * XOP reference that this operand order and constraint match the
 * immediate/xmm forms that `vprotd` actually accepts.
 */
 INTRINSIC ssereg xop_rotate(int amount, ssereg x) {
  ssereg out;
  __asm__ ("vprotd %1, %2, %0" : "=x"(out) : "x"(x), "g"(amount));
  return out;
 }
 #else
 #  define MIGHT_HAVE_XOP 0
 #  define MUST_HAVE_XOP 0
 #endif

 #define MIGHT_MASK \
  ( SSE2   * MIGHT_HAVE_SSE2   \
  | SSSE3  * MIGHT_HAVE_SSSE3  \
  | AESNI  * MIGHT_HAVE_AESNI  \
  | XOP    * MIGHT_HAVE_XOP    \
  | AVX    * MIGHT_HAVE_AVX    \
  | RDRAND * MIGHT_HAVE_RDRAND \
  | AVX2   * MIGHT_HAVE_AVX2)

 #define MUST_MASK \
  ( SSE2   * MUST_HAVE_SSE2   \
  | SSSE3  * MUST_HAVE_SSSE3  \
  | AESNI  * MUST_HAVE_AESNI  \
  | XOP    * MUST_HAVE_XOP    \
  | AVX    * MUST_HAVE_AVX    \
  | RDRAND * MUST_HAVE_RDRAND \
  | AVX2   * MUST_HAVE_AVX2 )

 /*
 * Compile-time feature tests: true iff every bit of `feature` is set in
 * MIGHT_MASK / MUST_MASK.  The argument is parenthesized so that a
 * compound expression such as (SSE2|AVX) is not re-associated by the
 * precedence of & and ==.
 */
 #define MIGHT_HAVE(feature) ((MIGHT_MASK & (feature)) == (feature))
 #define MUST_HAVE(feature) ((MUST_MASK & (feature)) == (feature))

 #ifdef __cplusplus
 #  define extern_c extern "C"
 #else
 #  define extern_c
 #endif

 extern_c
 unsigned int crandom_detect_features();

 #ifndef likely
 #  define likely(x)       __builtin_expect((x),1)
 #  define unlikely(x)     __builtin_expect((x),0)
 #endif
  
 /**
 * Atomic compare and swap, return by fetching.
 *
 * Equivalent to:
 * ret = *target; if (*target == old) *target = replacement; return ret;
 *
 * The third parameter is deliberately not called "new": that is a
 * reserved word in C++, and this header is meant to be includable from
 * C++ (see extern_c).
 *
 * @param [inout] target The volatile memory area to be CAS'd
 * @param [in] old The expected old value of the target.
 * @param [in] replacement A value to replace the target on success.
 * @return The value *target held before the operation.
 */
 INTRINSIC const char *
 compare_and_swap (
    const char *volatile* target,
    const char *old,
    const char *replacement
 );

 const char *compare_and_swap (
    const char *volatile* target,
    const char *old,
    const char *replacement
 ) {
    return __sync_val_compare_and_swap(target,old,replacement);
 }
  
 /**
 * Atomic compare and swap.  Return whether successful.
 *
 * Equivalent to:
 * if (*target == old) { *target = replacement; return nonzero; } else { return 0; }
 *
 * The third parameter is deliberately not called "new": that is a
 * reserved word in C++, and this header is meant to be includable from
 * C++ (see extern_c).
 *
 * @param [inout] target The volatile memory area to be CAS'd
 * @param [in] old The expected old value of the target.
 * @param [in] replacement A value to replace the target on success.
 * @return Nonzero if the swap was performed, 0 otherwise.
 */
 INTRINSIC int
 bool_compare_and_swap (
    const char *volatile* target,
    const char *old,
    const char *replacement
 );

 int
 bool_compare_and_swap (
    const char *volatile* target,
    const char *old,
    const char *replacement
 ) {
    return __sync_bool_compare_and_swap(target,old,replacement);
 }

 /**
 * Report whether the running processor supports every bit in `feature`.
 *
 * The compile-time masks are consulted first, so a runtime check only
 * happens when the answer is not already fixed at build time -- that is,
 * when MIGHT_HAVE(feature) holds but MUST_HAVE(feature) does not.  In
 * that case the cached crandom_features word is used, and it is filled
 * in from crandom_detect_features() the first time it is found to be 0.
 *
 * @param feature A bitmask of the feature flags to test.
 * @return Nonzero if all requested features are available, 0 otherwise.
 */
 extern volatile unsigned int crandom_features;
 INTRINSIC int HAVE(unsigned int feature);

 int HAVE(unsigned int feature) {
  unsigned int detected;

  if (MIGHT_HAVE(feature)) {
    if (MUST_HAVE(feature)) {
      return 1;
    }

    /* Single read of the volatile cache; detect lazily if still empty. */
    detected = crandom_features;
    if (unlikely(detected == 0)) {
      detected = crandom_detect_features();
      crandom_features = detected;
    }
    return likely((detected & feature) == feature);
  }

  /* Ruled out at compile time. */
  return 0;
 }

 #endif /* __CRANDOM_INTRINSICS_H__ */
--- a/src/include/scalarmul.h
+++ b/src/include/scalarmul.h
@@ -0,0 +1,289 @@
 /**
 * @file scalarmul.h
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 */

 #ifndef __P448_ALGO_H__
 #define __P448_ALGO_H__ 1

 #include "ec_point.h"
 #include "intrinsics.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 /**
 * A precomputed table for fixed-base scalar multiplication.
 *
 * This uses a signed combs format.  The table is filled in by
 * precompute_fixed_base() and released with destroy_fixed_base().
 */
 struct fixed_base_table_t {
   /** Comb tables containing multiples of the base point. */
  struct tw_niels_t *table;
  
  /**
   * Adjustments to the scalar in even and odd cases, respectively.
   * Holds two values of 448/WORD_BITS words each.
   */
  word_t scalar_adjustments[2*(448/WORD_BITS)];
  
  /** The number of combs in the table. */
  unsigned int n;
  
  /** The number of teeth in each comb. */
  unsigned int t;
  
  /** The spacing between the teeth. */
  unsigned int s;
  
  /**
   * If nonzero, the table was malloc'd by the precompute routine
   * (presumably precompute_fixed_base() when no prealloc buffer is
   * given -- TODO confirm) rather than supplied by the caller.
   */
  unsigned int own_table;
 };
    
 /**
 * Full Montgomery ladder in inverse square root format.
 *
 * Out = [2^n_extra_doubles * scalar] * in, where
 * scalar is little-endian and has length $nbits$ bits.
 *
 * If the scalar is even and/or n_extra_doubles >= 1,
 * then this function will reject points which are not
 * on the curve by returning MASK_FAILURE.
 *
 * This function will also reject multiplies which output
 * the identity or the point of order 2.  It may be worth
 * revisiting this decision in the FUTURE.  The idea is that
 * this can only happen when: the input is the identity or the
 * point of order 2; or the input is the point of order 4 on
 * the twist; or the scalar is 0 or a multiple of the curve
 * order; or the scalar is a multiple of the twist order and
 * the input point is on the twist.
 *
 * This function takes constant time with respect to $*in$
 * and $*scalar$, but not of course with respect to nbits or
 * n_extra_doubles.
 *
 * For security, we recommend setting n_extra_doubles = 1.
 * Because the cofactor of Goldilocks is 4 and input points
 * are always even (when on the curve), this will cancel the
 * cofactor.
 *
 * @param [out] out The output point.
 * @param [in] in The base point.
 * @param [in] scalar The scalar's little-endian representation.
 * @param [in] nbits The number of bits in the scalar.  Note that
 * unlike in Curve25519, we do not require the top bit to be set.
 * @param [in] n_extra_doubles The number of extra doubles to do at
 * the end.
 *
 * @retval MASK_SUCCESS The operation was successful.
 * @retval MASK_FAILURE The input point was invalid, or the output
 * would be the identity or the point of order 2.
 */
 mask_t
 montgomery_ladder (
    struct p448_t *out,
    const struct p448_t *in,
    const word_t *scalar,
    unsigned int nbits,
    unsigned int n_extra_doubles
 ) __attribute__((warn_unused_result));
    
 /**
 * Scalar multiply a twisted Edwards-form point.
 *
 * This function takes constant time.
 *
 * Currently the scalar is always exactly 448 bits long.
 *
 * @param [inout] working The point to multiply.
 * @param [in] scalar The scalar, in little-endian form.
 */
 void
 scalarmul (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS]
    /* TODO? int nbits */
 );
    
 /**
 * Scalar multiply a twisted Edwards-form point.  Use the same
 * algorithm as scalarmul(), but uses variable array indices.
 *
 * Currently the scalar is always exactly 448 bits long.
 *
 * @warning This function uses variable array indices,
 * so it is insecure against cache-timing attacks.  It is intended
 * for microbenchmarking, to see how much constant-time arithmetic
 * costs us.
 *
 * @param [inout] working The point to multiply.
 * @param [in] scalar The scalar, in little-endian form.
 */
 void
 scalarmul_vlook (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS]
    /* TODO? int nbits */
 );

 /**
 * Precompute a table to accelerate fixed-point scalar
 * multiplication using the "multiple signed combs" approach.
 *
 * This function computes $n$ "comb" tables, each containing
 * 2^(t-1) points in tw_niels_t format.  You must have
 * n * t * s >= 446 for complete coverage.
 *
 * The scalar multiplication algorithm may adjust the scalar by
 * a multiple of q.  Therefore, we strongly recommend to use base
 * points in the q-torsion group (i.e. doubly even points).
 *
 * @param [out] out The table to compute.
 * @param [in] base The base point.
 * @param [in] n The number of combs in the table.
 * @param [in] t The number of teeth in each comb.
 * @param [in] s The spacing between the teeth.
 * @param [out] prealloc An optional preallocated array containing
 * space for n<<(t-1) values of type tw_niels_t.
 *
 * @retval MASK_SUCCESS Success.
 * @retval MASK_FAILURE Failure, most likely because we are out
 * of memory.
 */
 mask_t
 precompute_fixed_base (
  struct fixed_base_table_t *out,
  const struct tw_extensible_t *base,
  unsigned int n,
  unsigned int t,
  unsigned int s,
  struct tw_niels_t *prealloc
 ) __attribute__((warn_unused_result));

 /**
  * Destroy a fixed-base table.  Frees any memory that we allocated
  * for the combs.
  *
  * @param [in] table The table to destroy.
  */
 void
 destroy_fixed_base (
    struct fixed_base_table_t *table
 );

 /**
 * Scalar multiplication with precomputation.  Set out to
 * [scalar] * Base, where Base is the base point passed to
 * precompute_fixed_base().
 *
 * The scalar may be adjusted by a multiple of q, so this routine
 * can be wrong by a cofactor if the base has cofactor components.
 *
 * @param [out] out The output point.
 * @param [in] scalar The scalar.
 * @param [in] nbits The number of bits in the scalar.  Must be <= n*t*s.
 * @param [in] table The precomputed table.
 *
 * @retval MASK_SUCCESS Success.
 * @retval MASK_FAILURE Failure, because n*t*s < nbits
 */ 
 mask_t
 scalarmul_fixed_base (
    struct tw_extensible_t *out,
    const word_t *scalar,
    unsigned int nbits,
    const struct fixed_base_table_t *table
 );

 /**
 * Variable-time scalar multiplication.
 *
 * @warning This function takes variable time.  It is intended for
 * microbenchmarking.
 *
 * @param [inout] working The input and output point.
 * @param [in] scalar The scalar.
 */ 
 void
 scalarmul_vt (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS]
 );


 /**
 * Precompute a table to accelerate fixed-point scalar
 * multiplication (and, more importantly, linear combos)
 * using the "windowed non-adjacent form" approach.
 *
 * @param [out] out The output table.  Must have room for 1<<tbits entries.
 * @param [in] base The base point.
 * @param [in] tbits The number of bits to put in the table.
 *
 * @retval MASK_SUCCESS Success.
 * @retval MASK_FAILURE Failure, most likely because we are out
 * of memory.
 */
 mask_t
 precompute_fixed_base_wnaf (
    struct tw_niels_t *out,
    const struct tw_extensible_t *base,
    unsigned int tbits
 ) __attribute__((warn_unused_result));

 /**
 * Variable-time scalar multiplication with precomputed WNAF
 * tables.
 *
 * @warning This function takes variable time.  It is intended for
 * microbenchmarking.
 *
 * @param [out] out The output point.
 * @param [in] scalar The scalar.
 * @param [in] nbits The number of bits in the scalar.
 * @param [in] precmp The precomputed WNAF table.
 * @param [in] table_bits The number of bits in the WNAF table.
 */ 
 void
 scalarmul_fixed_base_wnaf_vt (
    struct tw_extensible_t *out,
    const word_t *scalar,
    unsigned int nbits,
    const struct tw_niels_t *precmp,
    unsigned int table_bits
 );


 /**
 * Variable-time scalar linear combination of two points: one
 * variable, and one fixed (with fixed-base WNAF tables)
 *
 * @warning This function takes variable time.  It is intended for
 * signature verification.
 *
 * @param [inout] working The output point, and also the variable input.
 * @param [in] scalar_var The scalar for the variable input.
 * @param [in] nbits_var The number of bits in scalar_var.
 * @param [in] scalar_pre The scalar for the fixed input.
 * @param [in] nbits_pre The number of bits in scalar_pre.
 * @param [in] precmp The precomputed WNAF table.
 * @param [in] table_bits_pre The number of bits in the WNAF table.
 */ 
 void
 linear_combo_var_fixed_vt (
    struct tw_extensible_t *working,
    const word_t scalar_var[448/WORD_BITS],
    unsigned int nbits_var,
    const word_t scalar_pre[448/WORD_BITS],
    unsigned int nbits_pre,
    const struct tw_niels_t *precmp,
    unsigned int table_bits_pre
 );

 #ifdef __cplusplus
 };
 #endif

 #endif /* __P448_ALGO_H__ */
--- a/src/include/sha512.h
+++ b/src/include/sha512.h
@@ -0,0 +1,47 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __GOLDI_SHA512_H__
 #define __GOLDI_SHA512_H__ 1

 #include <stdint.h>

 #ifdef __cplusplus
 extern "C" {
 #endif

 /**
 * SHA512 hashing context.
 *
 * This structure is opaque.
 */
 struct sha512_ctx_t {
    /** @privatesection */
    uint64_t chain[8];
    uint8_t block[128];
    uint64_t nbytes;
 };

 void
 sha512_init (
    struct sha512_ctx_t *ctx
 );

 void
 sha512_update (
    struct sha512_ctx_t *ctx,
    const unsigned char *data,
    uint64_t bytes
 );
    
 void
 sha512_final (
    struct sha512_ctx_t *ctx,
    uint8_t result[64]
 );
    
 #ifdef __cplusplus
 }; /* extern "C" */
 #endif
    
 #endif /* __GOLDI_SHA512_H__ */
--- a/src/include/word.h
+++ b/src/include/word.h
@@ -0,0 +1,128 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #ifndef __WORD_H__
 #define __WORD_H__

 /* for posix_memalign */
 #define _XOPEN_SOURCE 600

 #include <stdint.h>
 #include <stdlib.h>
 #include <sys/types.h>
 #include <inttypes.h>

 #if (__SIZEOF_INT128__ == 16 && __SIZEOF_SIZE_T__ == 8 && (__SIZEOF_LONG__==8 || __POINTER_WIDTH__==64) && !GOLDI_FORCE_32_BIT)
 /* It's a 64-bit machine if:
 * // limits.h thinks so
 * __uint128_t exists
 * size_t is 64 bits
 * Either longs are 64-bits (doesn't happen on Windows)
 *   or pointers are 64-bits (doesn't happen on 32/64 arches)
 * FUTURE: validate this hack on more architectures.
 */
 typedef uint32_t hword_t;
 typedef uint64_t word_t;
 typedef __uint128_t dword_t;
 typedef int32_t hsword_t;
 typedef int64_t sword_t;
 typedef __int128_t dsword_t;
 #define PRIxWORD PRIx64
 #define PRIxWORDfull "%016" PRIx64
 #define PRIxWORD58   "%014" PRIx64
 #define U64LE(x) x##ull
 #define U58LE(x) x##ull
 #else
 typedef uint16_t hword_t;
 typedef uint32_t word_t;
 typedef uint64_t dword_t;
 typedef int16_t hsword_t;
 typedef int32_t sword_t;
 typedef int64_t dsword_t;
 #define PRIxWORD PRIx32
 #define PRIxWORDfull "%08" PRIx32
 #define PRIxWORD58   "%07" PRIx32
 #define U64LE(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
 #define U58LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
 #endif

 #define WORD_BITS (sizeof(word_t) * 8)

 /* TODO: vector width for procs like ARM; gcc support */
 typedef word_t mask_t, vecmask_t __attribute__((ext_vector_type(4)));

 static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -1;

 /* FIXME this only works on clang */
 typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
 typedef int64_t  int64x2_t __attribute__((ext_vector_type(2)));
 typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
 typedef int64_t  int64x4_t __attribute__((ext_vector_type(4)));
 typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
 typedef int32_t  int32x4_t __attribute__((ext_vector_type(4)));
 typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
 typedef int32_t  int32x8_t __attribute__((ext_vector_type(8)));

 #if __AVX2__
 typedef uint32x8_t big_register_t;
 typedef uint64x4_t uint64xn_t;
 typedef uint32x8_t uint32xn_t;
 #elif __SSE2__ || __ARM_NEON__
 typedef uint32x4_t big_register_t;
 typedef uint64x2_t uint64xn_t;
 typedef uint32x4_t uint32xn_t;
 #elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__
 typedef uint64_t big_register_t, uint64xn_t;
 typedef uint32_t uint32xn_t;
 #else
 typedef uint64_t uint64xn_t;
 typedef uint32_t uint32xn_t;
 typedef uint32_t big_register_t;
 #endif


 /*
 * br_is_zero: branch-free "is zero" test producing a mask.
 *
 * Vector build (AVX2 / SSE2 / NEON): x is compared lane-wise against an
 * all-zero vector and the comparison result is reinterpreted as a
 * big_register_t, giving one mask lane per input lane.
 *
 * Scalar build: x is widened to dword_t and decremented.  For x == 0 the
 * subtraction wraps to all-ones, so the bits above WORD_BITS are all set
 * and the shifted result truncates to an all-ones mask_t; for any
 * nonzero x, x-1 still fits in WORD_BITS bits and the shift yields 0.
 *
 * NOTE(review): the scalar variant takes a word_t, while the typedef
 * chain for big_register_t in the non-vector branches is chosen
 * independently of word_t's width -- confirm that callers passing a
 * big_register_t here never hit a configuration where the two differ.
 */
 #if __AVX2__ || __SSE2__ || __ARM_NEON__
 static __inline__ big_register_t
 br_is_zero(big_register_t x) {
    return (big_register_t)(x == (big_register_t)0);
 }
 #else
 static __inline__ mask_t
 br_is_zero(word_t x) {
    return (((dword_t)x) - 1)>>WORD_BITS;
 }
 #endif



 /**
 * Allocate memory which is sufficiently aligned to be used for the
 * largest vector on the system (for now that's a big_register_t).
 *
 * Man malloc says that it does this, but at least for AVX2 on MacOS X,
 * it's lying.
 *
 * posix_memalign() requires the alignment to be a power of two that is
 * also a multiple of sizeof(void *).  On configurations where
 * big_register_t is narrower than a pointer, the alignment is therefore
 * raised to sizeof(void *); otherwise every call would fail with EINVAL.
 *
 * @param size The size of the region to allocate.
 * @return A suitable pointer, which can be free'd with free(),
 * or NULL if no memory can be allocated.
 */
 static __inline__ void *
 malloc_vector (
    size_t size
 ) __attribute__((always_inline, unused));

 void *
 malloc_vector(size_t size) {
    void *out = NULL;
    size_t alignment = sizeof(big_register_t);

    /* Never ask for less than pointer alignment (posix_memalign rule). */
    if (alignment < sizeof(void *)) {
        alignment = sizeof(void *);
    }

    if (posix_memalign(&out, alignment, size)) {
        return NULL;
    }
    return out;
 }

 #endif /* __WORD_H__ */
--- a/src/scalarmul.c
+++ b/src/scalarmul.c
@@ -0,0 +1,844 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #include "word.h"

 #include <stdlib.h>
 #include <limits.h>
 #include <string.h>

 #include "intrinsics.h"
 #include "scalarmul.h"
 #include "barrett_field.h"

 /*
 * Montgomery-ladder scalar multiplication.
 *
 * Deserializes `in` into ladder state, processes the low `nbits` bits of
 * `scalar` from most significant to least significant with one
 * montgomery_step() per bit, then performs `n_extra_doubles` further steps,
 * and finally serializes the result into `out`.
 *
 * Bit-dependent behaviour is expressed only through masked conditional
 * swaps; there is no branch on the scalar bits.
 *
 * Returns whatever serialize_montgomery() returns.
 */
 mask_t
 montgomery_ladder (
    struct p448_t *out,
    const struct p448_t *in,
    const word_t *scalar,
    unsigned int nbits,
    unsigned int n_extra_doubles
 ) { 
    struct montgomery_t mont;
    deserialize_montgomery(&mont, in);
    
    /* n is the index of the top bit within the most significant word. */
    int i,j,n=(nbits-1)%WORD_BITS;
    /* pflip remembers the previous bit's swap so that only the difference
     * between consecutive bits has to be applied.
     */
    mask_t pflip = 0;
    for (j=(nbits+WORD_BITS-1)/WORD_BITS-1; j>=0; j--) {
        word_t w = scalar[j];
        for (i=n; i>=0; i--) {
            /* All-ones when the current bit is set, zero otherwise. */
            mask_t flip = -((w>>i)&1);
            p448_cond_swap(&mont.xa,&mont.xd,flip^pflip);
            p448_cond_swap(&mont.za,&mont.zd,flip^pflip);
            montgomery_step(&mont);
            pflip = flip;
        }
        /* Every word below the top one contributes all of its bits. */
        n = WORD_BITS-1;
    }
    /* Undo the swap left pending by the last bit. */
    p448_cond_swap(&mont.xa,&mont.xd,pflip);
    p448_cond_swap(&mont.za,&mont.zd,pflip);
    
    assert(n_extra_doubles < INT_MAX);
    for (j=0; j<(int)n_extra_doubles; j++) {
        montgomery_step(&mont);
    }
    
    return serialize_montgomery(out, &mont, in);
 }

 /* Conditionally negate a tw_niels point in place, without branching.
 * doNegate is a mask handed unchanged to the field helpers: it controls
 * both the swap of the a/b fields and the negation of the c field.
 */
 static __inline__ void
 cond_negate_tw_niels (
    struct tw_niels_t *n,
    mask_t doNegate
 ) {
    /* The two operations touch disjoint fields, so their order is free. */
    p448_cond_neg(&n->c, doNegate);
    p448_cond_swap(&n->a, &n->b, doNegate);
 }

 /* Conditionally negate a tw_pniels point in place by conditionally
 * negating its embedded tw_niels part; the remaining fields are untouched.
 */
 static __inline__ void
 cond_negate_tw_pniels (
    struct tw_pniels_t *n,
    mask_t doNegate
 ) {
    struct tw_niels_t *niels_part = &n->n;
    cond_negate_tw_niels(niels_part, doNegate);
 }

 void    
 constant_time_lookup_tw_pniels (
    struct tw_pniels_t *out,
    const struct tw_pniels_t *in,
    int nin,
    int idx
 ) {
    big_register_t big_one = 1, big_i = idx;
    big_register_t *o = (big_register_t *)out;
    const big_register_t *i = (const big_register_t *)in;
    int j;
    unsigned int k;
    
    memset(out, 0, sizeof(*out));
    for (j=0; j<nin; j++, big_i-=big_one) {
        big_register_t mask = br_is_zero(big_i);
        for (k=0; k<sizeof(*out)/sizeof(*o); k++) {
            o[k] |= mask & i[k+j*sizeof(*out)/sizeof(*o)];
        }
    }
 }

 static __inline__ void    
 constant_time_lookup_tw_niels (
    struct tw_niels_t *out,
    const struct tw_niels_t *in,
    int nin,
    int idx
 ) {
    big_register_t big_one = 1, big_i = idx;
    big_register_t *o = (big_register_t *)out;
    const big_register_t *i = (const big_register_t *)in;
    int j;
    unsigned int k;
    
    memset(out, 0, sizeof(*out));
    for (j=0; j<nin; j++, big_i-=big_one) {
        big_register_t mask = br_is_zero(big_i);
        for (k=0; k<sizeof(*out)/sizeof(*o); k++) {
            o[k] |= mask & i[k+j*sizeof(*out)/sizeof(*o)];
        }
    }
 }

 /* Prepare a scalar for the signed-window multipliers.
 *
 * prepared_data holds two adjustments of nwords_pd words each, stored back
 * to back.  The first is added under the mask that is set when the scalar
 * is even and the second under the mask that is set when it is odd; the
 * sum is asserted to be even.  The sum, including the carry out of the top
 * word, is then shifted right by one bit into out.
 */
 static void
 convert_to_signed_window_form (
    word_t *out,
    const word_t *scalar,
    int nwords_scalar,
    const word_t *prepared_data,
    int nwords_pd
 ) {
    assert(nwords_pd <= nwords_scalar);
    
    /* All-ones when the scalar's low bit is set. */
    mask_t mask = -(scalar[0]&1);
    const word_t *adjust_if_even = prepared_data;
    const word_t *adjust_if_odd = prepared_data+nwords_pd;

    word_t carry = add_nr_ext_packed(out, scalar, nwords_scalar, adjust_if_even, nwords_pd, ~mask);
    carry += add_nr_ext_packed(out, out, nwords_scalar, adjust_if_odd, nwords_pd, mask);
    
    assert(!(out[0]&1));
    
    /* Multi-word right shift by one; the carry supplies the new top bit. */
    int i;
    for (i=0; i<nwords_scalar; i++) {
        word_t incoming = (i == nwords_scalar-1) ? carry : out[i+1];
        out[i] = (out[i] >> 1) | (incoming << (WORD_BITS-1));
    }
 }

 /*
 * Multiply the point in *working by `scalar`, in place, using 4-bit signed
 * fixed windows and a constant-time table lookup.
 *
 * The scalar is first recoded by convert_to_signed_window_form() using the
 * two hard-coded adjustment constants in prepared_data.  A table of eight
 * multiples of the input (P, 3P, ..., 15P) is built, and then each 4-bit
 * window, from the most significant down, selects one table entry and a
 * sign.
 */
 void
 scalarmul (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS]
 ) {

    const int nbits=448; /* HACK? */
    /* Two 448-bit adjustment constants, stored back to back. */
    word_t prepared_data[448*2/WORD_BITS] = {
        U64LE(0x9595b847fdf73126),
        U64LE(0x9bb9b8a856af5200),
        U64LE(0xb3136e22f37d5c4f),
        U64LE(0x0000000189a19442),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000),
        U64LE(0x4000000000000000),

        U64LE(0x721cf5b5529eec33),
        U64LE(0x7a4cf635c8e9c2ab),
        U64LE(0xeec492d944a725bf),
        U64LE(0x000000020cd77058),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000)
    }; /* TODO: split off */
    
    word_t scalar2[448/WORD_BITS];
    convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS);

    /* tabulator = 2P, the step between consecutive table entries. */
    struct tw_extensible_t tabulator;
    copy_tw_extensible(&tabulator, working);
    double_tw_extensible(&tabulator);

    struct tw_pniels_t pn, multiples[8];
    convert_tw_extensible_to_tw_pniels(&pn, &tabulator);
    convert_tw_extensible_to_tw_pniels(&multiples[0], working);

    /* multiples[i] = (2i+1)P: keep adding 2P to the running point. */
    int i;
    for (i=1; i<8; i++) {
        add_tw_pniels_to_tw_extensible(working, &pn);
        convert_tw_extensible_to_tw_pniels(&multiples[i], working);
    }

    /* Top window: initialises the accumulator instead of adding to it. */
    i = nbits - 4;
    /* inv is 0 when the window's top bit is set and -1 otherwise; it both
     * flips the index bits and serves as the negation mask.
     */
    int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF,
        inv = (bits>>3)-1;
    bits ^= inv;
    
    constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7);
    cond_negate_tw_pniels(&pn, inv);
    convert_tw_pniels_to_tw_extensible(working, &pn);
 		

    /* Remaining windows: four doublings, then add the selected entry. */
    for (i-=4; i>=0; i-=4) {
        double_tw_extensible(working);
        double_tw_extensible(working);
        double_tw_extensible(working);
        double_tw_extensible(working);

        bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF;
        inv = (bits>>3)-1;
        bits ^= inv;
    
        constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7);
        cond_negate_tw_pniels(&pn, inv);
        add_tw_pniels_to_tw_extensible(working, &pn);
    }
 }

 /*
 * Multiply the point in *working by `scalar`, in place, using 4-bit signed
 * fixed windows.
 *
 * Same algorithm as scalarmul(), except that the table entry is fetched by
 * direct indexing (copy_tw_pniels from multiples[bits&7]) rather than by
 * the constant-time lookup, so the memory access pattern depends on the
 * scalar.
 */
 void
 scalarmul_vlook (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS]
 ) {

    const int nbits=448; /* HACK? */
    /* Two 448-bit adjustment constants, stored back to back. */
    word_t prepared_data[448*2/WORD_BITS] = {
        U64LE(0x9595b847fdf73126),
        U64LE(0x9bb9b8a856af5200),
        U64LE(0xb3136e22f37d5c4f),
        U64LE(0x0000000189a19442),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000),
        U64LE(0x4000000000000000),

        U64LE(0x721cf5b5529eec33),
        U64LE(0x7a4cf635c8e9c2ab),
        U64LE(0xeec492d944a725bf),
        U64LE(0x000000020cd77058),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000)
    }; /* TODO: split off */
    
    word_t scalar2[448/WORD_BITS];
    convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS);

    /* tabulator = 2P, the step between consecutive table entries. */
    struct tw_extensible_t tabulator;
    copy_tw_extensible(&tabulator, working);
    double_tw_extensible(&tabulator);

    struct tw_pniels_t pn, multiples[8];
    convert_tw_extensible_to_tw_pniels(&pn, &tabulator);
    convert_tw_extensible_to_tw_pniels(&multiples[0], working);

    /* multiples[i] = (2i+1)P: keep adding 2P to the running point. */
    int i;
    for (i=1; i<8; i++) {
        add_tw_pniels_to_tw_extensible(working, &pn);
        convert_tw_extensible_to_tw_pniels(&multiples[i], working);
    }

    /* Top window: initialises the accumulator instead of adding to it. */
    i = nbits - 4;
    /* inv is 0 when the window's top bit is set and -1 otherwise; it both
     * flips the index bits and serves as the negation mask.
     */
    int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF,
        inv = (bits>>3)-1;
    bits ^= inv;

 	copy_tw_pniels(&pn, &multiples[bits&7]);
    cond_negate_tw_pniels(&pn, inv);
    convert_tw_pniels_to_tw_extensible(working, &pn);
 		

    /* Remaining windows: four doublings, then add the selected entry. */
    for (i-=4; i>=0; i-=4) {
        double_tw_extensible(working);
        double_tw_extensible(working);
        double_tw_extensible(working);
        double_tw_extensible(working);

        bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF;
        inv = (bits>>3)-1;
        bits ^= inv;
    
 		copy_tw_pniels(&pn, &multiples[bits&7]);
        cond_negate_tw_pniels(&pn, inv);
        add_tw_pniels_to_tw_extensible(working, &pn);
    }
 }


 /*
 * Fixed-base scalar multiplication using a precomputed comb table.
 *
 * The table is described by (n, t, s): n combs of t teeth with spacing s.
 * The low `nbits` bits of `scalar` are copied, zero-extended to at least
 * 448 bits, recoded with the table's scalar_adjustments, and then consumed
 * in s passes; each pass performs one constant-time lookup per comb.
 *
 * Returns MASK_FAILURE, leaving *out untouched, when the table covers
 * fewer than nbits bits (n*t*s < nbits); returns MASK_SUCCESS otherwise,
 * with the result in *out.
 */
 mask_t
 scalarmul_fixed_base (
    struct tw_extensible_t *out,
    const word_t scalar[448/WORD_BITS],
    unsigned int nbits,
    const struct fixed_base_table_t *table
 ) {
    unsigned int n = table->n, t = table->t, s = table->s;
    assert(n >= 1 && t >= 1 && s >= 1);
    
    if (n*t*s < nbits) {
        return MASK_FAILURE;
    }
    
    /* Work on at least 448 bits so the 448-bit adjustments always fit. */
    unsigned int scalar_words = (nbits + WORD_BITS - 1)/WORD_BITS,
        scalar2_words = scalar_words;
    if (scalar2_words < 448 / WORD_BITS)
        scalar2_words = 448 / WORD_BITS;
    word_t scalar2[scalar2_words], scalar3[scalar2_words];
    
    /* Copy scalar to scalar3, but clear its high bits (if there are any) */
    unsigned int i,j,k;
    for (i=0; i<scalar_words; i++) {
        scalar3[i] = scalar[i];
    }
    if (likely(i) && (nbits % WORD_BITS)) {
        scalar3[i-1] &= (((word_t)1) << (nbits%WORD_BITS)) - 1;
    }
    for (; i<scalar2_words; i++) {
        scalar3[i] = 0;
    }
    
    convert_to_signed_window_form (
        scalar2,
        scalar3, scalar2_words,
        table->scalar_adjustments , 448 / WORD_BITS
    );
    
    struct tw_niels_t ni;
    
    for (i=0; i<s; i++) {
        /* One doubling between passes; the first pass starts fresh. */
        if (i) double_tw_extensible(out);
        
        for (j=0; j<n; j++) {
            int tab = 0;
 			
 			/*
             * PERF: This computation takes about 1.5µs on SBR, i.e. 2-3% of the
 			 * time of a keygen or sign op.  Surely it is possible to speed it up.
             */
            /* Gather the t bits of comb j for this pass, spaced s apart. */
            for (k=0; k<t; k++) {
                unsigned int bit = (s-1-i) + k*s + j*(s*t);
                if (bit < scalar2_words * WORD_BITS) {
                    tab |= (scalar2[bit/WORD_BITS] >> (bit%WORD_BITS) & 1) << k;
                }
            }
            
            /* The top tooth selects the sign; the low t-1 bits (flipped
             * when the top tooth is clear) index the comb's sub-table.
             */
            mask_t invert = (tab>>(t-1))-1;
            tab ^= invert;
            tab &= (1<<(t-1)) - 1;
            
            constant_time_lookup_tw_niels(&ni, table->table + (j<<(t-1)), 1<<(t-1), tab);
            cond_negate_tw_niels(&ni, invert);
            /* The very first entry initialises out; the rest are added. */
            if (i||j) {
                add_tw_niels_to_tw_extensible(out, &ni);
            } else {
                convert_tw_niels_to_tw_extensible(out, &ni);
            }
        }
    }
    
    return MASK_SUCCESS;
 }

 /*
 * Build the comb table used by scalarmul_fixed_base().
 *
 * @param out      Table descriptor to fill in.  Zeroed on every failure.
 * @param base     The point to tabulate.
 * @param n        Number of combs.
 * @param t        Number of teeth per comb.
 * @param s        Spacing between teeth.
 * @param prealloc Optional storage for n<<(t-1) table entries.  When NULL
 *                 the table is allocated here and marked as owned
 *                 (own_table = 1) so that destroy_fixed_base() frees it.
 * @return 0 if any of n, t, s is zero, if n*t*s < 446, or if an allocation
 *         fails; otherwise ~p448_is_zero(&zis[0]), and if that value is
 *         zero the table is wiped (and freed when owned) and 0 is returned.
 *
 * On allocation failure the table is wiped only if it actually exists: the
 * table pointer itself may be the allocation that failed.
 */
 mask_t
 precompute_fixed_base (
  struct fixed_base_table_t *out,
  const struct tw_extensible_t *base,
  unsigned int n,
  unsigned int t,
  unsigned int s,
  struct tw_niels_t *prealloc
 ) {
    if (s < 1 || t < 1 || n < 1 || n*t*s < 446) {
        memset(out, 0, sizeof(*out));
        return 0;
    }
    
    out->n = n;
    out->t = t;
    out->s = s;
  
    struct tw_extensible_t working, start;
    copy_tw_extensible(&working, base);
    struct tw_pniels_t pn_tmp;
  
    /* Scratch space: the t-1 doubled points of the current comb, plus the
     * z coordinates of every table entry and their inverses.
     */
    struct tw_pniels_t *doubles = (struct tw_pniels_t *) malloc_vector(sizeof(*doubles) * (t-1));
    struct p448_t *zs  = (struct p448_t *) malloc_vector(sizeof(*zs) * (n<<(t-1)));
    struct p448_t *zis = (struct p448_t *) malloc_vector(sizeof(*zis) * (n<<(t-1)));
    
    struct tw_niels_t *table = prealloc;
    if (prealloc) {
        out->own_table = 0;
    } else {
        table = (struct tw_niels_t *) malloc_vector(sizeof(*table) * (n<<(t-1)));
        out->own_table = 1;
    }
    out->table = table;
  
    if (!doubles || !zs || !zis || !table) {
        free(doubles);
        free(zs);
        free(zis);
        memset(out, 0, sizeof(*out));
        /* table is NULL when its own allocation is what failed; wiping it
         * unconditionally would write through a null pointer.
         */
        if (table) {
            memset(table, 0, sizeof(*table) * (n<<(t-1)));
            if (!prealloc) free(table);
        }
        return 0;
    }
  
    unsigned int i,j,k;
    
    /* Compute the scalar adjustments, equal to 2^nbits-1 mod q */
    unsigned int adjustment_size = (n*t*s)/WORD_BITS + 1;
    assert(adjustment_size >= 448/WORD_BITS);
    word_t adjustment[adjustment_size];
    for (i=0; i<adjustment_size; i++) {
        adjustment[i] = -1;
    }
    
    adjustment[(n*t*s) / WORD_BITS] += ((word_t)1) << ((n*t*s) % WORD_BITS);

    /* FIXME: factor out somehow */
    const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = {
        U64LE(0xdc873d6d54a7bb0d),
        U64LE(0xde933d8d723a70aa),
        U64LE(0x3bb124b65129c96f),
        0x8335dc16
    };
    const struct barrett_prime_t goldi_q448 = {
        448/WORD_BITS, 62 % WORD_BITS, sizeof(goldi_q448_lo)/sizeof(word_t), goldi_q448_lo
    };
    
    /* The low adjustment is 2^nbits - 1 mod q */
    barrett_reduce(adjustment, adjustment_size, 0, &goldi_q448);
    /* The two adjustments are stored so that the one at index 0 is even
     * and the one at index 1 is odd, whichever of low/high that is.
     */
    word_t *low_adjustment = &out->scalar_adjustments[(448/WORD_BITS)*(adjustment[0] & 1)],
        *high_adjustment = &out->scalar_adjustments[(448/WORD_BITS)*((~adjustment[0]) & 1)];
    for (i=0; i<448/WORD_BITS; i++) {
        low_adjustment[i] = adjustment[i];
    }
    
    /* The high adjustment is low + q = low - q_lo + 2^big */
    (void)
    sub_nr_ext_packed(
        high_adjustment,
        adjustment, 448/WORD_BITS,
        goldi_q448.p_lo, goldi_q448.nwords_lo,
        -1
    );
    if (goldi_q448.p_shift) {
        high_adjustment[goldi_q448.nwords_p - 1] += ((word_t)1)<<goldi_q448.p_shift;
    }
    
    /* OK, now compute the tables */
    for (i=0; i<n; i++) {

        /* doubling phase */
        for (j=0; j<t; j++) {
            if (j) {
                convert_tw_extensible_to_tw_pniels(&pn_tmp, &working);
                add_tw_pniels_to_tw_extensible(&start, &pn_tmp);
            } else {
                copy_tw_extensible(&start, &working);
            }

            /* Nothing follows the last tooth of the last comb. */
            if (j==t-1 && i==n-1) {
                break;
            }

            double_tw_extensible(&working);
            if (j<t-1) {
                convert_tw_extensible_to_tw_pniels(&doubles[j], &working);
            }

            for (k=0; k<s-1; k++) {
                double_tw_extensible(&working);
            }
        }

        /* Gray-code phase */
        for (j=0;; j++) {
            int gray = j ^ (j>>1);
            int idx = ((i+1)<<(t-1))-1 ^ gray;

            convert_tw_extensible_to_tw_pniels(&pn_tmp, &start);
            copy_tw_niels(&table[idx], &pn_tmp.n);
            p448_copy(&zs[idx], &pn_tmp.z);
 			
            if (j >= (1<<(t-1)) - 1) break;
            /* Exactly one bit differs between consecutive Gray codes;
             * k becomes the position of that bit.
             */
            int delta = (j+1) ^ ((j+1)>>1) ^ gray;

            for (k=0; delta>1; k++)
                delta >>=1;
            
            if (gray & (1<<k)) {
                /* start += doubles[k] */
                add_tw_pniels_to_tw_extensible(&start, &doubles[k]);
            } else {
                /* start -= doubles[k] */
                sub_tw_pniels_from_tw_extensible(&start, &doubles[k]);
            }
            
            
        }
    }
 	
    /* Invert all z coordinates at once, then scale every table entry. */
    simultaneous_invert_p448(zis, zs, n<<(t-1));

    p448_t product;
    for (i=0; i<n<<(t-1); i++) {
        p448_mul(&product, &table[i].a, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&table[i].a, &product);
        
        p448_mul(&product, &table[i].b, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&table[i].b, &product);
        
        p448_mul(&product, &table[i].c, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&table[i].c, &product);
    }
 	
 	mask_t ret = ~p448_is_zero(&zis[0]);

    free(doubles);
    free(zs);
    free(zis);

    if (unlikely(!ret)) {
        memset(table, 0, sizeof(*table) * (n<<(t-1)));
        if (!prealloc) free(table);
        memset(out, 0, sizeof(*out));
        return 0;
    }

    return ret;
 }

 /* Wipe a fixed-base table and release what it owns.  The entry array is
 * zeroed whenever it exists, freed only when the descriptor owns it, and
 * finally the descriptor itself is zeroed.
 */
 void
 destroy_fixed_base (
    struct fixed_base_table_t *table
 ) {
    if (table->table) {
        /* The table holds n<<(t-1) entries. */
        size_t table_bytes = sizeof(*table->table)*(table->n<<(table->t-1));
        memset(table->table,0,table_bytes);
    }
    
    if (table->own_table) {
        free(table->table);
    }
    
    memset(table,0,sizeof(*table));
 }

 /*
 * Build a table of odd multiples of a point.
 *
 * Writes 1<<tbits entries to `out`: out[0] is the base, out[1] is three
 * times the base, and each further entry adds twice the base to the
 * previous one.  Every entry is scaled by the inverse of its z coordinate.
 *
 * Returns 0 if the scratch allocations fail (nothing is written to `out`
 * in that case) and -1 on success.
 */
 mask_t
 precompute_fixed_base_wnaf (
    struct tw_niels_t *out,
    const struct tw_extensible_t *const_base,
    unsigned int tbits
 ) {
    int i;
    /* One z coordinate, and one inverse, per table entry. */
    struct p448_t *zs  = (struct p448_t *) malloc_vector(sizeof(*zs)<<tbits);
    struct p448_t *zis = (struct p448_t *) malloc_vector(sizeof(*zis)<<tbits);

    if (!zs || !zis) {
        free(zs);
        free(zis);
        return 0;
    }

    struct tw_extensible_t base;
    copy_tw_extensible(&base,const_base);
    
    struct tw_pniels_t twop, tmp;
    
    convert_tw_extensible_to_tw_pniels(&tmp, &base);
    p448_copy(&zs[0], &tmp.z);
    copy_tw_niels(&out[0], &tmp.n);

    if (tbits > 0) {
        /* twop = 2P; base then becomes 2P + P = 3P for out[1]. */
        double_tw_extensible(&base);
        convert_tw_extensible_to_tw_pniels(&twop, &base);
        add_tw_pniels_to_tw_extensible(&base, &tmp);
        
        convert_tw_extensible_to_tw_pniels(&tmp, &base);
        p448_copy(&zs[1], &tmp.z);
        copy_tw_niels(&out[1], &tmp.n);

        for (i=2; i < 1<<tbits; i++) {
            add_tw_pniels_to_tw_extensible(&base, &twop);
            convert_tw_extensible_to_tw_pniels(&tmp, &base);
            p448_copy(&zs[i], &tmp.z);
            copy_tw_niels(&out[i], &tmp.n);
        }
    }
    
    /* Invert all z coordinates at once, then scale every table entry. */
    simultaneous_invert_p448(zis, zs, 1<<tbits);

    p448_t product;
    for (i=0; i<1<<tbits; i++) {
        p448_mul(&product, &out[i].a, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].a, &product);
        
        p448_mul(&product, &out[i].b, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].b, &product);
        
        p448_mul(&product, &out[i].c, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].c, &product);
    }

    free(zs);
    free(zis);

    return -1;
 }

 /**
 * @cond internal
 * Control for variable-time scalar multiply algorithms.
 *
 * One entry of a recoded scalar, as produced by recode_wnaf():
 *   power  - bit position at which the addend applies; the list is
 *            terminated by an entry with power -1;
 *   addend - signed odd multiple to add at that position (0 only in the
 *            terminating entry).
 */
 struct smvt_control {
  int power, addend;
 };

 /*
 * Recode the low `nbits` bits of `scalar` into a list of (power, addend)
 * control entries with odd signed addends, written to `control` from the
 * most significant position downwards.  A terminating entry with power -1
 * and addend 0 follows the last real entry.
 *
 * `control` must have room for nbits/(tableBits+1) + 3 entries.
 *
 * Returns the number of entries written, not counting the terminator.
 * The running time depends on the scalar's value.
 */
 static int
 recode_wnaf(
    struct smvt_control *control, /* [nbits/(tableBits+1) + 3] */
    const word_t *scalar,
    unsigned int nbits,
    unsigned int tableBits)
 {
    int current = 0, i, j;
    unsigned int position = 0;

    /* PERF: negate scalar if it's large
     * PERF: this is a pretty simplistic algorithm.  I'm sure there's a faster one...
     */
    for (i=nbits-1; i >= 0; i--) {
        /* Shift the next scalar bit into the running value. */
        int bit = (scalar[i/WORD_BITS] >> (i%WORD_BITS)) & 1;
        current = 2*current + bit;

        /*
         * Sizing: |current| >= 2^(tableBits+1) -> |current| = 2^0
         * So current loses (tableBits+1) bits every time.  It otherwise gains
         * 1 bit per iteration.  The number of iterations is
         * (nbits + 2 + tableBits), and an additional control word is added at
         * the end.  So the total number of control words is at most
         * ceil((nbits+1) / (tableBits+1)) + 2 = floor((nbits)/(tableBits+1)) + 2.
         * There's also the stopper with power -1, for a total of +3.
         */
        if (current >= (2<<tableBits) || current <= -1 - (2<<tableBits)) {
            int delta = (current + 1) >> 1; // |delta| < 2^tablebits
            /* Keep only a 0 / -1 remainder in the running value. */
            current = -(current & 1);

            /* Strip trailing zero bits so the addend is odd, raising the
             * position by one for every bit removed.
             */
            for (j=i; (delta & 1) == 0; j++) {
                delta >>= 1;
            }
            control[position].power = j+1;
            control[position].addend = delta;
            position++;
            assert(position <= nbits/(tableBits+1) + 2);
        }
    }
    
    /* Flush whatever is left in the running value as a final entry. */
    if (current) {
        for (j=0; (current & 1) == 0; j++) {
            current >>= 1;
        }
        control[position].power = j;
        control[position].addend = current;
        position++;
        assert(position <= nbits/(tableBits+1) + 2);
    }
    
  
    /* Terminating entry. */
    control[position].power = -1;
    control[position].addend = 0;
    return position;
 }


 static void
 prepare_wnaf_table(
    struct tw_pniels_t *output,
    struct tw_extensible_t *working,
    unsigned int tbits
 ) {
    convert_tw_extensible_to_tw_pniels(&output[0], working);

    if (tbits == 0) return;

    double_tw_extensible(working);
    struct tw_pniels_t twop;
    convert_tw_extensible_to_tw_pniels(&twop, working);

    add_tw_pniels_to_tw_extensible(working, &output[0]);
    convert_tw_extensible_to_tw_pniels(&output[1], working);

    for (int i=2; i < 1<<tbits; i++) {
        add_tw_pniels_to_tw_extensible(working, &twop);
        convert_tw_extensible_to_tw_pniels(&output[i], working);
    }
 }

 /*
 * Variable-time scalar multiplication: replaces the point in *working by
 * `scalar` times that point, using the recode_wnaf() recoding with a
 * 3-bit table (eight odd multiples).
 *
 * When the recoding is empty, *working is set to the identity.  Both the
 * running time and the memory access pattern depend on the scalar.
 */
 void
 scalarmul_vt (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS]
 ) {
    /* HACK: not 448? */
    const int nbits=448, table_bits = 3;
    struct smvt_control control[nbits/(table_bits+1)+3];
    
    int control_bits = recode_wnaf(control, scalar, nbits, table_bits);
  
    struct tw_pniels_t precmp[1<<table_bits];
    prepare_wnaf_table(precmp, working, table_bits);
  
    if (control_bits > 0) {
        /* Addends are odd, so addend>>1 is the table index. */
        assert(control[0].addend > 0);
        assert(control[0].power >= 0);
        convert_tw_pniels_to_tw_extensible(working, &precmp[control[0].addend >> 1]);
    } else {
        set_identity_tw_extensible(working);
        return;
    }
  
    /* Walk down from the first entry's position, doubling at every step
     * and applying each remaining control entry when its position is
     * reached.
     */
    int conti = 1, i;
    for (i = control[0].power - 1; i >= 0; i--) {
        double_tw_extensible(working);

        if (i == control[conti].power) {
            assert(control[conti].addend);

            if (control[conti].addend > 0) {
                add_tw_pniels_to_tw_extensible(working, &precmp[control[conti].addend >> 1]);
            } else {
                sub_tw_pniels_from_tw_extensible(working, &precmp[(-control[conti].addend) >> 1]);
            }
            conti++;
            assert(conti <= control_bits);
        }
    }
 }

 /*
 * Variable-time scalar multiplication against a precomputed table of odd
 * multiples.
 *
 * The previous contents of *working are not read: the result is built up
 * purely from `precmp` entries selected by the recode_wnaf() recoding of
 * the low `nbits` bits of `scalar`.  When the recoding is empty, *working
 * is set to the identity.
 *
 * NOTE(review): `precmp` is presumably a table built by
 * precompute_fixed_base_wnaf() with the same table_bits -- confirm against
 * callers.
 */
 void
 scalarmul_fixed_base_wnaf_vt (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS],
    unsigned int nbits,
    const struct tw_niels_t *precmp,
    unsigned int table_bits
 ) {
    struct smvt_control control[nbits/(table_bits+1)+3];
    
    int control_bits = recode_wnaf(control, scalar, nbits, table_bits);
  
    if (control_bits > 0) {
        /* Addends are odd, so addend>>1 is the table index. */
        assert(control[0].addend > 0);
        assert(control[0].power >= 0);
        convert_tw_niels_to_tw_extensible(working, &precmp[control[0].addend >> 1]);
    } else {
        set_identity_tw_extensible(working);
        return;
    }
  
    /* For each further entry, double by the gap to the previous position
     * and then add or subtract the selected table entry.  The loop ends
     * at the terminating entry (power -1).
     */
    int conti = 1, i;
    for (; control[conti].power >= 0; conti++) {
        assert(conti <= control_bits);
        for (i = control[conti-1].power - control[conti].power; i; i--) {
            double_tw_extensible(working);
        }
        
        assert(control[conti].addend);
        if (control[conti].addend > 0) {
            add_tw_niels_to_tw_extensible(working, &precmp[control[conti].addend >> 1]);
        } else {
            sub_tw_niels_from_tw_extensible(working, &precmp[(-control[conti].addend) >> 1]);
        }
    }

    /* Double the rest of the way down to position 0. */
    for (i = control[conti-1].power; i; i--) {
        double_tw_extensible(working);
    }
 }

 /*
 * Variable-time double-scalar combination.
 *
 * On entry *working holds the variable point.  On exit it holds
 * scalar_var times that point, combined with the contribution selected
 * from `precmp` by scalar_pre.  Both scalars are recoded with
 * recode_wnaf() and the two control lists are interleaved over a single
 * doubling chain.
 *
 * When both recodings are empty, *working is set to the identity.
 *
 * NOTE(review): `precmp` is presumably a table of odd multiples of a fixed
 * point built with table_bits_pre -- confirm against callers.
 */
 void
 linear_combo_var_fixed_vt(
    struct tw_extensible_t *working,
    const word_t scalar_var[448/WORD_BITS],
    unsigned int nbits_var,
    const word_t scalar_pre[448/WORD_BITS],
    unsigned int nbits_pre,
    const struct tw_niels_t *precmp,
    unsigned int table_bits_pre
 ) {
    const int table_bits_var = 3;
    struct smvt_control control_var[nbits_var/(table_bits_var+1)+3];
    struct smvt_control control_pre[nbits_pre/(table_bits_pre+1)+3];
    
    int ncb_var = recode_wnaf(control_var, scalar_var, nbits_var, table_bits_var);
    int ncb_pre = recode_wnaf(control_pre, scalar_pre, nbits_pre, table_bits_pre);
    /* Only referenced by the asserts at the end of the function. */
    (void)ncb_var;
    (void)ncb_pre;
  
    struct tw_pniels_t precmp_var[1<<table_bits_var];
    prepare_wnaf_table(precmp_var, working, table_bits_var);
  
    int contp=0, contv=0, i;
  
    /* Initialise the accumulator from whichever list starts at the higher
     * position, or from both when they start at the same one.
     */
    i = control_var[0].power;
    if (i > control_pre[0].power) {
        convert_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[0].addend >> 1]);
        contv++;
    } else if (i == control_pre[0].power && i >=0 ) {
        convert_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[0].addend >> 1]);
        add_tw_niels_to_tw_extensible(working, &precmp[control_pre[0].addend >> 1]);
        contv++; contp++;
    } else {
        i = control_pre[0].power;
        convert_tw_niels_to_tw_extensible(working, &precmp[control_pre[0].addend >> 1]);
        contp++;
    }
    
    /* Both lists were empty: only the terminators (power -1) were seen. */
    if (i < 0) {
        set_identity_tw_extensible(working);
        return;
    }
    
    /* Shared doubling chain; each list contributes when its next entry's
     * position is reached.
     */
    for (i--; i >= 0; i--) {
        double_tw_extensible(working);

        if (i == control_var[contv].power) {
            assert(control_var[contv].addend);

            if (control_var[contv].addend > 0) {
                add_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[contv].addend >> 1]);
            } else {
                sub_tw_pniels_from_tw_extensible(working, &precmp_var[(-control_var[contv].addend) >> 1]);
            }
            contv++;
        }

        if (i == control_pre[contp].power) {
            assert(control_pre[contp].addend);

            if (control_pre[contp].addend > 0) {
                add_tw_niels_to_tw_extensible(working, &precmp[control_pre[contp].addend >> 1]);
            } else {
                sub_tw_niels_from_tw_extensible(working, &precmp[(-control_pre[contp].addend) >> 1]);
            }
            contp++;
        }
    }
    
    assert(contv == ncb_var);
    assert(contp == ncb_pre);
 }



--- a/src/sha512.c
+++ b/src/sha512.c
@@ -0,0 +1,187 @@
 /* Copyright (c) 2011 Stanford University.
 * Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __APPLE__
 #define _BSD_SOURCE
 #include <endian.h>
 #endif

 #include "sha512.h"

 #include <string.h>
 #include <assert.h>

 /* Rotate the 64-bit value x right by d bits.
 *
 * Both shift counts are reduced modulo 64, so d == 0 returns x unchanged
 * instead of evaluating the undefined shift x << 64.  Callers in this file
 * pass constants between 1 and 61, for which the result is the same as
 * (x >> d) | (x << (64-d)).
 */
 static inline uint64_t
 rotate_r (
    uint64_t x,
    int d
 ) {
  return (x >> (d & 63)) | (x << ((64-d) & 63));
 }

 #ifdef __APPLE__
 /* Host-to-big-endian conversion of a 64-bit value.  Defined here because
 * <endian.h> is only included on non-Apple builds.
 *
 * Uses the compiler's byte-swap builtin rather than x86-64 inline
 * assembly, so that it also builds on 32-bit and non-x86 targets.  On a
 * big-endian host the value is already in the right order.
 */
 static inline uint64_t
 htobe64 (uint64_t x) {
 #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
    (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    return x;
 #else
    return __builtin_bswap64(x);
 #endif
 }
 #endif

 /* Initial value of the eight 64-bit chaining words; copied into
 * ctx->chain by sha512_init().
 */
 static const uint64_t
 sha512_init_state[8] = {
    0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, 
    0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
 };

 /* Per-round additive constants: sha512_k[i] is added in round i of the
 * 80-round compression function.
 */
 static const uint64_t
 sha512_k[80] = {
    0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
    0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
    0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
    0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694,
    0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 
    0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
    0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4,
    0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70,
    0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
    0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b, 
    0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30,
    0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8,
    0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
    0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
    0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec, 
    0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b,
    0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
    0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b,
    0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
    0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
 };

 /* Round function mixing h1: XOR of its right-rotations by 28, 34 and 39. */
 static inline uint64_t S0 (uint64_t h1) {
    uint64_t mixed = rotate_r(h1, 28);
    mixed ^= rotate_r(h1, 34);
    mixed ^= rotate_r(h1, 39);
    return mixed;
 }

 /* Round function mixing h4: XOR of its right-rotations by 14, 18 and 41. */
 static inline uint64_t S1 (uint64_t h4) {
    uint64_t mixed = rotate_r(h4, 14);
    mixed ^= rotate_r(h4, 18);
    mixed ^= rotate_r(h4, 41);
    return mixed;
 }

 /* Message-schedule function: right-rotations of a by 1 and 8, XORed with
 * a shifted right by 7.
 */
 static inline uint64_t s0 (uint64_t a) {
    uint64_t shifted = a >> 7;
    uint64_t rotated = rotate_r(a, 1) ^ rotate_r(a, 8);
    return rotated ^ shifted;
 }

 /* Message-schedule function: right-rotations of b by 19 and 61, XORed
 * with b shifted right by 6.
 */
 static inline uint64_t s1 (uint64_t b) {
    uint64_t shifted = b >> 6;
    uint64_t rotated = rotate_r(b, 19) ^ rotate_r(b, 61);
    return rotated ^ shifted;
 }

 /* Bitwise choose: each result bit comes from h5 where the matching bit of
 * h4 is set, and from h6 where it is clear.
 */
 static inline uint64_t ch (uint64_t h4, uint64_t h5, uint64_t h6) {
    uint64_t differing = h5 ^ h6;
    uint64_t picked = differing & h4;
    return picked ^ h6;
 }

 /* Bitwise majority: each result bit is set when at least two of the
 * matching bits of h1, h2 and h3 are set.
 */
 static inline uint64_t maj(uint64_t h1, uint64_t h2, uint64_t h3) {
    uint64_t both = h1 & h2;
    uint64_t either_only = h1 ^ h2;
    return (h3 & either_only) ^ both;
 }

 /*
 * Run the 80-round compression function over the 128-byte ctx->block and
 * add the result into ctx->chain.
 *
 * ctx->block is overwritten: the first 16 rounds convert each of its
 * 64-bit words with htobe64() in place, and the remaining rounds reuse it
 * as a 16-word rolling message schedule.
 *
 * NOTE(review): ctx->block is accessed through a uint64_t pointer, which
 * presumably relies on the field being 8-byte aligned -- confirm against
 * the definition of struct sha512_ctx_t.
 */
 static void
 sha512_process_block (
    struct sha512_ctx_t *ctx
 ) {
    uint64_t i, tmp, a, b,
        *w = (uint64_t *) ctx->block,
        *state = ctx->chain,
        h0 = state[0], h1 = state[1], h2 = state[2], h3 = state[3],
        h4 = state[4], h5 = state[5], h6 = state[6], h7 = state[7];

    /* Clang doesn't unswitch this automatically */
    /* Rounds 0-15: message words come straight from the block. */
    for (i=0; i<16; i++) {
        /* load up the input word for this round */
        tmp = w[i] = htobe64(w[i]);
        tmp = tmp + h7 + S1(h4) + ch(h4,h5,h6) + sha512_k[i];
  
        /* shift register */
        h7 = h6; h6 = h5; h5 = h4;
        h4 = h3 + tmp;
        h3 = h2; h2 = h1; h1 = h0;
        h0 = tmp + maj(h1,h2,h3) + S0(h1);
    }
  
    /* Rounds 16-79: each message word is derived from the previous 16,
     * kept in a circular buffer indexed modulo 16.
     */
    for (; i<80; i++) {
        /* load up the input word for this round */
        a   = w[(i+1 ) & 15];
        b   = w[(i+14) & 15];
        tmp = w[i&15] = s0(a) + s1(b) + w[i&15] + w[(i+9) & 15];
        tmp = tmp + h7 + S1(h4) + ch(h4,h5,h6) + sha512_k[i];
  
        /* shift register */
        h7 = h6; h6 = h5; h5 = h4;
        h4 = h3 + tmp;
        h3 = h2; h2 = h1; h1 = h0;
        h0 = tmp + maj(h1,h2,h3) + S0(h1);
    }
 
    /* Feed-forward: add the working variables into the chaining value. */
    state[0] += h0;
    state[1] += h1;
    state[2] += h2;
    state[3] += h3;
    state[4] += h4;
    state[5] += h5;
    state[6] += h6;
    state[7] += h7;
 }

 /* Reset a hashing context: clear the block buffer, load the initial
 * chaining value and zero the byte counter.
 */
 void
 sha512_init (
    struct sha512_ctx_t *ctx
 ) {
    /* The three fields are independent, so the order does not matter. */
    memset(ctx->block, 0, sizeof(ctx->block));
    memcpy(ctx->chain, sha512_init_state, sizeof(sha512_init_state));
    ctx->nbytes = 0;
 }

 /* Absorb `bytes` bytes from `data` into the hash state.
 *
 * Input is staged in ctx->block; every time the 128-byte block fills up
 * it is compressed.  Both the running total and each individual call are
 * asserted to stay below 2^56 bytes.
 */
 void
 sha512_update (
    struct sha512_ctx_t *ctx,
    const unsigned char *data,
    uint64_t bytes
 ) {
    assert(ctx->nbytes < 1ull<<56);
    assert(bytes < 1ull<<56);
    
    while (bytes != 0) {
        /* How full the block already is, and how much more it can take. */
        uint64_t offset = ctx->nbytes % 128;
        uint64_t take = 128 - offset;
        if (take > bytes) {
            take = bytes;
        }
        
        memcpy(ctx->block + offset, data, take);
        ctx->nbytes += take;
        
        if (offset + take == 128) {
            sha512_process_block(ctx);
        }

        data += take;
        bytes -= take;
    }
    
    assert(ctx->nbytes < 1ull<<56);
 }

 /*
 * Finish the hash: append the padding and the message length, compress the
 * last block(s), write the 64-byte digest to `result`, and reset the
 * context with sha512_init() so it can be reused.
 */
 void
 sha512_final (
    struct sha512_ctx_t *ctx,
    uint8_t result[64]
 ) {
    uint64_t fill = ctx->nbytes % 128, i;
    /* Padding starts with a single 0x80 byte. */
    ctx->block[fill++] = 0x80;
    /* No room left for the 16-byte length field: zero-fill and compress
     * this block, then continue in a fresh one.
     */
    if (fill > 112) {
        memset(ctx->block + fill, 0, 128-fill);
        sha512_process_block(ctx);
        fill = 0;
    }
    memset(ctx->block + fill, 0, 112-fill);
    /* 128-bit big-endian bit length; the upper 64 bits are always zero. */
    *((uint64_t *)&ctx->block[112]) = 0;
    *((uint64_t *)&ctx->block[120]) = htobe64((ctx->nbytes * 8));
    sha512_process_block(ctx);
    /* Convert the chaining words to big-endian before copying them out. */
    for (i=0; i<8; i++) {
        ctx->chain[i] = htobe64(ctx->chain[i]);
    }
    memcpy(result, ctx->chain, sizeof(ctx->chain));
    sha512_init(ctx);
 }
--- a/test/bench.c
+++ b/test/bench.c
@@ -0,0 +1,684 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "word.h"

 #include <sys/time.h>
 #include <sys/types.h>
 #include <stdio.h>
 #include <memory.h>

 #include "p448.h"
 #include "ec_point.h"
 #include "scalarmul.h"
 #include "barrett_field.h"
 #include "crandom.h"
 #include "goldilocks.h"
 #include "sha512.h"

 /* Wall-clock time in seconds, as reported by gettimeofday(). */
 double now() {
  struct timeval stamp;
  double whole, frac;

  gettimeofday(&stamp, NULL);

  whole = stamp.tv_sec;
  frac  = stamp.tv_usec/1000000.0;
  return whole + frac;
 }

 /* Overwrite *a with generator output from crand, then strongly reduce it. */
 void p448_randomize( struct crandom_state_t *crand, struct p448_t *a ) {
    unsigned char *raw = (unsigned char *)a;

    crandom_generate(crand, raw, sizeof(*a));
    p448_strong_reduce(a);
 }

 /* Fill the 448-bit scalar sk with generator output from crand (no reduction). */
 void q448_randomize( struct crandom_state_t *crand, word_t sk[448/WORD_BITS] ) {
    unsigned char *raw = (unsigned char *)sk;

    crandom_generate(crand, raw, 448/8);
 }

 /* Print "descr = 0x<limbs>" for a strongly reduced copy of *a,
  * highest-index limb first. */
 void p448_print( const char *descr, const struct p448_t *a ) {
    p448_t reduced;
    int idx = sizeof(*a)/sizeof(a->limb[0])-1;

    p448_copy(&reduced, a);
    p448_strong_reduce(&reduced);

    printf("%s = 0x", descr);
    while (idx >= 0) {
        printf(PRIxWORD58, reduced.limb[idx]);
        idx--;
    }
    printf("\n");
 }

 void p448_print_full( const char *descr, const struct p448_t *a ) {
    int j;
    printf("%s = 0x", descr);
    for (j=15; j>=0; j--) {
        printf("%02" PRIxWORD "_" PRIxWORD58 " ",
            a->limb[j]>>28, a->limb[j]&(1<<28)-1);
    }
    printf("\n");
 }

 /* Print "descr = 0x<words>" for a 448-bit scalar, highest-index word first. */
 void q448_print( const char *descr, const word_t secret[448/WORD_BITS] ) {
    int idx = 448/WORD_BITS;

    printf("%s = 0x", descr);
    while (idx-- > 0) {
        printf(PRIxWORDfull, secret[idx]);
    }
    printf("\n");
 }

 #ifndef N_TESTS_BASE
 #define N_TESTS_BASE 10000
 #endif

 /*
  * Benchmark driver.
  *
  * Times the field, hash, point and scalar-multiplication primitives and
  * the high-level Goldilocks API, printing one line per measurement, then
  * runs a handful of randomized consistency checks.  Loop counts scale
  * with N_TESTS_BASE.  Always returns 0; failures are only reported on
  * stdout.
  */
 int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    struct tw_extensible_t ext;
    struct extensible_t exta;
    struct tw_niels_t niels;
    struct tw_pniels_t pniels;
    struct affine_t affine;
    struct montgomery_t mb;
    struct p448_t a,b,c,d;
    
    
    double when;
    int i;

    int nbase = N_TESTS_BASE;
    
    /* Bad randomness so we can debug. */
    char initial_seed[32];
    for (i=0; i<32; i++) initial_seed[i] = i;
    struct crandom_state_t crand;
    crandom_init_from_buffer(&crand, initial_seed);
    
    word_t sk[448/WORD_BITS],tk[448/WORD_BITS];
    q448_randomize(&crand, sk);
    
    /* ---- Field arithmetic ----
     * NOTE(review): a and b are not initialised before these first timing
     * loops; only the timing is used, not the values. */
    when = now();
    for (i=0; i<nbase*1000; i++) {
        p448_mul(&c, &b, &a);
    }
    when = now() - when;
    printf("mul:         %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<nbase*1000; i++) {
        p448_sqr(&c, &a);
    }
    when = now() - when;
    printf("sqr:         %5.1fns\n", when * 1e9 / i);
    
    /* Two dependent multiplies per iteration, hence the extra /2. */
    when = now();
    for (i=0; i<nbase*500; i++) {
        p448_mul(&c, &b, &a);
        p448_mul(&a, &b, &c);
    }
    when = now() - when;
    printf("mul dep:     %5.1fns\n", when * 1e9 / i / 2);
    
    when = now();
    for (i=0; i<nbase*1000; i++) {
        p448_mulw(&c, &b, 1234562);
    }
    when = now() - when;
    printf("mulw:        %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<nbase*10; i++) {
        p448_randomize(&crand, &a);
    }
    when = now() - when;
    printf("rand448:     %5.1fns\n", when * 1e9 / i);
    
    /* ---- SHA-512 ---- */
    struct sha512_ctx_t sha;
    uint8_t hashout[128];
    when = now();
    for (i=0; i<nbase; i++) {
        sha512_init(&sha);
        sha512_final(&sha, hashout);
    }
    when = now() - when;
    printf("sha512 1blk: %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<nbase; i++) {
        sha512_update(&sha, hashout, 128);
    }
    when = now() - when;
    printf("sha512 blk:  %5.1fns (%0.2f MB/s)\n", when * 1e9 / i, 128*i/when/1e6);
    
    /* ---- Inverse square root, with validation ---- */
    when = now();
    for (i=0; i<nbase; i++) {
        p448_isr(&c, &a);
    }
    when = now() - when;
    printf("isr auto:    %5.1fµs\n", when * 1e6 / i);
    
    for (i=0; i<100; i++) {
        p448_randomize(&crand, &a);
        p448_isr(&d,&a);
        p448_sqr(&b,&d);
        p448_mul(&c,&b,&a);
        p448_sqr(&b,&c);
        p448_subw(&b,1);
        p448_bias(&b,1);
        if (!p448_is_zero(&b)) {
            printf("ISR validation failure!\n");
            p448_print("a", &a);
            p448_print("s", &d);
        }
    }
    
    /* ---- Elligator, with validation ---- */
    when = now();
    for (i=0; i<nbase; i++) {
        elligator_2s_inject(&affine, &a);
    }
    when = now() - when;
    printf("elligator:   %5.1fµs\n", when * 1e6 / i);
    
    for (i=0; i<100; i++) {
        p448_randomize(&crand, &a);
        elligator_2s_inject(&affine, &a);
        if (!validate_affine(&affine)) {
            printf("Elligator validation failure!\n");
            p448_print("a", &a);
            p448_print("x", &affine.x);
            p448_print("y", &affine.y);
        }
    }
    
    /* ---- Point (de)serialization, with round-trip validation ---- */
    when = now();
    for (i=0; i<nbase; i++) {
        deserialize_affine(&affine, &a);
    }
    when = now() - when;
    printf("decompress:  %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase; i++) {
        serialize_extensible(&a, &exta);
    }
    when = now() - when;
    printf("compress:    %5.1fµs\n", when * 1e6 / i);
    
    int goods = 0;
    for (i=0; i<100; i++) {
        p448_randomize(&crand, &a);
        mask_t good = deserialize_affine(&affine, &a);
        if (good & !validate_affine(&affine)) {
            printf("Deserialize validation failure!\n");
            p448_print("a", &a);
            p448_print("x", &affine.x);
            p448_print("y", &affine.y);
        } else if (good) {
            goods++;
            convert_affine_to_extensible(&exta,&affine);
            serialize_extensible(&b, &exta);
            p448_sub(&c,&b,&a);
            p448_bias(&c,2);
            if (!p448_is_zero(&c)) {
                printf("Reserialize validation failure!\n");
                p448_print("a", &a);
                p448_print("x", &affine.x);
                p448_print("y", &affine.y);
                deserialize_affine(&affine, &b);
                p448_print("b", &b);
                p448_print("x", &affine.x);
                p448_print("y", &affine.y);
                printf("\n");
            }
        }
    }
    if (goods<i/3) {
        printf("Deserialization validation failure! Deserialized %d/%d points\n", goods, i);
    }
    
    /* ---- Barrett arithmetic ---- */
    word_t lsk[768/WORD_BITS];
    crandom_generate(&crand, (unsigned char *)lsk, sizeof(lsk));
    
    when = now();
    for (i=0; i<nbase*100; i++) {
        barrett_reduce(lsk,sizeof(lsk)/sizeof(word_t),0,&goldi_q448);
    }
    when = now() - when;
    printf("barrett red: %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<nbase*10; i++) {
        barrett_mac(lsk,448/WORD_BITS,lsk,448/WORD_BITS,lsk,448/WORD_BITS,&goldi_q448);
    }
    when = now() - when;
    printf("barrett mac: %5.1fns\n", when * 1e9 / i);
    
    /* ---- Low-level point operations ---- */
    when = now();
    for (i=0; i<nbase*100; i++) {
        add_tw_niels_to_tw_extensible(&ext, &niels);
    }
    when = now() - when;
    printf("exti+niels:  %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<nbase*100; i++) {
        add_tw_pniels_to_tw_extensible(&ext, &pniels);
    }
    when = now() - when;
    printf("exti+pniels: %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<nbase*100; i++) {
        double_tw_extensible(&ext);
    }
    when = now() - when;
    printf("exti dbl:    %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<nbase*100; i++) {
        untwist_and_double(&exta, &ext);
    }
    when = now() - when;
    printf("i->a isog:   %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<nbase*100; i++) {
        twist_and_double(&ext, &exta);
    }
    when = now() - when;
    printf("a->i isog:   %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<nbase*100; i++) {
        montgomery_step(&mb);
    }
    when = now() - when;
    printf("monty step:  %5.1fns\n", when * 1e9 / i);
    
    /* ---- Scalar multiplication variants ---- */
    when = now();
    for (i=0; i<nbase/10; i++) {
        (void)montgomery_ladder(&a,&b,sk,448,0);
    }
    when = now() - when;
    printf("full ladder: %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase/10; i++) {
        scalarmul(&ext,sk);
    }
    when = now() - when;
    printf("edwards smz: %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase/10; i++) {
        scalarmul_vlook(&ext,sk);
        untwist_and_double_and_serialize(&a,&ext);
    }
    when = now() - when;
    printf("edwards svl: %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase/10; i++) {
        q448_randomize(&crand, sk);
        scalarmul_vt(&ext,sk);
    }
    when = now() - when;
    printf("edwards vtm: %5.1fµs\n", when * 1e6 / i);
    
    /* wNAF table sized for the largest window used below (6 bits). */
    struct tw_niels_t wnaft[1<<6];
    when = now();
    for (i=0; i<nbase/10; i++) {
        (void)precompute_fixed_base_wnaf(wnaft,&ext,6);
    }
    when = now() - when;
    printf("wnaf6 pre:   %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase/10; i++) {
        q448_randomize(&crand, sk);
        scalarmul_fixed_base_wnaf_vt(&ext,sk,446,wnaft,6);
    }
    when = now() - when;
    printf("edwards vt6: %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase/10; i++) {
        (void)precompute_fixed_base_wnaf(wnaft,&ext,4);
    }
    when = now() - when;
    printf("wnaf4 pre:   %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase/10; i++) {
        q448_randomize(&crand, sk);
        scalarmul_fixed_base_wnaf_vt(&ext,sk,446,wnaft,4);
    }
    when = now() - when;
    printf("edwards vt4: %5.1fµs\n", when * 1e6 / i);

    when = now();
    for (i=0; i<nbase/10; i++) {
        (void)precompute_fixed_base_wnaf(wnaft,&ext,5);
    }
    when = now() - when;
    printf("wnaf5 pre:   %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase/10; i++) {
        q448_randomize(&crand, sk);
        scalarmul_fixed_base_wnaf_vt(&ext,sk,446,wnaft,5);
    }
    when = now() - when;
    printf("edwards vt5: %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase/10; i++) {
        q448_randomize(&crand, sk);
        q448_randomize(&crand, tk);
        linear_combo_var_fixed_vt(&ext,sk,448,tk,448,wnaft,5);
    }
    when = now() - when;
    printf("vt vf combo: %5.1fµs\n", when * 1e6 / i);
    
    /* Full pipeline: decompress, twist, multiply, untwist, compress. */
    when = now();
    for (i=0; i<nbase/10; i++) {
        deserialize_affine(&affine, &a);
        convert_affine_to_extensible(&exta,&affine);
        twist_and_double(&ext,&exta);
        scalarmul(&ext,sk);
        untwist_and_double(&exta,&ext);
        serialize_extensible(&b, &exta);
    }
    when = now() - when;
    printf("edwards sm:  %5.1fµs\n", when * 1e6 / i);
    
    /* ---- Fixed-base comb tables: precomputation, then use ---- */
    struct fixed_base_table_t t_5_5_18, t_3_5_30, t_8_4_14, t_5_3_30, t_15_3_10;

    /* Find a random field element that deserializes to a point. */
    while (1) {
        p448_randomize(&crand, &a);
        if (deserialize_affine(&affine, &a)) break;
    }
    convert_affine_to_extensible(&exta,&affine);
    twist_and_double(&ext,&exta);
    when = now();
    for (i=0; i<nbase/10; i++) {
        /* Release the previous iteration's table before rebuilding it. */
        if (i) destroy_fixed_base(&t_5_5_18);
        (void)precompute_fixed_base(&t_5_5_18, &ext, 5, 5, 18, NULL);
    }
    when = now() - when;
    printf("pre(5,5,18): %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase/10; i++) {
        if (i) destroy_fixed_base(&t_3_5_30);
        (void)precompute_fixed_base(&t_3_5_30, &ext, 3, 5, 30, NULL);
    }
    when = now() - when;
    printf("pre(3,5,30): %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase/10; i++) {
        if (i) destroy_fixed_base(&t_5_3_30);
        (void)precompute_fixed_base(&t_5_3_30, &ext, 5, 3, 30, NULL);
    }
    when = now() - when;
    printf("pre(5,3,30): %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase/10; i++) {
        if (i) destroy_fixed_base(&t_15_3_10);
        (void)precompute_fixed_base(&t_15_3_10, &ext, 15, 3, 10, NULL);
    }
    when = now() - when;
    printf("pre(15,3,10): %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase/10; i++) {
        if (i) destroy_fixed_base(&t_8_4_14);
        (void)precompute_fixed_base(&t_8_4_14, &ext, 8, 4, 14, NULL);
    }
    when = now() - when;
    printf("pre(8,4,14): %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase; i++) {
        scalarmul_fixed_base(&ext, sk, 448, &t_5_5_18);
    }
    when = now() - when;
    printf("com(5,5,18): %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase; i++) {
        scalarmul_fixed_base(&ext, sk, 448, &t_3_5_30);
    }
    when = now() - when;
    printf("com(3,5,30): %5.1fµs\n", when * 1e6 / i);

    when = now();
    for (i=0; i<nbase; i++) {
        scalarmul_fixed_base(&ext, sk, 448, &t_8_4_14);
    }
    when = now() - when;
    printf("com(8,4,14): %5.1fµs\n", when * 1e6 / i);

    when = now();
    for (i=0; i<nbase; i++) {
        scalarmul_fixed_base(&ext, sk, 448, &t_5_3_30);
    }
    when = now() - when;
    printf("com(5,3,30): %5.1fµs\n", when * 1e6 / i);

    when = now();
    for (i=0; i<nbase; i++) {
        scalarmul_fixed_base(&ext, sk, 448, &t_15_3_10);
    }
    when = now() - when;
    printf("com(15,3,10): %5.1fµs\n", when * 1e6 / i);
    
    /* ---- High-level Goldilocks API ---- */
    printf("\nGoldilocks:\n");
    
    int res = goldilocks_init();
    assert(!res);
    
    struct goldilocks_public_key_t gpk,hpk;
    struct goldilocks_private_key_t gsk,hsk;
    
    /* Alternate between the two key pairs so both end up populated. */
    when = now();
    for (i=0; i<nbase; i++) {
        if (i&1) {
            res = goldilocks_keygen(&gsk,&gpk);
        } else {
            res = goldilocks_keygen(&hsk,&hpk);
        }
        assert(!res);
    }
    when = now() - when;
    printf("keygen:      %5.1fµs\n", when * 1e6 / i);
    
    /* Initialised so that the comparison below is well defined even when
     * nbase is too small for both branches of the loop to run. */
    uint8_t ss1[64] = {0}, ss2[64] = {0};
    int gres1 = 0, gres2 = 0;
    when = now();
    for (i=0; i<nbase; i++) {
        if (i&1) {
            gres1 = goldilocks_shared_secret(ss1,&gsk,&hpk);
        } else {
            gres2 = goldilocks_shared_secret(ss2,&hsk,&gpk);
        }
    }
    when = now() - when;
    printf("ecdh:        %5.1fµs\n", when * 1e6 / i);
    if (gres1 || gres2 || memcmp(ss1,ss2,64)) {
        printf("[FAIL] %d %d\n",gres1,gres2);
        
        printf("sk1 = ");
        for (i=0; i<56; i++) {
            printf("%02x", gsk.opaque[i]);
        }
        printf("\nsk2 = ");
        for (i=0; i<56; i++) {
            printf("%02x", hsk.opaque[i]);
        }
        /* Dump the whole shared secret, matching the 64 bytes compared. */
        printf("\nss1 = ");
        for (i=0; i<(int)sizeof(ss1); i++) {
            printf("%02x", ss1[i]);
        }
        printf("\nss2 = ");
        for (i=0; i<(int)sizeof(ss2); i++) {
            printf("%02x", ss2[i]);
        }
        printf("\n");
    }
    
    uint8_t sout[56*2];
    const char *message = "hello world";
    size_t message_len = strlen(message);
    when = now();
    for (i=0; i<nbase; i++) {
        res = goldilocks_sign(sout,(const unsigned char *)message,message_len,&gsk);
        assert(!res);
    }
    when = now() - when;
    printf("sign:        %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase; i++) {
        res = goldilocks_verify(sout,(const unsigned char *)message,message_len,&gpk);
        (void)res;
    }
    when = now() - when;
    printf("verify:      %5.1fµs\n", when * 1e6 / i);
    
    /* ---- Randomized consistency checks ---- */
    printf("\nTesting...\n");
    
    
    /* Sign/verify round trip with fresh keys. */
    int failures=0, successes = 0;
    for (i=0; i<nbase/10; i++) {
        (void)goldilocks_keygen(&gsk,&gpk);
        goldilocks_sign(sout,(const unsigned char *)message,message_len,&gsk);
        res = goldilocks_verify(sout,(const unsigned char *)message,message_len,&gpk);
        if (res) failures++;
    }
    if (failures) {
        printf("FAIL %d/%d signature checks!\n", failures, i);
    }
    
    /* Ladder check: laddering by x and then by y must match laddering by x*y. */
    failures=0; successes = 0;
    for (i=0; i<nbase/10; i++) {
        p448_randomize(&crand, &a);
        word_t two = 2;
        mask_t good = montgomery_ladder(&b,&a,&two,2,0);
        if (!good) continue;
        
        word_t x,y;
        crandom_generate(&crand, (unsigned char *)&x, sizeof(x));
        crandom_generate(&crand, (unsigned char *)&y, sizeof(y));
        /* Truncate to half words so that x*y fits in one word. */
        x = (hword_t)x;
        y = (hword_t)y;
        word_t z=x*y;
        
        (void)montgomery_ladder(&b,&a,&x,WORD_BITS,0);
        (void)montgomery_ladder(&c,&b,&y,WORD_BITS,0);
        (void)montgomery_ladder(&b,&a,&z,WORD_BITS,0);
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
        if (!p448_is_zero(&d)) {
            printf("Odd ladder validation failure %d!\n", ++failures);
            p448_print("a", &a);
            printf("x=%"PRIxWORD", y=%"PRIxWORD", z=%"PRIxWORD"\n", x,y,z);
            p448_print("c", &c);
            p448_print("b", &b);
            printf("\n");
        }
    }
    
    /* Two-step untwist+serialize must match the fused routine. */
    failures = 0;
    for (i=0; i<nbase/10; i++) {
        mask_t good;
        do {
            p448_randomize(&crand, &a);
            good = deserialize_affine(&affine, &a);
        } while (!good);
        
        convert_affine_to_extensible(&exta,&affine);
        twist_and_double(&ext,&exta);
        untwist_and_double(&exta,&ext);
        serialize_extensible(&b, &exta);
        untwist_and_double_and_serialize(&c, &ext);
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
        
        if (good && !p448_is_zero(&d)){
            printf("Iso+serial validation failure %d!\n", ++failures);
            p448_print("a", &a);
            p448_print("b", &b);
            p448_print("c", &c);
            printf("\n");
        } else if (good) {
            successes ++;
        }
    }
    if (successes < i/3) {
        printf("Iso+serial variation: only %d/%d successful.\n", successes, i);
    }
    
    /* sk*P + tk*Q via two scalarmuls and an add must match the combined
     * variable/fixed-base routine. */
    successes = failures = 0;
    for (i=0; i<nbase/10; i++) {
        struct p448_t aa;
        struct tw_extensible_t exu,exv,exw;
        
        mask_t good;
        do {
            p448_randomize(&crand, &a);
            good = deserialize_affine(&affine, &a);
            convert_affine_to_extensible(&exta,&affine);
            twist_and_double(&ext,&exta);
        } while (!good);
        do {
            p448_randomize(&crand, &aa);
            good = deserialize_affine(&affine, &aa);
            convert_affine_to_extensible(&exta,&affine);
            twist_and_double(&exu,&exta);
        } while (!good);
        p448_randomize(&crand, &aa);
        
        /* The first iterations force zero scalars to exercise edge cases. */
        q448_randomize(&crand, sk);
        if (i==0 || i==2) memset(&sk, 0, sizeof(sk));
        q448_randomize(&crand, tk);
        if (i==0 || i==1) memset(&tk, 0, sizeof(tk));
        
        /* Reference result: exv = sk*ext + tk*exu. */
        copy_tw_extensible(&exv, &ext);
        copy_tw_extensible(&exw, &exu);
        scalarmul(&exv,sk);
        scalarmul(&exw,tk);
        convert_tw_extensible_to_tw_pniels(&pniels, &exw);
        add_tw_pniels_to_tw_extensible(&exv,&pniels);
        untwist_and_double(&exta,&exv);
        serialize_extensible(&b, &exta);

        /* Result under test is left in ext by linear_combo_var_fixed_vt, so
         * ext (not exv) must be serialized here; serializing exv again made
         * the comparison below compare a value with itself. */
        (void)precompute_fixed_base_wnaf(wnaft,&exu,5);
        linear_combo_var_fixed_vt(&ext,sk,448,tk,448,wnaft,5);
        untwist_and_double(&exta,&ext);
        serialize_extensible(&c, &exta);
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
        
        if (!p448_is_zero(&d)){
            printf("PreWNAF combo validation failure %d!\n", ++failures);
            p448_print("a", &a);
            p448_print("A", &aa);
            q448_print("s", sk);
            q448_print("t", tk);
            p448_print("c", &c);
            p448_print("b", &b);
            printf("\n\n");
        } else if (good) {
            successes ++;
        }
    }
    if (successes < i) {
        printf("PreWNAF combo variation: only %d/%d successful.\n", successes, i);
    }
    
    return 0;
 }
--- a/test/test.c
+++ b/test/test.c
@@ -0,0 +1,134 @@
 #include "test.h"

 #include <stdio.h>
 #include <string.h>


 int failed_tests, n_tests, failed_this_test, running_a_test;

 /* Close the currently running test: count it, clear the running flag and
  * print "[PASS]" unless a failure was already reported for it. */
 void end_test() {
    running_a_test = 0;
    n_tests ++;

    if (failed_this_test) return;
    printf("[PASS]\n");
 }

 /* Start a new named test, closing the previous one if it is still open.
  * Prints "name..." padded so that the verdict column lines up for names
  * of up to 30 characters. */
 void begin_test(const char *name) {
    if (running_a_test) end_test();

    /* Clamp the padding at zero: 30-strlen(name) is unsigned and would wrap
     * for long names, handing printf a bogus (negative) field width. */
    size_t len = strlen(name);
    int pad = (len < 30) ? (int)(30-len) : 0;

    printf("%s...%*s",name,pad,"");
    fflush(stdout);
    failed_this_test = 0;
    running_a_test = 1;
 }

 /* Record a failure for the current test.  Only the first call per test
  * bumps the failure counter and prints "[FAIL]". */
 void youfail() {
    if (!failed_this_test) {
        failed_this_test = 1;
        failed_tests ++;
        printf("[FAIL]\n");
    }
 }

 /* Value (0..15) of one hexadecimal digit in either case, or -1 if c is
  * not a hexadecimal digit. */
 static int
 hexchar (char c) {
    if ('0' <= c && c <= '9') return c - '0';
    if ('a' <= c && c <= 'f') return c - 'a' + 10;
    if ('A' <= c && c <= 'F') return c - 'A' + 10;
    return -1;
 }

 /* Decode a hex string of exactly 2*nbytes digits into bytes.
  *
  * Returns 0 on success, or -1 if the string has the wrong length or
  * contains a non-hex character (bytes decoded before the bad character
  * have already been written). */
 int
 hexdecode (
    unsigned char *bytes,
    const char *hex,
    unsigned int nbytes
 ) {
    unsigned int pos;

    if (strlen(hex) != nbytes*2) return -1;

    for (pos=0; pos<nbytes; pos++) {
        const char *pair = &hex[2*pos];
        int high = hexchar(pair[0]);
        int low  = hexchar(pair[1]);

        if (high<0 || low<0) {
            return -1;
        }
        bytes[pos] = high*16 + low;
    }

    return 0;
 }

 /* Print nbytes bytes as lowercase hex followed by a newline, preceded by
  * "descr = " when descr is non-NULL. */
 void
 hexprint (
    const char *descr,
    const unsigned char *bytes,
    unsigned int nbytes
 ) {
    const unsigned char *cur = bytes, *end = bytes + nbytes;

    if (descr) {
        printf("%s = ", descr);
    }
    while (cur != end) {
        printf("%02x", *cur++);
    }
    printf("\n");
 }

 /* Print "descr = 0x<limbs>" for a strongly reduced copy of *a,
  * highest-index limb first. */
 void p448_print (
    const char *descr,
    const struct p448_t *a
 ) {
    p448_t reduced;
    int idx = sizeof(*a)/sizeof(word_t)-1;

    p448_copy(&reduced, a);
    p448_strong_reduce(&reduced);

    printf("%s = 0x", descr);
    while (idx >= 0) {
        printf(PRIxWORD58, reduced.limb[idx]);
        idx--;
    }
    printf("\n");
 }

 /* Print "descr = 0x<words>" for an nwords-word scalar, highest-index
  * word first. */
 void scalar_print (
    const char *descr,
    const word_t *scalar,
    int nwords
 ) {
    int remaining = nwords;

    printf("%s = 0x", descr);
    while (remaining > 0) {
        remaining--;
        printf(PRIxWORDfull, scalar[remaining]);
    }
    printf("\n");
 }

 /* Test driver: runs each test group in turn and prints a summary.
  * Exit status is 1 if any test failed, 0 otherwise. */
 int main(int argc, char **argv) {
    (void) argc;
    (void) argv;

    failed_tests = 0;
    running_a_test = 0;
    n_tests = 0;

    /* Each begin_test() also closes the preceding test. */
    begin_test("SHA-512 NIST Monte Carlo");
    test_sha512_monte_carlo();

    begin_test("EC point operations");
    test_pointops();

    begin_test("Scalarmul compatibility");
    test_scalarmul_compatibility();

    begin_test("Scalarmul commutativity");
    test_scalarmul_commutativity();

    /* Close whichever test is still open. */
    if (running_a_test) end_test();
    printf("\n");

    if (!failed_tests) {
        printf("Passed all %d tests.\n", n_tests);
        return 0;
    }

    printf("Failed %d / %d tests.\n", failed_tests, n_tests);
    return 1;
 }
--- a/test/test.h
+++ b/test/test.h
@@ -0,0 +1,42 @@
 /* Shared declarations for the test programs: printing/decoding helpers,
  * the failure hook, and the entry points of the individual test groups. */
 #ifndef __GOLDILOCKS_TEST_H__
 #define __GOLDILOCKS_TEST_H__ 1

 #include "word.h"
 #include "p448.h"

 /**
  * Decode a hexadecimal string into bytes.
  *
  * @param bytes   Output buffer of at least nbytes bytes.
  * @param hex     NUL-terminated string of exactly 2*nbytes hex digits
  *                (either case).
  * @param nbytes  Number of bytes to decode.
  * @return 0 on success, -1 on a length mismatch or a non-hex character.
  */
 int
 hexdecode (
    unsigned char *bytes,
    const char *hex,
    unsigned int nbytes
 );

 /**
  * Print bytes as lowercase hex followed by a newline.
  *
  * @param descr   Optional label, printed as "descr = "; may be NULL.
  * @param bytes   Bytes to print.
  * @param nbytes  Number of bytes to print.
  */
 void
 hexprint (
    const char *descr,
    const unsigned char *bytes,
    unsigned int nbytes
 );
    
 /**
  * Print a field element as "descr = 0x..." after strongly reducing a copy
  * of it; limbs are printed from the highest index down.
  */
 void p448_print (
    const char *descr,
    const struct p448_t *a
 );
    
 /**
  * Print an nwords-word scalar as "descr = 0x...", highest-index word first.
  */
 void scalar_print (
    const char *descr,
    const word_t *scalar,
    int nwords
 );

 /**
  * Mark the currently running test as failed.  The first call within a
  * test prints "[FAIL]" and bumps the failure count; later calls within
  * the same test do nothing.
  */
 void youfail();

 /* Test group entry points.  test_pointops() returns 0 on success and
  * non-zero on the first failure; the other groups are defined elsewhere
  * and presumably follow the same convention (TODO confirm).  The driver
  * in test.c ignores these return values and relies on youfail(). */

 int test_sha512_monte_carlo();

 int test_scalarmul_compatibility ();

 int test_scalarmul_commutativity ();

 int test_pointops ();

 #endif // __GOLDILOCKS_TEST_H__
--- a/test/test_pointops.c
+++ b/test/test_pointops.c
@@ -0,0 +1,287 @@
 #include "test.h"

 #include <stdio.h>

 #include "ec_point.h"
 #include "p448.h"
 #include "crandom.h"


 static void
 failprint_ext (
    const struct extensible_t *a
 ) {
    struct p448_t zi, scaled;
    p448_print("    x", &a->x);
    p448_print("    y", &a->y);
    p448_print("    z", &a->z);
    p448_inverse(&zi, &a->z);
    p448_mul(&scaled, &zi, &a->x);
    p448_print("    X", &scaled);
    p448_mul(&scaled, &zi, &a->y);
    p448_print("    Y", &scaled);
    printf("\n");
 }

 /* Dump a twisted-curve point by viewing it through the extensible_t
  * layout and reusing failprint_ext(). */
 static void
 failprint_tw_ext (
    const struct tw_extensible_t *a
 ) {
    const struct extensible_t *as_ext = (const struct extensible_t *)a;

    failprint_ext(as_ext);
 }

 /* Compare two points with eq_extensible().  On a mismatch, flag the test
  * as failed and print faildescr followed by both points under their
  * labels.  Returns the comparison mask. */
 static mask_t
 fail_if_different (
    const struct extensible_t *a,
    const struct extensible_t *b,
    const char *faildescr,
    const char *adescr,
    const char *bdescr
 ) {
    mask_t same = eq_extensible(a, b);

    if (same) {
        return same;
    }

    youfail();
    printf("    %s\n", faildescr);

    printf("\n    %s:\n", adescr);
    failprint_ext(a);

    printf("\n    %s:\n", bdescr);
    failprint_ext(b);

    return same;
 }

 /* Validate a point and, optionally, its evenness.
  *
  * evenness > 0: the point must also satisfy is_even_pt().
  * evenness < 0: the point must NOT satisfy is_even_pt().
  * evenness == 0: only validate_extensible() is checked.
  *
  * On failure the test is flagged and the reason, the description and the
  * point are printed.  Returns the combined success mask.  Like the
  * `~succ` test below, this assumes the masks are all-ones on success and
  * zero on failure.
  */
 static mask_t
 validate_ext(
    const struct extensible_t *ext,
    int evenness,
    const char *description
 ) {
    mask_t succ = validate_extensible(ext), even;
    const char *error = "Point isn't on the curve.";
    if (evenness > 0) {
        even = is_even_pt(ext);
        if (succ &~ even) error = "Point isn't even.";
        succ &= even;
    } else if (evenness < 0) {
        /* Oddness required: previously this branch tested and accumulated
         * the same condition as the branch above, i.e. it demanded an even
         * point while reporting the opposite. */
        even = is_even_pt(ext);
        if (succ & even) error = "Point is even but shouldn't be.";
        succ &= ~even;
    } /* FUTURE: quadness */
    
    if (~succ) {
        youfail();
        printf("    %s\n", error);
        printf("    %s\n", description);
        failprint_ext(ext);
    }
    
    return succ;
 }

 /* Validate a twisted-curve point and, optionally, its evenness.
  *
  * evenness > 0: the point must also satisfy is_even_tw().
  * evenness < 0: the point must NOT satisfy is_even_tw().
  * evenness == 0: only validate_tw_extensible() is checked.
  *
  * On failure the test is flagged and the reason, the description and the
  * point are printed.  Returns the combined success mask.  Like the
  * `~succ` test below, this assumes the masks are all-ones on success and
  * zero on failure.
  */
 static mask_t
 validate_tw_ext(
    const struct tw_extensible_t *ext,
    int evenness,
    const char *description
 ) {
    mask_t succ = validate_tw_extensible(ext), even;
    const char *error = "Point isn't on the twisted curve.";
    if (evenness > 0) {
        even = is_even_tw(ext);
        if (succ &~ even) error = "Point isn't even.";
        succ &= even;
    } else if (evenness < 0) {
        /* Oddness required: previously this branch tested and accumulated
         * the same condition as the branch above, i.e. it demanded an even
         * point while reporting the opposite. */
        even = is_even_tw(ext);
        if (succ & even) error = "Point is even but shouldn't be.";
        succ &= ~even;
    } /* FUTURE: quadness */
    
    if (~succ) {
        youfail();
        printf("    %s\n", error);
        printf("    %s\n", description);
        failprint_tw_ext(ext);
    }
    
    return succ;
 }

 /* Twisted-curve variant of fail_if_different(): both points are viewed
  * through the extensible_t layout and compared there. */
 static mask_t
 fail_if_different_tw (
    const struct tw_extensible_t *a,
    const struct tw_extensible_t *b,
    const char *faildescr,
    const char *adescr,
    const char *bdescr
 ) {
    const struct extensible_t *lhs = (const struct extensible_t *)a;
    const struct extensible_t *rhs = (const struct extensible_t *)b;

    return fail_if_different(lhs, rhs, faildescr, adescr, bdescr);
 }

 static int
 add_double_test (
    const struct affine_t *base1,
    const struct affine_t *base2 
 ) {
    mask_t succ = MASK_SUCCESS;
    struct extensible_t exb;
    struct tw_extensible_t text1, text2, texta, textb;
    struct tw_pniels_t pn;
    
    /* Convert to ext */
    convert_affine_to_extensible(&exb, base1);
    succ &= validate_ext(&exb,0,"base1");
    twist_and_double(&text1, &exb);
    succ &= validate_tw_ext(&text1,2,"iso1");
    convert_affine_to_extensible(&exb, base2);
    succ &= validate_ext(&exb,0,"base2");
    twist_and_double(&text2, &exb);
    succ &= validate_tw_ext(&text2,2,"iso2");
    
    /* a + b == b + a? */
    convert_tw_extensible_to_tw_pniels(&pn, &text1);
    copy_tw_extensible(&texta, &text2);
    add_tw_pniels_to_tw_extensible(&texta, &pn);
    
    convert_tw_extensible_to_tw_pniels(&pn, &text2);
    copy_tw_extensible(&textb, &text1);
    add_tw_pniels_to_tw_extensible(&textb, &pn);
    
    succ &= fail_if_different_tw(&texta,&textb,"Addition commutativity","a+b","b+a");
    
    copy_tw_extensible(&textb, &text2);
    add_tw_pniels_to_tw_extensible(&textb, &pn);
    copy_tw_extensible(&texta, &text2);
    double_tw_extensible(&texta);
    
    succ &= fail_if_different_tw(&texta,&textb,"Doubling test","2b","b+b");
    
    if (~succ) {
        printf("    Bases were:\n");
        p448_print("    x1", &base1->x);
        p448_print("    y1", &base1->y);
        p448_print("    x2", &base2->x);
        p448_print("    y2", &base2->y);
    }
    
    return succ ? 0 : -1;
 }

 /* Exercise the twist/untwist maps on a single affine point.
  *
  * Checks that untwist_and_double(twist_and_double(P)) equals 4*P, that
  * test_only_twist() preserves evenness, that the fused
  * untwist_and_double_and_serialize() agrees with serializing 2*P, and
  * that the maps also compose correctly starting from the twisted side.
  *
  * Returns 0 if every check passed, -1 otherwise (details are printed and
  * the test is flagged by the helpers). */
 static int
 single_twisting_test (
    const struct affine_t *base
 ) {
    struct extensible_t exb, ext, tmpext;
    struct tw_extensible_t text, text2;
    mask_t succ = MASK_SUCCESS;
    
    convert_affine_to_extensible(&exb, base);
    succ &= validate_ext(&exb,0,"base");
    
    /* check: dual . iso = 4 */
    twist_and_double(&text, &exb);
    succ &= validate_tw_ext(&text,2,"iso");
    untwist_and_double(&ext, &text);
    succ &= validate_ext(&ext,2,"dual.iso");
    
    copy_extensible(&tmpext,&exb);
    double_extensible(&tmpext);
    succ &= validate_ext(&tmpext,1,"2*base");
    
    double_extensible(&tmpext);
    succ &= validate_ext(&tmpext,2,"4*base");
    
    succ &= fail_if_different(&ext,&tmpext,"Isogeny and dual","Dual . iso","4*base");
    
    /* check: twist and serialize */
    test_only_twist(&text, &exb);
    succ &= validate_tw_ext(&text,0,"tot");
    mask_t evt = is_even_tw(&text), evb = is_even_pt(&exb);
    if (evt != evb) {
        youfail();
        /* Arguments follow the labels: base first (evb), then twist (evt). */
        printf("    Different evenness from twist base: %d, twist: %d\n", (int)-evb, (int)-evt);
        
        succ = 0;
    } /* FUTURE: quadness */
    
    p448_t sera,serb;
    untwist_and_double_and_serialize(&sera,&text);
    copy_extensible(&tmpext,&exb);
    double_extensible(&tmpext);
    serialize_extensible(&serb,&tmpext);
    
    /* check that their (doubled; FUTURE?) serializations are equal */
    if (~p448_eq(&sera,&serb)) {
        youfail();
        printf("    Different serialization from twist + double ()\n");
        p448_print("    t", &sera);
        p448_print("    b", &serb);
        succ = 0;
    }
    
    /* Validate the point just computed (ext); the old code re-validated
     * tmpext here, so the "dual.tot" result was never checked. */
    untwist_and_double(&ext, &text);
    succ &= validate_ext(&ext,1,"dual.tot");
    
    twist_and_double(&text2, &ext);
    succ &= validate_tw_ext(&text2,2,"iso.dual.tot");

    double_tw_extensible(&text);
    succ &= validate_tw_ext(&text,1,"2*tot");

    double_tw_extensible(&text);
    succ &= validate_tw_ext(&text,2,"4*tot");
    
    succ &= fail_if_different_tw(&text,&text2,"Dual and isogeny","4*tot","iso.dual.tot");
    
    if (~succ) {
        printf("    Base was:\n");
        p448_print("    x", &base->x);
        p448_print("    y", &base->y);
    }
    
    
    return succ ? 0 : -1;
 }

 /* Run the point-operation checks on 1000 points produced by
  * elligator_2s_inject() from a deterministically seeded generator.  Each
  * point gets the single-point twisting test; from the second point on,
  * it is also paired with its predecessor for the add/double test.
  *
  * Returns 0 on success, or the first non-zero result of a sub-test. */
 int test_pointops () {
    struct affine_t base, pbase;
    struct p448_t ser448;
    struct crandom_state_t crand;
    int i, ret;

    crandom_init_from_buffer(&crand, "test_pointops random initializer");

    for (i=0; i<1000; i++) {
        uint8_t ser[56];
        mask_t succ;

        crandom_generate(&crand, ser, sizeof(ser));

        /* TODO: we need a p448 generate, which can return random or pathological. */
        succ = p448_deserialize(&ser448, ser);
        if (!succ) {
            youfail();
            printf("   Unlikely: fail at p448_deserialize\n");
            return -1;
        }

        /* Keep the previous point around before generating the next one. */
        if (i != 0) {
            copy_affine(&pbase, &base);
        }
        elligator_2s_inject(&base, &ser448);

        ret = (i != 0) ? add_double_test(&base, &pbase) : 0;
        if (!ret) {
            ret = single_twisting_test(&base);
        }
        if (ret) return ret;
    }

    return 0;
 }
--- a/test/test_scalarmul.c
+++ b/test/test_scalarmul.c
@@ -0,0 +1,289 @@
 #include "test.h"

 #include <stdio.h>

 #include "scalarmul.h"
 #include "ec_point.h"
 #include "p448.h"
 #include "crandom.h"

 /*
  * Cross-checks the scalar multiplication implementations against each other
  * for one (base, scalar, nbits) input.
  *
  * The Montgomery ladder result is the reference.  It is compared with:
  *   - the fixed-base comb for each (n,t,s) entry of params[],
  *   - the precomputed-wNAF multiply for table parameters 0..5,
  *   - when nbits == 448, scalarmul(), scalarmul_vlook() and scalarmul_vt().
  * Each non-ladder result is passed through untwist_and_double_and_serialize()
  * before the comparison.
  *
  * base    field element given to both deserialize_and_twist_approx() and
  *         montgomery_ladder().
  * scalar  scalar words; (nbits+WORD_BITS-1)/WORD_BITS words are printed in
  *         diagnostics.
  * nbits   bit length passed to the scalar multiplication routines.
  *
  * Returns:
  *    0 = succeed
  *    1 = inval (deserialization and the ladder both rejected the base)
  *   -1 = fail  (the two disagree on acceptance, a precomputation or comb
  *               multiply reports failure, or any output differs from the
  *               ladder's)
  */
 static int
 single_scalarmul_compatibility_test (
    const struct p448_t *base,
    const word_t *scalar,
    int nbits
 ) {
    struct tw_extensible_t text, work;
    struct p448_t mont, ct, vl, vt;
    
    int ret = 0, i;
    mask_t succ, succm;
    
    const struct p448_t
    sqrt_d_minus_1 = {{
        U58LE(0xd2e21836749f46),
        U58LE(0x888db42b4f0179),
        U58LE(0x5a189aabdeea38),
        U58LE(0x51e65ca6f14c06),
        U58LE(0xa49f7b424d9770),
        U58LE(0xdcac4628c5f656),
        U58LE(0x49443b8748734a),
        U58LE(0x12fec0c0b25b7a)
    }};
    
    succ = deserialize_and_twist_approx(&text, &sqrt_d_minus_1, base);
    
    succm = montgomery_ladder(&mont,base,scalar,nbits,1);
    
    /* Both entry points must agree on whether the base is acceptable. */
    if (succ != succm) {
        youfail();
        printf("    Deserialize_and_twist_approx succ=%d, montgomery_ladder succ=%d\n",
            (int)-succ, (int)-succm);
        printf("    nbits = %d\n", nbits);
        p448_print("    base", base);
        scalar_print("    scal", scalar, (nbits+WORD_BITS-1)/WORD_BITS);
        return -1;
    }
    
    /* Consistently rejected: nothing to compare, let the caller retry. */
    if (!succ) {
        return 1;
    }
    
    struct { int n,t,s; } params[] = {{5,5,18},{3,5,30},{4,4,28},{1,2,224}};
    const int nparams = sizeof(params)/sizeof(params[0]);
    struct fixed_base_table_t fbt;
    struct p448_t fbout[nparams], wout[6];
    /* Zero the outputs so that a skipped implementation prints as zeros. */
    memset(&fbt, 0, sizeof(fbt));
    memset(&fbout, 0, sizeof(fbout));
    memset(&wout, 0, sizeof(wout));
        
    /* compute using combs */
    for (i=0; i<nparams; i++) {
        int n=params[i].n, t=params[i].t, s=params[i].s;
        succ = precompute_fixed_base(&fbt, &text, n, t, s, NULL);
        if (!succ) {
            youfail();
            printf("    Failed to precompute_fixed_base(%d,%d,%d)\n", n, t, s);
            /* Record the failure: the zeroed output could otherwise happen
             * to match the ladder and let the function return success. */
            ret = -1;
            continue;
        }
        
        succ = scalarmul_fixed_base(&work, scalar, nbits, &fbt);
        destroy_fixed_base(&fbt);
        if (!succ) {
            youfail();
            printf("    Failed to scalarmul_fixed_base(%d,%d,%d)\n", n, t, s);
            ret = -1;
            continue;
        }
        
        untwist_and_double_and_serialize(&fbout[i], &work);
    }
    
    /* compute using precomp wNAF */
    for (i=0; i<=5; i++) {
        struct tw_niels_t pre[1<<i];
        
        succ = precompute_fixed_base_wnaf(pre, &text, i);
        if (!succ) {
            youfail();
            printf("    Failed to precompute_fixed_base_wnaf(%d)\n", i);
            ret = -1;
            continue;
        }
        
        scalarmul_fixed_base_wnaf_vt(&work, scalar, nbits, pre, i);
        
        untwist_and_double_and_serialize(&wout[i], &work);
    }
    
    mask_t consistent = MASK_SUCCESS;
    
    if (nbits == 448) {
        /* window methods currently only work on 448 bits. */
        copy_tw_extensible(&work, &text);
        scalarmul(&work, scalar);
        untwist_and_double_and_serialize(&ct, &work);
        
        copy_tw_extensible(&work, &text);
        scalarmul_vlook(&work, scalar);
        untwist_and_double_and_serialize(&vl, &work);
        
        copy_tw_extensible(&work, &text);
        scalarmul_vt(&work, scalar);
        untwist_and_double_and_serialize(&vt, &work);
        
    
        /* check consistency mont vs window */
        consistent &= p448_eq(&mont, &ct);
        consistent &= p448_eq(&mont, &vl);
        consistent &= p448_eq(&mont, &vt);
    }
    
    /* check consistency mont vs combs */
    for (i=0; i<nparams; i++) {
        consistent &= p448_eq(&mont,&fbout[i]);
    }
    
    /* check consistency mont vs wNAF */
    for (i=0; i<6; i++) {
        consistent &= p448_eq(&mont,&wout[i]);
    }
    
    /* If inconsistent, complain. */
    if (!consistent) {
        youfail();
        printf("    Failed scalarmul consistency test with nbits=%d.\n",nbits);
        p448_print("    base", base);
        scalar_print("    scal", scalar, (nbits+WORD_BITS-1)/WORD_BITS);
        p448_print("    mont", &mont);
        
        for (i=0; i<nparams; i++) {
            printf("    With n=%d, t=%d, s=%d:\n", params[i].n, params[i].t, params[i].s);
            p448_print("    out ", &fbout[i]);
        }
        
        for (i=0; i<6; i++) {
            printf("    With w=%d:\n",i);
            p448_print("    wNAF", &wout[i]);
        }
        
    
        /* ct, vl and vt are only written on the 448-bit path. */
        if (nbits == 448) {
            p448_print("    ct ", &ct);
            p448_print("    vl ", &vl);
            p448_print("    vt ", &vt);
        }
        
        ret = -1;
    }
    
    return ret;
 }

 /*
  * Checks that two Montgomery-ladder multiplications commute for one input:
  * applying scalar1 then scalar2 to base must give the same serialized result
  * as applying scalar2 then scalar1.
  *
  * base             starting field element.
  * scalar1, nbits1  first scalar and the bit length passed to the ladder.
  * ned1             forwarded unchanged as the last montgomery_ladder()
  *                  argument for scalar1 (its meaning is defined there).
  * scalar2, nbits2, ned2   likewise for the second scalar.
  *
  * Returns:
  *    0 = succeed
  *    1 = inval (both orders were rejected by the ladder)
  *   -1 = fail  (the orders disagree on acceptance or on the result)
  */
 static int
 single_scalarmul_commutativity_test (
    const struct p448_t *base,
    const word_t *scalar1,
    int nbits1,
    int ned1,
    const word_t *scalar2,
    int nbits2,
    int ned2
 ) {
    struct p448_t m12, m21, tmp1, tmp2;
    
    /* Order 1: scalar1 first, then scalar2. */
    mask_t succ12a = montgomery_ladder(&tmp1,base,scalar1,nbits1,ned1);
    mask_t succ12b = montgomery_ladder(&m12,&tmp1,scalar2,nbits2,ned2);
    
    /* Order 2: scalar2 first, then scalar1. */
    mask_t succ21a = montgomery_ladder(&tmp2,base,scalar2,nbits2,ned2);
    mask_t succ21b = montgomery_ladder(&m21,&tmp2,scalar1,nbits1,ned1);
    
    mask_t succ12 = succ12a & succ12b, succ21 = succ21a & succ21b;
    
    if (succ12 != succ21) {
        /* One order was accepted and the other rejected. */
        youfail();
        printf("    Failed scalarmul commutativity test with (nbits,ned) = (%d,%d), (%d,%d).\n",
            nbits1,ned1,nbits2,ned2);
        p448_print("    base", base);
        p448_print("    tmp1", &tmp1);
        p448_print("    tmp2", &tmp2);
        scalar_print("    sca1", scalar1, (nbits1+WORD_BITS-1)/WORD_BITS);
        /* scalar2's printed length comes from its own bit count. */
        scalar_print("    sca2", scalar2, (nbits2+WORD_BITS-1)/WORD_BITS);
        printf("    good = ((%d,%d),(%d,%d))\n", (int)-succ12a,
            (int)-succ12b, (int)-succ21a, (int)-succ21b);
        return -1;
    } else if (!succ12) {
        /* Both orders rejected: not a failure, the caller tries new input. */
        return 1;
    }
    
    mask_t consistent = p448_eq(&m12,&m21);
    if (consistent) {
        return 0;
    } else {
        youfail();
        printf("    Failed scalarmul commutativity test with (nbits,ned) = (%d,%d), (%d,%d).\n",
            nbits1,ned1,nbits2,ned2);
        p448_print("    base", base);
        scalar_print("    sca1", scalar1, (nbits1+WORD_BITS-1)/WORD_BITS);
        scalar_print("    sca2", scalar2, (nbits2+WORD_BITS-1)/WORD_BITS);
        p448_print("    m12 ", &m12);
        p448_print("    m21 ", &m21);
        return -1;
    }
 }

 int test_scalarmul_commutativity () {
    int i,j,k,got;
    
    struct crandom_state_t crand;
    crandom_init_from_buffer(&crand, "scalarmul_commutativity_test RNG");
    
    for (i=0; i<=448; i+=7) {
        for (j=0; j<=448; j+=7) {
            got = 0;
            
            for (k=0; k<128 && !got; k++) {
                uint8_t ser[56];
                word_t scalar1[7], scalar2[7];
                crandom_generate(&crand, ser, sizeof(ser));
                crandom_generate(&crand, (uint8_t *)scalar1, sizeof(scalar1));
                crandom_generate(&crand, (uint8_t *)scalar2, sizeof(scalar2));
            
                p448_t base;
                mask_t succ = p448_deserialize(&base, ser);
                if (!succ) continue;
            
                int ret = single_scalarmul_commutativity_test (&base, scalar1, i, i%3, scalar2, j, j%3);
                got = !ret;
                if (ret == -1) return -1;
            }

            if (!got) {
                youfail();
                printf("    Unlikely: rejected 128 scalars in a row.\n");
                return -1;
            }
            
        }
    }
    
    return 0;
 }

 int test_scalarmul_compatibility () {
    int i,j,k,got;
    
    struct crandom_state_t crand;
    crandom_init_from_buffer(&crand, "scalarmul_compatibility_test RNG");
    
    for (i=0; i<=448; i+=7) {
        for (j=0; j<=20; j++) {
            got = 0;
            
            for (k=0; k<128 && !got; k++) {
                uint8_t ser[56];
                word_t scalar[7];
                crandom_generate(&crand, ser, sizeof(ser));
                crandom_generate(&crand, (uint8_t *)scalar, sizeof(scalar));
            
                p448_t base;
                mask_t succ = p448_deserialize(&base, ser);
                if (!succ) continue;
            
                int ret = single_scalarmul_compatibility_test (&base, scalar, i);
                got = !ret;
                if (ret == -1) return -1;
            }

            if (!got) {
                youfail();
                printf("    Unlikely: rejected 128 scalars in a row.\n");
                return -1;
            }
            
        }
    }
    
    return 0;
 }
--- a/test/test_sha512.c
+++ b/test/test_sha512.c
@@ -0,0 +1,270 @@
 #include "test.h"

 #include <stdio.h>
 #include <string.h>

 #include "sha512.h"



 static int sha512_monte_carlo_core (
    const char *seed,
    const char *checks[100]
 ) { 
    struct sha512_ctx_t sha;
    sha512_init(&sha);
    
    unsigned char md0[64],md1[64],md2[64];
    
    int ret = hexdecode(md0,seed,64);
    if (ret) {
        youfail();
        printf("    SHA-512 NIST Monte Carlo validation seed hex decode failure.\n");
        return -1;
    }
    
    int i,j;

    memcpy(md1,md0,sizeof(md1));
    memcpy(md2,md0,sizeof(md1));
    
    for (j=0; j<100; j++) {
        
        for (i=3; i<1003; i++) {
            sha512_update(&sha,md0,sizeof(md0));
            sha512_update(&sha,md1,sizeof(md1));
            sha512_update(&sha,md2,sizeof(md2));
            memcpy(md0,md1,sizeof(md1));
            memcpy(md1,md2,sizeof(md1));
            sha512_final(&sha,md2);
        }
        
        ret = hexdecode(md0,checks[j],64);
        if (ret) {
            youfail();
            printf("    SHA-512 NIST Monte Carlo validation hex decode failure at iteration %d\n", j);
            return -1;
        } else if (memcmp(md0,md2,sizeof(md2))) {
            youfail();
            printf("    SHA-512 NIST Monte Carlo validation failure at iteration %d\n", j);
            hexprint("    Expected", md0, 64);
            hexprint("    But got ", md2, 64);
            return j+1;
        }
        
        memcpy(md0,md2,sizeof(md1));
        memcpy(md1,md2,sizeof(md1));
    }
    
    return 0;
 }

 /*
  * SHA-512 Monte Carlo test entry point.
  *
  * Holds the seed and the 100 expected checkpoint digests as hex strings and
  * passes them to sha512_monte_carlo_core(), whose return value is returned
  * unchanged.
  *
  * NOTE(review): the vectors are presumably taken from the NIST SHA-512
  * Monte Carlo test file -- confirm the source before editing any of them.
  */
 int test_sha512_monte_carlo() {
    /* 64-byte seed: 128 hex digits, split across two string literals. */
    const char *seed =
        "5c337de5caf35d18ed90b5cddfce001ca1b8ee8602f367e7c24ccca6f893802f"
        "b1aca7a3dae32dcd60800a59959bc540d63237876b799229ae71a2526fbc52cd";
    /* One expected digest per checkpoint; each entry is two adjacent string
     * literals forming 128 hex digits. */
    const char *checks[100] = {
        "ada69add0071b794463c8806a177326735fa624b68ab7bcab2388b9276c036e4"
        "eaaff87333e83c81c0bca0359d4aeebcbcfd314c0630e0c2af68c1fb19cc470e",
        "ef219b37c24ae507a2b2b26d1add51b31fb5327eb8c3b19b882fe38049433dbe"
        "ccd63b3d5b99ba2398920bcefb8aca98cd28a1ee5d2aaf139ce58a15d71b06b4",
        "c3d5087a62db0e5c6f5755c417f69037308cbce0e54519ea5be8171496cc6d18"
        "023ba15768153cfd74c7e7dc103227e9eed4b0f82233362b2a7b1a2cbcda9daf",
        "bb3a58f71148116e377505461d65d6c89906481fedfbcfe481b7aa8ceb977d25"
        "2b3fe21bfff6e7fbf7575ceecf5936bd635e1cf52698c36ef6908ddbd5b6ae05",
        "b68f0cd2d63566b3934a50666dec6d62ca1db98e49d7733084c1f86d91a8a08c"
        "756fa7ece815e20930dd7cb66351bad8c087c2f94e8757cb98e7f4b86b21a8a8",
        "937d7856a82a84c163c79417d0540c47daaf9ffe662c843737dbbcbe5f865bf6"
        "f47a9d2bd10129a4f498073094653c324a2519a1c71ac1279b1623ff7d24647a",
        "f8fbc058c2b9f84131c9decfa543a35ade41581f670398efd61b3abfced9c1cf"
        "cb5324f2370487f9c59a65bc668ea596c8d22ce8a33014dfad28357fa7d05f04",
        "4ab0c9484ff5c30fa64ae6e81510c5fea566eafb88f175f8bc19109f40fe8001"
        "4c8b77fff10b8750778429bf3c5497e4cb92d9b30014f4cb975dff2a45244c28",
        "685179397554d276513d630234a03419808c698abf2600d7490aabb8e455c6ab"
        "6ea412c7729dc140a79dff66533c6946cbe90f9da9ed16e2e629db1651bea870",
        "335e6e941ab7dadfecdb74ea6cb4e8584b6e3408841a33a6cf7fd6a63294b193"
        "0a60983240311672acac3840a90e64cc366ce75081b2252627e9c31197ebad03",
        "e3217f6af6e279e9445dc3738cbf9ba0e9edba0455844a73648139777afdea2c"
        "4d8032e214f541bf92675fb23f24df8e4fe98e0003aadfb6d8f9cc2cd799bbf7",
        "ee2fdfb3ae630613b7d890977cf2515deac272a37f27e4a01961ecf103d4ff5b"
        "45cc8aef53b635dd75aa51aabf71c0642555ccd3281e0388f8ca09d83258cf30",
        "6a30d97cc98af6a25b673dce7aeab8d762bf2e55ea0c6dc899179281f84dd02a"
        "2896f77e9c106b472f55f7adbef7b1157be567ee1236ebdac2a3c5d8cb133eb5",
        "ac1176abdc5f71170183d92ae55856221b0d95590af11d9d72ba605ec026bbec"
        "52d6974bc43a1efb125ff2b161fbdc616fda00f04193a0bc26aacdfa052a5741",
        "59fa909480620ecc08d34531a6da1b55158b74fc93ddf68e1d242615b6f3843a"
        "7952e63e798c6445cde1b07e0be09d0d711cb7b42a0e7760a593b08acfceb63d",
        "9eb253319efa61b864f27bd334d7dd78b38d3265fb544e0c8edee950a547e1d8"
        "db921a285774ab94d66beae933298d20f2a5aa87c62fe1e383cc3b18e7af18ac",
        "81735324005671f7bdad9e685ee8257f5e0622b9fcb5d38dbdfb2df27258c3e1"
        "d46d76e24c0c92c744e1b50a2b4b0d31525b3af83cc80a75722d921bdeef59c4",
        "17498cdff4323bb8021e44eca6559e05d8ff9a0ef2ee9d4ba0ac6e73f83972a0"
        "dfbb6d47728fa70311d7c82e154966e1b7678263b0f65133e9116969193d429b",
        "228c4574d7c45eb9ba9240722133fce74abe00c7328ab30b4bde373dc79afdd6"
        "e0569d36268cd5eaa2f27205fc00512577bcbb6699e1d66ed85eafaba7548afb",
        "3d40ccd9cc445bbecca9227c67fe455d89e0b7c1c858d32f30e2b544ca9a5a60"
        "6535aea2e59fec6ec4d1ba898cc4338c6eadef9c0884bcf56aca2f481a2d7d3e",
        "e1e577aeac92e3a2b7f8a262bf2ac9c037d2274ca6618fbe4cc21db7c699e994"
        "6b6671ae45ea433a1e392a5bc9eec96fd641ba8f4a047f022a04a337227004df",
        "5e4424c0bcb2f0f7a2428821a9d5840a82401f4440ae6bed25c53cd9e71cf9d3"
        "9904d6a375bd721f4332ab0202529c91feb9c094c3e6d34ca4f66649ee6fa212",
        "56b199d63ca37189d5ca0d40006ac7bcb9f39cbdc00ef7b8a5697caa7d81d05b"
        "645a146995b1151d01958f1589337e14afc6e7dd10a815170e527a398e6ce8c3",
        "d2d498ff93fb03013a64f295b5bc68e57d2fb5600da578aa011d43ff432eae3e"
        "0c800f9e2a53155e56fdbf5e068fe2b4beb3e42b2585531b8b16c4d8ca3356c6",
        "3d3875489903710f17cf4247b5842ace6f017b1a3b99e9ee5fbc04fc7898e78b"
        "12693879878028ca40c63cd0f6925fb7d0ca0412e4f06619e3ace223690f03b8",
        "a013e21cd1234483c95c2ea2757be949bc79401ba39b09c316a1612d594642be"
        "65ca106e12695ac3808c57c6f2980e895fd1fe188946562afc238414e1e43649",
        "c5f6367d7195489e16242f912fbe0d8002e947de3a7e9c53f77b1e5e90e05bd7"
        "ca395e787e34cb5f500c02da59c9d83de35601de7ae80dae74a0d6b4a292d43b",
        "7c28c44c6aaba83c122f24d68273e28a5afd65b4071d02b7ea3300478d511897"
        "1e1356ae57cbc70d2a177ea464a1c2c50d4297b933e789c63b1481797ae8f08c",
        "af7cb42b1c70a85ac1ae1c2991b25b657c19f4fcf83af7f7dc0ae1028c1452a6"
        "a17dc98929634fe6ed3855b70b96bc2caa93d82037b94ebeddc77e4c1a7cc563",
        "bd56ad4c0cbd162706053da929d667253aadcf417affb483fff4f2699bf406d1"
        "28cfdf5196dfbb05bb89ccbf04c5147bd2ebb3156b0bc1768ca6faa171c91c01",
        "004d7b0fff9bcddf4b3913ae190a76728705a3d23874d92a8b7ff246c8fcad46"
        "623cb04723c8aded0cba4968d1a8cc1375b99005786c1bcb7ae4bf13325c3ae0",
        "8299a5bf5ed64f525c4eebbeca969fc1b91a81adb58c584bdd2d7676386a31fa"
        "546643a3cf505007584f02fb712d708cab645bf078a1b9339f5a76aee985d017",
        "ce7100f3455db1a9776a9f40d562ea998afca1f9fee7e0d81c8db34cf68ad23a"
        "8bfa6fc04774703e1e56d5196b66966158fcf2a8335a58c6ba7ba1af756ba1dc",
        "90aaabcb655ee921b8350229efe6064a60051cf0cac858fa3d43afd5b97cc823"
        "01bd1b8cc1f874022e5af948185638783a13ca1bbd5049ace7fbf4f6d90c201f",
        "3cf0a25b33ded3e0806dfe603b9987f1d6f2b3fdcb1ec7f8566828c00e17e8f5"
        "9e38b3bca302396c7525ca194e6cc8501369059e2e34ae21e3141215876847c4",
        "bdc5266aee339a1ff13fcf5229773cd3d14b47101e83076927c160bb71bf7445"
        "590525a2012d52af008e118e16df1b6bfcaf8f22b4e45f9e749f3c20625a2bc8",
        "ef8d2ba885381ab97756d59dbbbf53a1ea35d152b2d8f82c3518430aa34e7083"
        "59194ea43950d032e151f576d343a5c3cfe6b71d4ed0ead9d3a107402589bad0",
        "194ea5324c4179998dd7057755f255fdea04dadf533f7851e3e9718b610948e3"
        "2fd28323077d9421142ac808978adfa325b668c8599a2e01c757a5a14ed2dd37",
        "106984d2f0087e621dae760552bc6279072267883c204079481af6034354f1a2"
        "b77c17e6c039a1063e479342aa3ccd90330dd3fb5a7d5e976619497e2d3326cd",
        "a1347216f1a6db47b90c4ded3c5c75440f54c22c87d538314d1340f86f88acba"
        "01378acb933ddad0adc6b75d55bfb7e8efc9c4a531b2a410610b7515b6dac66a",
        "b76e4db147e0eaa4f04880654088b9d0fce518c8c377d92c846345604dc6b2b1"
        "8d377fdb8e30f06d9bcfe6d7dacc07d6adff73d98d49f8f132b80f3084390830",
        "acd4e527763dfd4513f0def0b1edf8ea12dc78d336b7b796f3dcc32e10687254"
        "43a2f55ab4f666b27d6bf2ab39669c98293f0a9108051fd3144d31a1ed171ddd",
        "10128c15494bc87a87374f676ef9fe2df20b36ffcca41a80bd40b216637b3de7"
        "10efd070e277827820a7bba3cceb7b21f8fe7f9775d6c4df4d3da5349434ec49",
        "2632dd5c188c6ed3a4610405fdda704add752f5424d9de65a51400fe478e26cd"
        "0412e5f91ca4b744c34f4954f40a3a4254431d21954623208b527b7b4daa687e",
        "45707f5b6fc5ccd1f78d77f177d10fb8b462c74cc821518cd5cfa4b5d6b40b41"
        "8044900693c37abbb82367d340fec67f800d74072935da1706b4d90ae26099c7",
        "56c37f31220b5b3040373d91b2c5e42fe9e601a12f7f8dc4534459bf28e484b8"
        "713db243c5782c031e674003a3c14c42fd152e7188789065e82795e10f87d54b",
        "5da94c899d48bd8299fee3d81662f8d6c5f8f8bc54d18cb0368b13cebaee7ad7"
        "1e74ea80f34974ad166f04f9a0602809166fe4085a475a8ca86cade12b6754c4",
        "0664363f97ba910760b0922e31ca880ca97469506cb007e3108c36c3ce3ce180"
        "1fb4197609479339e8820632b6a38bffffee05a9adc11cc544b9aa6f5b95cc6f",
        "732c41a1edaa727c04f627ff158aaff67c18efd667216132b99ab84d108996a1"
        "0bb008b5d803b22ed1aa78bb0d10f8a762fd34777d7dccce8e84827ba88d4193",
        "fc9c21d67e393a2b05a23a17d8db630cbaebaa3def211181749f1bcad1815606"
        "27fb60ee20fae2e5980cbf50fce0a19dce807e7fb75c4da0ef008bc75d413a65",
        "0453b765afc1edffa595efe345177f5805ed3abc1297ceab757ae7161723a614"
        "4cb543299f418049276d16b7896662631634fab9549127c10f27505b7dee8665",
        "3853f3bf024e0668e8d1ea53733a97537f97d9307c5f3a19864ab4eeb1654710"
        "693bb961a344dec8a758f5e64b26fcb6dd423419c4a114fa749211a9de06c281",
        "240137f0dd57beb3f7fc283bb3ead423c67883fd46f4e27471d7be57ad469a49"
        "bad03a3658418bd55614678f3a463bceff85291314b90ef43ccbcb028f0a7a07",
        "f9050a5271edbe4cfdb9520ec05bbdc3cbcb9bce36fd212338d3e7028a39b9ab"
        "30793e561d75a2e424193264c7f0775e65599ef0c94e0ad24dbfe18252364267",
        "47caa7a5862fad837aaa409a4a9df2575e645528c35159115911b7c4e2f08ae4"
        "9d68de97249b31b83ce2c163f649cad4559dc6e6a7191f2922d79a5fd6af167b",
        "13f5825c41fa49edf6104e3e35c9c224eba93e37374f730004c39c54e7391e4a"
        "847fd61865235a3fe32224c96fbe86f7e14c3d5df496e83ec989a71b4f293a44",
        "e5b55e05efe1ca6b9a96a57e3a1523d610d70f837e93b31fa98c2736d3e114d2"
        "38d46ec6b6e3d19e774b253f6b0c7a2ebe69b7e60fc0874444806b2a2278df45",
        "f14a586ac30f0af255f597a9aef9abba5e99c04d17b01f24427c4ee2c196b52a"
        "cb1ceefc9b15cb822b3ecffdc2f7c49e11d3fc0769acee33361537d379c62e0c",
        "7e2d3398807195c48e6ec52d20710bbf8b21ea8de4d1abc197897ccc58aeff40"
        "259edc67270cdae0edcc686c0d0dccc5760c1495ab1cf48482dc2000ae2d42ad",
        "2f3d5c5f990bf615d5e8b396ccbd0337da39fad09b059f955a431db76a9dc720"
        "dffc4e02c0be397c7e0463799cd75fd6ab7c52bec66c8df5ef0d47e14a4c5927",
        "483a1764d308cc494a2b543d29ba616483aefdf91c7769fd084eedaac1add189"
        "1df95d317a47430b2bf73e4081f86597020e28afe2d34a22b77ea62b6112d09a",
        "bfa88691ec951511651c6f14af100eeb26d87729e18ac3ef49a80d73ffeaeea5"
        "3e97c4a7277a7ee9f2fba070b1c9720d6cdba407dd82267019e3f0f5662b2f2b",
        "4c17c8e2e7132dbf82afebc40efc77926d16f4d2c082d846dac28733aa767e28"
        "40ebf04f2563df75933466a36e11968d342e4157827605d04d9627ce9b5216c8",
        "70bbfc29a2a765220af84e7bb10d759a3152ad4b5643ef6b89966950ec7ef950"
        "3d57bc0a28c4ee789a60bf9dcac59139e15241d73b990410cf92eff213da9eca",
        "8d1d56f37fc19b84984a6fa33aa9c2dbdbf79a29c04ad0b4cf20333e6bec9434"
        "47be2416242f8cd2f9732e79bb925cc5a61a80c5fc9c079961243fd1c1f5900e",
        "492fd0171f4dcd5d20ea6c0d34b5576c8894664ae5955e6737f5e3b711c2804d"
        "99ccca065b7ec18c82da98b18a3029b765c51ebc7c433b36492e0ed6b8511bb6",
        "7f49e8e54db7e5b4323cae2db71f3e8b8eba172dcad3602e9b7b058007a55893"
        "58732d5afffa56072a46e89b1ea27ef8d556deb86b569c635d394f15d99d8a15",
        "56884a6a9210d5f371e25823efb2511a9c410c26a441e07c1bdffe8605084267"
        "d49c315baf6a692d7d97844b2714b4930877a5d7f52cf6fa151700fcb6980546",
        "6aaef8284eef221ecb17ea3c9596f075b5155fe7b925d737ed3c6543c761c28c"
        "7cd9d9d4b5e2a37b2f183a2a367bbd34b633497bc7a1737d61c8c1f3ef295062",
        "38ef178f5688e59d47c375252db7b39f40c0c84169878ee7ba5086e4b25fea81"
        "076b9c37847e9e6bf24ae0b343689c265ec5ca7469e619acd61b0276721efb1b",
        "e3fe1aabad120777cf24eaae289b486632ca46ceb89afae73dbae5fa87c76787"
        "9369355a9cc5c21ca604ed91d0f2f58c466573f3e6d88e52c62c0d3cb188e141",
        "82f5bd920457bb2763a0da031a7fed47b236951b1ea420c20fd2b6de1dbfbb9c"
        "4600ea7092788493e2d4be6ee24b6dba04e57af3e8f2f14d9837295420ac7631",
        "6d0b26208ba9b1615067bb3ff97b292fe67e4c02d240d649c32370e0a4cd22d0"
        "3bdf864be4d24a3f5f51aeccfd1afd5191e590edeb5f7bec323b0506c3104b89",
        "d081083158054d08371ec84f4d3aa5aa761734ac6091a30330a861fda056f835"
        "c750bf4f7981af1693ff28545366bd05cec47bccd77a7d237befb0135c534138",
        "6ba8b52780b8a07a2a2015dd8f0c5e7437b8e024c4ee428f7ba91dfea118cb72"
        "a939872550983317132b841b7cbc29a22b8f1cfea0c55203cafc69b55ed6244a",
        "312692b0a51f002b7f06d05b39d15a5637dbddd2f4f1a73e6c88a4c841cdba5c"
        "d8e69c0939ab39bb1a9c54fa35402143c97edb9704a0e9e1a98701710f6a5dad",
        "aaee960de201a8dcccff95b834fccf0dafc03fe6cffc0429162bf4aff01165ab"
        "07a0c9435e9cb412121b7ba010657ccc3152118602b665072136317d92fd4262",
        "21fdff552e08c86c07f080cefacaaaf31846eb893bfe2e4f88c3c3cd8cbf592a"
        "84500942695a5e5ae971ab343ce2695dd1baeb1f94dd4b53d678e14265e421ae",
        "ca8f1a5b2172f6adb474da53b35e3f73ffd88263d3eecde72e48b16e1a065801"
        "5b555ee319005a1d82802e91431ee777610f9b1028d819921e1044ad426b0270",
        "ce5ab25eff9c1ddc569a1eaaa66b689109ee269db7066e0b02d39b3564fd14ca"
        "6249987b7791e203d3d7c2ebf18558d2f23f94c03dd1d03aa63849e4d2889a76",
        "a6f8b0561000dd4ae8b828c5f676e8c1a6474c4a042a645f1815bd52e9ff53c9"
        "7dc36d5d8997f8ce332185feead76267f5b2e63f597fb3345ca0046e58fc0f24",
        "fec86794bad4106c5ad1c1a2d9a1b7aae480396ec231eb5cac21c4077d17a0b6"
        "52da0037363399a5a1dababa4a40e4c54b9124167580dee9108c4dbb24c57512",
        "594f5dd3f4c87bdc0d81309386e9163a9718e34c7b0dcb4613f8487aa786f9d2"
        "11cfb61bb247fa9f5ecef042e710f192850f5571807294bfd8a54397850e5773",
        "d81ad866f25ef6a0a6431d267114da564513e5ebdcf48db7e95db8cf32a89f0a"
        "b107874d796035db97420ffcf1db5f04dc1a52ddbbb960fc63b7f3f835cc8be6",
        "431d537e098e9949f6a68108d55d20952e3bfcdeb7273bac3917e37790a84fa5"
        "db04c33a79c113a06cf333e831d7702a00853a93fd0aa5146d934f4f71242a6a",
        "4ed95636c6885ae4e63d042e82f4da830c702dbf3b9746d64770a64dd666b332"
        "08315f3a947c4dff790771ef283788a9c74da83e22b97f750286a820ee46698c",
        "a9bcb60b4d7724cdddddbc232b4ac70b94d0d7e9f0724b1222d918930cbb9bdb"
        "b04b3ad43e3c8caf3bf8b004ee4aec6bd527ff8eb6189b44827f7ba7057f6a90",
        "d6d5e44d5bb07fc4144ab6ab309f048968f73f7992beb326047e9e2cd7af6240"
        "bc8abf46703c32fdb58fb2a8672594a660ef855be74f24cec09d4fb00219de82",
        "dfda9ac0c7147530da97715ccf47814182255f2f2cf40287db97a4c63b43fcd3"
        "9e6d41e560921492badb253a7dea0aba863c7c33b912bb59d1ff4de03a4f03bb",
        "0395faaaf2e907f27779d6f1cc9c9db68ec390a38fbb0702c6475b46f7a39949"
        "8d46fd8014f834b131e1e83abba0359b1f16d8fc0a393580615def2ad0caba73",
        "41cb98f09029abe85d24a0f131f116c7f69f54f7e91c250642606512bf3da4ca"
        "89ba70a4714a5f66d9ae81ff09317dadaff12a02057074c970f0f02a52bfafd2",
        "8e8f161d48e306c5533ed614b8ef3a1979df6db7e13d0780a73c4a3980ddf0a9"
        "5f93941d412c93683e39915a660c3fbec0dbb1bb6beea2e2099cd968011535c0",
        "789593f0b8fb83ef9b3ec50ab8f6e1e47344f763d4f7ceab5600989e7b6fd5fe"
        "f6ee5e487975f64474af6cd71ae4d9ecce8f009edea0227c7ebe73080b8f961b",
        "f37e1449e0b313d9537a6177f7a31158d353e5b79c781facf02526ec94e0c6cf"
        "da37105bac67098b194ea82efb307c2929a9ab8aca0e76c53e829e3f901cd245",
        "2e74e745caaf2d449ab3b031dd214b48616853a512cf2e95c40cb8e7594fe5e4"
        "879ac8a26d02eb35b3b96a5c9e7dcae3e15fd050a0bcc1fb3b9cb9c4df0fad3e",
        "6eac7069c26082e52574ca6a58abb9b1b9faf452e8cca9f1c7023679ce192ca5"
        "54892f30e38104d39088a24df35612444a0fc90084af7535fd9344fa51dded84",
        "ada6caf30c4f6e3644d952366e01519af6771b406e2c447552f0c597b8dd10e9"
        "e9b4e699c9a835de03f422be8980538d9786172dfd2fe511db272a1543d5aa35",
        "4d4b0086b2cb05d713f2805caa7e6605c8f7dbbb2e0f92aa159aebdcd6306030"
        "5f47b748f1bca6e0b6e11cf8f9697fcccb6584b878c4b54a699290728a40aa1b",
        "97420b8a0ad102aeb92139da2c052d2748dd7d2dbb93a9ea79dc15b520d0ca7c"
        "ab8cb7a00f5b5aebcb49d7e7f52a27180935ce617aeecdecba04064c668edd37",
        "4aa7dad74eb51d09a6ae7735c4b795b078f51c314f14f42a0d63071e13bdc5fd"
        "9f51612e77b36d44567502a3b5eb66c609ec017e51d8df93e58d1a44f3c1e375"
    };
    
    return sha512_monte_carlo_core(seed, checks);
 }