diff --git a/Doxyfile b/Doxyfile index 3be58a0..55b844f 100644 --- a/Doxyfile +++ b/Doxyfile @@ -508,7 +508,7 @@ HIDE_SCOPE_NAMES = NO # the files that are included by a file in the documentation of that file. # The default value is: YES. -SHOW_INCLUDE_FILES = YES +SHOW_INCLUDE_FILES = NO # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader @@ -777,7 +777,7 @@ FILE_PATTERNS = # be searched for input files as well. # The default value is: NO. -RECURSIVE = NO +RECURSIVE = YES # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a diff --git a/HISTORY.txt b/HISTORY.txt index 4d0ae13..3e5f946 100644 --- a/HISTORY.txt +++ b/HISTORY.txt @@ -1,3 +1,51 @@ +March 29, 2014: + Added a test directory with various tests. Currently testing SHA512 Monte + Carlo, compatibility of the different scalarmul functions, and some + identities on EC point ops. Began moving these tests out of benchmarker. + + Added scan-build support. + + Improved some internal interfaces. Made a structure for Barrett primes + instead of passing parameters individually. Moved some field operations + to places that make more sense, eg Barrett serialize and deserialize. The + deserialize operation now checks that its argument is in [0,q). + + Added more documentation. + + Changed the names of a bunch of functions. Still not entirely consistent, + but getting more so. + + Some minor speed improvements. For example, multiply is now a couple cycles + faster. + + Added a hackish attempt at thread-safety and initialization sanity checking + in the Goldilocks top-level routines. + + Fixed some vector alignment bugs. Compiling with -O0 should now work. + + Slightly simplified recode_wnaf. + + Add a config.h file for future configuration. EXPERIMENT flags moved here. 
+ + I've decided against major changes to SHA512 for the moment. They add speed + but also significantly bloat the code, which is going to hurt L1 cache + performance. Perhaps we should link to OpenSSL if a faster SHA512 is desired. + + Reorganize the source tree into src, test; factor arch stuff into src/arch_*. + + Make most of the code 32-bit clean. There's now a 32-bit generic and 32-bit + vectorless ARM version. No NEON version yet because I don't have a test + machine (could use my phone in a pinch I guess?). The 32-bit version still + isn't heavily optimized, but on ARM it's using a nicely reworked signed/phi-adic + multiplier. The squaring is also based on this, but could really stand some + improvement. + + When passed an even exponent (or extra doubles), the Montgomery ladder should + now accept points if and only if they lie on the curve. This needs + additional testing, but it passes the zero bit exponent test. + + On 32-bit, use 8x4x14 instead of 5x5x18 table organization. Probably there's + a better heuristic. March 5, 2014: First revision. diff --git a/Makefile b/Makefile index a1c6d6e..3e03193 100644 --- a/Makefile +++ b/Makefile @@ -2,61 +2,101 @@ # Released under the MIT License. See LICENSE.txt for license information. 
CC = clang -CFLAGS = -O3 -std=c99 -pedantic -Wall -Wextra -Werror \ - -mssse3 -maes -mavx2 -DMUST_HAVE_AVX2 -mbmi2 \ - -ffunction-sections -fdata-sections -fomit-frame-pointer -fPIC \ - -DEXPERIMENT_ECDH_OBLITERATE_CT=1 -DEXPERIMENT_ECDH_STIR_IN_PUBKEYS=1 +LD = clang -.PHONY: clean all runbench todo doc +ARCH = arch_x86_64 + +WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ + -Wgcc-compat -Wmissing-declarations +INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH) +LANGFLAGS = -std=c99 +GENFLAGS = -ffunction-sections -fdata-sections -fomit-frame-pointer -fPIC +OFLAGS = -O3 +#XFLAGS = -DN_TESTS_BASE=1000 +ARCHFLAGS = -mssse3 -maes -mavx2 -DMUST_HAVE_AVX2 -mbmi2 +#ARCHFLAGS = -m32 -mcpu=cortex-a9 -mfpu=vfpv3-d16 + +CFLAGS = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XFLAGS) +LDFLAGS = $(ARCHFLAGS) +ASFLAGS = $(ARCHFLAGS) + +.PHONY: clean all test bench todo doc lib .PRECIOUS: build/%.s - + HEADERS= Makefile $(shell find . -name "*.h") build/timestamp LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \ build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o -all: bench +TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \ + build/test_pointops.o + +BENCHCOMPONENTS=build/bench.o + +all: lib build/test build/bench + +scan: clean + scan-build --use-analyzer=`which clang` \ + -enable-checker deadcode -enable-checker llvm \ + -enable-checker osx -enable-checker security -enable-checker unix \ + make build/bench build/test build/goldilocks.so + +build/bench: $(LIBCOMPONENTS) $(BENCHCOMPONENTS) + $(LD) $(LDFLAGS) -o $@ $^ + +build/test: $(LIBCOMPONENTS) $(TESTCOMPONENTS) + $(LD) $(LDFLAGS) -o $@ $^ + +lib: build/goldilocks.so + +build/goldilocks.so: $(LIBCOMPONENTS) + rm -f $@ + libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \ + -exported_symbols_list src/exported.sym \ + $(LIBCOMPONENTS) -bench: *.h *.c - $(CC) $(CFLAGS) -o $@ *.c - build/timestamp: mkdir -p 
build touch $@ build/%.o: build/%.s - $(CC) -c -o $@ $< + $(CC) $(ASFLAGS) -c -o $@ $< -build/%.s: %.c $(HEADERS) +build/%.s: src/%.c $(HEADERS) $(CC) $(CFLAGS) -S -c -o $@ $< -build/goldilocks.so: $(LIBCOMPONENTS) - rm -f $@ - libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \ - -exported_symbols_list exported.sym \ - $(LIBCOMPONENTS) +build/%.s: test/%.c $(HEADERS) + $(CC) $(CFLAGS) -S -c -o $@ $< + +build/%.s: src/$(ARCH)/%.c $(HEADERS) + $(CC) $(CFLAGS) -S -c -o $@ $< doc/timestamp: mkdir -p doc touch $@ -doc: Doxyfile doc/timestamp *.c *.h +doc: Doxyfile doc/timestamp src/*.c src/include/*.h src/$(ARCH)/*.c src/$(ARCH)/*.h doxygen todo:: - @egrep --color=auto -w -i 'hack|todo|fixme|bug|xxx|perf|future|remove' *.h *.c + @(find * -name '*.h'; find * -name '*.c') | xargs egrep --color=auto -w \ + 'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE' @echo '=============================' @(for i in FIXME BUG XXX TODO HACK PERF FUTURE REMOVE; do \ - egrep -w -i $$i *.h *.c > /dev/null || continue; \ + (find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i > /dev/null || continue; \ /bin/echo -n $$i' ' | head -c 10; \ - egrep -w -i $$i *.h *.c | wc -l; \ + (find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i| wc -l; \ done) @echo '=============================' @echo -n 'Total ' - @egrep -w -i 'hack|todo|fixme|bug|xxx|perf|future|remove' *.h *.c | wc -l + @(find * -name '*.h'; find * -name '*.c') | xargs egrep -w \ + 'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE' | wc -l + +bench: build/bench + ./$< -runbench: bench +test: build/test ./$< clean: - rm -fr build bench *.o *.s + rm -fr build doc diff --git a/TODO.txt b/TODO.txt index fb0fc96..e1d05f2 100644 --- a/TODO.txt +++ b/TODO.txt @@ -23,7 +23,7 @@ Important work items for Ed448-Goldilocks: * Word_t, mask_t, bigregister_t, etc. * Generate asm intrinsics with a script? -* Bugfix: make sure that init() and randomization are thread-safe. 
+* [DONE] Bugfix: make sure that init() and randomization are thread-safe. * Security: check on deserialization that points are < p. * Check also that they're nonzero or otherwise non-pathological? @@ -80,30 +80,29 @@ Important work items for Ed448-Goldilocks: * Portability: make the inner layers of the code 32-bit clean. * Write new versions of the field code. * 28-bit limbs give less headroom for carries. - * NEON and vectorless ARM. + * Now have a vectorless ARM version; need NEON. + * Improve speed of 32-bit field code. * Run through the SAGE tool to generate new bias & bound. -* Portability: make the outer layers of the code 32-bit clean. - * There are endian bugs in the signing algorithm. - * NEON and vectorless constant-time comparison. +* [DONE] Portability: make the outer layers of the code 32-bit clean. -* Performance: write and incorporate some extra routines - * Deserialize_and_isogeny - * Unconditional negate (or just plain subtract) - -* Performance: fixed parameters? +* Performance/flexibility: decide which parameters should be hard-coded. * Perhaps useful for comb precomputation. * Performance: Improve SHA512. - * Improve portability. + * [DONE?] Improve portability. * Improve speed. + * Except not, because this adds too much code size. + * Link OpenSSL if a fast SHA is desired. + +* Protocol: * Decide what things to stir into hashes for various functions. * Performance: improve the Barrett field code. * Support other primes? * Capture prime shape into a struct instead of passing 3 params. - * Make 32-bit clean. (SAGE?) + * [DONE] Make 32-bit clean. * Automation: * Improve the SAGE tool to cover more cases @@ -111,6 +110,10 @@ Important work items for Ed448-Goldilocks: * Constant-time selection * Intrinsics code * Field code? + + * SAGE tool is impossibly slow on 32-bit + * Currently stuck on Elligator after 19 hours. + * [FIXED] at least for now. 
 * Vector-mul-chains * Negation "bubble pushing" optimization diff --git a/include/goldilocks.h b/include/goldilocks.h new file mode 100644 index 0000000..7476a6c --- /dev/null +++ b/include/goldilocks.h @@ -0,0 +1,210 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +/** + * @file goldilocks.h + * @author Mike Hamburg + * @brief Goldilocks high-level functions. + */ +#ifndef __GOLDILOCKS_H__ +#define __GOLDILOCKS_H__ 1 + +#include <stdint.h> + +/** + * @brief Serialized form of a Goldilocks public key. + * + * @warning This isn't even my final form! + */ +struct goldilocks_public_key_t { + uint8_t opaque[56]; /**< Serialized data. */ +}; + +/** + * @brief Serialized form of a Goldilocks private key. + * + * Contains 56 bytes of actual private key, 56 bytes of + * public key, and 32 bytes of symmetric key for randomization. + * + * @warning This isn't even my final form! + */ +struct goldilocks_private_key_t { + uint8_t opaque[144]; /**< Serialized data. */ +}; + +#ifdef __cplusplus +extern "C" { +#endif + +/** @brief No error. */ +static const int GOLDI_EOK = 0; + +/** @brief Error: your key or other state is corrupt. */ +static const int GOLDI_ECORRUPT = 44801; + +/** @brief Error: other party's key is corrupt. */ +static const int GOLDI_EINVAL = 44802; + +/** @brief Error: not enough entropy. */ +static const int GOLDI_ENODICE = 44804; + +/** @brief Error: you need to initialize the library first. */ +static const int GOLDI_EUNINIT = 44805; + +/** @brief Error: called init() but we are already initialized. */ +static const int GOLDI_EALREADYINIT = 44805; + +/** + * @brief Initialize Goldilocks' precomputed tables and + * random number generator. This function must be called before + * any of the other Goldilocks routines (except + * goldilocks_shared_secret in the current version) and should be + * called only once per process. + * + * There is currently no way to tear down this state.
It is possible + * that a future version of this library will not require this function. + * + * @retval GOLDI_EOK Success. + * @retval GOLDI_EALREADYINIT Already initialized. + * @retval GOLDI_ECORRUPT Memory is corrupted, or another thread is already init'ing. + * @retval Nonzero An error occurred. + */ +int +goldilocks_init () +__attribute__((warn_unused_result)); + + +/** + * @brief Generate a new random keypair. + * @param [out] privkey The generated private key. + * @param [out] pubkey The generated public key. + * + * @warning This isn't even my final form! + * + * @retval GOLDI_EOK Success. + * @retval GOLDI_ENODICE Insufficient entropy. + * @retval GOLDI_EUNINIT You must call goldilocks_init() first. + */ +int +goldilocks_keygen ( + struct goldilocks_private_key_t *privkey, + struct goldilocks_public_key_t *pubkey +) __attribute__((warn_unused_result,nonnull(1,2))); + +/** + * @brief Extract the public key from a private key. + * + * This is essentially a memcpy from the public part of the privkey. + * + * @param [out] pubkey The extracted public key. + * @param [in] privkey The private key. + * + * @retval GOLDI_EOK Success. + * @retval GOLDI_ECORRUPT The private key is corrupt. + */ +int +goldilocks_private_to_public ( + struct goldilocks_public_key_t *pubkey, + const struct goldilocks_private_key_t *privkey +) __attribute__((nonnull(1,2))); + +/** + * @brief Generate a Diffie-Hellman shared secret in constant time. + * + * This function uses some compile-time flags whose merit remains to + * be decided. + * + * If the flag EXPERIMENT_ECDH_OBLITERATE_CT is set, prepend 40 bytes + * of zeros to the secret before hashing. In the case that the other + * party's key is detectably corrupt, instead the symmetric part + * of the secret key is used to produce a pseudorandom value. + * + * If EXPERIMENT_ECDH_STIR_IN_PUBKEYS is set, the sum and product of + * the two parties' public keys is prepended to the hash.
+ * + * In the current version, this function can safely be run even without + * goldilocks_init(). But this property is not guaranteed for future + * versions, so call it anyway. + * + * @warning This isn't even my final form! + * + * @param [out] shared The shared secret established with the other party. + * @param [in] my_privkey My private key. + * @param [in] your_pubkey The other party's public key. + * + * @retval GOLDI_EOK Success. + * @retval GOLDI_ECORRUPT My key is corrupt. + * @retval GOLDI_EINVAL The other party's key is corrupt. + * @retval GOLDI_EUNINIT You must call goldilocks_init() first. + */ +int +goldilocks_shared_secret ( + uint8_t shared[64], + const struct goldilocks_private_key_t *my_privkey, + const struct goldilocks_public_key_t *your_pubkey +) __attribute__((warn_unused_result,nonnull(1,2,3))); + +/** + * @brief Sign a message. + * + * The signature is deterministic, using the symmetric secret found in the + * secret key to form a nonce. + * + * The technique used in signing is a modified Schnorr system, like EdDSA. + * + * @warning This isn't even my final form! + * + * @param [out] signature_out Space for the output signature. + * @param [in] message The message to be signed. + * @param [in] message_len The length of the message to be signed. + * @param [in] privkey My private key. + * + * @retval GOLDI_EOK Success. + * @retval GOLDI_ECORRUPT My key is corrupt. + * @retval GOLDI_EUNINIT You must call goldilocks_init() first. + */ +int +goldilocks_sign ( + uint8_t signature_out[56*2], + const uint8_t *message, + uint64_t message_len, + const struct goldilocks_private_key_t *privkey +) __attribute__((nonnull(1,2,4))); + +/** + * @brief Verify a signature. + * + * This function is fairly strict. It will correctly detect when + * the signature has the wrong cofactor component, or when the sig + * values aren't less than p or q. + * + * Currently this function does not detect when the public key is weird, + * eg 0, has cofactor, etc. 
As a result, a party with a bogus public + * key could create signatures that succeed on some systems and fail on + * others. + * + * @warning This isn't even my final form! + * + * @param [in] signature The signature. + * @param [in] message The message to be verified. + * @param [in] message_len The length of the message to be verified. + * @param [in] pubkey The signer's public key. + * + * @retval GOLDI_EOK Success. + * @retval GOLDI_EINVAL The public key or signature is corrupt. + * @retval GOLDI_EUNINIT You must call goldilocks_init() first. + */ +int +goldilocks_verify ( + const uint8_t signature[56*2], + const uint8_t *message, + uint64_t message_len, + const struct goldilocks_public_key_t *pubkey +) __attribute__((warn_unused_result,nonnull(1,2,4))); + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __GOLDILOCKS_H__ */ diff --git a/src/arch_32/ec_point.c b/src/arch_32/ec_point.c new file mode 100644 index 0000000..823e43d --- /dev/null +++ b/src/arch_32/ec_point.c @@ -0,0 +1,959 @@ +/** + * @cond internal + * @file ec_point.c + * @copyright + * Copyright (c) 2014 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. + * @author Mike Hamburg + * @warning This file was automatically generated. 
+ */ + +#include "ec_point.h" + + +void +p448_isr ( + struct p448_t* a, + const struct p448_t* x +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L1, x ); + p448_mul ( &L2, x, &L1 ); + p448_sqr ( &L1, &L2 ); + p448_mul ( &L2, x, &L1 ); + p448_sqrn ( &L1, &L2, 3 ); + p448_mul ( &L0, &L2, &L1 ); + p448_sqrn ( &L1, &L0, 3 ); + p448_mul ( &L0, &L2, &L1 ); + p448_sqrn ( &L2, &L0, 9 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sqr ( &L0, &L1 ); + p448_mul ( &L2, x, &L0 ); + p448_sqrn ( &L0, &L2, 18 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqrn ( &L0, &L2, 37 ); + p448_mul ( &L1, &L2, &L0 ); + p448_sqrn ( &L0, &L1, 37 ); + p448_mul ( &L1, &L2, &L0 ); + p448_sqrn ( &L0, &L1, 111 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqr ( &L0, &L2 ); + p448_mul ( &L1, x, &L0 ); + p448_sqrn ( &L0, &L1, 223 ); + p448_mul ( a, &L2, &L0 ); +} + +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +) { + struct p448_t L0, L1; + p448_isr ( &L0, x ); + p448_sqr ( &L1, &L0 ); + p448_sqr ( &L0, &L1 ); + p448_mul ( a, x, &L0 ); +} + +void +add_tw_niels_to_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &e->a, &L1 ); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->b, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_weak_reduce( &d->t ); + p448_sub ( &d->y, &d->z, &d->x ); + p448_bias ( &d->y, 2 ); + p448_weak_reduce( &d->y ); + p448_add ( &L0, &d->x, &d->z ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +sub_tw_niels_from_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &e->b, &L1 
); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->a, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_weak_reduce( &d->t ); + p448_add ( &d->y, &d->x, &d->z ); + p448_sub ( &L0, &d->z, &d->x ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +add_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + add_tw_niels_to_tw_extensible( e, &a->n ); +} + +void +sub_tw_pniels_from_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + sub_tw_niels_from_tw_extensible( e, &a->n ); +} + +void +double_tw_extensible ( + struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &a->u, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &L1, &a->t ); + p448_sub ( &a->t, &L1, &a->u ); + p448_bias ( &a->t, 3 ); + p448_weak_reduce( &a->t ); + p448_sub ( &L1, &L0, &L2 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 1 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->z, &L1, &L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +double_extensible ( + struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &L1, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &a->u, &a->t ); + p448_sub ( &a->t, &a->u, &L1 ); + p448_bias ( &a->t, 3 ); + p448_weak_reduce( &a->t ); + p448_sub ( &a->u, &L0, &L2 ); + p448_bias 
( &a->u, 2 ); + p448_weak_reduce( &a->u ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 2 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->z, &L1, &L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +twist_and_double ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &b->u, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &L0, &b->t ); + p448_sub ( &b->t, &L0, &b->u ); + p448_bias ( &b->t, 3 ); + p448_weak_reduce( &b->t ); + p448_sub ( &L0, &b->z, &b->x ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 2 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +untwist_and_double ( + struct extensible_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &L0, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &b->u, &b->t ); + p448_sub ( &b->t, &b->u, &L0 ); + p448_bias ( &b->t, 3 ); + p448_weak_reduce( &b->t ); + p448_sub ( &b->u, &b->z, &b->x ); + p448_bias ( &b->u, 2 ); + p448_weak_reduce( &b->u ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 1 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +convert_tw_affine_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_affine_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( 
&b->n.c, &a->y, &a->x ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_set_ui( &b->z, 2 ); +} + +void +convert_tw_affine_to_tw_extensible ( + struct tw_extensible_t* b, + const struct tw_affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_affine_to_extensible ( + struct extensible_t* b, + const struct affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_tw_extensible_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_extensible_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( &b->n.c, &a->u, &a->t ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_add ( &b->z, &a->z, &a->z ); + p448_weak_reduce( &b->z ); +} + +void +convert_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* d +) { + p448_add ( &e->u, &d->n.b, &d->n.a ); + p448_sub ( &e->t, &d->n.b, &d->n.a ); + p448_bias ( &e->t, 2 ); + p448_weak_reduce( &e->t ); + p448_mul ( &e->x, &d->z, &e->t ); + p448_mul ( &e->y, &d->z, &e->u ); + p448_sqr ( &e->z, &d->z ); +} + +void +convert_tw_niels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_niels_t* d +) { + p448_add ( &e->y, &d->b, &d->a ); + p448_weak_reduce( &e->y ); + p448_sub ( &e->x, &d->b, &d->a ); + p448_bias ( &e->x, 2 ); + p448_weak_reduce( &e->x ); + p448_set_ui( &e->z, 1 ); + p448_copy ( &e->t, &e->x ); + p448_copy ( &e->u, &e->y ); +} + +void +montgomery_step ( + struct montgomery_t* a +) { + struct p448_t L0, L1; + p448_add ( 
&L0, &a->zd, &a->xd ); + p448_sub ( &L1, &a->xd, &a->zd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_sub ( &a->zd, &a->xa, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_weak_reduce( &a->zd ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_add ( &a->zd, &a->za, &a->xa ); + p448_mul ( &a->za, &L1, &a->zd ); + p448_add ( &a->xa, &a->za, &a->xd ); + p448_sqr ( &a->zd, &a->xa ); + p448_mul ( &a->xa, &a->z0, &a->zd ); + p448_sub ( &a->zd, &a->xd, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_weak_reduce( &a->zd ); + p448_sqr ( &a->za, &a->zd ); + p448_sqr ( &a->xd, &L0 ); + p448_sqr ( &L0, &L1 ); + p448_mulw ( &a->zd, &a->xd, 39082 ); + p448_sub ( &L1, &a->xd, &L0 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_sub ( &L0, &a->zd, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->zd, &L0, &L1 ); +} + +void +deserialize_montgomery ( + struct montgomery_t* a, + const struct p448_t* sbz +) { + p448_sqr ( &a->z0, sbz ); + p448_set_ui( &a->xd, 1 ); + p448_set_ui( &a->zd, 0 ); + p448_set_ui( &a->xa, 1 ); + p448_copy ( &a->za, &a->z0 ); +} + +mask_t +serialize_montgomery ( + struct p448_t* b, + const struct montgomery_t* a, + const struct p448_t* sbz +) { + mask_t L0, L1, L2; + struct p448_t L3, L4, L5, L6; + p448_mul ( &L6, &a->z0, &a->zd ); + p448_sub ( &L4, &L6, &a->xd ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L6, &a->za, &L4 ); + p448_mul ( &L5, &a->z0, &a->xd ); + p448_sub ( &L4, &L5, &a->zd ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L3, &a->xa, &L4 ); + p448_add ( &L5, &L3, &L6 ); + p448_sub ( &L4, &L6, &L3 ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L6, &L4, &L5 ); + p448_copy ( &L5, &a->z0 ); + p448_addw ( &L5, 1 ); + p448_sqr ( &L4, &L5 ); + p448_mulw ( &L5, &L4, 39082 ); + p448_neg ( &L4, &L5 ); + p448_add ( &L5, &a->z0, &a->z0 ); + p448_bias ( &L5, 1 ); + p448_add ( &L3, &L5, &L5 ); + p448_add ( &L5, &L3, &L4 ); + 
p448_weak_reduce( &L5 ); + p448_mul ( &L3, &a->xd, &L5 ); + L1 = p448_is_zero( &a->zd ); + L2 = - L1; + p448_mask ( &L4, &L3, L1 ); + p448_add ( &L5, &L4, &a->zd ); + L0 = ~ L1; + p448_mul ( &L4, sbz, &L6 ); + p448_addw ( &L4, L2 ); + p448_mul ( &L6, &L5, &L4 ); + p448_mul ( &L4, &L6, &L5 ); + p448_mul ( &L5, &L6, &a->xd ); + p448_mul ( &L6, &L4, &L5 ); + p448_isr ( &L3, &L6 ); + p448_mul ( &L5, &L4, &L3 ); + p448_sqr ( &L4, &L3 ); + p448_mul ( &L3, &L6, &L4 ); + p448_mask ( b, &L5, L0 ); + p448_subw ( &L3, 1 ); + p448_bias ( &L3, 1 ); + L1 = p448_is_zero( &L3 ); + L0 = p448_is_zero( sbz ); + return L1 | L0; +} + +void +serialize_extensible ( + struct p448_t* b, + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sub ( &L0, &a->y, &a->z ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_add ( b, &a->z, &a->y ); + p448_mul ( &L1, &a->z, &a->x ); + p448_mul ( &L2, &L0, &L1 ); + p448_mul ( &L1, &L2, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( &L2, &L1, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( b, &L1, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, &L2, &L1 ); +} + +void +untwist_and_double_and_serialize ( + struct p448_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_mul ( &L3, &a->y, &a->x ); + p448_add ( b, &a->y, &a->x ); + p448_sqr ( &L1, b ); + p448_add ( &L2, &L3, &L3 ); + p448_sub ( b, &L1, &L2 ); + p448_bias ( b, 3 ); + p448_weak_reduce( b ); + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &L2 ); + p448_add ( &L2, b, b ); + p448_mulw ( b, &L2, 39082 ); + p448_neg ( &L2, b ); + p448_bias ( &L2, 2 ); + p448_mulw ( &L0, &L2, 39082 ); + p448_neg ( b, &L0 ); + p448_bias ( b, 2 ); + p448_mul ( &L0, &L2, &L1 ); + p448_mul ( &L2, b, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( &L1, b, &L0 ); + p448_sqr ( b, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( b, &L1, &L3 ); +} + +void +twist_even ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L0, L1; + p448_sqr ( &b->y, 
&a->z ); + p448_sqr ( &b->z, &a->x ); + p448_sub ( &b->u, &b->y, &b->z ); + p448_bias ( &b->u, 2 ); + p448_weak_reduce( &b->u ); + p448_sub ( &b->z, &a->z, &a->x ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->y, &b->z, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->x, &b->z, &b->y ); + p448_mul ( &b->t, &b->x, &b->u ); + p448_mul ( &b->y, &b->x, &b->t ); + p448_isr ( &b->t, &b->y ); + p448_mul ( &b->u, &b->x, &b->t ); + p448_sqr ( &b->x, &b->t ); + p448_mul ( &b->t, &b->y, &b->x ); + p448_mul ( &b->x, &a->x, &b->u ); + p448_mul ( &b->y, &a->y, &b->u ); + L1 = p448_is_zero( &b->z ); + L0 = - L1; + p448_addw ( &b->y, L0 ); + p448_weak_reduce( &b->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +void +test_only_twist ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L0, L1; + struct p448_t L2, L3; + p448_sqr ( &b->u, &a->z ); + p448_sqr ( &b->y, &a->x ); + p448_sub ( &b->z, &b->u, &b->y ); + p448_bias ( &b->z, 2 ); + p448_add ( &b->y, &b->z, &b->z ); + p448_add ( &b->u, &b->y, &b->y ); + p448_weak_reduce( &b->u ); + p448_sub ( &b->y, &a->z, &a->x ); + p448_bias ( &b->y, 2 ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->x, &b->y, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->t, &b->z, &b->x ); + p448_mul ( &L3, &b->t, &b->u ); + p448_mul ( &b->x, &b->t, &L3 ); + p448_isr ( &L2, &b->x ); + p448_mul ( &b->u, &b->t, &L2 ); + p448_sqr ( &L3, &L2 ); + p448_mul ( &b->t, &b->x, &L3 ); + p448_add ( &b->x, &a->y, &a->x ); + p448_weak_reduce( &b->x ); + p448_sub ( &L2, &a->x, &a->y ); + p448_bias ( &L2, 2 ); + p448_weak_reduce( &L2 ); + p448_mul ( &L3, &b->t, &L2 ); + p448_add ( &L2, &L3, &b->x ); + p448_sub ( &b->t, &b->x, &L3 ); + p448_bias ( &b->t, 2 ); + p448_weak_reduce( &b->t ); + p448_mul ( &b->x, &L2, &b->u ); + L0 = p448_is_zero( 
&b->y ); + L1 = - L0; + p448_addw ( &b->x, L1 ); + p448_weak_reduce( &b->x ); + p448_mul ( &b->y, &b->t, &b->u ); + L0 = p448_is_zero( &b->z ); + L1 = - L0; + p448_addw ( &b->y, L1 ); + p448_weak_reduce( &b->y ); + L1 = p448_is_zero( &a->y ); + L0 = L1 + 1; + p448_set_ui( &b->z, L0 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +mask_t +is_square ( + const struct p448_t* x +) { + mask_t L0, L1; + struct p448_t L2, L3; + p448_isr ( &L2, x ); + p448_sqr ( &L3, &L2 ); + p448_mul ( &L2, x, &L3 ); + p448_subw ( &L2, 1 ); + p448_bias ( &L2, 1 ); + L1 = p448_is_zero( &L2 ); + L0 = p448_is_zero( x ); + return L1 | L0; +} + +mask_t +is_even_pt ( + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +is_even_tw ( + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_add ( &L0, &L1, &L2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +deserialize_affine ( + struct affine_t* a, + const struct p448_t* sz +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L1, sz ); + p448_copy ( &L3, &L1 ); + p448_addw ( &L3, 1 ); + p448_sqr ( &a->x, &L3 ); + p448_mulw ( &L3, &a->x, 39082 ); + p448_neg ( &a->x, &L3 ); + p448_add ( &L3, &L1, &L1 ); + p448_bias ( &L3, 1 ); + p448_add ( &a->y, &L3, &L3 ); + p448_add ( &L3, &a->y, &a->x ); + p448_weak_reduce( &L3 ); + p448_copy ( &a->y, &L1 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &a->x, &a->y ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_mul ( &a->y, &a->x, &L3 ); + p448_sqr ( &L2, &a->x ); + p448_mul ( &L0, &L2, &a->y ); + p448_mul ( &a->y, &a->x, &L0 ); + p448_isr ( &L3, &a->y ); + p448_mul ( &a->y, &L2, &L3 ); + p448_sqr ( &L2, &L3 ); + p448_mul ( &L3, &L0, &L2 ); + p448_mul ( &L0, &a->x, &L3 ); + p448_add ( &L2, &a->y, &a->y 
); + p448_mul ( &a->x, sz, &L2 ); + p448_addw ( &L1, 1 ); + p448_mul ( &a->y, &L1, &L3 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + return p448_is_zero( &L0 ); +} + +mask_t +deserialize_and_twist_approx ( + struct tw_extensible_t* a, + const struct p448_t* sdm1, + const struct p448_t* sz +) { + struct p448_t L0, L1; + p448_sqr ( &a->z, sz ); + p448_copy ( &a->y, &a->z ); + p448_addw ( &a->y, 1 ); + p448_sqr ( &a->x, &a->y ); + p448_mulw ( &a->y, &a->x, 39082 ); + p448_neg ( &a->x, &a->y ); + p448_add ( &a->y, &a->z, &a->z ); + p448_bias ( &a->y, 1 ); + p448_add ( &a->u, &a->y, &a->y ); + p448_add ( &a->y, &a->u, &a->x ); + p448_weak_reduce( &a->y ); + p448_sqr ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &a->u, &a->x ); + p448_bias ( &a->u, 2 ); + p448_weak_reduce( &a->u ); + p448_mul ( &a->x, sdm1, &a->u ); + p448_mul ( &L0, &a->x, &a->y ); + p448_mul ( &a->t, &L0, &a->y ); + p448_mul ( &a->u, &a->x, &a->t ); + p448_mul ( &a->t, &a->u, &L0 ); + p448_mul ( &a->y, &a->x, &a->t ); + p448_isr ( &L0, &a->y ); + p448_mul ( &a->y, &a->u, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &a->u, &a->t, &L1 ); + p448_mul ( &a->t, &a->x, &a->u ); + p448_add ( &a->x, sz, sz ); + p448_mul ( &L0, &a->u, &a->x ); + p448_copy ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &L1, &a->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &a->x, &L1, &L0 ); + p448_mul ( &L0, &a->u, &a->y ); + p448_addw ( &a->z, 1 ); + p448_mul ( &a->y, &a->z, &L0 ); + p448_subw ( &a->t, 1 ); + p448_bias ( &a->t, 1 ); + mask_t ret = p448_is_zero( &a->t ); + p448_set_ui( &a->z, 1 ); + p448_copy ( &a->t, &a->x ); + p448_copy ( &a->u, &a->y ); + return ret; +} + +void +set_identity_extensible ( + struct extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_tw_extensible ( + struct tw_extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + 
p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_affine ( + struct affine_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); +} + +mask_t +eq_affine ( + const struct affine_t* a, + const struct affine_t* b +) { + mask_t L0, L1; + struct p448_t L2; + p448_sub ( &L2, &a->x, &b->x ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_sub ( &L2, &a->y, &b->y ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +eq_extensible ( + const struct extensible_t* a, + const struct extensible_t* b +) { + mask_t L0, L1; + struct p448_t L2, L3, L4; + p448_mul ( &L4, &b->z, &a->x ); + p448_mul ( &L3, &a->z, &b->x ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_mul ( &L4, &b->z, &a->y ); + p448_mul ( &L3, &a->z, &b->y ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +eq_tw_extensible ( + const struct tw_extensible_t* a, + const struct tw_extensible_t* b +) { + mask_t L0, L1; + struct p448_t L2, L3, L4; + p448_mul ( &L4, &b->z, &a->x ); + p448_mul ( &L3, &a->z, &b->x ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_mul ( &L4, &b->z, &a->y ); + p448_mul ( &L3, &a->z, &b->y ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +void +elligator_2s_inject ( + struct affine_t* a, + const struct p448_t* r +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5, L6, L7, L8, L9; + p448_sqr ( &a->x, r ); + p448_sqr ( &L3, &a->x ); + p448_copy ( &a->y, &L3 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &L9, &a->y ); + p448_bias ( &L9, 2 ); + p448_weak_reduce( &L9 ); + p448_sqr ( &L2, &L9 ); + p448_mulw ( &L8, &L2, 1527402724 ); + p448_mulw ( &L7, &L3, 6108985600 ); + p448_add ( &a->y, &L7, &L8 ); + p448_weak_reduce( &a->y ); + p448_mulw ( &L8, &L2, 6109454568 ); 
+ p448_sub ( &L7, &a->y, &L8 ); + p448_bias ( &L7, 2 ); + p448_weak_reduce( &L7 ); + p448_mulw ( &L4, &a->y, 78160 ); + p448_mul ( &L6, &L7, &L9 ); + p448_mul ( &L8, &L6, &L4 ); + p448_mul ( &L4, &L7, &L8 ); + p448_isr ( &L5, &L4 ); + p448_mul ( &L4, &L6, &L5 ); + p448_sqr ( &L6, &L5 ); + p448_mul ( &L5, &L8, &L6 ); + p448_mul ( &L8, &L7, &L5 ); + p448_mul ( &L7, &L8, &L5 ); + p448_copy ( &L5, &a->x ); + p448_subw ( &L5, 1 ); + p448_addw ( &a->x, 1 ); + p448_mul ( &L6, &a->x, &L8 ); + p448_sub ( &a->x, &L5, &L6 ); + p448_bias ( &a->x, 3 ); + p448_weak_reduce( &a->x ); + p448_mul ( &L5, &L4, &a->x ); + p448_mulw ( &L4, &L5, 78160 ); + p448_neg ( &a->x, &L4 ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_add ( &L4, &L3, &L3 ); + p448_add ( &L3, &L4, &L2 ); + p448_subw ( &L3, 2 ); + p448_bias ( &L3, 1 ); + p448_weak_reduce( &L3 ); + p448_mul ( &L2, &L3, &L8 ); + p448_mulw ( &L3, &L2, 3054649120 ); + p448_add ( &L2, &L3, &a->y ); + p448_mul ( &a->y, &L7, &L2 ); + L1 = p448_is_zero( &L9 ); + L0 = - L1; + p448_addw ( &a->y, L0 ); + p448_weak_reduce( &a->y ); +} + +mask_t +validate_affine ( + const struct affine_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L0, &a->y ); + p448_sqr ( &L2, &a->x ); + p448_add ( &L3, &L2, &L0 ); + p448_subw ( &L3, 1 ); + p448_mulw ( &L1, &L2, 39081 ); + p448_neg ( &L2, &L1 ); + p448_bias ( &L2, 2 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sub ( &L0, &L3, &L1 ); + p448_bias ( &L0, 3 ); + return p448_is_zero( &L0 ); +} + +mask_t +validate_tw_extensible ( + const struct tw_extensible_t* ext +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5; + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L2, &ext->t, &ext->u ); + p448_mul ( &L4, &ext->z, &L2 ); + p448_addw ( &L4, 0 ); + p448_mul ( &L3, &ext->x, &ext->y ); + p448_neg ( &L2, &L3 ); + p448_add ( &L3, &L2, &L4 ); + p448_bias ( &L3, 2 ); + L1 = p448_is_zero( &L3 ); + /* + * Check invariant: + * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 + */ + p448_sqr ( 
&L4, &ext->y ); + p448_neg ( &L2, &L4 ); + p448_addw ( &L2, 0 ); + p448_sqr ( &L3, &ext->x ); + p448_add ( &L4, &L3, &L2 ); + p448_sqr ( &L5, &ext->u ); + p448_sqr ( &L3, &ext->t ); + p448_mul ( &L2, &L3, &L5 ); + p448_mulw ( &L3, &L2, 39081 ); + p448_neg ( &L5, &L3 ); + p448_add ( &L3, &L5, &L4 ); + p448_neg ( &L5, &L2 ); + p448_add ( &L4, &L5, &L3 ); + p448_sqr ( &L3, &ext->z ); + p448_add ( &L2, &L3, &L4 ); + p448_bias ( &L2, 4 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +validate_extensible ( + const struct extensible_t* ext +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5; + /* + * Check invariant: + * 0 = d*t^2*u^2 - x^2 - y^2 + z^2 + */ + p448_sqr ( &L4, &ext->y ); + p448_neg ( &L3, &L4 ); + p448_addw ( &L3, 0 ); + p448_sqr ( &L2, &ext->z ); + p448_add ( &L4, &L2, &L3 ); + p448_sqr ( &L5, &ext->u ); + p448_sqr ( &L2, &ext->t ); + p448_mul ( &L3, &L2, &L5 ); + p448_mulw ( &L5, &L3, 39081 ); + p448_neg ( &L2, &L5 ); + p448_add ( &L3, &L2, &L4 ); + p448_sqr ( &L2, &ext->x ); + p448_neg ( &L4, &L2 ); + p448_add ( &L2, &L4, &L3 ); + p448_bias ( &L2, 4 ); + L1 = p448_is_zero( &L2 ); + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L3, &ext->t, &ext->u ); + p448_mul ( &L4, &ext->z, &L3 ); + p448_addw ( &L4, 0 ); + p448_mul ( &L2, &ext->x, &ext->y ); + p448_neg ( &L3, &L2 ); + p448_add ( &L2, &L3, &L4 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + + diff --git a/src/arch_32/p448.c b/src/arch_32/p448.c new file mode 100644 index 0000000..d3b2956 --- /dev/null +++ b/src/arch_32/p448.c @@ -0,0 +1,300 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ + +#include "word.h" +#include "p448.h" +//#include "x86-64-arith.h" + +static inline mask_t __attribute__((always_inline)) +is_zero ( + word_t x +) { + dword_t xx = x; + xx--; + return xx >> WORD_BITS; +} + +static uint64_t widemul_32 ( + const uint32_t a, + const uint32_t b +) { + return ((uint64_t)a)* b; +} + +void +p448_mul ( + p448_t *__restrict__ cs, + const p448_t *as, + const p448_t *bs +) { + // p448_t ar, br; +// p448_copy(&ar,as); +// p448_copy(&br,bs); +// p448_weak_reduce(&ar); +// p448_weak_reduce(&br); + + const uint32_t *a = as->limb, *b = bs->limb; + uint32_t *c = cs->limb; + + uint64_t accum0 = 0, accum1 = 0, accum2 = 0; + uint32_t mask = (1<<28) - 1; + + uint32_t aa[8], bb[8]; + + /* For some reason clang doesn't vectorize this without prompting? */ + // unsigned int i; + // for (i=0; i>= 28; + accum1 >>= 28; + } + + accum0 += accum1; + accum0 += c[8]; + accum1 += c[0]; + c[8] = ((uint32_t)(accum0)) & mask; + c[0] = ((uint32_t)(accum1)) & mask; + + accum0 >>= 28; + accum1 >>= 28; + c[9] += ((uint32_t)(accum0)); + c[1] += ((uint32_t)(accum1)); +} + +void +p448_mulw ( + p448_t *__restrict__ cs, + const p448_t *as, + uint64_t b +) { + const uint32_t bhi = b>>28, blo = b & (1<<28)-1; + + const uint32_t *a = as->limb; + uint32_t *c = cs->limb; + + uint64_t accum0, accum8; + uint32_t mask = (1ull<<28)-1; + + int i; + + accum0 = widemul_32(blo, a[0]); + accum8 = widemul_32(blo, a[8]); + accum0 += widemul_32(bhi, a[15]); + accum8 += widemul_32(bhi, a[15] + a[7]); + + c[0] = accum0 & mask; accum0 >>= 28; + c[8] = accum8 & mask; accum8 >>= 28; + + for (i=1; i<8; i++) { + accum0 += widemul_32(blo, a[i]); + accum8 += widemul_32(blo, a[i+8]); + + accum0 += widemul_32(bhi, a[i-1]); + accum8 += widemul_32(bhi, a[i+7]); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + } + + accum0 += accum8 + c[8]; + c[8] = accum0 & mask; + c[9] += accum0 >> 28; + + accum8 += c[0]; + c[0] = accum8 & mask; + c[1] += accum8 >> 28; +} + +void 
+p448_sqr ( + p448_t *__restrict__ cs, + const p448_t *as +) { + p448_mul(cs,as,as); // PERF +} + +void +p448_strong_reduce ( + p448_t *a +) { + word_t mask = (1ull<<28)-1; + + /* first, clear high */ + a->limb[8] += a->limb[15]>>28; + a->limb[0] += a->limb[15]>>28; + a->limb[15] &= mask; + + /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */ + + /* compute total_value - p. No need to reduce mod p. */ + + dsword_t scarry = 0; + int i; + for (i=0; i<16; i++) { + scarry = scarry + a->limb[i] - ((i==8)?mask-1:mask); + a->limb[i] = scarry & mask; + scarry >>= 28; + } + + /* uncommon case: it was >= p, so now scarry = 0 and this = x + * common case: it was < p, so now scarry = -1 and this = x - p + 2^448 + * so let's add back in p. will carry back off the top for 2^448. + */ + + assert(is_zero(scarry) | is_zero(scarry+1)); + + word_t scarry_mask = scarry & mask; + dword_t carry = 0; + + /* add it back */ + for (i=0; i<16; i++) { + carry = carry + a->limb[i] + ((i==8)?(scarry_mask&~1):scarry_mask); + a->limb[i] = carry & mask; + carry >>= 28; + } + + assert(is_zero(carry + scarry)); +} + +mask_t +p448_is_zero ( + const struct p448_t *a +) { + struct p448_t b; + p448_copy(&b,a); + p448_strong_reduce(&b); + + uint32_t any = 0; + int i; + for (i=0; i<16; i++) { + any |= b.limb[i]; + } + return is_zero(any); +} + +void +p448_serialize ( + uint8_t *serial, + const struct p448_t *x +) { + int i,j; + p448_t red; + p448_copy(&red, x); + p448_strong_reduce(&red); + for (i=0; i<8; i++) { + uint64_t limb = red.limb[2*i] + (((uint64_t)red.limb[2*i+1])<<28); + for (j=0; j<7; j++) { + serial[7*i+j] = limb; + limb >>= 8; + } + assert(limb == 0); + } +} + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +) { + int i,j; + for (i=0; i<8; i++) { + uint64_t out = 0; + for (j=0; j<7; j++) { + out |= ((uint64_t)serial[7*i+j])<<(8*j); + } + x->limb[2*i] = out & (1ull<<28)-1; + x->limb[2*i+1] = out >> 28; + } + + /* Check for reduction. 
+ * + * The idea is to create a variable ge which is all ones (rather, 56 ones) + * if and only if the low $i$ words of $x$ are >= those of p. + * + * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) + */ + uint32_t ge = -1, mask = (1ull<<28)-1; + for (i=0; i<8; i++) { + ge &= x->limb[i]; + } + + /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ + ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask); + + /* Propagate the rest */ + for (i=9; i<16; i++) { + ge &= x->limb[i]; + } + + return ~is_zero(ge ^ mask); +} + +void +simultaneous_invert_p448( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +) { + if (n==0) { + return; + } else if (n==1) { + p448_inverse(out,in); + return; + } + + p448_copy(&out[1], &in[0]); + int i; + for (i=1; i<(int) (n-1); i++) { + p448_mul(&out[i+1], &out[i], &in[i]); + } + p448_mul(&out[0], &out[n-1], &in[n-1]); + + struct p448_t tmp; + p448_inverse(&tmp, &out[0]); + p448_copy(&out[0], &tmp); + + /* at this point, out[0] = product(in[i]) ^ -1 + * out[i] = product(in[0]..in[i-1]) if i != 0 + */ + for (i=n-1; i>0; i--) { + p448_mul(&tmp, &out[i], &out[0]); + p448_copy(&out[i], &tmp); + + p448_mul(&tmp, &out[0], &in[i]); + p448_copy(&out[0], &tmp); + } +} diff --git a/src/arch_32/p448.h b/src/arch_32/p448.h new file mode 100644 index 0000000..4628a89 --- /dev/null +++ b/src/arch_32/p448.h @@ -0,0 +1,378 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ +#ifndef __P448_H__ +#define __P448_H__ 1 + +#include "word.h" + +#include +#include + +typedef struct p448_t { + uint32_t limb[16]; +} __attribute__((aligned(32))) p448_t; + +#ifdef __cplusplus +extern "C" { +#endif + +static __inline__ void +p448_set_ui ( + p448_t *out, + uint64_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t do_swap +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_add ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_neg ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_neg ( + p448_t *a, + mask_t doNegate +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_addw ( + p448_t *a, + uint32_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_subw ( + p448_t *a, + uint32_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_copy ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_weak_reduce ( + p448_t *inout +) __attribute__((unused,always_inline)); + +void +p448_strong_reduce ( + p448_t *inout +); + +mask_t +p448_is_zero ( + const p448_t *in +); + +static __inline__ void +p448_bias ( + p448_t *inout, + int amount +) __attribute__((unused,always_inline)); + +void +p448_mul ( + p448_t *__restrict__ out, + const p448_t *a, + const p448_t *b +); + +void +p448_mulw ( + p448_t *__restrict__ out, + const p448_t *a, + uint64_t b +); + +void +p448_sqr ( + p448_t *__restrict__ out, + const p448_t *a +); + +static __inline__ void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) __attribute__((unused,always_inline)); + +void +p448_serialize 
( + uint8_t *serial, + const struct p448_t *x +); + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +); + +static __inline__ void +p448_mask( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) __attribute__((unused,always_inline)); + +/** +* Returns 1/x. +* +* If x=0, returns 0. +*/ +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +); + +void +simultaneous_invert_p448 ( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +); + +static inline mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) __attribute__((always_inline,unused)); + +/* -------------- Inline functions begin here -------------- */ + +void +p448_set_ui ( + p448_t *out, + uint64_t x +) { + int i; + out->limb[0] = x & (1<<28)-1; + out->limb[1] = x>>28; + for (i=2; i<16; i++) { + out->limb[i] = 0; + } +} + +void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t doswap +) { + big_register_t *aa = (big_register_t*)a; + big_register_t *bb = (big_register_t*)b; + big_register_t m = doswap; + + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] + b->limb[i]; + } + */ +} + +void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] - b->limb[i]; + } + */ +} + +void +p448_neg ( + p448_t *out, + const p448_t *a +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = -a->limb[i]; + } + */ +} + +void +p448_cond_neg( + p448_t *a, + mask_t doNegate +) { + unsigned int i; + struct p448_t negated; + big_register_t *aa = (big_register_t *)a; + big_register_t *nn = (big_register_t*)&negated; + big_register_t m = doNegate; + + p448_neg(&negated, a); + p448_bias(&negated, 2); + + for (i=0; ilimb[0] += x; +} + +void +p448_subw ( + p448_t *a, + uint32_t x +) { + a->limb[0] -= x; +} + +void +p448_copy ( + p448_t *out, + const p448_t *a +) { + *out = *a; +} + +void +p448_bias ( + p448_t *a, + int amt 
+) { + uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt; + uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1}; + uint32x4_t *aa = (uint32x4_t*) a; + aa[0] += lo; + aa[1] += lo; + aa[2] += hi; + aa[3] += lo; +} + +void +p448_weak_reduce ( + p448_t *a +) { + uint64_t mask = (1ull<<28) - 1; + uint64_t tmp = a->limb[15] >> 28; + int i; + a->limb[8] += tmp; + for (i=15; i>0; i--) { + a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); + } + a->limb[0] = (a->limb[0] & mask) + tmp; +} + +void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) { + p448_t tmp; + assert(n>0); + if (n&1) { + p448_sqr(y,x); + n--; + } else { + p448_sqr(&tmp,x); + p448_sqr(y,&tmp); + n-=2; + } + for (; n; n-=2) { + p448_sqr(&tmp,y); + p448_sqr(y,&tmp); + } +} + +mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) { + struct p448_t ra, rb; + p448_copy(&ra, a); + p448_copy(&rb, b); + p448_weak_reduce(&ra); + p448_weak_reduce(&rb); + p448_sub(&ra, &ra, &rb); + p448_bias(&ra, 2); + return p448_is_zero(&ra); +} + +void +p448_mask ( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + a->limb[i] = b->limb[i] & mask; + } +} + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __P448_H__ */ diff --git a/src/arch_arm_32/ec_point.c b/src/arch_arm_32/ec_point.c new file mode 100644 index 0000000..823e43d --- /dev/null +++ b/src/arch_arm_32/ec_point.c @@ -0,0 +1,959 @@ +/** + * @cond internal + * @file ec_point.c + * @copyright + * Copyright (c) 2014 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. + * @author Mike Hamburg + * @warning This file was automatically generated. 
+ */ + +#include "ec_point.h" + + +void +p448_isr ( + struct p448_t* a, + const struct p448_t* x +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L1, x ); + p448_mul ( &L2, x, &L1 ); + p448_sqr ( &L1, &L2 ); + p448_mul ( &L2, x, &L1 ); + p448_sqrn ( &L1, &L2, 3 ); + p448_mul ( &L0, &L2, &L1 ); + p448_sqrn ( &L1, &L0, 3 ); + p448_mul ( &L0, &L2, &L1 ); + p448_sqrn ( &L2, &L0, 9 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sqr ( &L0, &L1 ); + p448_mul ( &L2, x, &L0 ); + p448_sqrn ( &L0, &L2, 18 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqrn ( &L0, &L2, 37 ); + p448_mul ( &L1, &L2, &L0 ); + p448_sqrn ( &L0, &L1, 37 ); + p448_mul ( &L1, &L2, &L0 ); + p448_sqrn ( &L0, &L1, 111 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqr ( &L0, &L2 ); + p448_mul ( &L1, x, &L0 ); + p448_sqrn ( &L0, &L1, 223 ); + p448_mul ( a, &L2, &L0 ); +} + +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +) { + struct p448_t L0, L1; + p448_isr ( &L0, x ); + p448_sqr ( &L1, &L0 ); + p448_sqr ( &L0, &L1 ); + p448_mul ( a, x, &L0 ); +} + +void +add_tw_niels_to_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &e->a, &L1 ); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->b, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_weak_reduce( &d->t ); + p448_sub ( &d->y, &d->z, &d->x ); + p448_bias ( &d->y, 2 ); + p448_weak_reduce( &d->y ); + p448_add ( &L0, &d->x, &d->z ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +sub_tw_niels_from_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &e->b, &L1 
); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->a, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_weak_reduce( &d->t ); + p448_add ( &d->y, &d->x, &d->z ); + p448_sub ( &L0, &d->z, &d->x ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +add_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + add_tw_niels_to_tw_extensible( e, &a->n ); +} + +void +sub_tw_pniels_from_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + sub_tw_niels_from_tw_extensible( e, &a->n ); +} + +void +double_tw_extensible ( + struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &a->u, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &L1, &a->t ); + p448_sub ( &a->t, &L1, &a->u ); + p448_bias ( &a->t, 3 ); + p448_weak_reduce( &a->t ); + p448_sub ( &L1, &L0, &L2 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 1 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->z, &L1, &L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +double_extensible ( + struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &L1, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &a->u, &a->t ); + p448_sub ( &a->t, &a->u, &L1 ); + p448_bias ( &a->t, 3 ); + p448_weak_reduce( &a->t ); + p448_sub ( &a->u, &L0, &L2 ); + p448_bias 
( &a->u, 2 ); + p448_weak_reduce( &a->u ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 2 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->z, &L1, &L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +twist_and_double ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &b->u, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &L0, &b->t ); + p448_sub ( &b->t, &L0, &b->u ); + p448_bias ( &b->t, 3 ); + p448_weak_reduce( &b->t ); + p448_sub ( &L0, &b->z, &b->x ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 2 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +untwist_and_double ( + struct extensible_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &L0, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &b->u, &b->t ); + p448_sub ( &b->t, &b->u, &L0 ); + p448_bias ( &b->t, 3 ); + p448_weak_reduce( &b->t ); + p448_sub ( &b->u, &b->z, &b->x ); + p448_bias ( &b->u, 2 ); + p448_weak_reduce( &b->u ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 1 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +convert_tw_affine_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_affine_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( 
&b->n.c, &a->y, &a->x ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_set_ui( &b->z, 2 ); +} + +void +convert_tw_affine_to_tw_extensible ( + struct tw_extensible_t* b, + const struct tw_affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_affine_to_extensible ( + struct extensible_t* b, + const struct affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_tw_extensible_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_extensible_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( &b->n.c, &a->u, &a->t ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_add ( &b->z, &a->z, &a->z ); + p448_weak_reduce( &b->z ); +} + +void +convert_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* d +) { + p448_add ( &e->u, &d->n.b, &d->n.a ); + p448_sub ( &e->t, &d->n.b, &d->n.a ); + p448_bias ( &e->t, 2 ); + p448_weak_reduce( &e->t ); + p448_mul ( &e->x, &d->z, &e->t ); + p448_mul ( &e->y, &d->z, &e->u ); + p448_sqr ( &e->z, &d->z ); +} + +void +convert_tw_niels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_niels_t* d +) { + p448_add ( &e->y, &d->b, &d->a ); + p448_weak_reduce( &e->y ); + p448_sub ( &e->x, &d->b, &d->a ); + p448_bias ( &e->x, 2 ); + p448_weak_reduce( &e->x ); + p448_set_ui( &e->z, 1 ); + p448_copy ( &e->t, &e->x ); + p448_copy ( &e->u, &e->y ); +} + +void +montgomery_step ( + struct montgomery_t* a +) { + struct p448_t L0, L1; + p448_add ( 
&L0, &a->zd, &a->xd ); + p448_sub ( &L1, &a->xd, &a->zd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_sub ( &a->zd, &a->xa, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_weak_reduce( &a->zd ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_add ( &a->zd, &a->za, &a->xa ); + p448_mul ( &a->za, &L1, &a->zd ); + p448_add ( &a->xa, &a->za, &a->xd ); + p448_sqr ( &a->zd, &a->xa ); + p448_mul ( &a->xa, &a->z0, &a->zd ); + p448_sub ( &a->zd, &a->xd, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_weak_reduce( &a->zd ); + p448_sqr ( &a->za, &a->zd ); + p448_sqr ( &a->xd, &L0 ); + p448_sqr ( &L0, &L1 ); + p448_mulw ( &a->zd, &a->xd, 39082 ); + p448_sub ( &L1, &a->xd, &L0 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_sub ( &L0, &a->zd, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->zd, &L0, &L1 ); +} + +void +deserialize_montgomery ( + struct montgomery_t* a, + const struct p448_t* sbz +) { + p448_sqr ( &a->z0, sbz ); + p448_set_ui( &a->xd, 1 ); + p448_set_ui( &a->zd, 0 ); + p448_set_ui( &a->xa, 1 ); + p448_copy ( &a->za, &a->z0 ); +} + +mask_t +serialize_montgomery ( + struct p448_t* b, + const struct montgomery_t* a, + const struct p448_t* sbz +) { + mask_t L0, L1, L2; + struct p448_t L3, L4, L5, L6; + p448_mul ( &L6, &a->z0, &a->zd ); + p448_sub ( &L4, &L6, &a->xd ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L6, &a->za, &L4 ); + p448_mul ( &L5, &a->z0, &a->xd ); + p448_sub ( &L4, &L5, &a->zd ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L3, &a->xa, &L4 ); + p448_add ( &L5, &L3, &L6 ); + p448_sub ( &L4, &L6, &L3 ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L6, &L4, &L5 ); + p448_copy ( &L5, &a->z0 ); + p448_addw ( &L5, 1 ); + p448_sqr ( &L4, &L5 ); + p448_mulw ( &L5, &L4, 39082 ); + p448_neg ( &L4, &L5 ); + p448_add ( &L5, &a->z0, &a->z0 ); + p448_bias ( &L5, 1 ); + p448_add ( &L3, &L5, &L5 ); + p448_add ( &L5, &L3, &L4 ); + 
p448_weak_reduce( &L5 ); + p448_mul ( &L3, &a->xd, &L5 ); + L1 = p448_is_zero( &a->zd ); + L2 = - L1; + p448_mask ( &L4, &L3, L1 ); + p448_add ( &L5, &L4, &a->zd ); + L0 = ~ L1; + p448_mul ( &L4, sbz, &L6 ); + p448_addw ( &L4, L2 ); + p448_mul ( &L6, &L5, &L4 ); + p448_mul ( &L4, &L6, &L5 ); + p448_mul ( &L5, &L6, &a->xd ); + p448_mul ( &L6, &L4, &L5 ); + p448_isr ( &L3, &L6 ); + p448_mul ( &L5, &L4, &L3 ); + p448_sqr ( &L4, &L3 ); + p448_mul ( &L3, &L6, &L4 ); + p448_mask ( b, &L5, L0 ); + p448_subw ( &L3, 1 ); + p448_bias ( &L3, 1 ); + L1 = p448_is_zero( &L3 ); + L0 = p448_is_zero( sbz ); + return L1 | L0; +} + +void +serialize_extensible ( + struct p448_t* b, + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sub ( &L0, &a->y, &a->z ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_add ( b, &a->z, &a->y ); + p448_mul ( &L1, &a->z, &a->x ); + p448_mul ( &L2, &L0, &L1 ); + p448_mul ( &L1, &L2, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( &L2, &L1, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( b, &L1, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, &L2, &L1 ); +} + +void +untwist_and_double_and_serialize ( + struct p448_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_mul ( &L3, &a->y, &a->x ); + p448_add ( b, &a->y, &a->x ); + p448_sqr ( &L1, b ); + p448_add ( &L2, &L3, &L3 ); + p448_sub ( b, &L1, &L2 ); + p448_bias ( b, 3 ); + p448_weak_reduce( b ); + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &L2 ); + p448_add ( &L2, b, b ); + p448_mulw ( b, &L2, 39082 ); + p448_neg ( &L2, b ); + p448_bias ( &L2, 2 ); + p448_mulw ( &L0, &L2, 39082 ); + p448_neg ( b, &L0 ); + p448_bias ( b, 2 ); + p448_mul ( &L0, &L2, &L1 ); + p448_mul ( &L2, b, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( &L1, b, &L0 ); + p448_sqr ( b, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( b, &L1, &L3 ); +} + +void +twist_even ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L0, L1; + p448_sqr ( &b->y, 
&a->z ); + p448_sqr ( &b->z, &a->x ); + p448_sub ( &b->u, &b->y, &b->z ); + p448_bias ( &b->u, 2 ); + p448_weak_reduce( &b->u ); + p448_sub ( &b->z, &a->z, &a->x ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->y, &b->z, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->x, &b->z, &b->y ); + p448_mul ( &b->t, &b->x, &b->u ); + p448_mul ( &b->y, &b->x, &b->t ); + p448_isr ( &b->t, &b->y ); + p448_mul ( &b->u, &b->x, &b->t ); + p448_sqr ( &b->x, &b->t ); + p448_mul ( &b->t, &b->y, &b->x ); + p448_mul ( &b->x, &a->x, &b->u ); + p448_mul ( &b->y, &a->y, &b->u ); + L1 = p448_is_zero( &b->z ); + L0 = - L1; + p448_addw ( &b->y, L0 ); + p448_weak_reduce( &b->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +void +test_only_twist ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L0, L1; + struct p448_t L2, L3; + p448_sqr ( &b->u, &a->z ); + p448_sqr ( &b->y, &a->x ); + p448_sub ( &b->z, &b->u, &b->y ); + p448_bias ( &b->z, 2 ); + p448_add ( &b->y, &b->z, &b->z ); + p448_add ( &b->u, &b->y, &b->y ); + p448_weak_reduce( &b->u ); + p448_sub ( &b->y, &a->z, &a->x ); + p448_bias ( &b->y, 2 ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->x, &b->y, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->t, &b->z, &b->x ); + p448_mul ( &L3, &b->t, &b->u ); + p448_mul ( &b->x, &b->t, &L3 ); + p448_isr ( &L2, &b->x ); + p448_mul ( &b->u, &b->t, &L2 ); + p448_sqr ( &L3, &L2 ); + p448_mul ( &b->t, &b->x, &L3 ); + p448_add ( &b->x, &a->y, &a->x ); + p448_weak_reduce( &b->x ); + p448_sub ( &L2, &a->x, &a->y ); + p448_bias ( &L2, 2 ); + p448_weak_reduce( &L2 ); + p448_mul ( &L3, &b->t, &L2 ); + p448_add ( &L2, &L3, &b->x ); + p448_sub ( &b->t, &b->x, &L3 ); + p448_bias ( &b->t, 2 ); + p448_weak_reduce( &b->t ); + p448_mul ( &b->x, &L2, &b->u ); + L0 = p448_is_zero( 
&b->y ); + L1 = - L0; + p448_addw ( &b->x, L1 ); + p448_weak_reduce( &b->x ); + p448_mul ( &b->y, &b->t, &b->u ); + L0 = p448_is_zero( &b->z ); + L1 = - L0; + p448_addw ( &b->y, L1 ); + p448_weak_reduce( &b->y ); + L1 = p448_is_zero( &a->y ); + L0 = L1 + 1; + p448_set_ui( &b->z, L0 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +mask_t +is_square ( + const struct p448_t* x +) { + mask_t L0, L1; + struct p448_t L2, L3; + p448_isr ( &L2, x ); + p448_sqr ( &L3, &L2 ); + p448_mul ( &L2, x, &L3 ); + p448_subw ( &L2, 1 ); + p448_bias ( &L2, 1 ); + L1 = p448_is_zero( &L2 ); + L0 = p448_is_zero( x ); + return L1 | L0; +} + +mask_t +is_even_pt ( + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +is_even_tw ( + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_add ( &L0, &L1, &L2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +deserialize_affine ( + struct affine_t* a, + const struct p448_t* sz +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L1, sz ); + p448_copy ( &L3, &L1 ); + p448_addw ( &L3, 1 ); + p448_sqr ( &a->x, &L3 ); + p448_mulw ( &L3, &a->x, 39082 ); + p448_neg ( &a->x, &L3 ); + p448_add ( &L3, &L1, &L1 ); + p448_bias ( &L3, 1 ); + p448_add ( &a->y, &L3, &L3 ); + p448_add ( &L3, &a->y, &a->x ); + p448_weak_reduce( &L3 ); + p448_copy ( &a->y, &L1 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &a->x, &a->y ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_mul ( &a->y, &a->x, &L3 ); + p448_sqr ( &L2, &a->x ); + p448_mul ( &L0, &L2, &a->y ); + p448_mul ( &a->y, &a->x, &L0 ); + p448_isr ( &L3, &a->y ); + p448_mul ( &a->y, &L2, &L3 ); + p448_sqr ( &L2, &L3 ); + p448_mul ( &L3, &L0, &L2 ); + p448_mul ( &L0, &a->x, &L3 ); + p448_add ( &L2, &a->y, &a->y 
); + p448_mul ( &a->x, sz, &L2 ); + p448_addw ( &L1, 1 ); + p448_mul ( &a->y, &L1, &L3 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + return p448_is_zero( &L0 ); +} + +mask_t +deserialize_and_twist_approx ( + struct tw_extensible_t* a, + const struct p448_t* sdm1, + const struct p448_t* sz +) { + struct p448_t L0, L1; + p448_sqr ( &a->z, sz ); + p448_copy ( &a->y, &a->z ); + p448_addw ( &a->y, 1 ); + p448_sqr ( &a->x, &a->y ); + p448_mulw ( &a->y, &a->x, 39082 ); + p448_neg ( &a->x, &a->y ); + p448_add ( &a->y, &a->z, &a->z ); + p448_bias ( &a->y, 1 ); + p448_add ( &a->u, &a->y, &a->y ); + p448_add ( &a->y, &a->u, &a->x ); + p448_weak_reduce( &a->y ); + p448_sqr ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &a->u, &a->x ); + p448_bias ( &a->u, 2 ); + p448_weak_reduce( &a->u ); + p448_mul ( &a->x, sdm1, &a->u ); + p448_mul ( &L0, &a->x, &a->y ); + p448_mul ( &a->t, &L0, &a->y ); + p448_mul ( &a->u, &a->x, &a->t ); + p448_mul ( &a->t, &a->u, &L0 ); + p448_mul ( &a->y, &a->x, &a->t ); + p448_isr ( &L0, &a->y ); + p448_mul ( &a->y, &a->u, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &a->u, &a->t, &L1 ); + p448_mul ( &a->t, &a->x, &a->u ); + p448_add ( &a->x, sz, sz ); + p448_mul ( &L0, &a->u, &a->x ); + p448_copy ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &L1, &a->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &a->x, &L1, &L0 ); + p448_mul ( &L0, &a->u, &a->y ); + p448_addw ( &a->z, 1 ); + p448_mul ( &a->y, &a->z, &L0 ); + p448_subw ( &a->t, 1 ); + p448_bias ( &a->t, 1 ); + mask_t ret = p448_is_zero( &a->t ); + p448_set_ui( &a->z, 1 ); + p448_copy ( &a->t, &a->x ); + p448_copy ( &a->u, &a->y ); + return ret; +} + +void +set_identity_extensible ( + struct extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_tw_extensible ( + struct tw_extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + 
p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_affine ( + struct affine_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); +} + +mask_t +eq_affine ( + const struct affine_t* a, + const struct affine_t* b +) { + mask_t L0, L1; + struct p448_t L2; + p448_sub ( &L2, &a->x, &b->x ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_sub ( &L2, &a->y, &b->y ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +eq_extensible ( + const struct extensible_t* a, + const struct extensible_t* b +) { + mask_t L0, L1; + struct p448_t L2, L3, L4; + p448_mul ( &L4, &b->z, &a->x ); + p448_mul ( &L3, &a->z, &b->x ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_mul ( &L4, &b->z, &a->y ); + p448_mul ( &L3, &a->z, &b->y ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +eq_tw_extensible ( + const struct tw_extensible_t* a, + const struct tw_extensible_t* b +) { + mask_t L0, L1; + struct p448_t L2, L3, L4; + p448_mul ( &L4, &b->z, &a->x ); + p448_mul ( &L3, &a->z, &b->x ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_mul ( &L4, &b->z, &a->y ); + p448_mul ( &L3, &a->z, &b->y ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +void +elligator_2s_inject ( + struct affine_t* a, + const struct p448_t* r +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5, L6, L7, L8, L9; + p448_sqr ( &a->x, r ); + p448_sqr ( &L3, &a->x ); + p448_copy ( &a->y, &L3 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &L9, &a->y ); + p448_bias ( &L9, 2 ); + p448_weak_reduce( &L9 ); + p448_sqr ( &L2, &L9 ); + p448_mulw ( &L8, &L2, 1527402724 ); + p448_mulw ( &L7, &L3, 6108985600 ); + p448_add ( &a->y, &L7, &L8 ); + p448_weak_reduce( &a->y ); + p448_mulw ( &L8, &L2, 6109454568 ); 
+ p448_sub ( &L7, &a->y, &L8 ); + p448_bias ( &L7, 2 ); + p448_weak_reduce( &L7 ); + p448_mulw ( &L4, &a->y, 78160 ); + p448_mul ( &L6, &L7, &L9 ); + p448_mul ( &L8, &L6, &L4 ); + p448_mul ( &L4, &L7, &L8 ); + p448_isr ( &L5, &L4 ); + p448_mul ( &L4, &L6, &L5 ); + p448_sqr ( &L6, &L5 ); + p448_mul ( &L5, &L8, &L6 ); + p448_mul ( &L8, &L7, &L5 ); + p448_mul ( &L7, &L8, &L5 ); + p448_copy ( &L5, &a->x ); + p448_subw ( &L5, 1 ); + p448_addw ( &a->x, 1 ); + p448_mul ( &L6, &a->x, &L8 ); + p448_sub ( &a->x, &L5, &L6 ); + p448_bias ( &a->x, 3 ); + p448_weak_reduce( &a->x ); + p448_mul ( &L5, &L4, &a->x ); + p448_mulw ( &L4, &L5, 78160 ); + p448_neg ( &a->x, &L4 ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_add ( &L4, &L3, &L3 ); + p448_add ( &L3, &L4, &L2 ); + p448_subw ( &L3, 2 ); + p448_bias ( &L3, 1 ); + p448_weak_reduce( &L3 ); + p448_mul ( &L2, &L3, &L8 ); + p448_mulw ( &L3, &L2, 3054649120 ); + p448_add ( &L2, &L3, &a->y ); + p448_mul ( &a->y, &L7, &L2 ); + L1 = p448_is_zero( &L9 ); + L0 = - L1; + p448_addw ( &a->y, L0 ); + p448_weak_reduce( &a->y ); +} + +mask_t +validate_affine ( + const struct affine_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L0, &a->y ); + p448_sqr ( &L2, &a->x ); + p448_add ( &L3, &L2, &L0 ); + p448_subw ( &L3, 1 ); + p448_mulw ( &L1, &L2, 39081 ); + p448_neg ( &L2, &L1 ); + p448_bias ( &L2, 2 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sub ( &L0, &L3, &L1 ); + p448_bias ( &L0, 3 ); + return p448_is_zero( &L0 ); +} + +mask_t +validate_tw_extensible ( + const struct tw_extensible_t* ext +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5; + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L2, &ext->t, &ext->u ); + p448_mul ( &L4, &ext->z, &L2 ); + p448_addw ( &L4, 0 ); + p448_mul ( &L3, &ext->x, &ext->y ); + p448_neg ( &L2, &L3 ); + p448_add ( &L3, &L2, &L4 ); + p448_bias ( &L3, 2 ); + L1 = p448_is_zero( &L3 ); + /* + * Check invariant: + * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 + */ + p448_sqr ( 
&L4, &ext->y ); + p448_neg ( &L2, &L4 ); + p448_addw ( &L2, 0 ); + p448_sqr ( &L3, &ext->x ); + p448_add ( &L4, &L3, &L2 ); + p448_sqr ( &L5, &ext->u ); + p448_sqr ( &L3, &ext->t ); + p448_mul ( &L2, &L3, &L5 ); + p448_mulw ( &L3, &L2, 39081 ); + p448_neg ( &L5, &L3 ); + p448_add ( &L3, &L5, &L4 ); + p448_neg ( &L5, &L2 ); + p448_add ( &L4, &L5, &L3 ); + p448_sqr ( &L3, &ext->z ); + p448_add ( &L2, &L3, &L4 ); + p448_bias ( &L2, 4 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +validate_extensible ( + const struct extensible_t* ext +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5; + /* + * Check invariant: + * 0 = d*t^2*u^2 - x^2 - y^2 + z^2 + */ + p448_sqr ( &L4, &ext->y ); + p448_neg ( &L3, &L4 ); + p448_addw ( &L3, 0 ); + p448_sqr ( &L2, &ext->z ); + p448_add ( &L4, &L2, &L3 ); + p448_sqr ( &L5, &ext->u ); + p448_sqr ( &L2, &ext->t ); + p448_mul ( &L3, &L2, &L5 ); + p448_mulw ( &L5, &L3, 39081 ); + p448_neg ( &L2, &L5 ); + p448_add ( &L3, &L2, &L4 ); + p448_sqr ( &L2, &ext->x ); + p448_neg ( &L4, &L2 ); + p448_add ( &L2, &L4, &L3 ); + p448_bias ( &L2, 4 ); + L1 = p448_is_zero( &L2 ); + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L3, &ext->t, &ext->u ); + p448_mul ( &L4, &ext->z, &L3 ); + p448_addw ( &L4, 0 ); + p448_mul ( &L2, &ext->x, &ext->y ); + p448_neg ( &L3, &L2 ); + p448_add ( &L2, &L3, &L4 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + + diff --git a/src/arch_arm_32/p448.c b/src/arch_arm_32/p448.c new file mode 100644 index 0000000..c764955 --- /dev/null +++ b/src/arch_arm_32/p448.c @@ -0,0 +1,1021 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ + +#include "word.h" +#include "p448.h" +//#include "x86-64-arith.h" + +static inline mask_t __attribute__((always_inline)) +is_zero ( + word_t x +) { + dword_t xx = x; + xx--; + return xx >> WORD_BITS; +} + +static uint64_t widemul_32 ( + const uint32_t a, + const uint32_t b +) { + return ((uint64_t)a)* b; +} + +static inline void __attribute__((gnu_inline,always_inline)) +smlal ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + uint32_t lo = *acc, hi = (*acc)>>32; + + __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" + : [lo]"+&r"(lo), [hi]"+&r"(hi) + : [a]"r"(a), [b]"r"(b)); + + *acc = lo + (((uint64_t)hi)<<32); +} + +static inline void __attribute__((gnu_inline,always_inline)) +smlal2 ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + uint32_t lo = *acc, hi = (*acc)>>32; + + __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" + : [lo]"+&r"(lo), [hi]"+&r"(hi) + : [a]"r"(a), [b]"r"(2*b)); + + *acc = lo + (((uint64_t)hi)<<32); +} + +static inline void __attribute__((gnu_inline,always_inline)) +smull ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + uint32_t lo, hi; + + __asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]" + : [lo]"=&r"(lo), [hi]"=&r"(hi) + : [a]"r"(a), [b]"r"(b)); + + *acc = lo + (((uint64_t)hi)<<32); +} + +static inline void __attribute__((gnu_inline,always_inline)) +smull2 ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + uint32_t lo, hi; + + __asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]" + : [lo]"=&r"(lo), [hi]"=&r"(hi) + : [a]"r"(a), [b]"r"(2*b)); + + *acc = lo + (((uint64_t)hi)<<32); +} + +void +p448_mul ( + p448_t *__restrict__ cs, + const p448_t *as, + const p448_t *bs +) { + // p448_t ar, br; +// p448_copy(&ar,as); +// p448_copy(&br,bs); +// p448_weak_reduce(&ar); +// p448_weak_reduce(&br); + + const uint32_t *a = as->limb, *b = bs->limb; + uint32_t *c = cs->limb; + + uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1; + uint32_t mask = 
(1<<28) - 1; + + uint32_t aa[8], bm[8]; + + /* For some reason clang doesn't vectorize this without prompting? */ + // unsigned int i; + // for (i=0; i> 28; + accum3 += accum1 >> 28; + + c[0] = ((uint32_t)(accum0)) & mask; + c[1] = ((uint32_t)(accum2)) & mask; + c[8] = ((uint32_t)(accum1)) & mask; + c[9] = ((uint32_t)(accum3)) & mask; + + accumC0 = accum2 >> 28; + accumC1 = accum3 >> 28; + } + { + /* t^3 terms */ + smull(&accum1, ax = aa[3], bx = b[15]); + smull(&accum3, ax = aa[4], bx); + smlal(&accum1, ax, bx = b[14]); + smlal(&accum3, ax = aa[5], bx); + smlal(&accum1, ax, bx = b[13]); + smlal(&accum3, ax = aa[6], bx); + smlal(&accum1, ax, bx = b[12]); + smlal(&accum3, ax = aa[7], bx); + smlal(&accum1, ax, bx = b[11]); + + accum0 = accum1; + accum2 = accum3; + + /* t^2 terms */ + smlal(&accum2, ax = aa[0], bx); + smlal(&accum0, ax, bx = b[10]); + smlal(&accum2, ax = aa[1], bx); + smlal(&accum0, ax, bx = b[9]); + smlal(&accum2, ax = aa[2], bx); + smlal(&accum0, ax, bx = b[8]); + smlal(&accum2, ax = aa[3], bx); + + smlal(&accum0, ax = a[11], bx = b[7]); + smlal(&accum2, ax = a[12], bx); + smlal(&accum0, ax, bx = b[6]); + smlal(&accum2, ax = a[13], bx); + smlal(&accum0, ax, bx = b[5]); + smlal(&accum2, ax = a[14], bx); + smlal(&accum0, ax, bx = b[4]); + smlal(&accum2, ax = a[15], bx); + smlal(&accum0, ax, bx = b[3]); + + /* t terms */ + accum1 += accum0; + accum3 += accum2; + smlal(&accum3, ax = a[8], bx); + smlal(&accum1, ax, bx = b[2]); + smlal(&accum3, ax = a[9], bx); + smlal(&accum1, ax, bx = b[1]); + smlal(&accum3, ax = a[10], bx); + smlal(&accum1, ax, bx = b[0]); + smlal(&accum3, ax = a[11], bx); + + smlal(&accum1, ax = a[3], bx = bm[7]); + smlal(&accum3, ax = a[4], bx); + smlal(&accum1, ax, bx = bm[6]); + smlal(&accum3, ax = a[5], bx); + smlal(&accum1, ax, bx = bm[5]); + smlal(&accum3, ax = a[6], bx); + smlal(&accum1, ax, bx = bm[4]); + smlal(&accum3, ax = a[7], bx); + smlal(&accum1, ax, bx = bm[3]); + + /* 1 terms */ + smlal(&accum2, ax = a[0], bx); + 
smlal(&accum0, ax, bx = bm[2]); + smlal(&accum2, ax = a[1], bx); + smlal(&accum0, ax, bx = bm[1]); + smlal(&accum2, ax = a[2], bx); + smlal(&accum0, ax, bx = bm[0]); + smlal(&accum2, ax = a[3], bx); + + accum0 += accumC0; + accum1 += accumC1; + accum2 += accum0 >> 28; + accum3 += accum1 >> 28; + + c[2] = ((uint32_t)(accum0)) & mask; + c[3] = ((uint32_t)(accum2)) & mask; + c[10] = ((uint32_t)(accum1)) & mask; + c[11] = ((uint32_t)(accum3)) & mask; + + accumC0 = accum2 >> 28; + accumC1 = accum3 >> 28; + } + { + + /* t^3 terms */ + smull(&accum1, ax = aa[5], bx = b[15]); + smull(&accum3, ax = aa[6], bx); + smlal(&accum1, ax, bx = b[14]); + smlal(&accum3, ax = aa[7], bx); + smlal(&accum1, ax, bx = b[13]); + + accum0 = accum1; + accum2 = accum3; + + /* t^2 terms */ + + smlal(&accum2, ax = aa[0], bx); + smlal(&accum0, ax, bx = b[12]); + smlal(&accum2, ax = aa[1], bx); + smlal(&accum0, ax, bx = b[11]); + smlal(&accum2, ax = aa[2], bx); + smlal(&accum0, ax, bx = b[10]); + smlal(&accum2, ax = aa[3], bx); + smlal(&accum0, ax, bx = b[9]); + smlal(&accum2, ax = aa[4], bx); + smlal(&accum0, ax, bx = b[8]); + smlal(&accum2, ax = aa[5], bx); + + + smlal(&accum0, ax = a[13], bx = b[7]); + smlal(&accum2, ax = a[14], bx); + smlal(&accum0, ax, bx = b[6]); + smlal(&accum2, ax = a[15], bx); + smlal(&accum0, ax, bx = b[5]); + + /* t terms */ + accum1 += accum0; + accum3 += accum2; + + smlal(&accum3, ax = a[8], bx); + smlal(&accum1, ax, bx = b[4]); + smlal(&accum3, ax = a[9], bx); + smlal(&accum1, ax, bx = b[3]); + smlal(&accum3, ax = a[10], bx); + smlal(&accum1, ax, bx = b[2]); + smlal(&accum3, ax = a[11], bx); + smlal(&accum1, ax, bx = b[1]); + smlal(&accum3, ax = a[12], bx); + smlal(&accum1, ax, bx = b[0]); + smlal(&accum3, ax = a[13], bx); + + + smlal(&accum1, ax = a[5], bx = bm[7]); + smlal(&accum3, ax = a[6], bx); + smlal(&accum1, ax, bx = bm[6]); + smlal(&accum3, ax = a[7], bx); + smlal(&accum1, ax, bx = bm[5]); + + /* 1 terms */ + + smlal(&accum2, ax = a[0], bx); + smlal(&accum0, 
ax, bx = bm[4]); + smlal(&accum2, ax = a[1], bx); + smlal(&accum0, ax, bx = bm[3]); + smlal(&accum2, ax = a[2], bx); + smlal(&accum0, ax, bx = bm[2]); + smlal(&accum2, ax = a[3], bx); + smlal(&accum0, ax, bx = bm[1]); + smlal(&accum2, ax = a[4], bx); + smlal(&accum0, ax, bx = bm[0]); + smlal(&accum2, ax = a[5], bx); + + accum0 += accumC0; + accum1 += accumC1; + accum2 += accum0 >> 28; + accum3 += accum1 >> 28; + + c[4] = ((uint32_t)(accum0)) & mask; + c[5] = ((uint32_t)(accum2)) & mask; + c[12] = ((uint32_t)(accum1)) & mask; + c[13] = ((uint32_t)(accum3)) & mask; + + accumC0 = accum2 >> 28; + accumC1 = accum3 >> 28; + } + { + + /* t^3 terms */ + smull(&accum1, ax = aa[7], bx = b[15]); + accum0 = accum1; + + /* t^2 terms */ + + smull(&accum2, ax = aa[0], bx); + smlal(&accum0, ax, bx = b[14]); + smlal(&accum2, ax = aa[1], bx); + smlal(&accum0, ax, bx = b[13]); + smlal(&accum2, ax = aa[2], bx); + smlal(&accum0, ax, bx = b[12]); + smlal(&accum2, ax = aa[3], bx); + smlal(&accum0, ax, bx = b[11]); + smlal(&accum2, ax = aa[4], bx); + smlal(&accum0, ax, bx = b[10]); + smlal(&accum2, ax = aa[5], bx); + smlal(&accum0, ax, bx = b[9]); + smlal(&accum2, ax = aa[6], bx); + smlal(&accum0, ax, bx = b[8]); + smlal(&accum2, ax = aa[7], bx); + + + smlal(&accum0, ax = a[15], bx = b[7]); + + /* t terms */ + accum1 += accum0; + accum3 = accum2; + + smlal(&accum3, ax = a[8], bx); + smlal(&accum1, ax, bx = b[6]); + smlal(&accum3, ax = a[9], bx); + smlal(&accum1, ax, bx = b[5]); + smlal(&accum3, ax = a[10], bx); + smlal(&accum1, ax, bx = b[4]); + smlal(&accum3, ax = a[11], bx); + smlal(&accum1, ax, bx = b[3]); + smlal(&accum3, ax = a[12], bx); + smlal(&accum1, ax, bx = b[2]); + smlal(&accum3, ax = a[13], bx); + smlal(&accum1, ax, bx = b[1]); + smlal(&accum3, ax = a[14], bx); + smlal(&accum1, ax, bx = b[0]); + smlal(&accum3, ax = a[15], bx); + + + smlal(&accum1, ax = a[7], bx = bm[7]); + + /* 1 terms */ + + smlal(&accum2, ax = a[0], bx); + smlal(&accum0, ax, bx = bm[6]); + smlal(&accum2, ax 
= a[1], bx); + smlal(&accum0, ax, bx = bm[5]); + smlal(&accum2, ax = a[2], bx); + smlal(&accum0, ax, bx = bm[4]); + smlal(&accum2, ax = a[3], bx); + smlal(&accum0, ax, bx = bm[3]); + smlal(&accum2, ax = a[4], bx); + smlal(&accum0, ax, bx = bm[2]); + smlal(&accum2, ax = a[5], bx); + smlal(&accum0, ax, bx = bm[1]); + smlal(&accum2, ax = a[6], bx); + smlal(&accum0, ax, bx = bm[0]); + smlal(&accum2, ax = a[7], bx); + + accum0 += accumC0; + accum1 += accumC1; + accum2 += accum0 >> 28; + accum3 += accum1 >> 28; + + c[6] = ((uint32_t)(accum0)) & mask; + c[7] = ((uint32_t)(accum2)) & mask; + c[14] = ((uint32_t)(accum1)) & mask; + c[15] = ((uint32_t)(accum3)) & mask; + + accum0 = accum2 >> 28; + accum1 = accum3 >> 28; + } + + accum0 += accum1; + accum0 += c[8]; + accum1 += c[0]; + c[8] = ((uint32_t)(accum0)) & mask; + c[0] = ((uint32_t)(accum1)) & mask; + + accum0 >>= 28; + accum1 >>= 28; + c[9] += ((uint32_t)(accum0)); + c[1] += ((uint32_t)(accum1)); +} + +void +p448_sqr ( + p448_t *__restrict__ cs, + const p448_t *as +) { + // p448_t ar, br; +// p448_copy(&ar,as); +// p448_copy(&br,bs); +// p448_weak_reduce(&ar); +// p448_weak_reduce(&br); + + const uint32_t *a = as->limb; + uint32_t *c = cs->limb; + + uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1, tmp; + uint32_t mask = (1<<28) - 1; + + uint32_t bm[8]; + + /* For some reason clang doesn't vectorize this without prompting? 
*/ + // unsigned int i; + // for (i=0; i> 28; + accum3 += accum1 >> 28; + + c[0] = ((uint32_t)(accum0)) & mask; + c[1] = ((uint32_t)(accum2)) & mask; + c[8] = ((uint32_t)(accum1)) & mask; + c[9] = ((uint32_t)(accum3)) & mask; + + accumC0 = accum2 >> 28; + accumC1 = accum3 >> 28; + } + { + /* t^3 terms */ + smull2(&accum1, ax = a[11], bx = a[15]); + smull2(&accum3, ax = a[12], bx); + smlal2(&accum1, ax, bx = a[14]); + smlal2(&accum3, ax = a[13], bx); + smlal(&accum1, ax, ax); + + accum0 = accum1; + accum2 = accum3; + + /* t^2 terms */ + smlal2(&accum2, ax = a[8], bx = a[11]); + smlal2(&accum0, ax, bx = a[10]); + smlal2(&accum2, ax = a[9], bx); + smlal(&accum0, ax, ax); + + smlal2(&accum0, ax = a[3], bx = a[7]); + smlal2(&accum2, ax = a[4], bx); + smlal2(&accum0, ax, bx = a[6]); + smlal2(&accum2, ax = a[5], bx); + smlal(&accum0, ax, ax); + + /* t terms */ + accum1 += accum0; + accum3 += accum2; + smlal2(&accum3, ax = a[0], bx = a[3]); + smlal2(&accum1, ax, bx = a[2]); + smlal2(&accum3, ax = a[1], bx); + smlal(&accum1, ax, ax); + + accum1 = -accum1; + accum3 = -accum3; + accum2 = -accum2; + accum0 = -accum0; + + smlal2(&accum1, ax = bm[3], bx = bm[7]); + smlal2(&accum3, ax = bm[4], bx); + smlal2(&accum1, ax, bx = bm[6]); + smlal2(&accum3, ax = bm[5], bx); + smlal(&accum1, ax, ax); + + /* 1 terms */ + smlal2(&accum2, ax = bm[0], bx = bm[3]); + smlal2(&accum0, ax, bx = bm[2]); + smlal2(&accum2, ax = bm[1], bx); + smlal(&accum0, ax, ax); + + + tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; + tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; + + accum0 += accumC0; + accum1 += accumC1; + accum2 += accum0 >> 28; + accum3 += accum1 >> 28; + + c[2] = ((uint32_t)(accum0)) & mask; + c[3] = ((uint32_t)(accum2)) & mask; + c[10] = ((uint32_t)(accum1)) & mask; + c[11] = ((uint32_t)(accum3)) & mask; + + accumC0 = accum2 >> 28; + accumC1 = accum3 >> 28; + } + { + + /* t^3 terms */ + smull2(&accum1, ax = a[13], bx = a[15]); + smull2(&accum3, ax = a[14], bx); + smlal(&accum1, ax, 
ax); + + accum0 = accum1; + accum2 = accum3; + + /* t^2 terms */ + + smlal2(&accum2, ax = a[8], bx = a[13]); + smlal2(&accum0, ax, bx = a[12]); + smlal2(&accum2, ax = a[9], bx); + smlal2(&accum0, ax, bx = a[11]); + smlal2(&accum2, ax = a[10], bx); + smlal(&accum0, ax, ax); + + + smlal2(&accum0, ax = a[5], bx = a[7]); + smlal2(&accum2, ax = a[6], bx); + smlal(&accum0, ax, ax); + + /* t terms */ + accum1 += accum0; + accum3 += accum2; + + smlal2(&accum3, ax = a[0], bx = a[5]); + smlal2(&accum1, ax, bx = a[4]); + smlal2(&accum3, ax = a[1], bx); + smlal2(&accum1, ax, bx = a[3]); + smlal2(&accum3, ax = a[2], bx); + smlal(&accum1, ax, ax); + + accum1 = -accum1; + accum3 = -accum3; + accum2 = -accum2; + accum0 = -accum0; + + smlal2(&accum1, ax = bm[5], bx = bm[7]); + smlal2(&accum3, ax = bm[6], bx); + smlal(&accum1, ax, ax); + + /* 1 terms */ + + smlal2(&accum2, ax = bm[0], bx = bm[5]); + smlal2(&accum0, ax, bx = bm[4]); + smlal2(&accum2, ax = bm[1], bx); + smlal2(&accum0, ax, bx = bm[3]); + smlal2(&accum2, ax = bm[2], bx); + smlal(&accum0, ax, ax); + + + tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; + tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; + + accum0 += accumC0; + accum1 += accumC1; + accum2 += accum0 >> 28; + accum3 += accum1 >> 28; + + c[4] = ((uint32_t)(accum0)) & mask; + c[5] = ((uint32_t)(accum2)) & mask; + c[12] = ((uint32_t)(accum1)) & mask; + c[13] = ((uint32_t)(accum3)) & mask; + + accumC0 = accum2 >> 28; + accumC1 = accum3 >> 28; + } + { + + /* t^3 terms */ + smull(&accum1, ax = a[15], bx = a[15]); + accum0 = accum1; + + /* t^2 terms */ + + smull2(&accum2, ax = a[8], bx); + smlal2(&accum0, ax, bx = a[14]); + smlal2(&accum2, ax = a[9], bx); + smlal2(&accum0, ax, bx = a[13]); + smlal2(&accum2, ax = a[10], bx); + smlal2(&accum0, ax, bx = a[12]); + smlal2(&accum2, ax = a[11], bx); + smlal(&accum0, ax, ax); + + + smlal(&accum0, ax = a[7], bx = a[7]); + + /* t terms */ + accum1 += accum0; + accum3 = accum2; + + smlal2(&accum3, ax = a[0], bx); + 
smlal2(&accum1, ax, bx = a[6]); + smlal2(&accum3, ax = a[1], bx); + smlal2(&accum1, ax, bx = a[5]); + smlal2(&accum3, ax = a[2], bx); + smlal2(&accum1, ax, bx = a[4]); + smlal2(&accum3, ax = a[3], bx); + smlal(&accum1, ax, ax); + + accum1 = -accum1; + accum3 = -accum3; + accum2 = -accum2; + accum0 = -accum0; + + bx = bm[7]; + smlal(&accum1, bx, bx); + + /* 1 terms */ + + smlal2(&accum2, ax = bm[0], bx); + smlal2(&accum0, ax, bx = bm[6]); + smlal2(&accum2, ax = bm[1], bx); + smlal2(&accum0, ax, bx = bm[5]); + smlal2(&accum2, ax = bm[2], bx); + smlal2(&accum0, ax, bx = bm[4]); + smlal2(&accum2, ax = bm[3], bx); + smlal(&accum0, ax, ax); + + tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; + tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; + + + accum0 += accumC0; + accum1 += accumC1; + accum2 += accum0 >> 28; + accum3 += accum1 >> 28; + + c[6] = ((uint32_t)(accum0)) & mask; + c[7] = ((uint32_t)(accum2)) & mask; + c[14] = ((uint32_t)(accum1)) & mask; + c[15] = ((uint32_t)(accum3)) & mask; + + accum0 = accum2 >> 28; + accum1 = accum3 >> 28; + } + + accum0 += accum1; + accum0 += c[8]; + accum1 += c[0]; + c[8] = ((uint32_t)(accum0)) & mask; + c[0] = ((uint32_t)(accum1)) & mask; + + accum0 >>= 28; + accum1 >>= 28; + c[9] += ((uint32_t)(accum0)); + c[1] += ((uint32_t)(accum1)); +} + +void +p448_mulw ( + p448_t *__restrict__ cs, + const p448_t *as, + uint64_t b +) { + const uint32_t bhi = b>>28, blo = b & (1<<28)-1; + + const uint32_t *a = as->limb; + uint32_t *c = cs->limb; + + uint64_t accum0, accum8; + uint32_t mask = (1ull<<28)-1; + + int i; + + uint32_t c0, c8, n0, n8; + accum0 = widemul_32(bhi, a[15]); + accum8 = widemul_32(bhi, a[15] + a[7]); + c0 = a[0]; c8 = a[8]; + smlal(&accum0, blo, c0); + smlal(&accum8, blo, c8); + + c[0] = accum0 & mask; accum0 >>= 28; + c[8] = accum8 & mask; accum8 >>= 28; + + i=1; + { + n0 = a[i]; n8 = a[i+8]; + smlal(&accum0, bhi, c0); + smlal(&accum8, bhi, c8); + smlal(&accum0, blo, n0); + smlal(&accum8, blo, n8); + + c[i] = accum0 & 
mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + c0 = a[i]; c8 = a[i+8]; + smlal(&accum0, bhi, n0); + smlal(&accum8, bhi, n8); + smlal(&accum0, blo, c0); + smlal(&accum8, blo, c8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + n0 = a[i]; n8 = a[i+8]; + smlal(&accum0, bhi, c0); + smlal(&accum8, bhi, c8); + smlal(&accum0, blo, n0); + smlal(&accum8, blo, n8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + c0 = a[i]; c8 = a[i+8]; + smlal(&accum0, bhi, n0); + smlal(&accum8, bhi, n8); + smlal(&accum0, blo, c0); + smlal(&accum8, blo, c8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + n0 = a[i]; n8 = a[i+8]; + smlal(&accum0, bhi, c0); + smlal(&accum8, bhi, c8); + smlal(&accum0, blo, n0); + smlal(&accum8, blo, n8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + c0 = a[i]; c8 = a[i+8]; + smlal(&accum0, bhi, n0); + smlal(&accum8, bhi, n8); + smlal(&accum0, blo, c0); + smlal(&accum8, blo, c8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + n0 = a[i]; n8 = a[i+8]; + smlal(&accum0, bhi, c0); + smlal(&accum8, bhi, c8); + smlal(&accum0, blo, n0); + smlal(&accum8, blo, n8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + + accum0 += accum8 + c[8]; + c[8] = accum0 & mask; + c[9] += accum0 >> 28; + + accum8 += c[0]; + c[0] = accum8 & mask; + c[1] += accum8 >> 28; +} + +void +p448_strong_reduce ( + p448_t *a +) { + word_t mask = (1ull<<28)-1; + + /* first, clear high */ + a->limb[8] += a->limb[15]>>28; + a->limb[0] += a->limb[15]>>28; + a->limb[15] &= mask; + + /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */ + + /* compute total_value - p. No need to reduce mod p. 
*/ + + dsword_t scarry = 0; + int i; + for (i=0; i<16; i++) { + scarry = scarry + a->limb[i] - ((i==8)?mask-1:mask); + a->limb[i] = scarry & mask; + scarry >>= 28; + } + + /* uncommon case: it was >= p, so now scarry = 0 and this = x + * common case: it was < p, so now scarry = -1 and this = x - p + 2^448 + * so let's add back in p. will carry back off the top for 2^448. + */ + + assert(is_zero(scarry) | is_zero(scarry+1)); + + word_t scarry_mask = scarry & mask; + dword_t carry = 0; + + /* add it back */ + for (i=0; i<16; i++) { + carry = carry + a->limb[i] + ((i==8)?(scarry_mask&~1):scarry_mask); + a->limb[i] = carry & mask; + carry >>= 28; + } + + assert(is_zero(carry + scarry)); +} + +mask_t +p448_is_zero ( + const struct p448_t *a +) { + struct p448_t b; + p448_copy(&b,a); + p448_strong_reduce(&b); + + uint32_t any = 0; + int i; + for (i=0; i<16; i++) { + any |= b.limb[i]; + } + return is_zero(any); +} + +void +p448_serialize ( + uint8_t *serial, + const struct p448_t *x +) { + int i,j; + p448_t red; + p448_copy(&red, x); + p448_strong_reduce(&red); + for (i=0; i<8; i++) { + uint64_t limb = red.limb[2*i] + (((uint64_t)red.limb[2*i+1])<<28); + for (j=0; j<7; j++) { + serial[7*i+j] = limb; + limb >>= 8; + } + assert(limb == 0); + } +} + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +) { + int i,j; + for (i=0; i<8; i++) { + uint64_t out = 0; + for (j=0; j<7; j++) { + out |= ((uint64_t)serial[7*i+j])<<(8*j); + } + x->limb[2*i] = out & (1ull<<28)-1; + x->limb[2*i+1] = out >> 28; + } + + /* Check for reduction. + * + * The idea is to create a variable ge which is all ones (rather, 56 ones) + * if and only if the low $i$ words of $x$ are >= those of p. + * + * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) + */ + uint32_t ge = -1, mask = (1ull<<28)-1; + for (i=0; i<8; i++) { + ge &= x->limb[i]; + } + + /* At this point, ge = 1111 iff bottom are all 1111. 
Now propagate if 1110, or set if 1111 */ + ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask); + + /* Propagate the rest */ + for (i=9; i<16; i++) { + ge &= x->limb[i]; + } + + return ~is_zero(ge ^ mask); +} + +void +simultaneous_invert_p448( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +) { + if (n==0) { + return; + } else if (n==1) { + p448_inverse(out,in); + return; + } + + p448_copy(&out[1], &in[0]); + int i; + for (i=1; i<(int) (n-1); i++) { + p448_mul(&out[i+1], &out[i], &in[i]); + } + p448_mul(&out[0], &out[n-1], &in[n-1]); + + struct p448_t tmp; + p448_inverse(&tmp, &out[0]); + p448_copy(&out[0], &tmp); + + /* at this point, out[0] = product(in[i]) ^ -1 + * out[i] = product(in[0]..in[i-1]) if i != 0 + */ + for (i=n-1; i>0; i--) { + p448_mul(&tmp, &out[i], &out[0]); + p448_copy(&out[i], &tmp); + + p448_mul(&tmp, &out[0], &in[i]); + p448_copy(&out[0], &tmp); + } +} diff --git a/src/arch_arm_32/p448.h b/src/arch_arm_32/p448.h new file mode 100644 index 0000000..4628a89 --- /dev/null +++ b/src/arch_arm_32/p448.h @@ -0,0 +1,378 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ +#ifndef __P448_H__ +#define __P448_H__ 1 + +#include "word.h" + +#include +#include + +typedef struct p448_t { + uint32_t limb[16]; +} __attribute__((aligned(32))) p448_t; + +#ifdef __cplusplus +extern "C" { +#endif + +static __inline__ void +p448_set_ui ( + p448_t *out, + uint64_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t do_swap +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_add ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_neg ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_neg ( + p448_t *a, + mask_t doNegate +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_addw ( + p448_t *a, + uint32_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_subw ( + p448_t *a, + uint32_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_copy ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_weak_reduce ( + p448_t *inout +) __attribute__((unused,always_inline)); + +void +p448_strong_reduce ( + p448_t *inout +); + +mask_t +p448_is_zero ( + const p448_t *in +); + +static __inline__ void +p448_bias ( + p448_t *inout, + int amount +) __attribute__((unused,always_inline)); + +void +p448_mul ( + p448_t *__restrict__ out, + const p448_t *a, + const p448_t *b +); + +void +p448_mulw ( + p448_t *__restrict__ out, + const p448_t *a, + uint64_t b +); + +void +p448_sqr ( + p448_t *__restrict__ out, + const p448_t *a +); + +static __inline__ void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) __attribute__((unused,always_inline)); + +void +p448_serialize 
( + uint8_t *serial, + const struct p448_t *x +); + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +); + +static __inline__ void +p448_mask( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) __attribute__((unused,always_inline)); + +/** +* Returns 1/x. +* +* If x=0, returns 0. +*/ +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +); + +void +simultaneous_invert_p448 ( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +); + +static inline mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) __attribute__((always_inline,unused)); + +/* -------------- Inline functions begin here -------------- */ + +void +p448_set_ui ( + p448_t *out, + uint64_t x +) { + int i; + out->limb[0] = x & (1<<28)-1; + out->limb[1] = x>>28; + for (i=2; i<16; i++) { + out->limb[i] = 0; + } +} + +void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t doswap +) { + big_register_t *aa = (big_register_t*)a; + big_register_t *bb = (big_register_t*)b; + big_register_t m = doswap; + + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] + b->limb[i]; + } + */ +} + +void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] - b->limb[i]; + } + */ +} + +void +p448_neg ( + p448_t *out, + const p448_t *a +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = -a->limb[i]; + } + */ +} + +void +p448_cond_neg( + p448_t *a, + mask_t doNegate +) { + unsigned int i; + struct p448_t negated; + big_register_t *aa = (big_register_t *)a; + big_register_t *nn = (big_register_t*)&negated; + big_register_t m = doNegate; + + p448_neg(&negated, a); + p448_bias(&negated, 2); + + for (i=0; ilimb[0] += x; +} + +void +p448_subw ( + p448_t *a, + uint32_t x +) { + a->limb[0] -= x; +} + +void +p448_copy ( + p448_t *out, + const p448_t *a +) { + *out = *a; +} + +void +p448_bias ( + p448_t *a, + int amt 
+) { + uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt; + uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1}; + uint32x4_t *aa = (uint32x4_t*) a; + aa[0] += lo; + aa[1] += lo; + aa[2] += hi; + aa[3] += lo; +} + +void +p448_weak_reduce ( + p448_t *a +) { + uint64_t mask = (1ull<<28) - 1; + uint64_t tmp = a->limb[15] >> 28; + int i; + a->limb[8] += tmp; + for (i=15; i>0; i--) { + a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); + } + a->limb[0] = (a->limb[0] & mask) + tmp; +} + +void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) { + p448_t tmp; + assert(n>0); + if (n&1) { + p448_sqr(y,x); + n--; + } else { + p448_sqr(&tmp,x); + p448_sqr(y,&tmp); + n-=2; + } + for (; n; n-=2) { + p448_sqr(&tmp,y); + p448_sqr(y,&tmp); + } +} + +mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) { + struct p448_t ra, rb; + p448_copy(&ra, a); + p448_copy(&rb, b); + p448_weak_reduce(&ra); + p448_weak_reduce(&rb); + p448_sub(&ra, &ra, &rb); + p448_bias(&ra, 2); + return p448_is_zero(&ra); +} + +void +p448_mask ( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + a->limb[i] = b->limb[i] & mask; + } +} + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __P448_H__ */ diff --git a/src/arch_x86_64/ec_point.c b/src/arch_x86_64/ec_point.c new file mode 100644 index 0000000..87df79f --- /dev/null +++ b/src/arch_x86_64/ec_point.c @@ -0,0 +1,910 @@ +/** + * @cond internal + * @file ec_point.c + * @copyright + * Copyright (c) 2014 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. + * @author Mike Hamburg + * @warning This file was automatically generated. 
+ */ + +#include "ec_point.h" + + +void +p448_isr ( + struct p448_t* a, + const struct p448_t* x +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L1, x ); + p448_mul ( &L2, x, &L1 ); + p448_sqr ( &L1, &L2 ); + p448_mul ( &L2, x, &L1 ); + p448_sqrn ( &L1, &L2, 3 ); + p448_mul ( &L0, &L2, &L1 ); + p448_sqrn ( &L1, &L0, 3 ); + p448_mul ( &L0, &L2, &L1 ); + p448_sqrn ( &L2, &L0, 9 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sqr ( &L0, &L1 ); + p448_mul ( &L2, x, &L0 ); + p448_sqrn ( &L0, &L2, 18 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqrn ( &L0, &L2, 37 ); + p448_mul ( &L1, &L2, &L0 ); + p448_sqrn ( &L0, &L1, 37 ); + p448_mul ( &L1, &L2, &L0 ); + p448_sqrn ( &L0, &L1, 111 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqr ( &L0, &L2 ); + p448_mul ( &L1, x, &L0 ); + p448_sqrn ( &L0, &L1, 223 ); + p448_mul ( a, &L2, &L0 ); +} + +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +) { + struct p448_t L0, L1; + p448_isr ( &L0, x ); + p448_sqr ( &L1, &L0 ); + p448_sqr ( &L0, &L1 ); + p448_mul ( a, x, &L0 ); +} + +void +add_tw_niels_to_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_mul ( &L0, &e->a, &L1 ); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->b, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_sub ( &d->y, &d->z, &d->x ); + p448_bias ( &d->y, 2 ); + p448_add ( &L0, &d->x, &d->z ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +sub_tw_niels_from_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_mul ( &L0, &e->b, &L1 ); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->a, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + 
p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_add ( &d->y, &d->x, &d->z ); + p448_sub ( &L0, &d->z, &d->x ); + p448_bias ( &L0, 2 ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +add_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + add_tw_niels_to_tw_extensible( e, &a->n ); +} + +void +sub_tw_pniels_from_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + sub_tw_niels_from_tw_extensible( e, &a->n ); +} + +void +double_tw_extensible ( + struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &a->u, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &L1, &a->t ); + p448_sub ( &a->t, &L1, &a->u ); + p448_bias ( &a->t, 3 ); + p448_sub ( &L1, &L0, &L2 ); + p448_bias ( &L1, 2 ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 2 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_mul ( &a->z, &L1, &L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +double_extensible ( + struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &L1, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &a->u, &a->t ); + p448_sub ( &a->t, &a->u, &L1 ); + p448_bias ( &a->t, 3 ); + p448_sub ( &a->u, &L0, &L2 ); + p448_bias ( &a->u, 2 ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 2 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_mul ( &a->z, &L1, &L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +twist_and_double ( + 
struct tw_extensible_t* b, + const struct extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &b->u, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &L0, &b->t ); + p448_sub ( &b->t, &L0, &b->u ); + p448_bias ( &b->t, 3 ); + p448_sub ( &L0, &b->z, &b->x ); + p448_bias ( &L0, 2 ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 2 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +untwist_and_double ( + struct extensible_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &L0, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &b->u, &b->t ); + p448_sub ( &b->t, &b->u, &L0 ); + p448_bias ( &b->t, 3 ); + p448_sub ( &b->u, &b->z, &b->x ); + p448_bias ( &b->u, 2 ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 2 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +convert_tw_affine_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_affine_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( &b->n.c, &a->y, &a->x ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_set_ui( &b->z, 2 ); +} + +void +convert_tw_affine_to_tw_extensible ( + struct tw_extensible_t* b, + const struct tw_affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_affine_to_extensible ( + struct 
extensible_t* b, + const struct affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_tw_extensible_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_extensible_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( &b->n.c, &a->u, &a->t ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_add ( &b->z, &a->z, &a->z ); + p448_weak_reduce( &b->z ); +} + +void +convert_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* d +) { + p448_add ( &e->u, &d->n.b, &d->n.a ); + p448_sub ( &e->t, &d->n.b, &d->n.a ); + p448_bias ( &e->t, 2 ); + p448_mul ( &e->x, &d->z, &e->t ); + p448_mul ( &e->y, &d->z, &e->u ); + p448_sqr ( &e->z, &d->z ); +} + +void +convert_tw_niels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_niels_t* d +) { + p448_add ( &e->y, &d->b, &d->a ); + p448_weak_reduce( &e->y ); + p448_sub ( &e->x, &d->b, &d->a ); + p448_bias ( &e->x, 2 ); + p448_weak_reduce( &e->x ); + p448_set_ui( &e->z, 1 ); + p448_copy ( &e->t, &e->x ); + p448_copy ( &e->u, &e->y ); +} + +void +montgomery_step ( + struct montgomery_t* a +) { + struct p448_t L0, L1; + p448_add ( &L0, &a->zd, &a->xd ); + p448_sub ( &L1, &a->xd, &a->zd ); + p448_bias ( &L1, 2 ); + p448_sub ( &a->zd, &a->xa, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_add ( &a->zd, &a->za, &a->xa ); + p448_mul ( &a->za, &L1, &a->zd ); + p448_add ( &a->xa, &a->za, &a->xd ); + p448_sqr ( &a->zd, &a->xa ); + p448_mul ( &a->xa, &a->z0, &a->zd ); + p448_sub ( &a->zd, &a->xd, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_sqr ( &a->za, &a->zd ); + p448_sqr ( &a->xd, &L0 ); + p448_sqr ( &L0, &L1 ); + 
p448_mulw ( &a->zd, &a->xd, 39082 ); + p448_sub ( &L1, &a->xd, &L0 ); + p448_bias ( &L1, 2 ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_sub ( &L0, &a->zd, &L1 ); + p448_bias ( &L0, 4 ); + p448_mul ( &a->zd, &L0, &L1 ); +} + +void +deserialize_montgomery ( + struct montgomery_t* a, + const struct p448_t* sbz +) { + p448_sqr ( &a->z0, sbz ); + p448_set_ui( &a->xd, 1 ); + p448_set_ui( &a->zd, 0 ); + p448_set_ui( &a->xa, 1 ); + p448_copy ( &a->za, &a->z0 ); +} + +mask_t +serialize_montgomery ( + struct p448_t* b, + const struct montgomery_t* a, + const struct p448_t* sbz +) { + struct p448_t L0, L1, L2, L3; + mask_t L4, L5, L6; + p448_mul ( &L3, &a->z0, &a->zd ); + p448_sub ( &L1, &L3, &a->xd ); + p448_bias ( &L1, 2 ); + p448_mul ( &L3, &a->za, &L1 ); + p448_mul ( &L2, &a->z0, &a->xd ); + p448_sub ( &L1, &L2, &a->zd ); + p448_bias ( &L1, 2 ); + p448_mul ( &L2, &a->xa, &L1 ); + p448_add ( &L1, &L2, &L3 ); + p448_sub ( &L0, &L3, &L2 ); + p448_bias ( &L0, 2 ); + p448_mul ( &L3, &L0, &L1 ); + p448_copy ( &L2, &a->z0 ); + p448_addw ( &L2, 1 ); + p448_sqr ( &L1, &L2 ); + p448_mulw ( &L2, &L1, 39082 ); + p448_neg ( &L1, &L2 ); + p448_add ( &L0, &a->z0, &a->z0 ); + p448_bias ( &L0, 1 ); + p448_add ( &L2, &L0, &L0 ); + p448_add ( &L0, &L2, &L1 ); + p448_mul ( &L2, &a->xd, &L0 ); + L5 = p448_is_zero( &a->zd ); + L6 = - L5; + p448_mask ( &L1, &L2, L5 ); + p448_add ( &L2, &L1, &a->zd ); + L4 = ~ L5; + p448_mul ( &L1, sbz, &L3 ); + p448_addw ( &L1, L6 ); + p448_mul ( &L3, &L2, &L1 ); + p448_mul ( &L1, &L3, &L2 ); + p448_mul ( &L2, &L3, &a->xd ); + p448_mul ( &L3, &L1, &L2 ); + p448_isr ( &L0, &L3 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, &L3, &L1 ); + p448_mask ( b, &L2, L4 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + L5 = p448_is_zero( &L0 ); + L4 = p448_is_zero( sbz ); + return L5 | L4; +} + +void +serialize_extensible ( + struct p448_t* b, + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sub ( &L0, &a->y, &a->z ); 
+ p448_bias ( &L0, 2 ); + p448_add ( b, &a->z, &a->y ); + p448_mul ( &L1, &a->z, &a->x ); + p448_mul ( &L2, &L0, &L1 ); + p448_mul ( &L1, &L2, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( &L2, &L1, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( b, &L1, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, &L2, &L1 ); +} + +void +untwist_and_double_and_serialize ( + struct p448_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_mul ( &L3, &a->y, &a->x ); + p448_add ( b, &a->y, &a->x ); + p448_sqr ( &L1, b ); + p448_add ( &L2, &L3, &L3 ); + p448_sub ( b, &L1, &L2 ); + p448_bias ( b, 3 ); + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &L2 ); + p448_add ( &L2, b, b ); + p448_mulw ( b, &L2, 39082 ); + p448_neg ( &L2, b ); + p448_bias ( &L2, 2 ); + p448_mulw ( &L0, &L2, 39082 ); + p448_neg ( b, &L0 ); + p448_bias ( b, 2 ); + p448_mul ( &L0, &L2, &L1 ); + p448_mul ( &L2, b, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( &L1, b, &L0 ); + p448_sqr ( b, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( b, &L1, &L3 ); +} + +void +twist_even ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L0, L1; + p448_sqr ( &b->y, &a->z ); + p448_sqr ( &b->z, &a->x ); + p448_sub ( &b->u, &b->y, &b->z ); + p448_bias ( &b->u, 2 ); + p448_sub ( &b->z, &a->z, &a->x ); + p448_bias ( &b->z, 2 ); + p448_mul ( &b->y, &b->z, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_mul ( &b->x, &b->z, &b->y ); + p448_mul ( &b->t, &b->x, &b->u ); + p448_mul ( &b->y, &b->x, &b->t ); + p448_isr ( &b->t, &b->y ); + p448_mul ( &b->u, &b->x, &b->t ); + p448_sqr ( &b->x, &b->t ); + p448_mul ( &b->t, &b->y, &b->x ); + p448_mul ( &b->x, &a->x, &b->u ); + p448_mul ( &b->y, &a->y, &b->u ); + L1 = p448_is_zero( &b->z ); + L0 = - L1; + p448_addw ( &b->y, L0 ); + p448_weak_reduce( &b->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +void +test_only_twist ( + struct tw_extensible_t* b, + const 
struct extensible_t* a +) { + struct p448_t L0, L1; + mask_t L2, L3; + p448_sqr ( &b->u, &a->z ); + p448_sqr ( &b->y, &a->x ); + p448_sub ( &b->z, &b->u, &b->y ); + p448_bias ( &b->z, 2 ); + p448_add ( &b->y, &b->z, &b->z ); + p448_add ( &b->u, &b->y, &b->y ); + p448_sub ( &b->y, &a->z, &a->x ); + p448_bias ( &b->y, 2 ); + p448_mul ( &b->t, &b->y, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_mul ( &b->x, &b->z, &b->t ); + p448_mul ( &b->t, &b->x, &b->u ); + p448_mul ( &L1, &b->x, &b->t ); + p448_isr ( &b->t, &L1 ); + p448_mul ( &b->u, &b->x, &b->t ); + p448_sqr ( &b->x, &b->t ); + p448_mul ( &b->t, &L1, &b->x ); + p448_add ( &L1, &a->y, &a->x ); + p448_sub ( &L0, &a->x, &a->y ); + p448_bias ( &L0, 2 ); + p448_mul ( &b->x, &b->t, &L0 ); + p448_add ( &L0, &b->x, &L1 ); + p448_sub ( &b->t, &L1, &b->x ); + p448_bias ( &b->t, 2 ); + p448_mul ( &b->x, &L0, &b->u ); + L2 = p448_is_zero( &b->y ); + L3 = - L2; + p448_addw ( &b->x, L3 ); + p448_weak_reduce( &b->x ); + p448_mul ( &b->y, &b->t, &b->u ); + L2 = p448_is_zero( &b->z ); + L3 = - L2; + p448_addw ( &b->y, L3 ); + p448_weak_reduce( &b->y ); + L3 = p448_is_zero( &a->y ); + L2 = L3 + 1; + p448_set_ui( &b->z, L2 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +mask_t +is_square ( + const struct p448_t* x +) { + struct p448_t L0, L1; + mask_t L2, L3; + p448_isr ( &L0, x ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, x, &L1 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + L3 = p448_is_zero( &L0 ); + L2 = p448_is_zero( x ); + return L3 | L2; +} + +mask_t +is_even_pt ( + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +is_even_tw ( + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_add ( &L0, &L1, &L2 ); + 
p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +deserialize_affine ( + struct affine_t* a, + const struct p448_t* sz +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L1, sz ); + p448_copy ( &L3, &L1 ); + p448_addw ( &L3, 1 ); + p448_sqr ( &a->x, &L3 ); + p448_mulw ( &L3, &a->x, 39082 ); + p448_neg ( &a->x, &L3 ); + p448_add ( &L3, &L1, &L1 ); + p448_bias ( &L3, 1 ); + p448_add ( &a->y, &L3, &L3 ); + p448_add ( &L3, &a->y, &a->x ); + p448_copy ( &a->y, &L1 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &a->x, &a->y ); + p448_bias ( &a->x, 2 ); + p448_mul ( &a->y, &a->x, &L3 ); + p448_sqr ( &L2, &a->x ); + p448_mul ( &L0, &L2, &a->y ); + p448_mul ( &a->y, &a->x, &L0 ); + p448_isr ( &L3, &a->y ); + p448_mul ( &a->y, &L2, &L3 ); + p448_sqr ( &L2, &L3 ); + p448_mul ( &L3, &L0, &L2 ); + p448_mul ( &L0, &a->x, &L3 ); + p448_add ( &L2, &a->y, &a->y ); + p448_mul ( &a->x, sz, &L2 ); + p448_addw ( &L1, 1 ); + p448_mul ( &a->y, &L1, &L3 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + return p448_is_zero( &L0 ); +} + +mask_t +deserialize_and_twist_approx ( + struct tw_extensible_t* a, + const struct p448_t* sdm1, + const struct p448_t* sz +) { + struct p448_t L0, L1; + p448_sqr ( &a->z, sz ); + p448_copy ( &a->y, &a->z ); + p448_addw ( &a->y, 1 ); + p448_sqr ( &a->x, &a->y ); + p448_mulw ( &a->y, &a->x, 39082 ); + p448_neg ( &a->x, &a->y ); + p448_add ( &a->y, &a->z, &a->z ); + p448_bias ( &a->y, 1 ); + p448_add ( &a->u, &a->y, &a->y ); + p448_add ( &a->y, &a->u, &a->x ); + p448_sqr ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &a->u, &a->x ); + p448_bias ( &a->u, 2 ); + p448_mul ( &a->x, sdm1, &a->u ); + p448_mul ( &L0, &a->x, &a->y ); + p448_mul ( &a->t, &L0, &a->y ); + p448_mul ( &a->u, &a->x, &a->t ); + p448_mul ( &a->t, &a->u, &L0 ); + p448_mul ( &a->y, &a->x, &a->t ); + p448_isr ( &L0, &a->y ); + p448_mul ( &a->y, &a->u, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &a->u, &a->t, &L1 ); + p448_mul ( &a->t, &a->x, &a->u ); + p448_add ( 
&a->x, sz, sz ); + p448_mul ( &L0, &a->u, &a->x ); + p448_copy ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &L1, &a->x ); + p448_bias ( &L1, 2 ); + p448_mul ( &a->x, &L1, &L0 ); + p448_mul ( &L0, &a->u, &a->y ); + p448_addw ( &a->z, 1 ); + p448_mul ( &a->y, &a->z, &L0 ); + p448_subw ( &a->t, 1 ); + p448_bias ( &a->t, 1 ); + mask_t ret = p448_is_zero( &a->t ); + p448_set_ui( &a->z, 1 ); + p448_copy ( &a->t, &a->x ); + p448_copy ( &a->u, &a->y ); + return ret; +} + +void +set_identity_extensible ( + struct extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_tw_extensible ( + struct tw_extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_affine ( + struct affine_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); +} + +mask_t +eq_affine ( + const struct affine_t* a, + const struct affine_t* b +) { + struct p448_t L0; + mask_t L1, L2; + p448_sub ( &L0, &a->x, &b->x ); + p448_bias ( &L0, 2 ); + L2 = p448_is_zero( &L0 ); + p448_sub ( &L0, &a->y, &b->y ); + p448_bias ( &L0, 2 ); + L1 = p448_is_zero( &L0 ); + return L2 & L1; +} + +mask_t +eq_extensible ( + const struct extensible_t* a, + const struct extensible_t* b +) { + struct p448_t L0, L1, L2; + mask_t L3, L4; + p448_mul ( &L2, &b->z, &a->x ); + p448_mul ( &L1, &a->z, &b->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + p448_mul ( &L2, &b->z, &a->y ); + p448_mul ( &L1, &a->z, &b->y ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L3 = p448_is_zero( &L0 ); + return L4 & L3; +} + +mask_t +eq_tw_extensible ( + const struct tw_extensible_t* a, + const struct tw_extensible_t* b +) { + struct p448_t L0, L1, L2; + mask_t L3, L4; + p448_mul ( &L2, &b->z, &a->x ); + p448_mul ( &L1, &a->z, &b->x ); + 
p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + p448_mul ( &L2, &b->z, &a->y ); + p448_mul ( &L1, &a->z, &b->y ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L3 = p448_is_zero( &L0 ); + return L4 & L3; +} + +void +elligator_2s_inject ( + struct affine_t* a, + const struct p448_t* r +) { + struct p448_t L0, L1, L2, L3, L4, L5, L6, L7; + mask_t L8, L9; + p448_sqr ( &a->x, r ); + p448_sqr ( &L1, &a->x ); + p448_copy ( &a->y, &L1 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &L7, &a->y ); + p448_bias ( &L7, 2 ); + p448_sqr ( &L0, &L7 ); + p448_mulw ( &L6, &L0, 1527402724 ); + p448_mulw ( &L5, &L1, 6108985600 ); + p448_add ( &a->y, &L5, &L6 ); + p448_mulw ( &L6, &L0, 6109454568 ); + p448_sub ( &L5, &a->y, &L6 ); + p448_bias ( &L5, 2 ); + p448_mulw ( &L2, &a->y, 78160 ); + p448_mul ( &L4, &L5, &L7 ); + p448_mul ( &L6, &L4, &L2 ); + p448_mul ( &L2, &L5, &L6 ); + p448_isr ( &L3, &L2 ); + p448_mul ( &L2, &L4, &L3 ); + p448_sqr ( &L4, &L3 ); + p448_mul ( &L3, &L6, &L4 ); + p448_mul ( &L6, &L5, &L3 ); + p448_mul ( &L5, &L6, &L3 ); + p448_copy ( &L4, &a->x ); + p448_subw ( &L4, 1 ); + p448_addw ( &a->x, 1 ); + p448_mul ( &L3, &a->x, &L6 ); + p448_sub ( &a->x, &L4, &L3 ); + p448_bias ( &a->x, 3 ); + p448_mul ( &L3, &L2, &a->x ); + p448_mulw ( &L2, &L3, 78160 ); + p448_neg ( &a->x, &L2 ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_add ( &L2, &L1, &L1 ); + p448_add ( &L1, &L2, &L0 ); + p448_subw ( &L1, 2 ); + p448_bias ( &L1, 1 ); + p448_mul ( &L0, &L1, &L6 ); + p448_mulw ( &L1, &L0, 3054649120 ); + p448_add ( &L0, &L1, &a->y ); + p448_mul ( &a->y, &L5, &L0 ); + L9 = p448_is_zero( &L7 ); + L8 = - L9; + p448_addw ( &a->y, L8 ); + p448_weak_reduce( &a->y ); +} + +mask_t +validate_affine ( + const struct affine_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L0, &a->y ); + p448_sqr ( &L2, &a->x ); + p448_add ( &L3, &L2, &L0 ); + p448_subw ( &L3, 1 ); + p448_mulw ( &L1, &L2, 39081 ); + p448_neg ( &L2, &L1 ); + 
p448_bias ( &L2, 2 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sub ( &L0, &L3, &L1 ); + p448_bias ( &L0, 3 ); + return p448_is_zero( &L0 ); +} + +mask_t +validate_tw_extensible ( + const struct tw_extensible_t* ext +) { + struct p448_t L0, L1, L2, L3; + mask_t L4, L5; + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L0, &ext->t, &ext->u ); + p448_mul ( &L2, &ext->z, &L0 ); + p448_addw ( &L2, 0 ); + p448_mul ( &L1, &ext->x, &ext->y ); + p448_neg ( &L0, &L1 ); + p448_add ( &L1, &L0, &L2 ); + p448_bias ( &L1, 2 ); + L5 = p448_is_zero( &L1 ); + /* + * Check invariant: + * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 + */ + p448_sqr ( &L2, &ext->y ); + p448_neg ( &L0, &L2 ); + p448_addw ( &L0, 0 ); + p448_sqr ( &L1, &ext->x ); + p448_add ( &L2, &L1, &L0 ); + p448_sqr ( &L3, &ext->u ); + p448_sqr ( &L1, &ext->t ); + p448_mul ( &L0, &L1, &L3 ); + p448_mulw ( &L1, &L0, 39081 ); + p448_neg ( &L3, &L1 ); + p448_add ( &L1, &L3, &L2 ); + p448_neg ( &L3, &L0 ); + p448_add ( &L2, &L3, &L1 ); + p448_sqr ( &L1, &ext->z ); + p448_add ( &L0, &L1, &L2 ); + p448_bias ( &L0, 4 ); + L4 = p448_is_zero( &L0 ); + return L5 & L4; +} + +mask_t +validate_extensible ( + const struct extensible_t* ext +) { + struct p448_t L0, L1, L2, L3; + mask_t L4, L5; + /* + * Check invariant: + * 0 = d*t^2*u^2 - x^2 - y^2 + z^2 + */ + p448_sqr ( &L2, &ext->y ); + p448_neg ( &L1, &L2 ); + p448_addw ( &L1, 0 ); + p448_sqr ( &L0, &ext->z ); + p448_add ( &L2, &L0, &L1 ); + p448_sqr ( &L3, &ext->u ); + p448_sqr ( &L0, &ext->t ); + p448_mul ( &L1, &L0, &L3 ); + p448_mulw ( &L3, &L1, 39081 ); + p448_neg ( &L0, &L3 ); + p448_add ( &L1, &L0, &L2 ); + p448_sqr ( &L0, &ext->x ); + p448_neg ( &L2, &L0 ); + p448_add ( &L0, &L2, &L1 ); + p448_bias ( &L0, 4 ); + L5 = p448_is_zero( &L0 ); + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L1, &ext->t, &ext->u ); + p448_mul ( &L2, &ext->z, &L1 ); + p448_addw ( &L2, 0 ); + p448_mul ( &L0, &ext->x, &ext->y ); + p448_neg ( &L1, &L0 ); + p448_add ( &L0, 
&L1, &L2 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + return L5 & L4; +} + + diff --git a/src/arch_x86_64/p448.c b/src/arch_x86_64/p448.c new file mode 100644 index 0000000..7a37195 --- /dev/null +++ b/src/arch_x86_64/p448.c @@ -0,0 +1,467 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#include "p448.h" +#include "x86-64-arith.h" + +void +p448_mul ( + p448_t *__restrict__ cs, + const p448_t *as, + const p448_t *bs +) { + const uint64_t *a = as->limb, *b = bs->limb; + uint64_t *c = cs->limb; + + __uint128_t accum0 = 0, accum1 = 0, accum2; + uint64_t mask = (1ull<<56) - 1; + + uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))); + + /* For some reason clang doesn't vectorize this without prompting? */ + unsigned int i; + for (i=0; i>= 56; + accum1 >>= 56; + + mac(&accum0, &aa[1],&bb[3]); + mac(&accum1, &a[5], &b[7]); + mac(&accum0, &aa[2], &bb[2]); + mac(&accum1, &a[6], &b[6]); + mac(&accum0, &aa[3], &bb[1]); + accum1 += accum0; + + accum2 = widemul(&a[0],&b[0]); + accum1 -= accum2; + accum0 += accum2; + + msb(&accum0, &a[1], &b[3]); + msb(&accum0, &a[2], &b[2]); + mac(&accum1, &a[7], &b[5]); + msb(&accum0, &a[3], &b[1]); + mac(&accum1, &aa[0], &bb[0]); + mac(&accum0, &a[4], &b[4]); + + c[0] = ((uint64_t)(accum0)) & mask; + c[4] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + accum2 = widemul(&aa[2],&bb[3]); + msb(&accum0, &a[2], &b[3]); + mac(&accum1, &a[6], &b[7]); + + mac(&accum2, &aa[3], &bb[2]); + msb(&accum0, &a[3], &b[2]); + mac(&accum1, &a[7], &b[6]); + + accum1 += accum2; + accum0 += accum2; + + accum2 = widemul(&a[0],&b[1]); + mac(&accum1, &aa[0], &bb[1]); + mac(&accum0, &a[4], &b[5]); + + mac(&accum2, &a[1], &b[0]); + mac(&accum1, &aa[1], &bb[0]); + mac(&accum0, &a[5], &b[4]); + + accum1 -= accum2; + accum0 += accum2; + + c[1] = ((uint64_t)(accum0)) & mask; + c[5] = ((uint64_t)(accum1)) & mask; + + 
accum0 >>= 56; + accum1 >>= 56; + + accum2 = widemul(&aa[3],&bb[3]); + msb(&accum0, &a[3], &b[3]); + mac(&accum1, &a[7], &b[7]); + + accum1 += accum2; + accum0 += accum2; + + accum2 = widemul(&a[0],&b[2]); + mac(&accum1, &aa[0], &bb[2]); + mac(&accum0, &a[4], &b[6]); + + mac(&accum2, &a[1], &b[1]); + mac(&accum1, &aa[1], &bb[1]); + mac(&accum0, &a[5], &b[5]); + + mac(&accum2, &a[2], &b[0]); + mac(&accum1, &aa[2], &bb[0]); + mac(&accum0, &a[6], &b[4]); + + accum1 -= accum2; + accum0 += accum2; + + c[2] = ((uint64_t)(accum0)) & mask; + c[6] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + accum0 += c[3]; + accum1 += c[7]; + c[3] = ((uint64_t)(accum0)) & mask; + c[7] = ((uint64_t)(accum1)) & mask; + + /* we could almost stop here, but it wouldn't be stable, so... */ + + accum0 >>= 56; + accum1 >>= 56; + c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); + c[0] += ((uint64_t)(accum1)); +} + +void +p448_mulw ( + p448_t *__restrict__ cs, + const p448_t *as, + uint64_t b +) { + const uint64_t *a = as->limb; + uint64_t *c = cs->limb; + + __uint128_t accum0, accum4; + uint64_t mask = (1ull<<56) - 1; + + accum0 = widemul_rm(b, &a[0]); + accum4 = widemul_rm(b, &a[4]); + + c[0] = accum0 & mask; accum0 >>= 56; + c[4] = accum4 & mask; accum4 >>= 56; + + mac_rm(&accum0, b, &a[1]); + mac_rm(&accum4, b, &a[5]); + + c[1] = accum0 & mask; accum0 >>= 56; + c[5] = accum4 & mask; accum4 >>= 56; + + mac_rm(&accum0, b, &a[2]); + mac_rm(&accum4, b, &a[6]); + + c[2] = accum0 & mask; accum0 >>= 56; + c[6] = accum4 & mask; accum4 >>= 56; + + mac_rm(&accum0, b, &a[3]); + mac_rm(&accum4, b, &a[7]); + + c[3] = accum0 & mask; accum0 >>= 56; + c[7] = accum4 & mask; accum4 >>= 56; + + c[4] += accum0 + accum4; + c[0] += accum4; + + /* + * TODO: double-check that this is not necessary. 
+ accum0 += accum4 + c[4]; + c[4] = accum0 & mask; + c[5] += accum0 >> 56; + + accum4 += c[0]; + c[0] = accum4 & mask; + c[1] += accum4 >> 56; + */ +} + +void +p448_sqr ( + p448_t *__restrict__ cs, + const p448_t *as +) { + const uint64_t *a = as->limb; + uint64_t *c = cs->limb; + + __uint128_t accum0 = 0, accum1 = 0, accum2; + uint64_t mask = (1ull<<56) - 1; + + uint64_t aa[4] __attribute__((aligned(32))); + + /* For some reason clang doesn't vectorize this without prompting? */ + unsigned int i; + for (i=0; i>= 55; + accum1 >>= 55; + + mac2(&accum0, &aa[1],&aa[3]); + mac2(&accum1, &a[5], &a[7]); + mac(&accum0, &aa[2], &aa[2]); + accum1 += accum0; + + msb2(&accum0, &a[1], &a[3]); + mac(&accum1, &a[6], &a[6]); + + accum2 = widemul(&a[0],&a[0]); + accum1 -= accum2; + accum0 += accum2; + + msb(&accum0, &a[2], &a[2]); + mac(&accum1, &aa[0], &aa[0]); + mac(&accum0, &a[4], &a[4]); + + c[0] = ((uint64_t)(accum0)) & mask; + c[4] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + accum2 = widemul2(&aa[2],&aa[3]); + msb2(&accum0, &a[2], &a[3]); + mac2(&accum1, &a[6], &a[7]); + + accum1 += accum2; + accum0 += accum2; + + accum2 = widemul2(&a[0],&a[1]); + mac2(&accum1, &aa[0], &aa[1]); + mac2(&accum0, &a[4], &a[5]); + + accum1 -= accum2; + accum0 += accum2; + + c[1] = ((uint64_t)(accum0)) & mask; + c[5] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + accum2 = widemul(&aa[3],&aa[3]); + msb(&accum0, &a[3], &a[3]); + mac(&accum1, &a[7], &a[7]); + + accum1 += accum2; + accum0 += accum2; + + accum2 = widemul2(&a[0],&a[2]); + mac2(&accum1, &aa[0], &aa[2]); + mac2(&accum0, &a[4], &a[6]); + + mac(&accum2, &a[1], &a[1]); + mac(&accum1, &aa[1], &aa[1]); + mac(&accum0, &a[5], &a[5]); + + accum1 -= accum2; + accum0 += accum2; + + c[2] = ((uint64_t)(accum0)) & mask; + c[6] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + accum0 += c[3]; + accum1 += c[7]; + c[3] = ((uint64_t)(accum0)) & mask; + c[7] = ((uint64_t)(accum1)) & 
mask; + + /* we could almost stop here, but it wouldn't be stable, so... */ + + accum0 >>= 56; + accum1 >>= 56; + c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); + c[0] += ((uint64_t)(accum1)); +} + +void +p448_strong_reduce ( + p448_t *a +) { + uint64_t mask = (1ull<<56)-1; + + /* first, clear high */ + a->limb[4] += a->limb[7]>>56; + a->limb[0] += a->limb[7]>>56; + a->limb[7] &= mask; + + /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */ + + /* compute total_value - p. No need to reduce mod p. */ + + __int128_t scarry = 0; + int i; + for (i=0; i<8; i++) { + scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask); + a->limb[i] = scarry & mask; + scarry >>= 56; + } + + /* uncommon case: it was >= p, so now scarry = 0 and this = x + * common case: it was < p, so now scarry = -1 and this = x - p + 2^448 + * so let's add back in p. will carry back off the top for 2^448. + */ + + assert(is_zero(scarry) | is_zero(scarry+1)); + + uint64_t scarry_mask = scarry & mask; + __uint128_t carry = 0; + + /* add it back */ + for (i=0; i<8; i++) { + carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask); + a->limb[i] = carry & mask; + carry >>= 56; + } + + assert(is_zero(carry + scarry)); +} + +mask_t +p448_is_zero ( + const struct p448_t *a +) { + struct p448_t b; + p448_copy(&b,a); + p448_strong_reduce(&b); + + uint64_t any = 0; + int i; + for (i=0; i<8; i++) { + any |= b.limb[i]; + } + return is_zero(any); +} + +void +p448_serialize ( + uint8_t *serial, + const struct p448_t *x +) { + int i,j; + p448_t red; + p448_copy(&red, x); + p448_strong_reduce(&red); + for (i=0; i<8; i++) { + for (j=0; j<7; j++) { + serial[7*i+j] = red.limb[i]; + red.limb[i] >>= 8; + } + assert(red.limb[i] == 0); + } +} + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +) { + int i,j; + for (i=0; i<8; i++) { + word_t out = 0; + for (j=0; j<7; j++) { + out |= ((word_t)serial[7*i+j])<<(8*j); + } + x->limb[i] = out; + } + + /* Check for reduction. 
+ * + * The idea is to create a variable ge which is all ones (rather, 56 ones) + * if and only if the low $i$ words of $x$ are >= those of p. + * + * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) + */ + word_t ge = -1, mask = (1ull<<56)-1; + for (i=0; i<4; i++) { + ge &= x->limb[i]; + } + + /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ + ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask); + + /* Propagate the rest */ + for (i=5; i<8; i++) { + ge &= x->limb[i]; + } + + return ~is_zero(ge ^ mask); +} + +void +simultaneous_invert_p448( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +) { + if (n==0) { + return; + } else if (n==1) { + p448_inverse(out,in); + return; + } + + p448_copy(&out[1], &in[0]); + int i; + for (i=1; i<(int) (n-1); i++) { + p448_mul(&out[i+1], &out[i], &in[i]); + } + p448_mul(&out[0], &out[n-1], &in[n-1]); + + struct p448_t tmp; + p448_inverse(&tmp, &out[0]); + p448_copy(&out[0], &tmp); + + /* at this point, out[0] = product(in[i]) ^ -1 + * out[i] = product(in[0]..in[i-1]) if i != 0 + */ + for (i=n-1; i>0; i--) { + p448_mul(&tmp, &out[i], &out[0]); + p448_copy(&out[i], &tmp); + + p448_mul(&tmp, &out[0], &in[i]); + p448_copy(&out[0], &tmp); + } +} diff --git a/src/arch_x86_64/p448.h b/src/arch_x86_64/p448.h new file mode 100644 index 0000000..b0b4dc0 --- /dev/null +++ b/src/arch_x86_64/p448.h @@ -0,0 +1,376 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ +#ifndef __P448_H__ +#define __P448_H__ 1 + +#include +#include + +#include "word.h" + +typedef struct p448_t { + uint64_t limb[8]; +} __attribute__((aligned(32))) p448_t; + +#ifdef __cplusplus +extern "C" { +#endif + +static __inline__ void +p448_set_ui ( + p448_t *out, + uint64_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t do_swap +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_add ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_neg ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_neg ( + p448_t *a, + mask_t doNegate +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_addw ( + p448_t *a, + uint64_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_subw ( + p448_t *a, + uint64_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_copy ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_weak_reduce ( + p448_t *inout +) __attribute__((unused,always_inline)); + +void +p448_strong_reduce ( + p448_t *inout +); + +mask_t +p448_is_zero ( + const p448_t *in +); + +static __inline__ void +p448_bias ( + p448_t *inout, + int amount +) __attribute__((unused,always_inline)); + +void +p448_mul ( + p448_t *__restrict__ out, + const p448_t *a, + const p448_t *b +); + +void +p448_mulw ( + p448_t *__restrict__ out, + const p448_t *a, + uint64_t b +); + +void +p448_sqr ( + p448_t *__restrict__ out, + const p448_t *a +); + +static __inline__ void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) __attribute__((unused,always_inline)); + +void +p448_serialize ( 
+ uint8_t *serial, + const struct p448_t *x +); + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +); + +static __inline__ void +p448_mask( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) __attribute__((unused,always_inline)); + +/** +* Returns 1/x. +* +* If x=0, returns 0. +*/ +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +); + +void +simultaneous_invert_p448 ( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +); + +static inline mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) __attribute__((always_inline,unused)); + +/* -------------- Inline functions begin here -------------- */ + +void +p448_set_ui ( + p448_t *out, + uint64_t x +) { + int i; + out->limb[0] = x; + for (i=1; i<8; i++) { + out->limb[i] = 0; + } +} + +void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t doswap +) { + big_register_t *aa = (big_register_t*)a; + big_register_t *bb = (big_register_t*)b; + big_register_t m = doswap; + + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] + b->limb[i]; + } + */ +} + +void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] - b->limb[i]; + } + */ +} + +void +p448_neg ( + struct p448_t *out, + const p448_t *a +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = -a->limb[i]; + } + */ +} + +void +p448_cond_neg( + struct p448_t *a, + mask_t doNegate +) { + unsigned int i; + struct p448_t negated; + big_register_t *aa = (big_register_t *)a; + big_register_t *nn = (big_register_t*)&negated; + big_register_t m = doNegate; + + p448_neg(&negated, a); + p448_bias(&negated, 2); + + for (i=0; ilimb[0] += x; +} + +void +p448_subw ( + p448_t *a, + uint64_t x +) { + a->limb[0] -= x; +} + +void +p448_copy ( + p448_t *out, + const p448_t *a +) { + *out = *a; +} + +void +p448_bias ( + p448_t *a, + int amt +) { + uint64_t co1 = 
((1ull<<56)-1)*amt, co2 = co1-amt; + uint64x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1}; + uint64x4_t *aa = (uint64x4_t*) a; + aa[0] += lo; + aa[1] += hi; +} + +void +p448_weak_reduce ( + p448_t *a +) { + /* PERF: use pshufb/palignr if anyone cares about speed of this */ + uint64_t mask = (1ull<<56) - 1; + uint64_t tmp = a->limb[7] >> 56; + int i; + a->limb[4] += tmp; + for (i=7; i>0; i--) { + a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>56); + } + a->limb[0] = (a->limb[0] & mask) + tmp; +} + +void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) { + p448_t tmp; + assert(n>0); + if (n&1) { + p448_sqr(y,x); + n--; + } else { + p448_sqr(&tmp,x); + p448_sqr(y,&tmp); + n-=2; + } + for (; n; n-=2) { + p448_sqr(&tmp,y); + p448_sqr(y,&tmp); + } +} + +mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) { + struct p448_t ra, rb; + p448_copy(&ra, a); + p448_copy(&rb, b); + p448_weak_reduce(&ra); + p448_weak_reduce(&rb); + p448_sub(&ra, &ra, &rb); + p448_bias(&ra, 2); + return p448_is_zero(&ra); +} + +void +p448_mask ( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + a->limb[i] = b->limb[i] & mask; + } +} + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __P448_H__ */ diff --git a/src/arch_x86_64/x86-64-arith.h b/src/arch_x86_64/x86-64-arith.h new file mode 100644 index 0000000..32ee832 --- /dev/null +++ b/src/arch_x86_64/x86-64-arith.h @@ -0,0 +1,279 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#ifndef __X86_64_ARITH_H__ +#define __X86_64_ARITH_H__ + +#include + +/* TODO: non x86-64 versions of these. 
+ * FUTURE: autogenerate + */ + +static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) { + #ifndef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rax;" + "mulq %[b];" + : [c]"=a"(c), [d]"=d"(d) + : [b]"m"(*b), [a]"m"(*a) + : "cc"); + return (((__uint128_t)(d))<<64) | c; + #else + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx;" + "mulx %[b], %[c], %[d];" + : [c]"=r"(c), [d]"=r"(d) + : [b]"m"(*b), [a]"m"(*a) + : "rdx"); + return (((__uint128_t)(d))<<64) | c; + #endif +} + +static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) { + #ifndef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rax;" + "mulq %[b];" + : [c]"=a"(c), [d]"=d"(d) + : [b]"m"(*b), [a]"r"(a) + : "cc"); + return (((__uint128_t)(d))<<64) | c; + #else + uint64_t c,d; + __asm__ volatile + ("mulx %[b], %[c], %[d];" + : [c]"=r"(c), [d]"=r"(d) + : [b]"m"(*b), [a]"d"(a)); + return (((__uint128_t)(d))<<64) | c; + #endif +} + +static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) { + #ifndef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rax; " + "addq %%rax, %%rax; " + "mulq %[b];" + : [c]"=a"(c), [d]"=d"(d) + : [b]"m"(*b), [a]"m"(*a) + : "cc"); + return (((__uint128_t)(d))<<64) | c; + #else + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx;" + "leaq (,%%rdx,2), %%rdx;" + "mulx %[b], %[c], %[d];" + : [c]"=r"(c), [d]"=r"(d) + : [b]"m"(*b), [a]"m"(*a) + : "rdx"); + return (((__uint128_t)(d))<<64) | c; + #endif +} + +static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx; " + "mulx %[b], %[c], %[d]; " + "addq %[c], %[lo]; " + "adcq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "mulq %[b]; " + "addq %%rax, %[lo]; " + "adcq %%rdx, %[hi]; " 
+ : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rax", "rdx", "cc"); + #endif + + *acc = (((__uint128_t)(hi))<<64) | lo; +} + +static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + uint64_t lo2 = *acc2, hi2 = *acc2>>64; + + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx; " + "mulx %[b], %[c], %[d]; " + "addq %[c], %[lo]; " + "adcq %[d], %[hi]; " + "addq %[c], %[lo2]; " + "adcq %[d], %[hi2]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "mulq %[b]; " + "addq %%rax, %[lo]; " + "adcq %%rdx, %[hi]; " + "addq %%rax, %[lo2]; " + "adcq %%rdx, %[hi2]; " + : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) + : [b]"m"(*b), [a]"m"(*a) + : "rax", "rdx", "cc"); + #endif + + *acc = (((__uint128_t)(hi))<<64) | lo; + *acc2 = (((__uint128_t)(hi2))<<64) | lo2; +} + +static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("mulx %[b], %[c], %[d]; " + "addq %[c], %[lo]; " + "adcq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"d"(a) + : "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "mulq %[b]; " + "addq %%rax, %[lo]; " + "adcq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"r"(a) + : "rax", "rdx", "cc"); + #endif + + *acc = (((__uint128_t)(hi))<<64) | lo; +} + +static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx; " + "addq %%rdx, %%rdx; " + "mulx %[b], %[c], %[d]; " + "addq %[c], %[lo]; " + "adcq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : 
[b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "addq %%rax, %%rax; " + "mulq %[b]; " + "addq %%rax, %[lo]; " + "adcq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rax", "rdx", "cc"); + #endif + + *acc = (((__uint128_t)(hi))<<64) | lo; +} + +static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx; " + "mulx %[b], %[c], %[d]; " + "subq %[c], %[lo]; " + "sbbq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "mulq %[b]; " + "subq %%rax, %[lo]; " + "sbbq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rax", "rdx", "cc"); + #endif + *acc = (((__uint128_t)(hi))<<64) | lo; +} + +static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx; " + "addq %%rdx, %%rdx; " + "mulx %[b], %[c], %[d]; " + "subq %[c], %[lo]; " + "sbbq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "addq %%rax, %%rax; " + "mulq %[b]; " + "subq %%rax, %[lo]; " + "sbbq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rax", "rdx", "cc"); + #endif + *acc = (((__uint128_t)(hi))<<64) | lo; + +} + +static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t c,d, lo = *acc, hi = *acc>>64; + __asm__ volatile + ("movq %[a], %%rdx; " + "mulx %[b], %[c], %[d]; " + "subq %[lo], %[c]; " + "sbbq %[hi], %[d]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + *acc = 
(((__uint128_t)(d))<<64) | c; +} + +static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) { + return ((__uint128_t)(a)) * b; +} + +static __inline__ __int128_t widemuls(int64_t a, int64_t b) { + return ((__int128_t)(a)) * b; +} + +static __inline__ uint64_t opacify(uint64_t x) { + __asm__ volatile("" : "+r"(x)); + return x; +} + +static __inline__ mask_t is_zero(uint64_t x) { + __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x)); + return ~x; +} + +#endif /* __X86_64_ARITH_H__ */ diff --git a/src/barrett_field.c b/src/barrett_field.c new file mode 100644 index 0000000..55afe7d --- /dev/null +++ b/src/barrett_field.c @@ -0,0 +1,349 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#include "barrett_field.h" +#include +#include + +word_t +add_nr_ext_packed( + word_t *out, + const word_t *a, + uint32_t nwords_a, + const word_t *c, + uint32_t nwords_c, + word_t mask +) { + uint32_t i; + dword_t carry = 0; + for (i=0; i>= WORD_BITS; + } + for (; i>= WORD_BITS; + } + return carry; +} + +static __inline__ word_t +add_nr_packed( + word_t *a, + const word_t *c, + uint32_t nwords +) { + uint32_t i; + dword_t carry = 0; + for (i=0; i>= WORD_BITS; + } + return carry; +} + +word_t +sub_nr_ext_packed( + word_t *out, + const word_t *a, + uint32_t nwords_a, + const word_t *c, + uint32_t nwords_c, + word_t mask +) { + uint32_t i; + dsword_t carry = 0; + for (i=0; i>= WORD_BITS; + } + for (; i>= WORD_BITS; + } + return carry; +} + +static word_t +widemac( + word_t *accum, + uint32_t nwords_accum, + const word_t *mier, + uint32_t nwords_mier, + word_t mand, + word_t carry +) { + uint32_t i; + assert(nwords_mier <= nwords_accum); + + for (i=0; i> WORD_BITS; + } + + for (; i> WORD_BITS; + } + + return carry; +} + +void +barrett_negate ( + word_t *a, + uint32_t nwords_a, + const struct barrett_prime_t *prime +) { + uint32_t i; + dsword_t carry = 0; + + barrett_reduce(a,nwords_a,0,prime); + + 
/* Have p = 2^big - p_lo. Want p - a = 2^big - p_lo - a */ + + for (i=0; inwords_lo; i++) { + a[i] = carry = carry - prime->p_lo[i] - a[i]; + carry >>= WORD_BITS; + } + for (; inwords_p; i++) { + a[i] = carry = carry - a[i]; + if (inwords_p-1) { + carry >>= WORD_BITS; + } + } + + a[prime->nwords_p-1] = carry = carry + (((word_t)1) << prime->p_shift); + + for (; i>WORD_BITS)); +} + +void +barrett_reduce( + word_t *a, + uint32_t nwords_a, + word_t a_carry, + const struct barrett_prime_t *prime +) { + uint32_t repeat, nwords_left_in_a=nwords_a; + + /* Is there a point to this a_carry business? */ + assert(a_carry < ((word_t)1) << prime->p_shift); + assert(nwords_a >= prime->nwords_p); + assert(prime->nwords_p > 0); /* scan-build: prevent underflow */ + + for (; nwords_left_in_a >= prime->nwords_p; nwords_left_in_a--) { + for (repeat=0; repeat<2; repeat++) { + /* PERF: surely a more careful implementation could + * avoid this double round + */ + word_t mand = a[nwords_left_in_a-1] >> prime->p_shift; + a[nwords_left_in_a-1] &= (((word_t)1)<p_shift)-1; + if (prime->p_shift && !repeat) { + /* collect high bits when there are any */ + if (nwords_left_in_a < nwords_a) { + mand |= a[nwords_left_in_a] << (WORD_BITS-prime->p_shift); + a[nwords_left_in_a] = 0; + } else { + mand |= a_carry << (WORD_BITS-prime->p_shift); + } + } + + word_t carry = widemac( + a+nwords_left_in_a-prime->nwords_p, + prime->nwords_p, + prime->p_lo, + prime->nwords_lo, + mand, + 0 + ); + assert(!carry); + (void)carry; + } + } + + assert(nwords_left_in_a == prime->nwords_p-1); + + /* OK, but it still isn't reduced. Add and subtract p_lo. 
*/ + word_t cout = add_nr_ext_packed(a,a,prime->nwords_p,prime->p_lo,prime->nwords_lo,-1); + if (prime->p_shift) { + cout = (cout<<(WORD_BITS-prime->p_shift)) + (a[prime->nwords_p-1]>>prime->p_shift); + a[prime->nwords_p-1] &= (((word_t)1)<p_shift)-1; + } + + /* mask = carry-1: if no carry then do sub, otherwise don't */ + sub_nr_ext_packed(a,a,prime->nwords_p,prime->p_lo,prime->nwords_lo,cout-1); +} + +/* PERF: This function is horribly slow. Enough to break 1%. */ +void +barrett_mul_or_mac( + word_t *accum, + uint32_t nwords_accum, + + const word_t *a, + uint32_t nwords_a, + + const word_t *b, + uint32_t nwords_b, + + const struct barrett_prime_t *prime, + + mask_t doMac +) { + assert(nwords_accum >= prime->nwords_p); + + /* nwords_tmp = max(nwords_a + 1, nwords_p + 1, nwords_accum if doMac); */ + uint32_t nwords_tmp = (nwords_a > prime->nwords_p) ? nwords_a : prime->nwords_p; + nwords_tmp++; + assert(nwords_tmp > 0); /* scan-build: prevent underflow. */ + if (nwords_tmp < nwords_accum && doMac) + nwords_tmp = nwords_accum; + + word_t tmp[nwords_tmp]; + int bpos, idown; + uint32_t i; + + for (i=0; i= 0; bpos--) { + /* Invariant at the beginning of the loop: the high word is unused. 
*/ + assert(tmp[nwords_tmp-1] == 0); + + /* shift up */ + for (idown=nwords_tmp-2; idown>=0; idown--) { + tmp[idown+1] = tmp[idown]; + } + tmp[0] = 0; + + /* mac and reduce */ + word_t carry = widemac(tmp, nwords_tmp, a, nwords_a, b[bpos], 0); + + /* the mac can't carry, because nwords_tmp >= nwords_a+1 and its high word is clear */ + assert(!carry); + barrett_reduce(tmp, nwords_tmp, carry, prime); + + /* at this point, the number of words used is nwords_p <= nwords_tmp-1, + * so the high word is again clear */ + } + + if (doMac) { + word_t cout = add_nr_packed(tmp, accum, nwords_accum); + barrett_reduce(tmp, nwords_tmp, cout, prime); + } + + for (i=0; inwords_p * sizeof(word_t); + if (prime->p_shift) { + nserial -= (WORD_BITS - prime->p_shift) / 8; + } + + + /* Track x < p, p = 2^k - p_lo <==> x + p_lo < 2^k */ + dword_t carry = 0; + + for (i=0; i*sizeof(word_t)>= WORD_BITS; + + word_t the = 0; + for (j=0; jnwords_lo) carry += prime->p_lo[i]; + } + + /* check for reduction */ + if (prime->p_shift) { + carry >>= prime->p_shift; + } else { + carry >>= WORD_BITS; + } + + /* at this point, carry > 0 indicates failure */ + dsword_t scarry = carry; + scarry = -scarry; + scarry >>= WORD_BITS; + scarry >>= WORD_BITS; + + return (mask_t) ~scarry; +} + +void +barrett_deserialize_and_reduce ( + word_t *x, + const uint8_t *serial, + uint32_t nserial, + const struct barrett_prime_t *prime +) { + unsigned int size = (nserial + sizeof(word_t) - 1)/sizeof(word_t); + if (size < prime->nwords_p) { + size = prime->nwords_p; + } + word_t tmp[size]; + memset(tmp,0,sizeof(tmp)); + + unsigned int i,j; + for (i=0; i*sizeof(word_t)nwords_p; i++) { + x[i] = tmp[i]; + } + for (; i>(8*j); + } + } +} diff --git a/src/crandom.c b/src/crandom.c new file mode 100644 index 0000000..e4a71d0 --- /dev/null +++ b/src/crandom.c @@ -0,0 +1,442 @@ +/* Copyright (c) 2011 Stanford University. + * Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. 
See LICENSE.txt for license information. + */ + +/* Chacha random number generator code copied from crandom */ + +#include "intrinsics.h" +#include "crandom.h" +#include + +volatile unsigned int crandom_features = 0; + +unsigned int crandom_detect_features() { + unsigned int out = GEN; + +# if (defined(__i386__) || defined(__x86_64__)) + u_int32_t a,b,c,d; + + a=1; __asm__("cpuid" : "+a"(a), "=b"(b), "=c"(c), "=d"(d)); + out |= GEN; + if (d & 1<<26) out |= SSE2; + if (d & 1<< 9) out |= SSSE3; + if (c & 1<<25) out |= AESNI; + if (c & 1<<28) out |= AVX; + if (b & 1<<5) out |= AVX2; + + a=0x80000001; __asm__("cpuid" : "+a"(a), "=b"(b), "=c"(c), "=d"(d)); + if (c & 1<<11) out |= XOP; + if (c & 1<<30) out |= RDRAND; +# endif + + return out; +} + + + +INTRINSIC u_int64_t rdrand(int abort_on_fail) { + uint64_t out = 0; + int tries = 1000; + + if (HAVE(RDRAND)) { + # if defined(__x86_64__) + u_int64_t out, a=0; + for (; tries && !a; tries--) { + __asm__ __volatile__ ( + "rdrand %0\n\tsetc %%al" + : "=r"(out), "+a"(a) :: "cc" + ); + } + # elif (defined(__i386__)) + u_int32_t reg, a=0; + uint64_t out; + for (; tries && !a; tries--) { + __asm__ __volatile__ ( + "rdrand %0\n\tsetc %%al" + : "=r"(reg), "+a"(a) :: "cc" + ); + } + out = reg; a = 0; + for (; tries && !a; tries--) { + __asm__ __volatile__ ( + "rdrand %0\n\tsetc %%al" + : "=r"(reg), "+a"(a) :: "cc" + ); + } + out = out << 32 | reg; + return out; + # else + abort(); // whut + # endif + } else { + tries = 0; + } + + if (abort_on_fail && !tries) { + abort(); + } + + return out; +} + + +/* ------------------------------- Vectorized code ------------------------------- */ +#define shuffle(x,i) _mm_shuffle_epi32(x, \ + i + ((i+1)&3)*4 + ((i+2)&3)*16 + ((i+3)&3)*64) + +#define add _mm_add_epi32 +#define add64 _mm_add_epi64 + +#define NEED_XOP (MIGHT_HAVE(XOP)) +#define NEED_SSSE3 (MIGHT_HAVE(SSSE3) && !MUST_HAVE(XOP)) +#define NEED_SSE2 (MIGHT_HAVE(SSE2) && !MUST_HAVE(SSSE3)) +#define NEED_CONV (!MUST_HAVE(SSE2)) + +#if 
NEED_XOP +static __inline__ void +quarter_round_xop( + ssereg *a, + ssereg *b, + ssereg *c, + ssereg *d +) { + *a = add(*a,*b); *d = xop_rotate(16, *d ^ *a); + *c = add(*c,*d); *b = xop_rotate(12, *b ^ *c); + *a = add(*a,*b); *d = xop_rotate(8, *d ^ *a); + *c = add(*c,*d); *b = xop_rotate(7, *b ^ *c); +} +#endif + +#if NEED_SSSE3 +static const ssereg shuffle8 = { 0x0605040702010003ull, 0x0E0D0C0F0A09080Bull }; +static const ssereg shuffle16 = { 0x0504070601000302ull, 0x0D0C0F0E09080B0Aull }; + +INTRINSIC ssereg ssse3_rotate_8(ssereg a) { + return _mm_shuffle_epi8(a, shuffle8); +} + +INTRINSIC ssereg ssse3_rotate_16(ssereg a) { + return _mm_shuffle_epi8(a, shuffle16); +} + +static __inline__ void +quarter_round_ssse3( + ssereg *a, + ssereg *b, + ssereg *c, + ssereg *d +) { + *a = add(*a,*b); *d = ssse3_rotate_16(*d ^ *a); + *c = add(*c,*d); *b = sse2_rotate(12, *b ^ *c); + *a = add(*a,*b); *d = ssse3_rotate_8( *d ^ *a); + *c = add(*c,*d); *b = sse2_rotate(7, *b ^ *c); +} +#endif /* MIGHT_HAVE(SSSE3) && !MUST_HAVE(XOP) */ + +#if NEED_SSE2 +static __inline__ void +quarter_round_sse2( + ssereg *a, + ssereg *b, + ssereg *c, + ssereg *d +) { + *a = add(*a,*b); *d = sse2_rotate(16, *d ^ *a); + *c = add(*c,*d); *b = sse2_rotate(12, *b ^ *c); + *a = add(*a,*b); *d = sse2_rotate(8, *d ^ *a); + *c = add(*c,*d); *b = sse2_rotate(7, *b ^ *c); +} +#endif + +#define DOUBLE_ROUND(qrf) { \ + qrf(&a1,&b1,&c1,&d1); \ + qrf(&a2,&b2,&c2,&d2); \ + b1 = shuffle(b1,1); \ + c1 = shuffle(c1,2); \ + d1 = shuffle(d1,3); \ + b2 = shuffle(b2,1); \ + c2 = shuffle(c2,2); \ + d2 = shuffle(d2,3); \ + \ + qrf(&a1,&b1,&c1,&d1); \ + qrf(&a2,&b2,&c2,&d2); \ + b1 = shuffle(b1,3); \ + c1 = shuffle(c1,2); \ + d1 = shuffle(d1,1); \ + b2 = shuffle(b2,3); \ + c2 = shuffle(c2,2); \ + d2 = shuffle(d2,1); \ + } + +#define OUTPUT_FUNCTION { \ + output[0] = add(a1,aa); \ + output[1] = add(b1,bb); \ + output[2] = add(c1,cc); \ + output[3] = add(d1,dd); \ + output[4] = add(a2,aa); \ + output[5] = add(b2,bb); \ + 
output[6] = add(c2,add(cc,p)); \ + output[7] = add(d2,dd); \ + \ + output += 8; \ + \ + cc = add64(add64(cc,p), p); \ + a1 = a2 = aa; \ + b1 = b2 = bb; \ + c1 = cc; c2 = add64(cc,p);\ + d1 = d2 = dd; \ + } +/* ------------------------------------------------------------------------------- */ + +INTRINSIC u_int32_t rotate(int r, u_int32_t a) { + return a<>(32-r); +} + +static __inline__ __attribute__((unused)) void +quarter_round(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d) { + *a = *a + *b; *d = rotate(16, *d^*a); + *c = *c + *d; *b = rotate(12, *b^*c); + *a = *a + *b; *d = rotate(8, *d^*a); + *c = *c + *d; *b = rotate(7, *b^*c); +} + +static void +crandom_chacha_expand(u_int64_t iv, + u_int64_t ctr, + int nr, + int output_size, + const unsigned char *key_, + unsigned char *output_) { +# if MIGHT_HAVE_SSE2 + if (HAVE(SSE2)) { + ssereg *key = (ssereg *)key_; + ssereg *output = (ssereg *)output_; + + ssereg a1 = key[0], a2 = a1, aa = a1, + b1 = key[1], b2 = b1, bb = b1, + c1 = {iv, ctr}, c2 = {iv, ctr+1}, cc = c1, + d1 = {0x3320646e61707865ull, 0x6b20657479622d32ull}, + d2 = d1, dd = d1, + p = {0, 1}; + + int i,r; +# if (NEED_XOP) + if (HAVE(XOP)) { + for (i=0; i0; r-=2) + DOUBLE_ROUND(quarter_round_xop); + OUTPUT_FUNCTION; + } + return; + } +# endif +# if (NEED_SSSE3) + if (HAVE(SSSE3)) { + for (i=0; i0; r-=2) + DOUBLE_ROUND(quarter_round_ssse3); + OUTPUT_FUNCTION; + } + return; + } +# endif +# if (NEED_SSE2) + if (HAVE(SSE2)) { + for (i=0; i0; r-=2) + DOUBLE_ROUND(quarter_round_sse2); + OUTPUT_FUNCTION; + } + return; + } +# endif + } +# endif + +# if NEED_CONV + { + const u_int32_t *key = (const u_int32_t *)key_; + u_int32_t + x[16], + input[16] = { + key[0], key[1], key[2], key[3], + key[4], key[5], key[6], key[7], + iv, iv>>32, ctr, ctr>>32, + 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 + }, + *output = (u_int32_t *)output_; + int i, r; + + for (i=0; i0; r-=2) { + quarter_round(&x[0], &x[4], &x[8], &x[12]); + quarter_round(&x[1], &x[5], &x[9], 
&x[13]); + quarter_round(&x[2], &x[6], &x[10], &x[14]); + quarter_round(&x[3], &x[7], &x[11], &x[15]); + + quarter_round(&x[0], &x[5], &x[10], &x[15]); + quarter_round(&x[1], &x[6], &x[11], &x[12]); + quarter_round(&x[2], &x[7], &x[8], &x[13]); + quarter_round(&x[3], &x[4], &x[9], &x[14]); + } + for (r=0; r<16; r++) { + output[r] = x[r] + input[r]; + } + + output += 16; + input[11] ++; + if (!input[11]) input[12]++; + } + } + +#endif /* NEED_CONV */ +} + +/* "return 4", cf xkcd #221 */ +#define CRANDOM_MAGIC 0x72657475726e2034ull + +int +crandom_init_from_file( + struct crandom_state_t *state, + const char *filename, + int reseed_interval, + int reseeds_mandatory +) { + state->fill = 0; + state->reseed_countdown = reseed_interval; + state->reseed_interval = reseed_interval; + state->ctr = 0; + + state->randomfd = open(filename, O_RDONLY); + if (state->randomfd == -1) { + int err = errno; + return err ? err : -1; + } + + ssize_t offset = 0, red; + do { + red = read(state->randomfd, state->seed + offset, 32 - offset); + if (red > 0) offset += red; + } while (red > 0 && offset < 32); + + if (offset < 32) { + int err = errno; + return err ? 
err : -1; + } + + memset(state->buffer, 0, 96); + + state->magic = CRANDOM_MAGIC; + state->reseeds_mandatory = reseeds_mandatory; + + return 0; +} + +void +crandom_init_from_buffer( + struct crandom_state_t *state, + const char initial_seed[32] +) { + memcpy(state->seed, initial_seed, 32); + memset(state->buffer, 0, 96); + state->reseed_countdown = state->reseed_interval = state->fill = state->ctr = state->reseeds_mandatory = 0; + state->randomfd = -1; + state->magic = CRANDOM_MAGIC; +} + +int +crandom_generate( + struct crandom_state_t *state, + unsigned char *output, + unsigned long long length +) { + /* the generator isn't seeded; maybe they ignored the return value of init_from_file */ + if (unlikely(state->magic != CRANDOM_MAGIC)) { + abort(); + } + + int ret = 0; + + while (length) { + if (unlikely(state->fill <= 0)) { + uint64_t iv = 0; + if (state->reseed_interval) { + /* it's nondeterministic, stir in some rdrand() or rdtsc() */ + if (HAVE(RDRAND)) { + iv = rdrand(0); + if (!iv) iv = rdtsc(); + } else { + iv = rdtsc(); + } + + state->reseed_countdown--; + if (unlikely(state->reseed_countdown <= 0)) { + /* reseed by xoring in random state */ + state->reseed_countdown = state->reseed_interval; + ssize_t offset = 0, red; + do { + red = read(state->randomfd, state->buffer + offset, 32 - offset); + if (red > 0) offset += red; + } while (red > 0 && offset < 32); + + if (offset < 32) { + /* The read failed. Signal an error with the return code. + * + * If reseeds are mandatory, crash. + * + * If not, the generator is still probably safe to use, because reseeding + * is basically over-engineering for caution. Also, the user might ignore + * the return code, so we still need to fill the request. + * + * Set reseed_countdown = 1 so we'll try again later. If the user's + * performance sucks as a result of ignoring the error code while calling + * us in a loop, well, that's life. 
+ */ + if (state->reseeds_mandatory) { + abort(); + } + + ret = errno; + if (ret == 0) ret = -1; + state->reseed_countdown = 1; + } + + int i; + for (i=0; i<32; i++) { + /* Stir in the buffer. If somehow the read failed, it'll be zeros. */ + state->seed[i] ^= state->buffer[i]; + } + } + } + crandom_chacha_expand(iv,state->ctr,20,128,state->seed,state->seed); + state->ctr++; + state->fill = sizeof(state->buffer); + } + + unsigned long long copy = (length > state->fill) ? state->fill : length; + state->fill -= copy; + memcpy(output, state->buffer + state->fill, copy); + memset(state->buffer + state->fill, 0, copy); + output += copy; length -= copy; + } + + return ret; +} + +void +crandom_destroy( + struct crandom_state_t *state +) { + if (state->magic == CRANDOM_MAGIC && state->randomfd) { + (void) close(state->randomfd); + /* Ignore the return value from close(), because what would it mean? + * "Your random device, which you were reading over NFS, lost some data"? + */ + } + + memset(state, 0, sizeof(*state)); +} diff --git a/src/exported.sym b/src/exported.sym new file mode 100644 index 0000000..e26f3db --- /dev/null +++ b/src/exported.sym @@ -0,0 +1,6 @@ +_goldilocks_init +_goldilocks_keygen +_goldilocks_shared_secret +_goldilocks_sign +_goldilocks_verify +_goldilocks_private_to_public diff --git a/src/goldilocks.c b/src/goldilocks.c new file mode 100644 index 0000000..f178d7a --- /dev/null +++ b/src/goldilocks.c @@ -0,0 +1,393 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ + +#include "config.h" +#include "word.h" + +#include + +#if GOLDILOCKS_USE_PTHREAD +#include +#endif + +#include "goldilocks.h" +#include "ec_point.h" +#include "scalarmul.h" +#include "barrett_field.h" +#include "crandom.h" +#include "sha512.h" +#include "intrinsics.h" + +#ifndef GOLDILOCKS_RANDOM_INIT_FILE +#define GOLDILOCKS_RANDOM_INIT_FILE "/dev/urandom" +#endif + +#ifndef GOLDILOCKS_RANDOM_RESEED_INTERVAL +#define GOLDILOCKS_RANDOM_RESEED_INTERVAL 10000 +#endif + +/* We'll check it ourselves */ +#ifndef GOLDILOCKS_RANDOM_RESEEDS_MANDATORY +#define GOLDILOCKS_RANDOM_RESEEDS_MANDATORY 0 +#endif + +/* FUTURE: auto */ +const struct affine_t goldilocks_base_point = { + {{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7), + U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa), + U58LE(0x5f065c3c59d070), U58LE(0x3a6a26adf73324), + U58LE(0x1b4faff4609845), U58LE(0x297ea0ea2692ff) + }}, + {{ 19 }} +}; + +static const char *G_INITING = "initializing"; +static const char *G_INITED = "initialized"; +static const char *G_FAILED = "failed to initialize"; + +/* FUTURE: auto */ +static const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = { + U64LE(0xdc873d6d54a7bb0d), + U64LE(0xde933d8d723a70aa), + U64LE(0x3bb124b65129c96f), + 0x8335dc16 +}; +const struct barrett_prime_t goldi_q448 = { + 448/WORD_BITS, + 62 % WORD_BITS, + sizeof(goldi_q448_lo)/sizeof(goldi_q448_lo[0]), + goldi_q448_lo +}; + +/* FUTURE: auto */ +struct { + const char * volatile state; +#if GOLDILOCKS_USE_PTHREAD + pthread_mutex_t mutex; +#endif + struct tw_niels_t combs[(WORD_BITS==64) ? 
80 : 64]; + struct fixed_base_table_t fixed_base; + struct tw_niels_t wnafs[32]; + struct crandom_state_t rand; +} goldilocks_global; + +static inline mask_t +goldilocks_check_init() { + if (likely(goldilocks_global.state == G_INITED)) { + return MASK_SUCCESS; + } else { + return MASK_FAILURE; + } +} + +int +goldilocks_init () { + const char *res = compare_and_swap(&goldilocks_global.state, NULL, G_INITING); + if (res == G_INITED) return GOLDI_EALREADYINIT; + else if (res) { + return GOLDI_ECORRUPT; + } + +#if GOLDILOCKS_USE_PTHREAD + int ret = pthread_mutex_init(&goldilocks_global.mutex, NULL); + if (ret) goto fail; +#endif + + struct extensible_t ext; + struct tw_extensible_t text; + + /* Sanity check: the base point is on the curve. */ + assert(validate_affine(&goldilocks_base_point)); + + /* Convert it to twisted Edwards. */ + convert_affine_to_extensible(&ext, &goldilocks_base_point); + twist_even(&text, &ext); + + /* Precompute the tables. */ + mask_t succ; + + int big = (WORD_BITS==64); + uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14; + + succ = precompute_fixed_base(&goldilocks_global.fixed_base, &text, n, t, s, goldilocks_global.combs); + succ &= precompute_fixed_base_wnaf(goldilocks_global.wnafs, &text, 5); + + int criff_res = crandom_init_from_file(&goldilocks_global.rand, + GOLDILOCKS_RANDOM_INIT_FILE, + GOLDILOCKS_RANDOM_RESEED_INTERVAL, + GOLDILOCKS_RANDOM_RESEEDS_MANDATORY); + + if (succ & !criff_res) { + if (!bool_compare_and_swap(&goldilocks_global.state, G_INITING, G_INITED)) { + abort(); + } + return 0; + } + + /* it failed! fall though... 
*/ + +fail: + if (!bool_compare_and_swap(&goldilocks_global.state, G_INITING, G_FAILED)) { + /* ok something is seriously wrong */ + abort(); + } + return -1; +} + +static const struct p448_t +sqrt_d_minus_1 = {{ + U58LE(0xd2e21836749f46), + U58LE(0x888db42b4f0179), + U58LE(0x5a189aabdeea38), + U58LE(0x51e65ca6f14c06), + U58LE(0xa49f7b424d9770), + U58LE(0xdcac4628c5f656), + U58LE(0x49443b8748734a), + U58LE(0x12fec0c0b25b7a) +}}; + +int +goldilocks_keygen ( + struct goldilocks_private_key_t *privkey, + struct goldilocks_public_key_t *pubkey +) { + if (!goldilocks_check_init()) { + return GOLDI_EUNINIT; + } + + word_t sk[448*2/WORD_BITS]; + + struct tw_extensible_t exta; + struct p448_t pk; + +#if GOLDILOCKS_USE_PTHREAD + int ml_ret = pthread_mutex_lock(&goldilocks_global.mutex); + if (ml_ret) return ml_ret; +#endif + + int ret = crandom_generate(&goldilocks_global.rand, (unsigned char *)sk, sizeof(sk)); + int ret2 = crandom_generate(&goldilocks_global.rand, &privkey->opaque[112], 32); + if (!ret) ret = ret2; + +#if GOLDILOCKS_USE_PTHREAD + ml_ret = pthread_mutex_unlock(&goldilocks_global.mutex); + if (ml_ret) abort(); +#endif + + barrett_reduce(sk,sizeof(sk)/sizeof(sk[0]),0,&goldi_q448); + barrett_serialize(privkey->opaque, sk, 448/8); + + scalarmul_fixed_base(&exta, sk, 448, &goldilocks_global.fixed_base); + //transfer_and_serialize_qtor(&pk, &sqrt_d_minus_1, &exta); + untwist_and_double_and_serialize(&pk, &exta); + + p448_serialize(pubkey->opaque, &pk); + memcpy(&privkey->opaque[56], pubkey->opaque, 56); + + return ret ? 
GOLDI_ENODICE : GOLDI_EOK; +} + +int +goldilocks_private_to_public ( + struct goldilocks_public_key_t *pubkey, + const struct goldilocks_private_key_t *privkey +) { + struct p448_t pk; + mask_t msucc = p448_deserialize(&pk,&privkey->opaque[56]); + + if (msucc) { + p448_serialize(pubkey->opaque, &pk); + return GOLDI_EOK; + } else { + return GOLDI_ECORRUPT; + } +} + +int +goldilocks_shared_secret ( + uint8_t shared[64], + const struct goldilocks_private_key_t *my_privkey, + const struct goldilocks_public_key_t *your_pubkey +) { + /* This function doesn't actually need anything in goldilocks_global, + * so it doesn't check init. + */ + + word_t sk[448/WORD_BITS]; + struct p448_t pk; + + mask_t succ = p448_deserialize(&pk,your_pubkey->opaque), msucc = -1; + +#ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS + struct p448_t sum, prod; + msucc &= p448_deserialize(&sum,&my_privkey->opaque[56]); + p448_mul(&prod,&pk,&sum); + p448_add(&sum,&pk,&sum); +#endif + + msucc &= barrett_deserialize(sk,my_privkey->opaque,&goldi_q448); + succ &= montgomery_ladder(&pk,&pk,sk,446,2); + + p448_serialize(shared,&pk); + + /* obliterate records of our failure by adjusting with obliteration key */ + struct sha512_ctx_t ctx; + sha512_init(&ctx); + +#ifdef EXPERIMENT_ECDH_OBLITERATE_CT + uint8_t oblit[40]; + unsigned i; + for (i=0; i<8; i++) { + oblit[i] = "noshared"[i] & ~(succ&msucc); + } + for (i=0; i<32; i++) { + oblit[8+i] = my_privkey->opaque[112+i] & ~(succ&msucc); + } + sha512_update(&ctx, oblit, 40); +#endif + +#ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS + /* stir in the sum and product of the pubkeys. 
*/ + uint8_t a_pk[56]; + p448_serialize(a_pk, &sum); + sha512_update(&ctx, a_pk, 56); + p448_serialize(a_pk, &prod); + sha512_update(&ctx, a_pk, 56); +#endif + + /* stir in the shared key and finish */ + sha512_update(&ctx, shared, 56); + sha512_final(&ctx, shared); + + return (GOLDI_ECORRUPT & ~msucc) + | (GOLDI_EINVAL & msucc &~ succ) + | (GOLDI_EOK & msucc & succ); +} + +int +goldilocks_sign ( + uint8_t signature_out[56*2], + const uint8_t *message, + uint64_t message_len, + const struct goldilocks_private_key_t *privkey +) { + if (!goldilocks_check_init()) { + return GOLDI_EUNINIT; + } + + /* challenge = H(pk, [nonceG], message). */ + word_t skw[448/WORD_BITS]; + mask_t succ = barrett_deserialize(skw,privkey->opaque,&goldi_q448); + if (!succ) { + memset(skw,0,sizeof(skw)); + return GOLDI_ECORRUPT; + } + + /* Derive a nonce. TODO: use HMAC. FUTURE: factor. */ + unsigned char sha_out[512/8]; + word_t tk[448/WORD_BITS]; + struct sha512_ctx_t ctx; + sha512_init(&ctx); + sha512_update(&ctx, (const unsigned char *)"signonce", 8); + sha512_update(&ctx, &privkey->opaque[112], 32); + sha512_update(&ctx, message, message_len); + sha512_update(&ctx, &privkey->opaque[112], 32); + sha512_final(&ctx, sha_out); + barrett_deserialize_and_reduce(tk, sha_out, 512/8, &goldi_q448); + + /* 4[nonce]G */ + uint8_t signature_tmp[56]; + struct tw_extensible_t exta; + struct p448_t gsk; + scalarmul_fixed_base(&exta, tk, 448, &goldilocks_global.fixed_base); + double_tw_extensible(&exta); + untwist_and_double_and_serialize(&gsk, &exta); + p448_serialize(signature_tmp, &gsk); + + word_t challenge[448/WORD_BITS]; + sha512_update(&ctx, &privkey->opaque[56], 56); + sha512_update(&ctx, signature_tmp, 56); + sha512_update(&ctx, message, message_len); + sha512_final(&ctx, sha_out); + barrett_deserialize_and_reduce(challenge, sha_out, 512/8, &goldi_q448); + + // reduce challenge and sub. 
+ barrett_negate(challenge,448/WORD_BITS,&goldi_q448); + + barrett_mac( + tk,448/WORD_BITS, + challenge,448/WORD_BITS, + skw,448/WORD_BITS, + &goldi_q448 + ); + + word_t carry = add_nr_ext_packed(tk,tk,448/WORD_BITS,tk,448/WORD_BITS,-1); + barrett_reduce(tk,448/WORD_BITS,carry,&goldi_q448); + + memcpy(signature_out, signature_tmp, 56); + barrett_serialize(signature_out+56, tk, 448/8); + memset((unsigned char *)tk,0,sizeof(tk)); + memset((unsigned char *)skw,0,sizeof(skw)); + memset((unsigned char *)challenge,0,sizeof(challenge)); + + /* response = 2(nonce_secret - sk*challenge) + * Nonce = 8[nonce_secret]*G + * PK = 2[sk]*G, except doubled (TODO) + * so [2] ( [response]G + 2[challenge]PK ) = Nonce + */ + + return 0; +} + +int +goldilocks_verify ( + const uint8_t signature[56*2], + const uint8_t *message, + uint64_t message_len, + const struct goldilocks_public_key_t *pubkey +) { + if (!goldilocks_check_init()) { + return GOLDI_EUNINIT; + } + + struct p448_t pk; + word_t s[448/WORD_BITS]; + + mask_t succ = p448_deserialize(&pk,pubkey->opaque); + if (!succ) return GOLDI_EINVAL; + + succ = barrett_deserialize(s, &signature[56], &goldi_q448); + if (!succ) return GOLDI_EINVAL; + + /* challenge = H(pk, [nonceG], message). 
*/ + unsigned char sha_out[512/8]; + word_t challenge[448/WORD_BITS]; + struct sha512_ctx_t ctx; + sha512_init(&ctx); + sha512_update(&ctx, pubkey->opaque, 56); + sha512_update(&ctx, signature, 56); + sha512_update(&ctx, message, message_len); + sha512_final(&ctx, sha_out); + barrett_deserialize_and_reduce(challenge, sha_out, 512/8, &goldi_q448); + + struct p448_t eph; + struct tw_extensible_t pk_text; + + /* deserialize [nonce]G */ + succ = p448_deserialize(&eph, signature); + if (!succ) return GOLDI_EINVAL; + + succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk); + if (!succ) return GOLDI_EINVAL; + + linear_combo_var_fixed_vt( &pk_text, challenge, 446, s, 446, goldilocks_global.wnafs, 5 ); + + untwist_and_double_and_serialize( &pk, &pk_text ); + p448_sub(&eph, &eph, &pk); + p448_bias(&eph, 2); + + succ = p448_is_zero(&eph); + + return succ ? 0 : GOLDI_EINVAL; +} diff --git a/src/include/barrett_field.h b/src/include/barrett_field.h new file mode 100644 index 0000000..9d8f930 --- /dev/null +++ b/src/include/barrett_field.h @@ -0,0 +1,190 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ +#ifndef __BARRETT_FIELD_H__ +#define __BARRETT_FIELD_H__ 1 + +/** + * @file barrett_field.h + * @brief Slow routines for generic primes in Barrett form. + * + * @warning These routines are very slow, roughly implemented, and should be made more + * flexible in the future. I might even outright switch to Montgomery form. + */ + +#include "word.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief A Barrett-form prime, 2^k - c. + * @todo Support primes of other forms. + */ +struct barrett_prime_t { + uint32_t nwords_p; /**< The number of bits in p, i.e. ceiling((k-1) / WORD_BITS) */ + uint32_t p_shift; /**< c mod WORD_BITS. */ + uint32_t nwords_lo; /**< The number of nonzero low words. */ + const word_t *p_lo; /**< The low words. */ +}; + +/** + * The Goldilocks prime. 
I'm not sure this is the right place for it, but oh well. + */ +extern const struct barrett_prime_t goldi_q448; + +/** + * Reduce a number (with optional high carry word) mod p. + * + * @param [inout] a The value to be reduced. + * @param [in] nwords_a The number of words in a. + * @param [in] a_carry A high word to be carried into the computation. + * @param [in] prime The Barrett prime. + */ +void +barrett_reduce( + word_t *a, + uint32_t nwords_a, + word_t a_carry, + const struct barrett_prime_t *prime +); + +/** + * out = a+(c&mask), returning a carry. + * + * @param [out] out The output, of length nwords_a. + * @param [in] a The "always" addend. + * @param [in] nwords_a The number of words in a. + * @param [in] c The "sometimes" addend. + * @param [in] nwords_c The number of words in c. + * @param [in] mask A mask of whether to add or not. + * @return A carry word. + */ +word_t +add_nr_ext_packed( + word_t *out, + const word_t *a, + uint32_t nwords_a, + const word_t *c, + uint32_t nwords_c, + word_t mask +); + +/** + * out = a-(c&mask), returning a borrow. + * + * @param [out] out The output, of length nwords_a. + * @param [in] a The "always" minuend. + * @param [in] nwords_a The number of words in a. + * @param [in] c The "sometimes" subtrahend. + * @param [in] nwords_c The number of words in c. + * @param [in] mask A mask of whether to add or not. + * @return A borrow word. + */ +word_t +sub_nr_ext_packed( + word_t *out, + const word_t *a, + uint32_t nwords_a, + const word_t *c, + uint32_t nwords_c, + word_t mask +); + +/** + * a -> reduce(-a) mod p + * + * @param [in] a The value to be reduced and negated. + * @param [in] nwords_a The number of words in a. Must be >= nwords_p. + * @param [in] prime The prime. + */ +void +barrett_negate ( + word_t *a, + uint32_t nwords_a, + const struct barrett_prime_t *prime +); + +/* + * If doMac, accum = accum + a*b mod p. + * Otherwise, accum = a*b mod p. 
+ * + * This function is not __restrict__; you may pass accum, + * a, b, etc all from the same location. + */ +void +barrett_mul_or_mac( + word_t *accum, + uint32_t nwords_accum, + + const word_t *a, + uint32_t nwords_a, + + const word_t *b, + uint32_t nwords_b, + + const struct barrett_prime_t *prime, + + mask_t doMac +); + +static inline void +barrett_mul( + word_t *out, + int nwords_out, + + const word_t *a, + uint32_t nwords_a, + + const word_t *b, + uint32_t nwords_b, + + const struct barrett_prime_t *prime +) { + barrett_mul_or_mac(out,nwords_out,a,nwords_a,b,nwords_b,prime,0); +} + +static inline void +barrett_mac( + word_t *out, + uint32_t nwords_out, + + const word_t *a, + uint32_t nwords_a, + + const word_t *b, + uint32_t nwords_b, + + const struct barrett_prime_t *prime +) { + barrett_mul_or_mac(out,nwords_out,a,nwords_a,b,nwords_b,prime,-1); +} + +mask_t +barrett_deserialize ( + word_t *x, + const uint8_t *serial, + const struct barrett_prime_t *prime +); + +void +barrett_serialize ( + uint8_t *serial, + const word_t *x, + uint32_t nserial +); + +void +barrett_deserialize_and_reduce ( + word_t *x, + const uint8_t *serial, + uint32_t nserial, + const struct barrett_prime_t *prime +); + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __BARRETT_FIELD_H__ */ diff --git a/src/include/config.h b/src/include/config.h new file mode 100644 index 0000000..dbd785d --- /dev/null +++ b/src/include/config.h @@ -0,0 +1,8 @@ +#ifndef __GOLDILOCKS_CONFIG_H__ +#define __GOLDILOCKS_CONFIG_H__ 1 + +#define GOLDILOCKS_USE_PTHREAD 1 +#define EXPERIMENT_ECDH_OBLITERATE_CT 1 +#define EXPERIMENT_ECDH_STIR_IN_PUBKEYS 1 + +#endif // __GOLDILOCKS_CONFIG_H__ diff --git a/src/include/crandom.h b/src/include/crandom.h new file mode 100644 index 0000000..f603f13 --- /dev/null +++ b/src/include/crandom.h @@ -0,0 +1,140 @@ +/* Copyright (c) 2011 Stanford University. + * Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. 
See LICENSE.txt for license information. + */ + +/** + * @file crandom.h + * @author Mike Hamburg + * @brief A miniature version of the (as of yet incomplete) crandom project. + */ + +#ifndef __GOLDI_CRANDOM_H__ +#define __GOLDI_CRANDOM_H__ 1 + +#include /* for uint64_t */ +#include /* for open */ +#include /* for returning errors after open */ +#include /* for abort */ +#include /* for memcpy */ +#include /* for bzero */ +#include /* for read */ + +/** + * @brief The state of a crandom generator. + * + * This object is opaque. It is not protected by a lock, and so must + * not be accessed by multiple threads at the same time. + */ +struct crandom_state_t { + /** @privatesection */ + unsigned char seed[32]; + unsigned char buffer[96]; + uint64_t ctr; + uint64_t magic; + unsigned int fill; + int reseed_countdown; + int reseed_interval; + int reseeds_mandatory; + int randomfd; +} __attribute__((aligned(16))) ; + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize a crandom state from the chosen file. + * + * This function initializes a state from a given state file, or + * from a random device (eg. /dev/random or /dev/urandom). + * + * You must check the return value of this function. + * + * @param [out] state The crandom state variable to initalize. + * @param [in] filename The name of the seed file or random device. + * @param [in] reseed_interval The number of 96-byte blocks which can be + * generated without reseeding. Suggest 10000. + * @param [in] reseeds_mandatory If nonzero, call abort() if a reseed fails. + * Suggest 1. + * + * @retval 0 Success. + * @retval Nonzero An error to be interpreted by strerror(). + */ +int +crandom_init_from_file ( + struct crandom_state_t *state, + const char *filename, + int reseed_interval, + int reseeds_mandatory +) __attribute__((warn_unused_result)); + + +/** + * Initialize a crandom state from a buffer, for deterministic operation. 
+ * + * This function is used to initialize a crandom state deterministically, + * mainly for testing purposes. It can also be used to expand a secret + * random value deterministically. + * + * @warning The crandom implementation is not guaranteed to be stable. + * That is, a later release might produce a different random stream from + * the same seed. + * + * @param [out] state The crandom state variable to initalize. + * @param [in] initial_seed The seed value. + */ +void +crandom_init_from_buffer ( + struct crandom_state_t *state, + const char initial_seed[32] +); + +/** + * Fill the output buffer with random data. + * + * This function uses the given crandom state to produce pseudorandom data + * in the output buffer. + * + * This function may perform reads from the state's random device if it needs + * to reseed. This could block if that file is a blocking source, such as + * a pipe or /dev/random on Linux. If reseeding fails and the state has + * reseeds_mandatory set, this function will call abort(). Otherwise, it will + * return an error code, but it will still randomize the buffer. + * + * If called on a corrupted, uninitialized or destroyed state, this function + * will abort(). + * + * @warning This function is not thread-safe with respect to the state. Don't + * call it from multiple threads with the same state at the same time. + * + * @param [inout] state The crandom state to use for generation. + * @param [out] output The buffer to fill with random data. + * @param [in] length The length of the buffer. + * + * @retval 0 Success. + * @retval Nonezero A non-mandatory reseed operation failed. + */ +int +crandom_generate ( + struct crandom_state_t *state, + unsigned char *output, + unsigned long long length +); + +/** + * Destroy the random state. Further calls to crandom_generate() on that state + * will abort(). + * + * @param [inout] state The state to be destroyed. 
+ */ +void +crandom_destroy ( + struct crandom_state_t *state +); + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __GOLDI_CRANDOM_H__ */ diff --git a/src/include/ec_point.h b/src/include/ec_point.h new file mode 100644 index 0000000..456cd3d --- /dev/null +++ b/src/include/ec_point.h @@ -0,0 +1,552 @@ +/** + * @file ec_point.h + * @copyright + * Copyright (c) 2014 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. + * @author Mike Hamburg + * @warning This file was automatically generated. + */ + +#ifndef __CC_INCLUDED_EC_POINT_H__ +#define __CC_INCLUDED_EC_POINT_H__ + +#include "p448.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Affine point on an Edwards curve. + */ +struct affine_t { + struct p448_t x, y; +}; + +/** + * Affine point on a twisted Edwards curve. + */ +struct tw_affine_t { + struct p448_t x, y; +}; + +/** + * Montgomery buffer. + */ +struct montgomery_t { + struct p448_t z0, xd, zd, xa, za; +}; + +/** + * Extensible coordinates for Edwards curves, suitable for + * accumulators. + * + * Represents the point (x/z, y/z). The extra coordinates + * t,u satisfy xy = tuz, allowing for conversion to Extended + * form by multiplying t and u. + * + * The idea is that you don't have to do this multiplication + * when doubling the accumulator, because the t-coordinate + * isn't used there. At the same time, as long as you only + * have one point in extensible form, additions don't cost + * extra. + * + * This is essentially a lazier version of Hisil et al's + * lookahead trick. It might be worth considering that trick + * instead. + */ +struct extensible_t { + struct p448_t x, y, z, t, u; +}; + +/** + * Extensible coordinates for twisted Edwards curves, + * suitable for accumulators. + */ +struct tw_extensible_t { + struct p448_t x, y, z, t, u; +}; + +/** + * Niels coordinates for twisted Edwards curves. + * + * Good for mixed readdition; suitable for fixed tables. 
+ */ +struct tw_niels_t { + struct p448_t a, b, c; +}; + +/** + * Projective niels coordinates for twisted Edwards curves. + * + * Good for readdition; suitable for temporary tables. + */ +struct tw_pniels_t { + struct tw_niels_t n; + struct p448_t z; +}; + + +/** + * Auto-generated copy method. + */ +static __inline__ void +copy_affine ( + struct affine_t* a, + const struct affine_t* ds +) __attribute__((unused,always_inline)); + +/** + * Auto-generated copy method. + */ +static __inline__ void +copy_tw_affine ( + struct tw_affine_t* a, + const struct tw_affine_t* ds +) __attribute__((unused,always_inline)); + +/** + * Auto-generated copy method. + */ +static __inline__ void +copy_montgomery ( + struct montgomery_t* a, + const struct montgomery_t* ds +) __attribute__((unused,always_inline)); + +/** + * Auto-generated copy method. + */ +static __inline__ void +copy_extensible ( + struct extensible_t* a, + const struct extensible_t* ds +) __attribute__((unused,always_inline)); + +/** + * Auto-generated copy method. + */ +static __inline__ void +copy_tw_extensible ( + struct tw_extensible_t* a, + const struct tw_extensible_t* ds +) __attribute__((unused,always_inline)); + +/** + * Auto-generated copy method. + */ +static __inline__ void +copy_tw_niels ( + struct tw_niels_t* a, + const struct tw_niels_t* ds +) __attribute__((unused,always_inline)); + +/** + * Auto-generated copy method. + */ +static __inline__ void +copy_tw_pniels ( + struct tw_pniels_t* a, + const struct tw_pniels_t* ds +) __attribute__((unused,always_inline)); + +/** + * Returns 1/sqrt(+- x). + * + * The Legendre symbol of the result is the same as that of the + * input. + * + * If x=0, returns 0. + */ +void +p448_isr ( + struct p448_t* a, + const struct p448_t* x +); + +/** + * Returns 1/x. + * + * If x=0, returns 0. 
+ */ +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +); + +/** + * Add two points on a twisted Edwards curve, one in Extensible form + * and the other in half-Niels form. + */ +void +add_tw_niels_to_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +); + +/** + * Add two points on a twisted Edwards curve, one in Extensible form + * and the other in half-Niels form. + */ +void +sub_tw_niels_from_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +); + +/** + * Add two points on a twisted Edwards curve, one in Extensible form + * and the other in projective Niels form. + */ +void +add_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +); + +/** + * Add two points on a twisted Edwards curve, one in Extensible form + * and the other in projective Niels form. + */ +void +sub_tw_pniels_from_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +); + +/** + * Double a point on a twisted Edwards curve, in "extensible" coordinates. + */ +void +double_tw_extensible ( + struct tw_extensible_t* a +); + +/** + * Double a point on an Edwards curve, in "extensible" coordinates. + */ +void +double_extensible ( + struct extensible_t* a +); + +/** + * Double a point, and transfer it to the twisted curve. + * + * That is, apply the 4-isogeny. + */ +void +twist_and_double ( + struct tw_extensible_t* b, + const struct extensible_t* a +); + +/** + * Double a point, and transfer it to the untwisted curve. + * + * That is, apply the dual isogeny. 
+ */ +void +untwist_and_double ( + struct extensible_t* b, + const struct tw_extensible_t* a +); + +void +convert_tw_affine_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_affine_t* a +); + +void +convert_tw_affine_to_tw_extensible ( + struct tw_extensible_t* b, + const struct tw_affine_t* a +); + +void +convert_affine_to_extensible ( + struct extensible_t* b, + const struct affine_t* a +); + +void +convert_tw_extensible_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_extensible_t* a +); + +void +convert_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* d +); + +void +convert_tw_niels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_niels_t* d +); + +void +montgomery_step ( + struct montgomery_t* a +); + +void +deserialize_montgomery ( + struct montgomery_t* a, + const struct p448_t* sbz +); + +mask_t +serialize_montgomery ( + struct p448_t* b, + const struct montgomery_t* a, + const struct p448_t* sbz +); + +/** + * Serialize a point on an Edwards curve. + * + * The serialized form would be sqrt((z-y)/(z+y)) with sign of xz. + * + * It would be on 4y^2/(1-d) = x^3 + 2(1+d)/(1-d) * x^2 + x. + * + * But 4/(1-d) isn't square, so we need to twist it: + * + * -x is on 4y^2/(d-1) = x^3 + 2(d+1)/(d-1) * x^2 + x + */ +void +serialize_extensible ( + struct p448_t* b, + const struct extensible_t* a +); + +/** + * + */ +void +untwist_and_double_and_serialize ( + struct p448_t* b, + const struct tw_extensible_t* a +); + +/** + * Expensive transfer from untwisted to twisted. Roughly equivalent to halve and isogeny. + * Correctly transfers point of order 2. + * + * Can't have x=+1 (it's not even). There is code to fix the exception that would otherwise + * occur at (0,1). + * + * Input point must be even. + */ +void +twist_even ( + struct tw_extensible_t* b, + const struct extensible_t* a +); + +/** + * Expensive transfer from untwisted to twisted. Roughly equivalent to halve and isogeny. 
+ * + * This function is for testing purposes only, because it can return odd points on the + * twist. This can cause exceptions in the point addition formula. What's more, this + * function should be able to return points of order 4, which are at infinity. + * + * This function probably doesn't properly handle special cases, such as the point at + * infinity (FUTURE). + * + * This function probably isn't a homomorphism, in that it probably doesn't consistently + * handle adjustments by the point of order 2 when the input is odd. (FUTURE) + */ +void +test_only_twist ( + struct tw_extensible_t* b, + const struct extensible_t* a +); + +mask_t +is_square ( + const struct p448_t* x +); + +mask_t +is_even_pt ( + const struct extensible_t* a +); + +mask_t +is_even_tw ( + const struct tw_extensible_t* a +); + +/** + * Deserialize a point to an untwisted affine curve. + */ +mask_t +deserialize_affine ( + struct affine_t* a, + const struct p448_t* sz +); + +/** + * Deserialize a point and transfer it to the twist. + * + * Not guaranteed to preserve the 4-torsion component. + * + * Refuses to deserialize +-1, which are the points of order 2. + */ +mask_t +deserialize_and_twist_approx ( + struct tw_extensible_t* a, + const struct p448_t* sdm1, + const struct p448_t* sz +); + +void +set_identity_extensible ( + struct extensible_t* a +); + +void +set_identity_tw_extensible ( + struct tw_extensible_t* a +); + +void +set_identity_affine ( + struct affine_t* a +); + +mask_t +eq_affine ( + const struct affine_t* a, + const struct affine_t* b +); + +mask_t +eq_extensible ( + const struct extensible_t* a, + const struct extensible_t* b +); + +mask_t +eq_tw_extensible ( + const struct tw_extensible_t* a, + const struct tw_extensible_t* b +); + +void +elligator_2s_inject ( + struct affine_t* a, + const struct p448_t* r +); + +mask_t +validate_affine ( + const struct affine_t* a +); + +/** + * Check the invariants for struct tw_extensible_t. 
+ * NOTE: This function was automatically generated + * with no regard for speed. + */ +mask_t +validate_tw_extensible ( + const struct tw_extensible_t* ext +); + +/** + * Check the invariants for struct extensible_t. + * NOTE: This function was automatically generated + * with no regard for speed. + */ +mask_t +validate_extensible ( + const struct extensible_t* ext +); + + +void +copy_affine ( + struct affine_t* a, + const struct affine_t* ds +) { + p448_copy ( &a->x, &ds->x ); + p448_copy ( &a->y, &ds->y ); +} + +void +copy_tw_affine ( + struct tw_affine_t* a, + const struct tw_affine_t* ds +) { + p448_copy ( &a->x, &ds->x ); + p448_copy ( &a->y, &ds->y ); +} + +void +copy_montgomery ( + struct montgomery_t* a, + const struct montgomery_t* ds +) { + p448_copy ( &a->z0, &ds->z0 ); + p448_copy ( &a->xd, &ds->xd ); + p448_copy ( &a->zd, &ds->zd ); + p448_copy ( &a->xa, &ds->xa ); + p448_copy ( &a->za, &ds->za ); +} + +void +copy_extensible ( + struct extensible_t* a, + const struct extensible_t* ds +) { + p448_copy ( &a->x, &ds->x ); + p448_copy ( &a->y, &ds->y ); + p448_copy ( &a->z, &ds->z ); + p448_copy ( &a->t, &ds->t ); + p448_copy ( &a->u, &ds->u ); +} + +void +copy_tw_extensible ( + struct tw_extensible_t* a, + const struct tw_extensible_t* ds +) { + p448_copy ( &a->x, &ds->x ); + p448_copy ( &a->y, &ds->y ); + p448_copy ( &a->z, &ds->z ); + p448_copy ( &a->t, &ds->t ); + p448_copy ( &a->u, &ds->u ); +} + +void +copy_tw_niels ( + struct tw_niels_t* a, + const struct tw_niels_t* ds +) { + p448_copy ( &a->a, &ds->a ); + p448_copy ( &a->b, &ds->b ); + p448_copy ( &a->c, &ds->c ); +} + +void +copy_tw_pniels ( + struct tw_pniels_t* a, + const struct tw_pniels_t* ds +) { + copy_tw_niels( &a->n, &ds->n ); + p448_copy ( &a->z, &ds->z ); +} + + + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __CC_INCLUDED_EC_POINT_H__ */ diff --git a/src/include/intrinsics.h b/src/include/intrinsics.h new file mode 100644 index 0000000..02a8a1e --- /dev/null +++ 
b/src/include/intrinsics.h @@ -0,0 +1,244 @@ +/* Copyright (c) 2011 Stanford University. + * Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +/** @file intrinsics.h + * @brief cRandom intrinsics header. + */ + +#ifndef __CRANDOM_INTRINSICS_H__ +#define __CRANDOM_INTRINSICS_H__ 1 + +#include + +#include + +#define INTRINSIC \ + static __inline__ __attribute__((__gnu_inline__, __always_inline__, unused)) + +#define GEN 1 +#define SSE2 2 +#define SSSE3 4 +#define AESNI 8 +#define XOP 16 +#define AVX 32 +#define AVX2 64 +#define RDRAND 128 + +/** + * If on x86, read the timestamp counter. Otherwise, return 0. + */ +INTRINSIC u_int64_t rdtsc() { + u_int64_t out = 0; +# if (defined(__i386__) || defined(__x86_64__)) + __asm__ __volatile__ ("rdtsc" : "=A"(out)); +# endif + return out; +} + +/** + * Return x unchanged, but confuse the compiler. + * + * This is mainly for use in test scripts, to prevent the value from + * being constant-folded or removed by dead code elimination. + * + * @param x A 64-bit number. + * @return The same number in a register. 
+ */ +INTRINSIC u_int64_t opacify(u_int64_t x) { + __asm__ volatile("mov %0, %0" : "+r"(x)); + return x; +} + +#ifdef __AVX2__ +# define MIGHT_HAVE_AVX2 1 +# ifndef MUST_HAVE_AVX2 +# define MUST_HAVE_AVX2 0 +# endif +#else +# define MIGHT_HAVE_AVX2 0 +# define MUST_HAVE_AVX2 0 +#endif + +#ifdef __AVX__ +# define MIGHT_HAVE_AVX 1 +# ifndef MUST_HAVE_AVX +# define MUST_HAVE_AVX MUST_HAVE_AVX2 +# endif +#else +# define MIGHT_HAVE_AVX 0 +# define MUST_HAVE_AVX 0 +#endif + +#ifdef __SSSE3__ +# define MIGHT_HAVE_SSSE3 1 +# ifndef MUST_HAVE_SSSE3 +# define MUST_HAVE_SSSE3 MUST_HAVE_AVX +# endif +#else +# define MIGHT_HAVE_SSSE3 0 +# define MUST_HAVE_SSSE3 0 +#endif + +#ifdef __SSE2__ +# define MIGHT_HAVE_SSE2 1 +# ifndef MUST_HAVE_SSE2 +# define MUST_HAVE_SSE2 MUST_HAVE_SSSE3 +# endif + typedef __m128i ssereg; +# define pslldq _mm_slli_epi32 +# define pshufd _mm_shuffle_epi32 + +INTRINSIC ssereg sse2_rotate(int r, ssereg a) { + return _mm_slli_epi32(a, r) ^ _mm_srli_epi32(a, 32-r); +} + +#else +# define MIGHT_HAVE_SSE2 0 +# define MUST_HAVE_SSE2 0 +#endif + +#ifdef __AES__ +/* don't include intrinsics file, because not all platforms have it */ +# define MIGHT_HAVE_AESNI 1 +# ifndef MIGHT_HAVE_RDRAND +# define MIGHT_HAVE_RDRAND 1 +# endif +# ifndef MUST_HAVE_RDRAND +# define MUST_HAVE_RDRAND 0 +# endif +# ifndef MUST_HAVE_AESNI +# define MUST_HAVE_AESNI 0 +# endif + +#else +# define MIGHT_HAVE_AESNI 0 +# define MUST_HAVE_AESNI 0 +# define MIGHT_HAVE_RDRAND 0 +# define MUST_HAVE_RDRAND 0 +#endif + +#ifdef __XOP__ +/* don't include intrinsics file, because not all platforms have it */ +# define MIGHT_HAVE_XOP 1 +# ifndef MUST_HAVE_XOP +# define MUST_HAVE_XOP 0 +# endif +INTRINSIC ssereg xop_rotate(int amount, ssereg x) { + ssereg out; + __asm__ ("vprotd %1, %2, %0" : "=x"(out) : "x"(x), "g"(amount)); + return out; +} +#else +# define MIGHT_HAVE_XOP 0 +# define MUST_HAVE_XOP 0 +#endif + +#define MIGHT_MASK \ + ( SSE2 * MIGHT_HAVE_SSE2 \ + | SSSE3 * MIGHT_HAVE_SSSE3 \ + | 
AESNI * MIGHT_HAVE_AESNI \ + | XOP * MIGHT_HAVE_XOP \ + | AVX * MIGHT_HAVE_AVX \ + | RDRAND * MIGHT_HAVE_RDRAND \ + | AVX2 * MIGHT_HAVE_AVX2) + +#define MUST_MASK \ + ( SSE2 * MUST_HAVE_SSE2 \ + | SSSE3 * MUST_HAVE_SSSE3 \ + | AESNI * MUST_HAVE_AESNI \ + | XOP * MUST_HAVE_XOP \ + | AVX * MUST_HAVE_AVX \ + | RDRAND * MUST_HAVE_RDRAND \ + | AVX2 * MUST_HAVE_AVX2 ) + +#define MIGHT_HAVE(feature) ((MIGHT_MASK & feature) == feature) +#define MUST_HAVE(feature) ((MUST_MASK & feature) == feature) + +#ifdef __cplusplus +# define extern_c extern "C" +#else +# define extern_c +#endif + +extern_c +unsigned int crandom_detect_features(); + +#ifndef likely +# define likely(x) __builtin_expect((x),1) +# define unlikely(x) __builtin_expect((x),0) +#endif + +/** + * Atomic compare and swap, return by fetching. + * + * Equivalent to: + * ret = *target; if (*target == old) *target = new; return ret; + * + * @param [inout] target The volatile memory area to be CAS'd + * @param [in] old The expected old value of the target. + * @param [in] new A value to replace the target on success. + */ +INTRINSIC const char * +compare_and_swap ( + const char *volatile* target, + const char *old, + const char *new +); + +const char *compare_and_swap ( + const char *volatile* target, + const char *old, + const char *new +) { + return __sync_val_compare_and_swap(target,old,new); +} + +/** + * Atomic compare and swap. Return whether successful. + * + * Equivalent to: + * if (*target == old) { *target = new; return nonzero; } else { return 0; } + * + * @param [inout] target The volatile memory area to be CAS'd + * @param [in] old The expected old value of the target. + * @param [in] new A value to replace the target on success. 
+ */ +INTRINSIC int +bool_compare_and_swap ( + const char *volatile* target, + const char *old, + const char *new +); + +int +bool_compare_and_swap ( + const char *volatile* target, + const char *old, + const char *new +) { + return __sync_bool_compare_and_swap(target,old,new); +} + +/** + * Determine whether the current processor supports the given feature. + * + * This function is designed so that it should only have runtime overhead + * if the feature is not known at compile time -- that is, if + * MIGHT_HAVE(feature) is set, but MUST_HAVE(feature) is not. + */ +extern volatile unsigned int crandom_features; +INTRINSIC int HAVE(unsigned int feature); + +int HAVE(unsigned int feature) { + unsigned int features; + if (!MIGHT_HAVE(feature)) return 0; + if (MUST_HAVE(feature)) return 1; + features = crandom_features; + if (unlikely(!features)) + crandom_features = features = crandom_detect_features(); + return likely((features & feature) == feature); +} + +#endif /* __CRANDOM_INTRINSICS_H__ */ diff --git a/src/include/scalarmul.h b/src/include/scalarmul.h new file mode 100644 index 0000000..122fccc --- /dev/null +++ b/src/include/scalarmul.h @@ -0,0 +1,289 @@ +/** + * @file scalarmul.h + * @copyright + * Copyright (c) 2014 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. + * @author Mike Hamburg + */ + +#ifndef __P448_ALGO_H__ +#define __P448_ALGO_H__ 1 + +#include "ec_point.h" +#include "intrinsics.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * A precomputed table for fixed-base scalar multiplication. + * + * This uses a signed combs format. + */ +struct fixed_base_table_t { + /** Comb tables containing multiples of the base point. */ + struct tw_niels_t *table; + + /** Adjustments to the scalar in even and odd cases, respectively. */ + word_t scalar_adjustments[2*(448/WORD_BITS)]; + + /** The number of combs in the table. */ + unsigned int n; + + /** The number of teeth in each comb. 
*/ + unsigned int t; + + /** The spacing between the teeth. */ + unsigned int s; + + /** If nonzero, the table was malloc'd by precompute_for_combs. */ + unsigned int own_table; +}; + +/** + * Full Montgomery ladder in inverse square root format. + * + * Out = [2^n_extra_doubles * scalar] * in, where + * scalar is little-endian and has length $nbits$ bits. + * + * If the scalar is even and/or n_extra_doubles >= 1, + * then this function will reject points which are not + * on the curve by returning MASK_FAILURE. + * + * This function will also reject multiplies which output + * the identity or the point of order 2. It may be worth + * revisiting this decision in the FUTURE. The idea is that + * this can only happen when: the input is the identity or the + * point of order 2; or the input is the point of order 4 on + * the twist; or the scalar is 0 or a multiple of the curve + * order; or the scalar is a multiple of the twist order and + * the input point is on the twist. + * + * This function takes constant time with respect to $*in$ + * and $*scalar$, but not of course with respect to nbits or + * n_extra_doubles. + * + * For security, we recommend setting n_extra_doubles = 1. + * Because the cofactor of Goldilocks is 4 and input points + * are always even (when on the curve), this will cancel the + * cofactor. + * + * @param [out] out The output point. + * @param [in] in The base point. + * @param [in] scalar The scalar's little-endian representation. + * @param [in] nbits The number of bits in the scalar. Note that + * unlike in Curve25519, we do not require the top bit to be set. + * @param [in] n_extra_doubles The number of extra doubles to do at + * the end. + * + * @retval MASK_SUCCESS The operation was successful. + * @retval MASK_FAILURE The input point was invalid, or the output + * would be the identity or the point of order 2. 
+ */ +mask_t +montgomery_ladder ( + struct p448_t *out, + const struct p448_t *in, + const word_t *scalar, + unsigned int nbits, + unsigned int n_extra_doubles +) __attribute__((warn_unused_result)); + +/** + * Scalar multiply a twisted Edwards-form point. + * + * This function takes constant time. + * + * Currently the scalar is always exactly 448 bits long. + * + * @param [inout] working The point to multply. + * @param [in] scalar The scalar, in little-endian form. + */ +void +scalarmul ( + struct tw_extensible_t *working, + const word_t scalar[448/WORD_BITS] + /* TODO? int nbits */ +); + +/** + * Scalar multiply a twisted Edwards-form point. Use the same + * algorithm as scalarmul(), but uses variable array indices. + * + * Currently the scalar is always exactly 448 bits long. + * + * @warning This function uses variable array indices, + * so it is insecure against cache-timing attacks. It is intended + * for microbenchmarking, to see how much constant-time arithmetic + * costs us. + * + * @param [inout] working The point to multply. + * @param [in] scalar The scalar, in little-endian form. + */ +void +scalarmul_vlook ( + struct tw_extensible_t *working, + const word_t scalar[448/WORD_BITS] + /* TODO? int nbits */ +); + +/** + * Precompute a table to accelerate fixed-point scalar + * multiplication using the "multiple signed combs" approach. + * + * This function computes $n$ "comb" tables, each containing + * 2^(t-1) points in tw_niels_t format. You must have + * n * t * s >= 446 for complete coverage. + * + * The scalar multiplication algorithm may adjust the scalar by + * a multiple of q. Therefore, we strongly recommend to use base + * points in the q-torsion group (i.e. doubly even points). + * + * @param [out] out The table to compute. + * @param [in] base The base point. + * @param [in] n The number of combs in the table. + * @param [in] t The number of teeth in each comb. + * @param [in] s The spacing between the teeth. 
+ * @param [out] prealloc An optional preallocated array containing + * space for n<<(t-1) values of type tw_niels_t. + * + * @retval MASK_SUCCESS Success. + * @retval MASK_FAILURE Failure, most likely because we are out + * of memory. + */ +mask_t +precompute_fixed_base ( + struct fixed_base_table_t *out, + const struct tw_extensible_t *base, + unsigned int n, + unsigned int t, + unsigned int s, + struct tw_niels_t *prealloc +) __attribute__((warn_unused_result)); + + /** + * Destroy a fixed-base table. Frees any memory that we allocated + * for the combs. + * + * @param [in] table The table to destroy. + */ +void +destroy_fixed_base ( + struct fixed_base_table_t *table +); + +/** + * Scalar multiplication with precomputation. Set working to + * to [scalar] * Base, where Base is the base point passed to + * precompute_for_combs(). + * + * The scalar may be adjusted by a multiple of q, so this routine + * can be wrong by a cofactor if the base has cofactor components. + * + * @param [out] out The output point. + * @param [in] scalar The scalar. + * @param [in] nbits The number of bits in the scalar. Must be <= n*t*s. + * @param [in] table The precomputed table. + * + * @retval MASK_SUCCESS Success. + * @retval MASK_FAILURE Failure, because n*t*s < nbits + */ +mask_t +scalarmul_fixed_base ( + struct tw_extensible_t *out, + const word_t *scalar, + unsigned int nbits, + const struct fixed_base_table_t *table +); + +/** + * Variable-time scalar multiplication. + * + * @warning This function takes variable time. It is intended for + * microbenchmarking. + * + * @param [inout] working The input and output point. + * @param [in] scalar The scalar. + */ +void +scalarmul_vt ( + struct tw_extensible_t *working, + const word_t scalar[448/WORD_BITS] +); + + +/** + * Precompute a table to accelerate fixed-point scalar + * multiplication (and, more importantly, linear combos) + * using the "windowed non-adjacent form" approach. + * + * @param [out] out The output table. 
Must have room for 1< + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * SHA512 hashing context. + * + * This structure is opaque. + */ +struct sha512_ctx_t { + /** @privatesection */ + uint64_t chain[8]; + uint8_t block[128]; + uint64_t nbytes; +}; + +void +sha512_init ( + struct sha512_ctx_t *ctx +); + +void +sha512_update ( + struct sha512_ctx_t *ctx, + const unsigned char *data, + uint64_t bytes +); + +void +sha512_final ( + struct sha512_ctx_t *ctx, + uint8_t result[64] +); + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __GOLDI_SHA512_H__ */ diff --git a/src/include/word.h b/src/include/word.h new file mode 100644 index 0000000..0fc7427 --- /dev/null +++ b/src/include/word.h @@ -0,0 +1,128 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#ifndef __WORD_H__ +#define __WORD_H__ + +/* for posix_memalign */ +#define _XOPEN_SOURCE 600 + +#include +#include +#include +#include + +#if (__SIZEOF_INT128__ == 16 && __SIZEOF_SIZE_T__ == 8 && (__SIZEOF_LONG__==8 || __POINTER_WIDTH__==64) && !GOLDI_FORCE_32_BIT) +/* It's a 64-bit machine if: + * // limits.h thinks so + * __uint128_t exists + * size_t is 64 bits + * Either longs are 64-bits (doesn't happen on Windows) + * or pointers are 64-bits (doesn't happen on 32/64 arches) + * FUTURE: validate this hack on more architectures. 
+ */ +typedef uint32_t hword_t; +typedef uint64_t word_t; +typedef __uint128_t dword_t; +typedef int32_t hsword_t; +typedef int64_t sword_t; +typedef __int128_t dsword_t; +#define PRIxWORD PRIx64 +#define PRIxWORDfull "%016" PRIx64 +#define PRIxWORD58 "%014" PRIx64 +#define U64LE(x) x##ull +#define U58LE(x) x##ull +#else +typedef uint16_t hword_t; +typedef uint32_t word_t; +typedef uint64_t dword_t; +typedef int16_t hsword_t; +typedef int32_t sword_t; +typedef int64_t dsword_t; +#define PRIxWORD PRIx32 +#define PRIxWORDfull "%08" PRIx32 +#define PRIxWORD58 "%07" PRIx32 +#define U64LE(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32 +#define U58LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28 +#endif + +#define WORD_BITS (sizeof(word_t) * 8) + +/* TODO: vector width for procs like ARM; gcc support */ +typedef word_t mask_t, vecmask_t __attribute__((ext_vector_type(4))); + +static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -1; + +/* FIXME this only works on clang */ +typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2))); +typedef int64_t int64x2_t __attribute__((ext_vector_type(2))); +typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4))); +typedef int64_t int64x4_t __attribute__((ext_vector_type(4))); +typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4))); +typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); +typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8))); +typedef int32_t int32x8_t __attribute__((ext_vector_type(8))); + +#if __AVX2__ +typedef uint32x8_t big_register_t; +typedef uint64x4_t uint64xn_t; +typedef uint32x8_t uint32xn_t; +#elif __SSE2__ || __ARM_NEON__ +typedef uint32x4_t big_register_t; +typedef uint64x2_t uint64xn_t; +typedef uint32x4_t uint32xn_t; +#elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__ +typedef uint64_t big_register_t, uint64xn_t; +typedef uint32_t uint32xn_t; +#else +typedef uint64_t uint64xn_t; +typedef uint32_t uint32xn_t; +typedef uint32_t big_register_t; +#endif + + +#if __AVX2__ || 
__SSE2__ || __ARM_NEON__ +static __inline__ big_register_t +br_is_zero(big_register_t x) { + return (big_register_t)(x == (big_register_t)0); +} +#else +static __inline__ mask_t +br_is_zero(word_t x) { + return (((dword_t)x) - 1)>>WORD_BITS; +} +#endif + + + +/** + * Allocate memory which is sufficiently aligned to be used for the + * largest vector on the system (for now that's a big_register_t). + * + * Man malloc says that it does this, but at least for AVX2 on MacOS X, + * it's lying. + * + * @param size The size of the region to allocate. + * @return A suitable pointer, which can be free'd with free(), + * or NULL if no memory can be allocated. + */ +static __inline__ void * +malloc_vector ( + size_t size +) __attribute__((always_inline, unused)); + +void * +malloc_vector(size_t size) { + void *out = NULL; + + int ret = posix_memalign(&out, sizeof(big_register_t), size); + + if (ret) { + return NULL; + } else { + return out; + } +} + +#endif /* __WORD_H__ */ diff --git a/src/scalarmul.c b/src/scalarmul.c new file mode 100644 index 0000000..1ad856c --- /dev/null +++ b/src/scalarmul.c @@ -0,0 +1,844 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ +#include "word.h" + +#include +#include +#include + +#include "intrinsics.h" +#include "scalarmul.h" +#include "barrett_field.h" + +mask_t +montgomery_ladder ( + struct p448_t *out, + const struct p448_t *in, + const word_t *scalar, + unsigned int nbits, + unsigned int n_extra_doubles +) { + struct montgomery_t mont; + deserialize_montgomery(&mont, in); + + int i,j,n=(nbits-1)%WORD_BITS; + mask_t pflip = 0; + for (j=(nbits+WORD_BITS-1)/WORD_BITS-1; j>=0; j--) { + word_t w = scalar[j]; + for (i=n; i>=0; i--) { + mask_t flip = -((w>>i)&1); + p448_cond_swap(&mont.xa,&mont.xd,flip^pflip); + p448_cond_swap(&mont.za,&mont.zd,flip^pflip); + montgomery_step(&mont); + pflip = flip; + } + n = WORD_BITS-1; + } + p448_cond_swap(&mont.xa,&mont.xd,pflip); + p448_cond_swap(&mont.za,&mont.zd,pflip); + + assert(n_extra_doubles < INT_MAX); + for (j=0; j<(int)n_extra_doubles; j++) { + montgomery_step(&mont); + } + + return serialize_montgomery(out, &mont, in); +} + +static __inline__ void +cond_negate_tw_niels ( + struct tw_niels_t *n, + mask_t doNegate +) { + p448_cond_swap(&n->a, &n->b, doNegate); + p448_cond_neg(&n->c, doNegate); +} + +static __inline__ void +cond_negate_tw_pniels ( + struct tw_pniels_t *n, + mask_t doNegate +) { + cond_negate_tw_niels(&n->n, doNegate); +} + +void +constant_time_lookup_tw_pniels ( + struct tw_pniels_t *out, + const struct tw_pniels_t *in, + int nin, + int idx +) { + big_register_t big_one = 1, big_i = idx; + big_register_t *o = (big_register_t *)out; + const big_register_t *i = (const big_register_t *)in; + int j; + unsigned int k; + + memset(out, 0, sizeof(*out)); + for (j=0; j>= 1; + if (i> (i%WORD_BITS) & 0xF, + inv = (bits>>3)-1; + bits ^= inv; + + constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7); + cond_negate_tw_pniels(&pn, inv); + convert_tw_pniels_to_tw_extensible(working, &pn); + + + for (i-=4; i>=0; i-=4) { + double_tw_extensible(working); + double_tw_extensible(working); + double_tw_extensible(working); + 
double_tw_extensible(working); + + bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF; + inv = (bits>>3)-1; + bits ^= inv; + + constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7); + cond_negate_tw_pniels(&pn, inv); + add_tw_pniels_to_tw_extensible(working, &pn); + } +} + +void +scalarmul_vlook ( + struct tw_extensible_t *working, + const word_t scalar[448/WORD_BITS] +) { + + const int nbits=448; /* HACK? */ + word_t prepared_data[448*2/WORD_BITS] = { + U64LE(0x9595b847fdf73126), + U64LE(0x9bb9b8a856af5200), + U64LE(0xb3136e22f37d5c4f), + U64LE(0x0000000189a19442), + U64LE(0x0000000000000000), + U64LE(0x0000000000000000), + U64LE(0x4000000000000000), + + U64LE(0x721cf5b5529eec33), + U64LE(0x7a4cf635c8e9c2ab), + U64LE(0xeec492d944a725bf), + U64LE(0x000000020cd77058), + U64LE(0x0000000000000000), + U64LE(0x0000000000000000), + U64LE(0x0000000000000000) + }; /* TODO: split off */ + + word_t scalar2[448/WORD_BITS]; + convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS); + + struct tw_extensible_t tabulator; + copy_tw_extensible(&tabulator, working); + double_tw_extensible(&tabulator); + + struct tw_pniels_t pn, multiples[8]; + convert_tw_extensible_to_tw_pniels(&pn, &tabulator); + convert_tw_extensible_to_tw_pniels(&multiples[0], working); + + int i; + for (i=1; i<8; i++) { + add_tw_pniels_to_tw_extensible(working, &pn); + convert_tw_extensible_to_tw_pniels(&multiples[i], working); + } + + i = nbits - 4; + int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF, + inv = (bits>>3)-1; + bits ^= inv; + + copy_tw_pniels(&pn, &multiples[bits&7]); + cond_negate_tw_pniels(&pn, inv); + convert_tw_pniels_to_tw_extensible(working, &pn); + + + for (i-=4; i>=0; i-=4) { + double_tw_extensible(working); + double_tw_extensible(working); + double_tw_extensible(working); + double_tw_extensible(working); + + bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF; + inv = (bits>>3)-1; + bits ^= inv; + + copy_tw_pniels(&pn, &multiples[bits&7]); + 
cond_negate_tw_pniels(&pn, inv); + add_tw_pniels_to_tw_extensible(working, &pn); + } +} + + +mask_t +scalarmul_fixed_base ( + struct tw_extensible_t *out, + const word_t scalar[448/WORD_BITS], + unsigned int nbits, + const struct fixed_base_table_t *table +) { + unsigned int n = table->n, t = table->t, s = table->s; + assert(n >= 1 && t >= 1 && s >= 1); + + if (n*t*s < nbits) { + return MASK_FAILURE; + } + + unsigned int scalar_words = (nbits + WORD_BITS - 1)/WORD_BITS, + scalar2_words = scalar_words; + if (scalar2_words < 448 / WORD_BITS) + scalar2_words = 448 / WORD_BITS; + word_t scalar2[scalar2_words], scalar3[scalar2_words]; + + /* Copy scalar to scalar3, but clear its high bits (if there are any) */ + unsigned int i,j,k; + for (i=0; iscalar_adjustments , 448 / WORD_BITS + ); + + struct tw_niels_t ni; + + for (i=0; i> (bit%WORD_BITS) & 1) << k; + } + } + + mask_t invert = (tab>>(t-1))-1; + tab ^= invert; + tab &= (1<<(t-1)) - 1; + + constant_time_lookup_tw_niels(&ni, table->table + (j<<(t-1)), 1<<(t-1), tab); + cond_negate_tw_niels(&ni, invert); + if (i||j) { + add_tw_niels_to_tw_extensible(out, &ni); + } else { + convert_tw_niels_to_tw_extensible(out, &ni); + } + } + } + + return MASK_SUCCESS; +} + +mask_t +precompute_fixed_base ( + struct fixed_base_table_t *out, + const struct tw_extensible_t *base, + unsigned int n, + unsigned int t, + unsigned int s, + struct tw_niels_t *prealloc +) { + if (s < 1 || t < 1 || n < 1 || n*t*s < 446) { + memset(out, 0, sizeof(*out)); + return 0; + } + + out->n = n; + out->t = t; + out->s = s; + + struct tw_extensible_t working, start; + copy_tw_extensible(&working, base); + struct tw_pniels_t pn_tmp; + + struct tw_pniels_t *doubles = (struct tw_pniels_t *) malloc_vector(sizeof(*doubles) * (t-1)); + struct p448_t *zs = (struct p448_t *) malloc_vector(sizeof(*zs) * (n<<(t-1))); + struct p448_t *zis = (struct p448_t *) malloc_vector(sizeof(*zis) * (n<<(t-1))); + + struct tw_niels_t *table = prealloc; + if (prealloc) { + 
out->own_table = 0; + } else { + table = (struct tw_niels_t *) malloc_vector(sizeof(*table) * (n<<(t-1))); + out->own_table = 1; + } + out->table = table; + + if (!doubles || !zs || !zis || !table) { + free(doubles); + free(zs); + free(zis); + memset(out, 0, sizeof(*out)); + memset(table, 0, sizeof(*table) * (n<<(t-1))); + if (!prealloc) free(table); + return 0; + } + + unsigned int i,j,k; + + /* Compute the scalar adjustments, equal to 2^nbits-1 mod q */ + unsigned int adjustment_size = (n*t*s)/WORD_BITS + 1; + assert(adjustment_size >= 448/WORD_BITS); + word_t adjustment[adjustment_size]; + for (i=0; iscalar_adjustments[(448/WORD_BITS)*(adjustment[0] & 1)], + *high_adjustment = &out->scalar_adjustments[(448/WORD_BITS)*((~adjustment[0]) & 1)]; + for (i=0; i<448/WORD_BITS; i++) { + low_adjustment[i] = adjustment[i]; + } + + /* The high adjustment is low + q = low - q_lo + 2^big */ + (void) + sub_nr_ext_packed( + high_adjustment, + adjustment, 448/WORD_BITS, + goldi_q448.p_lo, goldi_q448.nwords_lo, + -1 + ); + if (goldi_q448.p_shift) { + high_adjustment[goldi_q448.nwords_p - 1] += ((word_t)1)<>1); + int idx = ((i+1)<<(t-1))-1 ^ gray; + + convert_tw_extensible_to_tw_pniels(&pn_tmp, &start); + copy_tw_niels(&table[idx], &pn_tmp.n); + p448_copy(&zs[idx], &pn_tmp.z); + + if (j >= (1<<(t-1)) - 1) break; + int delta = (j+1) ^ ((j+1)>>1) ^ gray; + + for (k=0; delta>1; k++) + delta >>=1; + + if (gray & (1<table) { + memset(table->table,0,sizeof(*table->table)*(table->n<<(table->t-1))); + } + if (table->own_table) { + free(table->table); + } + memset(table,0,sizeof(*table)); +} + +mask_t +precompute_fixed_base_wnaf ( + struct tw_niels_t *out, + const struct tw_extensible_t *const_base, + unsigned int tbits +) { + int i; + struct p448_t *zs = (struct p448_t *) malloc_vector(sizeof(*zs)< 0) { + double_tw_extensible(&base); + convert_tw_extensible_to_tw_pniels(&twop, &base); + add_tw_pniels_to_tw_extensible(&base, &tmp); + + convert_tw_extensible_to_tw_pniels(&tmp, &base); + 
p448_copy(&zs[1], &tmp.z); + copy_tw_niels(&out[1], &tmp.n); + + for (i=2; i < 1<= 0; i--) { + int bit = (scalar[i/WORD_BITS] >> (i%WORD_BITS)) & 1; + current = 2*current + bit; + + /* + * Sizing: |current| >= 2^(tableBits+1) -> |current| = 2^0 + * So current loses (tableBits+1) bits every time. It otherwise gains + * 1 bit per iteration. The number of iterations is + * (nbits + 2 + tableBits), and an additional control word is added at + * the end. So the total number of control words is at most + * ceil((nbits+1) / (tableBits+1)) + 2 = floor((nbits)/(tableBits+1)) + 2. + * There's also the stopper with power -1, for a total of +3. + */ + if (current >= (2<> 1; // |delta| < 2^tablebits + current = -(current & 1); + + for (j=i; (delta & 1) == 0; j++) { + delta >>= 1; + } + control[position].power = j+1; + control[position].addend = delta; + position++; + assert(position <= nbits/(tableBits+1) + 2); + } + } + + if (current) { + for (j=0; (current & 1) == 0; j++) { + current >>= 1; + } + control[position].power = j; + control[position].addend = current; + position++; + assert(position <= nbits/(tableBits+1) + 2); + } + + + control[position].power = -1; + control[position].addend = 0; + return position; +} + + +static void +prepare_wnaf_table( + struct tw_pniels_t *output, + struct tw_extensible_t *working, + unsigned int tbits +) { + convert_tw_extensible_to_tw_pniels(&output[0], working); + + if (tbits == 0) return; + + double_tw_extensible(working); + struct tw_pniels_t twop; + convert_tw_extensible_to_tw_pniels(&twop, working); + + add_tw_pniels_to_tw_extensible(working, &output[0]); + convert_tw_extensible_to_tw_pniels(&output[1], working); + + for (int i=2; i < 1< 0) { + assert(control[0].addend > 0); + assert(control[0].power >= 0); + convert_tw_pniels_to_tw_extensible(working, &precmp[control[0].addend >> 1]); + } else { + set_identity_tw_extensible(working); + return; + } + + int conti = 1, i; + for (i = control[0].power - 1; i >= 0; i--) { + 
double_tw_extensible(working); + + if (i == control[conti].power) { + assert(control[conti].addend); + + if (control[conti].addend > 0) { + add_tw_pniels_to_tw_extensible(working, &precmp[control[conti].addend >> 1]); + } else { + sub_tw_pniels_from_tw_extensible(working, &precmp[(-control[conti].addend) >> 1]); + } + conti++; + assert(conti <= control_bits); + } + } +} + +void +scalarmul_fixed_base_wnaf_vt ( + struct tw_extensible_t *working, + const word_t scalar[448/WORD_BITS], + unsigned int nbits, + const struct tw_niels_t *precmp, + unsigned int table_bits +) { + struct smvt_control control[nbits/(table_bits+1)+3]; + + int control_bits = recode_wnaf(control, scalar, nbits, table_bits); + + if (control_bits > 0) { + assert(control[0].addend > 0); + assert(control[0].power >= 0); + convert_tw_niels_to_tw_extensible(working, &precmp[control[0].addend >> 1]); + } else { + set_identity_tw_extensible(working); + return; + } + + int conti = 1, i; + for (; control[conti].power >= 0; conti++) { + assert(conti <= control_bits); + for (i = control[conti-1].power - control[conti].power; i; i--) { + double_tw_extensible(working); + } + + assert(control[conti].addend); + if (control[conti].addend > 0) { + add_tw_niels_to_tw_extensible(working, &precmp[control[conti].addend >> 1]); + } else { + sub_tw_niels_from_tw_extensible(working, &precmp[(-control[conti].addend) >> 1]); + } + } + + for (i = control[conti-1].power; i; i--) { + double_tw_extensible(working); + } +} + +void +linear_combo_var_fixed_vt( + struct tw_extensible_t *working, + const word_t scalar_var[448/WORD_BITS], + unsigned int nbits_var, + const word_t scalar_pre[448/WORD_BITS], + unsigned int nbits_pre, + const struct tw_niels_t *precmp, + unsigned int table_bits_pre +) { + const int table_bits_var = 3; + struct smvt_control control_var[nbits_var/(table_bits_var+1)+3]; + struct smvt_control control_pre[nbits_pre/(table_bits_pre+1)+3]; + + int ncb_var = recode_wnaf(control_var, scalar_var, nbits_var, 
table_bits_var); + int ncb_pre = recode_wnaf(control_pre, scalar_pre, nbits_pre, table_bits_pre); + (void)ncb_var; + (void)ncb_pre; + + struct tw_pniels_t precmp_var[1< control_pre[0].power) { + convert_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[0].addend >> 1]); + contv++; + } else if (i == control_pre[0].power && i >=0 ) { + convert_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[0].addend >> 1]); + add_tw_niels_to_tw_extensible(working, &precmp[control_pre[0].addend >> 1]); + contv++; contp++; + } else { + i = control_pre[0].power; + convert_tw_niels_to_tw_extensible(working, &precmp[control_pre[0].addend >> 1]); + contp++; + } + + if (i < 0) { + set_identity_tw_extensible(working); + return; + } + + for (i--; i >= 0; i--) { + double_tw_extensible(working); + + if (i == control_var[contv].power) { + assert(control_var[contv].addend); + + if (control_var[contv].addend > 0) { + add_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[contv].addend >> 1]); + } else { + sub_tw_pniels_from_tw_extensible(working, &precmp_var[(-control_var[contv].addend) >> 1]); + } + contv++; + } + + if (i == control_pre[contp].power) { + assert(control_pre[contp].addend); + + if (control_pre[contp].addend > 0) { + add_tw_niels_to_tw_extensible(working, &precmp[control_pre[contp].addend >> 1]); + } else { + sub_tw_niels_from_tw_extensible(working, &precmp[(-control_pre[contp].addend) >> 1]); + } + contp++; + } + } + + assert(contv == ncb_var); + assert(contp == ncb_pre); +} + + + diff --git a/src/sha512.c b/src/sha512.c new file mode 100644 index 0000000..dd1468b --- /dev/null +++ b/src/sha512.c @@ -0,0 +1,187 @@ +/* Copyright (c) 2011 Stanford University. + * Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ +#ifndef __APPLE__ +#define _BSD_SOURCE +#include +#endif + +#include "sha512.h" + +#include +#include + +static inline uint64_t +rotate_r ( + uint64_t x, + int d +) { + return (x >> d) | (x << (64-d)); +} + +#ifdef __APPLE__ +static inline uint64_t +htobe64 (uint64_t x) { + __asm__ ("bswapq %0" : "+r"(x)); + return x; +} +#endif + +static const uint64_t +sha512_init_state[8] = { + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 +}; + +static const uint64_t +sha512_k[80] = { + 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, + 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, + 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, + 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694, + 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, + 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, + 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4, + 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70, + 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df, + 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b, + 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30, + 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8, + 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8, + 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, + 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec, + 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b, + 0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, + 
0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b, + 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c, + 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 +}; + +static inline uint64_t S0 (uint64_t h1) { + return rotate_r(h1, 28) ^ rotate_r(h1, 34) ^ rotate_r(h1, 39); +} + +static inline uint64_t S1 (uint64_t h4) { + return rotate_r(h4,14) ^ rotate_r(h4,18) ^ rotate_r(h4,41); +} + +static inline uint64_t s0 (uint64_t a) { + return rotate_r(a,1) ^ rotate_r(a,8) ^ a>>7; +} + +static inline uint64_t s1 (uint64_t b) { + return rotate_r(b,19) ^ rotate_r(b,61) ^ b>>6; +} + +static inline uint64_t ch (uint64_t h4, uint64_t h5, uint64_t h6) { + return h6^(h4 & (h6^h5)); +} + +static inline uint64_t maj(uint64_t h1, uint64_t h2, uint64_t h3) { + return (h1&h2) ^ (h3&(h1^h2)); +} + +static void +sha512_process_block ( + struct sha512_ctx_t *ctx +) { + uint64_t i, tmp, a, b, + *w = (uint64_t *) ctx->block, + *state = ctx->chain, + h0 = state[0], h1 = state[1], h2 = state[2], h3 = state[3], + h4 = state[4], h5 = state[5], h6 = state[6], h7 = state[7]; + + /* Clang doesn't unswitch this automatically */ + for (i=0; i<16; i++) { + /* load up the input word for this round */ + tmp = w[i] = htobe64(w[i]); + tmp = tmp + h7 + S1(h4) + ch(h4,h5,h6) + sha512_k[i]; + + /* shift register */ + h7 = h6; h6 = h5; h5 = h4; + h4 = h3 + tmp; + h3 = h2; h2 = h1; h1 = h0; + h0 = tmp + maj(h1,h2,h3) + S0(h1); + } + + for (; i<80; i++) { + /* load up the input word for this round */ + a = w[(i+1 ) & 15]; + b = w[(i+14) & 15]; + tmp = w[i&15] = s0(a) + s1(b) + w[i&15] + w[(i+9) & 15]; + tmp = tmp + h7 + S1(h4) + ch(h4,h5,h6) + sha512_k[i]; + + /* shift register */ + h7 = h6; h6 = h5; h5 = h4; + h4 = h3 + tmp; + h3 = h2; h2 = h1; h1 = h0; + h0 = tmp + maj(h1,h2,h3) + S0(h1); + } + + state[0] += h0; + state[1] += h1; + state[2] += h2; + state[3] += h3; + state[4] += h4; + state[5] += h5; + state[6] += h6; + 
state[7] += h7; +} + +void +sha512_init ( + struct sha512_ctx_t *ctx +) { + ctx->nbytes = 0; + memcpy(ctx->chain, sha512_init_state, sizeof(sha512_init_state)); + memset(ctx->block, 0, sizeof(ctx->block)); +} + +void +sha512_update ( + struct sha512_ctx_t *ctx, + const unsigned char *data, + uint64_t bytes +) { + assert(ctx->nbytes < 1ull<<56); + assert(bytes < 1ull<<56); + + while (bytes) { + uint64_t fill = ctx->nbytes % 128, accept = 128 - fill; + if (accept > bytes) accept = bytes; + ctx->nbytes += accept; + memcpy(ctx->block + fill, data, accept); + + if (fill+accept == 128) + sha512_process_block(ctx); + + bytes -= accept; + data += accept; + } + + assert(ctx->nbytes < 1ull<<56); +} + +void +sha512_final ( + struct sha512_ctx_t *ctx, + uint8_t result[64] +) { + uint64_t fill = ctx->nbytes % 128, i; + ctx->block[fill++] = 0x80; + if (fill > 112) { + memset(ctx->block + fill, 0, 128-fill); + sha512_process_block(ctx); + fill = 0; + } + memset(ctx->block + fill, 0, 112-fill); + *((uint64_t *)&ctx->block[112]) = 0; + *((uint64_t *)&ctx->block[120]) = htobe64((ctx->nbytes * 8)); + sha512_process_block(ctx); + for (i=0; i<8; i++) { + ctx->chain[i] = htobe64(ctx->chain[i]); + } + memcpy(result, ctx->chain, sizeof(ctx->chain)); + sha512_init(ctx); +} diff --git a/test/bench.c b/test/bench.c new file mode 100644 index 0000000..b54488f --- /dev/null +++ b/test/bench.c @@ -0,0 +1,684 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ + +#include "word.h" + +#include +#include +#include +#include + +#include "p448.h" +#include "ec_point.h" +#include "scalarmul.h" +#include "barrett_field.h" +#include "crandom.h" +#include "goldilocks.h" +#include "sha512.h" + +double now() { + struct timeval tv; + gettimeofday(&tv, NULL); + + return tv.tv_sec + tv.tv_usec/1000000.0; +} + +void p448_randomize( struct crandom_state_t *crand, struct p448_t *a ) { + crandom_generate(crand, (unsigned char *)a, sizeof(*a)); + p448_strong_reduce(a); +} + +void q448_randomize( struct crandom_state_t *crand, word_t sk[448/WORD_BITS] ) { + crandom_generate(crand, (unsigned char *)sk, 448/8); +} + +void p448_print( const char *descr, const struct p448_t *a ) { + p448_t b; + p448_copy(&b, a); + p448_strong_reduce(&b); + int j; + printf("%s = 0x", descr); + for (j=sizeof(*a)/sizeof(a->limb[0])-1; j>=0; j--) { + printf(PRIxWORD58, b.limb[j]); + } + printf("\n"); +} + +void p448_print_full( const char *descr, const struct p448_t *a ) { + int j; + printf("%s = 0x", descr); + for (j=15; j>=0; j--) { + printf("%02" PRIxWORD "_" PRIxWORD58 " ", + a->limb[j]>>28, a->limb[j]&(1<<28)-1); + } + printf("\n"); +} + +void q448_print( const char *descr, const word_t secret[448/WORD_BITS] ) { + int j; + printf("%s = 0x", descr); + for (j=448/WORD_BITS-1; j>=0; j--) { + printf(PRIxWORDfull, secret[j]); + } + printf("\n"); +} + +#ifndef N_TESTS_BASE +#define N_TESTS_BASE 10000 +#endif + +int main(int argc, char **argv) { + (void)argc; + (void)argv; + + struct tw_extensible_t ext; + struct extensible_t exta; + struct tw_niels_t niels; + struct tw_pniels_t pniels; + struct affine_t affine; + struct montgomery_t mb; + struct p448_t a,b,c,d; + + + double when; + int i; + + int nbase = N_TESTS_BASE; + + /* Bad randomness so we can debug. 
*/ + char initial_seed[32]; + for (i=0; i<32; i++) initial_seed[i] = i; + struct crandom_state_t crand; + crandom_init_from_buffer(&crand, initial_seed); + + word_t sk[448/WORD_BITS],tk[448/WORD_BITS]; + q448_randomize(&crand, sk); + + when = now(); + for (i=0; ia isog: %5.1fns\n", when * 1e9 / i); + + when = now(); + for (i=0; ii isog: %5.1fns\n", when * 1e9 / i); + + when = now(); + for (i=0; i +#include + + +int failed_tests, n_tests, failed_this_test, running_a_test; + +void end_test() { + if (!failed_this_test) { + printf("[PASS]\n"); + } + n_tests ++; + running_a_test = 0; +} + +void begin_test(const char *name) { + if (running_a_test) end_test(); + printf("%s...%*s",name,(int)(30-strlen(name)),""); + fflush(stdout); + failed_this_test = 0; + running_a_test = 1; +} + +void youfail() { + if (failed_this_test) return; + failed_this_test = 1; + failed_tests ++; + printf("[FAIL]\n"); +} + +static int +hexchar (char c) { + if (c >= '0' && c <= '9') { + return c - '0'; + } else if (c >= 'a' && c <= 'f') { + return 10 + c - 'a'; + } else if (c >= 'A' && c <= 'F') { + return 10 + c - 'A'; + } else { + return -1; + } +} + +int +hexdecode ( + unsigned char *bytes, + const char *hex, + unsigned int nbytes +) { + if (strlen(hex) != nbytes*2) { + return -1; + } + + unsigned int i; + for (i=0; i=0; j--) { + printf(PRIxWORD58, b.limb[j]); + } + printf("\n"); +} + +void scalar_print ( + const char *descr, + const word_t *scalar, + int nwords +) { + int j; + printf("%s = 0x", descr); + for (j=nwords-1; j>=0; j--) { + printf(PRIxWORDfull, scalar[j]); + } + printf("\n"); +} + +int main(int argc, char **argv) { + (void) argc; + (void) argv; + + n_tests = running_a_test = failed_tests = 0; + begin_test("SHA-512 NIST Monte Carlo"); + test_sha512_monte_carlo(); + + begin_test("EC point operations"); + test_pointops(); + + begin_test("Scalarmul compatibility"); + test_scalarmul_compatibility(); + + begin_test("Scalarmul commutativity"); + test_scalarmul_commutativity(); + + if 
(running_a_test) end_test(); + printf("\n"); + if (failed_tests) { + printf("Failed %d / %d tests.\n", failed_tests, n_tests); + } else { + printf("Passed all %d tests.\n", n_tests); + } + + return failed_tests ? 1 : 0; +} diff --git a/test/test.h b/test/test.h new file mode 100644 index 0000000..5bbdc48 --- /dev/null +++ b/test/test.h @@ -0,0 +1,42 @@ +#ifndef __GOLDILOCKS_TEST_H__ +#define __GOLDILOCKS_TEST_H__ 1 + +#include "word.h" +#include "p448.h" + +int +hexdecode ( + unsigned char *bytes, + const char *hex, + unsigned int nbytes +); + +void +hexprint ( + const char *descr, + const unsigned char *bytes, + unsigned int nbytes +); + +void p448_print ( + const char *descr, + const struct p448_t *a +); + +void scalar_print ( + const char *descr, + const word_t *scalar, + int nwords +); + +void youfail(); + +int test_sha512_monte_carlo(); + +int test_scalarmul_compatibility (); + +int test_scalarmul_commutativity (); + +int test_pointops (); + +#endif // __GOLDILOCKS_TEST_H__ diff --git a/test/test_pointops.c b/test/test_pointops.c new file mode 100644 index 0000000..6dfdab7 --- /dev/null +++ b/test/test_pointops.c @@ -0,0 +1,287 @@ +#include "test.h" + +#include + +#include "ec_point.h" +#include "p448.h" +#include "crandom.h" + + +static void +failprint_ext ( + const struct extensible_t *a +) { + struct p448_t zi, scaled; + p448_print(" x", &a->x); + p448_print(" y", &a->y); + p448_print(" z", &a->z); + p448_inverse(&zi, &a->z); + p448_mul(&scaled, &zi, &a->x); + p448_print(" X", &scaled); + p448_mul(&scaled, &zi, &a->y); + p448_print(" Y", &scaled); + printf("\n"); +} + +static void +failprint_tw_ext ( + const struct tw_extensible_t *a +) { + failprint_ext((const struct extensible_t *)a); +} + +static mask_t +fail_if_different ( + const struct extensible_t *a, + const struct extensible_t *b, + const char *faildescr, + const char *adescr, + const char *bdescr +) { + mask_t succ = eq_extensible(a, b); + + if (!succ) { + youfail(); + printf(" %s\n", faildescr); 
+ + printf("\n %s:\n", adescr); + failprint_ext(a); + + printf("\n %s:\n", bdescr); + failprint_ext(b); + } + + return succ; +} + +static mask_t +validate_ext( + const struct extensible_t *ext, + int evenness, + const char *description +) { + mask_t succ = validate_extensible(ext), succ2; + const char *error = "Point isn't on the curve."; + if (evenness > 0) { + succ2 = is_even_pt(ext); + if (succ &~ succ2) error = "Point isn't even."; + succ &= succ2; + } else if (evenness < 0) { + succ2 = is_even_pt(ext); + if (succ &~ succ2) error = "Point is even but shouldn't be."; + succ &= succ2; + } /* FUTURE: quadness */ + + if (~succ) { + youfail(); + printf(" %s\n", error); + printf(" %s\n", description); + failprint_ext(ext); + } + + return succ; +} + +static mask_t +validate_tw_ext( + const struct tw_extensible_t *ext, + int evenness, + const char *description +) { + mask_t succ = validate_tw_extensible(ext), succ2; + const char *error = "Point isn't on the twisted curve."; + if (evenness > 0) { + succ2 = is_even_tw(ext); + if (succ &~ succ2) error = "Point isn't even."; + succ &= succ2; + } else if (evenness < 0) { + succ2 = is_even_tw(ext); + if (succ &~ succ2) error = "Point is even but shouldn't be."; + succ &= succ2; + } /* FUTURE: quadness */ + + if (~succ) { + youfail(); + printf(" %s\n", error); + printf(" %s\n", description); + failprint_tw_ext(ext); + } + + return succ; +} + +static mask_t +fail_if_different_tw ( + const struct tw_extensible_t *a, + const struct tw_extensible_t *b, + const char *faildescr, + const char *adescr, + const char *bdescr +) { + return fail_if_different( + (const struct extensible_t *)a, (const struct extensible_t *)b, + faildescr,adescr,bdescr + ); +} + +static int +add_double_test ( + const struct affine_t *base1, + const struct affine_t *base2 +) { + mask_t succ = MASK_SUCCESS; + struct extensible_t exb; + struct tw_extensible_t text1, text2, texta, textb; + struct tw_pniels_t pn; + + /* Convert to ext */ + 
convert_affine_to_extensible(&exb, base1); + succ &= validate_ext(&exb,0,"base1"); + twist_and_double(&text1, &exb); + succ &= validate_tw_ext(&text1,2,"iso1"); + convert_affine_to_extensible(&exb, base2); + succ &= validate_ext(&exb,0,"base2"); + twist_and_double(&text2, &exb); + succ &= validate_tw_ext(&text2,2,"iso2"); + + /* a + b == b + a? */ + convert_tw_extensible_to_tw_pniels(&pn, &text1); + copy_tw_extensible(&texta, &text2); + add_tw_pniels_to_tw_extensible(&texta, &pn); + + convert_tw_extensible_to_tw_pniels(&pn, &text2); + copy_tw_extensible(&textb, &text1); + add_tw_pniels_to_tw_extensible(&textb, &pn); + + succ &= fail_if_different_tw(&texta,&textb,"Addition commutativity","a+b","b+a"); + + copy_tw_extensible(&textb, &text2); + add_tw_pniels_to_tw_extensible(&textb, &pn); + copy_tw_extensible(&texta, &text2); + double_tw_extensible(&texta); + + succ &= fail_if_different_tw(&texta,&textb,"Doubling test","2b","b+b"); + + if (~succ) { + printf(" Bases were:\n"); + p448_print(" x1", &base1->x); + p448_print(" y1", &base1->y); + p448_print(" x2", &base2->x); + p448_print(" y2", &base2->y); + } + + return succ ? 0 : -1; +} + +static int +single_twisting_test ( + const struct affine_t *base +) { + struct extensible_t exb, ext, tmpext; + struct tw_extensible_t text, text2; + mask_t succ = MASK_SUCCESS; + + convert_affine_to_extensible(&exb, base); + succ &= validate_ext(&exb,0,"base"); + + /* check: dual . iso = 4 */ + twist_and_double(&text, &exb); + succ &= validate_tw_ext(&text,2,"iso"); + untwist_and_double(&ext, &text); + succ &= validate_ext(&ext,2,"dual.iso"); + + copy_extensible(&tmpext,&exb); + double_extensible(&tmpext); + succ &= validate_ext(&tmpext,1,"2*base"); + + double_extensible(&tmpext); + succ &= validate_ext(&tmpext,2,"4*base"); + + succ &= fail_if_different(&ext,&tmpext,"Isogeny and dual","Dual . 
iso","4*base"); + + /* check: twist and serialize */ + test_only_twist(&text, &exb); + succ &= validate_tw_ext(&text,0,"tot"); + mask_t evt = is_even_tw(&text), evb = is_even_pt(&exb); + if (evt != evb) { + youfail(); + printf(" Different evenness from twist base: %d, twist: %d\n", (int)-evt, (int)-evb); + + succ = 0; + } /* FUTURE: quadness */ + + p448_t sera,serb; + untwist_and_double_and_serialize(&sera,&text); + copy_extensible(&tmpext,&exb); + double_extensible(&tmpext); + serialize_extensible(&serb,&tmpext); + + /* check that their (doubled; FUTURE?) serializations are equal */ + if (~p448_eq(&sera,&serb)) { + youfail(); + printf(" Different serialization from twist + double ()\n"); + p448_print(" t", &sera); + p448_print(" b", &serb); + succ = 0; + } + + untwist_and_double(&ext, &text); + succ &= validate_ext(&tmpext,1,"dual.tot"); + + twist_and_double(&text2, &ext); + succ &= validate_tw_ext(&text2,2,"iso.dual.tot"); + + double_tw_extensible(&text); + succ &= validate_tw_ext(&text,1,"2*tot"); + + double_tw_extensible(&text); + succ &= validate_tw_ext(&text,2,"4*tot"); + + succ &= fail_if_different_tw(&text,&text2,"Dual and isogeny","4*tot","iso.dual.tot"); + + if (~succ) { + printf(" Base was:\n"); + p448_print(" x", &base->x); + p448_print(" y", &base->y); + } + + + return succ ? 0 : -1; +} + +int test_pointops () { + struct affine_t base, pbase; + struct p448_t ser448; + + struct crandom_state_t crand; + crandom_init_from_buffer(&crand, "test_pointops random initializer"); + + int i, ret; + for (i=0; i<1000; i++) { + uint8_t ser[56]; + crandom_generate(&crand, ser, sizeof(ser)); + + /* TODO: we need a p448 generate, which can return random or pathological. 
*/ + mask_t succ = p448_deserialize(&ser448, ser); + if (!succ) { + youfail(); + printf(" Unlikely: fail at p448_deserialize\n"); + return -1; + } + + if (i) { + copy_affine(&pbase, &base); + } + elligator_2s_inject(&base, &ser448); + + if (i) { + ret = add_double_test(&base, &pbase); + if (ret) return ret; + } + + ret = single_twisting_test(&base); + if (ret) return ret; + } + + return 0; +} diff --git a/test/test_scalarmul.c b/test/test_scalarmul.c new file mode 100644 index 0000000..d98cfd8 --- /dev/null +++ b/test/test_scalarmul.c @@ -0,0 +1,289 @@ +#include "test.h" + +#include + +#include "scalarmul.h" +#include "ec_point.h" +#include "p448.h" +#include "crandom.h" + +/* 0 = succeed, 1 = inval, -1 = fail */ +static int +single_scalarmul_compatibility_test ( + const struct p448_t *base, + const word_t *scalar, + int nbits +) { + struct tw_extensible_t text, work; + struct p448_t mont, ct, vl, vt; + + int ret = 0, i; + mask_t succ, succm; + + const struct p448_t + sqrt_d_minus_1 = {{ + U58LE(0xd2e21836749f46), + U58LE(0x888db42b4f0179), + U58LE(0x5a189aabdeea38), + U58LE(0x51e65ca6f14c06), + U58LE(0xa49f7b424d9770), + U58LE(0xdcac4628c5f656), + U58LE(0x49443b8748734a), + U58LE(0x12fec0c0b25b7a) + }}; + + succ = deserialize_and_twist_approx(&text, &sqrt_d_minus_1, base); + + succm = montgomery_ladder(&mont,base,scalar,nbits,1); + + if (succ != succm) { + youfail(); + printf(" Deserialize_and_twist_approx succ=%d, montgomery_ladder succ=%d\n", + (int)-succ, (int)-succm); + printf(" nbits = %d\n", nbits); + p448_print(" base", base); + scalar_print(" scal", scalar, (nbits+WORD_BITS-1)/WORD_BITS); + return -1; + } + + if (!succ) { + return 1; + } + + struct { int n,t,s; } params[] = {{5,5,18},{3,5,30},{4,4,28},{1,2,224}}; + const int nparams = sizeof(params)/sizeof(params[0]); + struct fixed_base_table_t fbt; + struct p448_t fbout[nparams], wout[6]; + memset(&fbt, 0, sizeof(fbt)); + memset(&fbout, 0, sizeof(fbout)); + memset(&wout, 0, sizeof(wout)); + + /* compute 
using combs */ + for (i=0; i +#include + +#include "sha512.h" + + + +static int sha512_monte_carlo_core ( + const char *seed, + const char *checks[100] +) { + struct sha512_ctx_t sha; + sha512_init(&sha); + + unsigned char md0[64],md1[64],md2[64]; + + int ret = hexdecode(md0,seed,64); + if (ret) { + youfail(); + printf(" SHA-512 NIST Monte Carlo validation seed hex decode failure.\n"); + return -1; + } + + int i,j; + + memcpy(md1,md0,sizeof(md1)); + memcpy(md2,md0,sizeof(md1)); + + for (j=0; j<100; j++) { + + for (i=3; i<1003; i++) { + sha512_update(&sha,md0,sizeof(md0)); + sha512_update(&sha,md1,sizeof(md1)); + sha512_update(&sha,md2,sizeof(md2)); + memcpy(md0,md1,sizeof(md1)); + memcpy(md1,md2,sizeof(md1)); + sha512_final(&sha,md2); + } + + ret = hexdecode(md0,checks[j],64); + if (ret) { + youfail(); + printf(" SHA-512 NIST Monte Carlo validation hex decode failure at iteration %d\n", j); + return -1; + } else if (memcmp(md0,md2,sizeof(md2))) { + youfail(); + printf(" SHA-512 NIST Monte Carlo validation failure at iteration %d\n", j); + hexprint(" Expected", md0, 64); + hexprint(" But got ", md2, 64); + return j+1; + } + + memcpy(md0,md2,sizeof(md1)); + memcpy(md1,md2,sizeof(md1)); + } + + return 0; +} + +int test_sha512_monte_carlo() { + const char *seed = + "5c337de5caf35d18ed90b5cddfce001ca1b8ee8602f367e7c24ccca6f893802f" + "b1aca7a3dae32dcd60800a59959bc540d63237876b799229ae71a2526fbc52cd"; + const char *checks[100] = { + "ada69add0071b794463c8806a177326735fa624b68ab7bcab2388b9276c036e4" + "eaaff87333e83c81c0bca0359d4aeebcbcfd314c0630e0c2af68c1fb19cc470e", + "ef219b37c24ae507a2b2b26d1add51b31fb5327eb8c3b19b882fe38049433dbe" + "ccd63b3d5b99ba2398920bcefb8aca98cd28a1ee5d2aaf139ce58a15d71b06b4", + "c3d5087a62db0e5c6f5755c417f69037308cbce0e54519ea5be8171496cc6d18" + "023ba15768153cfd74c7e7dc103227e9eed4b0f82233362b2a7b1a2cbcda9daf", + "bb3a58f71148116e377505461d65d6c89906481fedfbcfe481b7aa8ceb977d25" + 
"2b3fe21bfff6e7fbf7575ceecf5936bd635e1cf52698c36ef6908ddbd5b6ae05", + "b68f0cd2d63566b3934a50666dec6d62ca1db98e49d7733084c1f86d91a8a08c" + "756fa7ece815e20930dd7cb66351bad8c087c2f94e8757cb98e7f4b86b21a8a8", + "937d7856a82a84c163c79417d0540c47daaf9ffe662c843737dbbcbe5f865bf6" + "f47a9d2bd10129a4f498073094653c324a2519a1c71ac1279b1623ff7d24647a", + "f8fbc058c2b9f84131c9decfa543a35ade41581f670398efd61b3abfced9c1cf" + "cb5324f2370487f9c59a65bc668ea596c8d22ce8a33014dfad28357fa7d05f04", + "4ab0c9484ff5c30fa64ae6e81510c5fea566eafb88f175f8bc19109f40fe8001" + "4c8b77fff10b8750778429bf3c5497e4cb92d9b30014f4cb975dff2a45244c28", + "685179397554d276513d630234a03419808c698abf2600d7490aabb8e455c6ab" + "6ea412c7729dc140a79dff66533c6946cbe90f9da9ed16e2e629db1651bea870", + "335e6e941ab7dadfecdb74ea6cb4e8584b6e3408841a33a6cf7fd6a63294b193" + "0a60983240311672acac3840a90e64cc366ce75081b2252627e9c31197ebad03", + "e3217f6af6e279e9445dc3738cbf9ba0e9edba0455844a73648139777afdea2c" + "4d8032e214f541bf92675fb23f24df8e4fe98e0003aadfb6d8f9cc2cd799bbf7", + "ee2fdfb3ae630613b7d890977cf2515deac272a37f27e4a01961ecf103d4ff5b" + "45cc8aef53b635dd75aa51aabf71c0642555ccd3281e0388f8ca09d83258cf30", + "6a30d97cc98af6a25b673dce7aeab8d762bf2e55ea0c6dc899179281f84dd02a" + "2896f77e9c106b472f55f7adbef7b1157be567ee1236ebdac2a3c5d8cb133eb5", + "ac1176abdc5f71170183d92ae55856221b0d95590af11d9d72ba605ec026bbec" + "52d6974bc43a1efb125ff2b161fbdc616fda00f04193a0bc26aacdfa052a5741", + "59fa909480620ecc08d34531a6da1b55158b74fc93ddf68e1d242615b6f3843a" + "7952e63e798c6445cde1b07e0be09d0d711cb7b42a0e7760a593b08acfceb63d", + "9eb253319efa61b864f27bd334d7dd78b38d3265fb544e0c8edee950a547e1d8" + "db921a285774ab94d66beae933298d20f2a5aa87c62fe1e383cc3b18e7af18ac", + "81735324005671f7bdad9e685ee8257f5e0622b9fcb5d38dbdfb2df27258c3e1" + "d46d76e24c0c92c744e1b50a2b4b0d31525b3af83cc80a75722d921bdeef59c4", + "17498cdff4323bb8021e44eca6559e05d8ff9a0ef2ee9d4ba0ac6e73f83972a0" + 
"dfbb6d47728fa70311d7c82e154966e1b7678263b0f65133e9116969193d429b", + "228c4574d7c45eb9ba9240722133fce74abe00c7328ab30b4bde373dc79afdd6" + "e0569d36268cd5eaa2f27205fc00512577bcbb6699e1d66ed85eafaba7548afb", + "3d40ccd9cc445bbecca9227c67fe455d89e0b7c1c858d32f30e2b544ca9a5a60" + "6535aea2e59fec6ec4d1ba898cc4338c6eadef9c0884bcf56aca2f481a2d7d3e", + "e1e577aeac92e3a2b7f8a262bf2ac9c037d2274ca6618fbe4cc21db7c699e994" + "6b6671ae45ea433a1e392a5bc9eec96fd641ba8f4a047f022a04a337227004df", + "5e4424c0bcb2f0f7a2428821a9d5840a82401f4440ae6bed25c53cd9e71cf9d3" + "9904d6a375bd721f4332ab0202529c91feb9c094c3e6d34ca4f66649ee6fa212", + "56b199d63ca37189d5ca0d40006ac7bcb9f39cbdc00ef7b8a5697caa7d81d05b" + "645a146995b1151d01958f1589337e14afc6e7dd10a815170e527a398e6ce8c3", + "d2d498ff93fb03013a64f295b5bc68e57d2fb5600da578aa011d43ff432eae3e" + "0c800f9e2a53155e56fdbf5e068fe2b4beb3e42b2585531b8b16c4d8ca3356c6", + "3d3875489903710f17cf4247b5842ace6f017b1a3b99e9ee5fbc04fc7898e78b" + "12693879878028ca40c63cd0f6925fb7d0ca0412e4f06619e3ace223690f03b8", + "a013e21cd1234483c95c2ea2757be949bc79401ba39b09c316a1612d594642be" + "65ca106e12695ac3808c57c6f2980e895fd1fe188946562afc238414e1e43649", + "c5f6367d7195489e16242f912fbe0d8002e947de3a7e9c53f77b1e5e90e05bd7" + "ca395e787e34cb5f500c02da59c9d83de35601de7ae80dae74a0d6b4a292d43b", + "7c28c44c6aaba83c122f24d68273e28a5afd65b4071d02b7ea3300478d511897" + "1e1356ae57cbc70d2a177ea464a1c2c50d4297b933e789c63b1481797ae8f08c", + "af7cb42b1c70a85ac1ae1c2991b25b657c19f4fcf83af7f7dc0ae1028c1452a6" + "a17dc98929634fe6ed3855b70b96bc2caa93d82037b94ebeddc77e4c1a7cc563", + "bd56ad4c0cbd162706053da929d667253aadcf417affb483fff4f2699bf406d1" + "28cfdf5196dfbb05bb89ccbf04c5147bd2ebb3156b0bc1768ca6faa171c91c01", + "004d7b0fff9bcddf4b3913ae190a76728705a3d23874d92a8b7ff246c8fcad46" + "623cb04723c8aded0cba4968d1a8cc1375b99005786c1bcb7ae4bf13325c3ae0", + "8299a5bf5ed64f525c4eebbeca969fc1b91a81adb58c584bdd2d7676386a31fa" + 
"546643a3cf505007584f02fb712d708cab645bf078a1b9339f5a76aee985d017", + "ce7100f3455db1a9776a9f40d562ea998afca1f9fee7e0d81c8db34cf68ad23a" + "8bfa6fc04774703e1e56d5196b66966158fcf2a8335a58c6ba7ba1af756ba1dc", + "90aaabcb655ee921b8350229efe6064a60051cf0cac858fa3d43afd5b97cc823" + "01bd1b8cc1f874022e5af948185638783a13ca1bbd5049ace7fbf4f6d90c201f", + "3cf0a25b33ded3e0806dfe603b9987f1d6f2b3fdcb1ec7f8566828c00e17e8f5" + "9e38b3bca302396c7525ca194e6cc8501369059e2e34ae21e3141215876847c4", + "bdc5266aee339a1ff13fcf5229773cd3d14b47101e83076927c160bb71bf7445" + "590525a2012d52af008e118e16df1b6bfcaf8f22b4e45f9e749f3c20625a2bc8", + "ef8d2ba885381ab97756d59dbbbf53a1ea35d152b2d8f82c3518430aa34e7083" + "59194ea43950d032e151f576d343a5c3cfe6b71d4ed0ead9d3a107402589bad0", + "194ea5324c4179998dd7057755f255fdea04dadf533f7851e3e9718b610948e3" + "2fd28323077d9421142ac808978adfa325b668c8599a2e01c757a5a14ed2dd37", + "106984d2f0087e621dae760552bc6279072267883c204079481af6034354f1a2" + "b77c17e6c039a1063e479342aa3ccd90330dd3fb5a7d5e976619497e2d3326cd", + "a1347216f1a6db47b90c4ded3c5c75440f54c22c87d538314d1340f86f88acba" + "01378acb933ddad0adc6b75d55bfb7e8efc9c4a531b2a410610b7515b6dac66a", + "b76e4db147e0eaa4f04880654088b9d0fce518c8c377d92c846345604dc6b2b1" + "8d377fdb8e30f06d9bcfe6d7dacc07d6adff73d98d49f8f132b80f3084390830", + "acd4e527763dfd4513f0def0b1edf8ea12dc78d336b7b796f3dcc32e10687254" + "43a2f55ab4f666b27d6bf2ab39669c98293f0a9108051fd3144d31a1ed171ddd", + "10128c15494bc87a87374f676ef9fe2df20b36ffcca41a80bd40b216637b3de7" + "10efd070e277827820a7bba3cceb7b21f8fe7f9775d6c4df4d3da5349434ec49", + "2632dd5c188c6ed3a4610405fdda704add752f5424d9de65a51400fe478e26cd" + "0412e5f91ca4b744c34f4954f40a3a4254431d21954623208b527b7b4daa687e", + "45707f5b6fc5ccd1f78d77f177d10fb8b462c74cc821518cd5cfa4b5d6b40b41" + "8044900693c37abbb82367d340fec67f800d74072935da1706b4d90ae26099c7", + "56c37f31220b5b3040373d91b2c5e42fe9e601a12f7f8dc4534459bf28e484b8" + 
"713db243c5782c031e674003a3c14c42fd152e7188789065e82795e10f87d54b", + "5da94c899d48bd8299fee3d81662f8d6c5f8f8bc54d18cb0368b13cebaee7ad7" + "1e74ea80f34974ad166f04f9a0602809166fe4085a475a8ca86cade12b6754c4", + "0664363f97ba910760b0922e31ca880ca97469506cb007e3108c36c3ce3ce180" + "1fb4197609479339e8820632b6a38bffffee05a9adc11cc544b9aa6f5b95cc6f", + "732c41a1edaa727c04f627ff158aaff67c18efd667216132b99ab84d108996a1" + "0bb008b5d803b22ed1aa78bb0d10f8a762fd34777d7dccce8e84827ba88d4193", + "fc9c21d67e393a2b05a23a17d8db630cbaebaa3def211181749f1bcad1815606" + "27fb60ee20fae2e5980cbf50fce0a19dce807e7fb75c4da0ef008bc75d413a65", + "0453b765afc1edffa595efe345177f5805ed3abc1297ceab757ae7161723a614" + "4cb543299f418049276d16b7896662631634fab9549127c10f27505b7dee8665", + "3853f3bf024e0668e8d1ea53733a97537f97d9307c5f3a19864ab4eeb1654710" + "693bb961a344dec8a758f5e64b26fcb6dd423419c4a114fa749211a9de06c281", + "240137f0dd57beb3f7fc283bb3ead423c67883fd46f4e27471d7be57ad469a49" + "bad03a3658418bd55614678f3a463bceff85291314b90ef43ccbcb028f0a7a07", + "f9050a5271edbe4cfdb9520ec05bbdc3cbcb9bce36fd212338d3e7028a39b9ab" + "30793e561d75a2e424193264c7f0775e65599ef0c94e0ad24dbfe18252364267", + "47caa7a5862fad837aaa409a4a9df2575e645528c35159115911b7c4e2f08ae4" + "9d68de97249b31b83ce2c163f649cad4559dc6e6a7191f2922d79a5fd6af167b", + "13f5825c41fa49edf6104e3e35c9c224eba93e37374f730004c39c54e7391e4a" + "847fd61865235a3fe32224c96fbe86f7e14c3d5df496e83ec989a71b4f293a44", + "e5b55e05efe1ca6b9a96a57e3a1523d610d70f837e93b31fa98c2736d3e114d2" + "38d46ec6b6e3d19e774b253f6b0c7a2ebe69b7e60fc0874444806b2a2278df45", + "f14a586ac30f0af255f597a9aef9abba5e99c04d17b01f24427c4ee2c196b52a" + "cb1ceefc9b15cb822b3ecffdc2f7c49e11d3fc0769acee33361537d379c62e0c", + "7e2d3398807195c48e6ec52d20710bbf8b21ea8de4d1abc197897ccc58aeff40" + "259edc67270cdae0edcc686c0d0dccc5760c1495ab1cf48482dc2000ae2d42ad", + "2f3d5c5f990bf615d5e8b396ccbd0337da39fad09b059f955a431db76a9dc720" + 
"dffc4e02c0be397c7e0463799cd75fd6ab7c52bec66c8df5ef0d47e14a4c5927", + "483a1764d308cc494a2b543d29ba616483aefdf91c7769fd084eedaac1add189" + "1df95d317a47430b2bf73e4081f86597020e28afe2d34a22b77ea62b6112d09a", + "bfa88691ec951511651c6f14af100eeb26d87729e18ac3ef49a80d73ffeaeea5" + "3e97c4a7277a7ee9f2fba070b1c9720d6cdba407dd82267019e3f0f5662b2f2b", + "4c17c8e2e7132dbf82afebc40efc77926d16f4d2c082d846dac28733aa767e28" + "40ebf04f2563df75933466a36e11968d342e4157827605d04d9627ce9b5216c8", + "70bbfc29a2a765220af84e7bb10d759a3152ad4b5643ef6b89966950ec7ef950" + "3d57bc0a28c4ee789a60bf9dcac59139e15241d73b990410cf92eff213da9eca", + "8d1d56f37fc19b84984a6fa33aa9c2dbdbf79a29c04ad0b4cf20333e6bec9434" + "47be2416242f8cd2f9732e79bb925cc5a61a80c5fc9c079961243fd1c1f5900e", + "492fd0171f4dcd5d20ea6c0d34b5576c8894664ae5955e6737f5e3b711c2804d" + "99ccca065b7ec18c82da98b18a3029b765c51ebc7c433b36492e0ed6b8511bb6", + "7f49e8e54db7e5b4323cae2db71f3e8b8eba172dcad3602e9b7b058007a55893" + "58732d5afffa56072a46e89b1ea27ef8d556deb86b569c635d394f15d99d8a15", + "56884a6a9210d5f371e25823efb2511a9c410c26a441e07c1bdffe8605084267" + "d49c315baf6a692d7d97844b2714b4930877a5d7f52cf6fa151700fcb6980546", + "6aaef8284eef221ecb17ea3c9596f075b5155fe7b925d737ed3c6543c761c28c" + "7cd9d9d4b5e2a37b2f183a2a367bbd34b633497bc7a1737d61c8c1f3ef295062", + "38ef178f5688e59d47c375252db7b39f40c0c84169878ee7ba5086e4b25fea81" + "076b9c37847e9e6bf24ae0b343689c265ec5ca7469e619acd61b0276721efb1b", + "e3fe1aabad120777cf24eaae289b486632ca46ceb89afae73dbae5fa87c76787" + "9369355a9cc5c21ca604ed91d0f2f58c466573f3e6d88e52c62c0d3cb188e141", + "82f5bd920457bb2763a0da031a7fed47b236951b1ea420c20fd2b6de1dbfbb9c" + "4600ea7092788493e2d4be6ee24b6dba04e57af3e8f2f14d9837295420ac7631", + "6d0b26208ba9b1615067bb3ff97b292fe67e4c02d240d649c32370e0a4cd22d0" + "3bdf864be4d24a3f5f51aeccfd1afd5191e590edeb5f7bec323b0506c3104b89", + "d081083158054d08371ec84f4d3aa5aa761734ac6091a30330a861fda056f835" + 
"c750bf4f7981af1693ff28545366bd05cec47bccd77a7d237befb0135c534138", + "6ba8b52780b8a07a2a2015dd8f0c5e7437b8e024c4ee428f7ba91dfea118cb72" + "a939872550983317132b841b7cbc29a22b8f1cfea0c55203cafc69b55ed6244a", + "312692b0a51f002b7f06d05b39d15a5637dbddd2f4f1a73e6c88a4c841cdba5c" + "d8e69c0939ab39bb1a9c54fa35402143c97edb9704a0e9e1a98701710f6a5dad", + "aaee960de201a8dcccff95b834fccf0dafc03fe6cffc0429162bf4aff01165ab" + "07a0c9435e9cb412121b7ba010657ccc3152118602b665072136317d92fd4262", + "21fdff552e08c86c07f080cefacaaaf31846eb893bfe2e4f88c3c3cd8cbf592a" + "84500942695a5e5ae971ab343ce2695dd1baeb1f94dd4b53d678e14265e421ae", + "ca8f1a5b2172f6adb474da53b35e3f73ffd88263d3eecde72e48b16e1a065801" + "5b555ee319005a1d82802e91431ee777610f9b1028d819921e1044ad426b0270", + "ce5ab25eff9c1ddc569a1eaaa66b689109ee269db7066e0b02d39b3564fd14ca" + "6249987b7791e203d3d7c2ebf18558d2f23f94c03dd1d03aa63849e4d2889a76", + "a6f8b0561000dd4ae8b828c5f676e8c1a6474c4a042a645f1815bd52e9ff53c9" + "7dc36d5d8997f8ce332185feead76267f5b2e63f597fb3345ca0046e58fc0f24", + "fec86794bad4106c5ad1c1a2d9a1b7aae480396ec231eb5cac21c4077d17a0b6" + "52da0037363399a5a1dababa4a40e4c54b9124167580dee9108c4dbb24c57512", + "594f5dd3f4c87bdc0d81309386e9163a9718e34c7b0dcb4613f8487aa786f9d2" + "11cfb61bb247fa9f5ecef042e710f192850f5571807294bfd8a54397850e5773", + "d81ad866f25ef6a0a6431d267114da564513e5ebdcf48db7e95db8cf32a89f0a" + "b107874d796035db97420ffcf1db5f04dc1a52ddbbb960fc63b7f3f835cc8be6", + "431d537e098e9949f6a68108d55d20952e3bfcdeb7273bac3917e37790a84fa5" + "db04c33a79c113a06cf333e831d7702a00853a93fd0aa5146d934f4f71242a6a", + "4ed95636c6885ae4e63d042e82f4da830c702dbf3b9746d64770a64dd666b332" + "08315f3a947c4dff790771ef283788a9c74da83e22b97f750286a820ee46698c", + "a9bcb60b4d7724cdddddbc232b4ac70b94d0d7e9f0724b1222d918930cbb9bdb" + "b04b3ad43e3c8caf3bf8b004ee4aec6bd527ff8eb6189b44827f7ba7057f6a90", + "d6d5e44d5bb07fc4144ab6ab309f048968f73f7992beb326047e9e2cd7af6240" + 
"bc8abf46703c32fdb58fb2a8672594a660ef855be74f24cec09d4fb00219de82", + "dfda9ac0c7147530da97715ccf47814182255f2f2cf40287db97a4c63b43fcd3" + "9e6d41e560921492badb253a7dea0aba863c7c33b912bb59d1ff4de03a4f03bb", + "0395faaaf2e907f27779d6f1cc9c9db68ec390a38fbb0702c6475b46f7a39949" + "8d46fd8014f834b131e1e83abba0359b1f16d8fc0a393580615def2ad0caba73", + "41cb98f09029abe85d24a0f131f116c7f69f54f7e91c250642606512bf3da4ca" + "89ba70a4714a5f66d9ae81ff09317dadaff12a02057074c970f0f02a52bfafd2", + "8e8f161d48e306c5533ed614b8ef3a1979df6db7e13d0780a73c4a3980ddf0a9" + "5f93941d412c93683e39915a660c3fbec0dbb1bb6beea2e2099cd968011535c0", + "789593f0b8fb83ef9b3ec50ab8f6e1e47344f763d4f7ceab5600989e7b6fd5fe" + "f6ee5e487975f64474af6cd71ae4d9ecce8f009edea0227c7ebe73080b8f961b", + "f37e1449e0b313d9537a6177f7a31158d353e5b79c781facf02526ec94e0c6cf" + "da37105bac67098b194ea82efb307c2929a9ab8aca0e76c53e829e3f901cd245", + "2e74e745caaf2d449ab3b031dd214b48616853a512cf2e95c40cb8e7594fe5e4" + "879ac8a26d02eb35b3b96a5c9e7dcae3e15fd050a0bcc1fb3b9cb9c4df0fad3e", + "6eac7069c26082e52574ca6a58abb9b1b9faf452e8cca9f1c7023679ce192ca5" + "54892f30e38104d39088a24df35612444a0fc90084af7535fd9344fa51dded84", + "ada6caf30c4f6e3644d952366e01519af6771b406e2c447552f0c597b8dd10e9" + "e9b4e699c9a835de03f422be8980538d9786172dfd2fe511db272a1543d5aa35", + "4d4b0086b2cb05d713f2805caa7e6605c8f7dbbb2e0f92aa159aebdcd6306030" + "5f47b748f1bca6e0b6e11cf8f9697fcccb6584b878c4b54a699290728a40aa1b", + "97420b8a0ad102aeb92139da2c052d2748dd7d2dbb93a9ea79dc15b520d0ca7c" + "ab8cb7a00f5b5aebcb49d7e7f52a27180935ce617aeecdecba04064c668edd37", + "4aa7dad74eb51d09a6ae7735c4b795b078f51c314f14f42a0d63071e13bdc5fd" + "9f51612e77b36d44567502a3b5eb66c609ec017e51d8df93e58d1a44f3c1e375" + }; + + return sha512_monte_carlo_core(seed, checks); +}