diff --git a/HISTORY.txt b/HISTORY.txt
index 3e5f946..702513e 100644
--- a/HISTORY.txt
+++ b/HISTORY.txt
@@ -1,3 +1,46 @@
+May 3, 2014:
+    Minor changes to internal routines mean that this version is not
+    compatible with the previous one.
+
+    Added ARM NEON code.
+
+    Added the ability to precompute multiples of a partner's public key. This
+    takes slightly longer than a signature verification, but reduces future
+    verifications with the precomputed key by ~63% and ECDH by ~70%.
+
+        goldilocks_precompute_public_key
+        goldilocks_destroy_precomputed_public_key
+        goldilocks_verify_precomputed
+        goldilocks_shared_secret_precomputed
+
+    The precomputation feature is protected by a macro
+        GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
+    which can be #defined to 0 to compile these functions out. Unlike most
+    of Goldilocks' functions, goldilocks_precompute_public_key uses malloc()
+    (and goldilocks_destroy_precomputed_public_key uses free()).
+
+    Changed private keys to be derived from just the symmetric part. This
+    means that you can compress them to 32 bytes for cold storage, or derive
+    keypairs from crypto secrets from other systems.
+        goldilocks_derive_private_key
+        goldilocks_underive_private_key
+        goldilocks_private_to_public
+
+    Fixed a number of bugs related to vector alignment on Sandy Bridge, which
+    has AVX but uses SSE2 alignment (because it doesn't have AVX2). Maybe I
+    should just switch it to use AVX2 alignment?
+
+    Beginning to factor out curve-specific magic, so as to build other curves
+    with the Goldilocks framework. That would enable fair tests against eg
+    E-521, Ed25519 etc. Still would be a lot of work.
+
+    More thorough testing of arithmetic. Now uses GMP for the testing framework,
+    but not in the actual library.
+
+    Added some high-level tests for the whole library, including some (bs)
+    negative testing. Obviously, effective negative testing is a very difficult
+    proposition in a crypto library.
+
 March 29, 2014:
     Added a test directory with various tests. Currently testing SHA512
     Monte Carlo, compatibility of the different scalarmul functions, and some
diff --git a/Makefile b/Makefile
index 3e03193..49a1e3f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,23 +1,57 @@
 # Copyright (c) 2014 Cryptography Research, Inc.
 # Released under the MIT License. See LICENSE.txt for license information.
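Illustrative usage sketch (not patch content) of the derivation and precomputation entry points listed in the history entry above, written against the declarations this change adds to include/goldilocks.h; error handling is collapsed to GOLDI_EOK checks.

#include <stdint.h>
#include "goldilocks.h"

/* Derive a keypair from a 32-byte cold-storage secret, then verify a
 * signature against a precomputed copy of the resulting public key. */
static int verify_with_precomputed_key (
    const unsigned char proto[GOLDI_SYMKEY_BYTES],
    const uint8_t signature[GOLDI_SIGNATURE_BYTES],
    const uint8_t *message,
    uint64_t message_len
) {
    struct goldilocks_private_key_t priv;
    struct goldilocks_public_key_t pub;
    struct goldilocks_precomputed_public_key_t *pre;
    int err, ret;

    err = goldilocks_init();
    if (err != GOLDI_EOK && err != GOLDI_EALREADYINIT) return err;
    if ((err = goldilocks_derive_private_key(&priv, proto)) != GOLDI_EOK) return err;
    if ((err = goldilocks_private_to_public(&pub, &priv)) != GOLDI_EOK) return err;

    pre = goldilocks_precompute_public_key(&pub);   /* uses malloc(); NULL on failure */
    if (!pre) return -1;   /* no dedicated error code is defined for this case yet */

    ret = goldilocks_verify_precomputed(signature, message, message_len, pre);
    goldilocks_destroy_precomputed_public_key(pre); /* zeroizes, then free()s */
    return ret;   /* GOLDI_EOK when the signature checks out */
}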
+ +UNAME := $(shell uname) +MACHINE := $(shell uname -m) + +ifeq ($(UNAME),Darwin) CC = clang -LD = clang +else +CC = gcc +endif +LD = $(CC) + +ifneq (,$(findstring x86_64,$(MACHINE))) +ARCH ?= arch_x86_64 +else +# no i386 port yet +ARCH ?= arch_arm_32 +endif -ARCH = arch_x86_64 WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ - -Wgcc-compat -Wmissing-declarations + -Wmissing-declarations -Wunused-function $(EXWARN) + + INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH) LANGFLAGS = -std=c99 -GENFLAGS = -ffunction-sections -fdata-sections -fomit-frame-pointer -fPIC +GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC OFLAGS = -O3 -#XFLAGS = -DN_TESTS_BASE=1000 -ARCHFLAGS = -mssse3 -maes -mavx2 -DMUST_HAVE_AVX2 -mbmi2 -#ARCHFLAGS = -m32 -mcpu=cortex-a9 -mfpu=vfpv3-d16 -CFLAGS = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XFLAGS) -LDFLAGS = $(ARCHFLAGS) +ifneq (,$(findstring arm,$(MACHINE))) +ifneq (,$(findstring neon,$(ARCH))) +ARCHFLAGS += -mfpu=neon +else +ARCHFLAGS += -mfpu=vfpv3-d16 +endif +ARCHFLAGS += -mcpu=cortex-a9 # FIXME +GENFLAGS = -DN_TESTS_BASE=1000 # sooooo sloooooow +else +ARCHFLAGS += -mssse3 -maes -mavx -mavx2 -DMUST_HAVE_AVX2 -mbmi2 #TODO +endif + +ifeq ($(CC),clang) +WARNFLAGS += -Wgcc-compat +endif + +ifeq (,$(findstring 64,$(ARCH))$(findstring gcc,$(CC))) +# ARCHFLAGS += -m32 +ARCHFLAGS += -DGOLDI_FORCE_32_BIT=1 +endif + +CFLAGS = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XCFLAGS) +LDFLAGS = $(ARCHFLAGS) $(XLDFLAGS) ASFLAGS = $(ARCHFLAGS) .PHONY: clean all test bench todo doc lib @@ -29,7 +63,7 @@ LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \ build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \ - build/test_pointops.o + build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o BENCHCOMPONENTS=build/bench.o @@ -45,15 +79,20 @@ build/bench: $(LIBCOMPONENTS) $(BENCHCOMPONENTS) $(LD) $(LDFLAGS) -o $@ $^ build/test: $(LIBCOMPONENTS) $(TESTCOMPONENTS) - $(LD) $(LDFLAGS) -o $@ $^ + $(LD) $(LDFLAGS) -o $@ $^ -lgmp lib: build/goldilocks.so build/goldilocks.so: $(LIBCOMPONENTS) rm -f $@ +ifeq ($(UNAME),Darwin) libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \ - -exported_symbols_list src/exported.sym \ $(LIBCOMPONENTS) +else + $(LD) -shared -Wl,-soname,goldilocks.so.1 -Wl,--gc-sections -o $@ $(LIBCOMPONENTS) + strip --discard-all $@ + ln -sf $@ build/goldilocks.so.1 +endif build/timestamp: mkdir -p build @@ -80,9 +119,9 @@ doc: Doxyfile doc/timestamp src/*.c src/include/*.h src/$(ARCH)/*.c src/$(ARCH)/ todo:: @(find * -name '*.h'; find * -name '*.c') | xargs egrep --color=auto -w \ - 'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE' + 'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE|MAGIC' @echo '=============================' - @(for i in FIXME BUG XXX TODO HACK PERF FUTURE REMOVE; do \ + @(for i in FIXME BUG XXX TODO HACK PERF FUTURE REMOVE MAGIC; do \ (find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i > /dev/null || continue; \ /bin/echo -n $$i' ' | head -c 10; \ (find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i| wc -l; \ @@ -90,7 +129,7 @@ todo:: @echo '=============================' @echo -n 'Total ' @(find * -name '*.h'; find * -name '*.c') | xargs egrep -w \ - 'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE' | wc -l + 'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE|MAGIC' | wc -l bench: build/bench ./$< diff 
--git a/include/goldilocks.h b/include/goldilocks.h
index 7476a6c..f012adb 100644
--- a/include/goldilocks.h
+++ b/include/goldilocks.h
@@ -12,13 +12,42 @@
 #include <stdint.h>
 
+#ifndef GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
+/** If nonzero, implement precomputation for verify and ECDH. */
+#define GOLDI_IMPLEMENT_PRECOMPUTED_KEYS 1
+#endif
+
+/** The size of the Goldilocks field, in bits. */
+#define GOLDI_FIELD_BITS 448
+
+/** The size of the Goldilocks scalars, in bits. */
+#define GOLDI_SCALAR_BITS 446
+
+/** The size of the Goldilocks field, in bytes. */
+#define GOLDI_FIELD_BYTES (GOLDI_FIELD_BITS/8)
+
+/** The size of a Goldilocks public key, in bytes. */
+#define GOLDI_PUBLIC_KEY_BYTES GOLDI_FIELD_BYTES
+
+/** The extra bytes in a Goldilocks private key for the symmetric key. */
+#define GOLDI_SYMKEY_BYTES 32
+
+/** The size of a shared secret. */
+#define GOLDI_SHARED_SECRET_BYTES 64
+
+/** The size of a Goldilocks private key, in bytes. */
+#define GOLDI_PRIVATE_KEY_BYTES (2*GOLDI_FIELD_BYTES + GOLDI_SYMKEY_BYTES)
+
+/** The size of a Goldilocks signature, in bytes. */
+#define GOLDI_SIGNATURE_BYTES (2*GOLDI_FIELD_BYTES)
+
 /**
  * @brief Serialized form of a Goldilocks public key.
  *
  * @warning This isn't even my final form!
  */
 struct goldilocks_public_key_t {
-    uint8_t opaque[56]; /**< Serialized data. */
+    uint8_t opaque[GOLDI_PUBLIC_KEY_BYTES]; /**< Serialized data. */
 };
 
 /**
@@ -30,7 +59,7 @@ struct goldilocks_public_key_t {
  * @warning This isn't even my final form!
  */
 struct goldilocks_private_key_t {
-    uint8_t opaque[144]; /**< Serialized data. */
+    uint8_t opaque[GOLDI_PRIVATE_KEY_BYTES]; /**< Serialized data. */
 };
 
 #ifdef __cplusplus
@@ -72,7 +101,7 @@ static const int GOLDI_EALREADYINIT = 44805;
  */
 int
 goldilocks_init ()
-__attribute__((warn_unused_result));
+__attribute__((warn_unused_result,visibility ("default")));
 
 
 /**
@@ -90,7 +119,40 @@ int
 goldilocks_keygen (
     struct goldilocks_private_key_t *privkey,
     struct goldilocks_public_key_t *pubkey
-) __attribute__((warn_unused_result,nonnull(1,2)));
+) __attribute__((warn_unused_result,nonnull(1,2),visibility ("default")));
+
+/**
+ * @brief Derive a key from its compressed form.
+ * @param [out] privkey The derived private key.
+ * @param [in] proto The compressed or proto-key, which must be 32 random bytes.
+ *
+ * @warning This isn't even my final form!
+ *
+ * @retval GOLDI_EOK Success.
+ * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
+ */
+int
+goldilocks_derive_private_key (
+    struct goldilocks_private_key_t *privkey,
+    const unsigned char proto[GOLDI_SYMKEY_BYTES]
+) __attribute__((nonnull(1,2),visibility ("default")));
+
+/**
+ * @brief Compress a private key (by copying out the proto-key)
+ * @param [out] proto The proto-key.
+ * @param [in] privkey The private key.
+ *
+ * @warning This isn't even my final form!
+ * @todo test.
+ *
+ * @retval GOLDI_EOK Success.
+ * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
+ */
+void
+goldilocks_underive_private_key (
+    unsigned char proto[GOLDI_SYMKEY_BYTES],
+    const struct goldilocks_private_key_t *privkey
+) __attribute__((nonnull(1,2),visibility ("default")));
 
 /**
  * @brief Extract the public key from a private key.
@@ -107,7 +169,7 @@ int
 goldilocks_private_to_public (
     struct goldilocks_public_key_t *pubkey,
     const struct goldilocks_private_key_t *privkey
-) __attribute__((nonnull(1,2)));
+) __attribute__((nonnull(1,2),visibility ("default")));
 
 /**
  * @brief Generate a Diffie-Hellman shared secret in constant time.
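For reference, a minimal ECDH sketch against the declarations in this header (illustrative, not patch content): both calls below should produce the same GOLDI_SHARED_SECRET_BYTES-byte secret, the second via the precomputed form declared further down, assuming GOLDI_IMPLEMENT_PRECOMPUTED_KEYS is left at its default of 1.

#include <stdint.h>
#include "goldilocks.h"

/* Alice computes the secret directly; Bob reuses a precomputed copy of
 * Alice's public key, which pays off when he contacts her repeatedly. */
static int ecdh_both_ways (
    uint8_t alice_view[GOLDI_SHARED_SECRET_BYTES],
    uint8_t bob_view[GOLDI_SHARED_SECRET_BYTES],
    const struct goldilocks_private_key_t *alice_priv,
    const struct goldilocks_public_key_t  *alice_pub,
    const struct goldilocks_private_key_t *bob_priv,
    const struct goldilocks_public_key_t  *bob_pub
) {
    int err = goldilocks_shared_secret(alice_view, alice_priv, bob_pub);
    if (err != GOLDI_EOK) return err;

    struct goldilocks_precomputed_public_key_t *alice_pre =
        goldilocks_precompute_public_key(alice_pub);
    if (!alice_pre) return -1;            /* NULL is the only failure signal for now */

    err = goldilocks_shared_secret_precomputed(bob_view, bob_priv, alice_pre);
    goldilocks_destroy_precomputed_public_key(alice_pre);
    return err;                           /* GOLDI_EOK: alice_view matches bob_view */
}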
@@ -140,10 +202,10 @@ goldilocks_private_to_public ( */ int goldilocks_shared_secret ( - uint8_t shared[64], + uint8_t shared[GOLDI_SHARED_SECRET_BYTES], const struct goldilocks_private_key_t *my_privkey, const struct goldilocks_public_key_t *your_pubkey -) __attribute__((warn_unused_result,nonnull(1,2,3))); +) __attribute__((warn_unused_result,nonnull(1,2,3),visibility ("default"))); /** * @brief Sign a message. @@ -166,11 +228,11 @@ goldilocks_shared_secret ( */ int goldilocks_sign ( - uint8_t signature_out[56*2], + uint8_t signature_out[GOLDI_SIGNATURE_BYTES], const uint8_t *message, uint64_t message_len, const struct goldilocks_private_key_t *privkey -) __attribute__((nonnull(1,2,4))); +) __attribute__((nonnull(1,2,4),visibility ("default"))); /** * @brief Verify a signature. @@ -197,11 +259,108 @@ goldilocks_sign ( */ int goldilocks_verify ( - const uint8_t signature[56*2], + const uint8_t signature[GOLDI_SIGNATURE_BYTES], const uint8_t *message, uint64_t message_len, const struct goldilocks_public_key_t *pubkey -) __attribute__((warn_unused_result,nonnull(1,2,4))); +) __attribute__((warn_unused_result,nonnull(1,2,4),visibility ("default"))); + +#if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS + +/** A public key which has been expanded by precomputation for higher speed. */ +struct goldilocks_precomputed_public_key_t; + +/** + * @brief Expand a public key by precomputation. + * + * @todo Give actual error returns, instead of ambiguous NULL. + * + * @warning This isn't even my final form! + * + * @param [in] pub The public key. + * @retval NULL We ran out of memory, or the + */ +struct goldilocks_precomputed_public_key_t * +goldilocks_precompute_public_key ( + const struct goldilocks_public_key_t *pub +) __attribute__((warn_unused_result,nonnull(1),visibility ("default"))); + +/** + * @brief Overwrite an expanded public key with zeros, then destroy it. + * + * If the input is NULL, this function does nothing. + * + * @param [in] precom The public key. + */ +void +goldilocks_destroy_precomputed_public_key ( + struct goldilocks_precomputed_public_key_t *precom +) __attribute__((visibility ("default"))); + +/** + * @brief Verify a signature. + * + * This function is fairly strict. It will correctly detect when + * the signature has the wrong cofactor component, or when the sig + * values aren't less than p or q. + * + * @warning This isn't even my final form! + * + * @param [in] signature The signature. + * @param [in] message The message to be verified. + * @param [in] message_len The length of the message to be verified. + * @param [in] pubkey The signer's public key, expanded by precomputation. + * + * @retval GOLDI_EOK Success. + * @retval GOLDI_EINVAL The public key or signature is corrupt. + * @retval GOLDI_EUNINIT You must call goldilocks_init() first. + */ +int +goldilocks_verify_precomputed ( + const uint8_t signature[GOLDI_SIGNATURE_BYTES], + const uint8_t *message, + uint64_t message_len, + const struct goldilocks_precomputed_public_key_t *pubkey +) __attribute__((warn_unused_result,nonnull(1,2,4),visibility ("default"))); + +/** + * @brief Generate a Diffie-Hellman shared secret in constant time. + * Uses a precomputation on the other party's public key for efficiency. + * + * This function uses some compile-time flags whose merit remains to + * be decided. + * + * If the flag EXPERIMENT_ECDH_OBLITERATE_CT is set, prepend 40 bytes + * of zeros to the secret before hashing. 
In the case that the other + * party's key is detectably corrupt, instead the symmetric part + * of the secret key is used to produce a pseudorandom value. + * + * If EXPERIMENT_ECDH_STIR_IN_PUBKEYS is set, the sum and product of + * the two parties' public keys is prepended to the hash. + * + * In the current version, this function can safely be run even without + * goldilocks_init(). But this property is not guaranteed for future + * versions, so call it anyway. + * + * @warning This isn't even my final form! + * + * @param [out] shared The shared secret established with the other party. + * @param [in] my_privkey My private key. + * @param [in] your_pubkey The other party's precomputed public key. + * + * @retval GOLDI_EOK Success. + * @retval GOLDI_ECORRUPT My key is corrupt. + * @retval GOLDI_EINVAL The other party's key is corrupt. + * @retval GOLDI_EUNINIT You must call goldilocks_init() first. + */ +int +goldilocks_shared_secret_precomputed ( + uint8_t shared[GOLDI_SHARED_SECRET_BYTES], + const struct goldilocks_private_key_t *my_privkey, + const struct goldilocks_precomputed_public_key_t *your_pubkey +) __attribute__((warn_unused_result,nonnull(1,2,3),visibility ("default"))); + +#endif /* GOLDI_IMPLEMENT_PRECOMPUTED_KEYS */ #ifdef __cplusplus }; /* extern "C" */ diff --git a/src/arch_arm_32/p448.c b/src/arch_arm_32/p448.c index c764955..fa3c583 100644 --- a/src/arch_arm_32/p448.c +++ b/src/arch_arm_32/p448.c @@ -28,6 +28,8 @@ smlal ( const uint32_t a, const uint32_t b ) { + +#ifdef __ARMEL__ uint32_t lo = *acc, hi = (*acc)>>32; __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" @@ -35,6 +37,9 @@ smlal ( : [a]"r"(a), [b]"r"(b)); *acc = lo + (((uint64_t)hi)<<32); +#else + *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b; +#endif } static inline void __attribute__((gnu_inline,always_inline)) @@ -43,6 +48,7 @@ smlal2 ( const uint32_t a, const uint32_t b ) { +#ifdef __ARMEL__ uint32_t lo = *acc, hi = (*acc)>>32; __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" @@ -50,6 +56,9 @@ smlal2 ( : [a]"r"(a), [b]"r"(2*b)); *acc = lo + (((uint64_t)hi)<<32); +#else + *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2); +#endif } static inline void __attribute__((gnu_inline,always_inline)) @@ -58,6 +67,7 @@ smull ( const uint32_t a, const uint32_t b ) { +#ifdef __ARMEL__ uint32_t lo, hi; __asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]" @@ -65,6 +75,9 @@ smull ( : [a]"r"(a), [b]"r"(b)); *acc = lo + (((uint64_t)hi)<<32); +#else + *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b; +#endif } static inline void __attribute__((gnu_inline,always_inline)) @@ -73,6 +86,7 @@ smull2 ( const uint32_t a, const uint32_t b ) { +#ifdef __ARMEL__ uint32_t lo, hi; __asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]" @@ -80,6 +94,9 @@ smull2 ( : [a]"r"(a), [b]"r"(2*b)); *acc = lo + (((uint64_t)hi)<<32); +#else + *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2); +#endif } void @@ -760,13 +777,13 @@ p448_mulw ( const p448_t *as, uint64_t b ) { - const uint32_t bhi = b>>28, blo = b & (1<<28)-1; + uint32_t mask = (1ull<<28)-1; + const uint32_t bhi = b>>28, blo = b & mask; const uint32_t *a = as->limb; uint32_t *c = cs->limb; uint64_t accum0, accum8; - uint32_t mask = (1ull<<28)-1; int i; @@ -957,7 +974,7 @@ p448_deserialize ( for (j=0; j<7; j++) { out |= ((uint64_t)serial[7*i+j])<<(8*j); } - x->limb[2*i] = out & (1ull<<28)-1; + x->limb[2*i] = out & ((1ull<<28)-1); x->limb[2*i+1] = out >> 28; } diff --git a/src/arch_arm_32/p448.h b/src/arch_arm_32/p448.h index 4628a89..befc9e0 100644 --- 
a/src/arch_arm_32/p448.h +++ b/src/arch_arm_32/p448.h @@ -173,7 +173,7 @@ p448_set_ui ( uint64_t x ) { int i; - out->limb[0] = x & (1<<28)-1; + out->limb[0] = x & ((1<<28)-1); out->limb[1] = x>>28; for (i=2; i<16; i++) { out->limb[i] = 0; @@ -188,7 +188,11 @@ p448_cond_swap ( ) { big_register_t *aa = (big_register_t*)a; big_register_t *bb = (big_register_t*)b; +#if __ARM_NEON__ + big_register_t m = vdupq_n_u32(doswap); +#else big_register_t m = doswap; +#endif unsigned int i; for (i=0; iy, &d->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &e->a, &L1 ); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->b, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_weak_reduce( &d->t ); + p448_sub ( &d->y, &d->z, &d->x ); + p448_bias ( &d->y, 2 ); + p448_weak_reduce( &d->y ); + p448_add ( &L0, &d->x, &d->z ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +sub_tw_niels_from_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &e->b, &L1 ); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->a, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_weak_reduce( &d->t ); + p448_add ( &d->y, &d->x, &d->z ); + p448_sub ( &L0, &d->z, &d->x ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +add_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + add_tw_niels_to_tw_extensible( e, &a->n ); +} + +void +sub_tw_pniels_from_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + sub_tw_niels_from_tw_extensible( e, &a->n ); +} + +void +double_tw_extensible ( + struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &a->u, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &L1, &a->t ); + p448_sub ( &a->t, &L1, &a->u ); + p448_bias ( &a->t, 3 ); + p448_weak_reduce( &a->t ); + p448_sub ( &L1, &L0, &L2 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 1 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->z, &L1, &L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +double_extensible ( + struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &L1, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &a->u, &a->t ); + p448_sub ( &a->t, &a->u, &L1 ); + p448_bias ( &a->t, 3 ); + p448_weak_reduce( &a->t ); + p448_sub ( &a->u, &L0, &L2 ); + p448_bias ( &a->u, 2 ); + p448_weak_reduce( &a->u ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 2 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->z, &L1, &L0 ); + 
p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +twist_and_double ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &b->u, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &L0, &b->t ); + p448_sub ( &b->t, &L0, &b->u ); + p448_bias ( &b->t, 3 ); + p448_weak_reduce( &b->t ); + p448_sub ( &L0, &b->z, &b->x ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 2 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +untwist_and_double ( + struct extensible_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &L0, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &b->u, &b->t ); + p448_sub ( &b->t, &b->u, &L0 ); + p448_bias ( &b->t, 3 ); + p448_weak_reduce( &b->t ); + p448_sub ( &b->u, &b->z, &b->x ); + p448_bias ( &b->u, 2 ); + p448_weak_reduce( &b->u ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 1 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +convert_tw_affine_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_affine_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( &b->n.c, &a->y, &a->x ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_set_ui( &b->z, 2 ); +} + +void +convert_tw_affine_to_tw_extensible ( + struct tw_extensible_t* b, + const struct tw_affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_affine_to_extensible ( + struct extensible_t* b, + const struct affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_tw_extensible_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_extensible_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( &b->n.c, &a->u, &a->t ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_add ( &b->z, &a->z, &a->z ); + p448_weak_reduce( &b->z ); +} + +void +convert_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* d +) { + p448_add ( &e->u, &d->n.b, &d->n.a ); + p448_sub ( &e->t, &d->n.b, &d->n.a ); + p448_bias ( &e->t, 2 ); + p448_weak_reduce( &e->t ); + p448_mul ( &e->x, &d->z, &e->t ); + p448_mul ( &e->y, &d->z, &e->u ); + p448_sqr ( &e->z, &d->z ); +} + +void +convert_tw_niels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_niels_t* d +) { + p448_add ( &e->y, &d->b, &d->a ); + p448_weak_reduce( &e->y ); + p448_sub ( &e->x, &d->b, &d->a ); + p448_bias ( &e->x, 
2 ); + p448_weak_reduce( &e->x ); + p448_set_ui( &e->z, 1 ); + p448_copy ( &e->t, &e->x ); + p448_copy ( &e->u, &e->y ); +} + +void +montgomery_step ( + struct montgomery_t* a +) { + struct p448_t L0, L1; + p448_add ( &L0, &a->zd, &a->xd ); + p448_sub ( &L1, &a->xd, &a->zd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_sub ( &a->zd, &a->xa, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_weak_reduce( &a->zd ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_add ( &a->zd, &a->za, &a->xa ); + p448_mul ( &a->za, &L1, &a->zd ); + p448_add ( &a->xa, &a->za, &a->xd ); + p448_sqr ( &a->zd, &a->xa ); + p448_mul ( &a->xa, &a->z0, &a->zd ); + p448_sub ( &a->zd, &a->xd, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_weak_reduce( &a->zd ); + p448_sqr ( &a->za, &a->zd ); + p448_sqr ( &a->xd, &L0 ); + p448_sqr ( &L0, &L1 ); + p448_mulw ( &a->zd, &a->xd, 39082 ); + p448_sub ( &L1, &a->xd, &L0 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_sub ( &L0, &a->zd, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->zd, &L0, &L1 ); +} + +void +deserialize_montgomery ( + struct montgomery_t* a, + const struct p448_t* sbz +) { + p448_sqr ( &a->z0, sbz ); + p448_set_ui( &a->xd, 1 ); + p448_set_ui( &a->zd, 0 ); + p448_set_ui( &a->xa, 1 ); + p448_copy ( &a->za, &a->z0 ); +} + +mask_t +serialize_montgomery ( + struct p448_t* b, + const struct montgomery_t* a, + const struct p448_t* sbz +) { + mask_t L0, L1, L2; + struct p448_t L3, L4, L5, L6; + p448_mul ( &L6, &a->z0, &a->zd ); + p448_sub ( &L4, &L6, &a->xd ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L6, &a->za, &L4 ); + p448_mul ( &L5, &a->z0, &a->xd ); + p448_sub ( &L4, &L5, &a->zd ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L3, &a->xa, &L4 ); + p448_add ( &L5, &L3, &L6 ); + p448_sub ( &L4, &L6, &L3 ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_mul ( &L6, &L4, &L5 ); + p448_copy ( &L5, &a->z0 ); + p448_addw ( &L5, 1 ); + p448_sqr ( &L4, &L5 ); + p448_mulw ( &L5, &L4, 39082 ); + p448_neg ( &L4, &L5 ); + p448_add ( &L5, &a->z0, &a->z0 ); + p448_bias ( &L5, 1 ); + p448_add ( &L3, &L5, &L5 ); + p448_add ( &L5, &L3, &L4 ); + p448_weak_reduce( &L5 ); + p448_mul ( &L3, &a->xd, &L5 ); + L1 = p448_is_zero( &a->zd ); + L2 = - L1; + p448_mask ( &L4, &L3, L1 ); + p448_add ( &L5, &L4, &a->zd ); + L0 = ~ L1; + p448_mul ( &L4, sbz, &L6 ); + p448_addw ( &L4, L2 ); + p448_mul ( &L6, &L5, &L4 ); + p448_mul ( &L4, &L6, &L5 ); + p448_mul ( &L5, &L6, &a->xd ); + p448_mul ( &L6, &L4, &L5 ); + p448_isr ( &L3, &L6 ); + p448_mul ( &L5, &L4, &L3 ); + p448_sqr ( &L4, &L3 ); + p448_mul ( &L3, &L6, &L4 ); + p448_mask ( b, &L5, L0 ); + p448_subw ( &L3, 1 ); + p448_bias ( &L3, 1 ); + L1 = p448_is_zero( &L3 ); + L0 = p448_is_zero( sbz ); + return L1 | L0; +} + +void +serialize_extensible ( + struct p448_t* b, + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sub ( &L0, &a->y, &a->z ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_add ( b, &a->z, &a->y ); + p448_mul ( &L1, &a->z, &a->x ); + p448_mul ( &L2, &L0, &L1 ); + p448_mul ( &L1, &L2, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( &L2, &L1, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( b, &L1, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, &L2, &L1 ); +} + +void +untwist_and_double_and_serialize ( + struct p448_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_mul ( &L3, &a->y, &a->x ); + p448_add ( b, &a->y, &a->x ); + p448_sqr ( &L1, b 
); + p448_add ( &L2, &L3, &L3 ); + p448_sub ( b, &L1, &L2 ); + p448_bias ( b, 3 ); + p448_weak_reduce( b ); + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &L2 ); + p448_add ( &L2, b, b ); + p448_mulw ( b, &L2, 39082 ); + p448_neg ( &L2, b ); + p448_bias ( &L2, 2 ); + p448_mulw ( &L0, &L2, 39082 ); + p448_neg ( b, &L0 ); + p448_bias ( b, 2 ); + p448_mul ( &L0, &L2, &L1 ); + p448_mul ( &L2, b, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( &L1, b, &L0 ); + p448_sqr ( b, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( b, &L1, &L3 ); +} + +void +twist_even ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L0, L1; + p448_sqr ( &b->y, &a->z ); + p448_sqr ( &b->z, &a->x ); + p448_sub ( &b->u, &b->y, &b->z ); + p448_bias ( &b->u, 2 ); + p448_weak_reduce( &b->u ); + p448_sub ( &b->z, &a->z, &a->x ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->y, &b->z, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->x, &b->z, &b->y ); + p448_mul ( &b->t, &b->x, &b->u ); + p448_mul ( &b->y, &b->x, &b->t ); + p448_isr ( &b->t, &b->y ); + p448_mul ( &b->u, &b->x, &b->t ); + p448_sqr ( &b->x, &b->t ); + p448_mul ( &b->t, &b->y, &b->x ); + p448_mul ( &b->x, &a->x, &b->u ); + p448_mul ( &b->y, &a->y, &b->u ); + L1 = p448_is_zero( &b->z ); + L0 = - L1; + p448_addw ( &b->y, L0 ); + p448_weak_reduce( &b->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +void +test_only_twist ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L0, L1; + struct p448_t L2, L3; + p448_sqr ( &b->u, &a->z ); + p448_sqr ( &b->y, &a->x ); + p448_sub ( &b->z, &b->u, &b->y ); + p448_bias ( &b->z, 2 ); + p448_add ( &b->y, &b->z, &b->z ); + p448_add ( &b->u, &b->y, &b->y ); + p448_weak_reduce( &b->u ); + p448_sub ( &b->y, &a->z, &a->x ); + p448_bias ( &b->y, 2 ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->x, &b->y, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->t, &b->z, &b->x ); + p448_mul ( &L3, &b->t, &b->u ); + p448_mul ( &b->x, &b->t, &L3 ); + p448_isr ( &L2, &b->x ); + p448_mul ( &b->u, &b->t, &L2 ); + p448_sqr ( &L3, &L2 ); + p448_mul ( &b->t, &b->x, &L3 ); + p448_add ( &b->x, &a->y, &a->x ); + p448_weak_reduce( &b->x ); + p448_sub ( &L2, &a->x, &a->y ); + p448_bias ( &L2, 2 ); + p448_weak_reduce( &L2 ); + p448_mul ( &L3, &b->t, &L2 ); + p448_add ( &L2, &L3, &b->x ); + p448_sub ( &b->t, &b->x, &L3 ); + p448_bias ( &b->t, 2 ); + p448_weak_reduce( &b->t ); + p448_mul ( &b->x, &L2, &b->u ); + L0 = p448_is_zero( &b->y ); + L1 = - L0; + p448_addw ( &b->x, L1 ); + p448_weak_reduce( &b->x ); + p448_mul ( &b->y, &b->t, &b->u ); + L0 = p448_is_zero( &b->z ); + L1 = - L0; + p448_addw ( &b->y, L1 ); + p448_weak_reduce( &b->y ); + L1 = p448_is_zero( &a->y ); + L0 = L1 + 1; + p448_set_ui( &b->z, L0 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +mask_t +is_square ( + const struct p448_t* x +) { + mask_t L0, L1; + struct p448_t L2, L3; + p448_isr ( &L2, x ); + p448_sqr ( &L3, &L2 ); + p448_mul ( &L2, x, &L3 ); + p448_subw ( &L2, 1 ); + p448_bias ( &L2, 1 ); + L1 = p448_is_zero( &L2 ); + L0 = p448_is_zero( x ); + return L1 | L0; +} + +mask_t +is_even_pt ( + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} 
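/*
 * Reader's note on two idioms that recur in the formula code above and below
 * (a hedged summary inferred from the surrounding code, not a statement of the
 * library's documented contract):
 *
 *  - p448_sub()/p448_neg() can leave limbs conceptually negative; p448_bias(x, n)
 *    adds a small multiple of p so they become nonnegative again, and
 *    p448_weak_reduce() is a carry pass that brings limbs back near the 28-bit
 *    radix without producing the canonical form (p448_strong_reduce() does that,
 *    e.g. before serialization).
 *
 *  - is_square(x) above multiplies x by the square of p448_isr(x), an inverse-
 *    square-root candidate, and accepts iff the product is 1 or x is zero;
 *    is_even_pt()/is_even_tw() below feed it z^2 - x^2 and z^2 + x^2 (i.e.
 *    1 -/+ x^2 in affine terms) to implement the evenness tests their names suggest.
 */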
+ +mask_t +is_even_tw ( + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_add ( &L0, &L1, &L2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +deserialize_affine ( + struct affine_t* a, + const struct p448_t* sz +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L1, sz ); + p448_copy ( &L3, &L1 ); + p448_addw ( &L3, 1 ); + p448_sqr ( &a->x, &L3 ); + p448_mulw ( &L3, &a->x, 39082 ); + p448_neg ( &a->x, &L3 ); + p448_add ( &L3, &L1, &L1 ); + p448_bias ( &L3, 1 ); + p448_add ( &a->y, &L3, &L3 ); + p448_add ( &L3, &a->y, &a->x ); + p448_weak_reduce( &L3 ); + p448_copy ( &a->y, &L1 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &a->x, &a->y ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_mul ( &a->y, &a->x, &L3 ); + p448_sqr ( &L2, &a->x ); + p448_mul ( &L0, &L2, &a->y ); + p448_mul ( &a->y, &a->x, &L0 ); + p448_isr ( &L3, &a->y ); + p448_mul ( &a->y, &L2, &L3 ); + p448_sqr ( &L2, &L3 ); + p448_mul ( &L3, &L0, &L2 ); + p448_mul ( &L0, &a->x, &L3 ); + p448_add ( &L2, &a->y, &a->y ); + p448_mul ( &a->x, sz, &L2 ); + p448_addw ( &L1, 1 ); + p448_mul ( &a->y, &L1, &L3 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + return p448_is_zero( &L0 ); +} + +mask_t +deserialize_and_twist_approx ( + struct tw_extensible_t* a, + const struct p448_t* sdm1, + const struct p448_t* sz +) { + struct p448_t L0, L1; + p448_sqr ( &a->z, sz ); + p448_copy ( &a->y, &a->z ); + p448_addw ( &a->y, 1 ); + p448_sqr ( &a->x, &a->y ); + p448_mulw ( &a->y, &a->x, 39082 ); + p448_neg ( &a->x, &a->y ); + p448_add ( &a->y, &a->z, &a->z ); + p448_bias ( &a->y, 1 ); + p448_add ( &a->u, &a->y, &a->y ); + p448_add ( &a->y, &a->u, &a->x ); + p448_weak_reduce( &a->y ); + p448_sqr ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &a->u, &a->x ); + p448_bias ( &a->u, 2 ); + p448_weak_reduce( &a->u ); + p448_mul ( &a->x, sdm1, &a->u ); + p448_mul ( &L0, &a->x, &a->y ); + p448_mul ( &a->t, &L0, &a->y ); + p448_mul ( &a->u, &a->x, &a->t ); + p448_mul ( &a->t, &a->u, &L0 ); + p448_mul ( &a->y, &a->x, &a->t ); + p448_isr ( &L0, &a->y ); + p448_mul ( &a->y, &a->u, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &a->u, &a->t, &L1 ); + p448_mul ( &a->t, &a->x, &a->u ); + p448_add ( &a->x, sz, sz ); + p448_mul ( &L0, &a->u, &a->x ); + p448_copy ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &L1, &a->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &a->x, &L1, &L0 ); + p448_mul ( &L0, &a->u, &a->y ); + p448_addw ( &a->z, 1 ); + p448_mul ( &a->y, &a->z, &L0 ); + p448_subw ( &a->t, 1 ); + p448_bias ( &a->t, 1 ); + mask_t ret = p448_is_zero( &a->t ); + p448_set_ui( &a->z, 1 ); + p448_copy ( &a->t, &a->x ); + p448_copy ( &a->u, &a->y ); + return ret; +} + +void +set_identity_extensible ( + struct extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_tw_extensible ( + struct tw_extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_affine ( + struct affine_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); +} + +mask_t +eq_affine ( + const struct affine_t* a, + const struct affine_t* b +) { + mask_t L0, L1; + struct p448_t L2; + p448_sub ( &L2, &a->x, &b->x ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_sub ( &L2, &a->y, &b->y 
); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +eq_extensible ( + const struct extensible_t* a, + const struct extensible_t* b +) { + mask_t L0, L1; + struct p448_t L2, L3, L4; + p448_mul ( &L4, &b->z, &a->x ); + p448_mul ( &L3, &a->z, &b->x ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_mul ( &L4, &b->z, &a->y ); + p448_mul ( &L3, &a->z, &b->y ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +eq_tw_extensible ( + const struct tw_extensible_t* a, + const struct tw_extensible_t* b +) { + mask_t L0, L1; + struct p448_t L2, L3, L4; + p448_mul ( &L4, &b->z, &a->x ); + p448_mul ( &L3, &a->z, &b->x ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_mul ( &L4, &b->z, &a->y ); + p448_mul ( &L3, &a->z, &b->y ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +void +elligator_2s_inject ( + struct affine_t* a, + const struct p448_t* r +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5, L6, L7, L8, L9; + p448_sqr ( &a->x, r ); + p448_sqr ( &L3, &a->x ); + p448_copy ( &a->y, &L3 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &L9, &a->y ); + p448_bias ( &L9, 2 ); + p448_weak_reduce( &L9 ); + p448_sqr ( &L2, &L9 ); + p448_mulw ( &L8, &L2, 1527402724 ); + p448_mulw ( &L7, &L3, 6108985600 ); + p448_add ( &a->y, &L7, &L8 ); + p448_weak_reduce( &a->y ); + p448_mulw ( &L8, &L2, 6109454568 ); + p448_sub ( &L7, &a->y, &L8 ); + p448_bias ( &L7, 2 ); + p448_weak_reduce( &L7 ); + p448_mulw ( &L4, &a->y, 78160 ); + p448_mul ( &L6, &L7, &L9 ); + p448_mul ( &L8, &L6, &L4 ); + p448_mul ( &L4, &L7, &L8 ); + p448_isr ( &L5, &L4 ); + p448_mul ( &L4, &L6, &L5 ); + p448_sqr ( &L6, &L5 ); + p448_mul ( &L5, &L8, &L6 ); + p448_mul ( &L8, &L7, &L5 ); + p448_mul ( &L7, &L8, &L5 ); + p448_copy ( &L5, &a->x ); + p448_subw ( &L5, 1 ); + p448_addw ( &a->x, 1 ); + p448_mul ( &L6, &a->x, &L8 ); + p448_sub ( &a->x, &L5, &L6 ); + p448_bias ( &a->x, 3 ); + p448_weak_reduce( &a->x ); + p448_mul ( &L5, &L4, &a->x ); + p448_mulw ( &L4, &L5, 78160 ); + p448_neg ( &a->x, &L4 ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_add ( &L4, &L3, &L3 ); + p448_add ( &L3, &L4, &L2 ); + p448_subw ( &L3, 2 ); + p448_bias ( &L3, 1 ); + p448_weak_reduce( &L3 ); + p448_mul ( &L2, &L3, &L8 ); + p448_mulw ( &L3, &L2, 3054649120 ); + p448_add ( &L2, &L3, &a->y ); + p448_mul ( &a->y, &L7, &L2 ); + L1 = p448_is_zero( &L9 ); + L0 = - L1; + p448_addw ( &a->y, L0 ); + p448_weak_reduce( &a->y ); +} + +mask_t +validate_affine ( + const struct affine_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L0, &a->y ); + p448_sqr ( &L2, &a->x ); + p448_add ( &L3, &L2, &L0 ); + p448_subw ( &L3, 1 ); + p448_mulw ( &L1, &L2, 39081 ); + p448_neg ( &L2, &L1 ); + p448_bias ( &L2, 2 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sub ( &L0, &L3, &L1 ); + p448_bias ( &L0, 3 ); + return p448_is_zero( &L0 ); +} + +mask_t +validate_tw_extensible ( + const struct tw_extensible_t* ext +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5; + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L2, &ext->t, &ext->u ); + p448_mul ( &L4, &ext->z, &L2 ); + p448_addw ( &L4, 0 ); + p448_mul ( &L3, &ext->x, &ext->y ); + p448_neg ( &L2, &L3 ); + p448_add ( &L3, &L2, &L4 ); + p448_bias ( &L3, 2 ); + L1 = p448_is_zero( &L3 ); + /* + * Check invariant: + * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 + */ + p448_sqr ( &L4, &ext->y ); + 
p448_neg ( &L2, &L4 ); + p448_addw ( &L2, 0 ); + p448_sqr ( &L3, &ext->x ); + p448_add ( &L4, &L3, &L2 ); + p448_sqr ( &L5, &ext->u ); + p448_sqr ( &L3, &ext->t ); + p448_mul ( &L2, &L3, &L5 ); + p448_mulw ( &L3, &L2, 39081 ); + p448_neg ( &L5, &L3 ); + p448_add ( &L3, &L5, &L4 ); + p448_neg ( &L5, &L2 ); + p448_add ( &L4, &L5, &L3 ); + p448_sqr ( &L3, &ext->z ); + p448_add ( &L2, &L3, &L4 ); + p448_bias ( &L2, 4 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + +mask_t +validate_extensible ( + const struct extensible_t* ext +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5; + /* + * Check invariant: + * 0 = d*t^2*u^2 - x^2 - y^2 + z^2 + */ + p448_sqr ( &L4, &ext->y ); + p448_neg ( &L3, &L4 ); + p448_addw ( &L3, 0 ); + p448_sqr ( &L2, &ext->z ); + p448_add ( &L4, &L2, &L3 ); + p448_sqr ( &L5, &ext->u ); + p448_sqr ( &L2, &ext->t ); + p448_mul ( &L3, &L2, &L5 ); + p448_mulw ( &L5, &L3, 39081 ); + p448_neg ( &L2, &L5 ); + p448_add ( &L3, &L2, &L4 ); + p448_sqr ( &L2, &ext->x ); + p448_neg ( &L4, &L2 ); + p448_add ( &L2, &L4, &L3 ); + p448_bias ( &L2, 4 ); + L1 = p448_is_zero( &L2 ); + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L3, &ext->t, &ext->u ); + p448_mul ( &L4, &ext->z, &L3 ); + p448_addw ( &L4, 0 ); + p448_mul ( &L2, &ext->x, &ext->y ); + p448_neg ( &L3, &L2 ); + p448_add ( &L2, &L3, &L4 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; +} + + diff --git a/src/arch_neon/neon_emulation.h b/src/arch_neon/neon_emulation.h new file mode 100644 index 0000000..6fecbc7 --- /dev/null +++ b/src/arch_neon/neon_emulation.h @@ -0,0 +1,150 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +/** + * @file "neon_emulation.h" + * @brief NEON intrinsic emulation using clang's vector extensions. + * + * This lets you test and debug NEON code on x86. + */ +#ifndef __NEON_EMULATION_H__ +#define __NEON_EMULATION_H__ 1 + +#include "word.h" + +#include +#include + +static __inline__ int64x2_t vaddw_s32 (int64x2_t a, int32x2_t b) { + a.x += b.x; + a.y += b.y; + return a; +} + +static __inline__ int64x2_t __attribute__((gnu_inline,always_inline)) +xx_vaddup_s64(int64x2_t x) { + x.y += x.x; + return x; +} + +typedef struct { int32x2_t val[2]; } int32x2x2_t; +static inline int32x2x2_t vtrn_s32 (int32x2_t x, int32x2_t y) { + int32x2x2_t out = {{{ x.x, y.x }, {x.y, y.y}}}; + return out; +} + +static __inline__ void __attribute__((gnu_inline,always_inline)) +xx_vtrnq_s64 ( + int64x2_t *x, + int64x2_t *y +) { + int64_t tmp = (*x).y; + (*x).y = (*y).x; + (*y).x = tmp; +} + +int64x2_t vsraq_n_s64 ( + int64x2_t a, + int64x2_t v, + const int x +) { + return a + (v >> x); +} + +int64x2_t vshrq_n_s64 ( + int64x2_t v, + const int x +) { + return v >> x; +} + +static inline int64_t vgetq_lane_s64 ( + int64x2_t acc, + const int lane +) { + return lane ? acc.y : acc.x; +} + +static inline int32_t vget_lane_s32 ( + int32x2_t acc, + const int lane +) { + return lane ? 
acc.y : acc.x; +} + +static inline int64x2_t vmlal_lane_s32 ( + int64x2_t acc, + int32x2_t x, + int32x2_t y, + int lane +) { + int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; + return acc + xx*(lane?yy.yy:yy.xx); +} + +static inline int64x2_t vmlsl_lane_s32 ( + int64x2_t acc, + int32x2_t x, + int32x2_t y, + int lane +) { + int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; + return acc - xx*(lane?yy.yy:yy.xx); +} + +static inline int64x2_t vqdmlsl_lane_s32 ( + int64x2_t acc, + int32x2_t x, + int32x2_t y, + int lane +) { + int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; + int64x2_t tmp = xx*(lane?yy.yy:yy.xx); + assert(tmp.x >> 63 == tmp.x>>62); + assert(tmp.y >> 63 == tmp.y>>62); + return acc - 2*tmp; +} + +static inline int64x2_t vqdmlal_lane_s32 ( + int64x2_t acc, + int32x2_t x, + int32x2_t y, + int lane +) { + int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; + int64x2_t tmp = xx*(lane?yy.yy:yy.xx); + assert(tmp.x >> 63 == tmp.x>>62); + assert(tmp.y >> 63 == tmp.y>>62); + return acc + 2*tmp; +} + +static inline int64x2_t vqdmull_lane_s32 ( + int32x2_t x, + int32x2_t y, + int lane +) { + int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; + int64x2_t tmp = xx*(lane?yy.yy:yy.xx); + assert(tmp.x >> 63 == tmp.x>>62); + assert(tmp.y >> 63 == tmp.y>>62); + return 2*tmp; +} + +static inline int32x2_t vmovn_s64( + int64x2_t x +) { + int32x2_t y = {x.x,x.y}; + return y; +} + +static inline int64x2_t vmull_lane_s32 ( + int32x2_t x, + int32x2_t y, + int lane +) { + int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; + return xx*(lane?yy.yy:yy.xx); +} + +#endif /* __NEON_EMULATION_H__ */ diff --git a/src/arch_neon/p448.c b/src/arch_neon/p448.c new file mode 100644 index 0000000..6cd78aa --- /dev/null +++ b/src/arch_neon/p448.c @@ -0,0 +1,749 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ + +#include "word.h" +#include "p448.h" + +static inline mask_t __attribute__((always_inline)) +is_zero ( + word_t x +) { + dword_t xx = x; + xx--; + return xx >> WORD_BITS; +} + +static uint64_t widemul_32 ( + const uint32_t a, + const uint32_t b +) { + return ((uint64_t)a)* b; +} + +#ifdef __ARM_NEON__ +static __inline__ void __attribute__((gnu_inline,always_inline)) +xx_vtrnq_s64 ( + int64x2_t *x, + int64x2_t *y +) { + __asm__ __volatile__ ("vswp %f0, %e1" : "+w"(*x), "+w"(*y)); +} + +static __inline__ int64x2_t __attribute__((gnu_inline,always_inline)) +xx_vaddup_s64(int64x2_t x) { + __asm__ ("vadd.s64 %f0, %e0" : "+w"(x)); + return x; +} +#else +#include "neon_emulation.h" +#endif // ARM_NEON + +static inline void __attribute__((gnu_inline,always_inline)) +smlal ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b; +} + +static inline void __attribute__((gnu_inline,always_inline)) +smlal2 ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; +} + +static inline void __attribute__((gnu_inline,always_inline)) +smull ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b; +} + +static inline void __attribute__((gnu_inline,always_inline)) +smull2 ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; +} + +// static inline int64x2_t copy_now(int64x2_t x) { +// int64x2_t y; +// __asm__ ("vmov %0, %1" : "=w"(y) : "w"(x)); +// return y; +// } + +void +p448_mul ( + p448_t *__restrict__ cs, + const p448_t *as, + const p448_t *bs +) { + const uint32_t *a = as->limb, *b = bs->limb; + uint32_t *c = cs->limb; + + const int32x2_t + *val = (const int32x2_t *)a, + *vbl = (const int32x2_t *)b, + *vah = (const int32x2_t *)(&a[8]), + *vbh = (const int32x2_t *)(&b[8]); + + int32x2_t + *vcl = (int32x2_t *)c, + *vch = (int32x2_t *)(&c[8]), + vmask = {(1<<28) - 1, (1<<28)-1}; + + int64x2_t accumx0a, accumx0b; + int64x2_t accumx1a, accumx1b; + int64x2_t accumx2a, accumx2b; + int64x2_t accumx3a, accumx3b; + int64x2_t accumx4a, accumx4b; + int64x2_t accumx5a, accumx5b; + int64x2_t accumx6a, accumx6b; + int64x2_t accumx7a, accumx7b; + int64x2_t carry; + int32x2x2_t trn_res; + int32x2_t delta; + + accumx0a = vmull_lane_s32( delta = val[1] + vah[1], vbh[3], 0); + accumx1a = vmull_lane_s32( delta, vbh[3], 1); + accumx2a = vmull_lane_s32( delta = val[2] + vah[2], vbh[3], 0); + accumx3a = vmull_lane_s32( delta, vbh[3], 1); + accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[2], 0); + accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[2], 1); + accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0); + accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1); + accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[1], 0); + accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[1], 1); + accumx2b = vmull_lane_s32( delta = val[0] + vah[0], vbh[1], 0); + accumx3b = vmull_lane_s32( delta, vbh[1], 1); + accumx0b = vmull_lane_s32( delta, vbh[0], 0); + accumx1b = vmull_lane_s32( delta, vbh[0], 1); + accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0); + accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1); + accumx0b = vmlal_lane_s32(accumx0b, vah[1], vbl[3], 0); + accumx1b = vmlal_lane_s32(accumx1b, vah[1], vbl[3], 1); + accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0); + accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1); + accumx0b = 
vmlal_lane_s32(accumx0b, vah[2], vbl[2], 0); + accumx1b = vmlal_lane_s32(accumx1b, vah[2], vbl[2], 1); + accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0); + accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1); + accumx0b = vmlal_lane_s32(accumx0b, vah[3], vbl[1], 0); + accumx1b = vmlal_lane_s32(accumx1b, vah[3], vbl[1], 1); + accumx2b += accumx2a; + accumx3b += accumx3a; + accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0); + accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1); + accumx0b += accumx0a; + accumx1b += accumx1a; + accumx0a = vmlal_lane_s32(accumx0a, vah[0], vbl[0], 0); + accumx1a = vmlal_lane_s32(accumx1a, vah[0], vbl[0], 1); + accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0); + accumx3a = vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1); + accumx0a = vmlal_lane_s32(accumx0a, val[1], delta = vbl[3] - vbh[3], 0); + accumx1a = vmlal_lane_s32(accumx1a, val[1], delta, 1); + accumx2a = vmlal_lane_s32(accumx2a, val[2], delta, 0); + accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1); + accumx0a = vmlal_lane_s32(accumx0a, val[2], delta = vbl[2] - vbh[2], 0); + accumx1a = vmlal_lane_s32(accumx1a, val[2], delta, 1); + accumx2a = vmlal_lane_s32(accumx2a, val[3], delta, 0); + accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1); + accumx0a = vmlal_lane_s32(accumx0a, val[3], delta = vbl[1] - vbh[1], 0); + accumx1a = vmlal_lane_s32(accumx1a, val[3], delta, 1); + accumx2a += accumx2b; + accumx3a += accumx3b; + accumx2b = vmlal_lane_s32(accumx2b, val[0], delta, 0); + accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1); + accumx0a += accumx0b; + accumx1a += accumx1b; + accumx0b = vmlal_lane_s32(accumx0b, val[0], delta = vbl[0] - vbh[0], 0); + accumx1b = vmlal_lane_s32(accumx1b, val[0], delta, 1); + accumx2b = vmlal_lane_s32(accumx2b, val[1], delta, 0); + accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1); + xx_vtrnq_s64(&accumx0a, &accumx0b); + xx_vtrnq_s64(&accumx1a, &accumx1b); + xx_vtrnq_s64(&accumx2a, &accumx2b); + xx_vtrnq_s64(&accumx3a, &accumx3b); + accumx0b += accumx1a; + accumx0b = vsraq_n_s64(accumx0b,accumx0a,28); + accumx1b = vsraq_n_s64(accumx1b,accumx0b,28); + accumx2a += accumx1b; + accumx2b += accumx3a; + accumx2b = vsraq_n_s64(accumx2b,accumx2a,28); + accumx3b = vsraq_n_s64(accumx3b,accumx2b,28); + trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b)); + vcl[0] = trn_res.val[1] & vmask; + vch[0] = trn_res.val[0] & vmask; + trn_res = vtrn_s32(vmovn_s64(accumx2a), vmovn_s64(accumx2b)); + vcl[1] = trn_res.val[1] & vmask; + vch[1] = trn_res.val[0] & vmask; + carry = accumx3b; + + accumx4a = vmull_lane_s32( delta = val[3] + vah[3], vbh[3], 0); + accumx5a = vmull_lane_s32( delta, vbh[3], 1); + accumx6b = vmull_lane_s32( delta = val[0] + vah[0], vbh[3], 0); + accumx7b = vmull_lane_s32( delta, vbh[3], 1); + accumx4b = accumx4a; + accumx5b = accumx5a; + accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[2], 0); + accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[2], 1); + accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0); + accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1); + accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[1], 0); + accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[1], 1); + accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0); + accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1); + accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[0], 0); + accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[0], 1); + accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0); + accumx7b = 
vmlal_lane_s32(accumx7b, delta, vbh[0], 1); + accumx4b = vmlal_lane_s32(accumx4b, vah[3], vbl[3], 0); + accumx5b = vmlal_lane_s32(accumx5b, vah[3], vbl[3], 1); + accumx6a = accumx6b; + accumx7a = accumx7b; + accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0); + accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1); + accumx4a += accumx4b; + accumx5a += accumx5b; + accumx4a = vmlal_lane_s32(accumx4a, vah[0], vbl[2], 0); + accumx5a = vmlal_lane_s32(accumx5a, vah[0], vbl[2], 1); + accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0); + accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1); + accumx4a = vmlal_lane_s32(accumx4a, vah[1], vbl[1], 0); + accumx5a = vmlal_lane_s32(accumx5a, vah[1], vbl[1], 1); + accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0); + accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1); + accumx4a = vmlal_lane_s32(accumx4a, vah[2], vbl[0], 0); + accumx5a = vmlal_lane_s32(accumx5a, vah[2], vbl[0], 1); + accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0); + accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1); + accumx4a = vmlal_lane_s32(accumx4a, val[3], delta = vbl[3] - vbh[3], 0); + accumx5a = vmlal_lane_s32(accumx5a, val[3], delta, 1); + /**/ + accumx6b = vmlal_lane_s32(accumx6b, val[0], delta, 0); + accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1); + accumx4b = vmlal_lane_s32(accumx4b, val[0], delta = vbl[2] - vbh[2], 0); + accumx5b = vmlal_lane_s32(accumx5b, val[0], delta, 1); + accumx6b = vmlal_lane_s32(accumx6b, val[1], delta, 0); + accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1); + accumx4b = vmlal_lane_s32(accumx4b, val[1], delta = vbl[1] - vbh[1], 0); + accumx5b = vmlal_lane_s32(accumx5b, val[1], delta, 1); + accumx6b = vmlal_lane_s32(accumx6b, val[2], delta, 0); + accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1); + accumx4b = vmlal_lane_s32(accumx4b, val[2], delta = vbl[0] - vbh[0], 0); + accumx5b = vmlal_lane_s32(accumx5b, val[2], delta, 1); + accumx6b = vmlal_lane_s32(accumx6b, val[3], delta, 0); + accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1); + + xx_vtrnq_s64(&accumx4a, &accumx4b); + xx_vtrnq_s64(&accumx5a, &accumx5b); + xx_vtrnq_s64(&accumx6a, &accumx6b); + xx_vtrnq_s64(&accumx7a, &accumx7b); + accumx4a += carry; + accumx4b += accumx5a; + accumx4b = vsraq_n_s64(accumx4b,accumx4a,28); + accumx5b = vsraq_n_s64(accumx5b,accumx4b,28); + accumx6a += accumx5b; + accumx6b += accumx7a; + + trn_res = vtrn_s32(vmovn_s64(accumx4a), vmovn_s64(accumx4b)); + vcl[2] = trn_res.val[1] & vmask; + vch[2] = trn_res.val[0] & vmask; + accumx6b = vsraq_n_s64(accumx6b,accumx6a,28); + accumx7b = vsraq_n_s64(accumx7b,accumx6b,28); + trn_res = vtrn_s32(vmovn_s64(accumx6a), vmovn_s64(accumx6b)); + vcl[3] = trn_res.val[1] & vmask; + vch[3] = trn_res.val[0] & vmask; + + accumx7b = xx_vaddup_s64(accumx7b); + + int32x2_t t0 = vcl[0], t1 = vch[0]; + trn_res = vtrn_s32(t0,t1); + t0 = trn_res.val[0]; t1 = trn_res.val[1]; + + accumx7b = vaddw_s32(accumx7b, t0); + t0 = vmovn_s64(accumx7b) & vmask; + + accumx7b = vshrq_n_s64(accumx7b,28); + accumx7b = vaddw_s32(accumx7b, t1); + t1 = vmovn_s64(accumx7b) & vmask; + trn_res = vtrn_s32(t0,t1); + vcl[0] = trn_res.val[0]; + vch[0] = trn_res.val[1]; + accumx7b = vshrq_n_s64(accumx7b,28); + + t0 = vmovn_s64(accumx7b); + + uint32_t + c0 = vget_lane_s32(t0,0), + c1 = vget_lane_s32(t0,1); + c[2] += c0; + c[10] += c1; +} + +void +p448_sqr ( + p448_t *__restrict__ cs, + const p448_t *as +) { + /* FUTURE possible improvements: + * don't use nega-phi algorithm, so as to avoid extra phi-twiddle at end + * or use 
phi/nega-phi for everything, montgomery style + * or find some sort of phi algorithm which doesn't have this problem + * break up lanemuls so that only diags get 1mul'd instead of diag 2x2 blocks + * + * These improvements are all pretty minor, but I guess together they might matter? + */ + + const uint32_t *b = as->limb; + uint32_t *c = cs->limb; + + int32x2_t vbm[4]; + + const int32x2_t + *vbl = (const int32x2_t *)b, + *vbh = (const int32x2_t *)(&b[8]); + + int i; + for (i=0; i<4; i++) { + vbm[i] = vbl[i] - vbh[i]; + } + + int32x2_t + *vcl = (int32x2_t *)c, + *vch = (int32x2_t *)(&c[8]), + vmask = {(1<<28) - 1, (1<<28)-1}; + + int64x2_t accumx0a, accumx0b; + int64x2_t accumx1a, accumx1b; + int64x2_t accumx2a, accumx2b; + int64x2_t accumx3a, accumx3b; + int64x2_t accumx4a, accumx4b; + int64x2_t accumx5a, accumx5b; + int64x2_t accumx6a, accumx6b; + int64x2_t accumx7a, accumx7b; + int64x2_t carry; + int32x2x2_t trn_res; + + accumx0a = vqdmull_lane_s32( vbh[1], vbh[3], 0); + accumx1a = vqdmull_lane_s32( vbh[1], vbh[3], 1); + accumx2a = vqdmull_lane_s32( vbh[2], vbh[3], 0); + accumx3a = vqdmull_lane_s32( vbh[2], vbh[3], 1); + accumx0a = vmlal_lane_s32(accumx0a, vbh[2], vbh[2], 0); + accumx1a = vmlal_lane_s32(accumx1a, vbh[2], vbh[2], 1); + accumx2b = accumx2a; + accumx3b = accumx3a; + accumx2b = vqdmlal_lane_s32(accumx2b, vbh[0], vbh[1], 0); + accumx3b = vqdmlal_lane_s32(accumx3b, vbh[0], vbh[1], 1); + accumx0b = accumx0a; + accumx1b = accumx1a; + accumx0b = vmlal_lane_s32(accumx0b, vbh[0], vbh[0], 0); + accumx1b = vmlal_lane_s32(accumx1b, vbh[0], vbh[0], 1); + accumx0b = vqdmlal_lane_s32(accumx0b, vbl[1], vbl[3], 0); + accumx1b = vqdmlal_lane_s32(accumx1b, vbl[1], vbl[3], 1); + accumx2b = vqdmlal_lane_s32(accumx2b, vbl[2], vbl[3], 0); + accumx3b = vqdmlal_lane_s32(accumx3b, vbl[2], vbl[3], 1); + accumx0b = vmlal_lane_s32(accumx0b, vbl[2], vbl[2], 0); + accumx1b = vmlal_lane_s32(accumx1b, vbl[2], vbl[2], 1); + accumx2a += accumx2b; + accumx3a += accumx3b; + accumx2a = vqdmlal_lane_s32(accumx2a, vbl[0], vbl[1], 0); + accumx3a = vqdmlal_lane_s32(accumx3a, vbl[0], vbl[1], 1); + accumx0a += accumx0b; + accumx1a += accumx1b; + accumx0a = vmlal_lane_s32(accumx0a, vbl[0], vbl[0], 0); + accumx1a = vmlal_lane_s32(accumx1a, vbl[0], vbl[0], 1); + accumx0a = vqdmlsl_lane_s32(accumx0a, vbm[1], vbm[3], 0); + accumx1a = vqdmlsl_lane_s32(accumx1a, vbm[1], vbm[3], 1); + accumx0a = vmlsl_lane_s32(accumx0a, vbm[2], vbm[2], 0); + accumx1a = vmlsl_lane_s32(accumx1a, vbm[2], vbm[2], 1); + accumx2a = vqdmlsl_lane_s32(accumx2a, vbm[2], vbm[3], 0); + accumx3a = vqdmlsl_lane_s32(accumx3a, vbm[2], vbm[3], 1); + accumx0b += accumx0a; + accumx1b += accumx1a; + accumx0b = vmlsl_lane_s32(accumx0b, vbm[0], vbm[0], 0); + accumx1b = vmlsl_lane_s32(accumx1b, vbm[0], vbm[0], 1); + accumx2b += accumx2a; + accumx3b += accumx3a; + accumx2b = vqdmlsl_lane_s32(accumx2b, vbm[0], vbm[1], 0); + accumx3b = vqdmlsl_lane_s32(accumx3b, vbm[0], vbm[1], 1); + xx_vtrnq_s64(&accumx0b, &accumx0a); + xx_vtrnq_s64(&accumx1b, &accumx1a); + xx_vtrnq_s64(&accumx2b, &accumx2a); + xx_vtrnq_s64(&accumx3b, &accumx3a); + accumx0a += accumx1b; + accumx0a = vsraq_n_s64(accumx0a,accumx0b,28); + accumx1a = vsraq_n_s64(accumx1a,accumx0a,28); + accumx2b += accumx1a; + accumx2a += accumx3b; + accumx2a = vsraq_n_s64(accumx2a,accumx2b,28); + accumx3a = vsraq_n_s64(accumx3a,accumx2a,28); + trn_res = vtrn_s32(vmovn_s64(accumx0b), vmovn_s64(accumx0a)); + vcl[0] = trn_res.val[1] & vmask; + vch[0] = trn_res.val[0] & vmask; + trn_res = vtrn_s32(vmovn_s64(accumx2b), 
vmovn_s64(accumx2a)); + vcl[1] = trn_res.val[1] & vmask; + vch[1] = trn_res.val[0] & vmask; + carry = accumx3a; + + accumx4a = vmull_lane_s32( vbh[3], vbh[3], 0); + accumx5a = vmull_lane_s32( vbh[3], vbh[3], 1); + accumx6b = vqdmull_lane_s32( vbh[0], vbh[3], 0); + accumx7b = vqdmull_lane_s32( vbh[0], vbh[3], 1); + accumx4b = accumx4a; + accumx5b = accumx5a; + accumx4b = vqdmlal_lane_s32(accumx4b, vbh[0], vbh[2], 0); + accumx5b = vqdmlal_lane_s32(accumx5b, vbh[0], vbh[2], 1); + accumx6b = vqdmlal_lane_s32(accumx6b, vbh[1], vbh[2], 0); + accumx7b = vqdmlal_lane_s32(accumx7b, vbh[1], vbh[2], 1); + accumx4b = vmlal_lane_s32(accumx4b, vbh[1], vbh[1], 0); + accumx5b = vmlal_lane_s32(accumx5b, vbh[1], vbh[1], 1); + accumx4b = vmlal_lane_s32(accumx4b, vbl[3], vbl[3], 0); + accumx5b = vmlal_lane_s32(accumx5b, vbl[3], vbl[3], 1); + accumx6a = accumx6b; + accumx7a = accumx7b; + accumx6a = vqdmlal_lane_s32(accumx6a, vbl[0], vbl[3], 0); + accumx7a = vqdmlal_lane_s32(accumx7a, vbl[0], vbl[3], 1); + accumx4a += accumx4b; + accumx5a += accumx5b; + accumx4a = vqdmlal_lane_s32(accumx4a, vbl[0], vbl[2], 0); + accumx5a = vqdmlal_lane_s32(accumx5a, vbl[0], vbl[2], 1); + accumx6a = vqdmlal_lane_s32(accumx6a, vbl[1], vbl[2], 0); + accumx7a = vqdmlal_lane_s32(accumx7a, vbl[1], vbl[2], 1); + accumx4a = vmlal_lane_s32(accumx4a, vbl[1], vbl[1], 0); + accumx5a = vmlal_lane_s32(accumx5a, vbl[1], vbl[1], 1); + accumx4a = vmlsl_lane_s32(accumx4a, vbm[3], vbm[3], 0); + accumx5a = vmlsl_lane_s32(accumx5a, vbm[3], vbm[3], 1); + accumx6b += accumx6a; + accumx7b += accumx7a; + accumx6b = vqdmlsl_lane_s32(accumx6b, vbm[0], vbm[3], 0); + accumx7b = vqdmlsl_lane_s32(accumx7b, vbm[0], vbm[3], 1); + accumx4b += accumx4a; + accumx5b += accumx5a; + accumx4b = vqdmlsl_lane_s32(accumx4b, vbm[0], vbm[2], 0); + accumx5b = vqdmlsl_lane_s32(accumx5b, vbm[0], vbm[2], 1); + accumx4b = vmlsl_lane_s32(accumx4b, vbm[1], vbm[1], 0); + accumx5b = vmlsl_lane_s32(accumx5b, vbm[1], vbm[1], 1); + accumx6b = vqdmlsl_lane_s32(accumx6b, vbm[1], vbm[2], 0); + accumx7b = vqdmlsl_lane_s32(accumx7b, vbm[1], vbm[2], 1); + + xx_vtrnq_s64(&accumx4b, &accumx4a); + xx_vtrnq_s64(&accumx5b, &accumx5a); + xx_vtrnq_s64(&accumx6b, &accumx6a); + xx_vtrnq_s64(&accumx7b, &accumx7a); + accumx4b += carry; + accumx4a += accumx5b; + accumx4a = vsraq_n_s64(accumx4a,accumx4b,28); + accumx5a = vsraq_n_s64(accumx5a,accumx4a,28); + accumx6b += accumx5a; + accumx6a += accumx7b; + + trn_res = vtrn_s32(vmovn_s64(accumx4b), vmovn_s64(accumx4a)); + vcl[2] = trn_res.val[1] & vmask; + vch[2] = trn_res.val[0] & vmask; + accumx6a = vsraq_n_s64(accumx6a,accumx6b,28); + accumx7a = vsraq_n_s64(accumx7a,accumx6a,28); + trn_res = vtrn_s32(vmovn_s64(accumx6b), vmovn_s64(accumx6a)); + vcl[3] = trn_res.val[1] & vmask; + vch[3] = trn_res.val[0] & vmask; + + accumx7a = xx_vaddup_s64(accumx7a); + + int32x2_t t0 = vcl[0], t1 = vch[0]; + trn_res = vtrn_s32(t0,t1); + t0 = trn_res.val[0]; t1 = trn_res.val[1]; + + accumx7a = vaddw_s32(accumx7a, t0); + t0 = vmovn_s64(accumx7a) & vmask; + + accumx7a = vshrq_n_s64(accumx7a,28); + accumx7a = vaddw_s32(accumx7a, t1); + t1 = vmovn_s64(accumx7a) & vmask; + trn_res = vtrn_s32(t0,t1); + vcl[0] = trn_res.val[0]; + vch[0] = trn_res.val[1]; + accumx7a = vshrq_n_s64(accumx7a,28); + + t0 = vmovn_s64(accumx7a); + + uint32_t + c0 = vget_lane_s32(t0,0), + c1 = vget_lane_s32(t0,1); + c[2] += c0; + c[10] += c1; +} + +void +p448_mulw ( + p448_t *__restrict__ cs, + const p448_t *as, + uint64_t b +) { + const uint32_t bhi = b>>28, blo = b & ((1<<28)-1); + + const uint32_t 
*a = as->limb; + uint32_t *c = cs->limb; + + uint64_t accum0, accum8; + uint32_t mask = (1ull<<28)-1; + + int i; + + uint32_t c0, c8, n0, n8; + accum0 = widemul_32(bhi, a[15]); + accum8 = widemul_32(bhi, a[15] + a[7]); + c0 = a[0]; c8 = a[8]; + smlal(&accum0, blo, c0); + smlal(&accum8, blo, c8); + + c[0] = accum0 & mask; accum0 >>= 28; + c[8] = accum8 & mask; accum8 >>= 28; + + i=1; + { + n0 = a[i]; n8 = a[i+8]; + smlal(&accum0, bhi, c0); + smlal(&accum8, bhi, c8); + smlal(&accum0, blo, n0); + smlal(&accum8, blo, n8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + c0 = a[i]; c8 = a[i+8]; + smlal(&accum0, bhi, n0); + smlal(&accum8, bhi, n8); + smlal(&accum0, blo, c0); + smlal(&accum8, blo, c8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + n0 = a[i]; n8 = a[i+8]; + smlal(&accum0, bhi, c0); + smlal(&accum8, bhi, c8); + smlal(&accum0, blo, n0); + smlal(&accum8, blo, n8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + c0 = a[i]; c8 = a[i+8]; + smlal(&accum0, bhi, n0); + smlal(&accum8, bhi, n8); + smlal(&accum0, blo, c0); + smlal(&accum8, blo, c8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + n0 = a[i]; n8 = a[i+8]; + smlal(&accum0, bhi, c0); + smlal(&accum8, bhi, c8); + smlal(&accum0, blo, n0); + smlal(&accum8, blo, n8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + c0 = a[i]; c8 = a[i+8]; + smlal(&accum0, bhi, n0); + smlal(&accum8, bhi, n8); + smlal(&accum0, blo, c0); + smlal(&accum8, blo, c8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + { + n0 = a[i]; n8 = a[i+8]; + smlal(&accum0, bhi, c0); + smlal(&accum8, bhi, c8); + smlal(&accum0, blo, n0); + smlal(&accum8, blo, n8); + + c[i] = accum0 & mask; accum0 >>= 28; + c[i+8] = accum8 & mask; accum8 >>= 28; + i++; + } + + accum0 += accum8 + c[8]; + c[8] = accum0 & mask; + c[9] += accum0 >> 28; + + accum8 += c[0]; + c[0] = accum8 & mask; + c[1] += accum8 >> 28; +} + +void +p448_strong_reduce ( + p448_t *a +) { + word_t mask = (1ull<<28)-1; + + /* first, clear high */ + a->limb[8] += a->limb[15]>>28; + a->limb[0] += a->limb[15]>>28; + a->limb[15] &= mask; + + /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */ + + /* compute total_value - p. No need to reduce mod p. */ + + dsword_t scarry = 0; + int i; + for (i=0; i<16; i++) { + scarry = scarry + a->limb[i] - ((i==8)?mask-1:mask); + a->limb[i] = scarry & mask; + scarry >>= 28; + } + + /* uncommon case: it was >= p, so now scarry = 0 and this = x + * common case: it was < p, so now scarry = -1 and this = x - p + 2^448 + * so let's add back in p. will carry back off the top for 2^448. 
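+     * concretely, with 28-bit limbs p = 2^448 - 2^224 - 1 is (mask, mask, ...,
+     * mask-1, ..., mask) with the mask-1 sitting in limb 8, which is why the
+     * subtraction above used (i==8)?mask-1:mask and why the masked add-back
+     * below adds scarry_mask to every limb except limb 8, which gets
+     * scarry_mask & ~1.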
+ */ + + assert(is_zero(scarry) | is_zero(scarry+1)); + + word_t scarry_mask = scarry & mask; + dword_t carry = 0; + + /* add it back */ + for (i=0; i<16; i++) { + carry = carry + a->limb[i] + ((i==8)?(scarry_mask&~1):scarry_mask); + a->limb[i] = carry & mask; + carry >>= 28; + } + + assert(is_zero(carry + scarry)); +} + +mask_t +p448_is_zero ( + const struct p448_t *a +) { + struct p448_t b; + p448_copy(&b,a); + p448_strong_reduce(&b); + + uint32_t any = 0; + int i; + for (i=0; i<16; i++) { + any |= b.limb[i]; + } + return is_zero(any); +} + +void +p448_serialize ( + uint8_t *serial, + const struct p448_t *x +) { + int i,j; + p448_t red; + p448_copy(&red, x); + p448_strong_reduce(&red); + for (i=0; i<8; i++) { + uint64_t limb = red.limb[2*i] + (((uint64_t)red.limb[2*i+1])<<28); + for (j=0; j<7; j++) { + serial[7*i+j] = limb; + limb >>= 8; + } + assert(limb == 0); + } +} + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +) { + int i,j; + for (i=0; i<8; i++) { + uint64_t out = 0; + for (j=0; j<7; j++) { + out |= ((uint64_t)serial[7*i+j])<<(8*j); + } + x->limb[2*i] = out & ((1ull<<28)-1); + x->limb[2*i+1] = out >> 28; + } + + /* Check for reduction. + * + * The idea is to create a variable ge which is all ones (rather, 56 ones) + * if and only if the low $i$ words of $x$ are >= those of p. + * + * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) + */ + uint32_t ge = -1, mask = (1ull<<28)-1; + for (i=0; i<8; i++) { + ge &= x->limb[i]; + } + + /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ + ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask); + + /* Propagate the rest */ + for (i=9; i<16; i++) { + ge &= x->limb[i]; + } + + return ~is_zero(ge ^ mask); +} + +void +simultaneous_invert_p448( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +) { + if (n==0) { + return; + } else if (n==1) { + p448_inverse(out,in); + return; + } + + p448_copy(&out[1], &in[0]); + int i; + for (i=1; i<(int) (n-1); i++) { + p448_mul(&out[i+1], &out[i], &in[i]); + } + p448_mul(&out[0], &out[n-1], &in[n-1]); + + struct p448_t tmp; + p448_inverse(&tmp, &out[0]); + p448_copy(&out[0], &tmp); + + /* at this point, out[0] = product(in[i]) ^ -1 + * out[i] = product(in[0]..in[i-1]) if i != 0 + */ + for (i=n-1; i>0; i--) { + p448_mul(&tmp, &out[i], &out[0]); + p448_copy(&out[i], &tmp); + + p448_mul(&tmp, &out[0], &in[i]); + p448_copy(&out[0], &tmp); + } +} diff --git a/src/arch_neon/p448.h b/src/arch_neon/p448.h new file mode 100644 index 0000000..94dacd7 --- /dev/null +++ b/src/arch_neon/p448.h @@ -0,0 +1,378 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ +#ifndef __P448_H__ +#define __P448_H__ 1 + +#include "word.h" + +#include +#include + +typedef struct p448_t { + uint32_t limb[16]; +} __attribute__((aligned(32))) p448_t; + +#ifdef __cplusplus +extern "C" { +#endif + +static __inline__ void +p448_set_ui ( + p448_t *out, + uint64_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t do_swap +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_add ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_neg ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_neg ( + p448_t *a, + mask_t doNegate +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_addw ( + p448_t *a, + uint32_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_subw ( + p448_t *a, + uint32_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_copy ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_weak_reduce ( + p448_t *inout +) __attribute__((unused,always_inline)); + +void +p448_strong_reduce ( + p448_t *inout +); + +mask_t +p448_is_zero ( + const p448_t *in +); + +static __inline__ void +p448_bias ( + p448_t *inout, + int amount +) __attribute__((unused,always_inline)); + +void +p448_mul ( + p448_t *__restrict__ out, + const p448_t *a, + const p448_t *b +); + +void +p448_mulw ( + p448_t *__restrict__ out, + const p448_t *a, + uint64_t b +); + +void +p448_sqr ( + p448_t *__restrict__ out, + const p448_t *a +); + +static __inline__ void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) __attribute__((unused,always_inline)); + +void +p448_serialize ( + uint8_t *serial, + const struct p448_t *x +); + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +); + +static __inline__ void +p448_mask( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) __attribute__((unused,always_inline)); + +/** +* Returns 1/x. +* +* If x=0, returns 0. 
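+*
+* (Implementation note, not part of the contract: field inversion here is
+* typically a constant-time exponentiation chain, e.g. x^(p-2) mod p or an
+* inverse-square-root ladder built from p448_sqrn and p448_mul; either way
+* 0^k = 0, which is why the x=0 convention costs nothing extra.)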
+*/ +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +); + +void +simultaneous_invert_p448 ( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +); + +static inline mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) __attribute__((always_inline,unused)); + +/* -------------- Inline functions begin here -------------- */ + +void +p448_set_ui ( + p448_t *out, + uint64_t x +) { + int i; + out->limb[0] = x & ((1<<28)-1); + out->limb[1] = x>>28; + for (i=2; i<16; i++) { + out->limb[i] = 0; + } +} + +void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t doswap +) { + big_register_t *aa = (big_register_t*)a; + big_register_t *bb = (big_register_t*)b; + big_register_t m = br_set_to_mask(doswap); + + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] + b->limb[i]; + } + */ +} + +void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] - b->limb[i]; + } + */ +} + +void +p448_neg ( + p448_t *out, + const p448_t *a +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = -a->limb[i]; + } + */ +} + +void +p448_cond_neg( + p448_t *a, + mask_t doNegate +) { + unsigned int i; + struct p448_t negated; + big_register_t *aa = (big_register_t *)a; + big_register_t *nn = (big_register_t*)&negated; + big_register_t m = br_set_to_mask(doNegate); + + p448_neg(&negated, a); + p448_bias(&negated, 2); + + for (i=0; ilimb[0] += x; +} + +void +p448_subw ( + p448_t *a, + uint32_t x +) { + a->limb[0] -= x; +} + +void +p448_copy ( + p448_t *out, + const p448_t *a +) { + *out = *a; +} + +void +p448_bias ( + p448_t *a, + int amt +) { + uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt; + uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1}; + uint32x4_t *aa = (uint32x4_t*) a; + aa[0] += lo; + aa[1] += lo; + aa[2] += hi; + aa[3] += lo; +} + +void +p448_weak_reduce ( + p448_t *a +) { + uint64_t mask = (1ull<<28) - 1; + uint64_t tmp = a->limb[15] >> 28; + int i; + a->limb[8] += tmp; + for (i=15; i>0; i--) { + a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); + } + a->limb[0] = (a->limb[0] & mask) + tmp; +} + +void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) { + p448_t tmp; + assert(n>0); + if (n&1) { + p448_sqr(y,x); + n--; + } else { + p448_sqr(&tmp,x); + p448_sqr(y,&tmp); + n-=2; + } + for (; n; n-=2) { + p448_sqr(&tmp,y); + p448_sqr(y,&tmp); + } +} + +mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) { + struct p448_t ra, rb; + p448_copy(&ra, a); + p448_copy(&rb, b); + p448_weak_reduce(&ra); + p448_weak_reduce(&rb); + p448_sub(&ra, &ra, &rb); + p448_bias(&ra, 2); + return p448_is_zero(&ra); +} + +void +p448_mask ( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + a->limb[i] = b->limb[i] & mask; + } +} + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __P448_H__ */ diff --git a/src/arch_x86_64/p448.c b/src/arch_x86_64/p448.c index 7a37195..4abc788 100644 --- a/src/arch_x86_64/p448.c +++ b/src/arch_x86_64/p448.c @@ -17,13 +17,14 @@ p448_mul ( __uint128_t accum0 = 0, accum1 = 0, accum2; uint64_t mask = (1ull<<56) - 1; - uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))); + uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))), bbb[4] __attribute__((aligned(32))); /* For some reason clang doesn't vectorize this without prompting? 
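      * (the loop below just precomputes the folded operand arrays aa[], bb[]
      * and, with this change, bbb[], which the Karatsuba-style schedule for
      * multiplying mod 2^448 - 2^224 - 1 keeps reusing; keeping them as small
      * fixed-size aligned arrays is what lets clang emit plain vector adds)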
*/ unsigned int i; for (i=0; i>= 56; accum1 >>= 56; - accum2 = widemul(&aa[2],&bb[3]); - msb(&accum0, &a[2], &b[3]); - mac(&accum1, &a[6], &b[7]); + accum2 = widemul(&a[2],&b[7]); + mac(&accum0, &a[6], &bb[3]); + mac(&accum1, &aa[2], &bbb[3]); - mac(&accum2, &aa[3], &bb[2]); - msb(&accum0, &a[3], &b[2]); - mac(&accum1, &a[7], &b[6]); + mac(&accum2, &a[3], &b[6]); + mac(&accum0, &a[7], &bb[2]); + mac(&accum1, &aa[3], &bbb[2]); - accum1 += accum2; - accum0 += accum2; - - accum2 = widemul(&a[0],&b[1]); + mac(&accum2, &a[0],&b[1]); mac(&accum1, &aa[0], &bb[1]); mac(&accum0, &a[4], &b[5]); @@ -109,14 +107,11 @@ p448_mul ( accum0 >>= 56; accum1 >>= 56; - accum2 = widemul(&aa[3],&bb[3]); - msb(&accum0, &a[3], &b[3]); - mac(&accum1, &a[7], &b[7]); - - accum1 += accum2; - accum0 += accum2; + accum2 = widemul(&a[3],&b[7]); + mac(&accum0, &a[7], &bb[3]); + mac(&accum1, &aa[3], &bbb[3]); - accum2 = widemul(&a[0],&b[2]); + mac(&accum2, &a[0],&b[2]); mac(&accum1, &aa[0], &bb[2]); mac(&accum0, &a[4], &b[6]); @@ -186,11 +181,9 @@ p448_mulw ( c[3] = accum0 & mask; accum0 >>= 56; c[7] = accum4 & mask; accum4 >>= 56; - c[4] += accum0 + accum4; - c[0] += accum4; + // c[4] += accum0 + accum4; + // c[0] += accum4; - /* - * TODO: double-check that this is not necessary. accum0 += accum4 + c[4]; c[4] = accum0 & mask; c[5] += accum0 >> 56; @@ -198,7 +191,6 @@ p448_mulw ( accum4 += c[0]; c[0] = accum4 & mask; c[1] += accum4 >> 56; - */ } void diff --git a/src/arch_x86_64/p448.h b/src/arch_x86_64/p448.h index b0b4dc0..2e2fbed 100644 --- a/src/arch_x86_64/p448.h +++ b/src/arch_x86_64/p448.h @@ -290,7 +290,10 @@ p448_copy ( p448_t *out, const p448_t *a ) { - *out = *a; + unsigned int i; + for (i=0; ilimb[i] += (i==4) ? co2 : co1; + } +#endif } void diff --git a/src/exported.sym b/src/exported.sym deleted file mode 100644 index e26f3db..0000000 --- a/src/exported.sym +++ /dev/null @@ -1,6 +0,0 @@ -_goldilocks_init -_goldilocks_keygen -_goldilocks_shared_secret -_goldilocks_sign -_goldilocks_verify -_goldilocks_private_to_public diff --git a/src/goldilocks.c b/src/goldilocks.c index f178d7a..4314e46 100644 --- a/src/goldilocks.c +++ b/src/goldilocks.c @@ -32,7 +32,10 @@ #define GOLDILOCKS_RANDOM_RESEEDS_MANDATORY 0 #endif -/* FUTURE: auto */ +#define GOLDI_FIELD_WORDS ((GOLDI_FIELD_BITS+WORD_BITS-1)/(WORD_BITS)) +#define GOLDI_DIVERSIFY_BYTES 8 + +/* FUTURE: auto. MAGIC */ const struct affine_t goldilocks_base_point = { {{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7), U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa), @@ -42,11 +45,12 @@ const struct affine_t goldilocks_base_point = { {{ 19 }} }; +/* These are just unique identifiers */ static const char *G_INITING = "initializing"; static const char *G_INITED = "initialized"; static const char *G_FAILED = "failed to initialize"; -/* FUTURE: auto */ +/* FUTURE: auto. 
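That is, presumably, generate this constant (and the base point above) from
the curve definition at build time instead of hand-maintaining it; until then
it is tagged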
MAGIC */ static const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = { U64LE(0xdc873d6d54a7bb0d), U64LE(0xde933d8d723a70aa), @@ -54,19 +58,45 @@ static const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = { 0x8335dc16 }; const struct barrett_prime_t goldi_q448 = { - 448/WORD_BITS, + GOLDI_FIELD_WORDS, 62 % WORD_BITS, sizeof(goldi_q448_lo)/sizeof(goldi_q448_lo[0]), goldi_q448_lo }; -/* FUTURE: auto */ +/* MAGIC */ +static const struct p448_t +sqrt_d_minus_1 = {{ + U58LE(0xd2e21836749f46), + U58LE(0x888db42b4f0179), + U58LE(0x5a189aabdeea38), + U58LE(0x51e65ca6f14c06), + U58LE(0xa49f7b424d9770), + U58LE(0xdcac4628c5f656), + U58LE(0x49443b8748734a), + U58LE(0x12fec0c0b25b7a) +}}; + +struct goldilocks_precomputed_public_key_t { + struct goldilocks_public_key_t pub; + struct fixed_base_table_t table; +}; + +#ifndef USE_BIG_TABLES +#if __ARM_NEON__ +#define USE_BIG_TABLES 1 +#else +#define USE_BIG_TABLES (WORD_BITS==64) +#endif +#endif + +/* FUTURE: auto. MAGIC */ struct { const char * volatile state; #if GOLDILOCKS_USE_PTHREAD pthread_mutex_t mutex; #endif - struct tw_niels_t combs[(WORD_BITS==64) ? 80 : 64]; + struct tw_niels_t combs[USE_BIG_TABLES ? 80 : 64]; struct fixed_base_table_t fixed_base; struct tw_niels_t wnafs[32]; struct crandom_state_t rand; @@ -107,7 +137,7 @@ goldilocks_init () { /* Precompute the tables. */ mask_t succ; - int big = (WORD_BITS==64); + int big = USE_BIG_TABLES; uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14; succ = precompute_fixed_base(&goldilocks_global.fixed_base, &text, n, t, s, goldilocks_global.combs); @@ -135,55 +165,77 @@ fail: return -1; } -static const struct p448_t -sqrt_d_minus_1 = {{ - U58LE(0xd2e21836749f46), - U58LE(0x888db42b4f0179), - U58LE(0x5a189aabdeea38), - U58LE(0x51e65ca6f14c06), - U58LE(0xa49f7b424d9770), - U58LE(0xdcac4628c5f656), - U58LE(0x49443b8748734a), - U58LE(0x12fec0c0b25b7a) -}}; - int -goldilocks_keygen ( +goldilocks_derive_private_key ( struct goldilocks_private_key_t *privkey, - struct goldilocks_public_key_t *pubkey + const unsigned char proto[GOLDI_SYMKEY_BYTES] ) { if (!goldilocks_check_init()) { return GOLDI_EUNINIT; } - word_t sk[448*2/WORD_BITS]; + memcpy(&privkey->opaque[2*GOLDI_FIELD_BYTES], proto, GOLDI_SYMKEY_BYTES); + + unsigned char skb[SHA512_OUTPUT_BYTES]; + word_t sk[GOLDI_FIELD_WORDS]; + assert(sizeof(skb) >= sizeof(sk)); + struct sha512_ctx_t ctx; struct tw_extensible_t exta; struct p448_t pk; + + sha512_init(&ctx); + sha512_update(&ctx, (const unsigned char *)"derivepk", GOLDI_DIVERSIFY_BYTES); + sha512_update(&ctx, proto, GOLDI_SYMKEY_BYTES); + sha512_final(&ctx, (unsigned char *)skb); + + barrett_deserialize_and_reduce(sk, skb, SHA512_OUTPUT_BYTES, &goldi_q448); + barrett_serialize(privkey->opaque, sk, GOLDI_FIELD_BYTES); + + scalarmul_fixed_base(&exta, sk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base); + untwist_and_double_and_serialize(&pk, &exta); + + p448_serialize(&privkey->opaque[GOLDI_FIELD_BYTES], &pk); + + return GOLDI_EOK; +} + +void +goldilocks_underive_private_key ( + unsigned char proto[GOLDI_SYMKEY_BYTES], + const struct goldilocks_private_key_t *privkey +) { + memcpy(proto, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES); +} + +int +goldilocks_keygen ( + struct goldilocks_private_key_t *privkey, + struct goldilocks_public_key_t *pubkey +) { + if (!goldilocks_check_init()) { + return GOLDI_EUNINIT; + } + + unsigned char proto[GOLDI_SYMKEY_BYTES]; #if GOLDILOCKS_USE_PTHREAD int ml_ret = pthread_mutex_lock(&goldilocks_global.mutex); if (ml_ret) return 
ml_ret; #endif - int ret = crandom_generate(&goldilocks_global.rand, (unsigned char *)sk, sizeof(sk)); - int ret2 = crandom_generate(&goldilocks_global.rand, &privkey->opaque[112], 32); - if (!ret) ret = ret2; + int ret = crandom_generate(&goldilocks_global.rand, proto, sizeof(proto)); #if GOLDILOCKS_USE_PTHREAD ml_ret = pthread_mutex_unlock(&goldilocks_global.mutex); if (ml_ret) abort(); #endif - barrett_reduce(sk,sizeof(sk)/sizeof(sk[0]),0,&goldi_q448); - barrett_serialize(privkey->opaque, sk, 448/8); - - scalarmul_fixed_base(&exta, sk, 448, &goldilocks_global.fixed_base); - //transfer_and_serialize_qtor(&pk, &sqrt_d_minus_1, &exta); - untwist_and_double_and_serialize(&pk, &exta); + int ret2 = goldilocks_derive_private_key(privkey, proto); + if (!ret) ret = ret2; - p448_serialize(pubkey->opaque, &pk); - memcpy(&privkey->opaque[56], pubkey->opaque, 56); + ret2 = goldilocks_private_to_public(pubkey, privkey); + if (!ret) ret = ret2; return ret ? GOLDI_ENODICE : GOLDI_EOK; } @@ -194,7 +246,7 @@ goldilocks_private_to_public ( const struct goldilocks_private_key_t *privkey ) { struct p448_t pk; - mask_t msucc = p448_deserialize(&pk,&privkey->opaque[56]); + mask_t msucc = p448_deserialize(&pk,&privkey->opaque[GOLDI_FIELD_BYTES]); if (msucc) { p448_serialize(pubkey->opaque, &pk); @@ -204,30 +256,46 @@ goldilocks_private_to_public ( } } -int -goldilocks_shared_secret ( - uint8_t shared[64], +static int +goldilocks_shared_secret_core ( + uint8_t shared[GOLDI_SHARED_SECRET_BYTES], const struct goldilocks_private_key_t *my_privkey, - const struct goldilocks_public_key_t *your_pubkey + const struct goldilocks_public_key_t *your_pubkey, + const struct goldilocks_precomputed_public_key_t *pre ) { /* This function doesn't actually need anything in goldilocks_global, * so it doesn't check init. */ - word_t sk[448/WORD_BITS]; + assert(GOLDI_SHARED_SECRET_BYTES == SHA512_OUTPUT_BYTES); + + word_t sk[GOLDI_FIELD_WORDS]; struct p448_t pk; mask_t succ = p448_deserialize(&pk,your_pubkey->opaque), msucc = -1; #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS struct p448_t sum, prod; - msucc &= p448_deserialize(&sum,&my_privkey->opaque[56]); + msucc &= p448_deserialize(&sum,&my_privkey->opaque[GOLDI_FIELD_BYTES]); p448_mul(&prod,&pk,&sum); p448_add(&sum,&pk,&sum); #endif msucc &= barrett_deserialize(sk,my_privkey->opaque,&goldi_q448); - succ &= montgomery_ladder(&pk,&pk,sk,446,2); + +#if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS + if (pre) { + struct tw_extensible_t tw; + succ &= scalarmul_fixed_base(&tw, sk, GOLDI_SCALAR_BITS, &pre->table); + untwist_and_double_and_serialize(&pk, &tw); + } else { + succ &= montgomery_ladder(&pk,&pk,sk,GOLDI_SCALAR_BITS,1); + } +#else + (void)pre; + succ &= montgomery_ladder(&pk,&pk,sk,GOLDI_SCALAR_BITS,1); +#endif + p448_serialize(shared,&pk); @@ -236,28 +304,28 @@ goldilocks_shared_secret ( sha512_init(&ctx); #ifdef EXPERIMENT_ECDH_OBLITERATE_CT - uint8_t oblit[40]; + uint8_t oblit[GOLDI_DIVERSIFY_BYTES + GOLDI_SYMKEY_BYTES]; unsigned i; - for (i=0; i<8; i++) { + for (i=0; iopaque[112+i] & ~(succ&msucc); + for (i=0; iopaque[2*GOLDI_FIELD_BYTES+i] & ~(succ&msucc); } - sha512_update(&ctx, oblit, 40); + sha512_update(&ctx, oblit, sizeof(oblit)); #endif #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS /* stir in the sum and product of the pubkeys. 
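      * Both quantities are symmetric in the two public keys, so the two sides
      * of the exchange hash exactly the same bytes without having to agree on
      * a key ordering, and the derived secret ends up bound to both public
      * keys.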
*/ - uint8_t a_pk[56]; + uint8_t a_pk[GOLDI_FIELD_BYTES]; p448_serialize(a_pk, &sum); - sha512_update(&ctx, a_pk, 56); + sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES); p448_serialize(a_pk, &prod); - sha512_update(&ctx, a_pk, 56); + sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES); #endif /* stir in the shared key and finish */ - sha512_update(&ctx, shared, 56); + sha512_update(&ctx, shared, GOLDI_FIELD_BYTES); sha512_final(&ctx, shared); return (GOLDI_ECORRUPT & ~msucc) @@ -265,9 +333,42 @@ goldilocks_shared_secret ( | (GOLDI_EOK & msucc & succ); } +int +goldilocks_shared_secret ( + uint8_t shared[GOLDI_SHARED_SECRET_BYTES], + const struct goldilocks_private_key_t *my_privkey, + const struct goldilocks_public_key_t *your_pubkey +) { + return goldilocks_shared_secret_core( + shared, + my_privkey, + your_pubkey, + NULL + ); +} + +static void +goldilocks_derive_challenge( + word_t challenge[GOLDI_FIELD_WORDS], + const unsigned char pubkey[GOLDI_FIELD_BYTES], + const unsigned char gnonce[GOLDI_FIELD_BYTES], + const unsigned char *message, + uint64_t message_len +) { + /* challenge = H(pk, [nonceG], message). */ + unsigned char sha_out[SHA512_OUTPUT_BYTES]; + struct sha512_ctx_t ctx; + sha512_init(&ctx); + sha512_update(&ctx, pubkey, GOLDI_FIELD_BYTES); + sha512_update(&ctx, gnonce, GOLDI_FIELD_BYTES); + sha512_update(&ctx, message, message_len); + sha512_final(&ctx, sha_out); + barrett_deserialize_and_reduce(challenge, sha_out, sizeof(sha_out), &goldi_q448); +} + int goldilocks_sign ( - uint8_t signature_out[56*2], + uint8_t signature_out[GOLDI_SIGNATURE_BYTES], const uint8_t *message, uint64_t message_len, const struct goldilocks_private_key_t *privkey @@ -277,7 +378,7 @@ goldilocks_sign ( } /* challenge = H(pk, [nonceG], message). */ - word_t skw[448/WORD_BITS]; + word_t skw[GOLDI_FIELD_WORDS]; mask_t succ = barrett_deserialize(skw,privkey->opaque,&goldi_q448); if (!succ) { memset(skw,0,sizeof(skw)); @@ -285,48 +386,50 @@ goldilocks_sign ( } /* Derive a nonce. TODO: use HMAC. FUTURE: factor. 
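      * (As committed, the nonce is SHA-512("signonce" || symkey || message || symkey)
      * reduced mod q; the TODO presumably means moving to the more standard
      * derandomized shape, roughly
      *     nonce = HMAC-SHA512(key = symkey, msg = message) mod q,
      * which this file does not implement yet.)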
*/ - unsigned char sha_out[512/8]; - word_t tk[448/WORD_BITS]; + unsigned char sha_out[SHA512_OUTPUT_BYTES]; + word_t tk[GOLDI_FIELD_WORDS]; struct sha512_ctx_t ctx; sha512_init(&ctx); sha512_update(&ctx, (const unsigned char *)"signonce", 8); - sha512_update(&ctx, &privkey->opaque[112], 32); + sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES); sha512_update(&ctx, message, message_len); - sha512_update(&ctx, &privkey->opaque[112], 32); + sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES); sha512_final(&ctx, sha_out); - barrett_deserialize_and_reduce(tk, sha_out, 512/8, &goldi_q448); + barrett_deserialize_and_reduce(tk, sha_out, SHA512_OUTPUT_BYTES, &goldi_q448); /* 4[nonce]G */ - uint8_t signature_tmp[56]; + uint8_t signature_tmp[GOLDI_FIELD_BYTES]; struct tw_extensible_t exta; struct p448_t gsk; - scalarmul_fixed_base(&exta, tk, 448, &goldilocks_global.fixed_base); + scalarmul_fixed_base(&exta, tk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base); double_tw_extensible(&exta); untwist_and_double_and_serialize(&gsk, &exta); p448_serialize(signature_tmp, &gsk); - word_t challenge[448/WORD_BITS]; - sha512_update(&ctx, &privkey->opaque[56], 56); - sha512_update(&ctx, signature_tmp, 56); - sha512_update(&ctx, message, message_len); - sha512_final(&ctx, sha_out); - barrett_deserialize_and_reduce(challenge, sha_out, 512/8, &goldi_q448); + word_t challenge[GOLDI_FIELD_WORDS]; + goldilocks_derive_challenge ( + challenge, + &privkey->opaque[GOLDI_FIELD_BYTES], + signature_tmp, + message, + message_len + ); // reduce challenge and sub. - barrett_negate(challenge,448/WORD_BITS,&goldi_q448); + barrett_negate(challenge,GOLDI_FIELD_WORDS,&goldi_q448); barrett_mac( - tk,448/WORD_BITS, - challenge,448/WORD_BITS, - skw,448/WORD_BITS, + tk,GOLDI_FIELD_WORDS, + challenge,GOLDI_FIELD_WORDS, + skw,GOLDI_FIELD_WORDS, &goldi_q448 ); - word_t carry = add_nr_ext_packed(tk,tk,448/WORD_BITS,tk,448/WORD_BITS,-1); - barrett_reduce(tk,448/WORD_BITS,carry,&goldi_q448); + word_t carry = add_nr_ext_packed(tk,tk,GOLDI_FIELD_WORDS,tk,GOLDI_FIELD_WORDS,-1); + barrett_reduce(tk,GOLDI_FIELD_WORDS,carry,&goldi_q448); - memcpy(signature_out, signature_tmp, 56); - barrett_serialize(signature_out+56, tk, 448/8); + memcpy(signature_out, signature_tmp, GOLDI_FIELD_BYTES); + barrett_serialize(signature_out+GOLDI_FIELD_BYTES, tk, GOLDI_FIELD_BYTES); memset((unsigned char *)tk,0,sizeof(tk)); memset((unsigned char *)skw,0,sizeof(skw)); memset((unsigned char *)challenge,0,sizeof(challenge)); @@ -342,7 +445,7 @@ goldilocks_sign ( int goldilocks_verify ( - const uint8_t signature[56*2], + const uint8_t signature[GOLDI_SIGNATURE_BYTES], const uint8_t *message, uint64_t message_len, const struct goldilocks_public_key_t *pubkey @@ -352,24 +455,16 @@ goldilocks_verify ( } struct p448_t pk; - word_t s[448/WORD_BITS]; + word_t s[GOLDI_FIELD_WORDS]; mask_t succ = p448_deserialize(&pk,pubkey->opaque); if (!succ) return GOLDI_EINVAL; - succ = barrett_deserialize(s, &signature[56], &goldi_q448); + succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &goldi_q448); if (!succ) return GOLDI_EINVAL; - /* challenge = H(pk, [nonceG], message). 
*/ - unsigned char sha_out[512/8]; - word_t challenge[448/WORD_BITS]; - struct sha512_ctx_t ctx; - sha512_init(&ctx); - sha512_update(&ctx, pubkey->opaque, 56); - sha512_update(&ctx, signature, 56); - sha512_update(&ctx, message, message_len); - sha512_final(&ctx, sha_out); - barrett_deserialize_and_reduce(challenge, sha_out, 512/8, &goldi_q448); + word_t challenge[GOLDI_FIELD_WORDS]; + goldilocks_derive_challenge(challenge, pubkey->opaque, signature, message, message_len); struct p448_t eph; struct tw_extensible_t pk_text; @@ -381,7 +476,102 @@ goldilocks_verify ( succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk); if (!succ) return GOLDI_EINVAL; - linear_combo_var_fixed_vt( &pk_text, challenge, 446, s, 446, goldilocks_global.wnafs, 5 ); + linear_combo_var_fixed_vt( &pk_text, + challenge, GOLDI_SCALAR_BITS, + s, GOLDI_SCALAR_BITS, + goldilocks_global.wnafs, 5 ); + + untwist_and_double_and_serialize( &pk, &pk_text ); + p448_sub(&eph, &eph, &pk); + p448_bias(&eph, 2); + + succ = p448_is_zero(&eph); + + return succ ? 0 : GOLDI_EINVAL; +} + +#if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS + +struct goldilocks_precomputed_public_key_t * +goldilocks_precompute_public_key ( + const struct goldilocks_public_key_t *pub +) { + struct goldilocks_precomputed_public_key_t *precom; + precom = (struct goldilocks_precomputed_public_key_t *) + malloc(sizeof(*precom)); + + if (!precom) return NULL; + + struct tw_extensible_t pk_text; + + struct p448_t pk; + mask_t succ = p448_deserialize(&pk, pub->opaque); + if (!succ) { + free(precom); + return NULL; + } + + succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk); + if (!succ) { + free(precom); + return NULL; + } + + int big = USE_BIG_TABLES; + uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14; + + succ = precompute_fixed_base(&precom->table, &pk_text, n, t, s, NULL); + if (!succ) { + free(precom); + return NULL; + } + + memcpy(&precom->pub,pub,sizeof(*pub)); + + return precom; +} + +void +goldilocks_destroy_precomputed_public_key ( + struct goldilocks_precomputed_public_key_t *precom +) { + if (!precom) return; + destroy_fixed_base(&precom->table); + memset(&precom->pub.opaque, 0, sizeof(precom->pub)); + free(precom); +} + +int +goldilocks_verify_precomputed ( + const uint8_t signature[GOLDI_SIGNATURE_BYTES], + const uint8_t *message, + uint64_t message_len, + const struct goldilocks_precomputed_public_key_t *pubkey +) { + if (!goldilocks_check_init()) { + return GOLDI_EUNINIT; + } + + word_t s[GOLDI_FIELD_WORDS]; + mask_t succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &goldi_q448); + if (!succ) return GOLDI_EINVAL; + + word_t challenge[GOLDI_FIELD_WORDS]; + goldilocks_derive_challenge(challenge, pubkey->pub.opaque, signature, message, message_len); + + struct p448_t eph, pk; + struct tw_extensible_t pk_text; + + /* deserialize [nonce]G */ + succ = p448_deserialize(&eph, signature); + if (!succ) return GOLDI_EINVAL; + + succ = linear_combo_combs_vt ( + &pk_text, + challenge, GOLDI_SCALAR_BITS, &pubkey->table, + s, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base + ); + if (!succ) return GOLDI_EINVAL; untwist_and_double_and_serialize( &pk, &pk_text ); p448_sub(&eph, &eph, &pk); @@ -391,3 +581,20 @@ goldilocks_verify ( return succ ? 
0 : GOLDI_EINVAL; } + +int +goldilocks_shared_secret_precomputed ( + uint8_t shared[GOLDI_SHARED_SECRET_BYTES], + const struct goldilocks_private_key_t *my_privkey, + const struct goldilocks_precomputed_public_key_t *your_pubkey +) { + return goldilocks_shared_secret_core( + shared, + my_privkey, + &your_pubkey->pub, + your_pubkey + ); +} + +#endif // GOLDI_IMPLEMENT_PRECOMPUTED_KEYS + diff --git a/src/include/intrinsics.h b/src/include/intrinsics.h index 02a8a1e..1dac686 100644 --- a/src/include/intrinsics.h +++ b/src/include/intrinsics.h @@ -12,7 +12,9 @@ #include +#if __i386__ || __x86_64__ #include +#endif #define INTRINSIC \ static __inline__ __attribute__((__gnu_inline__, __always_inline__, unused)) diff --git a/src/include/scalarmul.h b/src/include/scalarmul.h index 122fccc..8b42fd7 100644 --- a/src/include/scalarmul.h +++ b/src/include/scalarmul.h @@ -26,7 +26,7 @@ struct fixed_base_table_t { struct tw_niels_t *table; /** Adjustments to the scalar in even and odd cases, respectively. */ - word_t scalar_adjustments[2*(448/WORD_BITS)]; + word_t scalar_adjustments[2*(448/WORD_BITS)]; /* MAGIC */ /** The number of combs in the table. */ unsigned int n; @@ -103,7 +103,7 @@ montgomery_ladder ( void scalarmul ( struct tw_extensible_t *working, - const word_t scalar[448/WORD_BITS] + const word_t scalar[448/WORD_BITS] /* MAGIC */ /* TODO? int nbits */ ); @@ -124,7 +124,7 @@ scalarmul ( void scalarmul_vlook ( struct tw_extensible_t *working, - const word_t scalar[448/WORD_BITS] + const word_t scalar[448/WORD_BITS] /* MAGIC */ /* TODO? int nbits */ ); @@ -209,7 +209,7 @@ scalarmul_fixed_base ( void scalarmul_vt ( struct tw_extensible_t *working, - const word_t scalar[448/WORD_BITS] + const word_t scalar[448/WORD_BITS] /* MAGIC */ ); @@ -274,14 +274,42 @@ scalarmul_fixed_base_wnaf_vt ( void linear_combo_var_fixed_vt ( struct tw_extensible_t *working, - const word_t scalar_var[448/WORD_BITS], + const word_t scalar_var[448/WORD_BITS], /* MAGIC */ unsigned int nbits_var, - const word_t scalar_pre[448/WORD_BITS], + const word_t scalar_pre[448/WORD_BITS], /* MAGIC */ unsigned int nbits_pre, const struct tw_niels_t *precmp, unsigned int table_bits_pre ); +/** + * Variable-time scalar linear combination of two fixed points. + * + * @warning This function takes variable time. It is intended for + * signature verification. + * + * @param [out] working The output point. + * @param [in] scalar1 The first scalar. + * @param [in] nbits1 The number of bits in the first scalar. + * @param [in] table1 The first precomputed table. + * @param [in] scalar2 The second scalar. + * @param [in] nbits1 The number of bits in the second scalar. + * @param [in] table1 The second precomputed table. + * + * @retval MASK_SUCCESS Success. + * @retval MASK_FAILURE Failure, because eg the tables are too small. + */ +mask_t +linear_combo_combs_vt ( + struct tw_extensible_t *out, + const word_t scalar1[448/WORD_BITS], + unsigned int nbits1, + const struct fixed_base_table_t *table1, + const word_t scalar2[448/WORD_BITS], + unsigned int nbits2, + const struct fixed_base_table_t *table2 +); + #ifdef __cplusplus }; #endif diff --git a/src/include/sha512.h b/src/include/sha512.h index cad1588..760e31e 100644 --- a/src/include/sha512.h +++ b/src/include/sha512.h @@ -10,6 +10,8 @@ extern "C" { #endif +#define SHA512_OUTPUT_BYTES 64 + /** * SHA512 hashing context. 
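 *
 * Typical use, matching how goldilocks.c drives it (a sketch; msg/msg_len
 * stand in for whatever is being hashed):
 *
 *   struct sha512_ctx_t ctx;
 *   uint8_t digest[SHA512_OUTPUT_BYTES];
 *   sha512_init(&ctx);
 *   sha512_update(&ctx, msg, msg_len);
 *   sha512_final(&ctx, digest);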
* @@ -37,7 +39,7 @@ sha512_update ( void sha512_final ( struct sha512_ctx_t *ctx, - uint8_t result[64] + uint8_t result[SHA512_OUTPUT_BYTES] ); #ifdef __cplusplus diff --git a/src/include/word.h b/src/include/word.h index 0fc7427..c638785 100644 --- a/src/include/word.h +++ b/src/include/word.h @@ -8,11 +8,22 @@ /* for posix_memalign */ #define _XOPEN_SOURCE 600 +#ifndef __APPLE__ +#define _BSD_SOURCE +#include +#endif + #include #include #include #include +#if __ARM_NEON__ +#include +#elif __SSE2__ +#include +#endif + #if (__SIZEOF_INT128__ == 16 && __SIZEOF_SIZE_T__ == 8 && (__SIZEOF_LONG__==8 || __POINTER_WIDTH__==64) && !GOLDI_FORCE_32_BIT) /* It's a 64-bit machine if: * // limits.h thinks so @@ -33,6 +44,7 @@ typedef __int128_t dsword_t; #define PRIxWORD58 "%014" PRIx64 #define U64LE(x) x##ull #define U58LE(x) x##ull +#define letohWORD letoh64 #else typedef uint16_t hword_t; typedef uint32_t word_t; @@ -45,15 +57,19 @@ typedef int64_t dsword_t; #define PRIxWORD58 "%07" PRIx32 #define U64LE(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32 #define U58LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28 +#define letohWORD letoh32 #endif #define WORD_BITS (sizeof(word_t) * 8) -/* TODO: vector width for procs like ARM; gcc support */ -typedef word_t mask_t, vecmask_t __attribute__((ext_vector_type(4))); - +typedef word_t mask_t; static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -1; + + +#ifdef __ARM_NEON__ +typedef uint32x4_t vecmask_t; +#else /* FIXME this only works on clang */ typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2))); typedef int64_t int64x2_t __attribute__((ext_vector_type(2))); @@ -61,8 +77,13 @@ typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4))); typedef int64_t int64x4_t __attribute__((ext_vector_type(4))); typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4))); typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); +typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2))); +typedef int32_t int32x2_t __attribute__((ext_vector_type(2))); typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8))); typedef int32_t int32x8_t __attribute__((ext_vector_type(8))); +/* TODO: vector width for procs like ARM; gcc support */ +typedef word_t vecmask_t __attribute__((ext_vector_type(4))); +#endif #if __AVX2__ typedef uint32x8_t big_register_t; @@ -82,11 +103,28 @@ typedef uint32_t big_register_t; #endif -#if __AVX2__ || __SSE2__ || __ARM_NEON__ +#ifdef __ARM_NEON__ +static __inline__ big_register_t +br_set_to_mask(mask_t x) { + return vdupq_n_u32(x); +} +#else +static __inline__ big_register_t +br_set_to_mask(mask_t x) { + return (big_register_t)x; +} +#endif + +#if __AVX2__ || __SSE2__ static __inline__ big_register_t br_is_zero(big_register_t x) { return (big_register_t)(x == (big_register_t)0); } +#elif __ARM_NEON__ +static __inline__ big_register_t +br_is_zero(big_register_t x) { + return vceqq_u32(x,x^x); +} #else static __inline__ mask_t br_is_zero(word_t x) { @@ -96,6 +134,22 @@ br_is_zero(word_t x) { + +#ifdef __APPLE__ +static inline uint64_t +htobe64 (uint64_t x) { + __asm__ ("bswapq %0" : "+r"(x)); + return x; +} +static inline uint64_t +htole64 (uint64_t x) { return x; } + +static inline uint64_t +letoh64 (uint64_t x) { return x; } +#endif + + + /** * Allocate memory which is sufficiently aligned to be used for the * largest vector on the system (for now that's a big_register_t). 
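 * A minimal sketch of the shape such a helper can take (names here are
 * illustrative, not the library's; it leans on the posix_memalign that the
 * _XOPEN_SOURCE define at the top of this header asks for):
 *
 *   static inline void *vector_aligned_malloc(size_t size) {
 *       size_t align = sizeof(big_register_t);
 *       if (align < sizeof(void *)) align = sizeof(void *); /* posix_memalign minimum */
 *       void *out = NULL;
 *       if (posix_memalign(&out, align, size)) return NULL;
 *       return out;
 *   }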
diff --git a/src/scalarmul.c b/src/scalarmul.c index 1ad856c..89891db 100644 --- a/src/scalarmul.c +++ b/src/scalarmul.c @@ -63,14 +63,14 @@ cond_negate_tw_pniels ( cond_negate_tw_niels(&n->n, doNegate); } -void +static __inline__ void constant_time_lookup_tw_pniels ( struct tw_pniels_t *out, const struct tw_pniels_t *in, int nin, int idx ) { - big_register_t big_one = 1, big_i = idx; + big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx); big_register_t *o = (big_register_t *)out; const big_register_t *i = (const big_register_t *)in; int j; @@ -85,14 +85,14 @@ constant_time_lookup_tw_pniels ( } } -static __inline__ void +static __inline__ void constant_time_lookup_tw_niels ( struct tw_niels_t *out, const struct tw_niels_t *in, int nin, int idx ) { - big_register_t big_one = 1, big_i = idx; + big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx); big_register_t *o = (big_register_t *)out; const big_register_t *i = (const big_register_t *)in; int j; @@ -139,64 +139,73 @@ scalarmul ( struct tw_extensible_t *working, const word_t scalar[448/WORD_BITS] ) { - - const int nbits=448; /* HACK? */ + const int nbits=450; /* MAGIC */ word_t prepared_data[448*2/WORD_BITS] = { - U64LE(0x9595b847fdf73126), - U64LE(0x9bb9b8a856af5200), - U64LE(0xb3136e22f37d5c4f), - U64LE(0x0000000189a19442), + + U64LE(0xebec9967f5d3f5c2), + U64LE(0x0aa09b49b16c9a02), + U64LE(0x7f6126aec172cd8e), + U64LE(0x00000007b027e54d), U64LE(0x0000000000000000), U64LE(0x0000000000000000), U64LE(0x4000000000000000), - - U64LE(0x721cf5b5529eec33), - U64LE(0x7a4cf635c8e9c2ab), - U64LE(0xeec492d944a725bf), - U64LE(0x000000020cd77058), + + U64LE(0xc873d6d54a7bb0cf), + U64LE(0xe933d8d723a70aad), + U64LE(0xbb124b65129c96fd), + U64LE(0x00000008335dc163), U64LE(0x0000000000000000), U64LE(0x0000000000000000), U64LE(0x0000000000000000) - }; /* TODO: split off */ + }; /* MAGIC */ word_t scalar2[448/WORD_BITS]; convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS); + + const int WINDOW = 5, /* MAGIC */ + WINDOW_MASK = (1<> 1, + NTABLE = 1<<(WINDOW-1); struct tw_extensible_t tabulator; copy_tw_extensible(&tabulator, working); double_tw_extensible(&tabulator); - struct tw_pniels_t pn, multiples[8]; + struct tw_pniels_t pn, multiples[NTABLE]; convert_tw_extensible_to_tw_pniels(&pn, &tabulator); convert_tw_extensible_to_tw_pniels(&multiples[0], working); - int i; - for (i=1; i<8; i++) { + int i,j; + for (i=1; i> (i%WORD_BITS) & 0xF, - inv = (bits>>3)-1; + i = nbits - WINDOW; + int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & WINDOW_MASK, + inv = (bits>>(WINDOW-1))-1; bits ^= inv; - constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7); + constant_time_lookup_tw_pniels(&pn, multiples, NTABLE, bits & WINDOW_T_MASK); cond_negate_tw_pniels(&pn, inv); convert_tw_pniels_to_tw_extensible(working, &pn); - for (i-=4; i>=0; i-=4) { - double_tw_extensible(working); - double_tw_extensible(working); - double_tw_extensible(working); - double_tw_extensible(working); + for (i-=WINDOW; i>=0; i-=WINDOW) { + for (j=0; j> (i%WORD_BITS) & 0xF; - inv = (bits>>3)-1; + bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS); + + if (i/WORD_BITS < 448/WORD_BITS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) { + bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS)); + } + + bits &= WINDOW_MASK; + inv = (bits>>(WINDOW-1))-1; bits ^= inv; - constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7); + constant_time_lookup_tw_pniels(&pn, multiples, NTABLE, bits & WINDOW_T_MASK); cond_negate_tw_pniels(&pn, inv); 
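+        /* pn now holds the signed window digit times the base point: the top
+         * window bit was the sign (inv is all-ones when it is clear, which
+         * complemented the index bits and drove the branch-free negation
+         * above); accumulate the digit into the running point. */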
add_tw_pniels_to_tw_extensible(working, &pn); } @@ -207,81 +216,89 @@ scalarmul_vlook ( struct tw_extensible_t *working, const word_t scalar[448/WORD_BITS] ) { - - const int nbits=448; /* HACK? */ + const int nbits=450; /* HACK? */ word_t prepared_data[448*2/WORD_BITS] = { - U64LE(0x9595b847fdf73126), - U64LE(0x9bb9b8a856af5200), - U64LE(0xb3136e22f37d5c4f), - U64LE(0x0000000189a19442), + + U64LE(0xebec9967f5d3f5c2), + U64LE(0x0aa09b49b16c9a02), + U64LE(0x7f6126aec172cd8e), + U64LE(0x00000007b027e54d), U64LE(0x0000000000000000), U64LE(0x0000000000000000), U64LE(0x4000000000000000), - - U64LE(0x721cf5b5529eec33), - U64LE(0x7a4cf635c8e9c2ab), - U64LE(0xeec492d944a725bf), - U64LE(0x000000020cd77058), + + U64LE(0xc873d6d54a7bb0cf), + U64LE(0xe933d8d723a70aad), + U64LE(0xbb124b65129c96fd), + U64LE(0x00000008335dc163), U64LE(0x0000000000000000), U64LE(0x0000000000000000), U64LE(0x0000000000000000) - }; /* TODO: split off */ + }; /* MAGIC: split off */ word_t scalar2[448/WORD_BITS]; convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS); + + const int WINDOW = 5, /* MAGIC */ + WINDOW_MASK = (1<> 1, + NTABLE = 1<<(WINDOW-1); struct tw_extensible_t tabulator; copy_tw_extensible(&tabulator, working); double_tw_extensible(&tabulator); - struct tw_pniels_t pn, multiples[8]; + struct tw_pniels_t pn, multiples[NTABLE]; convert_tw_extensible_to_tw_pniels(&pn, &tabulator); convert_tw_extensible_to_tw_pniels(&multiples[0], working); - int i; - for (i=1; i<8; i++) { + int i,j; + for (i=1; i> (i%WORD_BITS) & 0xF, - inv = (bits>>3)-1; + i = nbits - WINDOW; + int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & WINDOW_MASK, + inv = (bits>>(WINDOW-1))-1; bits ^= inv; - copy_tw_pniels(&pn, &multiples[bits&7]); + copy_tw_pniels(&pn, &multiples[bits & WINDOW_T_MASK]); cond_negate_tw_pniels(&pn, inv); convert_tw_pniels_to_tw_extensible(working, &pn); - for (i-=4; i>=0; i-=4) { - double_tw_extensible(working); - double_tw_extensible(working); - double_tw_extensible(working); - double_tw_extensible(working); + for (i-=WINDOW; i>=0; i-=WINDOW) { + for (j=0; j> (i%WORD_BITS) & 0xF; - inv = (bits>>3)-1; + bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS); + + if (i/WORD_BITS < 448/WORD_BITS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) { + bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS)); + } + + bits &= WINDOW_MASK; + inv = (bits>>(WINDOW-1))-1; bits ^= inv; - copy_tw_pniels(&pn, &multiples[bits&7]); + copy_tw_pniels(&pn, &multiples[bits & WINDOW_T_MASK]); cond_negate_tw_pniels(&pn, inv); add_tw_pniels_to_tw_extensible(working, &pn); } } - -mask_t -scalarmul_fixed_base ( - struct tw_extensible_t *out, - const word_t scalar[448/WORD_BITS], +static mask_t +schedule_scalar_for_combs ( + word_t *scalar2, + const word_t *scalar, unsigned int nbits, const struct fixed_base_table_t *table ) { + unsigned int i; unsigned int n = table->n, t = table->t, s = table->s; - assert(n >= 1 && t >= 1 && s >= 1); - if (n*t*s < nbits) { + if (n*t*s < nbits || n < 1 || t < 1 || s < 1) { return MASK_FAILURE; } @@ -289,10 +306,9 @@ scalarmul_fixed_base ( scalar2_words = scalar_words; if (scalar2_words < 448 / WORD_BITS) scalar2_words = 448 / WORD_BITS; - word_t scalar2[scalar2_words], scalar3[scalar2_words]; + word_t scalar3[scalar2_words]; /* Copy scalar to scalar3, but clear its high bits (if there are any) */ - unsigned int i,j,k; for (i=0; iscalar_adjustments , 448 / WORD_BITS ); + return MASK_SUCCESS; +} + +mask_t +scalarmul_fixed_base ( + struct tw_extensible_t *out, + const word_t scalar[448/WORD_BITS], 
+ unsigned int nbits, + const struct fixed_base_table_t *table +) { + unsigned int i,j,k; + unsigned int n = table->n, t = table->t, s = table->s; + + unsigned int scalar2_words = (nbits + WORD_BITS - 1)/WORD_BITS; + if (scalar2_words < 448 / WORD_BITS) scalar2_words = 448 / WORD_BITS; + + word_t scalar2[scalar2_words]; + + mask_t succ = schedule_scalar_for_combs(scalar2, scalar, nbits, table); + if (!succ) return MASK_FAILURE; + +#ifdef __clang_analyzer__ + assert(t >= 1); +#endif + struct tw_niels_t ni; for (i=0; is, s2 = table2->s, smax = (s1 > s2) ? s1 : s2; + + unsigned int scalar1b_words = (nbits1 + WORD_BITS - 1)/WORD_BITS; + if (scalar1b_words < 448 / WORD_BITS) scalar1b_words = 448 / WORD_BITS; + + unsigned int scalar2b_words = (nbits2 + WORD_BITS - 1)/WORD_BITS; + if (scalar2b_words < 448 / WORD_BITS) scalar2b_words = 448 / WORD_BITS; + + word_t scalar1b[scalar1b_words], scalar2b[scalar2b_words]; + + /* Schedule the scalars */ + mask_t succ; + succ = schedule_scalar_for_combs(scalar1b, scalar1, nbits1, table1); + if (!succ) return MASK_FAILURE; + + succ = schedule_scalar_for_combs(scalar2b, scalar2, nbits2, table2); + if (!succ) return MASK_FAILURE; + +#ifdef __clang_analyzer__ + assert(table1->t >= 1); + assert(table2->t >= 1); +#endif + + struct tw_niels_t ni; + + unsigned int swords[2] = {scalar1b_words, scalar2b_words}, started = 0; + word_t *scalars[2] = {scalar1b,scalar2b}; + + for (i=0; is; + if (ii < 0) continue; + assert(ii < (int)table->s); + + for (j=0; jn; j++) { + + int tab = 0; + + for (k=0; kt; k++) { + unsigned int bit = (table->s-1-ii) + k*table->s + j*(table->s*table->t); + if (bit < swords[sc] * WORD_BITS) { + tab |= (scalars[sc][bit/WORD_BITS] >> (bit%WORD_BITS) & 1) << k; + } + } + + mask_t invert = (tab>>(table->t-1))-1; + tab ^= invert; + tab &= (1<<(table->t-1)) - 1; + + copy_tw_niels(&ni, &table->table[tab + (j<<(table->t-1))]); + cond_negate_tw_niels(&ni,invert); + + if (started) { + add_tw_niels_to_tw_extensible(out, &ni); + } else { + convert_tw_niels_to_tw_extensible(out, &ni); + started = 1; + } + + } + } + + assert(started); + } + + return MASK_SUCCESS; +} + + mask_t precompute_fixed_base ( struct fixed_base_table_t *out, @@ -354,7 +479,7 @@ precompute_fixed_base ( unsigned int s, struct tw_niels_t *prealloc ) { - if (s < 1 || t < 1 || n < 1 || n*t*s < 446) { + if (s < 1 || t < 1 || n < 1 || n*t*s < 446) { /* MAGIC */ memset(out, 0, sizeof(*out)); return 0; } @@ -402,7 +527,7 @@ precompute_fixed_base ( adjustment[(n*t*s) / WORD_BITS] += ((word_t)1) << ((n*t*s) % WORD_BITS); - /* FIXME: factor out somehow */ + /* MAGIC: factor out somehow */ const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = { U64LE(0xdc873d6d54a7bb0d), U64LE(0xde933d8d723a70aa), @@ -462,13 +587,13 @@ precompute_fixed_base ( /* Gray-code phase */ for (j=0;; j++) { int gray = j ^ (j>>1); - int idx = ((i+1)<<(t-1))-1 ^ gray; + int idx = (((i+1)<<(t-1))-1) ^ gray; convert_tw_extensible_to_tw_pniels(&pn_tmp, &start); copy_tw_niels(&table[idx], &pn_tmp.n); p448_copy(&zs[idx], &pn_tmp.z); - if (j >= (1<<(t-1)) - 1) break; + if (j >= (1u<<(t-1)) - 1) break; int delta = (j+1) ^ ((j+1)>>1) ^ gray; for (k=0; delta>1; k++) @@ -777,7 +902,7 @@ linear_combo_var_fixed_vt( const struct tw_niels_t *precmp, unsigned int table_bits_pre ) { - const int table_bits_var = 3; + const int table_bits_var = 4; struct smvt_control control_var[nbits_var/(table_bits_var+1)+3]; struct smvt_control control_pre[nbits_pre/(table_bits_pre+1)+3]; diff --git a/src/sha512.c b/src/sha512.c index dd1468b..3e46287 
100644 --- a/src/sha512.c +++ b/src/sha512.c @@ -2,12 +2,8 @@ * Copyright (c) 2014 Cryptography Research, Inc. * Released under the MIT License. See LICENSE.txt for license information. */ -#ifndef __APPLE__ -#define _BSD_SOURCE -#include -#endif - #include "sha512.h" +#include "word.h" #include #include @@ -20,14 +16,6 @@ rotate_r ( return (x >> d) | (x << (64-d)); } -#ifdef __APPLE__ -static inline uint64_t -htobe64 (uint64_t x) { - __asm__ ("bswapq %0" : "+r"(x)); - return x; -} -#endif - static const uint64_t sha512_init_state[8] = { 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, diff --git a/test/bench.c b/test/bench.c index b54488f..2e90e9a 100644 --- a/test/bench.c +++ b/test/bench.c @@ -17,23 +17,28 @@ #include "goldilocks.h" #include "sha512.h" -double now() { +static __inline__ void +ignore_result ( int result ) { + (void)result; +} + +static double now() { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec + tv.tv_usec/1000000.0; } -void p448_randomize( struct crandom_state_t *crand, struct p448_t *a ) { +static void p448_randomize( struct crandom_state_t *crand, struct p448_t *a ) { crandom_generate(crand, (unsigned char *)a, sizeof(*a)); p448_strong_reduce(a); } -void q448_randomize( struct crandom_state_t *crand, word_t sk[448/WORD_BITS] ) { +static void q448_randomize( struct crandom_state_t *crand, word_t sk[448/WORD_BITS] ) { crandom_generate(crand, (unsigned char *)sk, 448/8); } -void p448_print( const char *descr, const struct p448_t *a ) { +static void p448_print( const char *descr, const struct p448_t *a ) { p448_t b; p448_copy(&b, a); p448_strong_reduce(&b); @@ -45,17 +50,21 @@ void p448_print( const char *descr, const struct p448_t *a ) { printf("\n"); } -void p448_print_full( const char *descr, const struct p448_t *a ) { +static void __attribute__((unused)) +p448_print_full ( + const char *descr, + const struct p448_t *a +) { int j; printf("%s = 0x", descr); for (j=15; j>=0; j--) { printf("%02" PRIxWORD "_" PRIxWORD58 " ", - a->limb[j]>>28, a->limb[j]&(1<<28)-1); + a->limb[j]>>28, a->limb[j]&((1<<28)-1)); } printf("\n"); } -void q448_print( const char *descr, const word_t secret[448/WORD_BITS] ) { +static void q448_print( const char *descr, const word_t secret[448/WORD_BITS] ) { int j; printf("%s = 0x", descr); for (j=448/WORD_BITS-1; j>=0; j--) { @@ -295,7 +304,7 @@ int main(int argc, char **argv) { when = now(); for (i=0; i +#include +#include + +mpz_t mp_p448; + +static mask_t mpz_to_p448 ( + struct p448_t *out, + const mpz_t in +) { + uint8_t ser[56]; + mpz_t modded; + memset(ser,0,sizeof(ser)); + mpz_init(modded); + mpz_mod(modded, in, mp_p448); + mpz_export(ser, NULL, -1, 1, -1, 0, modded); + mask_t succ = p448_deserialize(out, ser); + return succ; +} + +static mask_t p448_assert_eq_gmp( + const char *descr, + const struct p448_t *x, + const mpz_t y, + float lowBound, + float highBound +) { + uint8_t xser[56], yser[56]; + mpz_t modded; + + memset(yser,0,sizeof(yser)); + + p448_serialize(xser, x); + + mpz_init(modded); + mpz_mod(modded, y, mp_p448); + mpz_export(yser, NULL, -1, 1, -1, 0, modded); + + unsigned int i; + for (i=0; ilimb[0]); i++) { + int bits = sizeof(x->limb[0]) * 448 / sizeof(*x); + word_t yardstick = (i==sizeof(*x)/sizeof(x->limb[0])/2) ? 
+ (1ull<limb[i] < yardstick * lowBound || x->limb[i] > yardstick * highBound) { + youfail(); + printf(" P448 limb %d -> " PRIxWORDfull " is out of bounds (%0.2f, %0.2f) for test %s (yardstick = " PRIxWORDfull ")\n", + i, x->limb[i], lowBound, highBound, descr, yardstick); + break; + } + } + + if (memcmp(xser,yser,56)) { + youfail(); + printf(" Failed arithmetic test %s\n", descr); + p448_print(" p448", x); + printf(" gmp = 0x"); + int j; + for (j=55; j>=0; j--) { + printf("%02x", yser[j]); + } + printf("\n"); + return MASK_FAILURE; + } + + mpz_clear(modded); + return MASK_SUCCESS; +} + +static mask_t test_add_sub ( + const mpz_t x, + const mpz_t y, + word_t word +) { + struct p448_t xx,yy,tt; + mpz_t t; + mask_t succ = MASK_SUCCESS; + succ = mpz_to_p448(&xx,x); + succ &= mpz_to_p448(&yy,y); + mpz_init(t); + + p448_add(&tt,&xx,&yy); + mpz_add(t,x,y); + succ &= p448_assert_eq_gmp("add",&tt,t,0,2.1); + + p448_sub(&tt,&xx,&yy); + p448_bias(&tt,2); + mpz_sub(t,x,y); + succ &= p448_assert_eq_gmp("sub",&tt,t,0,3.1); + + p448_copy(&tt,&xx); + p448_addw(&tt,word); + mpz_add_ui(t,x,word); + succ &= p448_assert_eq_gmp("addw",&tt,t,0,2.1); + + p448_copy(&tt,&xx); + p448_subw(&tt,word); + p448_bias(&tt,1); + mpz_sub_ui(t,x,word); + succ &= p448_assert_eq_gmp("subw",&tt,t,0,2.1); + + if (!succ) { + p448_print(" x", &xx); + p448_print(" y", &yy); + } + + mpz_clear(t); + + return succ; +} + +static mask_t test_mul_sqr ( + const mpz_t x, + const mpz_t y, + word_t word +) { + struct p448_t xx,yy,tt; + mpz_t t; + mask_t succ = MASK_SUCCESS; + succ = mpz_to_p448(&xx,x); + succ &= mpz_to_p448(&yy,y); + mpz_init(t); + + p448_mul(&tt,&xx,&yy); + mpz_mul(t,x,y); + succ &= p448_assert_eq_gmp("mul",&tt,t,0,1.1); + + p448_mulw(&tt,&xx,word); + mpz_mul_ui(t,x,word); + succ &= p448_assert_eq_gmp("mulw",&tt,t,0,1.1); + + p448_sqr(&tt,&xx); + mpz_mul(t,x,x); + succ &= p448_assert_eq_gmp("sqrx",&tt,t,0,1.1); + + p448_sqr(&tt,&yy); + mpz_mul(t,y,y); + succ &= p448_assert_eq_gmp("sqy",&tt,t,0,1.1); + + if (!succ) { + p448_print(" x", &xx); + p448_print(" y", &yy); + } + + mpz_clear(t); + + return succ; +} + +int test_arithmetic () { + int j, ntests = 100000; + + gmp_randstate_t state; + gmp_randinit_mt(state); + + uint8_t pser[56]; + for (j=0; j<56; j++) { + pser[j] = (j==28) ? 
0xFE : 0xFF; + } + mpz_init(mp_p448); + mpz_import(mp_p448, 56, -1, 1, -1, 0, pser); + + mpz_t x,y; + mpz_init(x); + mpz_init(y); + + mask_t succ = MASK_SUCCESS; + + int bits = sizeof(word_t) * 448 / sizeof(p448_t); + + for (j=0; j +#include +#include + +int test_goldilocks () { + const char *message1 = "hello world"; + const char *message2 = "Jello world"; + + unsigned char signature[GOLDI_SIGNATURE_BYTES]; + + unsigned char + ss12[GOLDI_SHARED_SECRET_BYTES], + ss21[GOLDI_SHARED_SECRET_BYTES], + ss21p[GOLDI_SHARED_SECRET_BYTES], + proto[GOLDI_SYMKEY_BYTES]; + + struct goldilocks_public_key_t pub, pub2; + struct goldilocks_private_key_t priv, priv2; + struct goldilocks_precomputed_public_key_t *pre = NULL; + + int i, ret, good = 1; + + ret = goldilocks_init(); + if (ret) { + youfail(); + printf(" Failed init.\n"); + } + + for (i=0; i<1000 && good; i++) { + + ret = goldilocks_keygen(&priv, &pub); + if (ret) { + youfail(); + printf(" Failed keygen trial %d.\n", i); + good = 0; + } + + goldilocks_destroy_precomputed_public_key( pre ); + pre = goldilocks_precompute_public_key ( &pub ); + if (!pre) { + youfail(); + printf(" Failed precomp-public trial %d.\n", i); + return -1; + } + + ret = goldilocks_sign( + signature, + (const unsigned char *)message1, + strlen(message1), + &priv + ); + if (ret) { + youfail(); + printf(" Failed sign trial %d.\n", i); + good = 0; + } + + ret = goldilocks_verify( + signature, + (const unsigned char *)message1, + strlen(message1), + &pub + ); + if (ret) { + youfail(); + printf(" Failed verify trial %d.\n", i); + good = 0; + } + + ret = goldilocks_verify_precomputed ( + signature, + (const unsigned char *)message1, + strlen(message1), + pre + ); + if (ret) { + youfail(); + printf(" Failed verify-pre trial %d.\n", i); + good = 0; + } + + /* terrible negative test */ + ret = goldilocks_verify( + signature, + (const unsigned char *)message2, + strlen(message1), + &pub + ); + if (ret != GOLDI_EINVAL) { + youfail(); + printf(" Failed nega-verify trial %d.\n", i); + good = 0; + } + ret = goldilocks_verify_precomputed( + signature, + (const unsigned char *)message2, + strlen(message1), + pre + ); + if (ret != GOLDI_EINVAL) { + youfail(); + printf(" Failed nega-verify-pre trial %d.\n", i); + good = 0; + } + + /* honestly a slightly better negative test */ + memset(signature,0,sizeof(signature)); + ret = goldilocks_verify( + signature, + (const unsigned char *)message1, + strlen(message1), + &pub + ); + if (ret != GOLDI_EINVAL) { + youfail(); + printf(" Failed nega-verify-0 trial %d.\n", i); + good = 0; + } + ret = goldilocks_verify_precomputed( + signature, + (const unsigned char *)message1, + strlen(message1), + pre + ); + if (ret != GOLDI_EINVAL) { + youfail(); + printf(" Failed nega-verify-pre-0 trial %d.\n", i); + good = 0; + } + + /* ecdh */ + ret = goldilocks_keygen(&priv2, &pub2); + if (ret) { + youfail(); + printf(" Failed keygen2 trial %d.\n", i); + good = 0; + } + + ret = goldilocks_shared_secret ( ss12, &priv, &pub2 ); + if (ret) { + youfail(); + printf(" Failed ss12 trial %d.\n", i); + good = 0; + } + + ret = goldilocks_shared_secret ( ss21, &priv2, &pub ); + if (ret) { + youfail(); + printf(" Failed ss21 trial %d.\n", i); + good = 0; + } + + ret = goldilocks_shared_secret_precomputed ( ss21p, &priv2, pre ); + if (ret) { + youfail(); + printf(" Failed ss21p trial %d.\n", i); + good = 0; + } + + if (memcmp(ss12,ss21,sizeof(ss12))) { + youfail(); + printf(" Failed shared-secret trial %d.\n", i); + good = 0; + } + + if (memcmp(ss21,ss21p,sizeof(ss21))) { + 
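+            /* the precomputed-key ECDH path and the Montgomery-ladder path
+             * disagreed; they are required to produce identical secrets */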
youfail(); + printf(" Failed shared-secret precomp trial %d.\n", i); + good = 0; + } + + /* test derive / underive / priv to pub */ + goldilocks_underive_private_key ( proto, &priv ); + ret = goldilocks_derive_private_key ( &priv2, proto ); + if (ret || memcmp(&priv,&priv2,sizeof(priv))) { + youfail(); + printf(" Failed derive round-trip trial %d.\n", i); + good = 0; + } + + ret = goldilocks_private_to_public ( &pub2, &priv ); + if (ret || memcmp(&pub,&pub2,sizeof(pub))) { + youfail(); + printf(" Failed private-to-public trial %d.\n", i); + good = 0; + } + + } + + goldilocks_destroy_precomputed_public_key( pre ); + + return good ? 0 : -1; +} diff --git a/test/test_scalarmul.c b/test/test_scalarmul.c index d98cfd8..d6f16c7 100644 --- a/test/test_scalarmul.c +++ b/test/test_scalarmul.c @@ -159,6 +159,92 @@ single_scalarmul_compatibility_test ( return ret; } +static int +single_linear_combo_test ( + const struct p448_t *base1, + const word_t *scalar1, + int nbits1, + const struct p448_t *base2, + const word_t *scalar2, + int nbits2 +) { + /* MAGIC */ + const struct p448_t + sqrt_d_minus_1 = {{ + U58LE(0xd2e21836749f46), + U58LE(0x888db42b4f0179), + U58LE(0x5a189aabdeea38), + U58LE(0x51e65ca6f14c06), + U58LE(0xa49f7b424d9770), + U58LE(0xdcac4628c5f656), + U58LE(0x49443b8748734a), + U58LE(0x12fec0c0b25b7a) + }}; + + struct tw_extensible_t text1, text2, working; + struct tw_pniels_t pn; + struct p448_t result_comb, result_combo, result_wnaf; + + mask_t succ = + deserialize_and_twist_approx(&text1, &sqrt_d_minus_1, base1) + & deserialize_and_twist_approx(&text2, &sqrt_d_minus_1, base2); + if (!succ) return 1; + + struct fixed_base_table_t t1, t2; + struct tw_niels_t wnaf[32]; + memset(&t1,0,sizeof(t1)); + memset(&t2,0,sizeof(t2)); + + succ = precompute_fixed_base(&t1, &text1, 5, 5, 18, NULL); + succ &= precompute_fixed_base(&t2, &text2, 6, 3, 25, NULL); + succ &= precompute_fixed_base_wnaf(wnaf, &text2, 5); + + if (!succ) { + destroy_fixed_base(&t1); + destroy_fixed_base(&t2); + return -1; + } + + /* use the dedicated wNAF linear combo algorithm */ + copy_tw_extensible(&working, &text1); + linear_combo_var_fixed_vt(&working, scalar1, nbits1, scalar2, nbits2, wnaf, 5); + untwist_and_double_and_serialize(&result_wnaf, &working); + + /* use the dedicated combs algorithm */ + succ &= linear_combo_combs_vt(&working, scalar1, nbits1, &t1, scalar2, nbits2, &t2); + untwist_and_double_and_serialize(&result_combo, &working); + + /* use two combs */ + succ &= scalarmul_fixed_base(&working, scalar1, nbits1, &t1); + convert_tw_extensible_to_tw_pniels(&pn, &working); + succ &= scalarmul_fixed_base(&working, scalar2, nbits2, &t2); + add_tw_pniels_to_tw_extensible(&working, &pn); + untwist_and_double_and_serialize(&result_comb, &working); + + mask_t consistent = MASK_SUCCESS; + consistent &= p448_eq(&result_combo, &result_wnaf); + consistent &= p448_eq(&result_comb, &result_wnaf); + + if (!succ || !consistent) { + youfail(); + printf(" Failed linear combo consistency test with nbits=%d,%d.\n",nbits1,nbits2); + + p448_print(" base1", base1); + scalar_print(" scal1", scalar1, (nbits1+WORD_BITS-1)/WORD_BITS); + p448_print(" base2", base2); + scalar_print(" scal2", scalar2, (nbits1+WORD_BITS-1)/WORD_BITS); + p448_print(" combs", &result_comb); + p448_print(" combo", &result_combo); + p448_print(" wNAFs", &result_wnaf); + return -1; + } + + destroy_fixed_base(&t1); + destroy_fixed_base(&t2); + + return 0; +} + /* 0 = succeed, 1 = inval, -1 = fail */ static int single_scalarmul_commutativity_test ( @@ -251,6 +337,49 
@@ int test_scalarmul_commutativity () { return 0; } +int test_linear_combo () { + int i,j,k,got; + + struct crandom_state_t crand; + crandom_init_from_buffer(&crand, "scalarmul_linear_combos_test RNG"); + + for (i=0; i<=448; i+=7) { + for (j=0; j<=448; j+=7) { + got = 0; + + for (k=0; k<128 && !got; k++) { + uint8_t ser[56]; + word_t scalar1[7], scalar2[7]; + crandom_generate(&crand, (uint8_t *)scalar1, sizeof(scalar1)); + crandom_generate(&crand, (uint8_t *)scalar2, sizeof(scalar2)); + + p448_t base1; + crandom_generate(&crand, ser, sizeof(ser)); + mask_t succ = p448_deserialize(&base1, ser); + if (!succ) continue; + + p448_t base2; + crandom_generate(&crand, ser, sizeof(ser)); + succ = p448_deserialize(&base2, ser); + if (!succ) continue; + + int ret = single_linear_combo_test (&base1, scalar1, i, &base2, scalar2, j); + got = !ret; + if (ret == -1) return -1; + } + + if (!got) { + youfail(); + printf(" Unlikely: rejected 128 scalars in a row.\n"); + return -1; + } + + } + } + + return 0; +} + int test_scalarmul_compatibility () { int i,j,k,got;