Internal changes which break compatibility with previous versions

(you knew this would happen). Added ARM NEON support. Added support for precomputation on public keys, which speeds up later signatures and ECDH calls. See history.txt or the doc for details. Reworked internals so that private keys can be derived from any 32-byte secret random value. This also means that secret keys can be "compressed" for cold storage. Added more tests. Running the tests now requires GMP, though Goldilocks itself does not. Linking now uses visibility instead of exported.sym.
11 years ago · d4085b9606
--- a/HISTORY.txt
+++ b/HISTORY.txt
@@ -1,3 +1,46 @@
 May 3, 2104:
    Minor changes to internal routines mean that this version is not
    compatible with the previous one.

    Added ARM NEON code.
    
    Added the ability to precompute multiples of a partner's public key.  This
    takes slightly longer than a signature verification, but reduces future
    verifications with the precomputed key by ~63% and ECDH by ~70%.
    
        goldilocks_precompute_public_key
        goldilocks_destroy_precomputed_public_key
        goldilocks_verify_precomputed
        goldilocks_shared_secret_precomputed
    
    The precomputation feature are is protected by a macro
        GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
    which can be #defined to 0 to compile these functions out.  Unlike most
    of Goldilocks' functions, goldilocks_precompute_public_key uses malloc()
    (and goldilocks_destroy_precomputed_public_key uses free()).
    
    Changed private keys to be derived from just the symmetric part.  This
    means that you can compress them to 32 bytes for cold storage, or derive
    keypairs from crypto secrets from other systems.
        goldilocks_derive_private_key
        goldilocks_underive_private_key
        goldilocks_private_to_public
    
    Fixed a number of bugs related to vector alignment on Sandy Bridge, which
    has AVX but uses SSE2 alignment (because it doesn't have AVX2).  Maybe I
    should just switch it to use AVX2 alignment?
    
    Beginning to factor out curve-specific magic, so as to build other curves
    with the Goldilocks framework.  That would enable fair tests against eg
    E-521, Ed25519 etc.  Still would be a lot of work.
    
    More thorough testing of arithmetic.  Now uses GMP for testing framework,
    but not in the actual library.
    
    Added some high-level tests for the whole library, including some (bs)
    negative testing.  Obviously, effective negative testing is a very difficult
    proposition in a crypto library.

 March 29, 2014:
    Added a test directory with various tests.  Currently testing SHA512 Monte
    Carlo, compatibility of the different scalarmul functions, and some
--- a/+ 54
+++ b/+ 54
@@ -1,23 +1,57 @@
 # Copyright (c) 2014 Cryptography Research, Inc.
 # Released under the MIT License.  See LICENSE.txt for license information.


 UNAME := $(shell uname)
 MACHINE := $(shell uname -m)

 ifeq ($(UNAME),Darwin)
 CC = clang
 LD = clang
 else
 CC = gcc
 endif
 LD = $(CC)

 ifneq (,$(findstring x86_64,$(MACHINE)))
 ARCH ?= arch_x86_64
 else
 # no i386 port yet
 ARCH ?= arch_arm_32
 endif

 ARCH = arch_x86_64

 WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
 	-Wgcc-compat -Wmissing-declarations
 	 -Wmissing-declarations -Wunused-function $(EXWARN)
 	 
 	 
 INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH)
 LANGFLAGS = -std=c99
 GENFLAGS = -ffunction-sections -fdata-sections -fomit-frame-pointer -fPIC
 GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC
 OFLAGS = -O3
 #XFLAGS = -DN_TESTS_BASE=1000
 ARCHFLAGS = -mssse3 -maes -mavx2 -DMUST_HAVE_AVX2 -mbmi2
 #ARCHFLAGS = -m32 -mcpu=cortex-a9 -mfpu=vfpv3-d16

 CFLAGS = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XFLAGS)
 LDFLAGS = $(ARCHFLAGS)
 ifneq (,$(findstring arm,$(MACHINE)))
 ifneq (,$(findstring neon,$(ARCH)))
 ARCHFLAGS += -mfpu=neon
 else
 ARCHFLAGS += -mfpu=vfpv3-d16
 endif
 ARCHFLAGS += -mcpu=cortex-a9 # FIXME
 GENFLAGS = -DN_TESTS_BASE=1000 # sooooo sloooooow
 else
 ARCHFLAGS += -mssse3 -maes -mavx -mavx2 -DMUST_HAVE_AVX2 -mbmi2 #TODO
 endif

 ifeq ($(CC),clang)
 WARNFLAGS += -Wgcc-compat
 endif

 ifeq (,$(findstring 64,$(ARCH))$(findstring gcc,$(CC)))
 # ARCHFLAGS += -m32
 ARCHFLAGS += -DGOLDI_FORCE_32_BIT=1
 endif

 CFLAGS  = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XCFLAGS)
 LDFLAGS = $(ARCHFLAGS) $(XLDFLAGS)
 ASFLAGS = $(ARCHFLAGS)

 .PHONY: clean all test bench todo doc lib
@@ -29,7 +63,7 @@ LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \
  build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o

 TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \
 	build/test_pointops.o
 	build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o

 BENCHCOMPONENTS=build/bench.o

@@ -45,15 +79,20 @@ build/bench: $(LIBCOMPONENTS) $(BENCHCOMPONENTS)
 	$(LD) $(LDFLAGS) -o $@ $^

 build/test: $(LIBCOMPONENTS) $(TESTCOMPONENTS)
 	$(LD) $(LDFLAGS) -o $@ $^
 	$(LD) $(LDFLAGS) -o $@ $^ -lgmp

 lib: build/goldilocks.so

 build/goldilocks.so: $(LIBCOMPONENTS)
 	rm -f $@
 ifeq ($(UNAME),Darwin)
 	libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \
 		  -exported_symbols_list src/exported.sym \
 		  $(LIBCOMPONENTS)
 else
 	$(LD) -shared -Wl,-soname,goldilocks.so.1 -Wl,--gc-sections -o $@ $(LIBCOMPONENTS)
 	strip --discard-all $@
 	ln -sf $@ build/goldilocks.so.1
 endif

 build/timestamp:
 	mkdir -p build
@@ -80,9 +119,9 @@ doc: Doxyfile doc/timestamp src/*.c src/include/*.h src/$(ARCH)/*.c src/$(ARCH)/

 todo::
 	@(find * -name '*.h'; find * -name '*.c') | xargs egrep --color=auto -w \
 		'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE'
 		'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE|MAGIC'
 	@echo '============================='
 	@(for i in FIXME BUG XXX TODO HACK PERF FUTURE REMOVE; do \
 	@(for i in FIXME BUG XXX TODO HACK PERF FUTURE REMOVE MAGIC; do \
 	  (find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i > /dev/null || continue; \
 	  /bin/echo -n $$i'       ' | head -c 10; \
 	  (find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i| wc -l; \
@@ -90,7 +129,7 @@ todo::
 	@echo '============================='
 	@echo -n 'Total     '
 	@(find * -name '*.h'; find * -name '*.c') | xargs egrep -w \
 		'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE' | wc -l
 		'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE|MAGIC' | wc -l

 bench: build/bench
 	./$<
--- a/include/goldilocks.h
+++ b/include/goldilocks.h
@@ -12,13 +12,42 @@

 #include <stdint.h>

 #ifndef GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
 /** If nonzero, implement precomputation for verify and ECDH. */
 #define GOLDI_IMPLEMENT_PRECOMPUTED_KEYS 1
 #endif

 /** The size of the Goldilocks field, in bits. */
 #define GOLDI_FIELD_BITS          448

 /** The size of the Goldilocks scalars, in bits. */
 #define GOLDI_SCALAR_BITS         446

 /** The same size, in bytes. */
 #define GOLDI_FIELD_BYTES         (GOLDI_FIELD_BITS/8)

 /** The size of a Goldilocks public key, in bytes. */
 #define GOLDI_PUBLIC_KEY_BYTES    GOLDI_FIELD_BYTES

 /** The extra bytes in a Goldilocks private key for the symmetric key. */
 #define GOLDI_SYMKEY_BYTES        32

 /** The size of a shared secret. */
 #define GOLDI_SHARED_SECRET_BYTES 64

 /** The size of a Goldilocks private key, in bytes. */
 #define GOLDI_PRIVATE_KEY_BYTES   (2*GOLDI_FIELD_BYTES + GOLDI_SYMKEY_BYTES)

 /** The size of a Goldilocks private key, in bytes. */
 #define GOLDI_SIGNATURE_BYTES     (2*GOLDI_FIELD_BYTES)

 /**
 * @brief Serialized form of a Goldilocks public key.
 *
 * @warning This isn't even my final form!
 */
 struct goldilocks_public_key_t {
    uint8_t opaque[56]; /**< Serialized data. */
    uint8_t opaque[GOLDI_PUBLIC_KEY_BYTES]; /**< Serialized data. */
 };

 /**
@@ -30,7 +59,7 @@ struct goldilocks_public_key_t {
 * @warning This isn't even my final form!
 */
 struct goldilocks_private_key_t {
    uint8_t opaque[144]; /**< Serialized data. */
    uint8_t opaque[GOLDI_PRIVATE_KEY_BYTES]; /**< Serialized data. */
 };

 #ifdef __cplusplus
@@ -72,7 +101,7 @@ static const int GOLDI_EALREADYINIT  = 44805;
 */
 int
 goldilocks_init ()
 __attribute__((warn_unused_result));
 __attribute__((warn_unused_result,visibility ("default")));


 /**
@@ -90,7 +119,40 @@ int
 goldilocks_keygen (
    struct goldilocks_private_key_t *privkey,
    struct goldilocks_public_key_t *pubkey
 ) __attribute__((warn_unused_result,nonnull(1,2)));
 ) __attribute__((warn_unused_result,nonnull(1,2),visibility ("default")));

 /**
 * @brief Derive a key from its compressed form.
 * @param [out] privkey The derived private key.
 * @param [in] proto The compressed or proto-key, which must be 32 random bytes.
 *
 * @warning This isn't even my final form!
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 */
 int
 goldilocks_derive_private_key (
    struct goldilocks_private_key_t *privkey,
    const unsigned char proto[GOLDI_SYMKEY_BYTES]
 ) __attribute__((nonnull(1,2),visibility ("default")));

 /**
 * @brief Compress a private key (by copying out the proto-key)
 * @param [out] proto The proto-key.
 * @param [in] privkey The private key.
 *
 * @warning This isn't even my final form!
 * @todo test.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 */
 void
 goldilocks_underive_private_key (
    unsigned char proto[GOLDI_SYMKEY_BYTES],
    const struct goldilocks_private_key_t *privkey
 ) __attribute__((nonnull(1,2),visibility ("default")));

 /**
 * @brief Extract the public key from a private key.
@@ -107,7 +169,7 @@ int
 goldilocks_private_to_public (
    struct goldilocks_public_key_t *pubkey,
    const struct goldilocks_private_key_t *privkey
 ) __attribute__((nonnull(1,2)));
 ) __attribute__((nonnull(1,2),visibility ("default")));

 /**
 * @brief Generate a Diffie-Hellman shared secret in constant time.
@@ -140,10 +202,10 @@ goldilocks_private_to_public (
 */
 int
 goldilocks_shared_secret (
    uint8_t shared[64],
    uint8_t shared[GOLDI_SHARED_SECRET_BYTES],
    const struct goldilocks_private_key_t *my_privkey,
    const struct goldilocks_public_key_t *your_pubkey
 ) __attribute__((warn_unused_result,nonnull(1,2,3)));
 ) __attribute__((warn_unused_result,nonnull(1,2,3),visibility ("default")));
    
 /**
 * @brief Sign a message.
@@ -166,11 +228,11 @@ goldilocks_shared_secret (
 */
 int
 goldilocks_sign (
    uint8_t signature_out[56*2],
    uint8_t signature_out[GOLDI_SIGNATURE_BYTES],
    const uint8_t *message,
    uint64_t message_len,
    const struct goldilocks_private_key_t *privkey
 ) __attribute__((nonnull(1,2,4)));
 ) __attribute__((nonnull(1,2,4),visibility ("default")));

 /**
 * @brief Verify a signature.
@@ -197,11 +259,108 @@ goldilocks_sign (
 */
 int
 goldilocks_verify (
    const uint8_t signature[56*2],
    const uint8_t signature[GOLDI_SIGNATURE_BYTES],
    const uint8_t *message,
    uint64_t message_len,
    const struct goldilocks_public_key_t *pubkey
 ) __attribute__((warn_unused_result,nonnull(1,2,4)));
 ) __attribute__((warn_unused_result,nonnull(1,2,4),visibility ("default")));

 #if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS

 /** A public key which has been expanded by precomputation for higher speed. */
 struct goldilocks_precomputed_public_key_t;

 /**
 * @brief Expand a public key by precomputation.
 *
 * @todo Give actual error returns, instead of ambiguous NULL.
 *
 * @warning This isn't even my final form!
 *
 * @param [in] pub The public key.
 * @retval NULL We ran out of memory, or the 
 */
 struct goldilocks_precomputed_public_key_t *
 goldilocks_precompute_public_key (
    const struct goldilocks_public_key_t *pub
 ) __attribute__((warn_unused_result,nonnull(1),visibility ("default")));

 /**
 * @brief Overwrite an expanded public key with zeros, then destroy it.
 *
 * If the input is NULL, this function does nothing.
 *
 * @param [in] precom The public key.
 */
 void
 goldilocks_destroy_precomputed_public_key (
    struct goldilocks_precomputed_public_key_t *precom
 ) __attribute__((visibility ("default")));

 /**
 * @brief Verify a signature.
 *
 * This function is fairly strict.  It will correctly detect when
 * the signature has the wrong cofactor component, or when the sig
 * values aren't less than p or q.
 *
 * @warning This isn't even my final form!
 *
 * @param [in] signature The signature.
 * @param [in] message The message to be verified.
 * @param [in] message_len The length of the message to be verified.
 * @param [in] pubkey The signer's public key, expanded by precomputation.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_EINVAL The public key or signature is corrupt.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 */
 int
 goldilocks_verify_precomputed (
   const uint8_t signature[GOLDI_SIGNATURE_BYTES],
   const uint8_t *message,
   uint64_t message_len,
   const struct goldilocks_precomputed_public_key_t *pubkey
 ) __attribute__((warn_unused_result,nonnull(1,2,4),visibility ("default")));
   
 /**
 * @brief Generate a Diffie-Hellman shared secret in constant time.
 * Uses a precomputation on the other party's public key for efficiency.
 *
 * This function uses some compile-time flags whose merit remains to
 * be decided.
 *
 * If the flag EXPERIMENT_ECDH_OBLITERATE_CT is set, prepend 40 bytes
 * of zeros to the secret before hashing.  In the case that the other
 * party's key is detectably corrupt, instead the symmetric part
 * of the secret key is used to produce a pseudorandom value.
 *
 * If EXPERIMENT_ECDH_STIR_IN_PUBKEYS is set, the sum and product of
 * the two parties' public keys is prepended to the hash.
 *
 * In the current version, this function can safely be run even without
 * goldilocks_init().  But this property is not guaranteed for future
 * versions, so call it anyway.
 *
 * @warning This isn't even my final form!
 *
 * @param [out] shared The shared secret established with the other party.
 * @param [in] my_privkey My private key.
 * @param [in] your_pubkey The other party's precomputed public key.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_ECORRUPT My key is corrupt.
 * @retval GOLDI_EINVAL   The other party's key is corrupt.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 */
 int
 goldilocks_shared_secret_precomputed (
   uint8_t shared[GOLDI_SHARED_SECRET_BYTES],
   const struct goldilocks_private_key_t *my_privkey,
   const struct goldilocks_precomputed_public_key_t *your_pubkey
 ) __attribute__((warn_unused_result,nonnull(1,2,3),visibility ("default")));

 #endif /* GOLDI_IMPLEMENT_PRECOMPUTED_KEYS */

 #ifdef __cplusplus
 }; /* extern "C" */
--- a/src/arch_arm_32/p448.c
+++ b/src/arch_arm_32/p448.c
@@ -28,6 +28,8 @@ smlal (
    const uint32_t a,
    const uint32_t b
 ) {

 #ifdef  __ARMEL__
    uint32_t lo = *acc, hi = (*acc)>>32;
    
    __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
@@ -35,6 +37,9 @@ smlal (
        : [a]"r"(a), [b]"r"(b));
    
    *acc = lo + (((uint64_t)hi)<<32);
 #else
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
 #endif
 }

 static inline void __attribute__((gnu_inline,always_inline))
@@ -43,6 +48,7 @@ smlal2 (
    const uint32_t a,
    const uint32_t b
 ) {
 #ifdef __ARMEL__
    uint32_t lo = *acc, hi = (*acc)>>32;
    
    __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
@@ -50,6 +56,9 @@ smlal2 (
        : [a]"r"(a), [b]"r"(2*b));
    
    *acc = lo + (((uint64_t)hi)<<32);
 #else
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2);
 #endif
 }

 static inline void __attribute__((gnu_inline,always_inline))
@@ -58,6 +67,7 @@ smull (
    const uint32_t a,
    const uint32_t b
 ) {
 #ifdef __ARMEL__
    uint32_t lo, hi;
    
    __asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]"
@@ -65,6 +75,9 @@ smull (
        : [a]"r"(a), [b]"r"(b));
    
    *acc = lo + (((uint64_t)hi)<<32);
 #else
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
 #endif
 }

 static inline void __attribute__((gnu_inline,always_inline))
@@ -73,6 +86,7 @@ smull2 (
    const uint32_t a,
    const uint32_t b
 ) {
 #ifdef __ARMEL__
    uint32_t lo, hi;
    
    __asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]"
@@ -80,6 +94,9 @@ smull2 (
        : [a]"r"(a), [b]"r"(2*b));
    
    *acc = lo + (((uint64_t)hi)<<32);
 #else
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2);
 #endif
 }

 void
@@ -760,13 +777,13 @@ p448_mulw (
    const p448_t *as,
    uint64_t b
 ) {
    const uint32_t bhi = b>>28, blo = b & (1<<28)-1;
    uint32_t mask = (1ull<<28)-1;  
    const uint32_t bhi = b>>28, blo = b & mask;
    
    const uint32_t *a = as->limb;
    uint32_t *c = cs->limb;

    uint64_t accum0, accum8;
    uint32_t mask = (1ull<<28)-1;  

    int i;

@@ -957,7 +974,7 @@ p448_deserialize (
        for (j=0; j<7; j++) {
            out |= ((uint64_t)serial[7*i+j])<<(8*j);
        }
        x->limb[2*i] = out & (1ull<<28)-1;
        x->limb[2*i] = out & ((1ull<<28)-1);
        x->limb[2*i+1] = out >> 28;
    }
    
--- a/src/arch_arm_32/p448.h
+++ b/src/arch_arm_32/p448.h
@@ -173,7 +173,7 @@ p448_set_ui (
    uint64_t x
 ) {
    int i;
    out->limb[0] = x & (1<<28)-1;
    out->limb[0] = x & ((1<<28)-1);
    out->limb[1] = x>>28;
    for (i=2; i<16; i++) {
      out->limb[i] = 0;
@@ -188,7 +188,11 @@ p448_cond_swap (
 ) {
    big_register_t *aa = (big_register_t*)a;
    big_register_t *bb = (big_register_t*)b;
 #if __ARM_NEON__
    big_register_t m = vdupq_n_u32(doswap);
 #else
    big_register_t m = doswap;
 #endif

    unsigned int i;
    for (i=0; i<sizeof(*a)/sizeof(*aa); i++) {
@@ -260,8 +264,12 @@ p448_cond_neg(
    struct p448_t negated;
    big_register_t *aa = (big_register_t *)a;
    big_register_t *nn = (big_register_t*)&negated;
 #if __ARM_NEON__
    big_register_t m = vdupq_n_u32(doNegate);
 #else
    big_register_t m = doNegate;
    
 #endif

    p448_neg(&negated, a);
    p448_bias(&negated, 2);
    
--- a/src/arch_neon/ec_point.c
+++ b/src/arch_neon/ec_point.c
@@ -0,0 +1,959 @@
 /**
 * @cond internal
 * @file ec_point.c
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @warning This file was automatically generated.
 */

 #include "ec_point.h"


 void
 p448_isr (
    struct p448_t*       a,
    const struct p448_t* x
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L1,     x );
    p448_mul  (   &L2,     x,   &L1 );
    p448_sqr  (   &L1,   &L2 );
    p448_mul  (   &L2,     x,   &L1 );
    p448_sqrn (   &L1,   &L2,     3 );
    p448_mul  (   &L0,   &L2,   &L1 );
    p448_sqrn (   &L1,   &L0,     3 );
    p448_mul  (   &L0,   &L2,   &L1 );
    p448_sqrn (   &L2,   &L0,     9 );
    p448_mul  (   &L1,   &L0,   &L2 );
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (   &L2,     x,   &L0 );
    p448_sqrn (   &L0,   &L2,    18 );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_sqrn (   &L0,   &L2,    37 );
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_sqrn (   &L0,   &L1,    37 );
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_sqrn (   &L0,   &L1,   111 );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_sqr  (   &L0,   &L2 );
    p448_mul  (   &L1,     x,   &L0 );
    p448_sqrn (   &L0,   &L1,   223 );
    p448_mul  (     a,   &L2,   &L0 );
 }

 void
 p448_inverse (
    struct p448_t*       a,
    const struct p448_t* x
 ) {
    struct p448_t L0, L1;
    p448_isr  (   &L0,     x );
    p448_sqr  (   &L1,   &L0 );
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (     a,     x,   &L0 );
 }

 void
 add_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 ) {
    struct p448_t L0, L1;
    p448_sub  (   &L1, &d->y, &d->x );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L0, &e->a,   &L1 );
    p448_add  (   &L1, &d->x, &d->y );
    p448_mul  ( &d->y, &e->b,   &L1 );
    p448_mul  (   &L1, &d->u, &d->t );
    p448_mul  ( &d->x, &e->c,   &L1 );
    p448_add  ( &d->u,   &L0, &d->y );
    p448_sub  ( &d->t, &d->y,   &L0 );
    p448_bias ( &d->t,     2 );
    p448_weak_reduce( &d->t );
    p448_sub  ( &d->y, &d->z, &d->x );
    p448_bias ( &d->y,     2 );
    p448_weak_reduce( &d->y );
    p448_add  (   &L0, &d->x, &d->z );
    p448_mul  ( &d->z,   &L0, &d->y );
    p448_mul  ( &d->x, &d->y, &d->t );
    p448_mul  ( &d->y,   &L0, &d->u );
 }

 void
 sub_tw_niels_from_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 ) {
    struct p448_t L0, L1;
    p448_sub  (   &L1, &d->y, &d->x );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L0, &e->b,   &L1 );
    p448_add  (   &L1, &d->x, &d->y );
    p448_mul  ( &d->y, &e->a,   &L1 );
    p448_mul  (   &L1, &d->u, &d->t );
    p448_mul  ( &d->x, &e->c,   &L1 );
    p448_add  ( &d->u,   &L0, &d->y );
    p448_sub  ( &d->t, &d->y,   &L0 );
    p448_bias ( &d->t,     2 );
    p448_weak_reduce( &d->t );
    p448_add  ( &d->y, &d->x, &d->z );
    p448_sub  (   &L0, &d->z, &d->x );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    p448_mul  ( &d->z,   &L0, &d->y );
    p448_mul  ( &d->x, &d->y, &d->t );
    p448_mul  ( &d->y,   &L0, &d->u );
 }

 void
 add_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 ) {
    struct p448_t L0;
    p448_mul  (   &L0, &e->z, &a->z );
    p448_copy ( &e->z,   &L0 );
    add_tw_niels_to_tw_extensible(     e, &a->n );
 }

 void
 sub_tw_pniels_from_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 ) {
    struct p448_t L0;
    p448_mul  (   &L0, &e->z, &a->z );
    p448_copy ( &e->z,   &L0 );
    sub_tw_niels_from_tw_extensible(     e, &a->n );
 }

 void
 double_tw_extensible (
    struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->x );
    p448_sqr  (   &L0, &a->y );
    p448_add  ( &a->u,   &L2,   &L0 );
    p448_add  ( &a->t, &a->y, &a->x );
    p448_sqr  (   &L1, &a->t );
    p448_sub  ( &a->t,   &L1, &a->u );
    p448_bias ( &a->t,     3 );
    p448_weak_reduce( &a->t );
    p448_sub  (   &L1,   &L0,   &L2 );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_sqr  ( &a->x, &a->z );
    p448_bias ( &a->x,     1 );
    p448_add  ( &a->z, &a->x, &a->x );
    p448_sub  (   &L0, &a->z,   &L1 );
    p448_weak_reduce(   &L0 );
    p448_mul  ( &a->z,   &L1,   &L0 );
    p448_mul  ( &a->x,   &L0, &a->t );
    p448_mul  ( &a->y,   &L1, &a->u );
 }

 void
 double_extensible (
    struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->x );
    p448_sqr  (   &L0, &a->y );
    p448_add  (   &L1,   &L2,   &L0 );
    p448_add  ( &a->t, &a->y, &a->x );
    p448_sqr  ( &a->u, &a->t );
    p448_sub  ( &a->t, &a->u,   &L1 );
    p448_bias ( &a->t,     3 );
    p448_weak_reduce( &a->t );
    p448_sub  ( &a->u,   &L0,   &L2 );
    p448_bias ( &a->u,     2 );
    p448_weak_reduce( &a->u );
    p448_sqr  ( &a->x, &a->z );
    p448_bias ( &a->x,     2 );
    p448_add  ( &a->z, &a->x, &a->x );
    p448_sub  (   &L0, &a->z,   &L1 );
    p448_weak_reduce(   &L0 );
    p448_mul  ( &a->z,   &L1,   &L0 );
    p448_mul  ( &a->x,   &L0, &a->t );
    p448_mul  ( &a->y,   &L1, &a->u );
 }

 void
 twist_and_double (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    struct p448_t L0;
    p448_sqr  ( &b->x, &a->x );
    p448_sqr  ( &b->z, &a->y );
    p448_add  ( &b->u, &b->x, &b->z );
    p448_add  ( &b->t, &a->y, &a->x );
    p448_sqr  (   &L0, &b->t );
    p448_sub  ( &b->t,   &L0, &b->u );
    p448_bias ( &b->t,     3 );
    p448_weak_reduce( &b->t );
    p448_sub  (   &L0, &b->z, &b->x );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    p448_sqr  ( &b->x, &a->z );
    p448_bias ( &b->x,     2 );
    p448_add  ( &b->z, &b->x, &b->x );
    p448_sub  ( &b->y, &b->z, &b->u );
    p448_weak_reduce( &b->y );
    p448_mul  ( &b->z,   &L0, &b->y );
    p448_mul  ( &b->x, &b->y, &b->t );
    p448_mul  ( &b->y,   &L0, &b->u );
 }

 void
 untwist_and_double (
    struct extensible_t*          b,
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0;
    p448_sqr  ( &b->x, &a->x );
    p448_sqr  ( &b->z, &a->y );
    p448_add  (   &L0, &b->x, &b->z );
    p448_add  ( &b->t, &a->y, &a->x );
    p448_sqr  ( &b->u, &b->t );
    p448_sub  ( &b->t, &b->u,   &L0 );
    p448_bias ( &b->t,     3 );
    p448_weak_reduce( &b->t );
    p448_sub  ( &b->u, &b->z, &b->x );
    p448_bias ( &b->u,     2 );
    p448_weak_reduce( &b->u );
    p448_sqr  ( &b->x, &a->z );
    p448_bias ( &b->x,     1 );
    p448_add  ( &b->z, &b->x, &b->x );
    p448_sub  ( &b->y, &b->z, &b->u );
    p448_weak_reduce( &b->y );
    p448_mul  ( &b->z,   &L0, &b->y );
    p448_mul  ( &b->x, &b->y, &b->t );
    p448_mul  ( &b->y,   &L0, &b->u );
 }

 void
 convert_tw_affine_to_tw_pniels (
    struct tw_pniels_t*       b,
    const struct tw_affine_t* a
 ) {
    p448_sub  ( &b->n.a, &a->y, &a->x );
    p448_bias ( &b->n.a,     2 );
    p448_weak_reduce( &b->n.a );
    p448_add  ( &b->n.b, &a->x, &a->y );
    p448_weak_reduce( &b->n.b );
    p448_mul  ( &b->n.c, &a->y, &a->x );
    p448_mulw ( &b->z, &b->n.c, 78164 );
    p448_neg  ( &b->n.c, &b->z );
    p448_bias ( &b->n.c,     2 );
    p448_weak_reduce( &b->n.c );
    p448_set_ui( &b->z,     2 );
 }

 void
 convert_tw_affine_to_tw_extensible (
    struct tw_extensible_t*   b,
    const struct tw_affine_t* a
 ) {
    p448_copy ( &b->x, &a->x );
    p448_copy ( &b->y, &a->y );
    p448_set_ui( &b->z,     1 );
    p448_copy ( &b->t, &a->x );
    p448_copy ( &b->u, &a->y );
 }

 void
 convert_affine_to_extensible (
    struct extensible_t*   b,
    const struct affine_t* a
 ) {
    p448_copy ( &b->x, &a->x );
    p448_copy ( &b->y, &a->y );
    p448_set_ui( &b->z,     1 );
    p448_copy ( &b->t, &a->x );
    p448_copy ( &b->u, &a->y );
 }

 void
 convert_tw_extensible_to_tw_pniels (
    struct tw_pniels_t*           b,
    const struct tw_extensible_t* a
 ) {
    p448_sub  ( &b->n.a, &a->y, &a->x );
    p448_bias ( &b->n.a,     2 );
    p448_weak_reduce( &b->n.a );
    p448_add  ( &b->n.b, &a->x, &a->y );
    p448_weak_reduce( &b->n.b );
    p448_mul  ( &b->n.c, &a->u, &a->t );
    p448_mulw ( &b->z, &b->n.c, 78164 );
    p448_neg  ( &b->n.c, &b->z );
    p448_bias ( &b->n.c,     2 );
    p448_weak_reduce( &b->n.c );
    p448_add  ( &b->z, &a->z, &a->z );
    p448_weak_reduce( &b->z );
 }

 void
 convert_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* d
 ) {
    p448_add  ( &e->u, &d->n.b, &d->n.a );
    p448_sub  ( &e->t, &d->n.b, &d->n.a );
    p448_bias ( &e->t,     2 );
    p448_weak_reduce( &e->t );
    p448_mul  ( &e->x, &d->z, &e->t );
    p448_mul  ( &e->y, &d->z, &e->u );
    p448_sqr  ( &e->z, &d->z );
 }

 void
 convert_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  e,
    const struct tw_niels_t* d
 ) {
    p448_add  ( &e->y, &d->b, &d->a );
    p448_weak_reduce( &e->y );
    p448_sub  ( &e->x, &d->b, &d->a );
    p448_bias ( &e->x,     2 );
    p448_weak_reduce( &e->x );
    p448_set_ui( &e->z,     1 );
    p448_copy ( &e->t, &e->x );
    p448_copy ( &e->u, &e->y );
 }

 void
 montgomery_step (
    struct montgomery_t* a
 ) {
    struct p448_t L0, L1;
    p448_add  (   &L0, &a->zd, &a->xd );
    p448_sub  (   &L1, &a->xd, &a->zd );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_sub  ( &a->zd, &a->xa, &a->za );
    p448_bias ( &a->zd,     2 );
    p448_weak_reduce( &a->zd );
    p448_mul  ( &a->xd,   &L0, &a->zd );
    p448_add  ( &a->zd, &a->za, &a->xa );
    p448_mul  ( &a->za,   &L1, &a->zd );
    p448_add  ( &a->xa, &a->za, &a->xd );
    p448_sqr  ( &a->zd, &a->xa );
    p448_mul  ( &a->xa, &a->z0, &a->zd );
    p448_sub  ( &a->zd, &a->xd, &a->za );
    p448_bias ( &a->zd,     2 );
    p448_weak_reduce( &a->zd );
    p448_sqr  ( &a->za, &a->zd );
    p448_sqr  ( &a->xd,   &L0 );
    p448_sqr  (   &L0,   &L1 );
    p448_mulw ( &a->zd, &a->xd, 39082 );
    p448_sub  (   &L1, &a->xd,   &L0 );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  ( &a->xd,   &L0, &a->zd );
    p448_sub  (   &L0, &a->zd,   &L1 );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    p448_mul  ( &a->zd,   &L0,   &L1 );
 }

 void
 deserialize_montgomery (
    struct montgomery_t* a,
    const struct p448_t* sbz
 ) {
    p448_sqr  ( &a->z0,   sbz );
    p448_set_ui( &a->xd,     1 );
    p448_set_ui( &a->zd,     0 );
    p448_set_ui( &a->xa,     1 );
    p448_copy ( &a->za, &a->z0 );
 }

 mask_t
 serialize_montgomery (
    struct p448_t*             b,
    const struct montgomery_t* a,
    const struct p448_t*       sbz
 ) {
    mask_t L0, L1, L2;
    struct p448_t L3, L4, L5, L6;
    p448_mul  (   &L6, &a->z0, &a->zd );
    p448_sub  (   &L4,   &L6, &a->xd );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L6, &a->za,   &L4 );
    p448_mul  (   &L5, &a->z0, &a->xd );
    p448_sub  (   &L4,   &L5, &a->zd );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L3, &a->xa,   &L4 );
    p448_add  (   &L5,   &L3,   &L6 );
    p448_sub  (   &L4,   &L6,   &L3 );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L6,   &L4,   &L5 );
    p448_copy (   &L5, &a->z0 );
    p448_addw (   &L5,     1 );
    p448_sqr  (   &L4,   &L5 );
    p448_mulw (   &L5,   &L4, 39082 );
    p448_neg  (   &L4,   &L5 );
    p448_add  (   &L5, &a->z0, &a->z0 );
    p448_bias (   &L5,     1 );
    p448_add  (   &L3,   &L5,   &L5 );
    p448_add  (   &L5,   &L3,   &L4 );
    p448_weak_reduce(   &L5 );
    p448_mul  (   &L3, &a->xd,   &L5 );
       L1 = p448_is_zero( &a->zd );
       L2 = -   L1;
    p448_mask (   &L4,   &L3,    L1 );
    p448_add  (   &L5,   &L4, &a->zd );
       L0 = ~   L1;
    p448_mul  (   &L4,   sbz,   &L6 );
    p448_addw (   &L4,    L2 );
    p448_mul  (   &L6,   &L5,   &L4 );
    p448_mul  (   &L4,   &L6,   &L5 );
    p448_mul  (   &L5,   &L6, &a->xd );
    p448_mul  (   &L6,   &L4,   &L5 );
    p448_isr  (   &L3,   &L6 );
    p448_mul  (   &L5,   &L4,   &L3 );
    p448_sqr  (   &L4,   &L3 );
    p448_mul  (   &L3,   &L6,   &L4 );
    p448_mask (     b,   &L5,    L0 );
    p448_subw (   &L3,     1 );
    p448_bias (   &L3,     1 );
       L1 = p448_is_zero(   &L3 );
       L0 = p448_is_zero(   sbz );
    return    L1 |    L0;
 }

 void
 serialize_extensible (
    struct p448_t*             b,
    const struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sub  (   &L0, &a->y, &a->z );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    p448_add  (     b, &a->z, &a->y );
    p448_mul  (   &L1, &a->z, &a->x );
    p448_mul  (   &L2,   &L0,   &L1 );
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_mul  (   &L0,   &L2,     b );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_isr  (   &L0,   &L2 );
    p448_mul  (     b,   &L1,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,   &L2,   &L1 );
 }

 void
 untwist_and_double_and_serialize (
    struct p448_t*                b,
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_mul  (   &L3, &a->y, &a->x );
    p448_add  (     b, &a->y, &a->x );
    p448_sqr  (   &L1,     b );
    p448_add  (   &L2,   &L3,   &L3 );
    p448_sub  (     b,   &L1,   &L2 );
    p448_bias (     b,     3 );
    p448_weak_reduce(     b );
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1,   &L2 );
    p448_add  (   &L2,     b,     b );
    p448_mulw (     b,   &L2, 39082 );
    p448_neg  (   &L2,     b );
    p448_bias (   &L2,     2 );
    p448_mulw (   &L0,   &L2, 39082 );
    p448_neg  (     b,   &L0 );
    p448_bias (     b,     2 );
    p448_mul  (   &L0,   &L2,   &L1 );
    p448_mul  (   &L2,     b,   &L0 );
    p448_isr  (   &L0,   &L2 );
    p448_mul  (   &L1,     b,   &L0 );
    p448_sqr  (     b,   &L0 );
    p448_mul  (   &L0,   &L2,     b );
    p448_mul  (     b,   &L1,   &L3 );
 }

 void
 twist_even (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    mask_t L0, L1;
    p448_sqr  ( &b->y, &a->z );
    p448_sqr  ( &b->z, &a->x );
    p448_sub  ( &b->u, &b->y, &b->z );
    p448_bias ( &b->u,     2 );
    p448_weak_reduce( &b->u );
    p448_sub  ( &b->z, &a->z, &a->x );
    p448_bias ( &b->z,     2 );
    p448_weak_reduce( &b->z );
    p448_mul  ( &b->y, &b->z, &a->y );
    p448_sub  ( &b->z, &a->z, &a->y );
    p448_bias ( &b->z,     2 );
    p448_weak_reduce( &b->z );
    p448_mul  ( &b->x, &b->z, &b->y );
    p448_mul  ( &b->t, &b->x, &b->u );
    p448_mul  ( &b->y, &b->x, &b->t );
    p448_isr  ( &b->t, &b->y );
    p448_mul  ( &b->u, &b->x, &b->t );
    p448_sqr  ( &b->x, &b->t );
    p448_mul  ( &b->t, &b->y, &b->x );
    p448_mul  ( &b->x, &a->x, &b->u );
    p448_mul  ( &b->y, &a->y, &b->u );
       L1 = p448_is_zero( &b->z );
       L0 = -   L1;
    p448_addw ( &b->y,    L0 );
    p448_weak_reduce( &b->y );
    p448_set_ui( &b->z,     1 );
    p448_copy ( &b->t, &b->x );
    p448_copy ( &b->u, &b->y );
 }

 void
 test_only_twist (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3;
    p448_sqr  ( &b->u, &a->z );
    p448_sqr  ( &b->y, &a->x );
    p448_sub  ( &b->z, &b->u, &b->y );
    p448_bias ( &b->z,     2 );
    p448_add  ( &b->y, &b->z, &b->z );
    p448_add  ( &b->u, &b->y, &b->y );
    p448_weak_reduce( &b->u );
    p448_sub  ( &b->y, &a->z, &a->x );
    p448_bias ( &b->y,     2 );
    p448_weak_reduce( &b->y );
    p448_mul  ( &b->x, &b->y, &a->y );
    p448_sub  ( &b->z, &a->z, &a->y );
    p448_bias ( &b->z,     2 );
    p448_weak_reduce( &b->z );
    p448_mul  ( &b->t, &b->z, &b->x );
    p448_mul  (   &L3, &b->t, &b->u );
    p448_mul  ( &b->x, &b->t,   &L3 );
    p448_isr  (   &L2, &b->x );
    p448_mul  ( &b->u, &b->t,   &L2 );
    p448_sqr  (   &L3,   &L2 );
    p448_mul  ( &b->t, &b->x,   &L3 );
    p448_add  ( &b->x, &a->y, &a->x );
    p448_weak_reduce( &b->x );
    p448_sub  (   &L2, &a->x, &a->y );
    p448_bias (   &L2,     2 );
    p448_weak_reduce(   &L2 );
    p448_mul  (   &L3, &b->t,   &L2 );
    p448_add  (   &L2,   &L3, &b->x );
    p448_sub  ( &b->t, &b->x,   &L3 );
    p448_bias ( &b->t,     2 );
    p448_weak_reduce( &b->t );
    p448_mul  ( &b->x,   &L2, &b->u );
       L0 = p448_is_zero( &b->y );
       L1 = -   L0;
    p448_addw ( &b->x,    L1 );
    p448_weak_reduce( &b->x );
    p448_mul  ( &b->y, &b->t, &b->u );
       L0 = p448_is_zero( &b->z );
       L1 = -   L0;
    p448_addw ( &b->y,    L1 );
    p448_weak_reduce( &b->y );
       L1 = p448_is_zero( &a->y );
       L0 =    L1 +     1;
    p448_set_ui( &b->z,    L0 );
    p448_copy ( &b->t, &b->x );
    p448_copy ( &b->u, &b->y );
 }

 mask_t
 is_square (
    const struct p448_t* x
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3;
    p448_isr  (   &L2,     x );
    p448_sqr  (   &L3,   &L2 );
    p448_mul  (   &L2,     x,   &L3 );
    p448_subw (   &L2,     1 );
    p448_bias (   &L2,     1 );
       L1 = p448_is_zero(   &L2 );
       L0 = p448_is_zero(     x );
    return    L1 |    L0;
 }

 mask_t
 is_even_pt (
    const struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1, &a->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    return is_square (   &L0 );
 }

 mask_t
 is_even_tw (
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1, &a->x );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_weak_reduce(   &L0 );
    return is_square (   &L0 );
 }

 mask_t
 deserialize_affine (
    struct affine_t*     a,
    const struct p448_t* sz
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_sqr  (   &L1,    sz );
    p448_copy (   &L3,   &L1 );
    p448_addw (   &L3,     1 );
    p448_sqr  ( &a->x,   &L3 );
    p448_mulw (   &L3, &a->x, 39082 );
    p448_neg  ( &a->x,   &L3 );
    p448_add  (   &L3,   &L1,   &L1 );
    p448_bias (   &L3,     1 );
    p448_add  ( &a->y,   &L3,   &L3 );
    p448_add  (   &L3, &a->y, &a->x );
    p448_weak_reduce(   &L3 );
    p448_copy ( &a->y,   &L1 );
    p448_subw ( &a->y,     1 );
    p448_neg  ( &a->x, &a->y );
    p448_bias ( &a->x,     2 );
    p448_weak_reduce( &a->x );
    p448_mul  ( &a->y, &a->x,   &L3 );
    p448_sqr  (   &L2, &a->x );
    p448_mul  (   &L0,   &L2, &a->y );
    p448_mul  ( &a->y, &a->x,   &L0 );
    p448_isr  (   &L3, &a->y );
    p448_mul  ( &a->y,   &L2,   &L3 );
    p448_sqr  (   &L2,   &L3 );
    p448_mul  (   &L3,   &L0,   &L2 );
    p448_mul  (   &L0, &a->x,   &L3 );
    p448_add  (   &L2, &a->y, &a->y );
    p448_mul  ( &a->x,    sz,   &L2 );
    p448_addw (   &L1,     1 );
    p448_mul  ( &a->y,   &L1,   &L3 );
    p448_subw (   &L0,     1 );
    p448_bias (   &L0,     1 );
    return p448_is_zero(   &L0 );
 }

 mask_t
 deserialize_and_twist_approx (
    struct tw_extensible_t* a,
    const struct p448_t*    sdm1,
    const struct p448_t*    sz
 ) {
    struct p448_t L0, L1;
    p448_sqr  ( &a->z,    sz );
    p448_copy ( &a->y, &a->z );
    p448_addw ( &a->y,     1 );
    p448_sqr  ( &a->x, &a->y );
    p448_mulw ( &a->y, &a->x, 39082 );
    p448_neg  ( &a->x, &a->y );
    p448_add  ( &a->y, &a->z, &a->z );
    p448_bias ( &a->y,     1 );
    p448_add  ( &a->u, &a->y, &a->y );
    p448_add  ( &a->y, &a->u, &a->x );
    p448_weak_reduce( &a->y );
    p448_sqr  ( &a->x, &a->z );
    p448_subw ( &a->x,     1 );
    p448_neg  ( &a->u, &a->x );
    p448_bias ( &a->u,     2 );
    p448_weak_reduce( &a->u );
    p448_mul  ( &a->x,  sdm1, &a->u );
    p448_mul  (   &L0, &a->x, &a->y );
    p448_mul  ( &a->t,   &L0, &a->y );
    p448_mul  ( &a->u, &a->x, &a->t );
    p448_mul  ( &a->t, &a->u,   &L0 );
    p448_mul  ( &a->y, &a->x, &a->t );
    p448_isr  (   &L0, &a->y );
    p448_mul  ( &a->y, &a->u,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  ( &a->u, &a->t,   &L1 );
    p448_mul  ( &a->t, &a->x, &a->u );
    p448_add  ( &a->x,    sz,    sz );
    p448_mul  (   &L0, &a->u, &a->x );
    p448_copy ( &a->x, &a->z );
    p448_subw ( &a->x,     1 );
    p448_neg  (   &L1, &a->x );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  ( &a->x,   &L1,   &L0 );
    p448_mul  (   &L0, &a->u, &a->y );
    p448_addw ( &a->z,     1 );
    p448_mul  ( &a->y, &a->z,   &L0 );
    p448_subw ( &a->t,     1 );
    p448_bias ( &a->t,     1 );
    mask_t ret = p448_is_zero( &a->t );
    p448_set_ui( &a->z,     1 );
    p448_copy ( &a->t, &a->x );
    p448_copy ( &a->u, &a->y );
    return ret;
 }

 void
 set_identity_extensible (
    struct extensible_t* a
 ) {
    p448_set_ui( &a->x,     0 );
    p448_set_ui( &a->y,     1 );
    p448_set_ui( &a->z,     1 );
    p448_set_ui( &a->t,     0 );
    p448_set_ui( &a->u,     0 );
 }

 void
 set_identity_tw_extensible (
    struct tw_extensible_t* a
 ) {
    p448_set_ui( &a->x,     0 );
    p448_set_ui( &a->y,     1 );
    p448_set_ui( &a->z,     1 );
    p448_set_ui( &a->t,     0 );
    p448_set_ui( &a->u,     0 );
 }

 void
 set_identity_affine (
    struct affine_t* a
 ) {
    p448_set_ui( &a->x,     0 );
    p448_set_ui( &a->y,     1 );
 }

 mask_t
 eq_affine (
    const struct affine_t* a,
    const struct affine_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2;
    p448_sub  (   &L2, &a->x, &b->x );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_sub  (   &L2, &a->y, &b->y );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 mask_t
 eq_extensible (
    const struct extensible_t* a,
    const struct extensible_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 mask_t
 eq_tw_extensible (
    const struct tw_extensible_t* a,
    const struct tw_extensible_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 void
 elligator_2s_inject (
    struct affine_t*     a,
    const struct p448_t* r
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5, L6, L7, L8, L9;
    p448_sqr  ( &a->x,     r );
    p448_sqr  (   &L3, &a->x );
    p448_copy ( &a->y,   &L3 );
    p448_subw ( &a->y,     1 );
    p448_neg  (   &L9, &a->y );
    p448_bias (   &L9,     2 );
    p448_weak_reduce(   &L9 );
    p448_sqr  (   &L2,   &L9 );
    p448_mulw (   &L8,   &L2, 1527402724 );
    p448_mulw (   &L7,   &L3, 6108985600 );
    p448_add  ( &a->y,   &L7,   &L8 );
    p448_weak_reduce( &a->y );
    p448_mulw (   &L8,   &L2, 6109454568 );
    p448_sub  (   &L7, &a->y,   &L8 );
    p448_bias (   &L7,     2 );
    p448_weak_reduce(   &L7 );
    p448_mulw (   &L4, &a->y, 78160 );
    p448_mul  (   &L6,   &L7,   &L9 );
    p448_mul  (   &L8,   &L6,   &L4 );
    p448_mul  (   &L4,   &L7,   &L8 );
    p448_isr  (   &L5,   &L4 );
    p448_mul  (   &L4,   &L6,   &L5 );
    p448_sqr  (   &L6,   &L5 );
    p448_mul  (   &L5,   &L8,   &L6 );
    p448_mul  (   &L8,   &L7,   &L5 );
    p448_mul  (   &L7,   &L8,   &L5 );
    p448_copy (   &L5, &a->x );
    p448_subw (   &L5,     1 );
    p448_addw ( &a->x,     1 );
    p448_mul  (   &L6, &a->x,   &L8 );
    p448_sub  ( &a->x,   &L5,   &L6 );
    p448_bias ( &a->x,     3 );
    p448_weak_reduce( &a->x );
    p448_mul  (   &L5,   &L4, &a->x );
    p448_mulw (   &L4,   &L5, 78160 );
    p448_neg  ( &a->x,   &L4 );
    p448_bias ( &a->x,     2 );
    p448_weak_reduce( &a->x );
    p448_add  (   &L4,   &L3,   &L3 );
    p448_add  (   &L3,   &L4,   &L2 );
    p448_subw (   &L3,     2 );
    p448_bias (   &L3,     1 );
    p448_weak_reduce(   &L3 );
    p448_mul  (   &L2,   &L3,   &L8 );
    p448_mulw (   &L3,   &L2, 3054649120 );
    p448_add  (   &L2,   &L3, &a->y );
    p448_mul  ( &a->y,   &L7,   &L2 );
       L1 = p448_is_zero(   &L9 );
       L0 = -   L1;
    p448_addw ( &a->y,    L0 );
    p448_weak_reduce( &a->y );
 }

 mask_t
 validate_affine (
    const struct affine_t* a
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_sqr  (   &L0, &a->y );
    p448_sqr  (   &L2, &a->x );
    p448_add  (   &L3,   &L2,   &L0 );
    p448_subw (   &L3,     1 );
    p448_mulw (   &L1,   &L2, 39081 );
    p448_neg  (   &L2,   &L1 );
    p448_bias (   &L2,     2 );
    p448_mul  (   &L1,   &L0,   &L2 );
    p448_sub  (   &L0,   &L3,   &L1 );
    p448_bias (   &L0,     3 );
    return p448_is_zero(   &L0 );
 }

 mask_t
 validate_tw_extensible (
    const struct tw_extensible_t* ext
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5;
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L2, &ext->t, &ext->u );
    p448_mul  (   &L4, &ext->z,   &L2 );
    p448_addw (   &L4,     0 );
    p448_mul  (   &L3, &ext->x, &ext->y );
    p448_neg  (   &L2,   &L3 );
    p448_add  (   &L3,   &L2,   &L4 );
    p448_bias (   &L3,     2 );
       L1 = p448_is_zero(   &L3 );
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
     */
    p448_sqr  (   &L4, &ext->y );
    p448_neg  (   &L2,   &L4 );
    p448_addw (   &L2,     0 );
    p448_sqr  (   &L3, &ext->x );
    p448_add  (   &L4,   &L3,   &L2 );
    p448_sqr  (   &L5, &ext->u );
    p448_sqr  (   &L3, &ext->t );
    p448_mul  (   &L2,   &L3,   &L5 );
    p448_mulw (   &L3,   &L2, 39081 );
    p448_neg  (   &L5,   &L3 );
    p448_add  (   &L3,   &L5,   &L4 );
    p448_neg  (   &L5,   &L2 );
    p448_add  (   &L4,   &L5,   &L3 );
    p448_sqr  (   &L3, &ext->z );
    p448_add  (   &L2,   &L3,   &L4 );
    p448_bias (   &L2,     4 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 mask_t
 validate_extensible (
    const struct extensible_t* ext
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5;
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 - x^2 - y^2 + z^2
     */
    p448_sqr  (   &L4, &ext->y );
    p448_neg  (   &L3,   &L4 );
    p448_addw (   &L3,     0 );
    p448_sqr  (   &L2, &ext->z );
    p448_add  (   &L4,   &L2,   &L3 );
    p448_sqr  (   &L5, &ext->u );
    p448_sqr  (   &L2, &ext->t );
    p448_mul  (   &L3,   &L2,   &L5 );
    p448_mulw (   &L5,   &L3, 39081 );
    p448_neg  (   &L2,   &L5 );
    p448_add  (   &L3,   &L2,   &L4 );
    p448_sqr  (   &L2, &ext->x );
    p448_neg  (   &L4,   &L2 );
    p448_add  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     4 );
       L1 = p448_is_zero(   &L2 );
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L3, &ext->t, &ext->u );
    p448_mul  (   &L4, &ext->z,   &L3 );
    p448_addw (   &L4,     0 );
    p448_mul  (   &L2, &ext->x, &ext->y );
    p448_neg  (   &L3,   &L2 );
    p448_add  (   &L2,   &L3,   &L4 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }


--- a/src/arch_neon/neon_emulation.h
+++ b/src/arch_neon/neon_emulation.h
@@ -0,0 +1,150 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 /**
 * @file "neon_emulation.h"
 * @brief NEON intrinsic emulation using clang's vector extensions.
 *
 * This lets you test and debug NEON code on x86.
 */
 #ifndef __NEON_EMULATION_H__
 #define __NEON_EMULATION_H__ 1

 #include "word.h"

 #include <stdint.h>
 #include <assert.h>

 static __inline__ int64x2_t vaddw_s32 (int64x2_t a, int32x2_t b) {
    a.x += b.x;
    a.y += b.y;
    return a;
 }

 static __inline__ int64x2_t __attribute__((gnu_inline,always_inline))
 xx_vaddup_s64(int64x2_t x) {
    x.y += x.x;
    return x;
 }

 typedef struct { int32x2_t val[2]; } int32x2x2_t;
 static inline int32x2x2_t vtrn_s32 (int32x2_t x, int32x2_t y) {
    int32x2x2_t out = {{{ x.x, y.x }, {x.y, y.y}}};
    return out;
 }

 static __inline__ void __attribute__((gnu_inline,always_inline))
 xx_vtrnq_s64 (
    int64x2_t *x,
    int64x2_t *y
 ) {
    int64_t tmp = (*x).y;
    (*x).y = (*y).x;
    (*y).x = tmp;
 }

 int64x2_t vsraq_n_s64 (
    int64x2_t a,
    int64x2_t v,
    const int x
 ) {
    return a + (v >> x);
 }

 int64x2_t vshrq_n_s64 (
    int64x2_t v,
    const int x
 ) {
    return v >> x;
 }

 static inline int64_t vgetq_lane_s64 (
    int64x2_t acc,
    const int lane
 ) {
    return lane ? acc.y : acc.x;
 }

 static inline int32_t vget_lane_s32 (
    int32x2_t acc,
    const int lane
 ) {
    return lane ? acc.y : acc.x;
 }

 static inline int64x2_t vmlal_lane_s32 (
    int64x2_t acc,
    int32x2_t x,
    int32x2_t y,
    int lane
 ) {
    int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y };
    return acc + xx*(lane?yy.yy:yy.xx);
 }

 static inline int64x2_t vmlsl_lane_s32 (
    int64x2_t acc,
    int32x2_t x,
    int32x2_t y,
    int lane
 ) {
    int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y };
    return acc - xx*(lane?yy.yy:yy.xx);
 }

 static inline int64x2_t vqdmlsl_lane_s32 (
    int64x2_t acc,
    int32x2_t x,
    int32x2_t y,
    int lane
 ) {
    int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y };
    int64x2_t tmp = xx*(lane?yy.yy:yy.xx);
    assert(tmp.x >> 63 == tmp.x>>62);
    assert(tmp.y >> 63 == tmp.y>>62);
    return acc - 2*tmp;
 }

 static inline int64x2_t vqdmlal_lane_s32 (
    int64x2_t acc,
    int32x2_t x,
    int32x2_t y,
    int lane
 ) {
    int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y };
    int64x2_t tmp = xx*(lane?yy.yy:yy.xx);
    assert(tmp.x >> 63 == tmp.x>>62);
    assert(tmp.y >> 63 == tmp.y>>62);
    return acc + 2*tmp;
 }

 static inline int64x2_t vqdmull_lane_s32 (
    int32x2_t x,
    int32x2_t y,
    int lane
 ) {
    int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y };
    int64x2_t tmp = xx*(lane?yy.yy:yy.xx);
    assert(tmp.x >> 63 == tmp.x>>62);
    assert(tmp.y >> 63 == tmp.y>>62);
    return 2*tmp;
 }

 static inline int32x2_t vmovn_s64(
    int64x2_t x
 ) {
    int32x2_t y = {x.x,x.y};
    return y;
 }

 static inline int64x2_t vmull_lane_s32 (
    int32x2_t x,
    int32x2_t y,
    int lane
 ) {
    int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y };
    return xx*(lane?yy.yy:yy.xx);
 }

 #endif /* __NEON_EMULATION_H__ */
--- a/src/arch_neon/p448.c
+++ b/src/arch_neon/p448.c
@@ -0,0 +1,749 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "word.h"
 #include "p448.h"

 static inline mask_t __attribute__((always_inline))
 is_zero (
    word_t x
 ) {
    dword_t xx = x;
    xx--;
    return xx >> WORD_BITS;
 }

 static uint64_t widemul_32 (
    const uint32_t a,
    const uint32_t b
 ) {
    return ((uint64_t)a)* b;
 }

 #ifdef __ARM_NEON__
 static __inline__ void __attribute__((gnu_inline,always_inline))
 xx_vtrnq_s64 (
    int64x2_t *x,
    int64x2_t *y
 ) {
    __asm__ __volatile__ ("vswp %f0, %e1" : "+w"(*x), "+w"(*y));
 }

 static __inline__ int64x2_t __attribute__((gnu_inline,always_inline))
 xx_vaddup_s64(int64x2_t x) {
    __asm__ ("vadd.s64 %f0, %e0" : "+w"(x));
    return x;
 }
 #else
 #include "neon_emulation.h"
 #endif // ARM_NEON

 static inline void __attribute__((gnu_inline,always_inline))
 smlal (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
 ) {
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
 }

 static inline void __attribute__((gnu_inline,always_inline))
 smlal2 (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
 ) {
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
 }

 static inline void __attribute__((gnu_inline,always_inline))
 smull (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
 ) {
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
 }

 static inline void __attribute__((gnu_inline,always_inline))
 smull2 (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
 ) {
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
 }

 // static inline int64x2_t copy_now(int64x2_t x) {
 //     int64x2_t y;
 //     __asm__ ("vmov %0, %1" : "=w"(y) : "w"(x));
 //     return y;
 // }

 void
 p448_mul (
    p448_t *__restrict__ cs,
    const p448_t *as,
    const p448_t *bs
 ) {
    const uint32_t *a = as->limb, *b = bs->limb;
    uint32_t *c = cs->limb;
    
    const int32x2_t
        *val = (const int32x2_t *)a,
        *vbl = (const int32x2_t *)b,
        *vah = (const int32x2_t *)(&a[8]),
        *vbh = (const int32x2_t *)(&b[8]);
    
    int32x2_t
        *vcl = (int32x2_t *)c,
        *vch = (int32x2_t *)(&c[8]),
        vmask = {(1<<28) - 1, (1<<28)-1};

    int64x2_t accumx0a, accumx0b;
    int64x2_t accumx1a, accumx1b;
    int64x2_t accumx2a, accumx2b;
    int64x2_t accumx3a, accumx3b;
    int64x2_t accumx4a, accumx4b;
    int64x2_t accumx5a, accumx5b;
    int64x2_t accumx6a, accumx6b;
    int64x2_t accumx7a, accumx7b;
    int64x2_t carry;
    int32x2x2_t trn_res;
    int32x2_t delta;
    
    accumx0a = vmull_lane_s32(          delta = val[1] + vah[1], vbh[3], 0);
    accumx1a = vmull_lane_s32(          delta, vbh[3], 1);
    accumx2a = vmull_lane_s32(          delta = val[2] + vah[2], vbh[3], 0);
    accumx3a = vmull_lane_s32(          delta, vbh[3], 1);
    accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[2], 0);
    accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[2], 1);
    accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0);
    accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1);
    accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[1], 0);
    accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[1], 1);
    accumx2b = vmull_lane_s32(          delta = val[0] + vah[0], vbh[1], 0);
    accumx3b = vmull_lane_s32(          delta, vbh[1], 1);
    accumx0b = vmull_lane_s32(          delta, vbh[0], 0);
    accumx1b = vmull_lane_s32(          delta, vbh[0], 1);
    accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0);
    accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1);
    accumx0b = vmlal_lane_s32(accumx0b, vah[1], vbl[3], 0);
    accumx1b = vmlal_lane_s32(accumx1b, vah[1], vbl[3], 1);
    accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0);
    accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1);
    accumx0b = vmlal_lane_s32(accumx0b, vah[2], vbl[2], 0);
    accumx1b = vmlal_lane_s32(accumx1b, vah[2], vbl[2], 1);
    accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0);
    accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1);
    accumx0b = vmlal_lane_s32(accumx0b, vah[3], vbl[1], 0);
    accumx1b = vmlal_lane_s32(accumx1b, vah[3], vbl[1], 1);
    accumx2b += accumx2a;
    accumx3b += accumx3a;
    accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0);
    accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1);
    accumx0b += accumx0a;
    accumx1b += accumx1a;
    accumx0a = vmlal_lane_s32(accumx0a, vah[0], vbl[0], 0);
    accumx1a = vmlal_lane_s32(accumx1a, vah[0], vbl[0], 1);
    accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0);
    accumx3a = vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1);
    accumx0a = vmlal_lane_s32(accumx0a, val[1], delta = vbl[3] - vbh[3], 0);
    accumx1a = vmlal_lane_s32(accumx1a, val[1], delta, 1);
    accumx2a = vmlal_lane_s32(accumx2a, val[2], delta, 0);
    accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1);
    accumx0a = vmlal_lane_s32(accumx0a, val[2], delta = vbl[2] - vbh[2], 0);
    accumx1a = vmlal_lane_s32(accumx1a, val[2], delta, 1);
    accumx2a = vmlal_lane_s32(accumx2a, val[3], delta, 0);
    accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1);
    accumx0a = vmlal_lane_s32(accumx0a, val[3], delta = vbl[1] - vbh[1], 0);
    accumx1a = vmlal_lane_s32(accumx1a, val[3], delta, 1);
    accumx2a += accumx2b;
    accumx3a += accumx3b;
    accumx2b = vmlal_lane_s32(accumx2b, val[0], delta, 0);
    accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1);
    accumx0a += accumx0b;
    accumx1a += accumx1b;
    accumx0b = vmlal_lane_s32(accumx0b, val[0], delta = vbl[0] - vbh[0], 0);
    accumx1b = vmlal_lane_s32(accumx1b, val[0], delta, 1);
    accumx2b = vmlal_lane_s32(accumx2b, val[1], delta, 0);
    accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1);
    xx_vtrnq_s64(&accumx0a, &accumx0b);
    xx_vtrnq_s64(&accumx1a, &accumx1b);
    xx_vtrnq_s64(&accumx2a, &accumx2b);
    xx_vtrnq_s64(&accumx3a, &accumx3b);
    accumx0b += accumx1a;
    accumx0b = vsraq_n_s64(accumx0b,accumx0a,28);
    accumx1b = vsraq_n_s64(accumx1b,accumx0b,28);
    accumx2a += accumx1b;
    accumx2b += accumx3a;
    accumx2b = vsraq_n_s64(accumx2b,accumx2a,28);
    accumx3b = vsraq_n_s64(accumx3b,accumx2b,28);
    trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b));
    vcl[0] = trn_res.val[1] & vmask;
    vch[0] = trn_res.val[0] & vmask;
    trn_res = vtrn_s32(vmovn_s64(accumx2a), vmovn_s64(accumx2b));
    vcl[1] = trn_res.val[1] & vmask;
    vch[1] = trn_res.val[0] & vmask;
    carry = accumx3b;
    
    accumx4a = vmull_lane_s32(          delta = val[3] + vah[3], vbh[3], 0);
    accumx5a = vmull_lane_s32(          delta, vbh[3], 1);
    accumx6b = vmull_lane_s32(          delta = val[0] + vah[0], vbh[3], 0);
    accumx7b = vmull_lane_s32(          delta, vbh[3], 1);
    accumx4b = accumx4a;
    accumx5b = accumx5a;
    accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[2], 0);
    accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[2], 1);
    accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0);
    accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1);
    accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[1], 0);
    accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[1], 1);
    accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0);
    accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1);
    accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[0], 0);
    accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[0], 1);
    accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0);
    accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[0], 1);
    accumx4b = vmlal_lane_s32(accumx4b, vah[3], vbl[3], 0);
    accumx5b = vmlal_lane_s32(accumx5b, vah[3], vbl[3], 1);
    accumx6a = accumx6b;
    accumx7a = accumx7b;
    accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0);
    accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1);
    accumx4a += accumx4b;
    accumx5a += accumx5b;
    accumx4a = vmlal_lane_s32(accumx4a, vah[0], vbl[2], 0);
    accumx5a = vmlal_lane_s32(accumx5a, vah[0], vbl[2], 1);
    accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0);
    accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1);
    accumx4a = vmlal_lane_s32(accumx4a, vah[1], vbl[1], 0);
    accumx5a = vmlal_lane_s32(accumx5a, vah[1], vbl[1], 1);
    accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0);
    accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1);
    accumx4a = vmlal_lane_s32(accumx4a, vah[2], vbl[0], 0);
    accumx5a = vmlal_lane_s32(accumx5a, vah[2], vbl[0], 1);
    accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0);
    accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1);
    accumx4a = vmlal_lane_s32(accumx4a, val[3], delta = vbl[3] - vbh[3], 0);
    accumx5a = vmlal_lane_s32(accumx5a, val[3], delta, 1);
    /**/
    accumx6b = vmlal_lane_s32(accumx6b, val[0], delta, 0);
    accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1);
    accumx4b = vmlal_lane_s32(accumx4b, val[0], delta = vbl[2] - vbh[2], 0);
    accumx5b = vmlal_lane_s32(accumx5b, val[0], delta, 1);
    accumx6b = vmlal_lane_s32(accumx6b, val[1], delta, 0);
    accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1);
    accumx4b = vmlal_lane_s32(accumx4b, val[1], delta = vbl[1] - vbh[1], 0);
    accumx5b = vmlal_lane_s32(accumx5b, val[1], delta, 1);
    accumx6b = vmlal_lane_s32(accumx6b, val[2], delta, 0);
    accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1);
    accumx4b = vmlal_lane_s32(accumx4b, val[2], delta = vbl[0] - vbh[0], 0);
    accumx5b = vmlal_lane_s32(accumx5b, val[2], delta, 1);
    accumx6b = vmlal_lane_s32(accumx6b, val[3], delta, 0);
    accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1);
    
    xx_vtrnq_s64(&accumx4a, &accumx4b);
    xx_vtrnq_s64(&accumx5a, &accumx5b);
    xx_vtrnq_s64(&accumx6a, &accumx6b);
    xx_vtrnq_s64(&accumx7a, &accumx7b);
    accumx4a += carry;
    accumx4b += accumx5a;
    accumx4b = vsraq_n_s64(accumx4b,accumx4a,28);
    accumx5b = vsraq_n_s64(accumx5b,accumx4b,28);
    accumx6a += accumx5b;
    accumx6b += accumx7a;
    
    trn_res = vtrn_s32(vmovn_s64(accumx4a), vmovn_s64(accumx4b));
    vcl[2] = trn_res.val[1] & vmask;
    vch[2] = trn_res.val[0] & vmask;
    accumx6b = vsraq_n_s64(accumx6b,accumx6a,28);
    accumx7b = vsraq_n_s64(accumx7b,accumx6b,28);
    trn_res = vtrn_s32(vmovn_s64(accumx6a), vmovn_s64(accumx6b));
    vcl[3] = trn_res.val[1] & vmask;
    vch[3] = trn_res.val[0] & vmask;
    
    accumx7b = xx_vaddup_s64(accumx7b);

    int32x2_t t0 = vcl[0], t1 = vch[0];
    trn_res = vtrn_s32(t0,t1);
    t0 = trn_res.val[0]; t1 = trn_res.val[1];
    
    accumx7b = vaddw_s32(accumx7b, t0);
    t0 = vmovn_s64(accumx7b) & vmask;
    
    accumx7b = vshrq_n_s64(accumx7b,28);
    accumx7b = vaddw_s32(accumx7b, t1);
    t1 = vmovn_s64(accumx7b) & vmask;
    trn_res = vtrn_s32(t0,t1);
    vcl[0] = trn_res.val[0];
    vch[0] = trn_res.val[1];
    accumx7b = vshrq_n_s64(accumx7b,28);

    t0 = vmovn_s64(accumx7b);
    
    uint32_t
        c0 = vget_lane_s32(t0,0),
        c1 = vget_lane_s32(t0,1);
    c[2]  += c0;
    c[10] += c1;
 }

 void
 p448_sqr (
    p448_t *__restrict__ cs,
    const p448_t *as
 ) {
    /* FUTURE possible improvements:
     *    don't use nega-phi algorithm, so as to avoid extra phi-twiddle at end
     *        or use phi/nega-phi for everything, montgomery style
     *        or find some sort of phi algorithm which doesn't have this problem
     *    break up lanemuls so that only diags get 1mul'd instead of diag 2x2 blocks
     *
     * These improvements are all pretty minor, but I guess together they might matter?
     */
    
    const uint32_t *b = as->limb;
    uint32_t *c = cs->limb;

    int32x2_t vbm[4];
    
    const int32x2_t
        *vbl = (const int32x2_t *)b,
        *vbh = (const int32x2_t *)(&b[8]);
        
    int i;
    for (i=0; i<4; i++) {
        vbm[i] = vbl[i] - vbh[i];
    }
    
    int32x2_t
        *vcl = (int32x2_t *)c,
        *vch = (int32x2_t *)(&c[8]),
        vmask = {(1<<28) - 1, (1<<28)-1};

    int64x2_t accumx0a, accumx0b;
    int64x2_t accumx1a, accumx1b;
    int64x2_t accumx2a, accumx2b;
    int64x2_t accumx3a, accumx3b;
    int64x2_t accumx4a, accumx4b;
    int64x2_t accumx5a, accumx5b;
    int64x2_t accumx6a, accumx6b;
    int64x2_t accumx7a, accumx7b;
    int64x2_t carry;
    int32x2x2_t trn_res;
    
    accumx0a = vqdmull_lane_s32(          vbh[1], vbh[3], 0);
    accumx1a = vqdmull_lane_s32(          vbh[1], vbh[3], 1);
    accumx2a = vqdmull_lane_s32(          vbh[2], vbh[3], 0);
    accumx3a = vqdmull_lane_s32(          vbh[2], vbh[3], 1);
    accumx0a =   vmlal_lane_s32(accumx0a, vbh[2], vbh[2], 0);
    accumx1a =   vmlal_lane_s32(accumx1a, vbh[2], vbh[2], 1);
    accumx2b = accumx2a;
    accumx3b = accumx3a;
    accumx2b = vqdmlal_lane_s32(accumx2b, vbh[0], vbh[1], 0);
    accumx3b = vqdmlal_lane_s32(accumx3b, vbh[0], vbh[1], 1);
    accumx0b = accumx0a;
    accumx1b = accumx1a;
    accumx0b =   vmlal_lane_s32(accumx0b, vbh[0], vbh[0], 0);
    accumx1b =   vmlal_lane_s32(accumx1b, vbh[0], vbh[0], 1);
    accumx0b = vqdmlal_lane_s32(accumx0b, vbl[1], vbl[3], 0);
    accumx1b = vqdmlal_lane_s32(accumx1b, vbl[1], vbl[3], 1);
    accumx2b = vqdmlal_lane_s32(accumx2b, vbl[2], vbl[3], 0);
    accumx3b = vqdmlal_lane_s32(accumx3b, vbl[2], vbl[3], 1);
    accumx0b =   vmlal_lane_s32(accumx0b, vbl[2], vbl[2], 0);
    accumx1b =   vmlal_lane_s32(accumx1b, vbl[2], vbl[2], 1);
    accumx2a += accumx2b;
    accumx3a += accumx3b;
    accumx2a = vqdmlal_lane_s32(accumx2a, vbl[0], vbl[1], 0);
    accumx3a = vqdmlal_lane_s32(accumx3a, vbl[0], vbl[1], 1);
    accumx0a += accumx0b;
    accumx1a += accumx1b;
    accumx0a =   vmlal_lane_s32(accumx0a, vbl[0], vbl[0], 0);
    accumx1a =   vmlal_lane_s32(accumx1a, vbl[0], vbl[0], 1);
    accumx0a = vqdmlsl_lane_s32(accumx0a, vbm[1], vbm[3], 0);
    accumx1a = vqdmlsl_lane_s32(accumx1a, vbm[1], vbm[3], 1);
    accumx0a =   vmlsl_lane_s32(accumx0a, vbm[2], vbm[2], 0);
    accumx1a =   vmlsl_lane_s32(accumx1a, vbm[2], vbm[2], 1);
    accumx2a = vqdmlsl_lane_s32(accumx2a, vbm[2], vbm[3], 0);
    accumx3a = vqdmlsl_lane_s32(accumx3a, vbm[2], vbm[3], 1);
    accumx0b += accumx0a;
    accumx1b += accumx1a;
    accumx0b =   vmlsl_lane_s32(accumx0b, vbm[0], vbm[0], 0);
    accumx1b =   vmlsl_lane_s32(accumx1b, vbm[0], vbm[0], 1);
    accumx2b += accumx2a;
    accumx3b += accumx3a;
    accumx2b = vqdmlsl_lane_s32(accumx2b, vbm[0], vbm[1], 0);
    accumx3b = vqdmlsl_lane_s32(accumx3b, vbm[0], vbm[1], 1);
    xx_vtrnq_s64(&accumx0b, &accumx0a);
    xx_vtrnq_s64(&accumx1b, &accumx1a);
    xx_vtrnq_s64(&accumx2b, &accumx2a);
    xx_vtrnq_s64(&accumx3b, &accumx3a);
    accumx0a += accumx1b;
    accumx0a = vsraq_n_s64(accumx0a,accumx0b,28);
    accumx1a = vsraq_n_s64(accumx1a,accumx0a,28);
    accumx2b += accumx1a;
    accumx2a += accumx3b;
    accumx2a = vsraq_n_s64(accumx2a,accumx2b,28);
    accumx3a = vsraq_n_s64(accumx3a,accumx2a,28);
    trn_res = vtrn_s32(vmovn_s64(accumx0b), vmovn_s64(accumx0a));
    vcl[0] = trn_res.val[1] & vmask;
    vch[0] = trn_res.val[0] & vmask;
    trn_res = vtrn_s32(vmovn_s64(accumx2b), vmovn_s64(accumx2a));
    vcl[1] = trn_res.val[1] & vmask;
    vch[1] = trn_res.val[0] & vmask;
    carry = accumx3a;
    
    accumx4a =   vmull_lane_s32(          vbh[3], vbh[3], 0);
    accumx5a =   vmull_lane_s32(          vbh[3], vbh[3], 1);
    accumx6b = vqdmull_lane_s32(          vbh[0], vbh[3], 0);
    accumx7b = vqdmull_lane_s32(          vbh[0], vbh[3], 1);
    accumx4b = accumx4a;
    accumx5b = accumx5a;
    accumx4b = vqdmlal_lane_s32(accumx4b, vbh[0], vbh[2], 0);
    accumx5b = vqdmlal_lane_s32(accumx5b, vbh[0], vbh[2], 1);
    accumx6b = vqdmlal_lane_s32(accumx6b, vbh[1], vbh[2], 0);
    accumx7b = vqdmlal_lane_s32(accumx7b, vbh[1], vbh[2], 1);
    accumx4b =   vmlal_lane_s32(accumx4b, vbh[1], vbh[1], 0);
    accumx5b =   vmlal_lane_s32(accumx5b, vbh[1], vbh[1], 1);
    accumx4b =   vmlal_lane_s32(accumx4b, vbl[3], vbl[3], 0);
    accumx5b =   vmlal_lane_s32(accumx5b, vbl[3], vbl[3], 1);
    accumx6a = accumx6b;
    accumx7a = accumx7b;
    accumx6a = vqdmlal_lane_s32(accumx6a, vbl[0], vbl[3], 0);
    accumx7a = vqdmlal_lane_s32(accumx7a, vbl[0], vbl[3], 1);
    accumx4a += accumx4b;
    accumx5a += accumx5b;
    accumx4a = vqdmlal_lane_s32(accumx4a, vbl[0], vbl[2], 0);
    accumx5a = vqdmlal_lane_s32(accumx5a, vbl[0], vbl[2], 1);
    accumx6a = vqdmlal_lane_s32(accumx6a, vbl[1], vbl[2], 0);
    accumx7a = vqdmlal_lane_s32(accumx7a, vbl[1], vbl[2], 1);
    accumx4a =   vmlal_lane_s32(accumx4a, vbl[1], vbl[1], 0);
    accumx5a =   vmlal_lane_s32(accumx5a, vbl[1], vbl[1], 1);
    accumx4a =   vmlsl_lane_s32(accumx4a, vbm[3], vbm[3], 0);
    accumx5a =   vmlsl_lane_s32(accumx5a, vbm[3], vbm[3], 1);
    accumx6b += accumx6a;
    accumx7b += accumx7a;
    accumx6b = vqdmlsl_lane_s32(accumx6b, vbm[0], vbm[3], 0);
    accumx7b = vqdmlsl_lane_s32(accumx7b, vbm[0], vbm[3], 1);
    accumx4b += accumx4a;
    accumx5b += accumx5a;
    accumx4b = vqdmlsl_lane_s32(accumx4b, vbm[0], vbm[2], 0);
    accumx5b = vqdmlsl_lane_s32(accumx5b, vbm[0], vbm[2], 1);
    accumx4b =   vmlsl_lane_s32(accumx4b, vbm[1], vbm[1], 0);
    accumx5b =   vmlsl_lane_s32(accumx5b, vbm[1], vbm[1], 1);
    accumx6b = vqdmlsl_lane_s32(accumx6b, vbm[1], vbm[2], 0);
    accumx7b = vqdmlsl_lane_s32(accumx7b, vbm[1], vbm[2], 1);
    
    xx_vtrnq_s64(&accumx4b, &accumx4a);
    xx_vtrnq_s64(&accumx5b, &accumx5a);
    xx_vtrnq_s64(&accumx6b, &accumx6a);
    xx_vtrnq_s64(&accumx7b, &accumx7a);
    accumx4b += carry;
    accumx4a += accumx5b;
    accumx4a = vsraq_n_s64(accumx4a,accumx4b,28);
    accumx5a = vsraq_n_s64(accumx5a,accumx4a,28);
    accumx6b += accumx5a;
    accumx6a += accumx7b;
    
    trn_res = vtrn_s32(vmovn_s64(accumx4b), vmovn_s64(accumx4a));
    vcl[2] = trn_res.val[1] & vmask;
    vch[2] = trn_res.val[0] & vmask;
    accumx6a = vsraq_n_s64(accumx6a,accumx6b,28);
    accumx7a = vsraq_n_s64(accumx7a,accumx6a,28);
    trn_res = vtrn_s32(vmovn_s64(accumx6b), vmovn_s64(accumx6a));
    vcl[3] = trn_res.val[1] & vmask;
    vch[3] = trn_res.val[0] & vmask;
    
    accumx7a = xx_vaddup_s64(accumx7a);

    int32x2_t t0 = vcl[0], t1 = vch[0];
    trn_res = vtrn_s32(t0,t1);
    t0 = trn_res.val[0]; t1 = trn_res.val[1];
    
    accumx7a = vaddw_s32(accumx7a, t0);
    t0 = vmovn_s64(accumx7a) & vmask;
    
    accumx7a = vshrq_n_s64(accumx7a,28);
    accumx7a = vaddw_s32(accumx7a, t1);
    t1 = vmovn_s64(accumx7a) & vmask;
    trn_res = vtrn_s32(t0,t1);
    vcl[0] = trn_res.val[0];
    vch[0] = trn_res.val[1];
    accumx7a = vshrq_n_s64(accumx7a,28);

    t0 = vmovn_s64(accumx7a);
    
    uint32_t
        c0 = vget_lane_s32(t0,0),
        c1 = vget_lane_s32(t0,1);
    c[2]  += c0;
    c[10] += c1;
 }

 void
 p448_mulw (
    p448_t *__restrict__ cs,
    const p448_t *as,
    uint64_t b
 ) {
    const uint32_t bhi = b>>28, blo = b & ((1<<28)-1);
    
    const uint32_t *a = as->limb;
    uint32_t *c = cs->limb;

    uint64_t accum0, accum8;
    uint32_t mask = (1ull<<28)-1;  

    int i;

    uint32_t c0, c8, n0, n8;
    accum0 = widemul_32(bhi, a[15]);
    accum8 = widemul_32(bhi, a[15] + a[7]);
    c0 = a[0]; c8 = a[8];
    smlal(&accum0, blo, c0);
    smlal(&accum8, blo, c8);

    c[0] = accum0 & mask; accum0 >>= 28;
    c[8] = accum8 & mask; accum8 >>= 28;
    
    i=1;
    {
        n0 = a[i]; n8 = a[i+8];
        smlal(&accum0, bhi, c0);
        smlal(&accum8, bhi, c8);
        smlal(&accum0, blo, n0);
        smlal(&accum8, blo, n8);
        
        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        i++;
    }
    {
        c0 = a[i]; c8 = a[i+8];
        smlal(&accum0, bhi, n0);
        smlal(&accum8, bhi, n8);
        smlal(&accum0, blo, c0);
        smlal(&accum8, blo, c8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        i++;
    }
    {
        n0 = a[i]; n8 = a[i+8];
        smlal(&accum0, bhi, c0);
        smlal(&accum8, bhi, c8);
        smlal(&accum0, blo, n0);
        smlal(&accum8, blo, n8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        i++;
    }
    {
        c0 = a[i]; c8 = a[i+8];
        smlal(&accum0, bhi, n0);
        smlal(&accum8, bhi, n8);
        smlal(&accum0, blo, c0);
        smlal(&accum8, blo, c8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        i++;
    }
    {
        n0 = a[i]; n8 = a[i+8];
        smlal(&accum0, bhi, c0);
        smlal(&accum8, bhi, c8);
        smlal(&accum0, blo, n0);
        smlal(&accum8, blo, n8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        i++;
    }
    {
        c0 = a[i]; c8 = a[i+8];
        smlal(&accum0, bhi, n0);
        smlal(&accum8, bhi, n8);
        smlal(&accum0, blo, c0);
        smlal(&accum8, blo, c8);
        
        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        i++;
    }
    {
        n0 = a[i]; n8 = a[i+8];
        smlal(&accum0, bhi, c0);
        smlal(&accum8, bhi, c8);
        smlal(&accum0, blo, n0);
        smlal(&accum8, blo, n8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        i++;
    }

    accum0 += accum8 + c[8];
    c[8] = accum0 & mask;
    c[9] += accum0 >> 28;

    accum8 += c[0];
    c[0] = accum8 & mask;
    c[1] += accum8 >> 28;
 }

 void
 p448_strong_reduce (
    p448_t *a
 ) {
    word_t mask = (1ull<<28)-1;

    /* first, clear high */
    a->limb[8] += a->limb[15]>>28;
    a->limb[0] += a->limb[15]>>28;
    a->limb[15] &= mask;

    /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */

    /* compute total_value - p.  No need to reduce mod p. */

    dsword_t scarry = 0;
    int i;
    for (i=0; i<16; i++) {
        scarry = scarry + a->limb[i] - ((i==8)?mask-1:mask);
        a->limb[i] = scarry & mask;
        scarry >>= 28;
    }

    /* uncommon case: it was >= p, so now scarry = 0 and this = x
    * common case: it was < p, so now scarry = -1 and this = x - p + 2^448
    * so let's add back in p.  will carry back off the top for 2^448.
    */

    assert(is_zero(scarry) | is_zero(scarry+1));

    word_t scarry_mask = scarry & mask;
    dword_t carry = 0;

    /* add it back */
    for (i=0; i<16; i++) {
        carry = carry + a->limb[i] + ((i==8)?(scarry_mask&~1):scarry_mask);
        a->limb[i] = carry & mask;
        carry >>= 28;
    }

    assert(is_zero(carry + scarry));
 }

 mask_t
 p448_is_zero (
    const struct p448_t *a
 ) {
    struct p448_t b;
    p448_copy(&b,a);
    p448_strong_reduce(&b);

    uint32_t any = 0;
    int i;
    for (i=0; i<16; i++) {
        any |= b.limb[i];
    }
    return is_zero(any);
 }

 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
 ) {
    int i,j;
    p448_t red;
    p448_copy(&red, x);
    p448_strong_reduce(&red);
    for (i=0; i<8; i++) {
        uint64_t limb = red.limb[2*i] + (((uint64_t)red.limb[2*i+1])<<28);
        for (j=0; j<7; j++) {
            serial[7*i+j] = limb;
            limb >>= 8;
        }
        assert(limb == 0);
    }
 }

 mask_t
 p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 ) {
    int i,j;
    for (i=0; i<8; i++) {
        uint64_t out = 0;
        for (j=0; j<7; j++) {
            out |= ((uint64_t)serial[7*i+j])<<(8*j);
        }
        x->limb[2*i] = out & ((1ull<<28)-1);
        x->limb[2*i+1] = out >> 28;
    }
    
    /* Check for reduction.
     *
     * The idea is to create a variable ge which is all ones (rather, 56 ones)
     * if and only if the low $i$ words of $x$ are >= those of p.
     *
     * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
     */
    uint32_t ge = -1, mask = (1ull<<28)-1;
    for (i=0; i<8; i++) {
        ge &= x->limb[i];
    }
    
    /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
    ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask);
    
    /* Propagate the rest */
    for (i=9; i<16; i++) {
        ge &= x->limb[i];
    }
    
    return ~is_zero(ge ^ mask);
 }

 void
 simultaneous_invert_p448(
    struct p448_t *__restrict__ out,
    const struct p448_t *in,
    unsigned int n
 ) {
  if (n==0) {
      return;
  } else if (n==1) {
      p448_inverse(out,in);
      return;
  }
  
  p448_copy(&out[1], &in[0]);
  int i;
  for (i=1; i<(int) (n-1); i++) {
      p448_mul(&out[i+1], &out[i], &in[i]);
  }
  p448_mul(&out[0], &out[n-1], &in[n-1]);
  
  struct p448_t tmp;
  p448_inverse(&tmp, &out[0]);
  p448_copy(&out[0], &tmp);
  
  /* at this point, out[0] = product(in[i]) ^ -1
   * out[i] = product(in[0]..in[i-1]) if i != 0
   */
  for (i=n-1; i>0; i--) {
      p448_mul(&tmp, &out[i], &out[0]);
      p448_copy(&out[i], &tmp);
      
      p448_mul(&tmp, &out[0], &in[i]);
      p448_copy(&out[0], &tmp);
  }
 }
--- a/src/arch_neon/p448.h
+++ b/src/arch_neon/p448.h
@@ -0,0 +1,378 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __P448_H__
 #define __P448_H__ 1

 #include "word.h"

 #include <stdint.h>
 #include <assert.h>

 typedef struct p448_t {
  uint32_t limb[16];
 } __attribute__((aligned(32))) p448_t;

 #ifdef __cplusplus
 extern "C" {
 #endif

 static __inline__ void
 p448_set_ui (
    p448_t *out,
    uint64_t x
 ) __attribute__((unused,always_inline));
           
 static __inline__ void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t do_swap
 ) __attribute__((unused,always_inline));

 static __inline__ void
 p448_add (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_sub (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_neg (
    p448_t *out,
    const p448_t *a
 ) __attribute__((unused,always_inline));
            
 static __inline__ void
 p448_cond_neg (
    p448_t *a,
    mask_t doNegate
 ) __attribute__((unused,always_inline));

 static __inline__ void
 p448_addw (
    p448_t *a,
    uint32_t x
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_subw (
    p448_t *a,
    uint32_t x
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_copy (
    p448_t *out,
    const p448_t *a
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_weak_reduce (
    p448_t *inout
 ) __attribute__((unused,always_inline));
             
 void
 p448_strong_reduce (
    p448_t *inout
 );

 mask_t
 p448_is_zero (
    const p448_t *in
 );
             
 static __inline__ void
 p448_bias (
    p448_t *inout,
    int amount
 ) __attribute__((unused,always_inline));

 void
 p448_mul (
    p448_t *__restrict__ out,
    const p448_t *a,
    const p448_t *b
 );

 void
 p448_mulw (
    p448_t *__restrict__ out,
    const p448_t *a,
    uint64_t b
 );

 void
 p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
 );
         
 static __inline__ void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) __attribute__((unused,always_inline));

 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
 );

 mask_t
 p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 );
    
 static __inline__ void
 p448_mask(
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) __attribute__((unused,always_inline));

 /**
 * Returns 1/x.
 * 
 * If x=0, returns 0.
 */
 void
 p448_inverse (
   struct p448_t*       a,
   const struct p448_t* x
 );
       
 void
 simultaneous_invert_p448 (
    struct p448_t *__restrict__ out,
    const struct p448_t *in,
    unsigned int n
 );

 static inline mask_t
 p448_eq (
    const struct p448_t *a,
    const struct p448_t *b
 ) __attribute__((always_inline,unused));

 /* -------------- Inline functions begin here -------------- */

 void
 p448_set_ui (
    p448_t *out,
    uint64_t x
 ) {
    int i;
    out->limb[0] = x & ((1<<28)-1);
    out->limb[1] = x>>28;
    for (i=2; i<16; i++) {
      out->limb[i] = 0;
    }
 }
            
 void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t doswap
 ) {
    big_register_t *aa = (big_register_t*)a;
    big_register_t *bb = (big_register_t*)b;
    big_register_t m = br_set_to_mask(doswap);

    unsigned int i;
    for (i=0; i<sizeof(*a)/sizeof(*aa); i++) {
        big_register_t x = m & (aa[i]^bb[i]);
        aa[i] ^= x;
        bb[i] ^= x;
    }
 }

 void
 p448_add (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
        ((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] + ((const uint32xn_t*)b)[i];
    }
    /*
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
        out->limb[i] = a->limb[i] + b->limb[i];
    }
    */
 }

 void
 p448_sub (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
        ((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] - ((const uint32xn_t*)b)[i];
    }
    /*
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
        out->limb[i] = a->limb[i] - b->limb[i];
    }
    */
 }

 void
 p448_neg (
    p448_t *out,
    const p448_t *a
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
        ((uint32xn_t*)out)[i] = -((const uint32xn_t*)a)[i];
    }
    /*
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
        out->limb[i] = -a->limb[i];
    }
    */
 }

 void
 p448_cond_neg(
    p448_t *a,
    mask_t doNegate
 ) {
    unsigned int i;
    struct p448_t negated;
    big_register_t *aa = (big_register_t *)a;
    big_register_t *nn = (big_register_t*)&negated;
    big_register_t m = br_set_to_mask(doNegate);
    
    p448_neg(&negated, a);
    p448_bias(&negated, 2);
    
    for (i=0; i<sizeof(*a)/sizeof(*aa); i++) {
        aa[i] = (aa[i] & ~m) | (nn[i] & m);
    }
 }

 void
 p448_addw (
    p448_t *a,
    uint32_t x
 ) {
  a->limb[0] += x;
 }
             
 void
 p448_subw (
    p448_t *a,
    uint32_t x
 ) {
  a->limb[0] -= x;
 }

 void
 p448_copy (
    p448_t *out,
    const p448_t *a
 ) {
  *out = *a;
 }

 void
 p448_bias (
    p448_t *a,
    int amt
 ) {
    uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt;
    uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1};
    uint32x4_t *aa = (uint32x4_t*) a;
    aa[0] += lo;
    aa[1] += lo;
    aa[2] += hi;
    aa[3] += lo;
 }

 void
 p448_weak_reduce (
    p448_t *a
 ) {
    uint64_t mask = (1ull<<28) - 1;
    uint64_t tmp = a->limb[15] >> 28;
    int i;
    a->limb[8] += tmp;
    for (i=15; i>0; i--) {
        a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28);
    }
    a->limb[0] = (a->limb[0] & mask) + tmp;
 }

 void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) {
    p448_t tmp;
    assert(n>0);
    if (n&1) {
        p448_sqr(y,x);
        n--;
    } else {
        p448_sqr(&tmp,x);
        p448_sqr(y,&tmp);
        n-=2;
    }
    for (; n; n-=2) {
        p448_sqr(&tmp,y);
        p448_sqr(y,&tmp);
    }
 }

 mask_t
 p448_eq (
    const struct p448_t *a,
    const struct p448_t *b
 ) {
    struct p448_t ra, rb;
    p448_copy(&ra, a);
    p448_copy(&rb, b);
    p448_weak_reduce(&ra);
    p448_weak_reduce(&rb);
    p448_sub(&ra, &ra, &rb);
    p448_bias(&ra, 2);
    return p448_is_zero(&ra);
 }

 void
 p448_mask (
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*a)/sizeof(a->limb[0]); i++) {
        a->limb[i] = b->limb[i] & mask;
    }
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __P448_H__ */
--- a/src/arch_x86_64/p448.c
+++ b/src/arch_x86_64/p448.c
@@ -17,13 +17,14 @@ p448_mul (
    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<56) - 1;  

    uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32)));
    uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))), bbb[4] __attribute__((aligned(32)));

    /* For some reason clang doesn't vectorize this without prompting? */
    unsigned int i;
    for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
        ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];     
        ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i]; 
        ((uint64xn_t*)bbb)[i] = ((const uint64xn_t*)bb)[i] + ((const uint64xn_t*)(&b[4]))[i];     
    }
    /*
    for (int i=0; i<4; i++) {
@@ -81,18 +82,15 @@ p448_mul (
    accum0 >>= 56;
    accum1 >>= 56;

    accum2  = widemul(&aa[2],&bb[3]);
    msb(&accum0, &a[2], &b[3]);
    mac(&accum1, &a[6], &b[7]);
    accum2  = widemul(&a[2],&b[7]);
    mac(&accum0, &a[6], &bb[3]);
    mac(&accum1, &aa[2], &bbb[3]);

    mac(&accum2, &aa[3], &bb[2]);
    msb(&accum0, &a[3], &b[2]);
    mac(&accum1, &a[7], &b[6]);
    mac(&accum2, &a[3], &b[6]);
    mac(&accum0, &a[7], &bb[2]);
    mac(&accum1, &aa[3], &bbb[2]);

    accum1 += accum2;
    accum0 += accum2;

    accum2  = widemul(&a[0],&b[1]);
    mac(&accum2, &a[0],&b[1]);
    mac(&accum1, &aa[0], &bb[1]);
    mac(&accum0, &a[4], &b[5]);

@@ -109,14 +107,11 @@ p448_mul (
    accum0 >>= 56;
    accum1 >>= 56;

    accum2  = widemul(&aa[3],&bb[3]);
    msb(&accum0, &a[3], &b[3]);
    mac(&accum1, &a[7], &b[7]);

    accum1 += accum2;
    accum0 += accum2;
    accum2  = widemul(&a[3],&b[7]);
    mac(&accum0, &a[7], &bb[3]);
    mac(&accum1, &aa[3], &bbb[3]);

    accum2  = widemul(&a[0],&b[2]);
    mac(&accum2, &a[0],&b[2]);
    mac(&accum1, &aa[0], &bb[2]);
    mac(&accum0, &a[4], &b[6]);

@@ -186,11 +181,9 @@ p448_mulw (
    c[3] = accum0 & mask; accum0 >>= 56;
    c[7] = accum4 & mask; accum4 >>= 56;

    c[4] += accum0 + accum4;
    c[0] += accum4;
    // c[4] += accum0 + accum4;
    // c[0] += accum4;
    
    /*
     * TODO: double-check that this is not necessary.
    accum0 += accum4 + c[4];
    c[4] = accum0 & mask;
    c[5] += accum0 >> 56;
@@ -198,7 +191,6 @@ p448_mulw (
    accum4 += c[0];
    c[0] = accum4 & mask;
    c[1] += accum4 >> 56;
    */
 }

 void
--- a/src/arch_x86_64/p448.h
+++ b/src/arch_x86_64/p448.h
@@ -290,7 +290,10 @@ p448_copy (
    p448_t *out,
    const p448_t *a
 ) {
  *out = *a;
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(big_register_t); i++) {
        ((big_register_t *)out)[i] = ((const big_register_t *)a)[i];
    }
 }

 void
@@ -299,10 +302,25 @@ p448_bias (
    int amt
 ) {
    uint64_t co1 = ((1ull<<56)-1)*amt, co2 = co1-amt;
    
 #if __AVX2__
    uint64x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1};
    uint64x4_t *aa = (uint64x4_t*) a;
    aa[0] += lo;
    aa[1] += hi;
 #elif __SSE2__
    uint64x2_t lo = {co1,co1}, hi = {co2,co1};
    uint64x2_t *aa = (uint64x2_t*) a;
    aa[0] += lo;
    aa[1] += lo;
    aa[2] += hi;
    aa[3] += lo;
 #else
    unsigned int i;
    for (i=0; i<sizeof(*a)/sizeof(uint64_t); i++) {
        a->limb[i] += (i==4) ? co2 : co1;
    }
 #endif
 }

 void
--- a/src/exported.sym
+++ b/src/exported.sym
@@ -1,6 +0,0 @@
 _goldilocks_init
 _goldilocks_keygen
 _goldilocks_shared_secret
 _goldilocks_sign
 _goldilocks_verify
 _goldilocks_private_to_public
--- a/src/goldilocks.c
+++ b/src/goldilocks.c
@@ -32,7 +32,10 @@
 #define GOLDILOCKS_RANDOM_RESEEDS_MANDATORY 0
 #endif

 /* FUTURE: auto */
 #define GOLDI_FIELD_WORDS ((GOLDI_FIELD_BITS+WORD_BITS-1)/(WORD_BITS))
 #define GOLDI_DIVERSIFY_BYTES 8

 /* FUTURE: auto.  MAGIC */
 const struct affine_t goldilocks_base_point = {
    {{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7),
       U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa),
@@ -42,11 +45,12 @@ const struct affine_t goldilocks_base_point = {
    {{ 19 }}
 };

 /* These are just unique identifiers */
 static const char *G_INITING = "initializing";
 static const char *G_INITED = "initialized";
 static const char *G_FAILED = "failed to initialize";

 /* FUTURE: auto */
 /* FUTURE: auto.  MAGIC */
 static const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = {
    U64LE(0xdc873d6d54a7bb0d),
    U64LE(0xde933d8d723a70aa),
@@ -54,19 +58,45 @@ static const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = {
    0x8335dc16
 };
 const struct barrett_prime_t goldi_q448 = {
    448/WORD_BITS,
    GOLDI_FIELD_WORDS,
    62 % WORD_BITS,
    sizeof(goldi_q448_lo)/sizeof(goldi_q448_lo[0]),
    goldi_q448_lo
 };

 /* FUTURE: auto */
 /* MAGIC */
 static const struct p448_t
 sqrt_d_minus_1 = {{
    U58LE(0xd2e21836749f46),
    U58LE(0x888db42b4f0179),
    U58LE(0x5a189aabdeea38),
    U58LE(0x51e65ca6f14c06),
    U58LE(0xa49f7b424d9770),
    U58LE(0xdcac4628c5f656),
    U58LE(0x49443b8748734a),
    U58LE(0x12fec0c0b25b7a)
 }};

 struct goldilocks_precomputed_public_key_t {
    struct goldilocks_public_key_t pub;
    struct fixed_base_table_t table;
 };

 #ifndef USE_BIG_TABLES
 #if __ARM_NEON__
 #define USE_BIG_TABLES 1
 #else
 #define USE_BIG_TABLES (WORD_BITS==64)
 #endif
 #endif

 /* FUTURE: auto.  MAGIC */
 struct {
    const char * volatile state;
 #if GOLDILOCKS_USE_PTHREAD
    pthread_mutex_t mutex;
 #endif
    struct tw_niels_t combs[(WORD_BITS==64) ? 80 : 64];
    struct tw_niels_t combs[USE_BIG_TABLES ? 80 : 64];
    struct fixed_base_table_t fixed_base;
    struct tw_niels_t wnafs[32];
    struct crandom_state_t rand;
@@ -107,7 +137,7 @@ goldilocks_init () {
    /* Precompute the tables. */
    mask_t succ;
    
    int big = (WORD_BITS==64);
    int big = USE_BIG_TABLES;
    uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14;

    succ =  precompute_fixed_base(&goldilocks_global.fixed_base, &text, n, t, s, goldilocks_global.combs);
@@ -135,55 +165,77 @@ fail:
    return -1;
 }

 static const struct p448_t
 sqrt_d_minus_1 = {{
    U58LE(0xd2e21836749f46),
    U58LE(0x888db42b4f0179),
    U58LE(0x5a189aabdeea38),
    U58LE(0x51e65ca6f14c06),
    U58LE(0xa49f7b424d9770),
    U58LE(0xdcac4628c5f656),
    U58LE(0x49443b8748734a),
    U58LE(0x12fec0c0b25b7a)
 }};

 int
 goldilocks_keygen (
 goldilocks_derive_private_key (
    struct goldilocks_private_key_t *privkey,
    struct goldilocks_public_key_t *pubkey
    const unsigned char proto[GOLDI_SYMKEY_BYTES]
 ) {
    if (!goldilocks_check_init()) {
        return GOLDI_EUNINIT;
    }
    
    word_t sk[448*2/WORD_BITS];
    memcpy(&privkey->opaque[2*GOLDI_FIELD_BYTES], proto, GOLDI_SYMKEY_BYTES);
    
    unsigned char skb[SHA512_OUTPUT_BYTES];
    word_t sk[GOLDI_FIELD_WORDS];
    assert(sizeof(skb) >= sizeof(sk));
    
    struct sha512_ctx_t ctx;
    struct tw_extensible_t exta;
    struct p448_t pk;
    
    sha512_init(&ctx);
    sha512_update(&ctx, (const unsigned char *)"derivepk", GOLDI_DIVERSIFY_BYTES);
    sha512_update(&ctx, proto, GOLDI_SYMKEY_BYTES);
    sha512_final(&ctx, (unsigned char *)skb);

    barrett_deserialize_and_reduce(sk, skb, SHA512_OUTPUT_BYTES, &goldi_q448);
    barrett_serialize(privkey->opaque, sk, GOLDI_FIELD_BYTES);

    scalarmul_fixed_base(&exta, sk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base);
    untwist_and_double_and_serialize(&pk, &exta);
    
    p448_serialize(&privkey->opaque[GOLDI_FIELD_BYTES], &pk);
    
    return GOLDI_EOK;
 }

 void
 goldilocks_underive_private_key (
    unsigned char proto[GOLDI_SYMKEY_BYTES],
    const struct goldilocks_private_key_t *privkey
 ) {
    memcpy(proto, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES);
 }

 int
 goldilocks_keygen (
    struct goldilocks_private_key_t *privkey,
    struct goldilocks_public_key_t *pubkey
 ) {
    if (!goldilocks_check_init()) {
        return GOLDI_EUNINIT;
    }
    
    unsigned char proto[GOLDI_SYMKEY_BYTES];

 #if GOLDILOCKS_USE_PTHREAD
    int ml_ret = pthread_mutex_lock(&goldilocks_global.mutex);
    if (ml_ret) return ml_ret;
 #endif

    int ret = crandom_generate(&goldilocks_global.rand, (unsigned char *)sk, sizeof(sk));
    int ret2 = crandom_generate(&goldilocks_global.rand, &privkey->opaque[112], 32);
    if (!ret) ret = ret2;
    int ret = crandom_generate(&goldilocks_global.rand, proto, sizeof(proto));

 #if GOLDILOCKS_USE_PTHREAD
    ml_ret = pthread_mutex_unlock(&goldilocks_global.mutex);
    if (ml_ret) abort();
 #endif
    
    barrett_reduce(sk,sizeof(sk)/sizeof(sk[0]),0,&goldi_q448);
    barrett_serialize(privkey->opaque, sk, 448/8);
    
    scalarmul_fixed_base(&exta, sk, 448, &goldilocks_global.fixed_base);
    //transfer_and_serialize_qtor(&pk, &sqrt_d_minus_1, &exta);
    untwist_and_double_and_serialize(&pk, &exta);
    int ret2 = goldilocks_derive_private_key(privkey, proto);
    if (!ret) ret = ret2;
    
    p448_serialize(pubkey->opaque, &pk);
    memcpy(&privkey->opaque[56], pubkey->opaque, 56);
    ret2 = goldilocks_private_to_public(pubkey, privkey);
    if (!ret) ret = ret2;
    
    return ret ? GOLDI_ENODICE : GOLDI_EOK;
 }
@@ -194,7 +246,7 @@ goldilocks_private_to_public (
    const struct goldilocks_private_key_t *privkey
 ) {
    struct p448_t pk;
    mask_t msucc = p448_deserialize(&pk,&privkey->opaque[56]);
    mask_t msucc = p448_deserialize(&pk,&privkey->opaque[GOLDI_FIELD_BYTES]);
    
    if (msucc) {
        p448_serialize(pubkey->opaque, &pk);
@@ -204,30 +256,46 @@ goldilocks_private_to_public (
    }
 }

 int
 goldilocks_shared_secret (
    uint8_t shared[64],
 static int
 goldilocks_shared_secret_core (
    uint8_t shared[GOLDI_SHARED_SECRET_BYTES],
    const struct goldilocks_private_key_t *my_privkey,
    const struct goldilocks_public_key_t *your_pubkey
    const struct goldilocks_public_key_t *your_pubkey,
    const struct goldilocks_precomputed_public_key_t *pre
 ) {
    /* This function doesn't actually need anything in goldilocks_global,
     * so it doesn't check init.
     */
    
    word_t sk[448/WORD_BITS];
    assert(GOLDI_SHARED_SECRET_BYTES == SHA512_OUTPUT_BYTES);
    
    word_t sk[GOLDI_FIELD_WORDS];
    struct p448_t pk;
    
    mask_t succ = p448_deserialize(&pk,your_pubkey->opaque), msucc = -1;
    
 #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS
    struct p448_t sum, prod;
    msucc &= p448_deserialize(&sum,&my_privkey->opaque[56]);
    msucc &= p448_deserialize(&sum,&my_privkey->opaque[GOLDI_FIELD_BYTES]);
    p448_mul(&prod,&pk,&sum);
    p448_add(&sum,&pk,&sum);
 #endif
    
    msucc &= barrett_deserialize(sk,my_privkey->opaque,&goldi_q448);
    succ &= montgomery_ladder(&pk,&pk,sk,446,2);
    
 #if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
    if (pre) {
        struct tw_extensible_t tw;
        succ &= scalarmul_fixed_base(&tw, sk, GOLDI_SCALAR_BITS, &pre->table);
        untwist_and_double_and_serialize(&pk, &tw);
    } else {
        succ &= montgomery_ladder(&pk,&pk,sk,GOLDI_SCALAR_BITS,1);
    }
 #else
    (void)pre;
    succ &= montgomery_ladder(&pk,&pk,sk,GOLDI_SCALAR_BITS,1);
 #endif
    
    
    p448_serialize(shared,&pk);
    
@@ -236,28 +304,28 @@ goldilocks_shared_secret (
    sha512_init(&ctx);

 #ifdef EXPERIMENT_ECDH_OBLITERATE_CT
    uint8_t oblit[40];
    uint8_t oblit[GOLDI_DIVERSIFY_BYTES + GOLDI_SYMKEY_BYTES];
    unsigned i;
    for (i=0; i<8; i++) {
    for (i=0; i<GOLDI_DIVERSIFY_BYTES; i++) {
        oblit[i] = "noshared"[i] & ~(succ&msucc);
    }
    for (i=0; i<32; i++) {
        oblit[8+i] = my_privkey->opaque[112+i] & ~(succ&msucc);
    for (i=0; i<GOLDI_SYMKEY_BYTES; i++) {
        oblit[GOLDI_DIVERSIFY_BYTES+i] = my_privkey->opaque[2*GOLDI_FIELD_BYTES+i] & ~(succ&msucc);
    }
    sha512_update(&ctx, oblit, 40);
    sha512_update(&ctx, oblit, sizeof(oblit));
 #endif
    
 #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS
    /* stir in the sum and product of the pubkeys. */
    uint8_t a_pk[56];
    uint8_t a_pk[GOLDI_FIELD_BYTES];
    p448_serialize(a_pk, &sum);
    sha512_update(&ctx, a_pk, 56);
    sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES);
    p448_serialize(a_pk, &prod);
    sha512_update(&ctx, a_pk, 56);
    sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES);
 #endif
       
    /* stir in the shared key and finish */
    sha512_update(&ctx, shared, 56);
    sha512_update(&ctx, shared, GOLDI_FIELD_BYTES);
    sha512_final(&ctx, shared);
    
    return (GOLDI_ECORRUPT & ~msucc)
@@ -265,9 +333,42 @@ goldilocks_shared_secret (
        | (GOLDI_EOK & msucc & succ);
 }

 int
 goldilocks_shared_secret (
    uint8_t shared[GOLDI_SHARED_SECRET_BYTES],
    const struct goldilocks_private_key_t *my_privkey,
    const struct goldilocks_public_key_t *your_pubkey
 ) {
    return goldilocks_shared_secret_core(
        shared,
        my_privkey,
        your_pubkey,
        NULL
    );
 }

 static void
 goldilocks_derive_challenge(
    word_t challenge[GOLDI_FIELD_WORDS],
    const unsigned char pubkey[GOLDI_FIELD_BYTES],
    const unsigned char gnonce[GOLDI_FIELD_BYTES],
    const unsigned char *message,
    uint64_t message_len
 ) {
    /* challenge = H(pk, [nonceG], message). */
    unsigned char sha_out[SHA512_OUTPUT_BYTES];
    struct sha512_ctx_t ctx;
    sha512_init(&ctx);
    sha512_update(&ctx, pubkey, GOLDI_FIELD_BYTES);
    sha512_update(&ctx, gnonce, GOLDI_FIELD_BYTES);
    sha512_update(&ctx, message, message_len);
    sha512_final(&ctx, sha_out);
    barrett_deserialize_and_reduce(challenge, sha_out, sizeof(sha_out), &goldi_q448);
 }

 int
 goldilocks_sign (
    uint8_t signature_out[56*2],
    uint8_t signature_out[GOLDI_SIGNATURE_BYTES],
    const uint8_t *message,
    uint64_t message_len,
    const struct goldilocks_private_key_t *privkey
@@ -277,7 +378,7 @@ goldilocks_sign (
    }
    
    /* challenge = H(pk, [nonceG], message). */
    word_t skw[448/WORD_BITS];
    word_t skw[GOLDI_FIELD_WORDS];
    mask_t succ = barrett_deserialize(skw,privkey->opaque,&goldi_q448);
    if (!succ) {
        memset(skw,0,sizeof(skw));
@@ -285,48 +386,50 @@ goldilocks_sign (
    }
        
    /* Derive a nonce.  TODO: use HMAC. FUTURE: factor. */
    unsigned char sha_out[512/8];
    word_t tk[448/WORD_BITS];
    unsigned char sha_out[SHA512_OUTPUT_BYTES];
    word_t tk[GOLDI_FIELD_WORDS];
    struct sha512_ctx_t ctx;
    sha512_init(&ctx);
    sha512_update(&ctx, (const unsigned char *)"signonce", 8);
    sha512_update(&ctx, &privkey->opaque[112], 32);
    sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES);
    sha512_update(&ctx, message, message_len);
    sha512_update(&ctx, &privkey->opaque[112], 32);
    sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES);
    sha512_final(&ctx, sha_out);
    barrett_deserialize_and_reduce(tk, sha_out, 512/8, &goldi_q448);
    barrett_deserialize_and_reduce(tk, sha_out, SHA512_OUTPUT_BYTES, &goldi_q448);
    
    /* 4[nonce]G */
    uint8_t signature_tmp[56];
    uint8_t signature_tmp[GOLDI_FIELD_BYTES];
    struct tw_extensible_t exta;
    struct p448_t gsk;
    scalarmul_fixed_base(&exta, tk, 448, &goldilocks_global.fixed_base);
    scalarmul_fixed_base(&exta, tk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base);
    double_tw_extensible(&exta);
    untwist_and_double_and_serialize(&gsk, &exta);
    p448_serialize(signature_tmp, &gsk);
    
    word_t challenge[448/WORD_BITS];
    sha512_update(&ctx, &privkey->opaque[56], 56);
    sha512_update(&ctx, signature_tmp, 56);
    sha512_update(&ctx, message, message_len);
    sha512_final(&ctx, sha_out);
    barrett_deserialize_and_reduce(challenge, sha_out, 512/8, &goldi_q448);
    word_t challenge[GOLDI_FIELD_WORDS];
    goldilocks_derive_challenge (
        challenge,
        &privkey->opaque[GOLDI_FIELD_BYTES],
        signature_tmp,
        message,
        message_len
    );
    
    // reduce challenge and sub.
    barrett_negate(challenge,448/WORD_BITS,&goldi_q448);
    barrett_negate(challenge,GOLDI_FIELD_WORDS,&goldi_q448);

    barrett_mac(
        tk,448/WORD_BITS,
        challenge,448/WORD_BITS,
        skw,448/WORD_BITS,
        tk,GOLDI_FIELD_WORDS,
        challenge,GOLDI_FIELD_WORDS,
        skw,GOLDI_FIELD_WORDS,
        &goldi_q448
    );
        
    word_t carry = add_nr_ext_packed(tk,tk,448/WORD_BITS,tk,448/WORD_BITS,-1);
    barrett_reduce(tk,448/WORD_BITS,carry,&goldi_q448);
    word_t carry = add_nr_ext_packed(tk,tk,GOLDI_FIELD_WORDS,tk,GOLDI_FIELD_WORDS,-1);
    barrett_reduce(tk,GOLDI_FIELD_WORDS,carry,&goldi_q448);
        
    memcpy(signature_out, signature_tmp, 56);
    barrett_serialize(signature_out+56, tk, 448/8);
    memcpy(signature_out, signature_tmp, GOLDI_FIELD_BYTES);
    barrett_serialize(signature_out+GOLDI_FIELD_BYTES, tk, GOLDI_FIELD_BYTES);
    memset((unsigned char *)tk,0,sizeof(tk));
    memset((unsigned char *)skw,0,sizeof(skw));
    memset((unsigned char *)challenge,0,sizeof(challenge));
@@ -342,7 +445,7 @@ goldilocks_sign (

 int
 goldilocks_verify (
    const uint8_t signature[56*2],
    const uint8_t signature[GOLDI_SIGNATURE_BYTES],
    const uint8_t *message,
    uint64_t message_len,
    const struct goldilocks_public_key_t *pubkey
@@ -352,24 +455,16 @@ goldilocks_verify (
    }
    
    struct p448_t pk;
    word_t s[448/WORD_BITS];
    word_t s[GOLDI_FIELD_WORDS];
    
    mask_t succ = p448_deserialize(&pk,pubkey->opaque);
    if (!succ) return GOLDI_EINVAL;
    
    succ = barrett_deserialize(s, &signature[56], &goldi_q448);
    succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &goldi_q448);
    if (!succ) return GOLDI_EINVAL;
    
    /* challenge = H(pk, [nonceG], message). */
    unsigned char sha_out[512/8];
    word_t challenge[448/WORD_BITS];
    struct sha512_ctx_t ctx;
    sha512_init(&ctx);
    sha512_update(&ctx, pubkey->opaque, 56);
    sha512_update(&ctx, signature, 56);
    sha512_update(&ctx, message, message_len);
    sha512_final(&ctx, sha_out);
    barrett_deserialize_and_reduce(challenge, sha_out, 512/8, &goldi_q448);
    word_t challenge[GOLDI_FIELD_WORDS];
    goldilocks_derive_challenge(challenge, pubkey->opaque, signature, message, message_len);
    
    struct p448_t eph;
    struct tw_extensible_t pk_text;
@@ -381,7 +476,102 @@ goldilocks_verify (
    succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk);
    if (!succ) return GOLDI_EINVAL;
    
    linear_combo_var_fixed_vt( &pk_text, challenge, 446, s, 446, goldilocks_global.wnafs, 5 );
    linear_combo_var_fixed_vt( &pk_text,
        challenge, GOLDI_SCALAR_BITS,
        s, GOLDI_SCALAR_BITS,
        goldilocks_global.wnafs, 5 );
    
    untwist_and_double_and_serialize( &pk, &pk_text );
    p448_sub(&eph, &eph, &pk);
    p448_bias(&eph, 2);
    
    succ = p448_is_zero(&eph);
    
    return succ ? 0 : GOLDI_EINVAL;
 }

 #if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS

 struct goldilocks_precomputed_public_key_t *
 goldilocks_precompute_public_key (
    const struct goldilocks_public_key_t *pub    
 ) {
    struct goldilocks_precomputed_public_key_t *precom;
    precom = (struct goldilocks_precomputed_public_key_t *)
        malloc(sizeof(*precom));
    
    if (!precom) return NULL;
    
    struct tw_extensible_t pk_text;
    
    struct p448_t pk;
    mask_t succ = p448_deserialize(&pk, pub->opaque);
    if (!succ) {
        free(precom);
        return NULL;
    }
    
    succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk);
    if (!succ) {
        free(precom);
        return NULL;
    }
    
    int big = USE_BIG_TABLES;
    uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14;

    succ =  precompute_fixed_base(&precom->table, &pk_text, n, t, s, NULL);
    if (!succ) {
        free(precom);
        return NULL;
    }
    
    memcpy(&precom->pub,pub,sizeof(*pub));
    
    return precom;
 }

 void
 goldilocks_destroy_precomputed_public_key (
    struct goldilocks_precomputed_public_key_t *precom
 ) {
    if (!precom) return;
    destroy_fixed_base(&precom->table);
    memset(&precom->pub.opaque, 0, sizeof(precom->pub));
    free(precom);
 }

 int
 goldilocks_verify_precomputed (
    const uint8_t signature[GOLDI_SIGNATURE_BYTES],
    const uint8_t *message,
    uint64_t message_len,
    const struct goldilocks_precomputed_public_key_t *pubkey
 ) {
    if (!goldilocks_check_init()) {
        return GOLDI_EUNINIT;
    }

    word_t s[GOLDI_FIELD_WORDS];
    mask_t succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &goldi_q448);
    if (!succ) return GOLDI_EINVAL;
    
    word_t challenge[GOLDI_FIELD_WORDS];
    goldilocks_derive_challenge(challenge, pubkey->pub.opaque, signature, message, message_len);
    
    struct p448_t eph, pk;
    struct tw_extensible_t pk_text;
    
    /* deserialize [nonce]G */
    succ = p448_deserialize(&eph, signature);
    if (!succ) return GOLDI_EINVAL;
        
    succ = linear_combo_combs_vt (
        &pk_text,
        challenge, GOLDI_SCALAR_BITS, &pubkey->table,
        s, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base
    );
    if (!succ) return GOLDI_EINVAL;
    
    untwist_and_double_and_serialize( &pk, &pk_text );
    p448_sub(&eph, &eph, &pk);
@@ -391,3 +581,20 @@ goldilocks_verify (
    
    return succ ? 0 : GOLDI_EINVAL;
 }

 int
 goldilocks_shared_secret_precomputed (
    uint8_t shared[GOLDI_SHARED_SECRET_BYTES],
    const struct goldilocks_private_key_t *my_privkey,
    const struct goldilocks_precomputed_public_key_t *your_pubkey
 ) {
    return goldilocks_shared_secret_core(
        shared,
        my_privkey,
        &your_pubkey->pub,
        your_pubkey
    );
 }

 #endif // GOLDI_IMPLEMENT_PRECOMPUTED_KEYS

--- a/src/include/intrinsics.h
+++ b/src/include/intrinsics.h
@@ -12,7 +12,9 @@

 #include <sys/types.h>

 #if __i386__ || __x86_64__
 #include <immintrin.h>
 #endif

 #define INTRINSIC \
  static __inline__ __attribute__((__gnu_inline__, __always_inline__, unused))
--- a/src/include/scalarmul.h
+++ b/src/include/scalarmul.h
@@ -26,7 +26,7 @@ struct fixed_base_table_t {
  struct tw_niels_t *table;
  
  /** Adjustments to the scalar in even and odd cases, respectively. */
  word_t scalar_adjustments[2*(448/WORD_BITS)];
  word_t scalar_adjustments[2*(448/WORD_BITS)];  /* MAGIC */
  
  /** The number of combs in the table. */
  unsigned int n;
@@ -103,7 +103,7 @@ montgomery_ladder (
 void
 scalarmul (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS]
    const word_t scalar[448/WORD_BITS] /* MAGIC */
    /* TODO? int nbits */
 );
    
@@ -124,7 +124,7 @@ scalarmul (
 void
 scalarmul_vlook (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS]
    const word_t scalar[448/WORD_BITS] /* MAGIC */
    /* TODO? int nbits */
 );

@@ -209,7 +209,7 @@ scalarmul_fixed_base (
 void
 scalarmul_vt (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS]
    const word_t scalar[448/WORD_BITS] /* MAGIC */
 );


@@ -274,14 +274,42 @@ scalarmul_fixed_base_wnaf_vt (
 void
 linear_combo_var_fixed_vt (
    struct tw_extensible_t *working,
    const word_t scalar_var[448/WORD_BITS],
    const word_t scalar_var[448/WORD_BITS], /* MAGIC */
    unsigned int nbits_var,
    const word_t scalar_pre[448/WORD_BITS],
    const word_t scalar_pre[448/WORD_BITS], /* MAGIC */
    unsigned int nbits_pre,
    const struct tw_niels_t *precmp,
    unsigned int table_bits_pre
 );

 /**
 * Variable-time scalar linear combination of two fixed points.
 *
 * @warning This function takes variable time.  It is intended for
 * signature verification.
 *
 * @param [out] working The output point.
 * @param [in] scalar1 The first scalar.
 * @param [in] nbits1 The number of bits in the first scalar.
 * @param [in] table1 The first precomputed table.
 * @param [in] scalar2 The second scalar.
 * @param [in] nbits1 The number of bits in the second scalar.
 * @param [in] table1 The second precomputed table.
 *
 * @retval MASK_SUCCESS Success.
 * @retval MASK_FAILURE Failure, because eg the tables are too small.
 */
 mask_t
 linear_combo_combs_vt (
    struct tw_extensible_t *out,
    const word_t scalar1[448/WORD_BITS],
    unsigned int nbits1,
    const struct fixed_base_table_t *table1,
    const word_t scalar2[448/WORD_BITS],
    unsigned int nbits2,
    const struct fixed_base_table_t *table2
 );

 #ifdef __cplusplus
 };
 #endif
--- a/src/include/sha512.h
+++ b/src/include/sha512.h
@@ -10,6 +10,8 @@
 extern "C" {
 #endif

 #define SHA512_OUTPUT_BYTES 64

 /**
 * SHA512 hashing context.
 *
@@ -37,7 +39,7 @@ sha512_update (
 void
 sha512_final (
    struct sha512_ctx_t *ctx,
    uint8_t result[64]
    uint8_t result[SHA512_OUTPUT_BYTES]
 );
    
 #ifdef __cplusplus
--- a/src/include/word.h
+++ b/src/include/word.h
@@ -8,11 +8,22 @@
 /* for posix_memalign */
 #define _XOPEN_SOURCE 600

 #ifndef __APPLE__
 #define _BSD_SOURCE
 #include <endian.h>
 #endif

 #include <stdint.h>
 #include <stdlib.h>
 #include <sys/types.h>
 #include <inttypes.h>

 #if __ARM_NEON__
 #include <arm_neon.h>
 #elif __SSE2__
 #include <immintrin.h>
 #endif

 #if (__SIZEOF_INT128__ == 16 && __SIZEOF_SIZE_T__ == 8 && (__SIZEOF_LONG__==8 || __POINTER_WIDTH__==64) && !GOLDI_FORCE_32_BIT)
 /* It's a 64-bit machine if:
 * // limits.h thinks so
@@ -33,6 +44,7 @@ typedef __int128_t dsword_t;
 #define PRIxWORD58   "%014" PRIx64
 #define U64LE(x) x##ull
 #define U58LE(x) x##ull
 #define letohWORD letoh64
 #else
 typedef uint16_t hword_t;
 typedef uint32_t word_t;
@@ -45,15 +57,19 @@ typedef int64_t dsword_t;
 #define PRIxWORD58   "%07" PRIx32
 #define U64LE(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
 #define U58LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
 #define letohWORD letoh32
 #endif

 #define WORD_BITS (sizeof(word_t) * 8)

 /* TODO: vector width for procs like ARM; gcc support */
 typedef word_t mask_t, vecmask_t __attribute__((ext_vector_type(4)));

 typedef word_t mask_t;
 static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -1;



 #ifdef __ARM_NEON__
 typedef uint32x4_t vecmask_t;
 #else
 /* FIXME this only works on clang */
 typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
 typedef int64_t  int64x2_t __attribute__((ext_vector_type(2)));
@@ -61,8 +77,13 @@ typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
 typedef int64_t  int64x4_t __attribute__((ext_vector_type(4)));
 typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
 typedef int32_t  int32x4_t __attribute__((ext_vector_type(4)));
 typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
 typedef int32_t  int32x2_t __attribute__((ext_vector_type(2)));
 typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
 typedef int32_t  int32x8_t __attribute__((ext_vector_type(8)));
 /* TODO: vector width for procs like ARM; gcc support */
 typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
 #endif

 #if __AVX2__
 typedef uint32x8_t big_register_t;
@@ -82,11 +103,28 @@ typedef uint32_t big_register_t;
 #endif


 #if __AVX2__ || __SSE2__ || __ARM_NEON__
 #ifdef __ARM_NEON__
 static __inline__ big_register_t
 br_set_to_mask(mask_t x) {
    return vdupq_n_u32(x);
 }
 #else
 static __inline__ big_register_t
 br_set_to_mask(mask_t x) {
    return (big_register_t)x;
 }
 #endif

 #if __AVX2__ || __SSE2__
 static __inline__ big_register_t
 br_is_zero(big_register_t x) {
    return (big_register_t)(x == (big_register_t)0);
 }
 #elif __ARM_NEON__
 static __inline__ big_register_t
 br_is_zero(big_register_t x) {
    return vceqq_u32(x,x^x);
 }
 #else
 static __inline__ mask_t
 br_is_zero(word_t x) {
@@ -96,6 +134,22 @@ br_is_zero(word_t x) {




 #ifdef __APPLE__
 static inline uint64_t
 htobe64 (uint64_t x) {
    __asm__ ("bswapq %0" : "+r"(x));
    return x;
 }
 static inline uint64_t
 htole64 (uint64_t x) { return x; }

 static inline uint64_t
 letoh64 (uint64_t x) { return x; }
 #endif



 /**
 * Allocate memory which is sufficiently aligned to be used for the
 * largest vector on the system (for now that's a big_register_t).
--- a/src/scalarmul.c
+++ b/src/scalarmul.c
@@ -63,14 +63,14 @@ cond_negate_tw_pniels (
    cond_negate_tw_niels(&n->n, doNegate);
 }

 void    
 static __inline__ void
 constant_time_lookup_tw_pniels (
    struct tw_pniels_t *out,
    const struct tw_pniels_t *in,
    int nin,
    int idx
 ) {
    big_register_t big_one = 1, big_i = idx;
    big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx);
    big_register_t *o = (big_register_t *)out;
    const big_register_t *i = (const big_register_t *)in;
    int j;
@@ -85,14 +85,14 @@ constant_time_lookup_tw_pniels (
    }
 }

 static __inline__ void    
 static __inline__ void
 constant_time_lookup_tw_niels (
    struct tw_niels_t *out,
    const struct tw_niels_t *in,
    int nin,
    int idx
 ) {
    big_register_t big_one = 1, big_i = idx;
    big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx);
    big_register_t *o = (big_register_t *)out;
    const big_register_t *i = (const big_register_t *)in;
    int j;
@@ -139,64 +139,73 @@ scalarmul (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS]
 ) {

    const int nbits=448; /* HACK? */
    const int nbits=450; /* MAGIC */
    word_t prepared_data[448*2/WORD_BITS] = {
        U64LE(0x9595b847fdf73126),
        U64LE(0x9bb9b8a856af5200),
        U64LE(0xb3136e22f37d5c4f),
        U64LE(0x0000000189a19442),
        
        U64LE(0xebec9967f5d3f5c2),
        U64LE(0x0aa09b49b16c9a02),
        U64LE(0x7f6126aec172cd8e),
        U64LE(0x00000007b027e54d),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000),
        U64LE(0x4000000000000000),

        U64LE(0x721cf5b5529eec33),
        U64LE(0x7a4cf635c8e9c2ab),
        U64LE(0xeec492d944a725bf),
        U64LE(0x000000020cd77058),
            
        U64LE(0xc873d6d54a7bb0cf),
        U64LE(0xe933d8d723a70aad),
        U64LE(0xbb124b65129c96fd),
        U64LE(0x00000008335dc163),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000)
    }; /* TODO: split off */
    }; /* MAGIC */
    
    word_t scalar2[448/WORD_BITS];
    convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS);
    
    const int WINDOW = 5, /* MAGIC */
        WINDOW_MASK = (1<<WINDOW)-1, WINDOW_T_MASK = WINDOW_MASK >> 1,
        NTABLE = 1<<(WINDOW-1);

    struct tw_extensible_t tabulator;
    copy_tw_extensible(&tabulator, working);
    double_tw_extensible(&tabulator);

    struct tw_pniels_t pn, multiples[8];
    struct tw_pniels_t pn, multiples[NTABLE];
    convert_tw_extensible_to_tw_pniels(&pn, &tabulator);
    convert_tw_extensible_to_tw_pniels(&multiples[0], working);

    int i;
    for (i=1; i<8; i++) {
    int i,j;
    for (i=1; i<NTABLE; i++) {
        add_tw_pniels_to_tw_extensible(working, &pn);
        convert_tw_extensible_to_tw_pniels(&multiples[i], working);
    }

    i = nbits - 4;
    int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF,
        inv = (bits>>3)-1;
    i = nbits - WINDOW;
    int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & WINDOW_MASK,
        inv = (bits>>(WINDOW-1))-1;
    bits ^= inv;
    
    constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7);
    constant_time_lookup_tw_pniels(&pn, multiples, NTABLE, bits & WINDOW_T_MASK);
    cond_negate_tw_pniels(&pn, inv);
    convert_tw_pniels_to_tw_extensible(working, &pn);
 		

    for (i-=4; i>=0; i-=4) {
        double_tw_extensible(working);
        double_tw_extensible(working);
        double_tw_extensible(working);
        double_tw_extensible(working);
    for (i-=WINDOW; i>=0; i-=WINDOW) {
        for (j=0; j<WINDOW; j++) {
            double_tw_extensible(working);
        }

        bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF;
        inv = (bits>>3)-1;
        bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS);
        
        if (i/WORD_BITS < 448/WORD_BITS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) {
            bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS));
        }
                
        bits &= WINDOW_MASK;
        inv = (bits>>(WINDOW-1))-1;
        bits ^= inv;
    
        constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7);
        constant_time_lookup_tw_pniels(&pn, multiples, NTABLE, bits & WINDOW_T_MASK);
        cond_negate_tw_pniels(&pn, inv);
        add_tw_pniels_to_tw_extensible(working, &pn);
    }
@@ -207,81 +216,89 @@ scalarmul_vlook (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS]
 ) {

    const int nbits=448; /* HACK? */
    const int nbits=450; /* HACK? */
    word_t prepared_data[448*2/WORD_BITS] = {
        U64LE(0x9595b847fdf73126),
        U64LE(0x9bb9b8a856af5200),
        U64LE(0xb3136e22f37d5c4f),
        U64LE(0x0000000189a19442),
        
        U64LE(0xebec9967f5d3f5c2),
        U64LE(0x0aa09b49b16c9a02),
        U64LE(0x7f6126aec172cd8e),
        U64LE(0x00000007b027e54d),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000),
        U64LE(0x4000000000000000),

        U64LE(0x721cf5b5529eec33),
        U64LE(0x7a4cf635c8e9c2ab),
        U64LE(0xeec492d944a725bf),
        U64LE(0x000000020cd77058),
            
        U64LE(0xc873d6d54a7bb0cf),
        U64LE(0xe933d8d723a70aad),
        U64LE(0xbb124b65129c96fd),
        U64LE(0x00000008335dc163),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000)
    }; /* TODO: split off */
    }; /* MAGIC: split off */
    
    word_t scalar2[448/WORD_BITS];
    convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS);
    
    const int WINDOW = 5, /* MAGIC */
        WINDOW_MASK = (1<<WINDOW)-1, WINDOW_T_MASK = WINDOW_MASK >> 1,
        NTABLE = 1<<(WINDOW-1);

    struct tw_extensible_t tabulator;
    copy_tw_extensible(&tabulator, working);
    double_tw_extensible(&tabulator);

    struct tw_pniels_t pn, multiples[8];
    struct tw_pniels_t pn, multiples[NTABLE];
    convert_tw_extensible_to_tw_pniels(&pn, &tabulator);
    convert_tw_extensible_to_tw_pniels(&multiples[0], working);

    int i;
    for (i=1; i<8; i++) {
    int i,j;
    for (i=1; i<NTABLE; i++) {
        add_tw_pniels_to_tw_extensible(working, &pn);
        convert_tw_extensible_to_tw_pniels(&multiples[i], working);
    }

    i = nbits - 4;
    int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF,
        inv = (bits>>3)-1;
    i = nbits - WINDOW;
    int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & WINDOW_MASK,
        inv = (bits>>(WINDOW-1))-1;
    bits ^= inv;

 	copy_tw_pniels(&pn, &multiples[bits&7]);
    copy_tw_pniels(&pn, &multiples[bits & WINDOW_T_MASK]);
    cond_negate_tw_pniels(&pn, inv);
    convert_tw_pniels_to_tw_extensible(working, &pn);
 		

    for (i-=4; i>=0; i-=4) {
        double_tw_extensible(working);
        double_tw_extensible(working);
        double_tw_extensible(working);
        double_tw_extensible(working);
    for (i-=WINDOW; i>=0; i-=WINDOW) {
        for (j=0; j<WINDOW; j++) {
            double_tw_extensible(working);
        }

        bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF;
        inv = (bits>>3)-1;
        bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS);
        
        if (i/WORD_BITS < 448/WORD_BITS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) {
            bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS));
        }
                
        bits &= WINDOW_MASK;
        inv = (bits>>(WINDOW-1))-1;
        bits ^= inv;
    
 		copy_tw_pniels(&pn, &multiples[bits&7]);
        copy_tw_pniels(&pn, &multiples[bits & WINDOW_T_MASK]);
        cond_negate_tw_pniels(&pn, inv);
        add_tw_pniels_to_tw_extensible(working, &pn);
    }
 }


 mask_t
 scalarmul_fixed_base (
    struct tw_extensible_t *out,
    const word_t scalar[448/WORD_BITS],
 static mask_t
 schedule_scalar_for_combs (
    word_t *scalar2,
    const word_t *scalar,
    unsigned int nbits,
    const struct fixed_base_table_t *table
 ) {
    unsigned int i;
    unsigned int n = table->n, t = table->t, s = table->s;
    assert(n >= 1 && t >= 1 && s >= 1);
    
    if (n*t*s < nbits) {
    if (n*t*s < nbits || n < 1 || t < 1 || s < 1) {
        return MASK_FAILURE;
    }
    
@@ -289,10 +306,9 @@ scalarmul_fixed_base (
        scalar2_words = scalar_words;
    if (scalar2_words < 448 / WORD_BITS)
        scalar2_words = 448 / WORD_BITS;
    word_t scalar2[scalar2_words], scalar3[scalar2_words];
    word_t scalar3[scalar2_words];
    
    /* Copy scalar to scalar3, but clear its high bits (if there are any) */
    unsigned int i,j,k;
    for (i=0; i<scalar_words; i++) {
        scalar3[i] = scalar[i];
    }
@@ -309,6 +325,31 @@ scalarmul_fixed_base (
        table->scalar_adjustments , 448 / WORD_BITS
    );
    
    return MASK_SUCCESS;
 }

 mask_t
 scalarmul_fixed_base (
    struct tw_extensible_t *out,
    const word_t scalar[448/WORD_BITS],
    unsigned int nbits,
    const struct fixed_base_table_t *table
 ) {
    unsigned int i,j,k;
    unsigned int n = table->n, t = table->t, s = table->s;
    
    unsigned int scalar2_words = (nbits + WORD_BITS - 1)/WORD_BITS;
    if (scalar2_words < 448 / WORD_BITS) scalar2_words = 448 / WORD_BITS;
    
    word_t scalar2[scalar2_words];

    mask_t succ = schedule_scalar_for_combs(scalar2, scalar, nbits, table);
    if (!succ) return MASK_FAILURE;
    
 #ifdef __clang_analyzer__
    assert(t >= 1);
 #endif
    
    struct tw_niels_t ni;
    
    for (i=0; i<s; i++) {
@@ -345,6 +386,90 @@ scalarmul_fixed_base (
    return MASK_SUCCESS;
 }

 mask_t
 linear_combo_combs_vt (
    struct tw_extensible_t *out,
    const word_t scalar1[448/WORD_BITS],
    unsigned int nbits1,
    const struct fixed_base_table_t *table1,
    const word_t scalar2[448/WORD_BITS],
    unsigned int nbits2,
    const struct fixed_base_table_t *table2
 ) { 
    unsigned int i,j,k,sc;
    unsigned int s1 = table1->s, s2 = table2->s, smax = (s1 > s2) ? s1 : s2;
    
    unsigned int scalar1b_words = (nbits1 + WORD_BITS - 1)/WORD_BITS;
    if (scalar1b_words < 448 / WORD_BITS) scalar1b_words = 448 / WORD_BITS;
    
    unsigned int scalar2b_words = (nbits2 + WORD_BITS - 1)/WORD_BITS;
    if (scalar2b_words < 448 / WORD_BITS) scalar2b_words = 448 / WORD_BITS;
    
    word_t scalar1b[scalar1b_words], scalar2b[scalar2b_words];

    /* Schedule the scalars */
    mask_t succ;
    succ = schedule_scalar_for_combs(scalar1b, scalar1, nbits1, table1);
    if (!succ) return MASK_FAILURE;
  
    succ = schedule_scalar_for_combs(scalar2b, scalar2, nbits2, table2);
    if (!succ) return MASK_FAILURE;

 #ifdef __clang_analyzer__
    assert(table1->t >= 1);
    assert(table2->t >= 1);
 #endif
  
    struct tw_niels_t ni;
    
    unsigned int swords[2] = {scalar1b_words, scalar2b_words}, started = 0;
    word_t *scalars[2] = {scalar1b,scalar2b};
    
    for (i=0; i<smax; i++) {
        if (i) double_tw_extensible(out);
            
        for (sc=0; sc<2; sc++) {
            const struct fixed_base_table_t *table = sc ? table2 : table1;
            
            int ii = i-smax+table->s;
            if (ii < 0) continue;
            assert(ii < (int)table->s);
        
            for (j=0; j<table->n; j++) {
            
                int tab = 0;

                for (k=0; k<table->t; k++) {
                    unsigned int bit = (table->s-1-ii) + k*table->s + j*(table->s*table->t);
                    if (bit < swords[sc] * WORD_BITS) {
                        tab |= (scalars[sc][bit/WORD_BITS] >> (bit%WORD_BITS) & 1) << k;
                    }
                }
            
                mask_t invert = (tab>>(table->t-1))-1;
                tab ^= invert;
                tab &= (1<<(table->t-1)) - 1;
            
                copy_tw_niels(&ni, &table->table[tab + (j<<(table->t-1))]);
                cond_negate_tw_niels(&ni,invert);
                
                if (started) {
                    add_tw_niels_to_tw_extensible(out, &ni);
                } else {
                    convert_tw_niels_to_tw_extensible(out, &ni);
                    started = 1;
                }
            
            }
        }
        
        assert(started);
    }
    
    return MASK_SUCCESS;
 }


 mask_t
 precompute_fixed_base (
  struct fixed_base_table_t *out,
@@ -354,7 +479,7 @@ precompute_fixed_base (
  unsigned int s,
  struct tw_niels_t *prealloc
 ) {
    if (s < 1 || t < 1 || n < 1 || n*t*s < 446) {
    if (s < 1 || t < 1 || n < 1 || n*t*s < 446) { /* MAGIC */
        memset(out, 0, sizeof(*out));
        return 0;
    }
@@ -402,7 +527,7 @@ precompute_fixed_base (
    
    adjustment[(n*t*s) / WORD_BITS] += ((word_t)1) << ((n*t*s) % WORD_BITS);

    /* FIXME: factor out somehow */
    /* MAGIC: factor out somehow */
    const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = {
        U64LE(0xdc873d6d54a7bb0d),
        U64LE(0xde933d8d723a70aa),
@@ -462,13 +587,13 @@ precompute_fixed_base (
        /* Gray-code phase */
        for (j=0;; j++) {
            int gray = j ^ (j>>1);
            int idx = ((i+1)<<(t-1))-1 ^ gray;
            int idx = (((i+1)<<(t-1))-1) ^ gray;

            convert_tw_extensible_to_tw_pniels(&pn_tmp, &start);
            copy_tw_niels(&table[idx], &pn_tmp.n);
            p448_copy(&zs[idx], &pn_tmp.z);
 			
            if (j >= (1<<(t-1)) - 1) break;
            if (j >= (1u<<(t-1)) - 1) break;
            int delta = (j+1) ^ ((j+1)>>1) ^ gray;

            for (k=0; delta>1; k++)
@@ -777,7 +902,7 @@ linear_combo_var_fixed_vt(
    const struct tw_niels_t *precmp,
    unsigned int table_bits_pre
 ) {
    const int table_bits_var = 3;
    const int table_bits_var = 4;
    struct smvt_control control_var[nbits_var/(table_bits_var+1)+3];
    struct smvt_control control_pre[nbits_pre/(table_bits_pre+1)+3];
    
--- a/src/sha512.c
+++ b/src/sha512.c
@@ -2,12 +2,8 @@
 * Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __APPLE__
 #define _BSD_SOURCE
 #include <endian.h>
 #endif

 #include "sha512.h"
 #include "word.h"

 #include <string.h>
 #include <assert.h>
@@ -20,14 +16,6 @@ rotate_r (
  return (x >> d) | (x << (64-d));
 }

 #ifdef __APPLE__
 static inline uint64_t
 htobe64 (uint64_t x) {
    __asm__ ("bswapq %0" : "+r"(x));
    return x;
 }
 #endif

 static const uint64_t
 sha512_init_state[8] = {
    0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, 
--- a/test/bench.c
+++ b/test/bench.c
@@ -17,23 +17,28 @@
 #include "goldilocks.h"
 #include "sha512.h"

 double now() {
 static __inline__ void
 ignore_result ( int result ) {
    (void)result;
 }

 static double now() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  
  return tv.tv_sec + tv.tv_usec/1000000.0;
 }

 void p448_randomize( struct crandom_state_t *crand, struct p448_t *a ) {
 static void p448_randomize( struct crandom_state_t *crand, struct p448_t *a ) {
    crandom_generate(crand, (unsigned char *)a, sizeof(*a));
    p448_strong_reduce(a);
 }

 void q448_randomize( struct crandom_state_t *crand, word_t sk[448/WORD_BITS] ) {
 static void q448_randomize( struct crandom_state_t *crand, word_t sk[448/WORD_BITS] ) {
    crandom_generate(crand, (unsigned char *)sk, 448/8);
 }

 void p448_print( const char *descr, const struct p448_t *a ) {
 static void p448_print( const char *descr, const struct p448_t *a ) {
    p448_t b;
    p448_copy(&b, a);
    p448_strong_reduce(&b);
@@ -45,17 +50,21 @@ void p448_print( const char *descr, const struct p448_t *a ) {
    printf("\n");
 }

 void p448_print_full( const char *descr, const struct p448_t *a ) {
 static void __attribute__((unused))
 p448_print_full (
    const char *descr,
    const struct p448_t *a
 ) {
    int j;
    printf("%s = 0x", descr);
    for (j=15; j>=0; j--) {
        printf("%02" PRIxWORD "_" PRIxWORD58 " ",
            a->limb[j]>>28, a->limb[j]&(1<<28)-1);
            a->limb[j]>>28, a->limb[j]&((1<<28)-1));
    }
    printf("\n");
 }

 void q448_print( const char *descr, const word_t secret[448/WORD_BITS] ) {
 static void q448_print( const char *descr, const word_t secret[448/WORD_BITS] ) {
    int j;
    printf("%s = 0x", descr);
    for (j=448/WORD_BITS-1; j>=0; j--) {
@@ -295,7 +304,7 @@ int main(int argc, char **argv) {
 	
    when = now();
    for (i=0; i<nbase/10; i++) {
        (void)montgomery_ladder(&a,&b,sk,448,0);
        ignore_result(montgomery_ladder(&a,&b,sk,448,0));
    }
    when = now() - when;
    printf("full ladder: %5.1fµs\n", when * 1e6 / i);
@@ -310,11 +319,18 @@ int main(int argc, char **argv) {
    when = now();
    for (i=0; i<nbase/10; i++) {
        scalarmul_vlook(&ext,sk);
        untwist_and_double_and_serialize(&a,&ext);
    }
    when = now() - when;
    printf("edwards svl: %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase/10; i++) {
        scalarmul(&ext,sk);
        untwist_and_double_and_serialize(&a,&ext);
    }
    when = now() - when;
    printf("edwards smc: %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase/10; i++) {
        q448_randomize(&crand, sk);
@@ -326,7 +342,7 @@ int main(int argc, char **argv) {
    struct tw_niels_t wnaft[1<<6];
    when = now();
    for (i=0; i<nbase/10; i++) {
        (void)precompute_fixed_base_wnaf(wnaft,&ext,6);
        ignore_result(precompute_fixed_base_wnaf(wnaft,&ext,6));
    }
    when = now() - when;
    printf("wnaf6 pre:   %5.1fµs\n", when * 1e6 / i);
@@ -341,7 +357,7 @@ int main(int argc, char **argv) {
    
    when = now();
    for (i=0; i<nbase/10; i++) {
        (void)precompute_fixed_base_wnaf(wnaft,&ext,4);
        ignore_result(precompute_fixed_base_wnaf(wnaft,&ext,4));
    }
    when = now() - when;
    printf("wnaf4 pre:   %5.1fµs\n", when * 1e6 / i);
@@ -356,7 +372,7 @@ int main(int argc, char **argv) {

    when = now();
    for (i=0; i<nbase/10; i++) {
        (void)precompute_fixed_base_wnaf(wnaft,&ext,5);
        ignore_result(precompute_fixed_base_wnaf(wnaft,&ext,5));
    }
    when = now() - when;
    printf("wnaf5 pre:   %5.1fµs\n", when * 1e6 / i);
@@ -401,7 +417,7 @@ int main(int argc, char **argv) {
    when = now();
    for (i=0; i<nbase/10; i++) {
        if (i) destroy_fixed_base(&t_5_5_18);
        (void)precompute_fixed_base(&t_5_5_18, &ext, 5, 5, 18, NULL);
        ignore_result(precompute_fixed_base(&t_5_5_18, &ext, 5, 5, 18, NULL));
    }
    when = now() - when;
    printf("pre(5,5,18): %5.1fµs\n", when * 1e6 / i);
@@ -409,7 +425,7 @@ int main(int argc, char **argv) {
    when = now();
    for (i=0; i<nbase/10; i++) {
        if (i) destroy_fixed_base(&t_3_5_30);
        (void)precompute_fixed_base(&t_3_5_30, &ext, 3, 5, 30, NULL);
        ignore_result(precompute_fixed_base(&t_3_5_30, &ext, 3, 5, 30, NULL));
    }
    when = now() - when;
    printf("pre(3,5,30): %5.1fµs\n", when * 1e6 / i);
@@ -417,7 +433,7 @@ int main(int argc, char **argv) {
    when = now();
    for (i=0; i<nbase/10; i++) {
        if (i) destroy_fixed_base(&t_5_3_30);
        (void)precompute_fixed_base(&t_5_3_30, &ext, 5, 3, 30, NULL);
        ignore_result(precompute_fixed_base(&t_5_3_30, &ext, 5, 3, 30, NULL));
    }
    when = now() - when;
    printf("pre(5,3,30): %5.1fµs\n", when * 1e6 / i);
@@ -425,15 +441,15 @@ int main(int argc, char **argv) {
    when = now();
    for (i=0; i<nbase/10; i++) {
        if (i) destroy_fixed_base(&t_15_3_10);
        (void)precompute_fixed_base(&t_15_3_10, &ext, 15, 3, 10, NULL);
        ignore_result(precompute_fixed_base(&t_15_3_10, &ext, 15, 3, 10, NULL));
    }
    when = now() - when;
    printf("pre(15,3,10): %5.1fµs\n", when * 1e6 / i);
    printf("pre(15,3,10):%5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase/10; i++) {
        if (i) destroy_fixed_base(&t_8_4_14);
        (void)precompute_fixed_base(&t_8_4_14, &ext, 8, 4, 14, NULL);
        ignore_result(precompute_fixed_base(&t_8_4_14, &ext, 8, 4, 14, NULL));
    }
    when = now() - when;
    printf("pre(8,4,14): %5.1fµs\n", when * 1e6 / i);
@@ -471,7 +487,7 @@ int main(int argc, char **argv) {
        scalarmul_fixed_base(&ext, sk, 448, &t_15_3_10);
    }
    when = now() - when;
    printf("com(15,3,10): %5.1fµs\n", when * 1e6 / i);
    printf("com(15,3,10):%5.1fµs\n", when * 1e6 / i);
    
    printf("\nGoldilocks:\n");
    
@@ -494,7 +510,7 @@ int main(int argc, char **argv) {
    printf("keygen:      %5.1fµs\n", when * 1e6 / i);
    
    uint8_t ss1[64],ss2[64];
    int gres1,gres2;
    int gres1=0,gres2=0;
    when = now();
    for (i=0; i<nbase; i++) {
        if (i&1) {
@@ -540,18 +556,43 @@ int main(int argc, char **argv) {
    
    when = now();
    for (i=0; i<nbase; i++) {
        res = goldilocks_verify(sout,(const unsigned char *)message,message_len,&gpk);
        (void)res;
        int ver = goldilocks_verify(sout,(const unsigned char *)message,message_len,&gpk);
        assert(!ver);
    }
    when = now() - when;
    printf("verify:      %5.1fµs\n", when * 1e6 / i);
    
    struct goldilocks_precomputed_public_key_t *pre = NULL;
    when = now();
    for (i=0; i<nbase; i++) {
        goldilocks_destroy_precomputed_public_key(pre);
        pre = goldilocks_precompute_public_key(&gpk);
    }
    when = now() - when;
    printf("precompute:  %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase; i++) {
        int ver = goldilocks_verify_precomputed(sout,(const unsigned char *)message,message_len,pre);
        assert(!ver);
    }
    when = now() - when;
    printf("verify pre:  %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<nbase; i++) {
        int ret = goldilocks_shared_secret_precomputed(ss1,&gsk,pre);
        assert(!ret);
    }
    when = now() - when;
    printf("ecdh pre:    %5.1fµs\n", when * 1e6 / i);
    
    printf("\nTesting...\n");
    
    
    int failures=0, successes = 0;
    for (i=0; i<nbase/10; i++) {
        (void)goldilocks_keygen(&gsk,&gpk);
        ignore_result(goldilocks_keygen(&gsk,&gpk));
        goldilocks_sign(sout,(const unsigned char *)message,message_len,&gsk);
        res = goldilocks_verify(sout,(const unsigned char *)message,message_len,&gpk);
        if (res) failures++;
@@ -574,9 +615,9 @@ int main(int argc, char **argv) {
        y = (hword_t)y;
        word_t z=x*y;
        
 	(void)montgomery_ladder(&b,&a,&x,WORD_BITS,0);
        (void)montgomery_ladder(&c,&b,&y,WORD_BITS,0);
        (void)montgomery_ladder(&b,&a,&z,WORD_BITS,0);
    	ignore_result(montgomery_ladder(&b,&a,&x,WORD_BITS,0));
        ignore_result(montgomery_ladder(&c,&b,&y,WORD_BITS,0));
        ignore_result(montgomery_ladder(&b,&a,&z,WORD_BITS,0));
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
@@ -655,7 +696,7 @@ int main(int argc, char **argv) {
        untwist_and_double(&exta,&exv);
        serialize_extensible(&b, &exta);

        (void)precompute_fixed_base_wnaf(wnaft,&exu,5);
        ignore_result(precompute_fixed_base_wnaf(wnaft,&exu,5));
        linear_combo_var_fixed_vt(&ext,sk,448,tk,448,wnaft,5);
        untwist_and_double(&exta,&exv);
        serialize_extensible(&c, &exta);
--- a/test/test.c
+++ b/test/test.c
@@ -6,7 +6,7 @@

 int failed_tests, n_tests, failed_this_test, running_a_test;

 void end_test() {
 static void end_test() {
    if (!failed_this_test) {
        printf("[PASS]\n");
    }
@@ -14,7 +14,7 @@ void end_test() {
    running_a_test = 0;
 }

 void begin_test(const char *name) {
 static void begin_test(const char *name) {
    if (running_a_test) end_test();
    printf("%s...%*s",name,(int)(30-strlen(name)),"");
    fflush(stdout);
@@ -110,8 +110,9 @@ int main(int argc, char **argv) {
    (void) argv;
    
    n_tests = running_a_test = failed_tests = 0;
    begin_test("SHA-512 NIST Monte Carlo");
    test_sha512_monte_carlo();

    begin_test("Arithmetic");
    test_arithmetic();

    begin_test("EC point operations");
    test_pointops();
@@ -122,6 +123,15 @@ int main(int argc, char **argv) {
    begin_test("Scalarmul commutativity");
    test_scalarmul_commutativity();
    
    begin_test("Linear combo");
    test_linear_combo();
    
    begin_test("SHA-512 NIST Monte Carlo");
    test_sha512_monte_carlo();
    
    begin_test("Goldilocks complete system");
    test_goldilocks();
    
    if (running_a_test) end_test();
    printf("\n");
    if (failed_tests) {
--- a/test/test.h
+++ b/test/test.h
@@ -33,10 +33,16 @@ void youfail();

 int test_sha512_monte_carlo();

 int test_linear_combo ();

 int test_scalarmul_compatibility ();

 int test_scalarmul_commutativity ();

 int test_arithmetic ();

 int test_goldilocks ();

 int test_pointops ();

 #endif // __GOLDILOCKS_TEST_H__
--- a/test/test_arithmetic.c
+++ b/test/test_arithmetic.c
@@ -0,0 +1,196 @@
 #include "p448.h"
 #include "test.h"
 #include <gmp.h>
 #include <string.h>
 #include <stdio.h>

 mpz_t mp_p448;

 static mask_t mpz_to_p448 (
    struct p448_t *out,
    const mpz_t in
 ) {
    uint8_t ser[56];
    mpz_t modded;
    memset(ser,0,sizeof(ser));
    mpz_init(modded);
    mpz_mod(modded, in, mp_p448);
    mpz_export(ser, NULL, -1, 1, -1, 0, modded);
    mask_t succ = p448_deserialize(out, ser);
    return succ;
 }

 static mask_t p448_assert_eq_gmp(
    const char *descr,
    const struct p448_t *x,
    const mpz_t y,
    float lowBound,
    float highBound
 ) {
    uint8_t xser[56], yser[56];
    mpz_t modded;
    
    memset(yser,0,sizeof(yser));
    
    p448_serialize(xser, x);
    
    mpz_init(modded);
    mpz_mod(modded, y, mp_p448);
    mpz_export(yser, NULL, -1, 1, -1, 0, modded);
    
    unsigned int i;
    for (i=0; i<sizeof(*x)/sizeof(x->limb[0]); i++) {
        int bits = sizeof(x->limb[0]) * 448 / sizeof(*x);
        word_t yardstick = (i==sizeof(*x)/sizeof(x->limb[0])/2) ?
            (1ull<<bits) - 2 : (1ull<<bits) - 1;
        if (x->limb[i] < yardstick * lowBound || x->limb[i] > yardstick * highBound) {
            youfail();
            printf("    P448 limb %d -> " PRIxWORDfull " is out of bounds (%0.2f, %0.2f) for test %s (yardstick = " PRIxWORDfull ")\n",
                 i, x->limb[i], lowBound, highBound, descr, yardstick);
            break;
        }
    }
    
    if (memcmp(xser,yser,56)) {
        youfail();
        printf("    Failed arithmetic test %s\n", descr);
        p448_print("    p448", x);
        printf("    gmp  = 0x");
        int j;
        for (j=55; j>=0; j--) {
            printf("%02x", yser[j]);
        }
        printf("\n");
        return MASK_FAILURE;
    }
    
    mpz_clear(modded);
    return MASK_SUCCESS;
 }

 static mask_t test_add_sub (
    const mpz_t x,
    const mpz_t y,
    word_t word
 ) {
    struct p448_t xx,yy,tt;
    mpz_t t;
    mask_t succ = MASK_SUCCESS;
    succ  = mpz_to_p448(&xx,x);
    succ &= mpz_to_p448(&yy,y);
    mpz_init(t);
    
    p448_add(&tt,&xx,&yy);
    mpz_add(t,x,y);
    succ &= p448_assert_eq_gmp("add",&tt,t,0,2.1);
    
    p448_sub(&tt,&xx,&yy);
    p448_bias(&tt,2);
    mpz_sub(t,x,y);
    succ &= p448_assert_eq_gmp("sub",&tt,t,0,3.1);
    
    p448_copy(&tt,&xx);
    p448_addw(&tt,word);
    mpz_add_ui(t,x,word);
    succ &= p448_assert_eq_gmp("addw",&tt,t,0,2.1);
    
    p448_copy(&tt,&xx);
    p448_subw(&tt,word);
    p448_bias(&tt,1);
    mpz_sub_ui(t,x,word);
    succ &= p448_assert_eq_gmp("subw",&tt,t,0,2.1);
    
    if (!succ) {
        p448_print("    x", &xx);
        p448_print("    y", &yy);
    }
    
    mpz_clear(t);
    
    return succ;
 }

 static mask_t test_mul_sqr (
    const mpz_t x,
    const mpz_t y,
    word_t word
 ) {
    struct p448_t xx,yy,tt;
    mpz_t t;
    mask_t succ = MASK_SUCCESS;
    succ  = mpz_to_p448(&xx,x);
    succ &= mpz_to_p448(&yy,y);
    mpz_init(t);
    
    p448_mul(&tt,&xx,&yy);
    mpz_mul(t,x,y);
    succ &= p448_assert_eq_gmp("mul",&tt,t,0,1.1);
    
    p448_mulw(&tt,&xx,word);
    mpz_mul_ui(t,x,word);
    succ &= p448_assert_eq_gmp("mulw",&tt,t,0,1.1);
    
    p448_sqr(&tt,&xx);
    mpz_mul(t,x,x);
    succ &= p448_assert_eq_gmp("sqrx",&tt,t,0,1.1);
    
    p448_sqr(&tt,&yy);
    mpz_mul(t,y,y);
    succ &= p448_assert_eq_gmp("sqy",&tt,t,0,1.1);
    
    if (!succ) {
        p448_print("    x", &xx);
        p448_print("    y", &yy);
    }
    
    mpz_clear(t);
    
    return succ;
 }

 int test_arithmetic () {
    int j, ntests = 100000;
    
    gmp_randstate_t state;
    gmp_randinit_mt(state);
    
    uint8_t pser[56];
    for (j=0; j<56; j++) {
        pser[j] = (j==28) ? 0xFE : 0xFF;
    }
    mpz_init(mp_p448);
    mpz_import(mp_p448, 56, -1, 1, -1, 0, pser);
    
    mpz_t x,y;
    mpz_init(x);
    mpz_init(y);
    
    mask_t succ = MASK_SUCCESS;
    
    int bits = sizeof(word_t) * 448 / sizeof(p448_t);
    
    for (j=0; j<ntests; j++) {
        if (j&1) {
            mpz_rrandomb(x, state, 448);
            mpz_rrandomb(y, state, 448);
        } else {
            mpz_urandomb(x, state, 448);
            mpz_urandomb(y, state, 448);
        }
        
        word_t word = gmp_urandomm_ui (state, 1ull<<bits);
        
        succ &= test_add_sub(x,y,word);
        succ &= test_mul_sqr(x,y,word);
        
        // TODO: test neg, cond_neg, set_ui, wrd, srd, inv, ...?
    }
    
    mpz_clear(x);
    mpz_clear(y);
    mpz_clear(mp_p448);
    gmp_randclear(state);
    
    return succ ? 0 : 1;
 }

--- a/test/test_goldilocks.c
+++ b/test/test_goldilocks.c
@@ -0,0 +1,195 @@
 #include "test.h"
 #include "goldilocks.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 int test_goldilocks () {
    const char *message1 = "hello world";
    const char *message2 = "Jello world";
    
    unsigned char signature[GOLDI_SIGNATURE_BYTES];
    
    unsigned char
        ss12[GOLDI_SHARED_SECRET_BYTES],
        ss21[GOLDI_SHARED_SECRET_BYTES],
        ss21p[GOLDI_SHARED_SECRET_BYTES],
        proto[GOLDI_SYMKEY_BYTES];
    
    struct goldilocks_public_key_t  pub, pub2;
    struct goldilocks_private_key_t priv, priv2;
    struct goldilocks_precomputed_public_key_t *pre = NULL;
    
    int i, ret, good = 1;
    
    ret = goldilocks_init();
    if (ret) {
        youfail();
        printf("    Failed init.\n");
    }
    
    for (i=0; i<1000 && good; i++) {
        
        ret = goldilocks_keygen(&priv, &pub);
        if (ret) {
            youfail();
            printf("    Failed keygen trial %d.\n", i);
            good = 0;
        }
        
        goldilocks_destroy_precomputed_public_key( pre );
        pre = goldilocks_precompute_public_key ( &pub );
        if (!pre) {
            youfail();
            printf("    Failed precomp-public trial %d.\n", i);
            return -1;
        }
        
        ret = goldilocks_sign(
            signature,
            (const unsigned char *)message1,
            strlen(message1),
            &priv
        );
        if (ret) {
            youfail();
            printf("    Failed sign trial %d.\n", i);
            good = 0;
        }
        
        ret = goldilocks_verify(
            signature,
            (const unsigned char *)message1,
            strlen(message1),
            &pub
        );
        if (ret) {
            youfail();
            printf("    Failed verify trial %d.\n", i);
            good = 0;
        }

        ret = goldilocks_verify_precomputed (
            signature,
            (const unsigned char *)message1,
            strlen(message1),
            pre
        );
        if (ret) {
            youfail();
            printf("    Failed verify-pre trial %d.\n", i);
            good = 0;
        }
        
        /* terrible negative test */
        ret = goldilocks_verify(
            signature,
            (const unsigned char *)message2,
            strlen(message1),
            &pub
        );
        if (ret != GOLDI_EINVAL) {
            youfail();
            printf("    Failed nega-verify trial %d.\n", i);
            good = 0;
        }
        ret = goldilocks_verify_precomputed(
            signature,
            (const unsigned char *)message2,
            strlen(message1),
            pre
        );
        if (ret != GOLDI_EINVAL) {
            youfail();
            printf("    Failed nega-verify-pre trial %d.\n", i);
            good = 0;
        }
        
        /* honestly a slightly better negative test */
        memset(signature,0,sizeof(signature));
        ret = goldilocks_verify(
            signature,
            (const unsigned char *)message1,
            strlen(message1),
            &pub
        );
        if (ret != GOLDI_EINVAL) {
            youfail();
            printf("    Failed nega-verify-0 trial %d.\n", i);
            good = 0;
        }
        ret = goldilocks_verify_precomputed(
            signature,
            (const unsigned char *)message1,
            strlen(message1),
            pre
        );
        if (ret != GOLDI_EINVAL) {
            youfail();
            printf("    Failed nega-verify-pre-0 trial %d.\n", i);
            good = 0;
        }
        
        /* ecdh */
        ret = goldilocks_keygen(&priv2, &pub2);
        if (ret) {
            youfail();
            printf("    Failed keygen2 trial %d.\n", i);
            good = 0;
        }
        
        ret = goldilocks_shared_secret ( ss12, &priv, &pub2 );
        if (ret) {
            youfail();
            printf("    Failed ss12 trial %d.\n", i);
            good = 0;
        }
        
        ret = goldilocks_shared_secret ( ss21, &priv2, &pub );
        if (ret) {
            youfail();
            printf("    Failed ss21 trial %d.\n", i);
            good = 0;
        }
        
        ret = goldilocks_shared_secret_precomputed ( ss21p, &priv2, pre );
        if (ret) {
            youfail();
            printf("    Failed ss21p trial %d.\n", i);
            good = 0;
        }
        
        if (memcmp(ss12,ss21,sizeof(ss12))) {
            youfail();
            printf("    Failed shared-secret trial %d.\n", i);
            good = 0;
        }
        
        if (memcmp(ss21,ss21p,sizeof(ss21))) {
            youfail();
            printf("    Failed shared-secret precomp trial %d.\n", i);
            good = 0;
        }
        
        /* test derive / underive / priv to pub */
        goldilocks_underive_private_key ( proto, &priv );
        ret = goldilocks_derive_private_key ( &priv2, proto );
        if (ret || memcmp(&priv,&priv2,sizeof(priv))) {
            youfail();    
            printf("    Failed derive round-trip trial %d.\n", i);
            good = 0;
        }
        
        ret = goldilocks_private_to_public ( &pub2, &priv );
        if (ret || memcmp(&pub,&pub2,sizeof(pub))) {
            youfail();
            printf("    Failed private-to-public trial %d.\n", i);
            good = 0;
        }
        
    }
    
    goldilocks_destroy_precomputed_public_key( pre );
    
    return good ? 0 : -1;
 }
--- a/test/test_scalarmul.c
+++ b/test/test_scalarmul.c
@@ -159,6 +159,92 @@ single_scalarmul_compatibility_test (
    return ret;
 }

 static int
 single_linear_combo_test (
    const struct p448_t *base1,
    const word_t *scalar1,
    int nbits1,
    const struct p448_t *base2,
    const word_t *scalar2,
    int nbits2
 ) {
    /* MAGIC */
    const struct p448_t
    sqrt_d_minus_1 = {{
        U58LE(0xd2e21836749f46),
        U58LE(0x888db42b4f0179),
        U58LE(0x5a189aabdeea38),
        U58LE(0x51e65ca6f14c06),
        U58LE(0xa49f7b424d9770),
        U58LE(0xdcac4628c5f656),
        U58LE(0x49443b8748734a),
        U58LE(0x12fec0c0b25b7a)
    }};
    
    struct tw_extensible_t text1, text2, working;
    struct tw_pniels_t pn;
    struct p448_t result_comb, result_combo, result_wnaf;
    
    mask_t succ = 
        deserialize_and_twist_approx(&text1, &sqrt_d_minus_1, base1)
      & deserialize_and_twist_approx(&text2, &sqrt_d_minus_1, base2);
    if (!succ) return 1;
    
    struct fixed_base_table_t t1, t2;
    struct tw_niels_t wnaf[32];
    memset(&t1,0,sizeof(t1));
    memset(&t2,0,sizeof(t2));
    
    succ = precompute_fixed_base(&t1, &text1, 5, 5, 18, NULL);
    succ &= precompute_fixed_base(&t2, &text2, 6, 3, 25, NULL);
    succ &= precompute_fixed_base_wnaf(wnaf, &text2, 5);
    
    if (!succ) {
        destroy_fixed_base(&t1);
        destroy_fixed_base(&t2);
        return -1;
    }
    
    /* use the dedicated wNAF linear combo algorithm */
    copy_tw_extensible(&working, &text1);
    linear_combo_var_fixed_vt(&working, scalar1, nbits1, scalar2, nbits2, wnaf, 5);
    untwist_and_double_and_serialize(&result_wnaf, &working);
    
    /* use the dedicated combs algorithm */
    succ &= linear_combo_combs_vt(&working, scalar1, nbits1, &t1, scalar2, nbits2, &t2);
    untwist_and_double_and_serialize(&result_combo, &working);
    
    /* use two combs */
    succ &= scalarmul_fixed_base(&working, scalar1, nbits1, &t1);
    convert_tw_extensible_to_tw_pniels(&pn, &working);
    succ &= scalarmul_fixed_base(&working, scalar2, nbits2, &t2);
    add_tw_pniels_to_tw_extensible(&working, &pn);
    untwist_and_double_and_serialize(&result_comb, &working);
    
    mask_t consistent = MASK_SUCCESS;
    consistent &= p448_eq(&result_combo, &result_wnaf);
    consistent &= p448_eq(&result_comb,  &result_wnaf);
    
    if (!succ || !consistent) {
        youfail();
        printf("    Failed linear combo consistency test with nbits=%d,%d.\n",nbits1,nbits2);

        p448_print("    base1", base1);
        scalar_print("    scal1", scalar1, (nbits1+WORD_BITS-1)/WORD_BITS);
        p448_print("    base2", base2);
        scalar_print("    scal2", scalar2, (nbits1+WORD_BITS-1)/WORD_BITS);
        p448_print("    combs", &result_comb);
        p448_print("    combo", &result_combo);
        p448_print("    wNAFs", &result_wnaf);
        return -1;
    }
    
    destroy_fixed_base(&t1);
    destroy_fixed_base(&t2);
    
    return 0;
 }

 /* 0 = succeed, 1 = inval, -1 = fail */
 static int
 single_scalarmul_commutativity_test (
@@ -251,6 +337,49 @@ int test_scalarmul_commutativity () {
    return 0;
 }

 int test_linear_combo () {
    int i,j,k,got;
    
    struct crandom_state_t crand;
    crandom_init_from_buffer(&crand, "scalarmul_linear_combos_test RNG");
    
    for (i=0; i<=448; i+=7) {
        for (j=0; j<=448; j+=7) {
            got = 0;
            
            for (k=0; k<128 && !got; k++) {
                uint8_t ser[56];
                word_t scalar1[7], scalar2[7];
                crandom_generate(&crand, (uint8_t *)scalar1, sizeof(scalar1));
                crandom_generate(&crand, (uint8_t *)scalar2, sizeof(scalar2));
            
                p448_t base1;
                crandom_generate(&crand, ser, sizeof(ser));
                mask_t succ = p448_deserialize(&base1, ser);
                if (!succ) continue;
                
                p448_t base2;
                crandom_generate(&crand, ser, sizeof(ser));
                succ = p448_deserialize(&base2, ser);
                if (!succ) continue;
            
                int ret = single_linear_combo_test (&base1, scalar1, i, &base2, scalar2, j);
                got = !ret;
                if (ret == -1) return -1;
            }

            if (!got) {
                youfail();
                printf("    Unlikely: rejected 128 scalars in a row.\n");
                return -1;
            }
            
        }
    }
    
    return 0;
 }

 int test_scalarmul_compatibility () {
    int i,j,k,got;