Mostly a cleanup release. Cleanup old code, improve documentation,

improve GCC-cleanness, etc. Disable the crandom output buffer so that it won't return duplicate data across fork(). I should still stir in more entropy into the buffer at least when RDRAND is available, but this should prevent disasters for now. The Elligator code in the current version is incompatible with past versions due to a minor tweak. It wasn't being called by any of the API functions, though. Removing "magic" constants and type names. So for example p448_t is now field_t (though maybe it should really be felem_t?). This should enable other curves with the Goldilocks code in the not-too- distant future. Added CRANDOM_MIGHT_IS_MUST so that you don't have to -D a bunch of things on the command line. You can `make bat` to make an eBAT which probably doesn't work. I haven't implemented the improved nonce generation from the curves@moderncrypto.org thread yet.
10 years ago · 4eb210cd85
--- a/HISTORY.txt
+++ b/HISTORY.txt
@@ -1,4 +1,52 @@
 May 3, 2104:
 July 11, 2014:
    This is mostly a cleanup release.

    Added CRANDOM_MIGHT_IS_MUST config flag (default: 1).  When set, this
    causes crandom to assume that all features in the target arch will
    be available, instead of detecting them.  This makes sense because
    the rest of the Goldilocks code is not (yet?) able to detect features.
    Also, I'd like to submit this to SUPERCOP eventually, and SUPERCOP won't
    pass -DMUST_HAVE_XXX on the command line the way the Makefile here did.
    
    Flag EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES to disable the crandom
    output buffer.  This buffer improves performance (very marginally at
    Goldilocks sizes), but can cause problems with forking and VM
    snapshotting.  By default, the buffer is now disabled.
    
    I've slightly tweaked the Elligator implementation (which is still
    unused) to make it easier to invert.  This makes anything using Elligator
    (i.e. nothing) incompatible with previous releases.
    
    I've been factoring "magic" constants such as curve orders, window sizes,
    etc into a few headers, to reduce the effort to port the code to other
    primes, curves, etc.  For example, I could test the Microsoft curves, and
    something like:
        x^2 + y^2 = 1 +- 5382[45] x^2 y^2 mod 2^480-2^240-1
    ("Goldeneye"? "Ridinghood"?) might be a reasonable thing to try for
    64-bit CPUs.
    
    In a similar vein, most of the internal code has been changed to say
    "field" instead of p448, so that a future version of magic.h can decide
    which field header to include.
    
    You can now `make bat` to create an eBAT in build/ed448-goldilocks.  This
    is only minimally tested, though, because SUPERCOP doesn't work on my
    machine and I'm too lazy to reverse engineer it.  It sets a new macro,
    SUPERCOP_WONT_LET_ME_OPEN_FILES, which causes goldilocks_init() to fall
    back to something horribly insecure if crandom_init_from_file raises
    EMFILE.
    
    Slightly improved documentation.
    
    Removed some old commented-out code; restored the /* C-style */ comment
    discipline.
    
    The AMD-64 version should now be GCC clean, at least for reasonably
    recent GCC (tested on OS X.9.3, Haswell, gcc-4.9).
    
    History no longer says "2104".

 May 3, 2014:
    Minor changes to internal routines mean that this version is not
    compatible with the previous one.

--- a/+ 21
+++ b/+ 21
@@ -39,7 +39,7 @@ endif
 ARCHFLAGS += -mcpu=cortex-a9 # FIXME
 GENFLAGS = -DN_TESTS_BASE=1000 # sooooo sloooooow
 else
 ARCHFLAGS += -mssse3 -maes -mavx -mavx2 -DMUST_HAVE_AVX2 -mbmi2 #TODO
 ARCHFLAGS += -maes -mavx2 -mbmi2 #TODO
 endif

 ifeq ($(CC),clang)
@@ -48,26 +48,28 @@ endif

 ifeq (,$(findstring 64,$(ARCH))$(findstring gcc,$(CC)))
 # ARCHFLAGS += -m32
 ARCHFLAGS += -DGOLDI_FORCE_32_BIT=1
 XCFLAGS += -DGOLDI_FORCE_32_BIT=1
 endif

 CFLAGS  = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XCFLAGS)
 LDFLAGS = $(ARCHFLAGS) $(XLDFLAGS)
 ASFLAGS = $(ARCHFLAGS)

 .PHONY: clean all test bench todo doc lib
 .PHONY: clean all test bench todo doc lib bat
 .PRECIOUS: build/%.s

 HEADERS= Makefile $(shell find . -name "*.h") build/timestamp

 LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \
  build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o
  build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o build/magic.o

 TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \
 	build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o
 	build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o build/magic.o

 BENCHCOMPONENTS=build/bench.o

 BATNAME=build/ed448-goldilocks

 all: lib build/test build/bench

 scan: clean
@@ -118,6 +120,19 @@ doc/timestamp:
 doc: Doxyfile doc/timestamp src/*.c src/include/*.h src/$(ARCH)/*.c src/$(ARCH)/*.h
 	doxygen

 bat: $(BATNAME)

 $(BATNAME): include/* src/* src/*/*
 	rm -fr $@
 	for arch in src/arch*; do \
 		mkdir -p $@/`basename $$arch`; \
 		cp include/* src/*.c src/include/* $$arch/* $@/`basename $$arch`; \
 		perl -p -i -e 's/.*endif.*GOLDILOCKS_CONFIG_H/#define SUPERCOP_WONT_LET_ME_OPEN_FILES 1\n\n$$&/' $@/`basename $$arch`/config.h; \
 		done
 	echo 'Mike Hamburg' > $@/designers
 	echo 'Ed448-Goldilocks sign and dh' > $@/description
 	

 todo::
 	@(find * -name '*.h'; find * -name '*.c') | xargs egrep --color=auto -w \
 		'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE|MAGIC'
@@ -139,4 +154,4 @@ test: build/test
 	./$<

 clean:
 	rm -fr build doc
 	rm -fr build doc $(BATNAME)
--- a/TODO.txt
+++ b/TODO.txt
@@ -25,8 +25,8 @@ Important work items for Ed448-Goldilocks:

 * [DONE] Bugfix: make sure that init() and randomization are thread-safe.

 * Security: check on deserialization that points are < p.
    * Check also that they're nonzero or otherwise non-pathological?
 * [DONE] Security: check on deserialization that points are < p.
    * [NEEDS TESTING] Check also that they're nonzero or otherwise non-pathological?

 * Testing:
    * Corner-case testing
@@ -39,16 +39,16 @@ Important work items for Ed448-Goldilocks:
    * Most functions now have warn on ignored return.

 * Safety:
    * Check for init() if it's still required once we've done the above
    * [DONE] Check for init() if it's still required once we've done the above
    * Decide what to do about RNG failures
        * abort
        * return error and zeroize
        * return error but continue if RNG is kind of mostly OK
    
 * Flexibility: decide which API options are good.
    * Eg, should functions take nbits and table sizes?
    * [DONE?] Eg, should functions take nbits and table sizes?
    
    * Remove hardcoded adjustments from comb control.
    * [DONE] Remove hardcoded adjustments from comb control.
        * These adjustments make the output wrong when it's not 450 bits.
        
    * Other slow Barrett fields?  Montgomery fields?
@@ -71,6 +71,7 @@ Important work items for Ed448-Goldilocks:

 * Portability: test and make clean with other compilers
    * Using a fair amount of __attribute__ code.
    * [DONE] Should work for GCC now.

 * Portability: try to make the vector code as portable as possible
    * Currently using clang ext_vector_length.
@@ -79,15 +80,15 @@ Important work items for Ed448-Goldilocks:

 * Portability: make the inner layers of the code 32-bit clean.
    * Write new versions of the field code.
        * 28-bit limbs give less headroom for carries.
        * Now have a vectorless ARM version; need NEON.
        * [DONE] 28-bit limbs give less headroom for carries.
        * [DONE] Now have a vectorless ARM version; need NEON.
        * Improve speed of 32-bit field code.
    
    * Run through the SAGE tool to generate new bias & bound.
    * [DONE] Run through the SAGE tool to generate new bias & bound.

 * [DONE] Portability: make the outer layers of the code 32-bit clean.

 * Performance/flexibility: decide which parameters should be hard-coded.
 * [DONE] Performance/flexibility: decide which parameters should be hard-coded.
    * Perhaps useful for comb precomputation.

 * Performance: Improve SHA512.
@@ -120,4 +121,4 @@ Important work items for Ed448-Goldilocks:

 * Clear other TODO/FIXME/HACK/PERF items in the code

 * Submit to SUPERCOP
 * [DONE?] Submit to SUPERCOP
--- a/src/arch_32/ec_point.c
+++ b/src/arch_32/ec_point.c
@@ -380,55 +380,55 @@ serialize_montgomery (
    const struct montgomery_t* a,
    const struct p448_t*       sbz
 ) {
    mask_t L0, L1, L2;
    struct p448_t L3, L4, L5, L6;
    p448_mul  (   &L6, &a->z0, &a->zd );
    p448_sub  (   &L4,   &L6, &a->xd );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L6, &a->za,   &L4 );
    p448_mul  (   &L5, &a->z0, &a->xd );
    p448_sub  (   &L4,   &L5, &a->zd );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L3, &a->xa,   &L4 );
    p448_add  (   &L5,   &L3,   &L6 );
    p448_sub  (   &L4,   &L6,   &L3 );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L6,   &L4,   &L5 );
    p448_copy (   &L5, &a->z0 );
    p448_addw (   &L5,     1 );
    p448_sqr  (   &L4,   &L5 );
    p448_mulw (   &L5,   &L4, 39082 );
    p448_neg  (   &L4,   &L5 );
    p448_add  (   &L5, &a->z0, &a->z0 );
    p448_bias (   &L5,     1 );
    p448_add  (   &L3,   &L5,   &L5 );
    p448_add  (   &L5,   &L3,   &L4 );
    p448_weak_reduce(   &L5 );
    p448_mul  (   &L3, &a->xd,   &L5 );
       L1 = p448_is_zero( &a->zd );
       L2 = -   L1;
    p448_mask (   &L4,   &L3,    L1 );
    p448_add  (   &L5,   &L4, &a->zd );
       L0 = ~   L1;
    p448_mul  (   &L4,   sbz,   &L6 );
    p448_addw (   &L4,    L2 );
    p448_mul  (   &L6,   &L5,   &L4 );
    p448_mul  (   &L4,   &L6,   &L5 );
    p448_mul  (   &L5,   &L6, &a->xd );
    p448_mul  (   &L6,   &L4,   &L5 );
    p448_isr  (   &L3,   &L6 );
    p448_mul  (   &L5,   &L4,   &L3 );
    p448_sqr  (   &L4,   &L3 );
    p448_mul  (   &L3,   &L6,   &L4 );
    p448_mask (     b,   &L5,    L0 );
    p448_subw (   &L3,     1 );
    p448_bias (   &L3,     1 );
       L1 = p448_is_zero(   &L3 );
       L0 = p448_is_zero(   sbz );
    return    L1 |    L0;
    mask_t L4, L5, L6;
    struct p448_t L0, L1, L2, L3;
    p448_mul  (   &L3, &a->z0, &a->zd );
    p448_sub  (   &L1,   &L3, &a->xd );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L3, &a->za,   &L1 );
    p448_mul  (   &L2, &a->z0, &a->xd );
    p448_sub  (   &L1,   &L2, &a->zd );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L0, &a->xa,   &L1 );
    p448_add  (   &L2,   &L0,   &L3 );
    p448_sub  (   &L1,   &L3,   &L0 );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L3,   &L1,   &L2 );
    p448_copy (   &L2, &a->z0 );
    p448_addw (   &L2,     1 );
    p448_sqr  (   &L1,   &L2 );
    p448_mulw (   &L2,   &L1, 39082 );
    p448_neg  (   &L1,   &L2 );
    p448_add  (   &L2, &a->z0, &a->z0 );
    p448_bias (   &L2,     1 );
    p448_add  (   &L0,   &L2,   &L2 );
    p448_add  (   &L2,   &L0,   &L1 );
    p448_weak_reduce(   &L2 );
    p448_mul  (   &L0, &a->xd,   &L2 );
       L5 = p448_is_zero( &a->zd );
       L6 = -   L5;
    p448_mask (   &L1,   &L0,    L5 );
    p448_add  (   &L2,   &L1, &a->zd );
       L4 = ~   L5;
    p448_mul  (   &L1,   sbz,   &L3 );
    p448_addw (   &L1,    L6 );
    p448_mul  (   &L3,   &L2,   &L1 );
    p448_mul  (   &L1,   &L3,   &L2 );
    p448_mul  (   &L2,   &L3, &a->xd );
    p448_mul  (   &L3,   &L1,   &L2 );
    p448_isr  (   &L0,   &L3 );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,   &L3,   &L1 );
    p448_mask (     b,   &L2,    L4 );
    p448_subw (   &L0,     1 );
    p448_bias (   &L0,     1 );
       L5 = p448_is_zero(   &L0 );
       L4 = p448_is_zero(   sbz );
    return    L5 |    L4;
 }

 void
@@ -524,8 +524,8 @@ test_only_twist (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3;
    mask_t L2, L3;
    struct p448_t L0, L1;
    p448_sqr  ( &b->u, &a->z );
    p448_sqr  ( &b->y, &a->x );
    p448_sub  ( &b->z, &b->u, &b->y );
@@ -541,35 +541,35 @@ test_only_twist (
    p448_bias ( &b->z,     2 );
    p448_weak_reduce( &b->z );
    p448_mul  ( &b->t, &b->z, &b->x );
    p448_mul  (   &L3, &b->t, &b->u );
    p448_mul  ( &b->x, &b->t,   &L3 );
    p448_isr  (   &L2, &b->x );
    p448_mul  ( &b->u, &b->t,   &L2 );
    p448_sqr  (   &L3,   &L2 );
    p448_mul  ( &b->t, &b->x,   &L3 );
    p448_add  ( &b->x, &a->y, &a->x );
    p448_weak_reduce( &b->x );
    p448_sub  (   &L2, &a->x, &a->y );
    p448_bias (   &L2,     2 );
    p448_weak_reduce(   &L2 );
    p448_mul  (   &L3, &b->t,   &L2 );
    p448_add  (   &L2,   &L3, &b->x );
    p448_sub  ( &b->t, &b->x,   &L3 );
    p448_mul  (   &L1, &b->t, &b->u );
    p448_mul  ( &b->x, &b->t,   &L1 );
    p448_isr  (   &L0, &b->x );
    p448_mul  ( &b->u, &b->t,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  ( &b->t, &b->x,   &L1 );
    p448_add  (   &L1, &a->y, &a->x );
    p448_weak_reduce(   &L1 );
    p448_sub  (   &L0, &a->x, &a->y );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    p448_mul  ( &b->x, &b->t,   &L0 );
    p448_add  (   &L0, &b->x,   &L1 );
    p448_sub  ( &b->t,   &L1, &b->x );
    p448_bias ( &b->t,     2 );
    p448_weak_reduce( &b->t );
    p448_mul  ( &b->x,   &L2, &b->u );
       L0 = p448_is_zero( &b->y );
       L1 = -   L0;
    p448_addw ( &b->x,    L1 );
    p448_mul  ( &b->x,   &L0, &b->u );
       L2 = p448_is_zero( &b->y );
       L3 = -   L2;
    p448_addw ( &b->x,    L3 );
    p448_weak_reduce( &b->x );
    p448_mul  ( &b->y, &b->t, &b->u );
       L0 = p448_is_zero( &b->z );
       L1 = -   L0;
    p448_addw ( &b->y,    L1 );
       L2 = p448_is_zero( &b->z );
       L3 = -   L2;
    p448_addw ( &b->y,    L3 );
    p448_weak_reduce( &b->y );
       L1 = p448_is_zero( &a->y );
       L0 =    L1 +     1;
    p448_set_ui( &b->z,    L0 );
       L3 = p448_is_zero( &a->y );
       L2 =    L3 +     1;
    p448_set_ui( &b->z,    L2 );
    p448_copy ( &b->t, &b->x );
    p448_copy ( &b->u, &b->y );
 }
@@ -578,16 +578,16 @@ mask_t
 is_square (
    const struct p448_t* x
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3;
    p448_isr  (   &L2,     x );
    p448_sqr  (   &L3,   &L2 );
    p448_mul  (   &L2,     x,   &L3 );
    p448_subw (   &L2,     1 );
    p448_bias (   &L2,     1 );
       L1 = p448_is_zero(   &L2 );
       L0 = p448_is_zero(     x );
    return    L1 |    L0;
    mask_t L2, L3;
    struct p448_t L0, L1;
    p448_isr  (   &L0,     x );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,     x,   &L1 );
    p448_subw (   &L0,     1 );
    p448_bias (   &L0,     1 );
       L3 = p448_is_zero(   &L0 );
       L2 = p448_is_zero(     x );
    return    L3 |    L2;
 }

 mask_t
@@ -744,15 +744,15 @@ eq_affine (
    const struct affine_t* a,
    const struct affine_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2;
    p448_sub  (   &L2, &a->x, &b->x );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_sub  (   &L2, &a->y, &b->y );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
    mask_t L1, L2;
    struct p448_t L0;
    p448_sub  (   &L0, &a->x, &b->x );
    p448_bias (   &L0,     2 );
       L2 = p448_is_zero(   &L0 );
    p448_sub  (   &L0, &a->y, &b->y );
    p448_bias (   &L0,     2 );
       L1 = p448_is_zero(   &L0 );
    return    L2 &    L1;
 }

 mask_t
@@ -760,19 +760,19 @@ eq_extensible (
    const struct extensible_t* a,
    const struct extensible_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
    mask_t L3, L4;
    struct p448_t L0, L1, L2;
    p448_mul  (   &L2, &b->z, &a->x );
    p448_mul  (   &L1, &a->z, &b->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    p448_mul  (   &L2, &b->z, &a->y );
    p448_mul  (   &L1, &a->z, &b->y );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L3 = p448_is_zero(   &L0 );
    return    L4 &    L3;
 }

 mask_t
@@ -780,19 +780,19 @@ eq_tw_extensible (
    const struct tw_extensible_t* a,
    const struct tw_extensible_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
    mask_t L3, L4;
    struct p448_t L0, L1, L2;
    p448_mul  (   &L2, &b->z, &a->x );
    p448_mul  (   &L1, &a->z, &b->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    p448_mul  (   &L2, &b->z, &a->y );
    p448_mul  (   &L1, &a->z, &b->y );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L3 = p448_is_zero(   &L0 );
    return    L4 &    L3;
 }

 void
@@ -801,38 +801,41 @@ elligator_2s_inject (
    const struct p448_t* r
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5, L6, L7, L8, L9;
    struct p448_t L2, L3, L4, L5, L6, L7, L8;
    p448_sqr  ( &a->x,     r );
    p448_sqr  (   &L3, &a->x );
    p448_copy ( &a->y,   &L3 );
    p448_subw ( &a->y,     1 );
    p448_neg  (   &L9, &a->y );
    p448_bias (   &L9,     2 );
    p448_weak_reduce(   &L9 );
    p448_sqr  (   &L2,   &L9 );
    p448_mulw (   &L8,   &L2, 1527402724 );
    p448_mulw (   &L7,   &L3, 6108985600 );
    p448_add  ( &a->y,   &L7,   &L8 );
    p448_neg  (   &L4, &a->y );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_sqr  (   &L2,   &L4 );
    p448_mulw (   &L7,   &L2, 1527402724 );
    p448_mulw (   &L8,   &L3, 6108985600 );
    p448_add  ( &a->y,   &L8,   &L7 );
    p448_weak_reduce( &a->y );
    p448_mulw (   &L8,   &L2, 6109454568 );
    p448_sub  (   &L7, &a->y,   &L8 );
    p448_bias (   &L7,     2 );
    p448_weak_reduce(   &L7 );
    p448_mulw (   &L4, &a->y, 78160 );
    p448_mul  (   &L6,   &L7,   &L9 );
    p448_mul  (   &L8,   &L6,   &L4 );
    p448_mulw (   &L6, &a->y, 78160 );
    p448_mul  (   &L5,   &L7,   &L6 );
    p448_mul  (   &L8,   &L5,   &L4 );
    p448_mul  (   &L4,   &L5,   &L6 );
    p448_mul  (   &L5,   &L7,   &L8 );
    p448_mul  (   &L8,   &L5,   &L4 );
    p448_mul  (   &L4,   &L7,   &L8 );
    p448_isr  (   &L5,   &L4 );
    p448_mul  (   &L4,   &L6,   &L5 );
    p448_sqr  (   &L6,   &L5 );
    p448_mul  (   &L5,   &L8,   &L6 );
    p448_mul  (   &L8,   &L7,   &L5 );
    p448_mul  (   &L7,   &L8,   &L5 );
    p448_copy (   &L5, &a->x );
    p448_subw (   &L5,     1 );
    p448_isr  (   &L6,   &L4 );
    p448_mul  (   &L4,   &L5,   &L6 );
    p448_sqr  (   &L5,   &L6 );
    p448_mul  (   &L6,   &L8,   &L5 );
    p448_mul  (   &L8,   &L7,   &L6 );
    p448_mul  (   &L7,   &L8,   &L6 );
    p448_copy (   &L6, &a->x );
    p448_subw (   &L6,     1 );
    p448_addw ( &a->x,     1 );
    p448_mul  (   &L6, &a->x,   &L8 );
    p448_sub  ( &a->x,   &L5,   &L6 );
    p448_mul  (   &L5, &a->x,   &L8 );
    p448_sub  ( &a->x,   &L6,   &L5 );
    p448_bias ( &a->x,     3 );
    p448_weak_reduce( &a->x );
    p448_mul  (   &L5,   &L4, &a->x );
@@ -849,7 +852,7 @@ elligator_2s_inject (
    p448_mulw (   &L3,   &L2, 3054649120 );
    p448_add  (   &L2,   &L3, &a->y );
    p448_mul  ( &a->y,   &L7,   &L2 );
       L1 = p448_is_zero(   &L9 );
       L1 = p448_is_zero(   &L8 );
       L0 = -   L1;
    p448_addw ( &a->y,    L0 );
    p448_weak_reduce( &a->y );
@@ -877,83 +880,83 @@ mask_t
 validate_tw_extensible (
    const struct tw_extensible_t* ext
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5;
    mask_t L4, L5;
    struct p448_t L0, L1, L2, L3;
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L2, &ext->t, &ext->u );
    p448_mul  (   &L4, &ext->z,   &L2 );
    p448_addw (   &L4,     0 );
    p448_mul  (   &L3, &ext->x, &ext->y );
    p448_neg  (   &L2,   &L3 );
    p448_add  (   &L3,   &L2,   &L4 );
    p448_bias (   &L3,     2 );
       L1 = p448_is_zero(   &L3 );
    p448_mul  (   &L1, &ext->t, &ext->u );
    p448_mul  (   &L2, &ext->z,   &L1 );
    p448_addw (   &L2,     0 );
    p448_mul  (   &L0, &ext->x, &ext->y );
    p448_neg  (   &L1,   &L0 );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_bias (   &L0,     2 );
       L5 = p448_is_zero(   &L0 );
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
     */
    p448_sqr  (   &L4, &ext->y );
    p448_neg  (   &L2,   &L4 );
    p448_addw (   &L2,     0 );
    p448_sqr  (   &L3, &ext->x );
    p448_add  (   &L4,   &L3,   &L2 );
    p448_sqr  (   &L5, &ext->u );
    p448_sqr  (   &L3, &ext->t );
    p448_mul  (   &L2,   &L3,   &L5 );
    p448_mulw (   &L3,   &L2, 39081 );
    p448_neg  (   &L5,   &L3 );
    p448_add  (   &L3,   &L5,   &L4 );
    p448_neg  (   &L5,   &L2 );
    p448_add  (   &L4,   &L5,   &L3 );
    p448_sqr  (   &L3, &ext->z );
    p448_add  (   &L2,   &L3,   &L4 );
    p448_bias (   &L2,     4 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
    p448_sqr  (   &L2, &ext->y );
    p448_neg  (   &L1,   &L2 );
    p448_addw (   &L1,     0 );
    p448_sqr  (   &L0, &ext->x );
    p448_add  (   &L2,   &L0,   &L1 );
    p448_sqr  (   &L3, &ext->u );
    p448_sqr  (   &L0, &ext->t );
    p448_mul  (   &L1,   &L0,   &L3 );
    p448_mulw (   &L0,   &L1, 39081 );
    p448_neg  (   &L3,   &L0 );
    p448_add  (   &L0,   &L3,   &L2 );
    p448_neg  (   &L3,   &L1 );
    p448_add  (   &L2,   &L3,   &L0 );
    p448_sqr  (   &L1, &ext->z );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_bias (   &L0,     4 );
       L4 = p448_is_zero(   &L0 );
    return    L5 &    L4;
 }

 mask_t
 validate_extensible (
    const struct extensible_t* ext
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5;
    mask_t L4, L5;
    struct p448_t L0, L1, L2, L3;
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 - x^2 - y^2 + z^2
     */
    p448_sqr  (   &L4, &ext->y );
    p448_neg  (   &L3,   &L4 );
    p448_addw (   &L3,     0 );
    p448_sqr  (   &L2, &ext->z );
    p448_add  (   &L4,   &L2,   &L3 );
    p448_sqr  (   &L5, &ext->u );
    p448_sqr  (   &L2, &ext->t );
    p448_mul  (   &L3,   &L2,   &L5 );
    p448_mulw (   &L5,   &L3, 39081 );
    p448_neg  (   &L2,   &L5 );
    p448_add  (   &L3,   &L2,   &L4 );
    p448_sqr  (   &L2, &ext->x );
    p448_neg  (   &L4,   &L2 );
    p448_add  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     4 );
       L1 = p448_is_zero(   &L2 );
    p448_sqr  (   &L2, &ext->y );
    p448_neg  (   &L1,   &L2 );
    p448_addw (   &L1,     0 );
    p448_sqr  (   &L0, &ext->z );
    p448_add  (   &L2,   &L0,   &L1 );
    p448_sqr  (   &L3, &ext->u );
    p448_sqr  (   &L0, &ext->t );
    p448_mul  (   &L1,   &L0,   &L3 );
    p448_mulw (   &L3,   &L1, 39081 );
    p448_neg  (   &L0,   &L3 );
    p448_add  (   &L1,   &L0,   &L2 );
    p448_sqr  (   &L0, &ext->x );
    p448_neg  (   &L2,   &L0 );
    p448_add  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     4 );
       L5 = p448_is_zero(   &L0 );
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L3, &ext->t, &ext->u );
    p448_mul  (   &L4, &ext->z,   &L3 );
    p448_addw (   &L4,     0 );
    p448_mul  (   &L2, &ext->x, &ext->y );
    p448_neg  (   &L3,   &L2 );
    p448_add  (   &L2,   &L3,   &L4 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
    p448_mul  (   &L1, &ext->t, &ext->u );
    p448_mul  (   &L2, &ext->z,   &L1 );
    p448_addw (   &L2,     0 );
    p448_mul  (   &L0, &ext->x, &ext->y );
    p448_neg  (   &L1,   &L0 );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    return    L5 &    L4;
 }


--- a/src/arch_32/p448.c
+++ b/src/arch_32/p448.c
@@ -4,7 +4,6 @@

 #include "word.h"
 #include "p448.h"
 //#include "x86-64-arith.h"

 static inline mask_t __attribute__((always_inline))
 is_zero (
@@ -27,13 +26,7 @@ p448_mul (
    p448_t *__restrict__ cs,
    const p448_t *as,
    const p448_t *bs
 ) {
    // p448_t ar, br;
 //     p448_copy(&ar,as);
 //     p448_copy(&br,bs);
 //     p448_weak_reduce(&ar);
 //     p448_weak_reduce(&br);
    
 ) { 
    const uint32_t *a = as->limb, *b = bs->limb;
    uint32_t *c = cs->limb;

@@ -41,13 +34,7 @@ p448_mul (
    uint32_t mask = (1<<28) - 1;  

    uint32_t aa[8], bb[8];

    /* For some reason clang doesn't vectorize this without prompting? */
    // unsigned int i;
    // for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
    //     ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
    //     ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];     
    // }
    
    int i,j;
    for (i=0; i<8; i++) {
        aa[i] = a[i] + a[i+8];
@@ -144,7 +131,7 @@ p448_sqr (
    p448_t *__restrict__ cs,
    const p448_t *as
 ) {
    p448_mul(cs,as,as); // PERF
    p448_mul(cs,as,as); /* PERF */
 }

 void
--- a/src/arch_arm_32/ec_point.c
+++ b/src/arch_arm_32/ec_point.c
@@ -380,55 +380,55 @@ serialize_montgomery (
    const struct montgomery_t* a,
    const struct p448_t*       sbz
 ) {
    mask_t L0, L1, L2;
    struct p448_t L3, L4, L5, L6;
    p448_mul  (   &L6, &a->z0, &a->zd );
    p448_sub  (   &L4,   &L6, &a->xd );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L6, &a->za,   &L4 );
    p448_mul  (   &L5, &a->z0, &a->xd );
    p448_sub  (   &L4,   &L5, &a->zd );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L3, &a->xa,   &L4 );
    p448_add  (   &L5,   &L3,   &L6 );
    p448_sub  (   &L4,   &L6,   &L3 );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L6,   &L4,   &L5 );
    p448_copy (   &L5, &a->z0 );
    p448_addw (   &L5,     1 );
    p448_sqr  (   &L4,   &L5 );
    p448_mulw (   &L5,   &L4, 39082 );
    p448_neg  (   &L4,   &L5 );
    p448_add  (   &L5, &a->z0, &a->z0 );
    p448_bias (   &L5,     1 );
    p448_add  (   &L3,   &L5,   &L5 );
    p448_add  (   &L5,   &L3,   &L4 );
    p448_weak_reduce(   &L5 );
    p448_mul  (   &L3, &a->xd,   &L5 );
       L1 = p448_is_zero( &a->zd );
       L2 = -   L1;
    p448_mask (   &L4,   &L3,    L1 );
    p448_add  (   &L5,   &L4, &a->zd );
       L0 = ~   L1;
    p448_mul  (   &L4,   sbz,   &L6 );
    p448_addw (   &L4,    L2 );
    p448_mul  (   &L6,   &L5,   &L4 );
    p448_mul  (   &L4,   &L6,   &L5 );
    p448_mul  (   &L5,   &L6, &a->xd );
    p448_mul  (   &L6,   &L4,   &L5 );
    p448_isr  (   &L3,   &L6 );
    p448_mul  (   &L5,   &L4,   &L3 );
    p448_sqr  (   &L4,   &L3 );
    p448_mul  (   &L3,   &L6,   &L4 );
    p448_mask (     b,   &L5,    L0 );
    p448_subw (   &L3,     1 );
    p448_bias (   &L3,     1 );
       L1 = p448_is_zero(   &L3 );
       L0 = p448_is_zero(   sbz );
    return    L1 |    L0;
    mask_t L4, L5, L6;
    struct p448_t L0, L1, L2, L3;
    p448_mul  (   &L3, &a->z0, &a->zd );
    p448_sub  (   &L1,   &L3, &a->xd );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L3, &a->za,   &L1 );
    p448_mul  (   &L2, &a->z0, &a->xd );
    p448_sub  (   &L1,   &L2, &a->zd );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L0, &a->xa,   &L1 );
    p448_add  (   &L2,   &L0,   &L3 );
    p448_sub  (   &L1,   &L3,   &L0 );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L3,   &L1,   &L2 );
    p448_copy (   &L2, &a->z0 );
    p448_addw (   &L2,     1 );
    p448_sqr  (   &L1,   &L2 );
    p448_mulw (   &L2,   &L1, 39082 );
    p448_neg  (   &L1,   &L2 );
    p448_add  (   &L2, &a->z0, &a->z0 );
    p448_bias (   &L2,     1 );
    p448_add  (   &L0,   &L2,   &L2 );
    p448_add  (   &L2,   &L0,   &L1 );
    p448_weak_reduce(   &L2 );
    p448_mul  (   &L0, &a->xd,   &L2 );
       L5 = p448_is_zero( &a->zd );
       L6 = -   L5;
    p448_mask (   &L1,   &L0,    L5 );
    p448_add  (   &L2,   &L1, &a->zd );
       L4 = ~   L5;
    p448_mul  (   &L1,   sbz,   &L3 );
    p448_addw (   &L1,    L6 );
    p448_mul  (   &L3,   &L2,   &L1 );
    p448_mul  (   &L1,   &L3,   &L2 );
    p448_mul  (   &L2,   &L3, &a->xd );
    p448_mul  (   &L3,   &L1,   &L2 );
    p448_isr  (   &L0,   &L3 );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,   &L3,   &L1 );
    p448_mask (     b,   &L2,    L4 );
    p448_subw (   &L0,     1 );
    p448_bias (   &L0,     1 );
       L5 = p448_is_zero(   &L0 );
       L4 = p448_is_zero(   sbz );
    return    L5 |    L4;
 }

 void
@@ -524,8 +524,8 @@ test_only_twist (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3;
    mask_t L2, L3;
    struct p448_t L0, L1;
    p448_sqr  ( &b->u, &a->z );
    p448_sqr  ( &b->y, &a->x );
    p448_sub  ( &b->z, &b->u, &b->y );
@@ -541,35 +541,35 @@ test_only_twist (
    p448_bias ( &b->z,     2 );
    p448_weak_reduce( &b->z );
    p448_mul  ( &b->t, &b->z, &b->x );
    p448_mul  (   &L3, &b->t, &b->u );
    p448_mul  ( &b->x, &b->t,   &L3 );
    p448_isr  (   &L2, &b->x );
    p448_mul  ( &b->u, &b->t,   &L2 );
    p448_sqr  (   &L3,   &L2 );
    p448_mul  ( &b->t, &b->x,   &L3 );
    p448_add  ( &b->x, &a->y, &a->x );
    p448_weak_reduce( &b->x );
    p448_sub  (   &L2, &a->x, &a->y );
    p448_bias (   &L2,     2 );
    p448_weak_reduce(   &L2 );
    p448_mul  (   &L3, &b->t,   &L2 );
    p448_add  (   &L2,   &L3, &b->x );
    p448_sub  ( &b->t, &b->x,   &L3 );
    p448_mul  (   &L1, &b->t, &b->u );
    p448_mul  ( &b->x, &b->t,   &L1 );
    p448_isr  (   &L0, &b->x );
    p448_mul  ( &b->u, &b->t,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  ( &b->t, &b->x,   &L1 );
    p448_add  (   &L1, &a->y, &a->x );
    p448_weak_reduce(   &L1 );
    p448_sub  (   &L0, &a->x, &a->y );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    p448_mul  ( &b->x, &b->t,   &L0 );
    p448_add  (   &L0, &b->x,   &L1 );
    p448_sub  ( &b->t,   &L1, &b->x );
    p448_bias ( &b->t,     2 );
    p448_weak_reduce( &b->t );
    p448_mul  ( &b->x,   &L2, &b->u );
       L0 = p448_is_zero( &b->y );
       L1 = -   L0;
    p448_addw ( &b->x,    L1 );
    p448_mul  ( &b->x,   &L0, &b->u );
       L2 = p448_is_zero( &b->y );
       L3 = -   L2;
    p448_addw ( &b->x,    L3 );
    p448_weak_reduce( &b->x );
    p448_mul  ( &b->y, &b->t, &b->u );
       L0 = p448_is_zero( &b->z );
       L1 = -   L0;
    p448_addw ( &b->y,    L1 );
       L2 = p448_is_zero( &b->z );
       L3 = -   L2;
    p448_addw ( &b->y,    L3 );
    p448_weak_reduce( &b->y );
       L1 = p448_is_zero( &a->y );
       L0 =    L1 +     1;
    p448_set_ui( &b->z,    L0 );
       L3 = p448_is_zero( &a->y );
       L2 =    L3 +     1;
    p448_set_ui( &b->z,    L2 );
    p448_copy ( &b->t, &b->x );
    p448_copy ( &b->u, &b->y );
 }
@@ -578,16 +578,16 @@ mask_t
 is_square (
    const struct p448_t* x
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3;
    p448_isr  (   &L2,     x );
    p448_sqr  (   &L3,   &L2 );
    p448_mul  (   &L2,     x,   &L3 );
    p448_subw (   &L2,     1 );
    p448_bias (   &L2,     1 );
       L1 = p448_is_zero(   &L2 );
       L0 = p448_is_zero(     x );
    return    L1 |    L0;
    mask_t L2, L3;
    struct p448_t L0, L1;
    p448_isr  (   &L0,     x );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,     x,   &L1 );
    p448_subw (   &L0,     1 );
    p448_bias (   &L0,     1 );
       L3 = p448_is_zero(   &L0 );
       L2 = p448_is_zero(     x );
    return    L3 |    L2;
 }

 mask_t
@@ -744,15 +744,15 @@ eq_affine (
    const struct affine_t* a,
    const struct affine_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2;
    p448_sub  (   &L2, &a->x, &b->x );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_sub  (   &L2, &a->y, &b->y );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
    mask_t L1, L2;
    struct p448_t L0;
    p448_sub  (   &L0, &a->x, &b->x );
    p448_bias (   &L0,     2 );
       L2 = p448_is_zero(   &L0 );
    p448_sub  (   &L0, &a->y, &b->y );
    p448_bias (   &L0,     2 );
       L1 = p448_is_zero(   &L0 );
    return    L2 &    L1;
 }

 mask_t
@@ -760,19 +760,19 @@ eq_extensible (
    const struct extensible_t* a,
    const struct extensible_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
    mask_t L3, L4;
    struct p448_t L0, L1, L2;
    p448_mul  (   &L2, &b->z, &a->x );
    p448_mul  (   &L1, &a->z, &b->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    p448_mul  (   &L2, &b->z, &a->y );
    p448_mul  (   &L1, &a->z, &b->y );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L3 = p448_is_zero(   &L0 );
    return    L4 &    L3;
 }

 mask_t
@@ -780,19 +780,19 @@ eq_tw_extensible (
    const struct tw_extensible_t* a,
    const struct tw_extensible_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
    mask_t L3, L4;
    struct p448_t L0, L1, L2;
    p448_mul  (   &L2, &b->z, &a->x );
    p448_mul  (   &L1, &a->z, &b->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    p448_mul  (   &L2, &b->z, &a->y );
    p448_mul  (   &L1, &a->z, &b->y );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L3 = p448_is_zero(   &L0 );
    return    L4 &    L3;
 }

 void
@@ -801,38 +801,41 @@ elligator_2s_inject (
    const struct p448_t* r
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5, L6, L7, L8, L9;
    struct p448_t L2, L3, L4, L5, L6, L7, L8;
    p448_sqr  ( &a->x,     r );
    p448_sqr  (   &L3, &a->x );
    p448_copy ( &a->y,   &L3 );
    p448_subw ( &a->y,     1 );
    p448_neg  (   &L9, &a->y );
    p448_bias (   &L9,     2 );
    p448_weak_reduce(   &L9 );
    p448_sqr  (   &L2,   &L9 );
    p448_mulw (   &L8,   &L2, 1527402724 );
    p448_mulw (   &L7,   &L3, 6108985600 );
    p448_add  ( &a->y,   &L7,   &L8 );
    p448_neg  (   &L4, &a->y );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_sqr  (   &L2,   &L4 );
    p448_mulw (   &L7,   &L2, 1527402724 );
    p448_mulw (   &L8,   &L3, 6108985600 );
    p448_add  ( &a->y,   &L8,   &L7 );
    p448_weak_reduce( &a->y );
    p448_mulw (   &L8,   &L2, 6109454568 );
    p448_sub  (   &L7, &a->y,   &L8 );
    p448_bias (   &L7,     2 );
    p448_weak_reduce(   &L7 );
    p448_mulw (   &L4, &a->y, 78160 );
    p448_mul  (   &L6,   &L7,   &L9 );
    p448_mul  (   &L8,   &L6,   &L4 );
    p448_mulw (   &L6, &a->y, 78160 );
    p448_mul  (   &L5,   &L7,   &L6 );
    p448_mul  (   &L8,   &L5,   &L4 );
    p448_mul  (   &L4,   &L5,   &L6 );
    p448_mul  (   &L5,   &L7,   &L8 );
    p448_mul  (   &L8,   &L5,   &L4 );
    p448_mul  (   &L4,   &L7,   &L8 );
    p448_isr  (   &L5,   &L4 );
    p448_mul  (   &L4,   &L6,   &L5 );
    p448_sqr  (   &L6,   &L5 );
    p448_mul  (   &L5,   &L8,   &L6 );
    p448_mul  (   &L8,   &L7,   &L5 );
    p448_mul  (   &L7,   &L8,   &L5 );
    p448_copy (   &L5, &a->x );
    p448_subw (   &L5,     1 );
    p448_isr  (   &L6,   &L4 );
    p448_mul  (   &L4,   &L5,   &L6 );
    p448_sqr  (   &L5,   &L6 );
    p448_mul  (   &L6,   &L8,   &L5 );
    p448_mul  (   &L8,   &L7,   &L6 );
    p448_mul  (   &L7,   &L8,   &L6 );
    p448_copy (   &L6, &a->x );
    p448_subw (   &L6,     1 );
    p448_addw ( &a->x,     1 );
    p448_mul  (   &L6, &a->x,   &L8 );
    p448_sub  ( &a->x,   &L5,   &L6 );
    p448_mul  (   &L5, &a->x,   &L8 );
    p448_sub  ( &a->x,   &L6,   &L5 );
    p448_bias ( &a->x,     3 );
    p448_weak_reduce( &a->x );
    p448_mul  (   &L5,   &L4, &a->x );
@@ -849,7 +852,7 @@ elligator_2s_inject (
    p448_mulw (   &L3,   &L2, 3054649120 );
    p448_add  (   &L2,   &L3, &a->y );
    p448_mul  ( &a->y,   &L7,   &L2 );
       L1 = p448_is_zero(   &L9 );
       L1 = p448_is_zero(   &L8 );
       L0 = -   L1;
    p448_addw ( &a->y,    L0 );
    p448_weak_reduce( &a->y );
@@ -877,83 +880,83 @@ mask_t
 validate_tw_extensible (
    const struct tw_extensible_t* ext
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5;
    mask_t L4, L5;
    struct p448_t L0, L1, L2, L3;
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L2, &ext->t, &ext->u );
    p448_mul  (   &L4, &ext->z,   &L2 );
    p448_addw (   &L4,     0 );
    p448_mul  (   &L3, &ext->x, &ext->y );
    p448_neg  (   &L2,   &L3 );
    p448_add  (   &L3,   &L2,   &L4 );
    p448_bias (   &L3,     2 );
       L1 = p448_is_zero(   &L3 );
    p448_mul  (   &L1, &ext->t, &ext->u );
    p448_mul  (   &L2, &ext->z,   &L1 );
    p448_addw (   &L2,     0 );
    p448_mul  (   &L0, &ext->x, &ext->y );
    p448_neg  (   &L1,   &L0 );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_bias (   &L0,     2 );
       L5 = p448_is_zero(   &L0 );
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
     */
    p448_sqr  (   &L4, &ext->y );
    p448_neg  (   &L2,   &L4 );
    p448_addw (   &L2,     0 );
    p448_sqr  (   &L3, &ext->x );
    p448_add  (   &L4,   &L3,   &L2 );
    p448_sqr  (   &L5, &ext->u );
    p448_sqr  (   &L3, &ext->t );
    p448_mul  (   &L2,   &L3,   &L5 );
    p448_mulw (   &L3,   &L2, 39081 );
    p448_neg  (   &L5,   &L3 );
    p448_add  (   &L3,   &L5,   &L4 );
    p448_neg  (   &L5,   &L2 );
    p448_add  (   &L4,   &L5,   &L3 );
    p448_sqr  (   &L3, &ext->z );
    p448_add  (   &L2,   &L3,   &L4 );
    p448_bias (   &L2,     4 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
    p448_sqr  (   &L2, &ext->y );
    p448_neg  (   &L1,   &L2 );
    p448_addw (   &L1,     0 );
    p448_sqr  (   &L0, &ext->x );
    p448_add  (   &L2,   &L0,   &L1 );
    p448_sqr  (   &L3, &ext->u );
    p448_sqr  (   &L0, &ext->t );
    p448_mul  (   &L1,   &L0,   &L3 );
    p448_mulw (   &L0,   &L1, 39081 );
    p448_neg  (   &L3,   &L0 );
    p448_add  (   &L0,   &L3,   &L2 );
    p448_neg  (   &L3,   &L1 );
    p448_add  (   &L2,   &L3,   &L0 );
    p448_sqr  (   &L1, &ext->z );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_bias (   &L0,     4 );
       L4 = p448_is_zero(   &L0 );
    return    L5 &    L4;
 }

 mask_t
 validate_extensible (
    const struct extensible_t* ext
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5;
    mask_t L4, L5;
    struct p448_t L0, L1, L2, L3;
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 - x^2 - y^2 + z^2
     */
    p448_sqr  (   &L4, &ext->y );
    p448_neg  (   &L3,   &L4 );
    p448_addw (   &L3,     0 );
    p448_sqr  (   &L2, &ext->z );
    p448_add  (   &L4,   &L2,   &L3 );
    p448_sqr  (   &L5, &ext->u );
    p448_sqr  (   &L2, &ext->t );
    p448_mul  (   &L3,   &L2,   &L5 );
    p448_mulw (   &L5,   &L3, 39081 );
    p448_neg  (   &L2,   &L5 );
    p448_add  (   &L3,   &L2,   &L4 );
    p448_sqr  (   &L2, &ext->x );
    p448_neg  (   &L4,   &L2 );
    p448_add  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     4 );
       L1 = p448_is_zero(   &L2 );
    p448_sqr  (   &L2, &ext->y );
    p448_neg  (   &L1,   &L2 );
    p448_addw (   &L1,     0 );
    p448_sqr  (   &L0, &ext->z );
    p448_add  (   &L2,   &L0,   &L1 );
    p448_sqr  (   &L3, &ext->u );
    p448_sqr  (   &L0, &ext->t );
    p448_mul  (   &L1,   &L0,   &L3 );
    p448_mulw (   &L3,   &L1, 39081 );
    p448_neg  (   &L0,   &L3 );
    p448_add  (   &L1,   &L0,   &L2 );
    p448_sqr  (   &L0, &ext->x );
    p448_neg  (   &L2,   &L0 );
    p448_add  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     4 );
       L5 = p448_is_zero(   &L0 );
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L3, &ext->t, &ext->u );
    p448_mul  (   &L4, &ext->z,   &L3 );
    p448_addw (   &L4,     0 );
    p448_mul  (   &L2, &ext->x, &ext->y );
    p448_neg  (   &L3,   &L2 );
    p448_add  (   &L2,   &L3,   &L4 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
    p448_mul  (   &L1, &ext->t, &ext->u );
    p448_mul  (   &L2, &ext->z,   &L1 );
    p448_addw (   &L2,     0 );
    p448_mul  (   &L0, &ext->x, &ext->y );
    p448_neg  (   &L1,   &L0 );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    return    L5 &    L4;
 }


--- a/src/arch_arm_32/p448.c
+++ b/src/arch_arm_32/p448.c
@@ -4,7 +4,6 @@

 #include "word.h"
 #include "p448.h"
 //#include "x86-64-arith.h"

 static inline mask_t __attribute__((always_inline))
 is_zero (
@@ -105,11 +104,6 @@ p448_mul (
    const p448_t *as,
    const p448_t *bs
 ) {
    // p448_t ar, br;
 //     p448_copy(&ar,as);
 //     p448_copy(&br,bs);
 //     p448_weak_reduce(&ar);
 //     p448_weak_reduce(&br);
    
    const uint32_t *a = as->limb, *b = bs->limb;
    uint32_t *c = cs->limb;
@@ -119,12 +113,6 @@ p448_mul (

    uint32_t aa[8], bm[8];

    /* For some reason clang doesn't vectorize this without prompting? */
    // unsigned int i;
    // for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
    //     ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
    //     ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];     
    // }
    int i;
    for (i=0; i<8; i++) {
        aa[i] = a[i] + a[i+8];
@@ -466,12 +454,6 @@ p448_sqr (
    p448_t *__restrict__ cs,
    const p448_t *as
 ) {
    // p448_t ar, br;
 //     p448_copy(&ar,as);
 //     p448_copy(&br,bs);
 //     p448_weak_reduce(&ar);
 //     p448_weak_reduce(&br);
    
    const uint32_t *a = as->limb;
    uint32_t *c = cs->limb;

@@ -479,13 +461,7 @@ p448_sqr (
    uint32_t mask = (1<<28) - 1;  

    uint32_t bm[8];

    /* For some reason clang doesn't vectorize this without prompting? */
    // unsigned int i;
    // for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
    //     ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&bm[4]))[i];
    //     ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];     
    // }
    
    int i;
    for (i=0; i<8; i++) {
        bm[i] = a[i] - a[i+8];
--- a/src/arch_neon/ec_point.c
+++ b/src/arch_neon/ec_point.c
@@ -380,55 +380,55 @@ serialize_montgomery (
    const struct montgomery_t* a,
    const struct p448_t*       sbz
 ) {
    mask_t L0, L1, L2;
    struct p448_t L3, L4, L5, L6;
    p448_mul  (   &L6, &a->z0, &a->zd );
    p448_sub  (   &L4,   &L6, &a->xd );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L6, &a->za,   &L4 );
    p448_mul  (   &L5, &a->z0, &a->xd );
    p448_sub  (   &L4,   &L5, &a->zd );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L3, &a->xa,   &L4 );
    p448_add  (   &L5,   &L3,   &L6 );
    p448_sub  (   &L4,   &L6,   &L3 );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_mul  (   &L6,   &L4,   &L5 );
    p448_copy (   &L5, &a->z0 );
    p448_addw (   &L5,     1 );
    p448_sqr  (   &L4,   &L5 );
    p448_mulw (   &L5,   &L4, 39082 );
    p448_neg  (   &L4,   &L5 );
    p448_add  (   &L5, &a->z0, &a->z0 );
    p448_bias (   &L5,     1 );
    p448_add  (   &L3,   &L5,   &L5 );
    p448_add  (   &L5,   &L3,   &L4 );
    p448_weak_reduce(   &L5 );
    p448_mul  (   &L3, &a->xd,   &L5 );
       L1 = p448_is_zero( &a->zd );
       L2 = -   L1;
    p448_mask (   &L4,   &L3,    L1 );
    p448_add  (   &L5,   &L4, &a->zd );
       L0 = ~   L1;
    p448_mul  (   &L4,   sbz,   &L6 );
    p448_addw (   &L4,    L2 );
    p448_mul  (   &L6,   &L5,   &L4 );
    p448_mul  (   &L4,   &L6,   &L5 );
    p448_mul  (   &L5,   &L6, &a->xd );
    p448_mul  (   &L6,   &L4,   &L5 );
    p448_isr  (   &L3,   &L6 );
    p448_mul  (   &L5,   &L4,   &L3 );
    p448_sqr  (   &L4,   &L3 );
    p448_mul  (   &L3,   &L6,   &L4 );
    p448_mask (     b,   &L5,    L0 );
    p448_subw (   &L3,     1 );
    p448_bias (   &L3,     1 );
       L1 = p448_is_zero(   &L3 );
       L0 = p448_is_zero(   sbz );
    return    L1 |    L0;
    mask_t L4, L5, L6;
    struct p448_t L0, L1, L2, L3;
    p448_mul  (   &L3, &a->z0, &a->zd );
    p448_sub  (   &L1,   &L3, &a->xd );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L3, &a->za,   &L1 );
    p448_mul  (   &L2, &a->z0, &a->xd );
    p448_sub  (   &L1,   &L2, &a->zd );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L0, &a->xa,   &L1 );
    p448_add  (   &L2,   &L0,   &L3 );
    p448_sub  (   &L1,   &L3,   &L0 );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L3,   &L1,   &L2 );
    p448_copy (   &L2, &a->z0 );
    p448_addw (   &L2,     1 );
    p448_sqr  (   &L1,   &L2 );
    p448_mulw (   &L2,   &L1, 39082 );
    p448_neg  (   &L1,   &L2 );
    p448_add  (   &L2, &a->z0, &a->z0 );
    p448_bias (   &L2,     1 );
    p448_add  (   &L0,   &L2,   &L2 );
    p448_add  (   &L2,   &L0,   &L1 );
    p448_weak_reduce(   &L2 );
    p448_mul  (   &L0, &a->xd,   &L2 );
       L5 = p448_is_zero( &a->zd );
       L6 = -   L5;
    p448_mask (   &L1,   &L0,    L5 );
    p448_add  (   &L2,   &L1, &a->zd );
       L4 = ~   L5;
    p448_mul  (   &L1,   sbz,   &L3 );
    p448_addw (   &L1,    L6 );
    p448_mul  (   &L3,   &L2,   &L1 );
    p448_mul  (   &L1,   &L3,   &L2 );
    p448_mul  (   &L2,   &L3, &a->xd );
    p448_mul  (   &L3,   &L1,   &L2 );
    p448_isr  (   &L0,   &L3 );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,   &L3,   &L1 );
    p448_mask (     b,   &L2,    L4 );
    p448_subw (   &L0,     1 );
    p448_bias (   &L0,     1 );
       L5 = p448_is_zero(   &L0 );
       L4 = p448_is_zero(   sbz );
    return    L5 |    L4;
 }

 void
@@ -524,8 +524,8 @@ test_only_twist (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3;
    mask_t L2, L3;
    struct p448_t L0, L1;
    p448_sqr  ( &b->u, &a->z );
    p448_sqr  ( &b->y, &a->x );
    p448_sub  ( &b->z, &b->u, &b->y );
@@ -541,35 +541,35 @@ test_only_twist (
    p448_bias ( &b->z,     2 );
    p448_weak_reduce( &b->z );
    p448_mul  ( &b->t, &b->z, &b->x );
    p448_mul  (   &L3, &b->t, &b->u );
    p448_mul  ( &b->x, &b->t,   &L3 );
    p448_isr  (   &L2, &b->x );
    p448_mul  ( &b->u, &b->t,   &L2 );
    p448_sqr  (   &L3,   &L2 );
    p448_mul  ( &b->t, &b->x,   &L3 );
    p448_add  ( &b->x, &a->y, &a->x );
    p448_weak_reduce( &b->x );
    p448_sub  (   &L2, &a->x, &a->y );
    p448_bias (   &L2,     2 );
    p448_weak_reduce(   &L2 );
    p448_mul  (   &L3, &b->t,   &L2 );
    p448_add  (   &L2,   &L3, &b->x );
    p448_sub  ( &b->t, &b->x,   &L3 );
    p448_mul  (   &L1, &b->t, &b->u );
    p448_mul  ( &b->x, &b->t,   &L1 );
    p448_isr  (   &L0, &b->x );
    p448_mul  ( &b->u, &b->t,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  ( &b->t, &b->x,   &L1 );
    p448_add  (   &L1, &a->y, &a->x );
    p448_weak_reduce(   &L1 );
    p448_sub  (   &L0, &a->x, &a->y );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    p448_mul  ( &b->x, &b->t,   &L0 );
    p448_add  (   &L0, &b->x,   &L1 );
    p448_sub  ( &b->t,   &L1, &b->x );
    p448_bias ( &b->t,     2 );
    p448_weak_reduce( &b->t );
    p448_mul  ( &b->x,   &L2, &b->u );
       L0 = p448_is_zero( &b->y );
       L1 = -   L0;
    p448_addw ( &b->x,    L1 );
    p448_mul  ( &b->x,   &L0, &b->u );
       L2 = p448_is_zero( &b->y );
       L3 = -   L2;
    p448_addw ( &b->x,    L3 );
    p448_weak_reduce( &b->x );
    p448_mul  ( &b->y, &b->t, &b->u );
       L0 = p448_is_zero( &b->z );
       L1 = -   L0;
    p448_addw ( &b->y,    L1 );
       L2 = p448_is_zero( &b->z );
       L3 = -   L2;
    p448_addw ( &b->y,    L3 );
    p448_weak_reduce( &b->y );
       L1 = p448_is_zero( &a->y );
       L0 =    L1 +     1;
    p448_set_ui( &b->z,    L0 );
       L3 = p448_is_zero( &a->y );
       L2 =    L3 +     1;
    p448_set_ui( &b->z,    L2 );
    p448_copy ( &b->t, &b->x );
    p448_copy ( &b->u, &b->y );
 }
@@ -578,16 +578,16 @@ mask_t
 is_square (
    const struct p448_t* x
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3;
    p448_isr  (   &L2,     x );
    p448_sqr  (   &L3,   &L2 );
    p448_mul  (   &L2,     x,   &L3 );
    p448_subw (   &L2,     1 );
    p448_bias (   &L2,     1 );
       L1 = p448_is_zero(   &L2 );
       L0 = p448_is_zero(     x );
    return    L1 |    L0;
    mask_t L2, L3;
    struct p448_t L0, L1;
    p448_isr  (   &L0,     x );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,     x,   &L1 );
    p448_subw (   &L0,     1 );
    p448_bias (   &L0,     1 );
       L3 = p448_is_zero(   &L0 );
       L2 = p448_is_zero(     x );
    return    L3 |    L2;
 }

 mask_t
@@ -744,15 +744,15 @@ eq_affine (
    const struct affine_t* a,
    const struct affine_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2;
    p448_sub  (   &L2, &a->x, &b->x );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_sub  (   &L2, &a->y, &b->y );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
    mask_t L1, L2;
    struct p448_t L0;
    p448_sub  (   &L0, &a->x, &b->x );
    p448_bias (   &L0,     2 );
       L2 = p448_is_zero(   &L0 );
    p448_sub  (   &L0, &a->y, &b->y );
    p448_bias (   &L0,     2 );
       L1 = p448_is_zero(   &L0 );
    return    L2 &    L1;
 }

 mask_t
@@ -760,19 +760,19 @@ eq_extensible (
    const struct extensible_t* a,
    const struct extensible_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
    mask_t L3, L4;
    struct p448_t L0, L1, L2;
    p448_mul  (   &L2, &b->z, &a->x );
    p448_mul  (   &L1, &a->z, &b->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    p448_mul  (   &L2, &b->z, &a->y );
    p448_mul  (   &L1, &a->z, &b->y );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L3 = p448_is_zero(   &L0 );
    return    L4 &    L3;
 }

 mask_t
@@ -780,19 +780,19 @@ eq_tw_extensible (
    const struct tw_extensible_t* a,
    const struct tw_extensible_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
    mask_t L3, L4;
    struct p448_t L0, L1, L2;
    p448_mul  (   &L2, &b->z, &a->x );
    p448_mul  (   &L1, &a->z, &b->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    p448_mul  (   &L2, &b->z, &a->y );
    p448_mul  (   &L1, &a->z, &b->y );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L3 = p448_is_zero(   &L0 );
    return    L4 &    L3;
 }

 void
@@ -801,38 +801,41 @@ elligator_2s_inject (
    const struct p448_t* r
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5, L6, L7, L8, L9;
    struct p448_t L2, L3, L4, L5, L6, L7, L8;
    p448_sqr  ( &a->x,     r );
    p448_sqr  (   &L3, &a->x );
    p448_copy ( &a->y,   &L3 );
    p448_subw ( &a->y,     1 );
    p448_neg  (   &L9, &a->y );
    p448_bias (   &L9,     2 );
    p448_weak_reduce(   &L9 );
    p448_sqr  (   &L2,   &L9 );
    p448_mulw (   &L8,   &L2, 1527402724 );
    p448_mulw (   &L7,   &L3, 6108985600 );
    p448_add  ( &a->y,   &L7,   &L8 );
    p448_neg  (   &L4, &a->y );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_sqr  (   &L2,   &L4 );
    p448_mulw (   &L7,   &L2, 1527402724 );
    p448_mulw (   &L8,   &L3, 6108985600 );
    p448_add  ( &a->y,   &L8,   &L7 );
    p448_weak_reduce( &a->y );
    p448_mulw (   &L8,   &L2, 6109454568 );
    p448_sub  (   &L7, &a->y,   &L8 );
    p448_bias (   &L7,     2 );
    p448_weak_reduce(   &L7 );
    p448_mulw (   &L4, &a->y, 78160 );
    p448_mul  (   &L6,   &L7,   &L9 );
    p448_mul  (   &L8,   &L6,   &L4 );
    p448_mulw (   &L6, &a->y, 78160 );
    p448_mul  (   &L5,   &L7,   &L6 );
    p448_mul  (   &L8,   &L5,   &L4 );
    p448_mul  (   &L4,   &L5,   &L6 );
    p448_mul  (   &L5,   &L7,   &L8 );
    p448_mul  (   &L8,   &L5,   &L4 );
    p448_mul  (   &L4,   &L7,   &L8 );
    p448_isr  (   &L5,   &L4 );
    p448_mul  (   &L4,   &L6,   &L5 );
    p448_sqr  (   &L6,   &L5 );
    p448_mul  (   &L5,   &L8,   &L6 );
    p448_mul  (   &L8,   &L7,   &L5 );
    p448_mul  (   &L7,   &L8,   &L5 );
    p448_copy (   &L5, &a->x );
    p448_subw (   &L5,     1 );
    p448_isr  (   &L6,   &L4 );
    p448_mul  (   &L4,   &L5,   &L6 );
    p448_sqr  (   &L5,   &L6 );
    p448_mul  (   &L6,   &L8,   &L5 );
    p448_mul  (   &L8,   &L7,   &L6 );
    p448_mul  (   &L7,   &L8,   &L6 );
    p448_copy (   &L6, &a->x );
    p448_subw (   &L6,     1 );
    p448_addw ( &a->x,     1 );
    p448_mul  (   &L6, &a->x,   &L8 );
    p448_sub  ( &a->x,   &L5,   &L6 );
    p448_mul  (   &L5, &a->x,   &L8 );
    p448_sub  ( &a->x,   &L6,   &L5 );
    p448_bias ( &a->x,     3 );
    p448_weak_reduce( &a->x );
    p448_mul  (   &L5,   &L4, &a->x );
@@ -849,7 +852,7 @@ elligator_2s_inject (
    p448_mulw (   &L3,   &L2, 3054649120 );
    p448_add  (   &L2,   &L3, &a->y );
    p448_mul  ( &a->y,   &L7,   &L2 );
       L1 = p448_is_zero(   &L9 );
       L1 = p448_is_zero(   &L8 );
       L0 = -   L1;
    p448_addw ( &a->y,    L0 );
    p448_weak_reduce( &a->y );
@@ -877,83 +880,83 @@ mask_t
 validate_tw_extensible (
    const struct tw_extensible_t* ext
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5;
    mask_t L4, L5;
    struct p448_t L0, L1, L2, L3;
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L2, &ext->t, &ext->u );
    p448_mul  (   &L4, &ext->z,   &L2 );
    p448_addw (   &L4,     0 );
    p448_mul  (   &L3, &ext->x, &ext->y );
    p448_neg  (   &L2,   &L3 );
    p448_add  (   &L3,   &L2,   &L4 );
    p448_bias (   &L3,     2 );
       L1 = p448_is_zero(   &L3 );
    p448_mul  (   &L1, &ext->t, &ext->u );
    p448_mul  (   &L2, &ext->z,   &L1 );
    p448_addw (   &L2,     0 );
    p448_mul  (   &L0, &ext->x, &ext->y );
    p448_neg  (   &L1,   &L0 );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_bias (   &L0,     2 );
       L5 = p448_is_zero(   &L0 );
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
     */
    p448_sqr  (   &L4, &ext->y );
    p448_neg  (   &L2,   &L4 );
    p448_addw (   &L2,     0 );
    p448_sqr  (   &L3, &ext->x );
    p448_add  (   &L4,   &L3,   &L2 );
    p448_sqr  (   &L5, &ext->u );
    p448_sqr  (   &L3, &ext->t );
    p448_mul  (   &L2,   &L3,   &L5 );
    p448_mulw (   &L3,   &L2, 39081 );
    p448_neg  (   &L5,   &L3 );
    p448_add  (   &L3,   &L5,   &L4 );
    p448_neg  (   &L5,   &L2 );
    p448_add  (   &L4,   &L5,   &L3 );
    p448_sqr  (   &L3, &ext->z );
    p448_add  (   &L2,   &L3,   &L4 );
    p448_bias (   &L2,     4 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
    p448_sqr  (   &L2, &ext->y );
    p448_neg  (   &L1,   &L2 );
    p448_addw (   &L1,     0 );
    p448_sqr  (   &L0, &ext->x );
    p448_add  (   &L2,   &L0,   &L1 );
    p448_sqr  (   &L3, &ext->u );
    p448_sqr  (   &L0, &ext->t );
    p448_mul  (   &L1,   &L0,   &L3 );
    p448_mulw (   &L0,   &L1, 39081 );
    p448_neg  (   &L3,   &L0 );
    p448_add  (   &L0,   &L3,   &L2 );
    p448_neg  (   &L3,   &L1 );
    p448_add  (   &L2,   &L3,   &L0 );
    p448_sqr  (   &L1, &ext->z );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_bias (   &L0,     4 );
       L4 = p448_is_zero(   &L0 );
    return    L5 &    L4;
 }

 mask_t
 validate_extensible (
    const struct extensible_t* ext
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5;
    mask_t L4, L5;
    struct p448_t L0, L1, L2, L3;
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 - x^2 - y^2 + z^2
     */
    p448_sqr  (   &L4, &ext->y );
    p448_neg  (   &L3,   &L4 );
    p448_addw (   &L3,     0 );
    p448_sqr  (   &L2, &ext->z );
    p448_add  (   &L4,   &L2,   &L3 );
    p448_sqr  (   &L5, &ext->u );
    p448_sqr  (   &L2, &ext->t );
    p448_mul  (   &L3,   &L2,   &L5 );
    p448_mulw (   &L5,   &L3, 39081 );
    p448_neg  (   &L2,   &L5 );
    p448_add  (   &L3,   &L2,   &L4 );
    p448_sqr  (   &L2, &ext->x );
    p448_neg  (   &L4,   &L2 );
    p448_add  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     4 );
       L1 = p448_is_zero(   &L2 );
    p448_sqr  (   &L2, &ext->y );
    p448_neg  (   &L1,   &L2 );
    p448_addw (   &L1,     0 );
    p448_sqr  (   &L0, &ext->z );
    p448_add  (   &L2,   &L0,   &L1 );
    p448_sqr  (   &L3, &ext->u );
    p448_sqr  (   &L0, &ext->t );
    p448_mul  (   &L1,   &L0,   &L3 );
    p448_mulw (   &L3,   &L1, 39081 );
    p448_neg  (   &L0,   &L3 );
    p448_add  (   &L1,   &L0,   &L2 );
    p448_sqr  (   &L0, &ext->x );
    p448_neg  (   &L2,   &L0 );
    p448_add  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     4 );
       L5 = p448_is_zero(   &L0 );
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L3, &ext->t, &ext->u );
    p448_mul  (   &L4, &ext->z,   &L3 );
    p448_addw (   &L4,     0 );
    p448_mul  (   &L2, &ext->x, &ext->y );
    p448_neg  (   &L3,   &L2 );
    p448_add  (   &L2,   &L3,   &L4 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
    p448_mul  (   &L1, &ext->t, &ext->u );
    p448_mul  (   &L2, &ext->z,   &L1 );
    p448_addw (   &L2,     0 );
    p448_mul  (   &L0, &ext->x, &ext->y );
    p448_neg  (   &L1,   &L0 );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    return    L5 &    L4;
 }


--- a/src/arch_neon/neon_emulation.h
+++ b/src/arch_neon/neon_emulation.h
@@ -8,9 +8,12 @@
 *
 * This lets you test and debug NEON code on x86.
 */

 #ifndef __NEON_EMULATION_H__
 #define __NEON_EMULATION_H__ 1

 /** @cond internal */

 #include "word.h"

 #include <stdint.h>
@@ -147,4 +150,6 @@ static inline int64x2_t vmull_lane_s32 (
    return xx*(lane?yy.yy:yy.xx);
 }

 /** @endcond */

 #endif /* __NEON_EMULATION_H__ */
--- a/src/arch_neon/p448.c
+++ b/src/arch_neon/p448.c
@@ -37,7 +37,7 @@ xx_vaddup_s64(int64x2_t x) {
 }
 #else
 #include "neon_emulation.h"
 #endif // ARM_NEON
 #endif /* ARM_NEON */

 static inline void __attribute__((gnu_inline,always_inline))
 smlal (
@@ -75,12 +75,6 @@ smull2 (
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
 }

 // static inline int64x2_t copy_now(int64x2_t x) {
 //     int64x2_t y;
 //     __asm__ ("vmov %0, %1" : "=w"(y) : "w"(x));
 //     return y;
 // }

 void
 p448_mul (
    p448_t *__restrict__ cs,
--- a/src/arch_x86_64/ec_point.c
+++ b/src/arch_x86_64/ec_point.c
@@ -356,51 +356,51 @@ serialize_montgomery (
    const struct montgomery_t* a,
    const struct p448_t*       sbz
 ) {
    struct p448_t L0, L1, L2, L3;
    mask_t L4, L5, L6;
    p448_mul  (   &L3, &a->z0, &a->zd );
    p448_sub  (   &L1,   &L3, &a->xd );
    p448_bias (   &L1,     2 );
    p448_mul  (   &L3, &a->za,   &L1 );
    p448_mul  (   &L2, &a->z0, &a->xd );
    p448_sub  (   &L1,   &L2, &a->zd );
    p448_bias (   &L1,     2 );
    p448_mul  (   &L2, &a->xa,   &L1 );
    p448_add  (   &L1,   &L2,   &L3 );
    p448_sub  (   &L0,   &L3,   &L2 );
    p448_bias (   &L0,     2 );
    p448_mul  (   &L3,   &L0,   &L1 );
    p448_copy (   &L2, &a->z0 );
    p448_addw (   &L2,     1 );
    p448_sqr  (   &L1,   &L2 );
    p448_mulw (   &L2,   &L1, 39082 );
    p448_neg  (   &L1,   &L2 );
    p448_add  (   &L0, &a->z0, &a->z0 );
    p448_bias (   &L0,     1 );
    p448_add  (   &L2,   &L0,   &L0 );
    p448_add  (   &L0,   &L2,   &L1 );
    p448_mul  (   &L2, &a->xd,   &L0 );
       L5 = p448_is_zero( &a->zd );
       L6 = -   L5;
    p448_mask (   &L1,   &L2,    L5 );
    p448_add  (   &L2,   &L1, &a->zd );
       L4 = ~   L5;
    p448_mul  (   &L1,   sbz,   &L3 );
    p448_addw (   &L1,    L6 );
    p448_mul  (   &L3,   &L2,   &L1 );
    p448_mul  (   &L1,   &L3,   &L2 );
    p448_mul  (   &L2,   &L3, &a->xd );
    p448_mul  (   &L3,   &L1,   &L2 );
    p448_isr  (   &L0,   &L3 );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,   &L3,   &L1 );
    p448_mask (     b,   &L2,    L4 );
    p448_subw (   &L0,     1 );
    p448_bias (   &L0,     1 );
       L5 = p448_is_zero(   &L0 );
       L4 = p448_is_zero(   sbz );
    return    L5 |    L4;
    mask_t L0, L1, L2;
    struct p448_t L3, L4, L5, L6;
    p448_mul  (   &L6, &a->z0, &a->zd );
    p448_sub  (   &L4,   &L6, &a->xd );
    p448_bias (   &L4,     2 );
    p448_mul  (   &L6, &a->za,   &L4 );
    p448_mul  (   &L5, &a->z0, &a->xd );
    p448_sub  (   &L4,   &L5, &a->zd );
    p448_bias (   &L4,     2 );
    p448_mul  (   &L3, &a->xa,   &L4 );
    p448_add  (   &L5,   &L3,   &L6 );
    p448_sub  (   &L4,   &L6,   &L3 );
    p448_bias (   &L4,     2 );
    p448_mul  (   &L6,   &L4,   &L5 );
    p448_copy (   &L5, &a->z0 );
    p448_addw (   &L5,     1 );
    p448_sqr  (   &L4,   &L5 );
    p448_mulw (   &L5,   &L4, 39082 );
    p448_neg  (   &L4,   &L5 );
    p448_add  (   &L3, &a->z0, &a->z0 );
    p448_bias (   &L3,     1 );
    p448_add  (   &L5,   &L3,   &L3 );
    p448_add  (   &L3,   &L5,   &L4 );
    p448_mul  (   &L5, &a->xd,   &L3 );
       L1 = p448_is_zero( &a->zd );
       L2 = -   L1;
    p448_mask (   &L4,   &L5,    L1 );
    p448_add  (   &L5,   &L4, &a->zd );
       L0 = ~   L1;
    p448_mul  (   &L4,   sbz,   &L6 );
    p448_addw (   &L4,    L2 );
    p448_mul  (   &L6,   &L5,   &L4 );
    p448_mul  (   &L4,   &L6,   &L5 );
    p448_mul  (   &L5,   &L6, &a->xd );
    p448_mul  (   &L6,   &L4,   &L5 );
    p448_isr  (   &L3,   &L6 );
    p448_mul  (   &L5,   &L4,   &L3 );
    p448_sqr  (   &L4,   &L3 );
    p448_mul  (   &L3,   &L6,   &L4 );
    p448_mask (     b,   &L5,    L0 );
    p448_subw (   &L3,     1 );
    p448_bias (   &L3,     1 );
       L1 = p448_is_zero(   &L3 );
       L0 = p448_is_zero(   sbz );
    return    L1 |    L0;
 }

 void
@@ -491,8 +491,8 @@ test_only_twist (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    struct p448_t L0, L1;
    mask_t L2, L3;
    mask_t L0, L1;
    struct p448_t L2, L3;
    p448_sqr  ( &b->u, &a->z );
    p448_sqr  ( &b->y, &a->x );
    p448_sub  ( &b->z, &b->u, &b->y );
@@ -501,36 +501,36 @@ test_only_twist (
    p448_add  ( &b->u, &b->y, &b->y );
    p448_sub  ( &b->y, &a->z, &a->x );
    p448_bias ( &b->y,     2 );
    p448_mul  ( &b->t, &b->y, &a->y );
    p448_mul  ( &b->x, &b->y, &a->y );
    p448_sub  ( &b->z, &a->z, &a->y );
    p448_bias ( &b->z,     2 );
    p448_mul  ( &b->x, &b->z, &b->t );
    p448_mul  ( &b->t, &b->x, &b->u );
    p448_mul  (   &L1, &b->x, &b->t );
    p448_isr  ( &b->t,   &L1 );
    p448_mul  ( &b->u, &b->x, &b->t );
    p448_sqr  ( &b->x, &b->t );
    p448_mul  ( &b->t,   &L1, &b->x );
    p448_add  (   &L1, &a->y, &a->x );
    p448_sub  (   &L0, &a->x, &a->y );
    p448_bias (   &L0,     2 );
    p448_mul  ( &b->x, &b->t,   &L0 );
    p448_add  (   &L0, &b->x,   &L1 );
    p448_sub  ( &b->t,   &L1, &b->x );
    p448_mul  ( &b->t, &b->z, &b->x );
    p448_mul  (   &L3, &b->t, &b->u );
    p448_mul  ( &b->x, &b->t,   &L3 );
    p448_isr  (   &L2, &b->x );
    p448_mul  ( &b->u, &b->t,   &L2 );
    p448_sqr  (   &L3,   &L2 );
    p448_mul  ( &b->t, &b->x,   &L3 );
    p448_add  (   &L3, &a->y, &a->x );
    p448_sub  (   &L2, &a->x, &a->y );
    p448_bias (   &L2,     2 );
    p448_mul  ( &b->x, &b->t,   &L2 );
    p448_add  (   &L2, &b->x,   &L3 );
    p448_sub  ( &b->t,   &L3, &b->x );
    p448_bias ( &b->t,     2 );
    p448_mul  ( &b->x,   &L0, &b->u );
       L2 = p448_is_zero( &b->y );
       L3 = -   L2;
    p448_addw ( &b->x,    L3 );
    p448_mul  ( &b->x,   &L2, &b->u );
       L0 = p448_is_zero( &b->y );
       L1 = -   L0;
    p448_addw ( &b->x,    L1 );
    p448_weak_reduce( &b->x );
    p448_mul  ( &b->y, &b->t, &b->u );
       L2 = p448_is_zero( &b->z );
       L3 = -   L2;
    p448_addw ( &b->y,    L3 );
       L0 = p448_is_zero( &b->z );
       L1 = -   L0;
    p448_addw ( &b->y,    L1 );
    p448_weak_reduce( &b->y );
       L3 = p448_is_zero( &a->y );
       L2 =    L3 +     1;
    p448_set_ui( &b->z,    L2 );
       L1 = p448_is_zero( &a->y );
       L0 =    L1 +     1;
    p448_set_ui( &b->z,    L0 );
    p448_copy ( &b->t, &b->x );
    p448_copy ( &b->u, &b->y );
 }
@@ -539,16 +539,16 @@ mask_t
 is_square (
    const struct p448_t* x
 ) {
    struct p448_t L0, L1;
    mask_t L2, L3;
    p448_isr  (   &L0,     x );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,     x,   &L1 );
    p448_subw (   &L0,     1 );
    p448_bias (   &L0,     1 );
       L3 = p448_is_zero(   &L0 );
       L2 = p448_is_zero(     x );
    return    L3 |    L2;
    mask_t L0, L1;
    struct p448_t L2, L3;
    p448_isr  (   &L2,     x );
    p448_sqr  (   &L3,   &L2 );
    p448_mul  (   &L2,     x,   &L3 );
    p448_subw (   &L2,     1 );
    p448_bias (   &L2,     1 );
       L1 = p448_is_zero(   &L2 );
       L0 = p448_is_zero(     x );
    return    L1 |    L0;
 }

 mask_t
@@ -700,15 +700,15 @@ eq_affine (
    const struct affine_t* a,
    const struct affine_t* b
 ) {
    struct p448_t L0;
    mask_t L1, L2;
    p448_sub  (   &L0, &a->x, &b->x );
    p448_bias (   &L0,     2 );
       L2 = p448_is_zero(   &L0 );
    p448_sub  (   &L0, &a->y, &b->y );
    p448_bias (   &L0,     2 );
       L1 = p448_is_zero(   &L0 );
    return    L2 &    L1;
    mask_t L0, L1;
    struct p448_t L2;
    p448_sub  (   &L2, &a->x, &b->x );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_sub  (   &L2, &a->y, &b->y );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 mask_t
@@ -716,19 +716,19 @@ eq_extensible (
    const struct extensible_t* a,
    const struct extensible_t* b
 ) {
    struct p448_t L0, L1, L2;
    mask_t L3, L4;
    p448_mul  (   &L2, &b->z, &a->x );
    p448_mul  (   &L1, &a->z, &b->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    p448_mul  (   &L2, &b->z, &a->y );
    p448_mul  (   &L1, &a->z, &b->y );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L3 = p448_is_zero(   &L0 );
    return    L4 &    L3;
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 mask_t
@@ -736,19 +736,19 @@ eq_tw_extensible (
    const struct tw_extensible_t* a,
    const struct tw_extensible_t* b
 ) {
    struct p448_t L0, L1, L2;
    mask_t L3, L4;
    p448_mul  (   &L2, &b->z, &a->x );
    p448_mul  (   &L1, &a->z, &b->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    p448_mul  (   &L2, &b->z, &a->y );
    p448_mul  (   &L1, &a->z, &b->y );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L3 = p448_is_zero(   &L0 );
    return    L4 &    L3;
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 void
@@ -756,53 +756,56 @@ elligator_2s_inject (
    struct affine_t*     a,
    const struct p448_t* r
 ) {
    struct p448_t L0, L1, L2, L3, L4, L5, L6, L7;
    mask_t L8, L9;
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5, L6, L7, L8;
    p448_sqr  ( &a->x,     r );
    p448_sqr  (   &L1, &a->x );
    p448_copy ( &a->y,   &L1 );
    p448_sqr  (   &L3, &a->x );
    p448_copy ( &a->y,   &L3 );
    p448_subw ( &a->y,     1 );
    p448_neg  (   &L7, &a->y );
    p448_neg  (   &L4, &a->y );
    p448_bias (   &L4,     2 );
    p448_sqr  (   &L2,   &L4 );
    p448_mulw (   &L7,   &L2, 1527402724 );
    p448_mulw (   &L8,   &L3, 6108985600 );
    p448_add  ( &a->y,   &L8,   &L7 );
    p448_mulw (   &L8,   &L2, 6109454568 );
    p448_sub  (   &L7, &a->y,   &L8 );
    p448_bias (   &L7,     2 );
    p448_sqr  (   &L0,   &L7 );
    p448_mulw (   &L6,   &L0, 1527402724 );
    p448_mulw (   &L5,   &L1, 6108985600 );
    p448_add  ( &a->y,   &L5,   &L6 );
    p448_mulw (   &L6,   &L0, 6109454568 );
    p448_sub  (   &L5, &a->y,   &L6 );
    p448_bias (   &L5,     2 );
    p448_mulw (   &L2, &a->y, 78160 );
    p448_mul  (   &L4,   &L5,   &L7 );
    p448_mul  (   &L6,   &L4,   &L2 );
    p448_mul  (   &L2,   &L5,   &L6 );
    p448_isr  (   &L3,   &L2 );
    p448_mul  (   &L2,   &L4,   &L3 );
    p448_sqr  (   &L4,   &L3 );
    p448_mul  (   &L3,   &L6,   &L4 );
    p448_mul  (   &L6,   &L5,   &L3 );
    p448_mul  (   &L5,   &L6,   &L3 );
    p448_copy (   &L4, &a->x );
    p448_subw (   &L4,     1 );
    p448_mulw (   &L6, &a->y, 78160 );
    p448_mul  (   &L5,   &L7,   &L6 );
    p448_mul  (   &L8,   &L5,   &L4 );
    p448_mul  (   &L4,   &L5,   &L6 );
    p448_mul  (   &L5,   &L7,   &L8 );
    p448_mul  (   &L8,   &L5,   &L4 );
    p448_mul  (   &L4,   &L7,   &L8 );
    p448_isr  (   &L6,   &L4 );
    p448_mul  (   &L4,   &L5,   &L6 );
    p448_sqr  (   &L5,   &L6 );
    p448_mul  (   &L6,   &L8,   &L5 );
    p448_mul  (   &L8,   &L7,   &L6 );
    p448_mul  (   &L7,   &L8,   &L6 );
    p448_copy (   &L6, &a->x );
    p448_subw (   &L6,     1 );
    p448_addw ( &a->x,     1 );
    p448_mul  (   &L3, &a->x,   &L6 );
    p448_sub  ( &a->x,   &L4,   &L3 );
    p448_mul  (   &L5, &a->x,   &L8 );
    p448_sub  ( &a->x,   &L6,   &L5 );
    p448_bias ( &a->x,     3 );
    p448_mul  (   &L3,   &L2, &a->x );
    p448_mulw (   &L2,   &L3, 78160 );
    p448_neg  ( &a->x,   &L2 );
    p448_mul  (   &L5,   &L4, &a->x );
    p448_mulw (   &L4,   &L5, 78160 );
    p448_neg  ( &a->x,   &L4 );
    p448_bias ( &a->x,     2 );
    p448_weak_reduce( &a->x );
    p448_add  (   &L2,   &L1,   &L1 );
    p448_add  (   &L1,   &L2,   &L0 );
    p448_subw (   &L1,     2 );
    p448_bias (   &L1,     1 );
    p448_mul  (   &L0,   &L1,   &L6 );
    p448_mulw (   &L1,   &L0, 3054649120 );
    p448_add  (   &L0,   &L1, &a->y );
    p448_mul  ( &a->y,   &L5,   &L0 );
       L9 = p448_is_zero(   &L7 );
       L8 = -   L9;
    p448_addw ( &a->y,    L8 );
    p448_add  (   &L4,   &L3,   &L3 );
    p448_add  (   &L3,   &L4,   &L2 );
    p448_subw (   &L3,     2 );
    p448_bias (   &L3,     1 );
    p448_mul  (   &L2,   &L3,   &L8 );
    p448_mulw (   &L3,   &L2, 3054649120 );
    p448_add  (   &L2,   &L3, &a->y );
    p448_mul  ( &a->y,   &L7,   &L2 );
       L1 = p448_is_zero(   &L8 );
       L0 = -   L1;
    p448_addw ( &a->y,    L0 );
    p448_weak_reduce( &a->y );
 }

@@ -828,83 +831,83 @@ mask_t
 validate_tw_extensible (
    const struct tw_extensible_t* ext
 ) {
    struct p448_t L0, L1, L2, L3;
    mask_t L4, L5;
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5;
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L0, &ext->t, &ext->u );
    p448_mul  (   &L2, &ext->z,   &L0 );
    p448_addw (   &L2,     0 );
    p448_mul  (   &L1, &ext->x, &ext->y );
    p448_neg  (   &L0,   &L1 );
    p448_add  (   &L1,   &L0,   &L2 );
    p448_bias (   &L1,     2 );
       L5 = p448_is_zero(   &L1 );
    p448_mul  (   &L2, &ext->t, &ext->u );
    p448_mul  (   &L4, &ext->z,   &L2 );
    p448_addw (   &L4,     0 );
    p448_mul  (   &L3, &ext->x, &ext->y );
    p448_neg  (   &L2,   &L3 );
    p448_add  (   &L3,   &L2,   &L4 );
    p448_bias (   &L3,     2 );
       L1 = p448_is_zero(   &L3 );
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
     */
    p448_sqr  (   &L2, &ext->y );
    p448_neg  (   &L0,   &L2 );
    p448_addw (   &L0,     0 );
    p448_sqr  (   &L1, &ext->x );
    p448_add  (   &L2,   &L1,   &L0 );
    p448_sqr  (   &L3, &ext->u );
    p448_sqr  (   &L1, &ext->t );
    p448_mul  (   &L0,   &L1,   &L3 );
    p448_mulw (   &L1,   &L0, 39081 );
    p448_neg  (   &L3,   &L1 );
    p448_add  (   &L1,   &L3,   &L2 );
    p448_neg  (   &L3,   &L0 );
    p448_add  (   &L2,   &L3,   &L1 );
    p448_sqr  (   &L1, &ext->z );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_bias (   &L0,     4 );
       L4 = p448_is_zero(   &L0 );
    return    L5 &    L4;
    p448_sqr  (   &L4, &ext->y );
    p448_neg  (   &L2,   &L4 );
    p448_addw (   &L2,     0 );
    p448_sqr  (   &L3, &ext->x );
    p448_add  (   &L4,   &L3,   &L2 );
    p448_sqr  (   &L5, &ext->u );
    p448_sqr  (   &L3, &ext->t );
    p448_mul  (   &L2,   &L3,   &L5 );
    p448_mulw (   &L3,   &L2, 39081 );
    p448_neg  (   &L5,   &L3 );
    p448_add  (   &L3,   &L5,   &L4 );
    p448_neg  (   &L5,   &L2 );
    p448_add  (   &L4,   &L5,   &L3 );
    p448_sqr  (   &L3, &ext->z );
    p448_add  (   &L2,   &L3,   &L4 );
    p448_bias (   &L2,     4 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 mask_t
 validate_extensible (
    const struct extensible_t* ext
 ) {
    struct p448_t L0, L1, L2, L3;
    mask_t L4, L5;
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5;
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 - x^2 - y^2 + z^2
     */
    p448_sqr  (   &L2, &ext->y );
    p448_neg  (   &L1,   &L2 );
    p448_addw (   &L1,     0 );
    p448_sqr  (   &L0, &ext->z );
    p448_add  (   &L2,   &L0,   &L1 );
    p448_sqr  (   &L3, &ext->u );
    p448_sqr  (   &L0, &ext->t );
    p448_mul  (   &L1,   &L0,   &L3 );
    p448_mulw (   &L3,   &L1, 39081 );
    p448_neg  (   &L0,   &L3 );
    p448_add  (   &L1,   &L0,   &L2 );
    p448_sqr  (   &L0, &ext->x );
    p448_neg  (   &L2,   &L0 );
    p448_add  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     4 );
       L5 = p448_is_zero(   &L0 );
    p448_sqr  (   &L4, &ext->y );
    p448_neg  (   &L3,   &L4 );
    p448_addw (   &L3,     0 );
    p448_sqr  (   &L2, &ext->z );
    p448_add  (   &L4,   &L2,   &L3 );
    p448_sqr  (   &L5, &ext->u );
    p448_sqr  (   &L2, &ext->t );
    p448_mul  (   &L3,   &L2,   &L5 );
    p448_mulw (   &L5,   &L3, 39081 );
    p448_neg  (   &L2,   &L5 );
    p448_add  (   &L3,   &L2,   &L4 );
    p448_sqr  (   &L2, &ext->x );
    p448_neg  (   &L4,   &L2 );
    p448_add  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     4 );
       L1 = p448_is_zero(   &L2 );
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L1, &ext->t, &ext->u );
    p448_mul  (   &L2, &ext->z,   &L1 );
    p448_addw (   &L2,     0 );
    p448_mul  (   &L0, &ext->x, &ext->y );
    p448_neg  (   &L1,   &L0 );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    return    L5 &    L4;
    p448_mul  (   &L3, &ext->t, &ext->u );
    p448_mul  (   &L4, &ext->z,   &L3 );
    p448_addw (   &L4,     0 );
    p448_mul  (   &L2, &ext->x, &ext->y );
    p448_neg  (   &L3,   &L2 );
    p448_add  (   &L2,   &L3,   &L4 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }


--- a/src/arch_x86_64/p448.c
+++ b/src/arch_x86_64/p448.c
@@ -180,9 +180,6 @@ p448_mulw (

    c[3] = accum0 & mask; accum0 >>= 56;
    c[7] = accum4 & mask; accum4 >>= 56;

    // c[4] += accum0 + accum4;
    // c[0] += accum4;
    
    accum0 += accum4 + c[4];
    c[4] = accum0 & mask;
--- a/src/crandom.c
+++ b/src/crandom.c
@@ -5,8 +5,11 @@

 /* Chacha random number generator code copied from crandom */

 #include "intrinsics.h"
 #include "crandom.h"
 #include "intrinsics.h"
 #include "config.h"
 #include "magic.h"

 #include <stdio.h>

 volatile unsigned int crandom_features = 0;
@@ -67,7 +70,7 @@ INTRINSIC u_int64_t rdrand(int abort_on_fail) {
        out = out << 32 | reg;
        return out;
    # else
        abort(); // whut
        abort(); /* whut */
    # endif
    } else {
        tries = 0;
@@ -296,9 +299,6 @@ crandom_chacha_expand(u_int64_t iv,
 #endif /* NEED_CONV */
 }

 /* "return 4", cf xkcd #221 */
 #define CRANDOM_MAGIC 0x72657475726e2034ull

 int
 crandom_init_from_file(
    struct crandom_state_t *state,
@@ -361,6 +361,52 @@ crandom_generate(

    int ret = 0;

    /* 
     * Addition 5/21/2014.
     *
     * If this is used in an application inside a VM, and the VM
     * is snapshotted and restored, then crandom_generate() would
     * produce the same output.
     * 
     * Of course, the real defense against this is "don't do that",
     * but we mitigate it by the RDRAND and/or rdtsc() in the refilling
     * code.  Since chacha is pseudorandom, when the attacker doesn't
     * know the state, it's good enough if RDRAND/rdtsc() return
     * different results.  However, if (part of) the request is filled
     * from the buffer, this won't help.
     *
     * So, add a flag EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES which
     * disables the buffer for requests larger than this size.
     *
     * Suggest EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES = 0, which
     * disables the buffer.  But instead you can set it to say 16,
     * so that pulls of at least 128 bits will be stirred.  This
     * could still be a problem for eg 64-bit nonces, but those
     * aren't entirely collision-resistant anyway.
     *
     * Heuristic: large requests are more likely to be 
     * cryptographically important, and the buffer doesn't impact
     * their performance as much.  So if the request is bigger
     * than a certain size, just drop the buffer on the floor.
     *
     * This code isn't activated if state->reseed_interval == 0,
     * because then the PRNG is deterministic anyway.
     *
     * TODO: sample 128 bits out of RDRAND() instead of 64 bits.
     * TODO: option to completely remove the buffer and fill?
     * FUTURE: come up with a less band-aid-y solution to this problem.
     */
 #ifdef EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES
    if (state->reseed_interval
 #if EXPERIMENT_CRANDOM_CUTOFF_BYTES > 0
        /* #if'd to a warning from -Wtype-limits in GCC when it's zero */
        && length >= EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES
 #endif
    ) {
        state->fill = 0;
    }
 #endif
    
    while (length) {
        if (unlikely(state->fill <= 0)) {
            uint64_t iv = 0;
--- a/src/goldilocks.c
+++ b/src/goldilocks.c
@@ -32,73 +32,27 @@
 #define GOLDILOCKS_RANDOM_RESEEDS_MANDATORY 0
 #endif

 #define GOLDI_FIELD_WORDS ((GOLDI_FIELD_BITS+WORD_BITS-1)/(WORD_BITS))
 #define GOLDI_DIVERSIFY_BYTES 8

 /* FUTURE: auto.  MAGIC */
 const struct affine_t goldilocks_base_point = {
    {{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7),
       U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa),
       U58LE(0x5f065c3c59d070), U58LE(0x3a6a26adf73324),
       U58LE(0x1b4faff4609845), U58LE(0x297ea0ea2692ff)
    }},
    {{ 19 }}
 };

 /* These are just unique identifiers */
 static const char *G_INITING = "initializing";
 static const char *G_INITED = "initialized";
 static const char *G_FAILED = "failed to initialize";

 /* FUTURE: auto.  MAGIC */
 static const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = {
    U64LE(0xdc873d6d54a7bb0d),
    U64LE(0xde933d8d723a70aa),
    U64LE(0x3bb124b65129c96f),
    0x8335dc16
 };
 const struct barrett_prime_t goldi_q448 = {
    GOLDI_FIELD_WORDS,
    62 % WORD_BITS,
    sizeof(goldi_q448_lo)/sizeof(goldi_q448_lo[0]),
    goldi_q448_lo
 };

 /* MAGIC */
 static const struct p448_t
 sqrt_d_minus_1 = {{
    U58LE(0xd2e21836749f46),
    U58LE(0x888db42b4f0179),
    U58LE(0x5a189aabdeea38),
    U58LE(0x51e65ca6f14c06),
    U58LE(0xa49f7b424d9770),
    U58LE(0xdcac4628c5f656),
    U58LE(0x49443b8748734a),
    U58LE(0x12fec0c0b25b7a)
 }};

 struct goldilocks_precomputed_public_key_t {
    struct goldilocks_public_key_t pub;
    struct fixed_base_table_t table;
 };

 #ifndef USE_BIG_TABLES
 #if __ARM_NEON__
 #define USE_BIG_TABLES 1
 #else
 #define USE_BIG_TABLES (WORD_BITS==64)
 #endif
 #endif

 /* FUTURE: auto.  MAGIC */
 struct {
 /* FUTURE: auto. */
 static struct {
    const char * volatile state;
 #if GOLDILOCKS_USE_PTHREAD
    pthread_mutex_t mutex;
 #endif
    struct tw_niels_t combs[USE_BIG_TABLES ? 80 : 64];
    struct tw_niels_t combs[COMB_N << (COMB_T-1)];
    struct fixed_base_table_t fixed_base;
    struct tw_niels_t wnafs[32];
    struct tw_niels_t wnafs[1<<WNAF_PRECMP_BITS];
    struct crandom_state_t rand;
 } goldilocks_global;

@@ -136,18 +90,23 @@ goldilocks_init () {
    
    /* Precompute the tables. */
    mask_t succ;
    
    int big = USE_BIG_TABLES;
    uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14;

    succ =  precompute_fixed_base(&goldilocks_global.fixed_base, &text, n, t, s, goldilocks_global.combs);
    succ &= precompute_fixed_base_wnaf(goldilocks_global.wnafs, &text, 5);
    succ =  precompute_fixed_base(&goldilocks_global.fixed_base, &text,
        COMB_N, COMB_T, COMB_S, goldilocks_global.combs);
    succ &= precompute_fixed_base_wnaf(goldilocks_global.wnafs, &text, WNAF_PRECMP_BITS);
    
    int criff_res = crandom_init_from_file(&goldilocks_global.rand,
        GOLDILOCKS_RANDOM_INIT_FILE,
        GOLDILOCKS_RANDOM_RESEED_INTERVAL,
        GOLDILOCKS_RANDOM_RESEEDS_MANDATORY);
        
 #ifdef SUPERCOP_WONT_LET_ME_OPEN_FILES
    if (criff_res == EMFILE) {
        crandom_init_from_buffer(&goldilocks_global.rand, "SUPERCOP won't let me open files");
        criff_res = 0;
    }
 #endif
        
    if (succ & !criff_res) {
        if (!bool_compare_and_swap(&goldilocks_global.state, G_INITING, G_INITED)) {
            abort();
@@ -182,20 +141,20 @@ goldilocks_derive_private_key (
    
    struct sha512_ctx_t ctx;
    struct tw_extensible_t exta;
    struct p448_t pk;
    struct field_t pk;
    
    sha512_init(&ctx);
    sha512_update(&ctx, (const unsigned char *)"derivepk", GOLDI_DIVERSIFY_BYTES);
    sha512_update(&ctx, proto, GOLDI_SYMKEY_BYTES);
    sha512_final(&ctx, (unsigned char *)skb);

    barrett_deserialize_and_reduce(sk, skb, SHA512_OUTPUT_BYTES, &goldi_q448);
    barrett_deserialize_and_reduce(sk, skb, SHA512_OUTPUT_BYTES, &curve_prime_order);
    barrett_serialize(privkey->opaque, sk, GOLDI_FIELD_BYTES);

    scalarmul_fixed_base(&exta, sk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base);
    untwist_and_double_and_serialize(&pk, &exta);
    
    p448_serialize(&privkey->opaque[GOLDI_FIELD_BYTES], &pk);
    field_serialize(&privkey->opaque[GOLDI_FIELD_BYTES], &pk);
    
    return GOLDI_EOK;
 }
@@ -245,11 +204,11 @@ goldilocks_private_to_public (
    struct goldilocks_public_key_t *pubkey,
    const struct goldilocks_private_key_t *privkey
 ) {
    struct p448_t pk;
    mask_t msucc = p448_deserialize(&pk,&privkey->opaque[GOLDI_FIELD_BYTES]);
    struct field_t pk;
    mask_t msucc = field_deserialize(&pk,&privkey->opaque[GOLDI_FIELD_BYTES]);
    
    if (msucc) {
        p448_serialize(pubkey->opaque, &pk);
        field_serialize(pubkey->opaque, &pk);
        return GOLDI_EOK;
    } else {
        return GOLDI_ECORRUPT;
@@ -270,18 +229,18 @@ goldilocks_shared_secret_core (
    assert(GOLDI_SHARED_SECRET_BYTES == SHA512_OUTPUT_BYTES);
    
    word_t sk[GOLDI_FIELD_WORDS];
    struct p448_t pk;
    struct field_t pk;
    
    mask_t succ = p448_deserialize(&pk,your_pubkey->opaque), msucc = -1;
    mask_t succ = field_deserialize(&pk,your_pubkey->opaque), msucc = -1;
    
 #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS
    struct p448_t sum, prod;
    msucc &= p448_deserialize(&sum,&my_privkey->opaque[GOLDI_FIELD_BYTES]);
    p448_mul(&prod,&pk,&sum);
    p448_add(&sum,&pk,&sum);
    struct field_t sum, prod;
    msucc &= field_deserialize(&sum,&my_privkey->opaque[GOLDI_FIELD_BYTES]);
    field_mul(&prod,&pk,&sum);
    field_add(&sum,&pk,&sum);
 #endif
    
    msucc &= barrett_deserialize(sk,my_privkey->opaque,&goldi_q448);
    msucc &= barrett_deserialize(sk,my_privkey->opaque,&curve_prime_order);
    
 #if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
    if (pre) {
@@ -297,7 +256,7 @@ goldilocks_shared_secret_core (
 #endif
    
    
    p448_serialize(shared,&pk);
    field_serialize(shared,&pk);
    
    /* obliterate records of our failure by adjusting with obliteration key */
    struct sha512_ctx_t ctx;
@@ -318,9 +277,9 @@ goldilocks_shared_secret_core (
 #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS
    /* stir in the sum and product of the pubkeys. */
    uint8_t a_pk[GOLDI_FIELD_BYTES];
    p448_serialize(a_pk, &sum);
    field_serialize(a_pk, &sum);
    sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES);
    p448_serialize(a_pk, &prod);
    field_serialize(a_pk, &prod);
    sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES);
 #endif
       
@@ -363,7 +322,7 @@ goldilocks_derive_challenge(
    sha512_update(&ctx, gnonce, GOLDI_FIELD_BYTES);
    sha512_update(&ctx, message, message_len);
    sha512_final(&ctx, sha_out);
    barrett_deserialize_and_reduce(challenge, sha_out, sizeof(sha_out), &goldi_q448);
    barrett_deserialize_and_reduce(challenge, sha_out, sizeof(sha_out), &curve_prime_order);
 }

 int
@@ -379,7 +338,7 @@ goldilocks_sign (
    
    /* challenge = H(pk, [nonceG], message). */
    word_t skw[GOLDI_FIELD_WORDS];
    mask_t succ = barrett_deserialize(skw,privkey->opaque,&goldi_q448);
    mask_t succ = barrett_deserialize(skw,privkey->opaque,&curve_prime_order);
    if (!succ) {
        memset(skw,0,sizeof(skw));
        return GOLDI_ECORRUPT;
@@ -395,16 +354,16 @@ goldilocks_sign (
    sha512_update(&ctx, message, message_len);
    sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES);
    sha512_final(&ctx, sha_out);
    barrett_deserialize_and_reduce(tk, sha_out, SHA512_OUTPUT_BYTES, &goldi_q448);
    barrett_deserialize_and_reduce(tk, sha_out, SHA512_OUTPUT_BYTES, &curve_prime_order);
    
    /* 4[nonce]G */
    uint8_t signature_tmp[GOLDI_FIELD_BYTES];
    struct tw_extensible_t exta;
    struct p448_t gsk;
    struct field_t gsk;
    scalarmul_fixed_base(&exta, tk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base);
    double_tw_extensible(&exta);
    untwist_and_double_and_serialize(&gsk, &exta);
    p448_serialize(signature_tmp, &gsk);
    field_serialize(signature_tmp, &gsk);
    
    word_t challenge[GOLDI_FIELD_WORDS];
    goldilocks_derive_challenge (
@@ -415,18 +374,18 @@ goldilocks_sign (
        message_len
    );
    
    // reduce challenge and sub.
    barrett_negate(challenge,GOLDI_FIELD_WORDS,&goldi_q448);
    /* reduce challenge and sub. */
    barrett_negate(challenge,GOLDI_FIELD_WORDS,&curve_prime_order);

    barrett_mac(
        tk,GOLDI_FIELD_WORDS,
        challenge,GOLDI_FIELD_WORDS,
        skw,GOLDI_FIELD_WORDS,
        &goldi_q448
        &curve_prime_order
    );
        
    word_t carry = add_nr_ext_packed(tk,tk,GOLDI_FIELD_WORDS,tk,GOLDI_FIELD_WORDS,-1);
    barrett_reduce(tk,GOLDI_FIELD_WORDS,carry,&goldi_q448);
    barrett_reduce(tk,GOLDI_FIELD_WORDS,carry,&curve_prime_order);
        
    memcpy(signature_out, signature_tmp, GOLDI_FIELD_BYTES);
    barrett_serialize(signature_out+GOLDI_FIELD_BYTES, tk, GOLDI_FIELD_BYTES);
@@ -454,23 +413,23 @@ goldilocks_verify (
        return GOLDI_EUNINIT;
    }
    
    struct p448_t pk;
    struct field_t pk;
    word_t s[GOLDI_FIELD_WORDS];
    
    mask_t succ = p448_deserialize(&pk,pubkey->opaque);
    mask_t succ = field_deserialize(&pk,pubkey->opaque);
    if (!succ) return GOLDI_EINVAL;
    
    succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &goldi_q448);
    succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &curve_prime_order);
    if (!succ) return GOLDI_EINVAL;
    
    word_t challenge[GOLDI_FIELD_WORDS];
    goldilocks_derive_challenge(challenge, pubkey->opaque, signature, message, message_len);
    
    struct p448_t eph;
    struct field_t eph;
    struct tw_extensible_t pk_text;
    
    /* deserialize [nonce]G */
    succ = p448_deserialize(&eph, signature);
    succ = field_deserialize(&eph, signature);
    if (!succ) return GOLDI_EINVAL;
    
    succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk);
@@ -479,13 +438,13 @@ goldilocks_verify (
    linear_combo_var_fixed_vt( &pk_text,
        challenge, GOLDI_SCALAR_BITS,
        s, GOLDI_SCALAR_BITS,
        goldilocks_global.wnafs, 5 );
        goldilocks_global.wnafs, WNAF_PRECMP_BITS );
    
    untwist_and_double_and_serialize( &pk, &pk_text );
    p448_sub(&eph, &eph, &pk);
    p448_bias(&eph, 2);
    field_sub(&eph, &eph, &pk);
    field_bias(&eph, 2);
    
    succ = p448_is_zero(&eph);
    succ = field_is_zero(&eph);
    
    return succ ? 0 : GOLDI_EINVAL;
 }
@@ -504,8 +463,8 @@ goldilocks_precompute_public_key (
    
    struct tw_extensible_t pk_text;
    
    struct p448_t pk;
    mask_t succ = p448_deserialize(&pk, pub->opaque);
    struct field_t pk;
    mask_t succ = field_deserialize(&pk, pub->opaque);
    if (!succ) {
        free(precom);
        return NULL;
@@ -516,11 +475,9 @@ goldilocks_precompute_public_key (
        free(precom);
        return NULL;
    }
    
    int big = USE_BIG_TABLES;
    uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14;

    succ =  precompute_fixed_base(&precom->table, &pk_text, n, t, s, NULL);
    succ =  precompute_fixed_base(&precom->table, &pk_text,
        COMB_N, COMB_T, COMB_S, NULL);
    if (!succ) {
        free(precom);
        return NULL;
@@ -553,17 +510,17 @@ goldilocks_verify_precomputed (
    }

    word_t s[GOLDI_FIELD_WORDS];
    mask_t succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &goldi_q448);
    mask_t succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &curve_prime_order);
    if (!succ) return GOLDI_EINVAL;
    
    word_t challenge[GOLDI_FIELD_WORDS];
    goldilocks_derive_challenge(challenge, pubkey->pub.opaque, signature, message, message_len);
    
    struct p448_t eph, pk;
    struct field_t eph, pk;
    struct tw_extensible_t pk_text;
    
    /* deserialize [nonce]G */
    succ = p448_deserialize(&eph, signature);
    succ = field_deserialize(&eph, signature);
    if (!succ) return GOLDI_EINVAL;
        
    succ = linear_combo_combs_vt (
@@ -574,10 +531,10 @@ goldilocks_verify_precomputed (
    if (!succ) return GOLDI_EINVAL;
    
    untwist_and_double_and_serialize( &pk, &pk_text );
    p448_sub(&eph, &eph, &pk);
    p448_bias(&eph, 2);
    field_sub(&eph, &eph, &pk);
    field_bias(&eph, 2);
    
    succ = p448_is_zero(&eph);
    succ = field_is_zero(&eph);
    
    return succ ? 0 : GOLDI_EINVAL;
 }
@@ -596,5 +553,5 @@ goldilocks_shared_secret_precomputed (
    );
 }

 #endif // GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
 #endif /* GOLDI_IMPLEMENT_PRECOMPUTED_KEYS */

--- a/src/include/api.h
+++ b/src/include/api.h
@@ -0,0 +1,190 @@
 /**
 * @file sizes.h
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @brief BATMAN / SUPERCOP glue for benchmarking.
 */

 #include <string.h>
 #include "goldilocks.h"

 #define PUBLICKEY_BYTES GOLDI_PUBLIC_KEY_BYTES
 #define SECRETKEY_BYTES GOLDI_PRIVATE_KEY_BYTES
 #define SIGNATURE_BYTES GOLDI_SIGNATURE_BYTES
 #define SHAREDSECRET_BYTES GOLDI_SHARED_SECRET_BYTES

 #define crypto_dh_PUBLICKEYBYTES PUBLICKEY_BYTES
 #define crypto_dh_SECRETKEYBYTES SECRETKEY_BYTES
 #define PRIVATEKEY_BYTES SECRETKEY_BYTES
 #define crypto_dh_BYTES SHAREDSECRET_BYTES
 #define crypto_dh_IMPLEMENTATION "AMD64"
 #define crypto_dh_VERSION "2014-07-11"

 #define crypto_sign_PUBLICKEYBYTES PUBLICKEY_BYTES
 #define crypto_sign_SECRETKEYBYTES SECRETKEY_BYTES
 #define crypto_sign_IMPLEMENTATION "AMD64"
 #define crypto_sign_VERSION "2014-07-11"
 #define crypto_sign_BYTES SIGNATURE_BYTES

 #define CRYPTO_DETERMINISTIC 1

 /*
 #ifndef LOOPS
 #define LOOPS 512
 #endif
 */

 static inline int timingattacks() { return 0; }
 static inline int copyrightclaims() { return 0; }
 static inline int patentclaims() {
    /* Until the end of July 2014, point compression
     * is patented. */
    return 20;
 }

 #define crypto_sign_keypair crypto_dh_keypair
 static inline int crypto_dh_keypair (
    unsigned char pk[SECRETKEY_BYTES],
    unsigned char sk[PUBLICKEY_BYTES]
 ) {
  int ret;
  ret = goldilocks_init();
  if (ret && ret != GOLDI_EALREADYINIT)
    return ret;
  if ((ret = goldilocks_keygen(
      (struct goldilocks_private_key_t *)sk,
      (struct goldilocks_public_key_t *)pk
  ))) abort();
  return ret;
 }

 static inline void keypair (
    unsigned char sk[SECRETKEY_BYTES],
    unsigned long long *sklen,
    unsigned char pk[PUBLICKEY_BYTES],
    unsigned long long *pklen
 ) {
    int ret = goldilocks_init();
    if (ret) abort();

    ret = goldilocks_keygen(
        (struct goldilocks_private_key_t *)sk,
        (struct goldilocks_public_key_t *)pk
    );
    if (ret) abort();

    *sklen = SECRETKEY_BYTES;
    *pklen = PUBLICKEY_BYTES;
 }

 static inline int crypto_sign (
    unsigned char *sm,
    unsigned long long *smlen,
    const unsigned char *m,
    unsigned long long mlen,
    const unsigned char sk[SECRETKEY_BYTES]
 ) {
    int ret = goldilocks_sign(
        sm, m, mlen,
        (const struct goldilocks_private_key_t *)sk
    );
    if (ret) abort();

    memcpy(sm + SIGNATURE_BYTES, m, mlen);
    
    *smlen = mlen + SIGNATURE_BYTES;
    return 0;
 }

 static inline void signmessage (
    unsigned char *sm,
    unsigned long long *smlen,
    const unsigned char *m,
    unsigned long long mlen,
    const unsigned char sk[SECRETKEY_BYTES],
    unsigned long long sklen
 ) {
    if (sklen != PRIVATEKEY_BYTES) abort();
    
    int ret = goldilocks_sign(
        sm, m, mlen,
        (const struct goldilocks_private_key_t *)sk
    );
    if (ret) abort();

    memcpy(sm + SIGNATURE_BYTES, m, mlen);
    
    *smlen = mlen + SIGNATURE_BYTES;
 }

 static inline int crypto_sign_open (
    unsigned char *m,
    unsigned long long *mlen,
    const unsigned char *sm,
    unsigned long long smlen,
    const unsigned char pk[PUBLICKEY_BYTES]
 ) {
    int ret = goldilocks_verify(
        sm, sm + SIGNATURE_BYTES, smlen - SIGNATURE_BYTES,
        (const struct goldilocks_public_key_t *)pk
    );
    if (!ret) {
        *mlen = smlen - SIGNATURE_BYTES;
        memcpy(m, sm + SIGNATURE_BYTES, *mlen);
    }
    return ret ? -1 : 0;
 }

 static inline int verification (
    const unsigned char *m,
    unsigned long long mlen,
    const unsigned char *sm,
    unsigned long long smlen,
    const unsigned char pk[PUBLICKEY_BYTES],
    unsigned long long pklen
 ) {
    if (pklen != PUBLICKEY_BYTES) abort();
    
    int ret = goldilocks_verify(
        sm, m, mlen,
        (const struct goldilocks_public_key_t *)pk
    );
    return ret ? -1 : 0;
 }


 static inline int crypto_dh (
    unsigned char s[SHAREDSECRET_BYTES],
    const unsigned char sk[SECRETKEY_BYTES],
    const unsigned char pk[PUBLICKEY_BYTES]
 ) {
  return goldilocks_shared_secret (
        s,
        (const struct goldilocks_private_key_t *)sk,
        (const struct goldilocks_public_key_t *)pk
  );
 }

 static inline int sharedsecret (
    unsigned char s[SHAREDSECRET_BYTES],
    unsigned long long *slen,
    const unsigned char sk[SECRETKEY_BYTES],
    unsigned long long sklen,
    const unsigned char pk[PUBLICKEY_BYTES],
    unsigned long long pklen
 ) {
    if (pklen != PUBLICKEY_BYTES) abort();
    if (sklen != SECRETKEY_BYTES) abort();
    
    int ret = goldilocks_shared_secret (
        s,
        (const struct goldilocks_private_key_t *)sk,
        (const struct goldilocks_public_key_t *)pk
    );
    if (ret) return -1;
    *slen = SHAREDSECRET_BYTES;
    return 0;
 }

--- a/src/include/barrett_field.h
+++ b/src/include/barrett_field.h
@@ -32,7 +32,7 @@ struct barrett_prime_t {
 /**
 * The Goldilocks prime.  I'm not sure this is the right place for it, but oh well.
 */
 extern const struct barrett_prime_t goldi_q448;
 extern const struct barrett_prime_t curve_prime_order;

 /**
 * Reduce a number (with optional high carry word) mod p.
--- a/src/include/config.h
+++ b/src/include/config.h
@@ -1,8 +1,64 @@
 /**
 * @file config.h
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @brief Goldilocks top-level configuration flags.
 */

 #ifndef __GOLDILOCKS_CONFIG_H__
 #define __GOLDILOCKS_CONFIG_H__ 1

 /** @brief crandom architecture detection.
 * With this flag set to 1, crandom will assume that any flag
 * supported by -march and friends (MIGHT_HAVE) will actually
 * be available on the target machine (MUST_HAVE), instead of
 * trying to detect it.
 *
 * Without this flag, crandom can detect, eg, that while -mavx
 * was passed, the currint machine doesn't support AVX, and can
 * fall back to SSE2 or whatever.  But the rest of the
 * Goldilocks code doesn't support this, so it'll still crash
 * with an illegal instruction error.
 *
 * Setting this flag will make the library smaller.
 */
 #define CRANDOM_MIGHT_IS_MUST           1

 /**
 * @brief Causes crandom to refuse to buffer requests bigger
 * than this size.  Setting 0 disables buffering for all
 * requests, which hurts performance.
 *
 * The advantage is that if a user process forks or is VM-
 * snapshotted, the buffer is not adjusted (FUTURE).  However,
 * with the buffer disabled, the refresh routines will stir
 * in entropy from RDTSC and/or RDRAND, making this operation
 * mostly-safe.
 */
 #define EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES 0

 /**
 * @brief Goldilocks uses libpthread mutexes to provide
 * thread-safety.  If you disable this flag, it won't link
 * libpthread, but it won't be thread-safe either.
 */
 #define GOLDILOCKS_USE_PTHREAD          1

 /**
 * @brief Experiment to change the hash inputs for ECDH,
 * in a way that obliterates the result -- overwriting it with
 * a safe pseudorandom value -- if the public key is invalid.
 * That way users who ignore the status result won't be
 * exposed to invalid key attacks. 
 */
 #define EXPERIMENT_ECDH_OBLITERATE_CT   1

 /**
 * @brief ECDH adds public keys into the hash, to prevent
 * esoteric attacks.
 */
 #define EXPERIMENT_ECDH_STIR_IN_PUBKEYS 1

 #endif // __GOLDILOCKS_CONFIG_H__
 #endif /* __GOLDILOCKS_CONFIG_H__ */
--- a/src/include/crandom.h
+++ b/src/include/crandom.h
@@ -12,6 +12,7 @@
 #ifndef __GOLDI_CRANDOM_H__
 #define __GOLDI_CRANDOM_H__ 1

 #define _XOPEN_SOURCE 600
 #include <stdint.h>  /* for uint64_t */
 #include <fcntl.h>   /* for open */
 #include <errno.h>   /* for returning errors after open */
--- a/src/include/field.h
+++ b/src/include/field.h
@@ -0,0 +1,30 @@
 /**
 * @file field.h
 * @brief Field switch code.
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 */
 #ifndef __FIELD_H__
 #define __FIELD_H__
 #include "magic.h"

 #include "p448.h"

 #define field_t              p448_t
 #define field_mul            p448_mul
 #define field_add            p448_add
 #define field_sub            p448_sub
 #define field_bias           p448_bias
 #define field_copy           p448_copy
 #define field_weak_reduce    p448_weak_reduce
 #define field_strong_reduce  p448_strong_reduce
 #define field_cond_swap      p448_cond_swap
 #define field_cond_neg       p448_cond_neg
 #define field_serialize      p448_serialize
 #define field_deserialize    p448_deserialize
 #define field_is_zero        p448_is_zero
 #define simultaneous_invert  simultaneous_invert_p448 /* FUTURE: consistency */

 #endif /* __FIELD_H__ */
--- a/src/include/intrinsics.h
+++ b/src/include/intrinsics.h
@@ -11,25 +11,27 @@
 #define __CRANDOM_INTRINSICS_H__ 1

 #include <sys/types.h>
 #include "config.h"

 #if __i386__ || __x86_64__
 #include <immintrin.h>
 #endif

 /** @brief Macro to make a function static, forcibly inlined and possibly unused. */
 #define INTRINSIC \
  static __inline__ __attribute__((__gnu_inline__, __always_inline__, unused))

 #define GEN    1
 #define SSE2   2
 #define SSSE3  4
 #define AESNI  8
 #define XOP    16
 #define AVX    32
 #define AVX2   64
 #define RDRAND 128
 #define GEN    1     /**< @brief Intrinsics field has been generated. */
 #define SSE2   2     /**< @brief Machine supports SSE2 */
 #define SSSE3  4     /**< @brief Machine supports SSSE3 (for shuffles) */
 #define AESNI  8     /**< @brief Machine supports Intel AES-NI */
 #define XOP    16    /**< @brief Machine supports AMD XOP */
 #define AVX    32    /**< @brief Machine supports Intel AVX (for masking)  */
 #define AVX2   64    /**< @brief Machine supports Intel AVX2 (for bignums) */
 #define RDRAND 128   /**< @brief Machine supports Intel RDRAND */

 /**
 * If on x86, read the timestamp counter.  Otherwise, return 0.
 * @brief If on x86, read the timestamp counter.  Otherwise, return 0.
 */
 INTRINSIC u_int64_t rdtsc() {
  u_int64_t out = 0;
@@ -53,6 +55,8 @@ INTRINSIC u_int64_t opacify(u_int64_t x) {
  return x;
 }


 /** @cond internal */
 #ifdef __AVX2__
 #  define MIGHT_HAVE_AVX2 1
 #  ifndef MUST_HAVE_AVX2
@@ -92,10 +96,6 @@ INTRINSIC u_int64_t opacify(u_int64_t x) {
 #  define pslldq _mm_slli_epi32
 #  define pshufd _mm_shuffle_epi32

 INTRINSIC ssereg sse2_rotate(int r, ssereg a) {
  return _mm_slli_epi32(a, r) ^ _mm_srli_epi32(a, 32-r);
 }

 #else
 #  define MIGHT_HAVE_SSE2 0
 #  define MUST_HAVE_SSE2  0
@@ -127,11 +127,6 @@ INTRINSIC ssereg sse2_rotate(int r, ssereg a) {
 #  ifndef MUST_HAVE_XOP
 #    define MUST_HAVE_XOP 0
 #  endif
 INTRINSIC ssereg xop_rotate(int amount, ssereg x) {
  ssereg out;
  __asm__ ("vprotd %1, %2, %0" : "=x"(out) : "x"(x), "g"(amount));
  return out;
 }
 #else
 #  define MIGHT_HAVE_XOP 0
 #  define MUST_HAVE_XOP 0
@@ -146,6 +141,9 @@ INTRINSIC ssereg xop_rotate(int amount, ssereg x) {
  | RDRAND * MIGHT_HAVE_RDRAND \
  | AVX2   * MIGHT_HAVE_AVX2)

 #if CRANDOM_MIGHT_IS_MUST
 #define MUST_MASK MIGHT_MASK
 #else
 #define MUST_MASK \
  ( SSE2   * MUST_HAVE_SSE2   \
  | SSSE3  * MUST_HAVE_SSSE3  \
@@ -154,22 +152,58 @@ INTRINSIC ssereg xop_rotate(int amount, ssereg x) {
  | AVX    * MUST_HAVE_AVX    \
  | RDRAND * MUST_HAVE_RDRAND \
  | AVX2   * MUST_HAVE_AVX2 )
 #endif
 /** @endcond */

 #ifdef __SSE2__
 /** Rotate a register by some amount using SSE2. */
 INTRINSIC ssereg sse2_rotate(int r, ssereg a) {
  return _mm_slli_epi32(a, r) ^ _mm_srli_epi32(a, 32-r);
 }
 #endif
      
 #ifdef __XOP__
 /** Rotate a register by some amount using AMD XOP. */      
 INTRINSIC ssereg xop_rotate(int amount, ssereg x) {
  ssereg out;
  __asm__ ("vprotd %1, %2, %0" : "=x"(out) : "x"(x), "g"(amount));
  return out;
 }
 #endif

 /**
 * @brief Macro which detects that targets might support this feature,
 * so that we can include code for it.
 */
 #define MIGHT_HAVE(feature) ((MIGHT_MASK & feature) == feature)

 /**
 * @brief Macro which detects that targets must support this feature,
 * so we can omit fallback code.
 */
 #define MUST_HAVE(feature) ((MUST_MASK & feature) == feature)

 /**
 * @brief Make a functiona available by C API.
 */
 #ifdef __cplusplus
 #  define extern_c extern "C"
 #else
 #  define extern_c
 #endif

 /** @cond internal
 * @brief Detect platform features and return them as a flagfield int.
 */
 extern_c
 unsigned int crandom_detect_features();
 /** @endcond */

 #ifndef likely
 #  define likely(x)       __builtin_expect((x),1)
 #  define unlikely(x)     __builtin_expect((x),0)
 #  define likely(x)       __builtin_expect((x),1) \
    /**< @brief Tell the compiler that a branch is likely, for optimization. */
 #  define unlikely(x)     __builtin_expect((x),0) \
    /**< @brief Tell the compiler that a branch is unlikely, for optimization. */
 #endif
  
 /**
@@ -187,12 +221,6 @@ compare_and_swap (
    const char *volatile* target,
    const char *old,
    const char *new
 );
    
 const char *compare_and_swap (
    const char *volatile* target,
    const char *old,
    const char *new
 ) {
    return __sync_val_compare_and_swap(target,old,new);
 }
@@ -208,13 +236,6 @@ const char *compare_and_swap (
 * @param [in] new A value to replace the target on success.
 */
 INTRINSIC int
 bool_compare_and_swap (
    const char *volatile* target,
    const char *old,
    const char *new
 );

 int
 bool_compare_and_swap (
    const char *volatile* target,
    const char *old,
@@ -231,6 +252,8 @@ bool_compare_and_swap (
 * MIGHT_HAVE(feature) is set, but MUST_HAVE(feature) is not.
 */
 extern volatile unsigned int crandom_features;

 /** @brief Determine if a given CPU feature is available. */
 INTRINSIC int HAVE(unsigned int feature);

 int HAVE(unsigned int feature) {
--- a/src/include/magic.h
+++ b/src/include/magic.h
@@ -0,0 +1,105 @@
 /**
 * @file magic.h
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @brief Goldilocks magic numbers (group orders, coefficients, algo params etc).
 */


 #ifndef __GOLDI_MAGIC_H__
 #define __GOLDI_MAGIC_H__ 1

 #include "word.h"
 #include "p448.h"
 #include "ec_point.h"

 /* TODO: standardize notation */


 /** @brief The number of bits in the Goldilocks field. */
 #define GOLDI_FIELD_BITS 448

 /** @brief The number of words in the Goldilocks field. */
 #define GOLDI_FIELD_WORDS DIV_CEIL(GOLDI_FIELD_BITS,WORD_BITS)

 /** @brief The number of bits in the Goldilocks curve's cofactor (cofactor=4). */
 #define COFACTOR_BITS 2

 /** @brief The number of bits in a Goldilocks scalar. */
 #define SCALAR_BITS (GOLDI_FIELD_BITS - COFACTOR_BITS)

 /** @brief The number of words in the Goldilocks field. */
 #define SCALAR_WORDS WORDS_FOR_BITS(SCALAR_BITS)

 /**
 * @brief sqrt(d-1), used for point formats and twisting.
 */
 extern const struct p448_t sqrt_d_minus_1;

 /**
 * @brief The base point for Goldilocks.
 */
 extern const struct affine_t goldilocks_base_point;

 /**
 * @brief The Goldilocks prime subgroup order.
 */ 
 extern const struct barrett_prime_t curve_prime_order;

 /**
 * @brief Window size for fixed-window signed binary scalarmul.
 * Table size is 2^(this - 1).
 */
 #define SCALARMUL_FIXED_WINDOW_SIZE 5

 /**
 * @brief Even/odd adjustments for fixed window with
 * ROUNDUP(SCALAR_BITS,SCALARMUL_FIXED_WINDOW_SIZE).
 */
 extern const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS];

 /**
 * @brief Table size for wNAF signed binary (variable-time) scalarmul.
 * Table size is 2^this.
 */
 #define SCALARMUL_WNAF_TABLE_BITS 3

 /**
 * @brief Table size for wNAF signed binary (variable-time) linear combo.
 * Table size is 2^this.
 */
 #define SCALARMUL_WNAF_COMBO_TABLE_BITS 4

 /**
 * @brief If true, use wider tables for the precomputed combs.
 */
 #ifndef USE_BIG_COMBS
 #if __ARM_NEON__
 #define USE_BIG_COMBS 1
 #else
 #define USE_BIG_COMBS (WORD_BITS==64)
 #endif
 #endif

 /** @brief The number of combs to use for signed comb algo */
 #define COMB_N (USE_BIG_COMBS ? 5  : 8)

 /** @brief The number of teeth of the combs for signed comb algo */
 #define COMB_T (USE_BIG_COMBS ? 5  : 4)

 /** @brief The spacing the of combs for signed comb algo */
 #define COMB_S (USE_BIG_COMBS ? 18 : 14)

 /**
 * @brief The bit width of the precomputed WNAF tables.  Size is 2^this elements.
 */
 #define WNAF_PRECMP_BITS 5

 /**
 * @brief crandom magic structure guard constant = "return 4", cf xkcd #221
 */
 #define CRANDOM_MAGIC 0x72657475726e2034ull

 #endif /* __GOLDI_MAGIC_H__ */
--- a/src/include/scalarmul.h
+++ b/src/include/scalarmul.h
@@ -10,12 +10,19 @@
 #define __P448_ALGO_H__ 1

 #include "ec_point.h"
 #include "field.h"
 #include "intrinsics.h"
 #include "magic.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 /**
 * A word array containing a scalar
 */
 typedef word_t scalar_t[SCALAR_WORDS];

 /**
 * A precomputed table for fixed-base scalar multiplication.
 *
@@ -26,7 +33,7 @@ struct fixed_base_table_t {
  struct tw_niels_t *table;
  
  /** Adjustments to the scalar in even and odd cases, respectively. */
  word_t scalar_adjustments[2*(448/WORD_BITS)];  /* MAGIC */
  word_t scalar_adjustments[2*SCALAR_WORDS];
  
  /** The number of combs in the table. */
  unsigned int n;
@@ -83,8 +90,8 @@ struct fixed_base_table_t {
 */
 mask_t
 montgomery_ladder (
    struct p448_t *out,
    const struct p448_t *in,
    struct field_t *out,
    const struct field_t *in,
    const word_t *scalar,
    unsigned int nbits,
    unsigned int n_extra_doubles
@@ -103,7 +110,7 @@ montgomery_ladder (
 void
 scalarmul (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS] /* MAGIC */
    const word_t scalar[SCALAR_WORDS]
    /* TODO? int nbits */
 );
    
@@ -124,8 +131,7 @@ scalarmul (
 void
 scalarmul_vlook (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS] /* MAGIC */
    /* TODO? int nbits */
    const word_t scalar[SCALAR_WORDS]
 );

 /**
@@ -134,7 +140,7 @@ scalarmul_vlook (
 *
 * This function computes $n$ "comb" tables, each containing
 * 2^(t-1) points in tw_niels_t format.  You must have
 * n * t * s >= 446 for complete coverage.
 * n * t * s >= SCALAR_BITS = 446 for complete coverage.
 *
 * The scalar multiplication algorithm may adjust the scalar by
 * a multiple of q.  Therefore, we strongly recommend to use base
@@ -205,11 +211,13 @@ scalarmul_fixed_base (
 *
 * @param [inout] working The input and output point.
 * @param [in] scalar The scalar.
 * @param [in] nbits The number of bits in the scalar
 */ 
 void
 scalarmul_vt (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS] /* MAGIC */
    const word_t *scalar,
    unsigned int nbits
 );


@@ -274,9 +282,9 @@ scalarmul_fixed_base_wnaf_vt (
 void
 linear_combo_var_fixed_vt (
    struct tw_extensible_t *working,
    const word_t scalar_var[448/WORD_BITS], /* MAGIC */
    const word_t scalar_var[SCALAR_WORDS],
    unsigned int nbits_var,
    const word_t scalar_pre[448/WORD_BITS], /* MAGIC */
    const word_t scalar_pre[SCALAR_WORDS],
    unsigned int nbits_pre,
    const struct tw_niels_t *precmp,
    unsigned int table_bits_pre
@@ -302,10 +310,10 @@ linear_combo_var_fixed_vt (
 mask_t
 linear_combo_combs_vt (
    struct tw_extensible_t *out,
    const word_t scalar1[448/WORD_BITS],
    const word_t scalar1[SCALAR_WORDS],
    unsigned int nbits1,
    const struct fixed_base_table_t *table1,
    const word_t scalar2[448/WORD_BITS],
    const word_t scalar2[SCALAR_WORDS],
    unsigned int nbits2,
    const struct fixed_base_table_t *table2
 );
--- a/src/include/word.h
+++ b/src/include/word.h
@@ -26,7 +26,6 @@

 #if (__SIZEOF_INT128__ == 16 && __SIZEOF_SIZE_T__ == 8 && (__SIZEOF_LONG__==8 || __POINTER_WIDTH__==64) && !GOLDI_FORCE_32_BIT)
 /* It's a 64-bit machine if:
 * // limits.h thinks so
 * __uint128_t exists
 * size_t is 64 bits
 * Either longs are 64-bits (doesn't happen on Windows)
@@ -61,6 +60,9 @@ typedef int64_t dsword_t;
 #endif

 #define WORD_BITS (sizeof(word_t) * 8)
 #define DIV_CEIL(_x,_y) (((_x) + (_y) - 1)/(_y))
 #define ROUND_UP(_x,_y) (DIV_CEIL((_x),(_y))*(_y))
 #define WORDS_FOR_BITS(_x) (DIV_CEIL((_x),WORD_BITS))

 typedef word_t mask_t;
 static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -1;
@@ -69,51 +71,80 @@ static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -1;

 #ifdef __ARM_NEON__
 typedef uint32x4_t vecmask_t;
 #else
 /* FIXME this only works on clang */
 #elif __clang__
 typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
 typedef int64_t  int64x2_t __attribute__((ext_vector_type(2)));
 typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
 typedef int64_t  int64x4_t __attribute__((ext_vector_type(4)));
 typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
 typedef int32_t  int32x4_t __attribute__((ext_vector_type(4)));
 typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
 typedef int32_t  int32x2_t __attribute__((ext_vector_type(2)));
 typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
 typedef int32_t  int32x8_t __attribute__((ext_vector_type(8)));
 typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
 #else /* GCC-cleanliness */
 typedef uint64_t uint64x2_t __attribute__((vector_size(16)));
 typedef int64_t  int64x2_t __attribute__((vector_size(16)));
 typedef uint64_t uint64x4_t __attribute__((vector_size(32)));
 typedef int64_t  int64x4_t __attribute__((vector_size(32)));
 typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
 typedef int32_t  int32x2_t __attribute__((vector_size(8)));
 typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
 typedef int32_t  int32x4_t __attribute__((vector_size(16)));
 typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
 typedef int32_t  int32x2_t __attribute__((vector_size(8)));
 typedef uint32_t uint32x8_t __attribute__((vector_size(32)));
 typedef int32_t  int32x8_t __attribute__((vector_size(32)));
 /* TODO: vector width for procs like ARM; gcc support */
 typedef word_t vecmask_t __attribute__((vector_size(32)));
 #endif

 #if __AVX2__
 typedef uint32x8_t big_register_t;
 typedef uint64x4_t uint64xn_t;
 typedef uint32x8_t uint32xn_t;
 #elif __SSE2__ || __ARM_NEON__
 typedef uint32x4_t big_register_t;
 typedef uint64x2_t uint64xn_t;
 typedef uint32x4_t uint32xn_t;
    typedef uint32x8_t big_register_t;
    typedef uint64x4_t uint64xn_t;
    typedef uint32x8_t uint32xn_t;

    static __inline__ big_register_t
    br_set_to_mask(mask_t x) {
        uint32_t y = x;
        big_register_t ret = {y,y,y,y,y,y,y,y};
        return ret;
    }
 #elif __SSE2__
    typedef uint32x4_t big_register_t;
    typedef uint64x2_t uint64xn_t;
    typedef uint32x4_t uint32xn_t;
    typedef uint32_t uint32xn_t;

    static __inline__ big_register_t
    br_set_to_mask(mask_t x) {
        uint32_t y = x;
        big_register_t ret = {y,y,y,y};
        return ret;
    }
 #elif __ARM_NEON__
    typedef uint32x4_t big_register_t;
    typedef uint64x2_t uint64xn_t;
    typedef uint32x4_t uint32xn_t;
    static __inline__ big_register_t
    br_set_to_mask(mask_t x) {
        return vdupq_n_u32(x);
    }
 #elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__
 typedef uint64_t big_register_t, uint64xn_t;
 typedef uint32_t uint32xn_t;
 #else
 typedef uint64_t uint64xn_t;
 typedef uint32_t uint32xn_t;
 typedef uint32_t big_register_t;
 #endif

    typedef uint64_t big_register_t, uint64xn_t;

 #ifdef __ARM_NEON__
 static __inline__ big_register_t
 br_set_to_mask(mask_t x) {
    return vdupq_n_u32(x);
 }
    typedef uint32_t uint32xn_t;
    static __inline__ big_register_t
    br_set_to_mask(mask_t x) {
        return (big_register_t)x;
    }
 #else
 static __inline__ big_register_t
 br_set_to_mask(mask_t x) {
    big_register_t out = {x,x,x,x,x,x,x,x};
    return out;
 }
    typedef uint64_t uint64xn_t;
    typedef uint32_t uint32xn_t;
    typedef uint32_t big_register_t;

    static __inline__ big_register_t
    br_set_to_mask(mask_t x) {
        return (big_register_t)x;
    }
 #endif

 #if __AVX2__ || __SSE2__
--- a/src/magic.c
+++ b/src/magic.c
@@ -0,0 +1,61 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "field.h"
 #include "magic.h"
 #include "barrett_field.h"

 /* FUTURE: automatically generate this file. */

 const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = {
    U64LE(0xebec9967f5d3f5c2),
    U64LE(0x0aa09b49b16c9a02),
    U64LE(0x7f6126aec172cd8e),
    U64LE(0x00000007b027e54d),
    U64LE(0x0000000000000000),
    U64LE(0x0000000000000000),
    U64LE(0x4000000000000000),
    
    U64LE(0xc873d6d54a7bb0cf),
    U64LE(0xe933d8d723a70aad),
    U64LE(0xbb124b65129c96fd),
    U64LE(0x00000008335dc163),
    U64LE(0x0000000000000000),
    U64LE(0x0000000000000000),
    U64LE(0x0000000000000000)
 };

 const struct affine_t goldilocks_base_point = {
    {{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7),
       U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa),
       U58LE(0x5f065c3c59d070), U58LE(0x3a6a26adf73324),
       U58LE(0x1b4faff4609845), U58LE(0x297ea0ea2692ff)
    }},
    {{ 19 }}
 };

 static const word_t curve_prime_order_lo[(224+WORD_BITS-1)/WORD_BITS] = {
    U64LE(0xdc873d6d54a7bb0d),
    U64LE(0xde933d8d723a70aa),
    U64LE(0x3bb124b65129c96f),
    0x8335dc16
 };
 const struct barrett_prime_t curve_prime_order = {
    GOLDI_FIELD_WORDS,
    62 % WORD_BITS,
    sizeof(curve_prime_order_lo)/sizeof(curve_prime_order_lo[0]),
    curve_prime_order_lo
 };

 const struct field_t
 sqrt_d_minus_1 = {{
    U58LE(0xd2e21836749f46),
    U58LE(0x888db42b4f0179),
    U58LE(0x5a189aabdeea38),
    U58LE(0x51e65ca6f14c06),
    U58LE(0xa49f7b424d9770),
    U58LE(0xdcac4628c5f656),
    U58LE(0x49443b8748734a),
    U58LE(0x12fec0c0b25b7a)
 }};
--- a/src/scalarmul.c
+++ b/src/scalarmul.c
@@ -13,8 +13,8 @@

 mask_t
 montgomery_ladder (
    struct p448_t *out,
    const struct p448_t *in,
    struct field_t *out,
    const struct field_t *in,
    const word_t *scalar,
    unsigned int nbits,
    unsigned int n_extra_doubles
@@ -28,15 +28,15 @@ montgomery_ladder (
        word_t w = scalar[j];
        for (i=n; i>=0; i--) {
            mask_t flip = -((w>>i)&1);
            p448_cond_swap(&mont.xa,&mont.xd,flip^pflip);
            p448_cond_swap(&mont.za,&mont.zd,flip^pflip);
            field_cond_swap(&mont.xa,&mont.xd,flip^pflip);
            field_cond_swap(&mont.za,&mont.zd,flip^pflip);
            montgomery_step(&mont);
            pflip = flip;
        }
        n = WORD_BITS-1;
    }
    p448_cond_swap(&mont.xa,&mont.xd,pflip);
    p448_cond_swap(&mont.za,&mont.zd,pflip);
    field_cond_swap(&mont.xa,&mont.xd,pflip);
    field_cond_swap(&mont.za,&mont.zd,pflip);
    
    assert(n_extra_doubles < INT_MAX);
    for (j=0; j<(int)n_extra_doubles; j++) {
@@ -51,8 +51,8 @@ cond_negate_tw_niels (
    struct tw_niels_t *n,
    mask_t doNegate
 ) {
    p448_cond_swap(&n->a, &n->b, doNegate);
    p448_cond_neg(&n->c, doNegate);
    field_cond_swap(&n->a, &n->b, doNegate);
    field_cond_neg(&n->c, doNegate);
 }

 static __inline__ void
@@ -137,34 +137,18 @@ convert_to_signed_window_form (
 void
 scalarmul (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS]
    const word_t scalar[SCALAR_WORDS]
 ) {
    const int nbits=450; /* MAGIC */
    word_t prepared_data[448*2/WORD_BITS] = {
        
        U64LE(0xebec9967f5d3f5c2),
        U64LE(0x0aa09b49b16c9a02),
        U64LE(0x7f6126aec172cd8e),
        U64LE(0x00000007b027e54d),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000),
        U64LE(0x4000000000000000),
            
        U64LE(0xc873d6d54a7bb0cf),
        U64LE(0xe933d8d723a70aad),
        U64LE(0xbb124b65129c96fd),
        U64LE(0x00000008335dc163),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000)
    }; /* MAGIC */
    
    word_t scalar2[448/WORD_BITS];
    convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS);
    
    const int WINDOW = 5, /* MAGIC */
    const int WINDOW = SCALARMUL_FIXED_WINDOW_SIZE,
        WINDOW_MASK = (1<<WINDOW)-1, WINDOW_T_MASK = WINDOW_MASK >> 1,
        NTABLE = 1<<(WINDOW-1);
        NTABLE = 1<<(WINDOW-1),
        nbits = ROUND_UP(SCALAR_BITS,WINDOW);
    
    word_t scalar2[SCALAR_WORDS];
    convert_to_signed_window_form (
        scalar2, scalar, SCALAR_WORDS,
        SCALARMUL_FIXED_WINDOW_ADJUSTMENT, SCALAR_WORDS
    );

    struct tw_extensible_t tabulator;
    copy_tw_extensible(&tabulator, working);
@@ -197,7 +181,7 @@ scalarmul (

        bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS);
        
        if (i/WORD_BITS < 448/WORD_BITS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) {
        if (i/WORD_BITS < SCALAR_WORDS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) {
            bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS));
        }
                
@@ -214,34 +198,19 @@ scalarmul (
 void
 scalarmul_vlook (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS]
 ) {
    const int nbits=450; /* HACK? */
    word_t prepared_data[448*2/WORD_BITS] = {
        
        U64LE(0xebec9967f5d3f5c2),
        U64LE(0x0aa09b49b16c9a02),
        U64LE(0x7f6126aec172cd8e),
        U64LE(0x00000007b027e54d),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000),
        U64LE(0x4000000000000000),
            
        U64LE(0xc873d6d54a7bb0cf),
        U64LE(0xe933d8d723a70aad),
        U64LE(0xbb124b65129c96fd),
        U64LE(0x00000008335dc163),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000),
        U64LE(0x0000000000000000)
    }; /* MAGIC: split off */
    
    word_t scalar2[448/WORD_BITS];
    convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS);
    
    const int WINDOW = 5, /* MAGIC */
    const word_t scalar[SCALAR_WORDS]
 ) {    
    const int WINDOW = SCALARMUL_FIXED_WINDOW_SIZE,
        WINDOW_MASK = (1<<WINDOW)-1, WINDOW_T_MASK = WINDOW_MASK >> 1,
        NTABLE = 1<<(WINDOW-1);
        NTABLE = 1<<(WINDOW-1),
        nbits = ROUND_UP(SCALAR_BITS,WINDOW);
    
    word_t scalar2[SCALAR_WORDS];
    convert_to_signed_window_form(
        scalar2, scalar, SCALAR_WORDS,
        SCALARMUL_FIXED_WINDOW_ADJUSTMENT, SCALAR_WORDS
    );


    struct tw_extensible_t tabulator;
    copy_tw_extensible(&tabulator, working);
@@ -274,7 +243,7 @@ scalarmul_vlook (

        bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS);
        
        if (i/WORD_BITS < 448/WORD_BITS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) {
        if (i/WORD_BITS < SCALAR_WORDS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) {
            bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS));
        }
                
@@ -304,8 +273,8 @@ schedule_scalar_for_combs (
    
    unsigned int scalar_words = (nbits + WORD_BITS - 1)/WORD_BITS,
        scalar2_words = scalar_words;
    if (scalar2_words < 448 / WORD_BITS)
        scalar2_words = 448 / WORD_BITS;
    if (scalar2_words < SCALAR_WORDS)
        scalar2_words = SCALAR_WORDS;
    word_t scalar3[scalar2_words];
    
    /* Copy scalar to scalar3, but clear its high bits (if there are any) */
@@ -322,7 +291,7 @@ schedule_scalar_for_combs (
    convert_to_signed_window_form (
        scalar2,
        scalar3, scalar2_words,
        table->scalar_adjustments , 448 / WORD_BITS
        table->scalar_adjustments , SCALAR_WORDS
    );
    
    return MASK_SUCCESS;
@@ -331,7 +300,7 @@ schedule_scalar_for_combs (
 mask_t
 scalarmul_fixed_base (
    struct tw_extensible_t *out,
    const word_t scalar[448/WORD_BITS],
    const word_t scalar[SCALAR_WORDS],
    unsigned int nbits,
    const struct fixed_base_table_t *table
 ) {
@@ -339,7 +308,7 @@ scalarmul_fixed_base (
    unsigned int n = table->n, t = table->t, s = table->s;
    
    unsigned int scalar2_words = (nbits + WORD_BITS - 1)/WORD_BITS;
    if (scalar2_words < 448 / WORD_BITS) scalar2_words = 448 / WORD_BITS;
    if (scalar2_words < SCALAR_WORDS) scalar2_words = SCALAR_WORDS;
    
    word_t scalar2[scalar2_words];

@@ -389,10 +358,10 @@ scalarmul_fixed_base (
 mask_t
 linear_combo_combs_vt (
    struct tw_extensible_t *out,
    const word_t scalar1[448/WORD_BITS],
    const word_t scalar1[SCALAR_WORDS],
    unsigned int nbits1,
    const struct fixed_base_table_t *table1,
    const word_t scalar2[448/WORD_BITS],
    const word_t scalar2[SCALAR_WORDS],
    unsigned int nbits2,
    const struct fixed_base_table_t *table2
 ) { 
@@ -400,10 +369,10 @@ linear_combo_combs_vt (
    unsigned int s1 = table1->s, s2 = table2->s, smax = (s1 > s2) ? s1 : s2;
    
    unsigned int scalar1b_words = (nbits1 + WORD_BITS - 1)/WORD_BITS;
    if (scalar1b_words < 448 / WORD_BITS) scalar1b_words = 448 / WORD_BITS;
    if (scalar1b_words < SCALAR_WORDS) scalar1b_words = SCALAR_WORDS;
    
    unsigned int scalar2b_words = (nbits2 + WORD_BITS - 1)/WORD_BITS;
    if (scalar2b_words < 448 / WORD_BITS) scalar2b_words = 448 / WORD_BITS;
    if (scalar2b_words < SCALAR_WORDS) scalar2b_words = SCALAR_WORDS;
    
    word_t scalar1b[scalar1b_words], scalar2b[scalar2b_words];

@@ -479,7 +448,7 @@ precompute_fixed_base (
  unsigned int s,
  struct tw_niels_t *prealloc
 ) {
    if (s < 1 || t < 1 || n < 1 || n*t*s < 446) { /* MAGIC */
    if (s < 1 || t < 1 || n < 1 || n*t*s < SCALAR_BITS) {
        memset(out, 0, sizeof(*out));
        return 0;
    }
@@ -493,8 +462,8 @@ precompute_fixed_base (
    struct tw_pniels_t pn_tmp;
  
    struct tw_pniels_t *doubles = (struct tw_pniels_t *) malloc_vector(sizeof(*doubles) * (t-1));
    struct p448_t *zs  = (struct p448_t *) malloc_vector(sizeof(*zs) * (n<<(t-1)));
    struct p448_t *zis = (struct p448_t *) malloc_vector(sizeof(*zis) * (n<<(t-1)));
    struct field_t *zs  = (struct field_t *) malloc_vector(sizeof(*zs) * (n<<(t-1)));
    struct field_t *zis = (struct field_t *) malloc_vector(sizeof(*zis) * (n<<(t-1)));
    
    struct tw_niels_t *table = prealloc;
    if (prealloc) {
@@ -519,30 +488,19 @@ precompute_fixed_base (
    
    /* Compute the scalar adjustments, equal to 2^nbits-1 mod q */
    unsigned int adjustment_size = (n*t*s)/WORD_BITS + 1;
    assert(adjustment_size >= 448/WORD_BITS);
    assert(adjustment_size >= SCALAR_WORDS);
    word_t adjustment[adjustment_size];
    for (i=0; i<adjustment_size; i++) {
        adjustment[i] = -1;
    }
    
    adjustment[(n*t*s) / WORD_BITS] += ((word_t)1) << ((n*t*s) % WORD_BITS);

    /* MAGIC: factor out somehow */
    const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = {
        U64LE(0xdc873d6d54a7bb0d),
        U64LE(0xde933d8d723a70aa),
        U64LE(0x3bb124b65129c96f),
        0x8335dc16
    };
    const struct barrett_prime_t goldi_q448 = {
        448/WORD_BITS, 62 % WORD_BITS, sizeof(goldi_q448_lo)/sizeof(word_t), goldi_q448_lo
    };
    
    /* The low adjustment is 2^nbits - 1 mod q */
    barrett_reduce(adjustment, adjustment_size, 0, &goldi_q448);
    word_t *low_adjustment = &out->scalar_adjustments[(448/WORD_BITS)*(adjustment[0] & 1)],
        *high_adjustment = &out->scalar_adjustments[(448/WORD_BITS)*((~adjustment[0]) & 1)];
    for (i=0; i<448/WORD_BITS; i++) {
    barrett_reduce(adjustment, adjustment_size, 0, &curve_prime_order);
    word_t *low_adjustment = &out->scalar_adjustments[(SCALAR_WORDS)*(adjustment[0] & 1)],
        *high_adjustment = &out->scalar_adjustments[(SCALAR_WORDS)*((~adjustment[0]) & 1)];
    for (i=0; i<SCALAR_WORDS; i++) {
        low_adjustment[i] = adjustment[i];
    }
    
@@ -550,12 +508,12 @@ precompute_fixed_base (
    (void)
    sub_nr_ext_packed(
        high_adjustment,
        adjustment, 448/WORD_BITS,
        goldi_q448.p_lo, goldi_q448.nwords_lo,
        adjustment, SCALAR_WORDS,
        curve_prime_order.p_lo, curve_prime_order.nwords_lo,
        -1
    );
    if (goldi_q448.p_shift) {
        high_adjustment[goldi_q448.nwords_p - 1] += ((word_t)1)<<goldi_q448.p_shift;
    if (curve_prime_order.p_shift) {
        high_adjustment[curve_prime_order.nwords_p - 1] += ((word_t)1)<<curve_prime_order.p_shift;
    }
    
    /* OK, now compute the tables */
@@ -591,7 +549,7 @@ precompute_fixed_base (

            convert_tw_extensible_to_tw_pniels(&pn_tmp, &start);
            copy_tw_niels(&table[idx], &pn_tmp.n);
            p448_copy(&zs[idx], &pn_tmp.z);
            field_copy(&zs[idx], &pn_tmp.z);
 			
            if (j >= (1u<<(t-1)) - 1) break;
            int delta = (j+1) ^ ((j+1)>>1) ^ gray;
@@ -611,24 +569,24 @@ precompute_fixed_base (
        }
    }
 	
    simultaneous_invert_p448(zis, zs, n<<(t-1));
    simultaneous_invert(zis, zs, n<<(t-1));

    p448_t product;
    field_t product;
    for (i=0; i<n<<(t-1); i++) {
        p448_mul(&product, &table[i].a, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&table[i].a, &product);
        field_mul(&product, &table[i].a, &zis[i]);
        field_strong_reduce(&product);
        field_copy(&table[i].a, &product);
        
        p448_mul(&product, &table[i].b, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&table[i].b, &product);
        field_mul(&product, &table[i].b, &zis[i]);
        field_strong_reduce(&product);
        field_copy(&table[i].b, &product);
        
        p448_mul(&product, &table[i].c, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&table[i].c, &product);
        field_mul(&product, &table[i].c, &zis[i]);
        field_strong_reduce(&product);
        field_copy(&table[i].c, &product);
    }
 	
 	mask_t ret = ~p448_is_zero(&zis[0]);
 	mask_t ret = ~field_is_zero(&zis[0]);

    free(doubles);
    free(zs);
@@ -664,8 +622,8 @@ precompute_fixed_base_wnaf (
    unsigned int tbits
 ) {
    int i;
    struct p448_t *zs  = (struct p448_t *) malloc_vector(sizeof(*zs)<<tbits);
    struct p448_t *zis = (struct p448_t *) malloc_vector(sizeof(*zis)<<tbits);
    struct field_t *zs  = (struct field_t *) malloc_vector(sizeof(*zs)<<tbits);
    struct field_t *zis = (struct field_t *) malloc_vector(sizeof(*zis)<<tbits);

    if (!zs || !zis) {
        free(zs);
@@ -679,7 +637,7 @@ precompute_fixed_base_wnaf (
    struct tw_pniels_t twop, tmp;
    
    convert_tw_extensible_to_tw_pniels(&tmp, &base);
    p448_copy(&zs[0], &tmp.z);
    field_copy(&zs[0], &tmp.z);
    copy_tw_niels(&out[0], &tmp.n);

    if (tbits > 0) {
@@ -688,32 +646,32 @@ precompute_fixed_base_wnaf (
        add_tw_pniels_to_tw_extensible(&base, &tmp);
        
        convert_tw_extensible_to_tw_pniels(&tmp, &base);
        p448_copy(&zs[1], &tmp.z);
        field_copy(&zs[1], &tmp.z);
        copy_tw_niels(&out[1], &tmp.n);

        for (i=2; i < 1<<tbits; i++) {
            add_tw_pniels_to_tw_extensible(&base, &twop);
            convert_tw_extensible_to_tw_pniels(&tmp, &base);
            p448_copy(&zs[i], &tmp.z);
            field_copy(&zs[i], &tmp.z);
            copy_tw_niels(&out[i], &tmp.n);
        }
    }
    
    simultaneous_invert_p448(zis, zs, 1<<tbits);
    simultaneous_invert(zis, zs, 1<<tbits);

    p448_t product;
    field_t product;
    for (i=0; i<1<<tbits; i++) {
        p448_mul(&product, &out[i].a, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].a, &product);
        field_mul(&product, &out[i].a, &zis[i]);
        field_strong_reduce(&product);
        field_copy(&out[i].a, &product);
        
        p448_mul(&product, &out[i].b, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].b, &product);
        field_mul(&product, &out[i].b, &zis[i]);
        field_strong_reduce(&product);
        field_copy(&out[i].b, &product);
        
        p448_mul(&product, &out[i].c, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].c, &product);
        field_mul(&product, &out[i].c, &zis[i]);
        field_strong_reduce(&product);
        field_copy(&out[i].c, &product);
    }

    free(zs);
@@ -757,7 +715,7 @@ recode_wnaf(
         * There's also the stopper with power -1, for a total of +3.
         */
        if (current >= (2<<tableBits) || current <= -1 - (2<<tableBits)) {
            int delta = (current + 1) >> 1; // |delta| < 2^tablebits
            int delta = (current + 1) >> 1; /* |delta| < 2^tablebits */
            current = -(current & 1);

            for (j=i; (delta & 1) == 0; j++) {
@@ -813,10 +771,10 @@ prepare_wnaf_table(
 void
 scalarmul_vt (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS]
    const word_t scalar[SCALAR_WORDS],
    unsigned int nbits
 ) {
    /* HACK: not 448? */
    const int nbits=448, table_bits = 3;
    const int table_bits = SCALARMUL_WNAF_TABLE_BITS;
    struct smvt_control control[nbits/(table_bits+1)+3];
    
    int control_bits = recode_wnaf(control, scalar, nbits, table_bits);
@@ -854,7 +812,7 @@ scalarmul_vt (
 void
 scalarmul_fixed_base_wnaf_vt (
    struct tw_extensible_t *working,
    const word_t scalar[448/WORD_BITS],
    const word_t scalar[SCALAR_WORDS],
    unsigned int nbits,
    const struct tw_niels_t *precmp,
    unsigned int table_bits
@@ -895,14 +853,14 @@ scalarmul_fixed_base_wnaf_vt (
 void
 linear_combo_var_fixed_vt(
    struct tw_extensible_t *working,
    const word_t scalar_var[448/WORD_BITS],
    const word_t scalar_var[SCALAR_WORDS],
    unsigned int nbits_var,
    const word_t scalar_pre[448/WORD_BITS],
    const word_t scalar_pre[SCALAR_WORDS],
    unsigned int nbits_pre,
    const struct tw_niels_t *precmp,
    unsigned int table_bits_pre
 ) {
    const int table_bits_var = 4;
    const int table_bits_var = SCALARMUL_WNAF_COMBO_TABLE_BITS;
    struct smvt_control control_var[nbits_var/(table_bits_var+1)+3];
    struct smvt_control control_pre[nbits_pre/(table_bits_pre+1)+3];
    
--- a/src/sha512.c
+++ b/src/sha512.c
@@ -2,8 +2,8 @@
 * Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #include "sha512.h"
 #include "word.h"
 #include "sha512.h"

 #include <string.h>
 #include <assert.h>
@@ -163,9 +163,11 @@ sha512_final (
        sha512_process_block(ctx);
        fill = 0;
    }
    memset(ctx->block + fill, 0, 120-fill);
    uint64_t size = htobe64((ctx->nbytes * 8));
    memcpy(&ctx->block[120], &size, sizeof(size));
    memset(ctx->block + fill, 0, 112-fill);
    
    uint64_t highCount = 0, lowCount = htobe64((ctx->nbytes * 8));
    memcpy(&ctx->block[112],&highCount,8);
    memcpy(&ctx->block[120],&lowCount,8);
    sha512_process_block(ctx);
    for (i=0; i<8; i++) {
        ctx->chain[i] = htobe64(ctx->chain[i]);
--- a/test/bench.c
+++ b/test/bench.c
@@ -100,6 +100,9 @@ int main(int argc, char **argv) {
    for (i=0; i<32; i++) initial_seed[i] = i;
    struct crandom_state_t crand;
    crandom_init_from_buffer(&crand, initial_seed);
    /* For testing the performance drop from the crandom debuffering change.
        ignore_result(crandom_init_from_file(&crand, "/dev/urandom", 10000, 1));
    */
    
    word_t sk[448/WORD_BITS],tk[448/WORD_BITS];
    q448_randomize(&crand, sk);
@@ -248,14 +251,14 @@ int main(int argc, char **argv) {
    
    when = now();
    for (i=0; i<nbase*100; i++) {
        barrett_reduce(lsk,sizeof(lsk)/sizeof(word_t),0,&goldi_q448);
        barrett_reduce(lsk,sizeof(lsk)/sizeof(word_t),0,&curve_prime_order);
    }
    when = now() - when;
    printf("barrett red: %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<nbase*10; i++) {
        barrett_mac(lsk,448/WORD_BITS,lsk,448/WORD_BITS,lsk,448/WORD_BITS,&goldi_q448);
        barrett_mac(lsk,448/WORD_BITS,lsk,448/WORD_BITS,lsk,448/WORD_BITS,&curve_prime_order);
    }
    when = now() - when;
    printf("barrett mac: %5.1fns\n", when * 1e9 / i);
@@ -334,7 +337,7 @@ int main(int argc, char **argv) {
    when = now();
    for (i=0; i<nbase/10; i++) {
        q448_randomize(&crand, sk);
        scalarmul_vt(&ext,sk);
        scalarmul_vt(&ext,sk,446);
    }
    when = now() - when;
    printf("edwards vtm: %5.1fµs\n", when * 1e6 / i);
--- a/test/test_scalarmul.c
+++ b/test/test_scalarmul.c
@@ -20,18 +20,6 @@ single_scalarmul_compatibility_test (
    int ret = 0, i;
    mask_t succ, succm;
    
    const struct p448_t
    sqrt_d_minus_1 = {{
        U58LE(0xd2e21836749f46),
        U58LE(0x888db42b4f0179),
        U58LE(0x5a189aabdeea38),
        U58LE(0x51e65ca6f14c06),
        U58LE(0xa49f7b424d9770),
        U58LE(0xdcac4628c5f656),
        U58LE(0x49443b8748734a),
        U58LE(0x12fec0c0b25b7a)
    }};
    
    succ = deserialize_and_twist_approx(&text, &sqrt_d_minus_1, base);
    
    succm = montgomery_ladder(&mont,base,scalar,nbits,1);
@@ -108,7 +96,7 @@ single_scalarmul_compatibility_test (
        untwist_and_double_and_serialize(&vl, &work);
        
        copy_tw_extensible(&work, &text);
        scalarmul_vt(&work, scalar);
        scalarmul_vt(&work, scalar, nbits);
        untwist_and_double_and_serialize(&vt, &work);
        
    
@@ -167,20 +155,7 @@ single_linear_combo_test (
    const struct p448_t *base2,
    const word_t *scalar2,
    int nbits2
 ) {
    /* MAGIC */
    const struct p448_t
    sqrt_d_minus_1 = {{
        U58LE(0xd2e21836749f46),
        U58LE(0x888db42b4f0179),
        U58LE(0x5a189aabdeea38),
        U58LE(0x51e65ca6f14c06),
        U58LE(0xa49f7b424d9770),
        U58LE(0xdcac4628c5f656),
        U58LE(0x49443b8748734a),
        U58LE(0x12fec0c0b25b7a)
    }};
    
 ) { 
    struct tw_extensible_t text1, text2, working;
    struct tw_pniels_t pn;
    struct p448_t result_comb, result_combo, result_wnaf;