diff --git a/HISTORY.txt b/HISTORY.txt
index 702513e..1f301e9 100644
--- a/HISTORY.txt
+++ b/HISTORY.txt
@@ -1,4 +1,52 @@
-May 3, 2104:
+July 11, 2014:
+    This is mostly a cleanup release.
+
+    Added CRANDOM_MIGHT_IS_MUST config flag (default: 1).  When set, this
+    causes crandom to assume that all features in the target arch will
+    be available, instead of detecting them.  This makes sense because
+    the rest of the Goldilocks code is not (yet?) able to detect features.
+    Also, I'd like to submit this to SUPERCOP eventually, and SUPERCOP won't
+    pass -DMUST_HAVE_XXX on the command line the way the Makefile here did.
+    
+    Added flag EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES to disable the crandom
+    output buffer.  This buffer improves performance (very marginally at
+    Goldilocks sizes), but can cause problems with forking and VM
+    snapshotting.  By default, the buffer is now disabled.
+    
+    I've slightly tweaked the Elligator implementation (which is still
+    unused) to make it easier to invert.  This makes anything using Elligator
+    (i.e. nothing) incompatible with previous releases.
+    
+    I've been factoring "magic" constants such as curve orders, window sizes,
+    etc. into a few headers, to reduce the effort to port the code to other
+    primes, curves, etc.  For example, I could test the Microsoft curves, and
+    something like:
+        x^2 + y^2 = 1 +- 5382[45] x^2 y^2 mod 2^480-2^240-1
+    ("Goldeneye"? "Ridinghood"?) might be a reasonable thing to try for
+    64-bit CPUs.
+    
+    In a similar vein, most of the internal code has been changed to say
+    "field" instead of p448, so that a future version of magic.h can decide
+    which field header to include.
+    
+    You can now `make bat` to create an eBAT in build/ed448-goldilocks.  This
+    is only minimally tested, though, because SUPERCOP doesn't work on my
+    machine and I'm too lazy to reverse engineer it.  It sets a new macro,
+    SUPERCOP_WONT_LET_ME_OPEN_FILES, which causes goldilocks_init() to fall
+    back to something horribly insecure if crandom_init_from_file raises
+    EMFILE.
+    
+    Slightly improved documentation.
+    
+    Removed some old commented-out code; restored the /* C-style */ comment
+    discipline.
+    
+    The AMD-64 version should now be GCC clean, at least for reasonably
+    recent GCC (tested on OS X 10.9.3, Haswell, gcc-4.9).
+    
+    History no longer says "2104".
+
+May 3, 2014:
     Minor changes to internal routines mean that this version is not
     compatible with the previous one.
 
diff --git a/Makefile b/Makefile
index 3efda9d..7050e90 100644
--- a/Makefile
+++ b/Makefile
@@ -39,7 +39,7 @@ endif
 ARCHFLAGS += -mcpu=cortex-a9 # FIXME
 GENFLAGS = -DN_TESTS_BASE=1000 # sooooo sloooooow
 else
-ARCHFLAGS += -mssse3 -maes -mavx -mavx2 -DMUST_HAVE_AVX2 -mbmi2 #TODO
+ARCHFLAGS += -maes -mavx2 -mbmi2 #TODO
 endif
 
 ifeq ($(CC),clang)
@@ -48,26 +48,28 @@ endif
 
 ifeq (,$(findstring 64,$(ARCH))$(findstring gcc,$(CC)))
 # ARCHFLAGS += -m32
-ARCHFLAGS += -DGOLDI_FORCE_32_BIT=1
+XCFLAGS += -DGOLDI_FORCE_32_BIT=1
 endif
 
 CFLAGS  = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XCFLAGS)
 LDFLAGS = $(ARCHFLAGS) $(XLDFLAGS)
 ASFLAGS = $(ARCHFLAGS)
 
-.PHONY: clean all test bench todo doc lib
+.PHONY: clean all test bench todo doc lib bat
 .PRECIOUS: build/%.s
 
 HEADERS= Makefile $(shell find . -name "*.h") build/timestamp
 
 LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \
-  build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o
+  build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o build/magic.o
 
 TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \
-	build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o
+	build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o build/magic.o
 
 BENCHCOMPONENTS=build/bench.o
 
+BATNAME=build/ed448-goldilocks
+
 all: lib build/test build/bench
 
 scan: clean
@@ -118,6 +120,19 @@ doc/timestamp:
 doc: Doxyfile doc/timestamp src/*.c src/include/*.h src/$(ARCH)/*.c src/$(ARCH)/*.h
 	doxygen
 
+bat: $(BATNAME)
+
+$(BATNAME): include/* src/* src/*/*
+	rm -fr $@
+	for arch in src/arch*; do \
+		mkdir -p $@/`basename $$arch`; \
+		cp include/* src/*.c src/include/* $$arch/* $@/`basename $$arch`; \
+		perl -p -i -e 's/.*endif.*GOLDILOCKS_CONFIG_H/#define SUPERCOP_WONT_LET_ME_OPEN_FILES 1\n\n$$&/' $@/`basename $$arch`/config.h; \
+		done
+	echo 'Mike Hamburg' > $@/designers
+	echo 'Ed448-Goldilocks sign and dh' > $@/description
+	
+
 todo::
 	@(find * -name '*.h'; find * -name '*.c') | xargs egrep --color=auto -w \
 		'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE|MAGIC'
@@ -139,4 +154,4 @@ test: build/test
 	./$<
 
 clean:
-	rm -fr build doc
+	rm -fr build doc $(BATNAME)
diff --git a/TODO.txt b/TODO.txt
index e1d05f2..df1a782 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -25,8 +25,8 @@ Important work items for Ed448-Goldilocks:
 
 * [DONE] Bugfix: make sure that init() and randomization are thread-safe.
 
-* Security: check on deserialization that points are < p.
-    * Check also that they're nonzero or otherwise non-pathological?
+* [DONE] Security: check on deserialization that points are < p.
+    * [NEEDS TESTING] Check also that they're nonzero or otherwise non-pathological?
 
 * Testing:
     * Corner-case testing
@@ -39,16 +39,16 @@ Important work items for Ed448-Goldilocks:
     * Most functions now have warn on ignored return.
 
 * Safety:
-    * Check for init() if it's still required once we've done the above
+    * [DONE] Check for init() if it's still required once we've done the above
     * Decide what to do about RNG failures
         * abort
         * return error and zeroize
         * return error but continue if RNG is kind of mostly OK
     
 * Flexibility: decide which API options are good.
-    * Eg, should functions take nbits and table sizes?
+    * [DONE?] Eg, should functions take nbits and table sizes?
     
-    * Remove hardcoded adjustments from comb control.
+    * [DONE] Remove hardcoded adjustments from comb control.
         * These adjustments make the output wrong when it's not 450 bits.
         
     * Other slow Barrett fields?  Montgomery fields?
@@ -71,6 +71,7 @@ Important work items for Ed448-Goldilocks:
 
 * Portability: test and make clean with other compilers
     * Using a fair amount of __attribute__ code.
+    * [DONE] Should work for GCC now.
 
 * Portability: try to make the vector code as portable as possible
     * Currently using clang ext_vector_length.
@@ -79,15 +80,15 @@ Important work items for Ed448-Goldilocks:
 
 * Portability: make the inner layers of the code 32-bit clean.
     * Write new versions of the field code.
-        * 28-bit limbs give less headroom for carries.
-        * Now have a vectorless ARM version; need NEON.
+        * [DONE] 28-bit limbs give less headroom for carries.
+        * [DONE] Now have a vectorless ARM version; need NEON.
         * Improve speed of 32-bit field code.
     
-    * Run through the SAGE tool to generate new bias & bound.
+    * [DONE] Run through the SAGE tool to generate new bias & bound.
 
 * [DONE] Portability: make the outer layers of the code 32-bit clean.
 
-* Performance/flexibility: decide which parameters should be hard-coded.
+* [DONE] Performance/flexibility: decide which parameters should be hard-coded.
     * Perhaps useful for comb precomputation.
 
 * Performance: Improve SHA512.
@@ -120,4 +121,4 @@ Important work items for Ed448-Goldilocks:
 
 * Clear other TODO/FIXME/HACK/PERF items in the code
 
-* Submit to SUPERCOP
+* [DONE?] Submit to SUPERCOP
diff --git a/src/arch_32/ec_point.c b/src/arch_32/ec_point.c
index 823e43d..47c325c 100644
--- a/src/arch_32/ec_point.c
+++ b/src/arch_32/ec_point.c
@@ -380,55 +380,55 @@ serialize_montgomery (
     const struct montgomery_t* a,
     const struct p448_t*       sbz
 ) {
-    mask_t L0, L1, L2;
-    struct p448_t L3, L4, L5, L6;
-    p448_mul  (   &L6, &a->z0, &a->zd );
-    p448_sub  (   &L4,   &L6, &a->xd );
-    p448_bias (   &L4,     2 );
-    p448_weak_reduce(   &L4 );
-    p448_mul  (   &L6, &a->za,   &L4 );
-    p448_mul  (   &L5, &a->z0, &a->xd );
-    p448_sub  (   &L4,   &L5, &a->zd );
-    p448_bias (   &L4,     2 );
-    p448_weak_reduce(   &L4 );
-    p448_mul  (   &L3, &a->xa,   &L4 );
-    p448_add  (   &L5,   &L3,   &L6 );
-    p448_sub  (   &L4,   &L6,   &L3 );
-    p448_bias (   &L4,     2 );
-    p448_weak_reduce(   &L4 );
-    p448_mul  (   &L6,   &L4,   &L5 );
-    p448_copy (   &L5, &a->z0 );
-    p448_addw (   &L5,     1 );
-    p448_sqr  (   &L4,   &L5 );
-    p448_mulw (   &L5,   &L4, 39082 );
-    p448_neg  (   &L4,   &L5 );
-    p448_add  (   &L5, &a->z0, &a->z0 );
-    p448_bias (   &L5,     1 );
-    p448_add  (   &L3,   &L5,   &L5 );
-    p448_add  (   &L5,   &L3,   &L4 );
-    p448_weak_reduce(   &L5 );
-    p448_mul  (   &L3, &a->xd,   &L5 );
-       L1 = p448_is_zero( &a->zd );
-       L2 = -   L1;
-    p448_mask (   &L4,   &L3,    L1 );
-    p448_add  (   &L5,   &L4, &a->zd );
-       L0 = ~   L1;
-    p448_mul  (   &L4,   sbz,   &L6 );
-    p448_addw (   &L4,    L2 );
-    p448_mul  (   &L6,   &L5,   &L4 );
-    p448_mul  (   &L4,   &L6,   &L5 );
-    p448_mul  (   &L5,   &L6, &a->xd );
-    p448_mul  (   &L6,   &L4,   &L5 );
-    p448_isr  (   &L3,   &L6 );
-    p448_mul  (   &L5,   &L4,   &L3 );
-    p448_sqr  (   &L4,   &L3 );
-    p448_mul  (   &L3,   &L6,   &L4 );
-    p448_mask (     b,   &L5,    L0 );
-    p448_subw (   &L3,     1 );
-    p448_bias (   &L3,     1 );
-       L1 = p448_is_zero(   &L3 );
-       L0 = p448_is_zero(   sbz );
-    return    L1 |    L0;
+    mask_t L4, L5, L6;
+    struct p448_t L0, L1, L2, L3;
+    p448_mul  (   &L3, &a->z0, &a->zd );
+    p448_sub  (   &L1,   &L3, &a->xd );
+    p448_bias (   &L1,     2 );
+    p448_weak_reduce(   &L1 );
+    p448_mul  (   &L3, &a->za,   &L1 );
+    p448_mul  (   &L2, &a->z0, &a->xd );
+    p448_sub  (   &L1,   &L2, &a->zd );
+    p448_bias (   &L1,     2 );
+    p448_weak_reduce(   &L1 );
+    p448_mul  (   &L0, &a->xa,   &L1 );
+    p448_add  (   &L2,   &L0,   &L3 );
+    p448_sub  (   &L1,   &L3,   &L0 );
+    p448_bias (   &L1,     2 );
+    p448_weak_reduce(   &L1 );
+    p448_mul  (   &L3,   &L1,   &L2 );
+    p448_copy (   &L2, &a->z0 );
+    p448_addw (   &L2,     1 );
+    p448_sqr  (   &L1,   &L2 );
+    p448_mulw (   &L2,   &L1, 39082 );
+    p448_neg  (   &L1,   &L2 );
+    p448_add  (   &L2, &a->z0, &a->z0 );
+    p448_bias (   &L2,     1 );
+    p448_add  (   &L0,   &L2,   &L2 );
+    p448_add  (   &L2,   &L0,   &L1 );
+    p448_weak_reduce(   &L2 );
+    p448_mul  (   &L0, &a->xd,   &L2 );
+       L5 = p448_is_zero( &a->zd );
+       L6 = -   L5;
+    p448_mask (   &L1,   &L0,    L5 );
+    p448_add  (   &L2,   &L1, &a->zd );
+       L4 = ~   L5;
+    p448_mul  (   &L1,   sbz,   &L3 );
+    p448_addw (   &L1,    L6 );
+    p448_mul  (   &L3,   &L2,   &L1 );
+    p448_mul  (   &L1,   &L3,   &L2 );
+    p448_mul  (   &L2,   &L3, &a->xd );
+    p448_mul  (   &L3,   &L1,   &L2 );
+    p448_isr  (   &L0,   &L3 );
+    p448_mul  (   &L2,   &L1,   &L0 );
+    p448_sqr  (   &L1,   &L0 );
+    p448_mul  (   &L0,   &L3,   &L1 );
+    p448_mask (     b,   &L2,    L4 );
+    p448_subw (   &L0,     1 );
+    p448_bias (   &L0,     1 );
+       L5 = p448_is_zero(   &L0 );
+       L4 = p448_is_zero(   sbz );
+    return    L5 |    L4;
 }
 
 void
@@ -524,8 +524,8 @@ test_only_twist (
     struct tw_extensible_t*    b,
     const struct extensible_t* a
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3;
+    mask_t L2, L3;
+    struct p448_t L0, L1;
     p448_sqr  ( &b->u, &a->z );
     p448_sqr  ( &b->y, &a->x );
     p448_sub  ( &b->z, &b->u, &b->y );
@@ -541,35 +541,35 @@ test_only_twist (
     p448_bias ( &b->z,     2 );
     p448_weak_reduce( &b->z );
     p448_mul  ( &b->t, &b->z, &b->x );
-    p448_mul  (   &L3, &b->t, &b->u );
-    p448_mul  ( &b->x, &b->t,   &L3 );
-    p448_isr  (   &L2, &b->x );
-    p448_mul  ( &b->u, &b->t,   &L2 );
-    p448_sqr  (   &L3,   &L2 );
-    p448_mul  ( &b->t, &b->x,   &L3 );
-    p448_add  ( &b->x, &a->y, &a->x );
-    p448_weak_reduce( &b->x );
-    p448_sub  (   &L2, &a->x, &a->y );
-    p448_bias (   &L2,     2 );
-    p448_weak_reduce(   &L2 );
-    p448_mul  (   &L3, &b->t,   &L2 );
-    p448_add  (   &L2,   &L3, &b->x );
-    p448_sub  ( &b->t, &b->x,   &L3 );
+    p448_mul  (   &L1, &b->t, &b->u );
+    p448_mul  ( &b->x, &b->t,   &L1 );
+    p448_isr  (   &L0, &b->x );
+    p448_mul  ( &b->u, &b->t,   &L0 );
+    p448_sqr  (   &L1,   &L0 );
+    p448_mul  ( &b->t, &b->x,   &L1 );
+    p448_add  (   &L1, &a->y, &a->x );
+    p448_weak_reduce(   &L1 );
+    p448_sub  (   &L0, &a->x, &a->y );
+    p448_bias (   &L0,     2 );
+    p448_weak_reduce(   &L0 );
+    p448_mul  ( &b->x, &b->t,   &L0 );
+    p448_add  (   &L0, &b->x,   &L1 );
+    p448_sub  ( &b->t,   &L1, &b->x );
     p448_bias ( &b->t,     2 );
     p448_weak_reduce( &b->t );
-    p448_mul  ( &b->x,   &L2, &b->u );
-       L0 = p448_is_zero( &b->y );
-       L1 = -   L0;
-    p448_addw ( &b->x,    L1 );
+    p448_mul  ( &b->x,   &L0, &b->u );
+       L2 = p448_is_zero( &b->y );
+       L3 = -   L2;
+    p448_addw ( &b->x,    L3 );
     p448_weak_reduce( &b->x );
     p448_mul  ( &b->y, &b->t, &b->u );
-       L0 = p448_is_zero( &b->z );
-       L1 = -   L0;
-    p448_addw ( &b->y,    L1 );
+       L2 = p448_is_zero( &b->z );
+       L3 = -   L2;
+    p448_addw ( &b->y,    L3 );
     p448_weak_reduce( &b->y );
-       L1 = p448_is_zero( &a->y );
-       L0 =    L1 +     1;
-    p448_set_ui( &b->z,    L0 );
+       L3 = p448_is_zero( &a->y );
+       L2 =    L3 +     1;
+    p448_set_ui( &b->z,    L2 );
     p448_copy ( &b->t, &b->x );
     p448_copy ( &b->u, &b->y );
 }
@@ -578,16 +578,16 @@ mask_t
 is_square (
     const struct p448_t* x
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3;
-    p448_isr  (   &L2,     x );
-    p448_sqr  (   &L3,   &L2 );
-    p448_mul  (   &L2,     x,   &L3 );
-    p448_subw (   &L2,     1 );
-    p448_bias (   &L2,     1 );
-       L1 = p448_is_zero(   &L2 );
-       L0 = p448_is_zero(     x );
-    return    L1 |    L0;
+    mask_t L2, L3;
+    struct p448_t L0, L1;
+    p448_isr  (   &L0,     x );
+    p448_sqr  (   &L1,   &L0 );
+    p448_mul  (   &L0,     x,   &L1 );
+    p448_subw (   &L0,     1 );
+    p448_bias (   &L0,     1 );
+       L3 = p448_is_zero(   &L0 );
+       L2 = p448_is_zero(     x );
+    return    L3 |    L2;
 }
 
 mask_t
@@ -744,15 +744,15 @@ eq_affine (
     const struct affine_t* a,
     const struct affine_t* b
 ) {
-    mask_t L0, L1;
-    struct p448_t L2;
-    p448_sub  (   &L2, &a->x, &b->x );
-    p448_bias (   &L2,     2 );
-       L1 = p448_is_zero(   &L2 );
-    p448_sub  (   &L2, &a->y, &b->y );
-    p448_bias (   &L2,     2 );
-       L0 = p448_is_zero(   &L2 );
-    return    L1 &    L0;
+    mask_t L1, L2;
+    struct p448_t L0;
+    p448_sub  (   &L0, &a->x, &b->x );
+    p448_bias (   &L0,     2 );
+       L2 = p448_is_zero(   &L0 );
+    p448_sub  (   &L0, &a->y, &b->y );
+    p448_bias (   &L0,     2 );
+       L1 = p448_is_zero(   &L0 );
+    return    L2 &    L1;
 }
 
 mask_t
@@ -760,19 +760,19 @@ eq_extensible (
     const struct extensible_t* a,
     const struct extensible_t* b
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3, L4;
-    p448_mul  (   &L4, &b->z, &a->x );
-    p448_mul  (   &L3, &a->z, &b->x );
-    p448_sub  (   &L2,   &L4,   &L3 );
-    p448_bias (   &L2,     2 );
-       L1 = p448_is_zero(   &L2 );
-    p448_mul  (   &L4, &b->z, &a->y );
-    p448_mul  (   &L3, &a->z, &b->y );
-    p448_sub  (   &L2,   &L4,   &L3 );
-    p448_bias (   &L2,     2 );
-       L0 = p448_is_zero(   &L2 );
-    return    L1 &    L0;
+    mask_t L3, L4;
+    struct p448_t L0, L1, L2;
+    p448_mul  (   &L2, &b->z, &a->x );
+    p448_mul  (   &L1, &a->z, &b->x );
+    p448_sub  (   &L0,   &L2,   &L1 );
+    p448_bias (   &L0,     2 );
+       L4 = p448_is_zero(   &L0 );
+    p448_mul  (   &L2, &b->z, &a->y );
+    p448_mul  (   &L1, &a->z, &b->y );
+    p448_sub  (   &L0,   &L2,   &L1 );
+    p448_bias (   &L0,     2 );
+       L3 = p448_is_zero(   &L0 );
+    return    L4 &    L3;
 }
 
 mask_t
@@ -780,19 +780,19 @@ eq_tw_extensible (
     const struct tw_extensible_t* a,
     const struct tw_extensible_t* b
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3, L4;
-    p448_mul  (   &L4, &b->z, &a->x );
-    p448_mul  (   &L3, &a->z, &b->x );
-    p448_sub  (   &L2,   &L4,   &L3 );
-    p448_bias (   &L2,     2 );
-       L1 = p448_is_zero(   &L2 );
-    p448_mul  (   &L4, &b->z, &a->y );
-    p448_mul  (   &L3, &a->z, &b->y );
-    p448_sub  (   &L2,   &L4,   &L3 );
-    p448_bias (   &L2,     2 );
-       L0 = p448_is_zero(   &L2 );
-    return    L1 &    L0;
+    mask_t L3, L4;
+    struct p448_t L0, L1, L2;
+    p448_mul  (   &L2, &b->z, &a->x );
+    p448_mul  (   &L1, &a->z, &b->x );
+    p448_sub  (   &L0,   &L2,   &L1 );
+    p448_bias (   &L0,     2 );
+       L4 = p448_is_zero(   &L0 );
+    p448_mul  (   &L2, &b->z, &a->y );
+    p448_mul  (   &L1, &a->z, &b->y );
+    p448_sub  (   &L0,   &L2,   &L1 );
+    p448_bias (   &L0,     2 );
+       L3 = p448_is_zero(   &L0 );
+    return    L4 &    L3;
 }
 
 void
@@ -801,38 +801,41 @@ elligator_2s_inject (
     const struct p448_t* r
 ) {
     mask_t L0, L1;
-    struct p448_t L2, L3, L4, L5, L6, L7, L8, L9;
+    struct p448_t L2, L3, L4, L5, L6, L7, L8;
     p448_sqr  ( &a->x,     r );
     p448_sqr  (   &L3, &a->x );
     p448_copy ( &a->y,   &L3 );
     p448_subw ( &a->y,     1 );
-    p448_neg  (   &L9, &a->y );
-    p448_bias (   &L9,     2 );
-    p448_weak_reduce(   &L9 );
-    p448_sqr  (   &L2,   &L9 );
-    p448_mulw (   &L8,   &L2, 1527402724 );
-    p448_mulw (   &L7,   &L3, 6108985600 );
-    p448_add  ( &a->y,   &L7,   &L8 );
+    p448_neg  (   &L4, &a->y );
+    p448_bias (   &L4,     2 );
+    p448_weak_reduce(   &L4 );
+    p448_sqr  (   &L2,   &L4 );
+    p448_mulw (   &L7,   &L2, 1527402724 );
+    p448_mulw (   &L8,   &L3, 6108985600 );
+    p448_add  ( &a->y,   &L8,   &L7 );
     p448_weak_reduce( &a->y );
     p448_mulw (   &L8,   &L2, 6109454568 );
     p448_sub  (   &L7, &a->y,   &L8 );
     p448_bias (   &L7,     2 );
     p448_weak_reduce(   &L7 );
-    p448_mulw (   &L4, &a->y, 78160 );
-    p448_mul  (   &L6,   &L7,   &L9 );
-    p448_mul  (   &L8,   &L6,   &L4 );
+    p448_mulw (   &L6, &a->y, 78160 );
+    p448_mul  (   &L5,   &L7,   &L6 );
+    p448_mul  (   &L8,   &L5,   &L4 );
+    p448_mul  (   &L4,   &L5,   &L6 );
+    p448_mul  (   &L5,   &L7,   &L8 );
+    p448_mul  (   &L8,   &L5,   &L4 );
     p448_mul  (   &L4,   &L7,   &L8 );
-    p448_isr  (   &L5,   &L4 );
-    p448_mul  (   &L4,   &L6,   &L5 );
-    p448_sqr  (   &L6,   &L5 );
-    p448_mul  (   &L5,   &L8,   &L6 );
-    p448_mul  (   &L8,   &L7,   &L5 );
-    p448_mul  (   &L7,   &L8,   &L5 );
-    p448_copy (   &L5, &a->x );
-    p448_subw (   &L5,     1 );
+    p448_isr  (   &L6,   &L4 );
+    p448_mul  (   &L4,   &L5,   &L6 );
+    p448_sqr  (   &L5,   &L6 );
+    p448_mul  (   &L6,   &L8,   &L5 );
+    p448_mul  (   &L8,   &L7,   &L6 );
+    p448_mul  (   &L7,   &L8,   &L6 );
+    p448_copy (   &L6, &a->x );
+    p448_subw (   &L6,     1 );
     p448_addw ( &a->x,     1 );
-    p448_mul  (   &L6, &a->x,   &L8 );
-    p448_sub  ( &a->x,   &L5,   &L6 );
+    p448_mul  (   &L5, &a->x,   &L8 );
+    p448_sub  ( &a->x,   &L6,   &L5 );
     p448_bias ( &a->x,     3 );
     p448_weak_reduce( &a->x );
     p448_mul  (   &L5,   &L4, &a->x );
@@ -849,7 +852,7 @@ elligator_2s_inject (
     p448_mulw (   &L3,   &L2, 3054649120 );
     p448_add  (   &L2,   &L3, &a->y );
     p448_mul  ( &a->y,   &L7,   &L2 );
-       L1 = p448_is_zero(   &L9 );
+       L1 = p448_is_zero(   &L8 );
        L0 = -   L1;
     p448_addw ( &a->y,    L0 );
     p448_weak_reduce( &a->y );
@@ -877,83 +880,83 @@ mask_t
 validate_tw_extensible (
     const struct tw_extensible_t* ext
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3, L4, L5;
+    mask_t L4, L5;
+    struct p448_t L0, L1, L2, L3;
     /*
      * Check invariant:
      * 0 = -x*y + z*t*u
      */
-    p448_mul  (   &L2, &ext->t, &ext->u );
-    p448_mul  (   &L4, &ext->z,   &L2 );
-    p448_addw (   &L4,     0 );
-    p448_mul  (   &L3, &ext->x, &ext->y );
-    p448_neg  (   &L2,   &L3 );
-    p448_add  (   &L3,   &L2,   &L4 );
-    p448_bias (   &L3,     2 );
-       L1 = p448_is_zero(   &L3 );
+    p448_mul  (   &L1, &ext->t, &ext->u );
+    p448_mul  (   &L2, &ext->z,   &L1 );
+    p448_addw (   &L2,     0 );
+    p448_mul  (   &L0, &ext->x, &ext->y );
+    p448_neg  (   &L1,   &L0 );
+    p448_add  (   &L0,   &L1,   &L2 );
+    p448_bias (   &L0,     2 );
+       L5 = p448_is_zero(   &L0 );
     /*
      * Check invariant:
      * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
      */
-    p448_sqr  (   &L4, &ext->y );
-    p448_neg  (   &L2,   &L4 );
-    p448_addw (   &L2,     0 );
-    p448_sqr  (   &L3, &ext->x );
-    p448_add  (   &L4,   &L3,   &L2 );
-    p448_sqr  (   &L5, &ext->u );
-    p448_sqr  (   &L3, &ext->t );
-    p448_mul  (   &L2,   &L3,   &L5 );
-    p448_mulw (   &L3,   &L2, 39081 );
-    p448_neg  (   &L5,   &L3 );
-    p448_add  (   &L3,   &L5,   &L4 );
-    p448_neg  (   &L5,   &L2 );
-    p448_add  (   &L4,   &L5,   &L3 );
-    p448_sqr  (   &L3, &ext->z );
-    p448_add  (   &L2,   &L3,   &L4 );
-    p448_bias (   &L2,     4 );
-       L0 = p448_is_zero(   &L2 );
-    return    L1 &    L0;
+    p448_sqr  (   &L2, &ext->y );
+    p448_neg  (   &L1,   &L2 );
+    p448_addw (   &L1,     0 );
+    p448_sqr  (   &L0, &ext->x );
+    p448_add  (   &L2,   &L0,   &L1 );
+    p448_sqr  (   &L3, &ext->u );
+    p448_sqr  (   &L0, &ext->t );
+    p448_mul  (   &L1,   &L0,   &L3 );
+    p448_mulw (   &L0,   &L1, 39081 );
+    p448_neg  (   &L3,   &L0 );
+    p448_add  (   &L0,   &L3,   &L2 );
+    p448_neg  (   &L3,   &L1 );
+    p448_add  (   &L2,   &L3,   &L0 );
+    p448_sqr  (   &L1, &ext->z );
+    p448_add  (   &L0,   &L1,   &L2 );
+    p448_bias (   &L0,     4 );
+       L4 = p448_is_zero(   &L0 );
+    return    L5 &    L4;
 }
 
 mask_t
 validate_extensible (
     const struct extensible_t* ext
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3, L4, L5;
+    mask_t L4, L5;
+    struct p448_t L0, L1, L2, L3;
     /*
      * Check invariant:
      * 0 = d*t^2*u^2 - x^2 - y^2 + z^2
      */
-    p448_sqr  (   &L4, &ext->y );
-    p448_neg  (   &L3,   &L4 );
-    p448_addw (   &L3,     0 );
-    p448_sqr  (   &L2, &ext->z );
-    p448_add  (   &L4,   &L2,   &L3 );
-    p448_sqr  (   &L5, &ext->u );
-    p448_sqr  (   &L2, &ext->t );
-    p448_mul  (   &L3,   &L2,   &L5 );
-    p448_mulw (   &L5,   &L3, 39081 );
-    p448_neg  (   &L2,   &L5 );
-    p448_add  (   &L3,   &L2,   &L4 );
-    p448_sqr  (   &L2, &ext->x );
-    p448_neg  (   &L4,   &L2 );
-    p448_add  (   &L2,   &L4,   &L3 );
-    p448_bias (   &L2,     4 );
-       L1 = p448_is_zero(   &L2 );
+    p448_sqr  (   &L2, &ext->y );
+    p448_neg  (   &L1,   &L2 );
+    p448_addw (   &L1,     0 );
+    p448_sqr  (   &L0, &ext->z );
+    p448_add  (   &L2,   &L0,   &L1 );
+    p448_sqr  (   &L3, &ext->u );
+    p448_sqr  (   &L0, &ext->t );
+    p448_mul  (   &L1,   &L0,   &L3 );
+    p448_mulw (   &L3,   &L1, 39081 );
+    p448_neg  (   &L0,   &L3 );
+    p448_add  (   &L1,   &L0,   &L2 );
+    p448_sqr  (   &L0, &ext->x );
+    p448_neg  (   &L2,   &L0 );
+    p448_add  (   &L0,   &L2,   &L1 );
+    p448_bias (   &L0,     4 );
+       L5 = p448_is_zero(   &L0 );
     /*
      * Check invariant:
      * 0 = -x*y + z*t*u
      */
-    p448_mul  (   &L3, &ext->t, &ext->u );
-    p448_mul  (   &L4, &ext->z,   &L3 );
-    p448_addw (   &L4,     0 );
-    p448_mul  (   &L2, &ext->x, &ext->y );
-    p448_neg  (   &L3,   &L2 );
-    p448_add  (   &L2,   &L3,   &L4 );
-    p448_bias (   &L2,     2 );
-       L0 = p448_is_zero(   &L2 );
-    return    L1 &    L0;
+    p448_mul  (   &L1, &ext->t, &ext->u );
+    p448_mul  (   &L2, &ext->z,   &L1 );
+    p448_addw (   &L2,     0 );
+    p448_mul  (   &L0, &ext->x, &ext->y );
+    p448_neg  (   &L1,   &L0 );
+    p448_add  (   &L0,   &L1,   &L2 );
+    p448_bias (   &L0,     2 );
+       L4 = p448_is_zero(   &L0 );
+    return    L5 &    L4;
 }
 
 
diff --git a/src/arch_32/p448.c b/src/arch_32/p448.c
index d3b2956..e45778a 100644
--- a/src/arch_32/p448.c
+++ b/src/arch_32/p448.c
@@ -4,7 +4,6 @@
 
 #include "word.h"
 #include "p448.h"
-//#include "x86-64-arith.h"
 
 static inline mask_t __attribute__((always_inline))
 is_zero (
@@ -27,13 +26,7 @@ p448_mul (
     p448_t *__restrict__ cs,
     const p448_t *as,
     const p448_t *bs
-) {
-    // p448_t ar, br;
-//     p448_copy(&ar,as);
-//     p448_copy(&br,bs);
-//     p448_weak_reduce(&ar);
-//     p448_weak_reduce(&br);
-    
+) { 
     const uint32_t *a = as->limb, *b = bs->limb;
     uint32_t *c = cs->limb;
 
@@ -41,13 +34,7 @@ p448_mul (
     uint32_t mask = (1<<28) - 1;  
 
     uint32_t aa[8], bb[8];
-
-    /* For some reason clang doesn't vectorize this without prompting? */
-    // unsigned int i;
-    // for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
-    //     ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
-    //     ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];     
-    // }
+    
     int i,j;
     for (i=0; i<8; i++) {
         aa[i] = a[i] + a[i+8];
@@ -144,7 +131,7 @@ p448_sqr (
     p448_t *__restrict__ cs,
     const p448_t *as
 ) {
-    p448_mul(cs,as,as); // PERF
+    p448_mul(cs,as,as); /* PERF */
 }
 
 void
diff --git a/src/arch_arm_32/ec_point.c b/src/arch_arm_32/ec_point.c
index 823e43d..47c325c 100644
--- a/src/arch_arm_32/ec_point.c
+++ b/src/arch_arm_32/ec_point.c
@@ -380,55 +380,55 @@ serialize_montgomery (
     const struct montgomery_t* a,
     const struct p448_t*       sbz
 ) {
-    mask_t L0, L1, L2;
-    struct p448_t L3, L4, L5, L6;
-    p448_mul  (   &L6, &a->z0, &a->zd );
-    p448_sub  (   &L4,   &L6, &a->xd );
-    p448_bias (   &L4,     2 );
-    p448_weak_reduce(   &L4 );
-    p448_mul  (   &L6, &a->za,   &L4 );
-    p448_mul  (   &L5, &a->z0, &a->xd );
-    p448_sub  (   &L4,   &L5, &a->zd );
-    p448_bias (   &L4,     2 );
-    p448_weak_reduce(   &L4 );
-    p448_mul  (   &L3, &a->xa,   &L4 );
-    p448_add  (   &L5,   &L3,   &L6 );
-    p448_sub  (   &L4,   &L6,   &L3 );
-    p448_bias (   &L4,     2 );
-    p448_weak_reduce(   &L4 );
-    p448_mul  (   &L6,   &L4,   &L5 );
-    p448_copy (   &L5, &a->z0 );
-    p448_addw (   &L5,     1 );
-    p448_sqr  (   &L4,   &L5 );
-    p448_mulw (   &L5,   &L4, 39082 );
-    p448_neg  (   &L4,   &L5 );
-    p448_add  (   &L5, &a->z0, &a->z0 );
-    p448_bias (   &L5,     1 );
-    p448_add  (   &L3,   &L5,   &L5 );
-    p448_add  (   &L5,   &L3,   &L4 );
-    p448_weak_reduce(   &L5 );
-    p448_mul  (   &L3, &a->xd,   &L5 );
-       L1 = p448_is_zero( &a->zd );
-       L2 = -   L1;
-    p448_mask (   &L4,   &L3,    L1 );
-    p448_add  (   &L5,   &L4, &a->zd );
-       L0 = ~   L1;
-    p448_mul  (   &L4,   sbz,   &L6 );
-    p448_addw (   &L4,    L2 );
-    p448_mul  (   &L6,   &L5,   &L4 );
-    p448_mul  (   &L4,   &L6,   &L5 );
-    p448_mul  (   &L5,   &L6, &a->xd );
-    p448_mul  (   &L6,   &L4,   &L5 );
-    p448_isr  (   &L3,   &L6 );
-    p448_mul  (   &L5,   &L4,   &L3 );
-    p448_sqr  (   &L4,   &L3 );
-    p448_mul  (   &L3,   &L6,   &L4 );
-    p448_mask (     b,   &L5,    L0 );
-    p448_subw (   &L3,     1 );
-    p448_bias (   &L3,     1 );
-       L1 = p448_is_zero(   &L3 );
-       L0 = p448_is_zero(   sbz );
-    return    L1 |    L0;
+    mask_t L4, L5, L6;
+    struct p448_t L0, L1, L2, L3;
+    p448_mul  (   &L3, &a->z0, &a->zd );
+    p448_sub  (   &L1,   &L3, &a->xd );
+    p448_bias (   &L1,     2 );
+    p448_weak_reduce(   &L1 );
+    p448_mul  (   &L3, &a->za,   &L1 );
+    p448_mul  (   &L2, &a->z0, &a->xd );
+    p448_sub  (   &L1,   &L2, &a->zd );
+    p448_bias (   &L1,     2 );
+    p448_weak_reduce(   &L1 );
+    p448_mul  (   &L0, &a->xa,   &L1 );
+    p448_add  (   &L2,   &L0,   &L3 );
+    p448_sub  (   &L1,   &L3,   &L0 );
+    p448_bias (   &L1,     2 );
+    p448_weak_reduce(   &L1 );
+    p448_mul  (   &L3,   &L1,   &L2 );
+    p448_copy (   &L2, &a->z0 );
+    p448_addw (   &L2,     1 );
+    p448_sqr  (   &L1,   &L2 );
+    p448_mulw (   &L2,   &L1, 39082 );
+    p448_neg  (   &L1,   &L2 );
+    p448_add  (   &L2, &a->z0, &a->z0 );
+    p448_bias (   &L2,     1 );
+    p448_add  (   &L0,   &L2,   &L2 );
+    p448_add  (   &L2,   &L0,   &L1 );
+    p448_weak_reduce(   &L2 );
+    p448_mul  (   &L0, &a->xd,   &L2 );
+       L5 = p448_is_zero( &a->zd );
+       L6 = -   L5;
+    p448_mask (   &L1,   &L0,    L5 );
+    p448_add  (   &L2,   &L1, &a->zd );
+       L4 = ~   L5;
+    p448_mul  (   &L1,   sbz,   &L3 );
+    p448_addw (   &L1,    L6 );
+    p448_mul  (   &L3,   &L2,   &L1 );
+    p448_mul  (   &L1,   &L3,   &L2 );
+    p448_mul  (   &L2,   &L3, &a->xd );
+    p448_mul  (   &L3,   &L1,   &L2 );
+    p448_isr  (   &L0,   &L3 );
+    p448_mul  (   &L2,   &L1,   &L0 );
+    p448_sqr  (   &L1,   &L0 );
+    p448_mul  (   &L0,   &L3,   &L1 );
+    p448_mask (     b,   &L2,    L4 );
+    p448_subw (   &L0,     1 );
+    p448_bias (   &L0,     1 );
+       L5 = p448_is_zero(   &L0 );
+       L4 = p448_is_zero(   sbz );
+    return    L5 |    L4;
 }
 
 void
@@ -524,8 +524,8 @@ test_only_twist (
     struct tw_extensible_t*    b,
     const struct extensible_t* a
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3;
+    mask_t L2, L3;
+    struct p448_t L0, L1;
     p448_sqr  ( &b->u, &a->z );
     p448_sqr  ( &b->y, &a->x );
     p448_sub  ( &b->z, &b->u, &b->y );
@@ -541,35 +541,35 @@ test_only_twist (
     p448_bias ( &b->z,     2 );
     p448_weak_reduce( &b->z );
     p448_mul  ( &b->t, &b->z, &b->x );
-    p448_mul  (   &L3, &b->t, &b->u );
-    p448_mul  ( &b->x, &b->t,   &L3 );
-    p448_isr  (   &L2, &b->x );
-    p448_mul  ( &b->u, &b->t,   &L2 );
-    p448_sqr  (   &L3,   &L2 );
-    p448_mul  ( &b->t, &b->x,   &L3 );
-    p448_add  ( &b->x, &a->y, &a->x );
-    p448_weak_reduce( &b->x );
-    p448_sub  (   &L2, &a->x, &a->y );
-    p448_bias (   &L2,     2 );
-    p448_weak_reduce(   &L2 );
-    p448_mul  (   &L3, &b->t,   &L2 );
-    p448_add  (   &L2,   &L3, &b->x );
-    p448_sub  ( &b->t, &b->x,   &L3 );
+    p448_mul  (   &L1, &b->t, &b->u );
+    p448_mul  ( &b->x, &b->t,   &L1 );
+    p448_isr  (   &L0, &b->x );
+    p448_mul  ( &b->u, &b->t,   &L0 );
+    p448_sqr  (   &L1,   &L0 );
+    p448_mul  ( &b->t, &b->x,   &L1 );
+    p448_add  (   &L1, &a->y, &a->x );
+    p448_weak_reduce(   &L1 );
+    p448_sub  (   &L0, &a->x, &a->y );
+    p448_bias (   &L0,     2 );
+    p448_weak_reduce(   &L0 );
+    p448_mul  ( &b->x, &b->t,   &L0 );
+    p448_add  (   &L0, &b->x,   &L1 );
+    p448_sub  ( &b->t,   &L1, &b->x );
     p448_bias ( &b->t,     2 );
     p448_weak_reduce( &b->t );
-    p448_mul  ( &b->x,   &L2, &b->u );
-       L0 = p448_is_zero( &b->y );
-       L1 = -   L0;
-    p448_addw ( &b->x,    L1 );
+    p448_mul  ( &b->x,   &L0, &b->u );
+       L2 = p448_is_zero( &b->y );
+       L3 = -   L2;
+    p448_addw ( &b->x,    L3 );
     p448_weak_reduce( &b->x );
     p448_mul  ( &b->y, &b->t, &b->u );
-       L0 = p448_is_zero( &b->z );
-       L1 = -   L0;
-    p448_addw ( &b->y,    L1 );
+       L2 = p448_is_zero( &b->z );
+       L3 = -   L2;
+    p448_addw ( &b->y,    L3 );
     p448_weak_reduce( &b->y );
-       L1 = p448_is_zero( &a->y );
-       L0 =    L1 +     1;
-    p448_set_ui( &b->z,    L0 );
+       L3 = p448_is_zero( &a->y );
+       L2 =    L3 +     1;
+    p448_set_ui( &b->z,    L2 );
     p448_copy ( &b->t, &b->x );
     p448_copy ( &b->u, &b->y );
 }
@@ -578,16 +578,16 @@ mask_t
 is_square (
     const struct p448_t* x
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3;
-    p448_isr  (   &L2,     x );
-    p448_sqr  (   &L3,   &L2 );
-    p448_mul  (   &L2,     x,   &L3 );
-    p448_subw (   &L2,     1 );
-    p448_bias (   &L2,     1 );
-       L1 = p448_is_zero(   &L2 );
-       L0 = p448_is_zero(     x );
-    return    L1 |    L0;
+    mask_t L2, L3;
+    struct p448_t L0, L1;
+    p448_isr  (   &L0,     x );
+    p448_sqr  (   &L1,   &L0 );
+    p448_mul  (   &L0,     x,   &L1 );
+    p448_subw (   &L0,     1 );
+    p448_bias (   &L0,     1 );
+       L3 = p448_is_zero(   &L0 );
+       L2 = p448_is_zero(     x );
+    return    L3 |    L2;
 }
 
 mask_t
@@ -744,15 +744,15 @@ eq_affine (
     const struct affine_t* a,
     const struct affine_t* b
 ) {
-    mask_t L0, L1;
-    struct p448_t L2;
-    p448_sub  (   &L2, &a->x, &b->x );
-    p448_bias (   &L2,     2 );
-       L1 = p448_is_zero(   &L2 );
-    p448_sub  (   &L2, &a->y, &b->y );
-    p448_bias (   &L2,     2 );
-       L0 = p448_is_zero(   &L2 );
-    return    L1 &    L0;
+    mask_t L1, L2;
+    struct p448_t L0;
+    p448_sub  (   &L0, &a->x, &b->x );
+    p448_bias (   &L0,     2 );
+       L2 = p448_is_zero(   &L0 );
+    p448_sub  (   &L0, &a->y, &b->y );
+    p448_bias (   &L0,     2 );
+       L1 = p448_is_zero(   &L0 );
+    return    L2 &    L1;
 }
 
 mask_t
@@ -760,19 +760,19 @@ eq_extensible (
     const struct extensible_t* a,
     const struct extensible_t* b
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3, L4;
-    p448_mul  (   &L4, &b->z, &a->x );
-    p448_mul  (   &L3, &a->z, &b->x );
-    p448_sub  (   &L2,   &L4,   &L3 );
-    p448_bias (   &L2,     2 );
-       L1 = p448_is_zero(   &L2 );
-    p448_mul  (   &L4, &b->z, &a->y );
-    p448_mul  (   &L3, &a->z, &b->y );
-    p448_sub  (   &L2,   &L4,   &L3 );
-    p448_bias (   &L2,     2 );
-       L0 = p448_is_zero(   &L2 );
-    return    L1 &    L0;
+    mask_t L3, L4;
+    struct p448_t L0, L1, L2;
+    p448_mul  (   &L2, &b->z, &a->x );
+    p448_mul  (   &L1, &a->z, &b->x );
+    p448_sub  (   &L0,   &L2,   &L1 );
+    p448_bias (   &L0,     2 );
+       L4 = p448_is_zero(   &L0 );
+    p448_mul  (   &L2, &b->z, &a->y );
+    p448_mul  (   &L1, &a->z, &b->y );
+    p448_sub  (   &L0,   &L2,   &L1 );
+    p448_bias (   &L0,     2 );
+       L3 = p448_is_zero(   &L0 );
+    return    L4 &    L3;
 }
 
 mask_t
@@ -780,19 +780,19 @@ eq_tw_extensible (
     const struct tw_extensible_t* a,
     const struct tw_extensible_t* b
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3, L4;
-    p448_mul  (   &L4, &b->z, &a->x );
-    p448_mul  (   &L3, &a->z, &b->x );
-    p448_sub  (   &L2,   &L4,   &L3 );
-    p448_bias (   &L2,     2 );
-       L1 = p448_is_zero(   &L2 );
-    p448_mul  (   &L4, &b->z, &a->y );
-    p448_mul  (   &L3, &a->z, &b->y );
-    p448_sub  (   &L2,   &L4,   &L3 );
-    p448_bias (   &L2,     2 );
-       L0 = p448_is_zero(   &L2 );
-    return    L1 &    L0;
+    mask_t L3, L4;
+    struct p448_t L0, L1, L2;
+    p448_mul  (   &L2, &b->z, &a->x );
+    p448_mul  (   &L1, &a->z, &b->x );
+    p448_sub  (   &L0,   &L2,   &L1 );
+    p448_bias (   &L0,     2 );
+       L4 = p448_is_zero(   &L0 );
+    p448_mul  (   &L2, &b->z, &a->y );
+    p448_mul  (   &L1, &a->z, &b->y );
+    p448_sub  (   &L0,   &L2,   &L1 );
+    p448_bias (   &L0,     2 );
+       L3 = p448_is_zero(   &L0 );
+    return    L4 &    L3;
 }
 
 void
@@ -801,38 +801,41 @@ elligator_2s_inject (
     const struct p448_t* r
 ) {
     mask_t L0, L1;
-    struct p448_t L2, L3, L4, L5, L6, L7, L8, L9;
+    struct p448_t L2, L3, L4, L5, L6, L7, L8;
     p448_sqr  ( &a->x,     r );
     p448_sqr  (   &L3, &a->x );
     p448_copy ( &a->y,   &L3 );
     p448_subw ( &a->y,     1 );
-    p448_neg  (   &L9, &a->y );
-    p448_bias (   &L9,     2 );
-    p448_weak_reduce(   &L9 );
-    p448_sqr  (   &L2,   &L9 );
-    p448_mulw (   &L8,   &L2, 1527402724 );
-    p448_mulw (   &L7,   &L3, 6108985600 );
-    p448_add  ( &a->y,   &L7,   &L8 );
+    p448_neg  (   &L4, &a->y );
+    p448_bias (   &L4,     2 );
+    p448_weak_reduce(   &L4 );
+    p448_sqr  (   &L2,   &L4 );
+    p448_mulw (   &L7,   &L2, 1527402724 );
+    p448_mulw (   &L8,   &L3, 6108985600 );
+    p448_add  ( &a->y,   &L8,   &L7 );
     p448_weak_reduce( &a->y );
     p448_mulw (   &L8,   &L2, 6109454568 );
     p448_sub  (   &L7, &a->y,   &L8 );
     p448_bias (   &L7,     2 );
     p448_weak_reduce(   &L7 );
-    p448_mulw (   &L4, &a->y, 78160 );
-    p448_mul  (   &L6,   &L7,   &L9 );
-    p448_mul  (   &L8,   &L6,   &L4 );
+    p448_mulw (   &L6, &a->y, 78160 );
+    p448_mul  (   &L5,   &L7,   &L6 );
+    p448_mul  (   &L8,   &L5,   &L4 );
+    p448_mul  (   &L4,   &L5,   &L6 );
+    p448_mul  (   &L5,   &L7,   &L8 );
+    p448_mul  (   &L8,   &L5,   &L4 );
     p448_mul  (   &L4,   &L7,   &L8 );
-    p448_isr  (   &L5,   &L4 );
-    p448_mul  (   &L4,   &L6,   &L5 );
-    p448_sqr  (   &L6,   &L5 );
-    p448_mul  (   &L5,   &L8,   &L6 );
-    p448_mul  (   &L8,   &L7,   &L5 );
-    p448_mul  (   &L7,   &L8,   &L5 );
-    p448_copy (   &L5, &a->x );
-    p448_subw (   &L5,     1 );
+    p448_isr  (   &L6,   &L4 );
+    p448_mul  (   &L4,   &L5,   &L6 );
+    p448_sqr  (   &L5,   &L6 );
+    p448_mul  (   &L6,   &L8,   &L5 );
+    p448_mul  (   &L8,   &L7,   &L6 );
+    p448_mul  (   &L7,   &L8,   &L6 );
+    p448_copy (   &L6, &a->x );
+    p448_subw (   &L6,     1 );
     p448_addw ( &a->x,     1 );
-    p448_mul  (   &L6, &a->x,   &L8 );
-    p448_sub  ( &a->x,   &L5,   &L6 );
+    p448_mul  (   &L5, &a->x,   &L8 );
+    p448_sub  ( &a->x,   &L6,   &L5 );
     p448_bias ( &a->x,     3 );
     p448_weak_reduce( &a->x );
     p448_mul  (   &L5,   &L4, &a->x );
@@ -849,7 +852,7 @@ elligator_2s_inject (
     p448_mulw (   &L3,   &L2, 3054649120 );
     p448_add  (   &L2,   &L3, &a->y );
     p448_mul  ( &a->y,   &L7,   &L2 );
-       L1 = p448_is_zero(   &L9 );
+       L1 = p448_is_zero(   &L8 );
        L0 = -   L1;
     p448_addw ( &a->y,    L0 );
     p448_weak_reduce( &a->y );
@@ -877,83 +880,83 @@ mask_t
 validate_tw_extensible (
     const struct tw_extensible_t* ext
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3, L4, L5;
+    mask_t L4, L5;
+    struct p448_t L0, L1, L2, L3;
     /*
      * Check invariant:
      * 0 = -x*y + z*t*u
      */
-    p448_mul  (   &L2, &ext->t, &ext->u );
-    p448_mul  (   &L4, &ext->z,   &L2 );
-    p448_addw (   &L4,     0 );
-    p448_mul  (   &L3, &ext->x, &ext->y );
-    p448_neg  (   &L2,   &L3 );
-    p448_add  (   &L3,   &L2,   &L4 );
-    p448_bias (   &L3,     2 );
-       L1 = p448_is_zero(   &L3 );
+    p448_mul  (   &L1, &ext->t, &ext->u );
+    p448_mul  (   &L2, &ext->z,   &L1 );
+    p448_addw (   &L2,     0 );
+    p448_mul  (   &L0, &ext->x, &ext->y );
+    p448_neg  (   &L1,   &L0 );
+    p448_add  (   &L0,   &L1,   &L2 );
+    p448_bias (   &L0,     2 );
+       L5 = p448_is_zero(   &L0 );
     /*
      * Check invariant:
      * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
      */
-    p448_sqr  (   &L4, &ext->y );
-    p448_neg  (   &L2,   &L4 );
-    p448_addw (   &L2,     0 );
-    p448_sqr  (   &L3, &ext->x );
-    p448_add  (   &L4,   &L3,   &L2 );
-    p448_sqr  (   &L5, &ext->u );
-    p448_sqr  (   &L3, &ext->t );
-    p448_mul  (   &L2,   &L3,   &L5 );
-    p448_mulw (   &L3,   &L2, 39081 );
-    p448_neg  (   &L5,   &L3 );
-    p448_add  (   &L3,   &L5,   &L4 );
-    p448_neg  (   &L5,   &L2 );
-    p448_add  (   &L4,   &L5,   &L3 );
-    p448_sqr  (   &L3, &ext->z );
-    p448_add  (   &L2,   &L3,   &L4 );
-    p448_bias (   &L2,     4 );
-       L0 = p448_is_zero(   &L2 );
-    return    L1 &    L0;
+    p448_sqr  (   &L2, &ext->y );
+    p448_neg  (   &L1,   &L2 );
+    p448_addw (   &L1,     0 );
+    p448_sqr  (   &L0, &ext->x );
+    p448_add  (   &L2,   &L0,   &L1 );
+    p448_sqr  (   &L3, &ext->u );
+    p448_sqr  (   &L0, &ext->t );
+    p448_mul  (   &L1,   &L0,   &L3 );
+    p448_mulw (   &L0,   &L1, 39081 );
+    p448_neg  (   &L3,   &L0 );
+    p448_add  (   &L0,   &L3,   &L2 );
+    p448_neg  (   &L3,   &L1 );
+    p448_add  (   &L2,   &L3,   &L0 );
+    p448_sqr  (   &L1, &ext->z );
+    p448_add  (   &L0,   &L1,   &L2 );
+    p448_bias (   &L0,     4 );
+       L4 = p448_is_zero(   &L0 );
+    return    L5 &    L4;
 }
 
 mask_t
 validate_extensible (
     const struct extensible_t* ext
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3, L4, L5;
+    mask_t L4, L5;
+    struct p448_t L0, L1, L2, L3;
     /*
      * Check invariant:
      * 0 = d*t^2*u^2 - x^2 - y^2 + z^2
      */
-    p448_sqr  (   &L4, &ext->y );
-    p448_neg  (   &L3,   &L4 );
-    p448_addw (   &L3,     0 );
-    p448_sqr  (   &L2, &ext->z );
-    p448_add  (   &L4,   &L2,   &L3 );
-    p448_sqr  (   &L5, &ext->u );
-    p448_sqr  (   &L2, &ext->t );
-    p448_mul  (   &L3,   &L2,   &L5 );
-    p448_mulw (   &L5,   &L3, 39081 );
-    p448_neg  (   &L2,   &L5 );
-    p448_add  (   &L3,   &L2,   &L4 );
-    p448_sqr  (   &L2, &ext->x );
-    p448_neg  (   &L4,   &L2 );
-    p448_add  (   &L2,   &L4,   &L3 );
-    p448_bias (   &L2,     4 );
-       L1 = p448_is_zero(   &L2 );
+    p448_sqr  (   &L2, &ext->y );
+    p448_neg  (   &L1,   &L2 );
+    p448_addw (   &L1,     0 );
+    p448_sqr  (   &L0, &ext->z );
+    p448_add  (   &L2,   &L0,   &L1 );
+    p448_sqr  (   &L3, &ext->u );
+    p448_sqr  (   &L0, &ext->t );
+    p448_mul  (   &L1,   &L0,   &L3 );
+    p448_mulw (   &L3,   &L1, 39081 );
+    p448_neg  (   &L0,   &L3 );
+    p448_add  (   &L1,   &L0,   &L2 );
+    p448_sqr  (   &L0, &ext->x );
+    p448_neg  (   &L2,   &L0 );
+    p448_add  (   &L0,   &L2,   &L1 );
+    p448_bias (   &L0,     4 );
+       L5 = p448_is_zero(   &L0 );
     /*
      * Check invariant:
      * 0 = -x*y + z*t*u
      */
-    p448_mul  (   &L3, &ext->t, &ext->u );
-    p448_mul  (   &L4, &ext->z,   &L3 );
-    p448_addw (   &L4,     0 );
-    p448_mul  (   &L2, &ext->x, &ext->y );
-    p448_neg  (   &L3,   &L2 );
-    p448_add  (   &L2,   &L3,   &L4 );
-    p448_bias (   &L2,     2 );
-       L0 = p448_is_zero(   &L2 );
-    return    L1 &    L0;
+    p448_mul  (   &L1, &ext->t, &ext->u );
+    p448_mul  (   &L2, &ext->z,   &L1 );
+    p448_addw (   &L2,     0 );
+    p448_mul  (   &L0, &ext->x, &ext->y );
+    p448_neg  (   &L1,   &L0 );
+    p448_add  (   &L0,   &L1,   &L2 );
+    p448_bias (   &L0,     2 );
+       L4 = p448_is_zero(   &L0 );
+    return    L5 &    L4;
 }
 
 
diff --git a/src/arch_arm_32/p448.c b/src/arch_arm_32/p448.c
index fa3c583..ec08fb8 100644
--- a/src/arch_arm_32/p448.c
+++ b/src/arch_arm_32/p448.c
@@ -4,7 +4,6 @@
 
 #include "word.h"
 #include "p448.h"
-//#include "x86-64-arith.h"
 
 static inline mask_t __attribute__((always_inline))
 is_zero (
@@ -105,11 +104,6 @@ p448_mul (
     const p448_t *as,
     const p448_t *bs
 ) {
-    // p448_t ar, br;
-//     p448_copy(&ar,as);
-//     p448_copy(&br,bs);
-//     p448_weak_reduce(&ar);
-//     p448_weak_reduce(&br);
     
     const uint32_t *a = as->limb, *b = bs->limb;
     uint32_t *c = cs->limb;
@@ -119,12 +113,6 @@ p448_mul (
 
     uint32_t aa[8], bm[8];
 
-    /* For some reason clang doesn't vectorize this without prompting? */
-    // unsigned int i;
-    // for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
-    //     ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
-    //     ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];     
-    // }
     int i;
     for (i=0; i<8; i++) {
         aa[i] = a[i] + a[i+8];
@@ -466,12 +454,6 @@ p448_sqr (
     p448_t *__restrict__ cs,
     const p448_t *as
 ) {
-    // p448_t ar, br;
-//     p448_copy(&ar,as);
-//     p448_copy(&br,bs);
-//     p448_weak_reduce(&ar);
-//     p448_weak_reduce(&br);
-    
     const uint32_t *a = as->limb;
     uint32_t *c = cs->limb;
 
@@ -479,13 +461,7 @@ p448_sqr (
     uint32_t mask = (1<<28) - 1;  
 
     uint32_t bm[8];
-
-    /* For some reason clang doesn't vectorize this without prompting? */
-    // unsigned int i;
-    // for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
-    //     ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&bm[4]))[i];
-    //     ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];     
-    // }
+    
     int i;
     for (i=0; i<8; i++) {
         bm[i] = a[i] - a[i+8];
diff --git a/src/arch_neon/ec_point.c b/src/arch_neon/ec_point.c
index 823e43d..47c325c 100644
--- a/src/arch_neon/ec_point.c
+++ b/src/arch_neon/ec_point.c
@@ -380,55 +380,55 @@ serialize_montgomery (
     const struct montgomery_t* a,
     const struct p448_t*       sbz
 ) {
-    mask_t L0, L1, L2;
-    struct p448_t L3, L4, L5, L6;
-    p448_mul  (   &L6, &a->z0, &a->zd );
-    p448_sub  (   &L4,   &L6, &a->xd );
-    p448_bias (   &L4,     2 );
-    p448_weak_reduce(   &L4 );
-    p448_mul  (   &L6, &a->za,   &L4 );
-    p448_mul  (   &L5, &a->z0, &a->xd );
-    p448_sub  (   &L4,   &L5, &a->zd );
-    p448_bias (   &L4,     2 );
-    p448_weak_reduce(   &L4 );
-    p448_mul  (   &L3, &a->xa,   &L4 );
-    p448_add  (   &L5,   &L3,   &L6 );
-    p448_sub  (   &L4,   &L6,   &L3 );
-    p448_bias (   &L4,     2 );
-    p448_weak_reduce(   &L4 );
-    p448_mul  (   &L6,   &L4,   &L5 );
-    p448_copy (   &L5, &a->z0 );
-    p448_addw (   &L5,     1 );
-    p448_sqr  (   &L4,   &L5 );
-    p448_mulw (   &L5,   &L4, 39082 );
-    p448_neg  (   &L4,   &L5 );
-    p448_add  (   &L5, &a->z0, &a->z0 );
-    p448_bias (   &L5,     1 );
-    p448_add  (   &L3,   &L5,   &L5 );
-    p448_add  (   &L5,   &L3,   &L4 );
-    p448_weak_reduce(   &L5 );
-    p448_mul  (   &L3, &a->xd,   &L5 );
-       L1 = p448_is_zero( &a->zd );
-       L2 = -   L1;
-    p448_mask (   &L4,   &L3,    L1 );
-    p448_add  (   &L5,   &L4, &a->zd );
-       L0 = ~   L1;
-    p448_mul  (   &L4,   sbz,   &L6 );
-    p448_addw (   &L4,    L2 );
-    p448_mul  (   &L6,   &L5,   &L4 );
-    p448_mul  (   &L4,   &L6,   &L5 );
-    p448_mul  (   &L5,   &L6, &a->xd );
-    p448_mul  (   &L6,   &L4,   &L5 );
-    p448_isr  (   &L3,   &L6 );
-    p448_mul  (   &L5,   &L4,   &L3 );
-    p448_sqr  (   &L4,   &L3 );
-    p448_mul  (   &L3,   &L6,   &L4 );
-    p448_mask (     b,   &L5,    L0 );
-    p448_subw (   &L3,     1 );
-    p448_bias (   &L3,     1 );
-       L1 = p448_is_zero(   &L3 );
-       L0 = p448_is_zero(   sbz );
-    return    L1 |    L0;
+    mask_t L4, L5, L6;
+    struct p448_t L0, L1, L2, L3;
+    p448_mul  (   &L3, &a->z0, &a->zd );
+    p448_sub  (   &L1,   &L3, &a->xd );
+    p448_bias (   &L1,     2 );
+    p448_weak_reduce(   &L1 );
+    p448_mul  (   &L3, &a->za,   &L1 );
+    p448_mul  (   &L2, &a->z0, &a->xd );
+    p448_sub  (   &L1,   &L2, &a->zd );
+    p448_bias (   &L1,     2 );
+    p448_weak_reduce(   &L1 );
+    p448_mul  (   &L0, &a->xa,   &L1 );
+    p448_add  (   &L2,   &L0,   &L3 );
+    p448_sub  (   &L1,   &L3,   &L0 );
+    p448_bias (   &L1,     2 );
+    p448_weak_reduce(   &L1 );
+    p448_mul  (   &L3,   &L1,   &L2 );
+    p448_copy (   &L2, &a->z0 );
+    p448_addw (   &L2,     1 );
+    p448_sqr  (   &L1,   &L2 );
+    p448_mulw (   &L2,   &L1, 39082 );
+    p448_neg  (   &L1,   &L2 );
+    p448_add  (   &L2, &a->z0, &a->z0 );
+    p448_bias (   &L2,     1 );
+    p448_add  (   &L0,   &L2,   &L2 );
+    p448_add  (   &L2,   &L0,   &L1 );
+    p448_weak_reduce(   &L2 );
+    p448_mul  (   &L0, &a->xd,   &L2 );
+       L5 = p448_is_zero( &a->zd );
+       L6 = -   L5;
+    p448_mask (   &L1,   &L0,    L5 );
+    p448_add  (   &L2,   &L1, &a->zd );
+       L4 = ~   L5;
+    p448_mul  (   &L1,   sbz,   &L3 );
+    p448_addw (   &L1,    L6 );
+    p448_mul  (   &L3,   &L2,   &L1 );
+    p448_mul  (   &L1,   &L3,   &L2 );
+    p448_mul  (   &L2,   &L3, &a->xd );
+    p448_mul  (   &L3,   &L1,   &L2 );
+    p448_isr  (   &L0,   &L3 );
+    p448_mul  (   &L2,   &L1,   &L0 );
+    p448_sqr  (   &L1,   &L0 );
+    p448_mul  (   &L0,   &L3,   &L1 );
+    p448_mask (     b,   &L2,    L4 );
+    p448_subw (   &L0,     1 );
+    p448_bias (   &L0,     1 );
+       L5 = p448_is_zero(   &L0 );
+       L4 = p448_is_zero(   sbz );
+    return    L5 |    L4;
 }
 
 void
@@ -524,8 +524,8 @@ test_only_twist (
     struct tw_extensible_t*    b,
     const struct extensible_t* a
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3;
+    mask_t L2, L3;
+    struct p448_t L0, L1;
     p448_sqr  ( &b->u, &a->z );
     p448_sqr  ( &b->y, &a->x );
     p448_sub  ( &b->z, &b->u, &b->y );
@@ -541,35 +541,35 @@ test_only_twist (
     p448_bias ( &b->z,     2 );
     p448_weak_reduce( &b->z );
     p448_mul  ( &b->t, &b->z, &b->x );
-    p448_mul  (   &L3, &b->t, &b->u );
-    p448_mul  ( &b->x, &b->t,   &L3 );
-    p448_isr  (   &L2, &b->x );
-    p448_mul  ( &b->u, &b->t,   &L2 );
-    p448_sqr  (   &L3,   &L2 );
-    p448_mul  ( &b->t, &b->x,   &L3 );
-    p448_add  ( &b->x, &a->y, &a->x );
-    p448_weak_reduce( &b->x );
-    p448_sub  (   &L2, &a->x, &a->y );
-    p448_bias (   &L2,     2 );
-    p448_weak_reduce(   &L2 );
-    p448_mul  (   &L3, &b->t,   &L2 );
-    p448_add  (   &L2,   &L3, &b->x );
-    p448_sub  ( &b->t, &b->x,   &L3 );
+    p448_mul  (   &L1, &b->t, &b->u );
+    p448_mul  ( &b->x, &b->t,   &L1 );
+    p448_isr  (   &L0, &b->x );
+    p448_mul  ( &b->u, &b->t,   &L0 );
+    p448_sqr  (   &L1,   &L0 );
+    p448_mul  ( &b->t, &b->x,   &L1 );
+    p448_add  (   &L1, &a->y, &a->x );
+    p448_weak_reduce(   &L1 );
+    p448_sub  (   &L0, &a->x, &a->y );
+    p448_bias (   &L0,     2 );
+    p448_weak_reduce(   &L0 );
+    p448_mul  ( &b->x, &b->t,   &L0 );
+    p448_add  (   &L0, &b->x,   &L1 );
+    p448_sub  ( &b->t,   &L1, &b->x );
     p448_bias ( &b->t,     2 );
     p448_weak_reduce( &b->t );
-    p448_mul  ( &b->x,   &L2, &b->u );
-       L0 = p448_is_zero( &b->y );
-       L1 = -   L0;
-    p448_addw ( &b->x,    L1 );
+    p448_mul  ( &b->x,   &L0, &b->u );
+       L2 = p448_is_zero( &b->y );
+       L3 = -   L2;
+    p448_addw ( &b->x,    L3 );
     p448_weak_reduce( &b->x );
     p448_mul  ( &b->y, &b->t, &b->u );
-       L0 = p448_is_zero( &b->z );
-       L1 = -   L0;
-    p448_addw ( &b->y,    L1 );
+       L2 = p448_is_zero( &b->z );
+       L3 = -   L2;
+    p448_addw ( &b->y,    L3 );
     p448_weak_reduce( &b->y );
-       L1 = p448_is_zero( &a->y );
-       L0 =    L1 +     1;
-    p448_set_ui( &b->z,    L0 );
+       L3 = p448_is_zero( &a->y );
+       L2 =    L3 +     1;
+    p448_set_ui( &b->z,    L2 );
     p448_copy ( &b->t, &b->x );
     p448_copy ( &b->u, &b->y );
 }
@@ -578,16 +578,16 @@ mask_t
 is_square (
     const struct p448_t* x
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3;
-    p448_isr  (   &L2,     x );
-    p448_sqr  (   &L3,   &L2 );
-    p448_mul  (   &L2,     x,   &L3 );
-    p448_subw (   &L2,     1 );
-    p448_bias (   &L2,     1 );
-       L1 = p448_is_zero(   &L2 );
-       L0 = p448_is_zero(     x );
-    return    L1 |    L0;
+    mask_t L2, L3;
+    struct p448_t L0, L1;
+    p448_isr  (   &L0,     x );
+    p448_sqr  (   &L1,   &L0 );
+    p448_mul  (   &L0,     x,   &L1 );
+    p448_subw (   &L0,     1 );
+    p448_bias (   &L0,     1 );
+       L3 = p448_is_zero(   &L0 );
+       L2 = p448_is_zero(     x );
+    return    L3 |    L2;
 }
 
 mask_t
@@ -744,15 +744,15 @@ eq_affine (
     const struct affine_t* a,
     const struct affine_t* b
 ) {
-    mask_t L0, L1;
-    struct p448_t L2;
-    p448_sub  (   &L2, &a->x, &b->x );
-    p448_bias (   &L2,     2 );
-       L1 = p448_is_zero(   &L2 );
-    p448_sub  (   &L2, &a->y, &b->y );
-    p448_bias (   &L2,     2 );
-       L0 = p448_is_zero(   &L2 );
-    return    L1 &    L0;
+    mask_t L1, L2;
+    struct p448_t L0;
+    p448_sub  (   &L0, &a->x, &b->x );
+    p448_bias (   &L0,     2 );
+       L2 = p448_is_zero(   &L0 );
+    p448_sub  (   &L0, &a->y, &b->y );
+    p448_bias (   &L0,     2 );
+       L1 = p448_is_zero(   &L0 );
+    return    L2 &    L1;
 }
 
 mask_t
@@ -760,19 +760,19 @@ eq_extensible (
     const struct extensible_t* a,
     const struct extensible_t* b
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3, L4;
-    p448_mul  (   &L4, &b->z, &a->x );
-    p448_mul  (   &L3, &a->z, &b->x );
-    p448_sub  (   &L2,   &L4,   &L3 );
-    p448_bias (   &L2,     2 );
-       L1 = p448_is_zero(   &L2 );
-    p448_mul  (   &L4, &b->z, &a->y );
-    p448_mul  (   &L3, &a->z, &b->y );
-    p448_sub  (   &L2,   &L4,   &L3 );
-    p448_bias (   &L2,     2 );
-       L0 = p448_is_zero(   &L2 );
-    return    L1 &    L0;
+    mask_t L3, L4;
+    struct p448_t L0, L1, L2;
+    p448_mul  (   &L2, &b->z, &a->x );
+    p448_mul  (   &L1, &a->z, &b->x );
+    p448_sub  (   &L0,   &L2,   &L1 );
+    p448_bias (   &L0,     2 );
+       L4 = p448_is_zero(   &L0 );
+    p448_mul  (   &L2, &b->z, &a->y );
+    p448_mul  (   &L1, &a->z, &b->y );
+    p448_sub  (   &L0,   &L2,   &L1 );
+    p448_bias (   &L0,     2 );
+       L3 = p448_is_zero(   &L0 );
+    return    L4 &    L3;
 }
 
 mask_t
@@ -780,19 +780,19 @@ eq_tw_extensible (
     const struct tw_extensible_t* a,
     const struct tw_extensible_t* b
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3, L4;
-    p448_mul  (   &L4, &b->z, &a->x );
-    p448_mul  (   &L3, &a->z, &b->x );
-    p448_sub  (   &L2,   &L4,   &L3 );
-    p448_bias (   &L2,     2 );
-       L1 = p448_is_zero(   &L2 );
-    p448_mul  (   &L4, &b->z, &a->y );
-    p448_mul  (   &L3, &a->z, &b->y );
-    p448_sub  (   &L2,   &L4,   &L3 );
-    p448_bias (   &L2,     2 );
-       L0 = p448_is_zero(   &L2 );
-    return    L1 &    L0;
+    mask_t L3, L4;
+    struct p448_t L0, L1, L2;
+    p448_mul  (   &L2, &b->z, &a->x );
+    p448_mul  (   &L1, &a->z, &b->x );
+    p448_sub  (   &L0,   &L2,   &L1 );
+    p448_bias (   &L0,     2 );
+       L4 = p448_is_zero(   &L0 );
+    p448_mul  (   &L2, &b->z, &a->y );
+    p448_mul  (   &L1, &a->z, &b->y );
+    p448_sub  (   &L0,   &L2,   &L1 );
+    p448_bias (   &L0,     2 );
+       L3 = p448_is_zero(   &L0 );
+    return    L4 &    L3;
 }
 
 void
@@ -801,38 +801,41 @@ elligator_2s_inject (
     const struct p448_t* r
 ) {
     mask_t L0, L1;
-    struct p448_t L2, L3, L4, L5, L6, L7, L8, L9;
+    struct p448_t L2, L3, L4, L5, L6, L7, L8;
     p448_sqr  ( &a->x,     r );
     p448_sqr  (   &L3, &a->x );
     p448_copy ( &a->y,   &L3 );
     p448_subw ( &a->y,     1 );
-    p448_neg  (   &L9, &a->y );
-    p448_bias (   &L9,     2 );
-    p448_weak_reduce(   &L9 );
-    p448_sqr  (   &L2,   &L9 );
-    p448_mulw (   &L8,   &L2, 1527402724 );
-    p448_mulw (   &L7,   &L3, 6108985600 );
-    p448_add  ( &a->y,   &L7,   &L8 );
+    p448_neg  (   &L4, &a->y );
+    p448_bias (   &L4,     2 );
+    p448_weak_reduce(   &L4 );
+    p448_sqr  (   &L2,   &L4 );
+    p448_mulw (   &L7,   &L2, 1527402724 );
+    p448_mulw (   &L8,   &L3, 6108985600 );
+    p448_add  ( &a->y,   &L8,   &L7 );
     p448_weak_reduce( &a->y );
     p448_mulw (   &L8,   &L2, 6109454568 );
     p448_sub  (   &L7, &a->y,   &L8 );
     p448_bias (   &L7,     2 );
     p448_weak_reduce(   &L7 );
-    p448_mulw (   &L4, &a->y, 78160 );
-    p448_mul  (   &L6,   &L7,   &L9 );
-    p448_mul  (   &L8,   &L6,   &L4 );
+    p448_mulw (   &L6, &a->y, 78160 );
+    p448_mul  (   &L5,   &L7,   &L6 );
+    p448_mul  (   &L8,   &L5,   &L4 );
+    p448_mul  (   &L4,   &L5,   &L6 );
+    p448_mul  (   &L5,   &L7,   &L8 );
+    p448_mul  (   &L8,   &L5,   &L4 );
     p448_mul  (   &L4,   &L7,   &L8 );
-    p448_isr  (   &L5,   &L4 );
-    p448_mul  (   &L4,   &L6,   &L5 );
-    p448_sqr  (   &L6,   &L5 );
-    p448_mul  (   &L5,   &L8,   &L6 );
-    p448_mul  (   &L8,   &L7,   &L5 );
-    p448_mul  (   &L7,   &L8,   &L5 );
-    p448_copy (   &L5, &a->x );
-    p448_subw (   &L5,     1 );
+    p448_isr  (   &L6,   &L4 );
+    p448_mul  (   &L4,   &L5,   &L6 );
+    p448_sqr  (   &L5,   &L6 );
+    p448_mul  (   &L6,   &L8,   &L5 );
+    p448_mul  (   &L8,   &L7,   &L6 );
+    p448_mul  (   &L7,   &L8,   &L6 );
+    p448_copy (   &L6, &a->x );
+    p448_subw (   &L6,     1 );
     p448_addw ( &a->x,     1 );
-    p448_mul  (   &L6, &a->x,   &L8 );
-    p448_sub  ( &a->x,   &L5,   &L6 );
+    p448_mul  (   &L5, &a->x,   &L8 );
+    p448_sub  ( &a->x,   &L6,   &L5 );
     p448_bias ( &a->x,     3 );
     p448_weak_reduce( &a->x );
     p448_mul  (   &L5,   &L4, &a->x );
@@ -849,7 +852,7 @@ elligator_2s_inject (
     p448_mulw (   &L3,   &L2, 3054649120 );
     p448_add  (   &L2,   &L3, &a->y );
     p448_mul  ( &a->y,   &L7,   &L2 );
-       L1 = p448_is_zero(   &L9 );
+       L1 = p448_is_zero(   &L8 );
        L0 = -   L1;
     p448_addw ( &a->y,    L0 );
     p448_weak_reduce( &a->y );
@@ -877,83 +880,83 @@ mask_t
 validate_tw_extensible (
     const struct tw_extensible_t* ext
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3, L4, L5;
+    mask_t L4, L5;
+    struct p448_t L0, L1, L2, L3;
     /*
      * Check invariant:
      * 0 = -x*y + z*t*u
      */
-    p448_mul  (   &L2, &ext->t, &ext->u );
-    p448_mul  (   &L4, &ext->z,   &L2 );
-    p448_addw (   &L4,     0 );
-    p448_mul  (   &L3, &ext->x, &ext->y );
-    p448_neg  (   &L2,   &L3 );
-    p448_add  (   &L3,   &L2,   &L4 );
-    p448_bias (   &L3,     2 );
-       L1 = p448_is_zero(   &L3 );
+    p448_mul  (   &L1, &ext->t, &ext->u );
+    p448_mul  (   &L2, &ext->z,   &L1 );
+    p448_addw (   &L2,     0 );
+    p448_mul  (   &L0, &ext->x, &ext->y );
+    p448_neg  (   &L1,   &L0 );
+    p448_add  (   &L0,   &L1,   &L2 );
+    p448_bias (   &L0,     2 );
+       L5 = p448_is_zero(   &L0 );
     /*
      * Check invariant:
      * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
      */
-    p448_sqr  (   &L4, &ext->y );
-    p448_neg  (   &L2,   &L4 );
-    p448_addw (   &L2,     0 );
-    p448_sqr  (   &L3, &ext->x );
-    p448_add  (   &L4,   &L3,   &L2 );
-    p448_sqr  (   &L5, &ext->u );
-    p448_sqr  (   &L3, &ext->t );
-    p448_mul  (   &L2,   &L3,   &L5 );
-    p448_mulw (   &L3,   &L2, 39081 );
-    p448_neg  (   &L5,   &L3 );
-    p448_add  (   &L3,   &L5,   &L4 );
-    p448_neg  (   &L5,   &L2 );
-    p448_add  (   &L4,   &L5,   &L3 );
-    p448_sqr  (   &L3, &ext->z );
-    p448_add  (   &L2,   &L3,   &L4 );
-    p448_bias (   &L2,     4 );
-       L0 = p448_is_zero(   &L2 );
-    return    L1 &    L0;
+    p448_sqr  (   &L2, &ext->y );
+    p448_neg  (   &L1,   &L2 );
+    p448_addw (   &L1,     0 );
+    p448_sqr  (   &L0, &ext->x );
+    p448_add  (   &L2,   &L0,   &L1 );
+    p448_sqr  (   &L3, &ext->u );
+    p448_sqr  (   &L0, &ext->t );
+    p448_mul  (   &L1,   &L0,   &L3 );
+    p448_mulw (   &L0,   &L1, 39081 );
+    p448_neg  (   &L3,   &L0 );
+    p448_add  (   &L0,   &L3,   &L2 );
+    p448_neg  (   &L3,   &L1 );
+    p448_add  (   &L2,   &L3,   &L0 );
+    p448_sqr  (   &L1, &ext->z );
+    p448_add  (   &L0,   &L1,   &L2 );
+    p448_bias (   &L0,     4 );
+       L4 = p448_is_zero(   &L0 );
+    return    L5 &    L4;
 }
 
 mask_t
 validate_extensible (
     const struct extensible_t* ext
 ) {
-    mask_t L0, L1;
-    struct p448_t L2, L3, L4, L5;
+    mask_t L4, L5;
+    struct p448_t L0, L1, L2, L3;
     /*
      * Check invariant:
      * 0 = d*t^2*u^2 - x^2 - y^2 + z^2
      */
-    p448_sqr  (   &L4, &ext->y );
-    p448_neg  (   &L3,   &L4 );
-    p448_addw (   &L3,     0 );
-    p448_sqr  (   &L2, &ext->z );
-    p448_add  (   &L4,   &L2,   &L3 );
-    p448_sqr  (   &L5, &ext->u );
-    p448_sqr  (   &L2, &ext->t );
-    p448_mul  (   &L3,   &L2,   &L5 );
-    p448_mulw (   &L5,   &L3, 39081 );
-    p448_neg  (   &L2,   &L5 );
-    p448_add  (   &L3,   &L2,   &L4 );
-    p448_sqr  (   &L2, &ext->x );
-    p448_neg  (   &L4,   &L2 );
-    p448_add  (   &L2,   &L4,   &L3 );
-    p448_bias (   &L2,     4 );
-       L1 = p448_is_zero(   &L2 );
+    p448_sqr  (   &L2, &ext->y );
+    p448_neg  (   &L1,   &L2 );
+    p448_addw (   &L1,     0 );
+    p448_sqr  (   &L0, &ext->z );
+    p448_add  (   &L2,   &L0,   &L1 );
+    p448_sqr  (   &L3, &ext->u );
+    p448_sqr  (   &L0, &ext->t );
+    p448_mul  (   &L1,   &L0,   &L3 );
+    p448_mulw (   &L3,   &L1, 39081 );
+    p448_neg  (   &L0,   &L3 );
+    p448_add  (   &L1,   &L0,   &L2 );
+    p448_sqr  (   &L0, &ext->x );
+    p448_neg  (   &L2,   &L0 );
+    p448_add  (   &L0,   &L2,   &L1 );
+    p448_bias (   &L0,     4 );
+       L5 = p448_is_zero(   &L0 );
     /*
      * Check invariant:
      * 0 = -x*y + z*t*u
      */
-    p448_mul  (   &L3, &ext->t, &ext->u );
-    p448_mul  (   &L4, &ext->z,   &L3 );
-    p448_addw (   &L4,     0 );
-    p448_mul  (   &L2, &ext->x, &ext->y );
-    p448_neg  (   &L3,   &L2 );
-    p448_add  (   &L2,   &L3,   &L4 );
-    p448_bias (   &L2,     2 );
-       L0 = p448_is_zero(   &L2 );
-    return    L1 &    L0;
+    p448_mul  (   &L1, &ext->t, &ext->u );
+    p448_mul  (   &L2, &ext->z,   &L1 );
+    p448_addw (   &L2,     0 );
+    p448_mul  (   &L0, &ext->x, &ext->y );
+    p448_neg  (   &L1,   &L0 );
+    p448_add  (   &L0,   &L1,   &L2 );
+    p448_bias (   &L0,     2 );
+       L4 = p448_is_zero(   &L0 );
+    return    L5 &    L4;
 }
 
 
diff --git a/src/arch_neon/neon_emulation.h b/src/arch_neon/neon_emulation.h
index 6fecbc7..a97978c 100644
--- a/src/arch_neon/neon_emulation.h
+++ b/src/arch_neon/neon_emulation.h
@@ -8,9 +8,12 @@
  *
  * This lets you test and debug NEON code on x86.
  */
+
 #ifndef __NEON_EMULATION_H__
 #define __NEON_EMULATION_H__ 1
 
+/** @cond internal */
+
 #include "word.h"
 
 #include <stdint.h>
@@ -147,4 +150,6 @@ static inline int64x2_t vmull_lane_s32 (
     return xx*(lane?yy.yy:yy.xx);
 }
 
+/** @endcond */
+
 #endif /* __NEON_EMULATION_H__ */
diff --git a/src/arch_neon/p448.c b/src/arch_neon/p448.c
index 6cd78aa..fe69639 100644
--- a/src/arch_neon/p448.c
+++ b/src/arch_neon/p448.c
@@ -37,7 +37,7 @@ xx_vaddup_s64(int64x2_t x) {
 }
 #else
 #include "neon_emulation.h"
-#endif // ARM_NEON
+#endif /* ARM_NEON */
 
 static inline void __attribute__((gnu_inline,always_inline))
 smlal (
@@ -75,12 +75,6 @@ smull2 (
     *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
 }
 
-// static inline int64x2_t copy_now(int64x2_t x) {
-//     int64x2_t y;
-//     __asm__ ("vmov %0, %1" : "=w"(y) : "w"(x));
-//     return y;
-// }
-
 void
 p448_mul (
     p448_t *__restrict__ cs,
diff --git a/src/arch_x86_64/ec_point.c b/src/arch_x86_64/ec_point.c
index 87df79f..1fba091 100644
--- a/src/arch_x86_64/ec_point.c
+++ b/src/arch_x86_64/ec_point.c
@@ -356,51 +356,51 @@ serialize_montgomery (
     const struct montgomery_t* a,
     const struct p448_t*       sbz
 ) {
-    struct p448_t L0, L1, L2, L3;
-    mask_t L4, L5, L6;
-    p448_mul  (   &L3, &a->z0, &a->zd );
-    p448_sub  (   &L1,   &L3, &a->xd );
-    p448_bias (   &L1,     2 );
-    p448_mul  (   &L3, &a->za,   &L1 );
-    p448_mul  (   &L2, &a->z0, &a->xd );
-    p448_sub  (   &L1,   &L2, &a->zd );
-    p448_bias (   &L1,     2 );
-    p448_mul  (   &L2, &a->xa,   &L1 );
-    p448_add  (   &L1,   &L2,   &L3 );
-    p448_sub  (   &L0,   &L3,   &L2 );
-    p448_bias (   &L0,     2 );
-    p448_mul  (   &L3,   &L0,   &L1 );
-    p448_copy (   &L2, &a->z0 );
-    p448_addw (   &L2,     1 );
-    p448_sqr  (   &L1,   &L2 );
-    p448_mulw (   &L2,   &L1, 39082 );
-    p448_neg  (   &L1,   &L2 );
-    p448_add  (   &L0, &a->z0, &a->z0 );
-    p448_bias (   &L0,     1 );
-    p448_add  (   &L2,   &L0,   &L0 );
-    p448_add  (   &L0,   &L2,   &L1 );
-    p448_mul  (   &L2, &a->xd,   &L0 );
-       L5 = p448_is_zero( &a->zd );
-       L6 = -   L5;
-    p448_mask (   &L1,   &L2,    L5 );
-    p448_add  (   &L2,   &L1, &a->zd );
-       L4 = ~   L5;
-    p448_mul  (   &L1,   sbz,   &L3 );
-    p448_addw (   &L1,    L6 );
-    p448_mul  (   &L3,   &L2,   &L1 );
-    p448_mul  (   &L1,   &L3,   &L2 );
-    p448_mul  (   &L2,   &L3, &a->xd );
-    p448_mul  (   &L3,   &L1,   &L2 );
-    p448_isr  (   &L0,   &L3 );
-    p448_mul  (   &L2,   &L1,   &L0 );
-    p448_sqr  (   &L1,   &L0 );
-    p448_mul  (   &L0,   &L3,   &L1 );
-    p448_mask (     b,   &L2,    L4 );
-    p448_subw (   &L0,     1 );
-    p448_bias (   &L0,     1 );
-       L5 = p448_is_zero(   &L0 );
-       L4 = p448_is_zero(   sbz );
-    return    L5 |    L4;
+    mask_t L0, L1, L2;
+    struct p448_t L3, L4, L5, L6;
+    p448_mul  (   &L6, &a->z0, &a->zd );
+    p448_sub  (   &L4,   &L6, &a->xd );
+    p448_bias (   &L4,     2 );
+    p448_mul  (   &L6, &a->za,   &L4 );
+    p448_mul  (   &L5, &a->z0, &a->xd );
+    p448_sub  (   &L4,   &L5, &a->zd );
+    p448_bias (   &L4,     2 );
+    p448_mul  (   &L3, &a->xa,   &L4 );
+    p448_add  (   &L5,   &L3,   &L6 );
+    p448_sub  (   &L4,   &L6,   &L3 );
+    p448_bias (   &L4,     2 );
+    p448_mul  (   &L6,   &L4,   &L5 );
+    p448_copy (   &L5, &a->z0 );
+    p448_addw (   &L5,     1 );
+    p448_sqr  (   &L4,   &L5 );
+    p448_mulw (   &L5,   &L4, 39082 );
+    p448_neg  (   &L4,   &L5 );
+    p448_add  (   &L3, &a->z0, &a->z0 );
+    p448_bias (   &L3,     1 );
+    p448_add  (   &L5,   &L3,   &L3 );
+    p448_add  (   &L3,   &L5,   &L4 );
+    p448_mul  (   &L5, &a->xd,   &L3 );
+       L1 = p448_is_zero( &a->zd );
+       L2 = -   L1;
+    p448_mask (   &L4,   &L5,    L1 );
+    p448_add  (   &L5,   &L4, &a->zd );
+       L0 = ~   L1;
+    p448_mul  (   &L4,   sbz,   &L6 );
+    p448_addw (   &L4,    L2 );
+    p448_mul  (   &L6,   &L5,   &L4 );
+    p448_mul  (   &L4,   &L6,   &L5 );
+    p448_mul  (   &L5,   &L6, &a->xd );
+    p448_mul  (   &L6,   &L4,   &L5 );
+    p448_isr  (   &L3,   &L6 );
+    p448_mul  (   &L5,   &L4,   &L3 );
+    p448_sqr  (   &L4,   &L3 );
+    p448_mul  (   &L3,   &L6,   &L4 );
+    p448_mask (     b,   &L5,    L0 );
+    p448_subw (   &L3,     1 );
+    p448_bias (   &L3,     1 );
+       L1 = p448_is_zero(   &L3 );
+       L0 = p448_is_zero(   sbz );
+    return    L1 |    L0;
 }
 
 void
@@ -491,8 +491,8 @@ test_only_twist (
     struct tw_extensible_t*    b,
     const struct extensible_t* a
 ) {
-    struct p448_t L0, L1;
-    mask_t L2, L3;
+    mask_t L0, L1;
+    struct p448_t L2, L3;
     p448_sqr  ( &b->u, &a->z );
     p448_sqr  ( &b->y, &a->x );
     p448_sub  ( &b->z, &b->u, &b->y );
@@ -501,36 +501,36 @@ test_only_twist (
     p448_add  ( &b->u, &b->y, &b->y );
     p448_sub  ( &b->y, &a->z, &a->x );
     p448_bias ( &b->y,     2 );
-    p448_mul  ( &b->t, &b->y, &a->y );
+    p448_mul  ( &b->x, &b->y, &a->y );
     p448_sub  ( &b->z, &a->z, &a->y );
     p448_bias ( &b->z,     2 );
-    p448_mul  ( &b->x, &b->z, &b->t );
-    p448_mul  ( &b->t, &b->x, &b->u );
-    p448_mul  (   &L1, &b->x, &b->t );
-    p448_isr  ( &b->t,   &L1 );
-    p448_mul  ( &b->u, &b->x, &b->t );
-    p448_sqr  ( &b->x, &b->t );
-    p448_mul  ( &b->t,   &L1, &b->x );
-    p448_add  (   &L1, &a->y, &a->x );
-    p448_sub  (   &L0, &a->x, &a->y );
-    p448_bias (   &L0,     2 );
-    p448_mul  ( &b->x, &b->t,   &L0 );
-    p448_add  (   &L0, &b->x,   &L1 );
-    p448_sub  ( &b->t,   &L1, &b->x );
+    p448_mul  ( &b->t, &b->z, &b->x );
+    p448_mul  (   &L3, &b->t, &b->u );
+    p448_mul  ( &b->x, &b->t,   &L3 );
+    p448_isr  (   &L2, &b->x );
+    p448_mul  ( &b->u, &b->t,   &L2 );
+    p448_sqr  (   &L3,   &L2 );
+    p448_mul  ( &b->t, &b->x,   &L3 );
+    p448_add  (   &L3, &a->y, &a->x );
+    p448_sub  (   &L2, &a->x, &a->y );
+    p448_bias (   &L2,     2 );
+    p448_mul  ( &b->x, &b->t,   &L2 );
+    p448_add  (   &L2, &b->x,   &L3 );
+    p448_sub  ( &b->t,   &L3, &b->x );
     p448_bias ( &b->t,     2 );
-    p448_mul  ( &b->x,   &L0, &b->u );
-       L2 = p448_is_zero( &b->y );
-       L3 = -   L2;
-    p448_addw ( &b->x,    L3 );
+    p448_mul  ( &b->x,   &L2, &b->u );
+       L0 = p448_is_zero( &b->y );
+       L1 = -   L0;
+    p448_addw ( &b->x,    L1 );
     p448_weak_reduce( &b->x );
     p448_mul  ( &b->y, &b->t, &b->u );
-       L2 = p448_is_zero( &b->z );
-       L3 = -   L2;
-    p448_addw ( &b->y,    L3 );
+       L0 = p448_is_zero( &b->z );
+       L1 = -   L0;
+    p448_addw ( &b->y,    L1 );
     p448_weak_reduce( &b->y );
-       L3 = p448_is_zero( &a->y );
-       L2 =    L3 +     1;
-    p448_set_ui( &b->z,    L2 );
+       L1 = p448_is_zero( &a->y );
+       L0 =    L1 +     1;
+    p448_set_ui( &b->z,    L0 );
     p448_copy ( &b->t, &b->x );
     p448_copy ( &b->u, &b->y );
 }
@@ -539,16 +539,16 @@ mask_t
 is_square (
     const struct p448_t* x
 ) {
-    struct p448_t L0, L1;
-    mask_t L2, L3;
-    p448_isr  (   &L0,     x );
-    p448_sqr  (   &L1,   &L0 );
-    p448_mul  (   &L0,     x,   &L1 );
-    p448_subw (   &L0,     1 );
-    p448_bias (   &L0,     1 );
-       L3 = p448_is_zero(   &L0 );
-       L2 = p448_is_zero(     x );
-    return    L3 |    L2;
+    mask_t L0, L1;
+    struct p448_t L2, L3;
+    p448_isr  (   &L2,     x );
+    p448_sqr  (   &L3,   &L2 );
+    p448_mul  (   &L2,     x,   &L3 );
+    p448_subw (   &L2,     1 );
+    p448_bias (   &L2,     1 );
+       L1 = p448_is_zero(   &L2 );
+       L0 = p448_is_zero(     x );
+    return    L1 |    L0;
 }
 
 mask_t
@@ -700,15 +700,15 @@ eq_affine (
     const struct affine_t* a,
     const struct affine_t* b
 ) {
-    struct p448_t L0;
-    mask_t L1, L2;
-    p448_sub  (   &L0, &a->x, &b->x );
-    p448_bias (   &L0,     2 );
-       L2 = p448_is_zero(   &L0 );
-    p448_sub  (   &L0, &a->y, &b->y );
-    p448_bias (   &L0,     2 );
-       L1 = p448_is_zero(   &L0 );
-    return    L2 &    L1;
+    mask_t L0, L1;
+    struct p448_t L2;
+    p448_sub  (   &L2, &a->x, &b->x );
+    p448_bias (   &L2,     2 );
+       L1 = p448_is_zero(   &L2 );
+    p448_sub  (   &L2, &a->y, &b->y );
+    p448_bias (   &L2,     2 );
+       L0 = p448_is_zero(   &L2 );
+    return    L1 &    L0;
 }
 
 mask_t
@@ -716,19 +716,19 @@ eq_extensible (
     const struct extensible_t* a,
     const struct extensible_t* b
 ) {
-    struct p448_t L0, L1, L2;
-    mask_t L3, L4;
-    p448_mul  (   &L2, &b->z, &a->x );
-    p448_mul  (   &L1, &a->z, &b->x );
-    p448_sub  (   &L0,   &L2,   &L1 );
-    p448_bias (   &L0,     2 );
-       L4 = p448_is_zero(   &L0 );
-    p448_mul  (   &L2, &b->z, &a->y );
-    p448_mul  (   &L1, &a->z, &b->y );
-    p448_sub  (   &L0,   &L2,   &L1 );
-    p448_bias (   &L0,     2 );
-       L3 = p448_is_zero(   &L0 );
-    return    L4 &    L3;
+    mask_t L0, L1;
+    struct p448_t L2, L3, L4;
+    p448_mul  (   &L4, &b->z, &a->x );
+    p448_mul  (   &L3, &a->z, &b->x );
+    p448_sub  (   &L2,   &L4,   &L3 );
+    p448_bias (   &L2,     2 );
+       L1 = p448_is_zero(   &L2 );
+    p448_mul  (   &L4, &b->z, &a->y );
+    p448_mul  (   &L3, &a->z, &b->y );
+    p448_sub  (   &L2,   &L4,   &L3 );
+    p448_bias (   &L2,     2 );
+       L0 = p448_is_zero(   &L2 );
+    return    L1 &    L0;
 }
 
 mask_t
@@ -736,19 +736,19 @@ eq_tw_extensible (
     const struct tw_extensible_t* a,
     const struct tw_extensible_t* b
 ) {
-    struct p448_t L0, L1, L2;
-    mask_t L3, L4;
-    p448_mul  (   &L2, &b->z, &a->x );
-    p448_mul  (   &L1, &a->z, &b->x );
-    p448_sub  (   &L0,   &L2,   &L1 );
-    p448_bias (   &L0,     2 );
-       L4 = p448_is_zero(   &L0 );
-    p448_mul  (   &L2, &b->z, &a->y );
-    p448_mul  (   &L1, &a->z, &b->y );
-    p448_sub  (   &L0,   &L2,   &L1 );
-    p448_bias (   &L0,     2 );
-       L3 = p448_is_zero(   &L0 );
-    return    L4 &    L3;
+    mask_t L0, L1;
+    struct p448_t L2, L3, L4;
+    p448_mul  (   &L4, &b->z, &a->x );
+    p448_mul  (   &L3, &a->z, &b->x );
+    p448_sub  (   &L2,   &L4,   &L3 );
+    p448_bias (   &L2,     2 );
+       L1 = p448_is_zero(   &L2 );
+    p448_mul  (   &L4, &b->z, &a->y );
+    p448_mul  (   &L3, &a->z, &b->y );
+    p448_sub  (   &L2,   &L4,   &L3 );
+    p448_bias (   &L2,     2 );
+       L0 = p448_is_zero(   &L2 );
+    return    L1 &    L0;
 }
 
 void
@@ -756,53 +756,56 @@ elligator_2s_inject (
     struct affine_t*     a,
     const struct p448_t* r
 ) {
-    struct p448_t L0, L1, L2, L3, L4, L5, L6, L7;
-    mask_t L8, L9;
+    mask_t L0, L1;
+    struct p448_t L2, L3, L4, L5, L6, L7, L8;
     p448_sqr  ( &a->x,     r );
-    p448_sqr  (   &L1, &a->x );
-    p448_copy ( &a->y,   &L1 );
+    p448_sqr  (   &L3, &a->x );
+    p448_copy ( &a->y,   &L3 );
     p448_subw ( &a->y,     1 );
-    p448_neg  (   &L7, &a->y );
+    p448_neg  (   &L4, &a->y );
+    p448_bias (   &L4,     2 );
+    p448_sqr  (   &L2,   &L4 );
+    p448_mulw (   &L7,   &L2, 1527402724 );
+    p448_mulw (   &L8,   &L3, 6108985600 );
+    p448_add  ( &a->y,   &L8,   &L7 );
+    p448_mulw (   &L8,   &L2, 6109454568 );
+    p448_sub  (   &L7, &a->y,   &L8 );
     p448_bias (   &L7,     2 );
-    p448_sqr  (   &L0,   &L7 );
-    p448_mulw (   &L6,   &L0, 1527402724 );
-    p448_mulw (   &L5,   &L1, 6108985600 );
-    p448_add  ( &a->y,   &L5,   &L6 );
-    p448_mulw (   &L6,   &L0, 6109454568 );
-    p448_sub  (   &L5, &a->y,   &L6 );
-    p448_bias (   &L5,     2 );
-    p448_mulw (   &L2, &a->y, 78160 );
-    p448_mul  (   &L4,   &L5,   &L7 );
-    p448_mul  (   &L6,   &L4,   &L2 );
-    p448_mul  (   &L2,   &L5,   &L6 );
-    p448_isr  (   &L3,   &L2 );
-    p448_mul  (   &L2,   &L4,   &L3 );
-    p448_sqr  (   &L4,   &L3 );
-    p448_mul  (   &L3,   &L6,   &L4 );
-    p448_mul  (   &L6,   &L5,   &L3 );
-    p448_mul  (   &L5,   &L6,   &L3 );
-    p448_copy (   &L4, &a->x );
-    p448_subw (   &L4,     1 );
+    p448_mulw (   &L6, &a->y, 78160 );
+    p448_mul  (   &L5,   &L7,   &L6 );
+    p448_mul  (   &L8,   &L5,   &L4 );
+    p448_mul  (   &L4,   &L5,   &L6 );
+    p448_mul  (   &L5,   &L7,   &L8 );
+    p448_mul  (   &L8,   &L5,   &L4 );
+    p448_mul  (   &L4,   &L7,   &L8 );
+    p448_isr  (   &L6,   &L4 );
+    p448_mul  (   &L4,   &L5,   &L6 );
+    p448_sqr  (   &L5,   &L6 );
+    p448_mul  (   &L6,   &L8,   &L5 );
+    p448_mul  (   &L8,   &L7,   &L6 );
+    p448_mul  (   &L7,   &L8,   &L6 );
+    p448_copy (   &L6, &a->x );
+    p448_subw (   &L6,     1 );
     p448_addw ( &a->x,     1 );
-    p448_mul  (   &L3, &a->x,   &L6 );
-    p448_sub  ( &a->x,   &L4,   &L3 );
+    p448_mul  (   &L5, &a->x,   &L8 );
+    p448_sub  ( &a->x,   &L6,   &L5 );
     p448_bias ( &a->x,     3 );
-    p448_mul  (   &L3,   &L2, &a->x );
-    p448_mulw (   &L2,   &L3, 78160 );
-    p448_neg  ( &a->x,   &L2 );
+    p448_mul  (   &L5,   &L4, &a->x );
+    p448_mulw (   &L4,   &L5, 78160 );
+    p448_neg  ( &a->x,   &L4 );
     p448_bias ( &a->x,     2 );
     p448_weak_reduce( &a->x );
-    p448_add  (   &L2,   &L1,   &L1 );
-    p448_add  (   &L1,   &L2,   &L0 );
-    p448_subw (   &L1,     2 );
-    p448_bias (   &L1,     1 );
-    p448_mul  (   &L0,   &L1,   &L6 );
-    p448_mulw (   &L1,   &L0, 3054649120 );
-    p448_add  (   &L0,   &L1, &a->y );
-    p448_mul  ( &a->y,   &L5,   &L0 );
-       L9 = p448_is_zero(   &L7 );
-       L8 = -   L9;
-    p448_addw ( &a->y,    L8 );
+    p448_add  (   &L4,   &L3,   &L3 );
+    p448_add  (   &L3,   &L4,   &L2 );
+    p448_subw (   &L3,     2 );
+    p448_bias (   &L3,     1 );
+    p448_mul  (   &L2,   &L3,   &L8 );
+    p448_mulw (   &L3,   &L2, 3054649120 );
+    p448_add  (   &L2,   &L3, &a->y );
+    p448_mul  ( &a->y,   &L7,   &L2 );
+       L1 = p448_is_zero(   &L8 );
+       L0 = -   L1;
+    p448_addw ( &a->y,    L0 );
     p448_weak_reduce( &a->y );
 }
 
@@ -828,83 +831,83 @@ mask_t
 validate_tw_extensible (
     const struct tw_extensible_t* ext
 ) {
-    struct p448_t L0, L1, L2, L3;
-    mask_t L4, L5;
+    mask_t L0, L1;
+    struct p448_t L2, L3, L4, L5;
     /*
      * Check invariant:
      * 0 = -x*y + z*t*u
      */
-    p448_mul  (   &L0, &ext->t, &ext->u );
-    p448_mul  (   &L2, &ext->z,   &L0 );
-    p448_addw (   &L2,     0 );
-    p448_mul  (   &L1, &ext->x, &ext->y );
-    p448_neg  (   &L0,   &L1 );
-    p448_add  (   &L1,   &L0,   &L2 );
-    p448_bias (   &L1,     2 );
-       L5 = p448_is_zero(   &L1 );
+    p448_mul  (   &L2, &ext->t, &ext->u );
+    p448_mul  (   &L4, &ext->z,   &L2 );
+    p448_addw (   &L4,     0 );
+    p448_mul  (   &L3, &ext->x, &ext->y );
+    p448_neg  (   &L2,   &L3 );
+    p448_add  (   &L3,   &L2,   &L4 );
+    p448_bias (   &L3,     2 );
+       L1 = p448_is_zero(   &L3 );
     /*
      * Check invariant:
      * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
      */
-    p448_sqr  (   &L2, &ext->y );
-    p448_neg  (   &L0,   &L2 );
-    p448_addw (   &L0,     0 );
-    p448_sqr  (   &L1, &ext->x );
-    p448_add  (   &L2,   &L1,   &L0 );
-    p448_sqr  (   &L3, &ext->u );
-    p448_sqr  (   &L1, &ext->t );
-    p448_mul  (   &L0,   &L1,   &L3 );
-    p448_mulw (   &L1,   &L0, 39081 );
-    p448_neg  (   &L3,   &L1 );
-    p448_add  (   &L1,   &L3,   &L2 );
-    p448_neg  (   &L3,   &L0 );
-    p448_add  (   &L2,   &L3,   &L1 );
-    p448_sqr  (   &L1, &ext->z );
-    p448_add  (   &L0,   &L1,   &L2 );
-    p448_bias (   &L0,     4 );
-       L4 = p448_is_zero(   &L0 );
-    return    L5 &    L4;
+    p448_sqr  (   &L4, &ext->y );
+    p448_neg  (   &L2,   &L4 );
+    p448_addw (   &L2,     0 );
+    p448_sqr  (   &L3, &ext->x );
+    p448_add  (   &L4,   &L3,   &L2 );
+    p448_sqr  (   &L5, &ext->u );
+    p448_sqr  (   &L3, &ext->t );
+    p448_mul  (   &L2,   &L3,   &L5 );
+    p448_mulw (   &L3,   &L2, 39081 );
+    p448_neg  (   &L5,   &L3 );
+    p448_add  (   &L3,   &L5,   &L4 );
+    p448_neg  (   &L5,   &L2 );
+    p448_add  (   &L4,   &L5,   &L3 );
+    p448_sqr  (   &L3, &ext->z );
+    p448_add  (   &L2,   &L3,   &L4 );
+    p448_bias (   &L2,     4 );
+       L0 = p448_is_zero(   &L2 );
+    return    L1 &    L0;
 }
 
 mask_t
 validate_extensible (
     const struct extensible_t* ext
 ) {
-    struct p448_t L0, L1, L2, L3;
-    mask_t L4, L5;
+    mask_t L0, L1;
+    struct p448_t L2, L3, L4, L5;
     /*
      * Check invariant:
      * 0 = d*t^2*u^2 - x^2 - y^2 + z^2
      */
-    p448_sqr  (   &L2, &ext->y );
-    p448_neg  (   &L1,   &L2 );
-    p448_addw (   &L1,     0 );
-    p448_sqr  (   &L0, &ext->z );
-    p448_add  (   &L2,   &L0,   &L1 );
-    p448_sqr  (   &L3, &ext->u );
-    p448_sqr  (   &L0, &ext->t );
-    p448_mul  (   &L1,   &L0,   &L3 );
-    p448_mulw (   &L3,   &L1, 39081 );
-    p448_neg  (   &L0,   &L3 );
-    p448_add  (   &L1,   &L0,   &L2 );
-    p448_sqr  (   &L0, &ext->x );
-    p448_neg  (   &L2,   &L0 );
-    p448_add  (   &L0,   &L2,   &L1 );
-    p448_bias (   &L0,     4 );
-       L5 = p448_is_zero(   &L0 );
+    p448_sqr  (   &L4, &ext->y );
+    p448_neg  (   &L3,   &L4 );
+    p448_addw (   &L3,     0 );
+    p448_sqr  (   &L2, &ext->z );
+    p448_add  (   &L4,   &L2,   &L3 );
+    p448_sqr  (   &L5, &ext->u );
+    p448_sqr  (   &L2, &ext->t );
+    p448_mul  (   &L3,   &L2,   &L5 );
+    p448_mulw (   &L5,   &L3, 39081 );
+    p448_neg  (   &L2,   &L5 );
+    p448_add  (   &L3,   &L2,   &L4 );
+    p448_sqr  (   &L2, &ext->x );
+    p448_neg  (   &L4,   &L2 );
+    p448_add  (   &L2,   &L4,   &L3 );
+    p448_bias (   &L2,     4 );
+       L1 = p448_is_zero(   &L2 );
     /*
      * Check invariant:
      * 0 = -x*y + z*t*u
      */
-    p448_mul  (   &L1, &ext->t, &ext->u );
-    p448_mul  (   &L2, &ext->z,   &L1 );
-    p448_addw (   &L2,     0 );
-    p448_mul  (   &L0, &ext->x, &ext->y );
-    p448_neg  (   &L1,   &L0 );
-    p448_add  (   &L0,   &L1,   &L2 );
-    p448_bias (   &L0,     2 );
-       L4 = p448_is_zero(   &L0 );
-    return    L5 &    L4;
+    p448_mul  (   &L3, &ext->t, &ext->u );
+    p448_mul  (   &L4, &ext->z,   &L3 );
+    p448_addw (   &L4,     0 );
+    p448_mul  (   &L2, &ext->x, &ext->y );
+    p448_neg  (   &L3,   &L2 );
+    p448_add  (   &L2,   &L3,   &L4 );
+    p448_bias (   &L2,     2 );
+       L0 = p448_is_zero(   &L2 );
+    return    L1 &    L0;
 }
 
 
diff --git a/src/arch_x86_64/p448.c b/src/arch_x86_64/p448.c
index 4abc788..5e97812 100644
--- a/src/arch_x86_64/p448.c
+++ b/src/arch_x86_64/p448.c
@@ -180,9 +180,6 @@ p448_mulw (
 
     c[3] = accum0 & mask; accum0 >>= 56;
     c[7] = accum4 & mask; accum4 >>= 56;
-
-    // c[4] += accum0 + accum4;
-    // c[0] += accum4;
     
     accum0 += accum4 + c[4];
     c[4] = accum0 & mask;
diff --git a/src/crandom.c b/src/crandom.c
index e4a71d0..4b75f66 100644
--- a/src/crandom.c
+++ b/src/crandom.c
@@ -5,8 +5,11 @@
 
 /* Chacha random number generator code copied from crandom */
 
-#include "intrinsics.h"
 #include "crandom.h"
+#include "intrinsics.h"
+#include "config.h"
+#include "magic.h"
+
 #include <stdio.h>
 
 volatile unsigned int crandom_features = 0;
@@ -67,7 +70,7 @@ INTRINSIC u_int64_t rdrand(int abort_on_fail) {
         out = out << 32 | reg;
         return out;
     # else
-        abort(); // whut
+        abort(); /* whut */
     # endif
     } else {
         tries = 0;
@@ -296,9 +299,6 @@ crandom_chacha_expand(u_int64_t iv,
 #endif /* NEED_CONV */
 }
 
-/* "return 4", cf xkcd #221 */
-#define CRANDOM_MAGIC 0x72657475726e2034ull
-
 int
 crandom_init_from_file(
     struct crandom_state_t *state,
@@ -361,6 +361,52 @@ crandom_generate(
 
     int ret = 0;
 
+    /* 
+     * Addition 5/21/2014.
+     *
+     * If this is used in an application inside a VM, and the VM
+     * is snapshotted and restored, then crandom_generate() would
+     * produce the same output.
+     * 
+     * Of course, the real defense against this is "don't do that",
+     * but we mitigate it by the RDRAND and/or rdtsc() in the refilling
+     * code.  Since chacha is pseudorandom, when the attacker doesn't
+     * know the state, it's good enough if RDRAND/rdtsc() return
+     * different results.  However, if (part of) the request is filled
+     * from the buffer, this won't help.
+     *
+     * So, add a flag EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES which
+     * disables the buffer for requests of at least this size.
+     *
+     * Suggest EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES = 0, which
+     * disables the buffer.  But instead you can set it to say 16,
+     * so that pulls of at least 128 bits will be stirred.  This
+     * could still be a problem for eg 64-bit nonces, but those
+     * aren't entirely collision-resistant anyway.
+     *
+     * Heuristic: large requests are more likely to be 
+     * cryptographically important, and the buffer doesn't impact
+     * their performance as much.  So if the request is bigger
+     * than a certain size, just drop the buffer on the floor.
+     *
+     * This code isn't activated if state->reseed_interval == 0,
+     * because then the PRNG is deterministic anyway.
+     *
+     * TODO: sample 128 bits out of RDRAND() instead of 64 bits.
+     * TODO: option to completely remove the buffer and fill?
+     * FUTURE: come up with a less band-aid-y solution to this problem.
+     */
+#ifdef EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES
+    if (state->reseed_interval
+#if EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES > 0
+        /* #if'd out to avoid a -Wtype-limits warning from GCC when the cutoff is zero */
+        && length >= EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES
+#endif
+    ) {
+        state->fill = 0; /* drop buffered output; forces the fill <= 0 branch below */
+    }
+#endif
+    
     while (length) {
         if (unlikely(state->fill <= 0)) {
             uint64_t iv = 0;
diff --git a/src/goldilocks.c b/src/goldilocks.c
index 4314e46..440c8bd 100644
--- a/src/goldilocks.c
+++ b/src/goldilocks.c
@@ -32,73 +32,27 @@
 #define GOLDILOCKS_RANDOM_RESEEDS_MANDATORY 0
 #endif
 
-#define GOLDI_FIELD_WORDS ((GOLDI_FIELD_BITS+WORD_BITS-1)/(WORD_BITS))
 #define GOLDI_DIVERSIFY_BYTES 8
 
-/* FUTURE: auto.  MAGIC */
-const struct affine_t goldilocks_base_point = {
-    {{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7),
-       U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa),
-       U58LE(0x5f065c3c59d070), U58LE(0x3a6a26adf73324),
-       U58LE(0x1b4faff4609845), U58LE(0x297ea0ea2692ff)
-    }},
-    {{ 19 }}
-};
-
 /* These are just unique identifiers */
 static const char *G_INITING = "initializing";
 static const char *G_INITED = "initialized";
 static const char *G_FAILED = "failed to initialize";
 
-/* FUTURE: auto.  MAGIC */
-static const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = {
-    U64LE(0xdc873d6d54a7bb0d),
-    U64LE(0xde933d8d723a70aa),
-    U64LE(0x3bb124b65129c96f),
-    0x8335dc16
-};
-const struct barrett_prime_t goldi_q448 = {
-    GOLDI_FIELD_WORDS,
-    62 % WORD_BITS,
-    sizeof(goldi_q448_lo)/sizeof(goldi_q448_lo[0]),
-    goldi_q448_lo
-};
-
-/* MAGIC */
-static const struct p448_t
-sqrt_d_minus_1 = {{
-    U58LE(0xd2e21836749f46),
-    U58LE(0x888db42b4f0179),
-    U58LE(0x5a189aabdeea38),
-    U58LE(0x51e65ca6f14c06),
-    U58LE(0xa49f7b424d9770),
-    U58LE(0xdcac4628c5f656),
-    U58LE(0x49443b8748734a),
-    U58LE(0x12fec0c0b25b7a)
-}};
-
 struct goldilocks_precomputed_public_key_t {
     struct goldilocks_public_key_t pub;
     struct fixed_base_table_t table;
 };
 
-#ifndef USE_BIG_TABLES
-#if __ARM_NEON__
-#define USE_BIG_TABLES 1
-#else
-#define USE_BIG_TABLES (WORD_BITS==64)
-#endif
-#endif
-
-/* FUTURE: auto.  MAGIC */
-struct {
+/* FUTURE: auto. */
+static struct {
     const char * volatile state;
 #if GOLDILOCKS_USE_PTHREAD
     pthread_mutex_t mutex;
 #endif
-    struct tw_niels_t combs[USE_BIG_TABLES ? 80 : 64];
+    struct tw_niels_t combs[COMB_N << (COMB_T-1)];
     struct fixed_base_table_t fixed_base;
-    struct tw_niels_t wnafs[32];
+    struct tw_niels_t wnafs[1<<WNAF_PRECMP_BITS];
     struct crandom_state_t rand;
 } goldilocks_global;
 
@@ -136,18 +90,23 @@ goldilocks_init () {
     
     /* Precompute the tables. */
     mask_t succ;
-    
-    int big = USE_BIG_TABLES;
-    uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14;
 
-    succ =  precompute_fixed_base(&goldilocks_global.fixed_base, &text, n, t, s, goldilocks_global.combs);
-    succ &= precompute_fixed_base_wnaf(goldilocks_global.wnafs, &text, 5);
+    succ =  precompute_fixed_base(&goldilocks_global.fixed_base, &text,
+        COMB_N, COMB_T, COMB_S, goldilocks_global.combs);
+    succ &= precompute_fixed_base_wnaf(goldilocks_global.wnafs, &text, WNAF_PRECMP_BITS);
     
     int criff_res = crandom_init_from_file(&goldilocks_global.rand,
         GOLDILOCKS_RANDOM_INIT_FILE,
         GOLDILOCKS_RANDOM_RESEED_INTERVAL,
         GOLDILOCKS_RANDOM_RESEEDS_MANDATORY);
         
+#ifdef SUPERCOP_WONT_LET_ME_OPEN_FILES
+    if (criff_res == EMFILE) {
+        crandom_init_from_buffer(&goldilocks_global.rand, "SUPERCOP won't let me open files");
+        criff_res = 0;
+    }
+#endif
+        
     if (succ & !criff_res) {
         if (!bool_compare_and_swap(&goldilocks_global.state, G_INITING, G_INITED)) {
             abort();
@@ -182,20 +141,20 @@ goldilocks_derive_private_key (
     
     struct sha512_ctx_t ctx;
     struct tw_extensible_t exta;
-    struct p448_t pk;
+    struct field_t pk;
     
     sha512_init(&ctx);
     sha512_update(&ctx, (const unsigned char *)"derivepk", GOLDI_DIVERSIFY_BYTES);
     sha512_update(&ctx, proto, GOLDI_SYMKEY_BYTES);
     sha512_final(&ctx, (unsigned char *)skb);
 
-    barrett_deserialize_and_reduce(sk, skb, SHA512_OUTPUT_BYTES, &goldi_q448);
+    barrett_deserialize_and_reduce(sk, skb, SHA512_OUTPUT_BYTES, &curve_prime_order);
     barrett_serialize(privkey->opaque, sk, GOLDI_FIELD_BYTES);
 
     scalarmul_fixed_base(&exta, sk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base);
     untwist_and_double_and_serialize(&pk, &exta);
     
-    p448_serialize(&privkey->opaque[GOLDI_FIELD_BYTES], &pk);
+    field_serialize(&privkey->opaque[GOLDI_FIELD_BYTES], &pk);
     
     return GOLDI_EOK;
 }
@@ -245,11 +204,11 @@ goldilocks_private_to_public (
     struct goldilocks_public_key_t *pubkey,
     const struct goldilocks_private_key_t *privkey
 ) {
-    struct p448_t pk;
-    mask_t msucc = p448_deserialize(&pk,&privkey->opaque[GOLDI_FIELD_BYTES]);
+    struct field_t pk;
+    mask_t msucc = field_deserialize(&pk,&privkey->opaque[GOLDI_FIELD_BYTES]);
     
     if (msucc) {
-        p448_serialize(pubkey->opaque, &pk);
+        field_serialize(pubkey->opaque, &pk);
         return GOLDI_EOK;
     } else {
         return GOLDI_ECORRUPT;
@@ -270,18 +229,18 @@ goldilocks_shared_secret_core (
     assert(GOLDI_SHARED_SECRET_BYTES == SHA512_OUTPUT_BYTES);
     
     word_t sk[GOLDI_FIELD_WORDS];
-    struct p448_t pk;
+    struct field_t pk;
     
-    mask_t succ = p448_deserialize(&pk,your_pubkey->opaque), msucc = -1;
+    mask_t succ = field_deserialize(&pk,your_pubkey->opaque), msucc = -1;
     
 #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS
-    struct p448_t sum, prod;
-    msucc &= p448_deserialize(&sum,&my_privkey->opaque[GOLDI_FIELD_BYTES]);
-    p448_mul(&prod,&pk,&sum);
-    p448_add(&sum,&pk,&sum);
+    struct field_t sum, prod;
+    msucc &= field_deserialize(&sum,&my_privkey->opaque[GOLDI_FIELD_BYTES]);
+    field_mul(&prod,&pk,&sum);
+    field_add(&sum,&pk,&sum);
 #endif
     
-    msucc &= barrett_deserialize(sk,my_privkey->opaque,&goldi_q448);
+    msucc &= barrett_deserialize(sk,my_privkey->opaque,&curve_prime_order);
     
 #if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
     if (pre) {
@@ -297,7 +256,7 @@ goldilocks_shared_secret_core (
 #endif
     
     
-    p448_serialize(shared,&pk);
+    field_serialize(shared,&pk);
     
     /* obliterate records of our failure by adjusting with obliteration key */
     struct sha512_ctx_t ctx;
@@ -318,9 +277,9 @@ goldilocks_shared_secret_core (
 #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS
     /* stir in the sum and product of the pubkeys. */
     uint8_t a_pk[GOLDI_FIELD_BYTES];
-    p448_serialize(a_pk, &sum);
+    field_serialize(a_pk, &sum);
     sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES);
-    p448_serialize(a_pk, &prod);
+    field_serialize(a_pk, &prod);
     sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES);
 #endif
        
@@ -363,7 +322,7 @@ goldilocks_derive_challenge(
     sha512_update(&ctx, gnonce, GOLDI_FIELD_BYTES);
     sha512_update(&ctx, message, message_len);
     sha512_final(&ctx, sha_out);
-    barrett_deserialize_and_reduce(challenge, sha_out, sizeof(sha_out), &goldi_q448);
+    barrett_deserialize_and_reduce(challenge, sha_out, sizeof(sha_out), &curve_prime_order);
 }
 
 int
@@ -379,7 +338,7 @@ goldilocks_sign (
     
     /* challenge = H(pk, [nonceG], message). */
     word_t skw[GOLDI_FIELD_WORDS];
-    mask_t succ = barrett_deserialize(skw,privkey->opaque,&goldi_q448);
+    mask_t succ = barrett_deserialize(skw,privkey->opaque,&curve_prime_order);
     if (!succ) {
         memset(skw,0,sizeof(skw));
         return GOLDI_ECORRUPT;
@@ -395,16 +354,16 @@ goldilocks_sign (
     sha512_update(&ctx, message, message_len);
     sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES);
     sha512_final(&ctx, sha_out);
-    barrett_deserialize_and_reduce(tk, sha_out, SHA512_OUTPUT_BYTES, &goldi_q448);
+    barrett_deserialize_and_reduce(tk, sha_out, SHA512_OUTPUT_BYTES, &curve_prime_order);
     
     /* 4[nonce]G */
     uint8_t signature_tmp[GOLDI_FIELD_BYTES];
     struct tw_extensible_t exta;
-    struct p448_t gsk;
+    struct field_t gsk;
     scalarmul_fixed_base(&exta, tk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base);
     double_tw_extensible(&exta);
     untwist_and_double_and_serialize(&gsk, &exta);
-    p448_serialize(signature_tmp, &gsk);
+    field_serialize(signature_tmp, &gsk);
     
     word_t challenge[GOLDI_FIELD_WORDS];
     goldilocks_derive_challenge (
@@ -415,18 +374,18 @@ goldilocks_sign (
         message_len
     );
     
-    // reduce challenge and sub.
-    barrett_negate(challenge,GOLDI_FIELD_WORDS,&goldi_q448);
+    /* reduce challenge and sub. */
+    barrett_negate(challenge,GOLDI_FIELD_WORDS,&curve_prime_order);
 
     barrett_mac(
         tk,GOLDI_FIELD_WORDS,
         challenge,GOLDI_FIELD_WORDS,
         skw,GOLDI_FIELD_WORDS,
-        &goldi_q448
+        &curve_prime_order
     );
         
     word_t carry = add_nr_ext_packed(tk,tk,GOLDI_FIELD_WORDS,tk,GOLDI_FIELD_WORDS,-1);
-    barrett_reduce(tk,GOLDI_FIELD_WORDS,carry,&goldi_q448);
+    barrett_reduce(tk,GOLDI_FIELD_WORDS,carry,&curve_prime_order);
         
     memcpy(signature_out, signature_tmp, GOLDI_FIELD_BYTES);
     barrett_serialize(signature_out+GOLDI_FIELD_BYTES, tk, GOLDI_FIELD_BYTES);
@@ -454,23 +413,23 @@ goldilocks_verify (
         return GOLDI_EUNINIT;
     }
     
-    struct p448_t pk;
+    struct field_t pk;
     word_t s[GOLDI_FIELD_WORDS];
     
-    mask_t succ = p448_deserialize(&pk,pubkey->opaque);
+    mask_t succ = field_deserialize(&pk,pubkey->opaque);
     if (!succ) return GOLDI_EINVAL;
     
-    succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &goldi_q448);
+    succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &curve_prime_order);
     if (!succ) return GOLDI_EINVAL;
     
     word_t challenge[GOLDI_FIELD_WORDS];
     goldilocks_derive_challenge(challenge, pubkey->opaque, signature, message, message_len);
     
-    struct p448_t eph;
+    struct field_t eph;
     struct tw_extensible_t pk_text;
     
     /* deserialize [nonce]G */
-    succ = p448_deserialize(&eph, signature);
+    succ = field_deserialize(&eph, signature);
     if (!succ) return GOLDI_EINVAL;
     
     succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk);
@@ -479,13 +438,13 @@ goldilocks_verify (
     linear_combo_var_fixed_vt( &pk_text,
         challenge, GOLDI_SCALAR_BITS,
         s, GOLDI_SCALAR_BITS,
-        goldilocks_global.wnafs, 5 );
+        goldilocks_global.wnafs, WNAF_PRECMP_BITS );
     
     untwist_and_double_and_serialize( &pk, &pk_text );
-    p448_sub(&eph, &eph, &pk);
-    p448_bias(&eph, 2);
+    field_sub(&eph, &eph, &pk);
+    field_bias(&eph, 2);
     
-    succ = p448_is_zero(&eph);
+    succ = field_is_zero(&eph);
     
     return succ ? 0 : GOLDI_EINVAL;
 }
@@ -504,8 +463,8 @@ goldilocks_precompute_public_key (
     
     struct tw_extensible_t pk_text;
     
-    struct p448_t pk;
-    mask_t succ = p448_deserialize(&pk, pub->opaque);
+    struct field_t pk;
+    mask_t succ = field_deserialize(&pk, pub->opaque);
     if (!succ) {
         free(precom);
         return NULL;
@@ -516,11 +475,9 @@ goldilocks_precompute_public_key (
         free(precom);
         return NULL;
     }
-    
-    int big = USE_BIG_TABLES;
-    uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14;
 
-    succ =  precompute_fixed_base(&precom->table, &pk_text, n, t, s, NULL);
+    succ =  precompute_fixed_base(&precom->table, &pk_text,
+        COMB_N, COMB_T, COMB_S, NULL);
     if (!succ) {
         free(precom);
         return NULL;
@@ -553,17 +510,17 @@ goldilocks_verify_precomputed (
     }
 
     word_t s[GOLDI_FIELD_WORDS];
-    mask_t succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &goldi_q448);
+    mask_t succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &curve_prime_order);
     if (!succ) return GOLDI_EINVAL;
     
     word_t challenge[GOLDI_FIELD_WORDS];
     goldilocks_derive_challenge(challenge, pubkey->pub.opaque, signature, message, message_len);
     
-    struct p448_t eph, pk;
+    struct field_t eph, pk;
     struct tw_extensible_t pk_text;
     
     /* deserialize [nonce]G */
-    succ = p448_deserialize(&eph, signature);
+    succ = field_deserialize(&eph, signature);
     if (!succ) return GOLDI_EINVAL;
         
     succ = linear_combo_combs_vt (
@@ -574,10 +531,10 @@ goldilocks_verify_precomputed (
     if (!succ) return GOLDI_EINVAL;
     
     untwist_and_double_and_serialize( &pk, &pk_text );
-    p448_sub(&eph, &eph, &pk);
-    p448_bias(&eph, 2);
+    field_sub(&eph, &eph, &pk);
+    field_bias(&eph, 2);
     
-    succ = p448_is_zero(&eph);
+    succ = field_is_zero(&eph);
     
     return succ ? 0 : GOLDI_EINVAL;
 }
@@ -596,5 +553,5 @@ goldilocks_shared_secret_precomputed (
     );
 }
 
-#endif // GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
+#endif /* GOLDI_IMPLEMENT_PRECOMPUTED_KEYS */
 
diff --git a/src/include/api.h b/src/include/api.h
new file mode 100644
index 0000000..cc20246
--- /dev/null
+++ b/src/include/api.h
@@ -0,0 +1,190 @@
+/**
+ * @file api.h
+ * @copyright
+ *   Copyright (c) 2014 Cryptography Research, Inc.  \n
+ *   Released under the MIT License.  See LICENSE.txt for license information.
+ * @author Mike Hamburg
+ * @brief BATMAN / SUPERCOP glue for benchmarking.
+ */
+
+#include <string.h>
+#include "goldilocks.h"
+
+#define PUBLICKEY_BYTES GOLDI_PUBLIC_KEY_BYTES
+#define SECRETKEY_BYTES GOLDI_PRIVATE_KEY_BYTES
+#define SIGNATURE_BYTES GOLDI_SIGNATURE_BYTES
+#define SHAREDSECRET_BYTES GOLDI_SHARED_SECRET_BYTES
+
+#define crypto_dh_PUBLICKEYBYTES PUBLICKEY_BYTES
+#define crypto_dh_SECRETKEYBYTES SECRETKEY_BYTES
+#define PRIVATEKEY_BYTES SECRETKEY_BYTES
+#define crypto_dh_BYTES SHAREDSECRET_BYTES
+#define crypto_dh_IMPLEMENTATION "AMD64"
+#define crypto_dh_VERSION "2014-07-11"
+
+#define crypto_sign_PUBLICKEYBYTES PUBLICKEY_BYTES
+#define crypto_sign_SECRETKEYBYTES SECRETKEY_BYTES
+#define crypto_sign_IMPLEMENTATION "AMD64"
+#define crypto_sign_VERSION "2014-07-11"
+#define crypto_sign_BYTES SIGNATURE_BYTES
+
+#define CRYPTO_DETERMINISTIC 1
+
+/*
+#ifndef LOOPS
+#define LOOPS 512
+#endif
+*/
+
+/* Self-report hooks, presumably for BATMAN/SUPERCOP; (void) makes them prototypes. */
+static inline int timingattacks(void) { return 0; }
+static inline int copyrightclaims(void) { return 0; }
+static inline int patentclaims(void) {
+    /* Until the end of July 2014, point compression is patented. */
+    return 20;
+}
+
+#define crypto_sign_keypair crypto_dh_keypair
+/** Init the library and generate a keypair into pk/sk; returns 0, or the init error. */
+static inline int crypto_dh_keypair (
+    unsigned char pk[PUBLICKEY_BYTES],
+    unsigned char sk[SECRETKEY_BYTES]
+) {
+  /* GOLDI_EALREADYINIT from goldilocks_init() is tolerated, not a failure. */
+  int ret = goldilocks_init();
+  if (ret && ret != GOLDI_EALREADYINIT) return ret;
+  if (goldilocks_keygen(
+      (struct goldilocks_private_key_t *)sk,
+      (struct goldilocks_public_key_t *)pk
+  )) abort();
+  return 0;
+}
+
+/** Length-reporting keygen: fills sk/pk and their lengths; aborts on failure. */
+static inline void keypair (
+    unsigned char sk[SECRETKEY_BYTES],
+    unsigned long long *sklen,
+    unsigned char pk[PUBLICKEY_BYTES],
+    unsigned long long *pklen
+) {
+    /* As in crypto_dh_keypair, GOLDI_EALREADYINIT is not a failure; it is
+     * presumably what repeat calls return, so aborting on it breaks reuse. */
+    int ret = goldilocks_init();
+    if (ret && ret != GOLDI_EALREADYINIT) abort();
+    if (goldilocks_keygen(
+        (struct goldilocks_private_key_t *)sk,
+        (struct goldilocks_public_key_t *)pk
+    )) abort();
+    *sklen = SECRETKEY_BYTES;
+    *pklen = PUBLICKEY_BYTES;
+}
+
+/** Sign m with sk; writes signature || message to sm and its length to *smlen. */
+static inline int crypto_sign (
+    unsigned char *sm,
+    unsigned long long *smlen,
+    const unsigned char *m,
+    unsigned long long mlen,
+    const unsigned char sk[SECRETKEY_BYTES]
+) {
+    const struct goldilocks_private_key_t *key =
+        (const struct goldilocks_private_key_t *)sk;
+
+    /* A signing failure is fatal; this wrapper always returns 0. */
+    if (goldilocks_sign(sm, m, mlen, key) != 0) abort();
+    /* The message is appended SIGNATURE_BYTES into sm. */
+    memcpy(sm + SIGNATURE_BYTES, m, mlen);
+    *smlen = mlen + SIGNATURE_BYTES;
+    return 0;
+}
+
+/** Sign m with sk into sm (signature, then message); aborts on any failure. */
+static inline void signmessage (
+    unsigned char *sm,
+    unsigned long long *smlen,
+    const unsigned char *m,
+    unsigned long long mlen,
+    const unsigned char sk[SECRETKEY_BYTES],
+    unsigned long long sklen
+) {
+    const struct goldilocks_private_key_t *key =
+        (const struct goldilocks_private_key_t *)sk;
+
+    /* A wrong-sized key or a signing error is fatal. */
+    if (sklen != PRIVATEKEY_BYTES) abort();
+    if (goldilocks_sign(sm, m, mlen, key) != 0) abort();
+
+    /* Append the message SIGNATURE_BYTES into sm and report the total. */
+    memcpy(sm + SIGNATURE_BYTES, m, mlen);
+    *smlen = mlen + SIGNATURE_BYTES;
+}
+
+/** Open sm = signature || message: if valid, copy message to m, set *mlen, return 0; else -1. */
+static inline int crypto_sign_open (
+    unsigned char *m, unsigned long long *mlen,
+    const unsigned char *sm, unsigned long long smlen,
+    const unsigned char pk[PUBLICKEY_BYTES]
+) {
+    /* Too short to hold a signature: smlen - SIGNATURE_BYTES would wrap around. */
+    if (smlen < SIGNATURE_BYTES) return -1;
+    int ret = goldilocks_verify(
+        sm, sm + SIGNATURE_BYTES, smlen - SIGNATURE_BYTES,
+        (const struct goldilocks_public_key_t *)pk
+    );
+    if (ret) return -1;
+    *mlen = smlen - SIGNATURE_BYTES;
+    memmove(m, sm + SIGNATURE_BYTES, *mlen); /* memmove: m may overlap sm */
+    return 0;
+}
+
+/** Verify the signature at the start of sm over m; returns 0 if valid, else -1. */
+static inline int verification (
+    const unsigned char *m,
+    unsigned long long mlen,
+    const unsigned char *sm,
+    unsigned long long smlen,
+    const unsigned char pk[PUBLICKEY_BYTES],
+    unsigned long long pklen
+) {
+    if (pklen != PUBLICKEY_BYTES) abort();
+    /* sm must hold a whole signature, or the verifier reads past its end. */
+    if (smlen < SIGNATURE_BYTES) return -1;
+    return goldilocks_verify(
+        sm, m, mlen, (const struct goldilocks_public_key_t *)pk
+    ) ? -1 : 0;
+}
+
+
+/** Derive the shared secret s from sk and pk; returns the library's status code. */
+static inline int crypto_dh (
+    unsigned char s[SHAREDSECRET_BYTES],
+    const unsigned char sk[SECRETKEY_BYTES],
+    const unsigned char pk[PUBLICKEY_BYTES]
+) {
+    const struct goldilocks_private_key_t *mine = (const void *)sk;
+    const struct goldilocks_public_key_t *theirs = (const void *)pk;
+
+    return goldilocks_shared_secret(s, mine, theirs);
+}
+
+/** Length-checked ECDH: on success sets *slen and returns 0; returns -1 on failure. */
+static inline int sharedsecret (
+    unsigned char s[SHAREDSECRET_BYTES],
+    unsigned long long *slen,
+    const unsigned char sk[SECRETKEY_BYTES],
+    unsigned long long sklen,
+    const unsigned char pk[PUBLICKEY_BYTES],
+    unsigned long long pklen
+) {
+    const struct goldilocks_private_key_t *mine = (const void *)sk;
+    const struct goldilocks_public_key_t *theirs = (const void *)pk;
+
+    /* Mis-sized keys abort rather than return an error. */
+    if (pklen != PUBLICKEY_BYTES) abort();
+    if (sklen != SECRETKEY_BYTES) abort();
+
+    if (goldilocks_shared_secret(s, mine, theirs) != 0) return -1;
+    *slen = SHAREDSECRET_BYTES;
+    return 0;
+}
+
diff --git a/src/include/barrett_field.h b/src/include/barrett_field.h
index 9d8f930..1187138 100644
--- a/src/include/barrett_field.h
+++ b/src/include/barrett_field.h
@@ -32,7 +32,7 @@ struct barrett_prime_t {
 /**
  * The Goldilocks prime.  I'm not sure this is the right place for it, but oh well.
  */
-extern const struct barrett_prime_t goldi_q448;
+extern const struct barrett_prime_t curve_prime_order;
 
 /**
  * Reduce a number (with optional high carry word) mod p.
diff --git a/src/include/config.h b/src/include/config.h
index dbd785d..ca6da24 100644
--- a/src/include/config.h
+++ b/src/include/config.h
@@ -1,8 +1,64 @@
+/**
+ * @file config.h
+ * @copyright
+ *   Copyright (c) 2014 Cryptography Research, Inc.  \n
+ *   Released under the MIT License.  See LICENSE.txt for license information.
+ * @author Mike Hamburg
+ * @brief Goldilocks top-level configuration flags.
+ */
+
 #ifndef __GOLDILOCKS_CONFIG_H__
 #define __GOLDILOCKS_CONFIG_H__ 1
 
+/** @brief crandom architecture detection.
+ * With this flag set to 1, crandom will assume that any flag
+ * supported by -march and friends (MIGHT_HAVE) will actually
+ * be available on the target machine (MUST_HAVE), instead of
+ * trying to detect it.
+ *
+ * Without this flag, crandom can detect, eg, that while -mavx
+ * was passed, the current machine doesn't support AVX, and can
+ * fall back to SSE2 or whatever.  But the rest of the
+ * Goldilocks code doesn't support this, so it'll still crash
+ * with an illegal instruction error.
+ *
+ * Setting this flag will make the library smaller.
+ */
+#define CRANDOM_MIGHT_IS_MUST           1
+
+/**
+ * @brief Causes crandom to refuse to buffer requests bigger
+ * than this size.  Setting 0 disables buffering for all
+ * requests, which hurts performance.
+ *
+ * The advantage is that if a user process forks or is VM-
+ * snapshotted, the buffer is not adjusted (FUTURE).  However,
+ * with the buffer disabled, the refresh routines will stir
+ * in entropy from RDTSC and/or RDRAND, making this operation
+ * mostly-safe.
+ */
+#define EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES 0
+
+/**
+ * @brief Goldilocks uses libpthread mutexes to provide
+ * thread-safety.  If you disable this flag, it won't link
+ * libpthread, but it won't be thread-safe either.
+ */
 #define GOLDILOCKS_USE_PTHREAD          1
+
+/**
+ * @brief Experiment to change the hash inputs for ECDH,
+ * in a way that obliterates the result -- overwriting it with
+ * a safe pseudorandom value -- if the public key is invalid.
+ * That way users who ignore the status result won't be
+ * exposed to invalid key attacks. 
+ */
 #define EXPERIMENT_ECDH_OBLITERATE_CT   1
+
+/**
+ * @brief ECDH adds public keys into the hash, to prevent
+ * esoteric attacks.
+ */
 #define EXPERIMENT_ECDH_STIR_IN_PUBKEYS 1
 
-#endif // __GOLDILOCKS_CONFIG_H__
+#endif /* __GOLDILOCKS_CONFIG_H__ */
diff --git a/src/include/crandom.h b/src/include/crandom.h
index f603f13..90cc374 100644
--- a/src/include/crandom.h
+++ b/src/include/crandom.h
@@ -12,6 +12,7 @@
 #ifndef __GOLDI_CRANDOM_H__
 #define __GOLDI_CRANDOM_H__ 1
 
+#define _XOPEN_SOURCE 600
 #include <stdint.h>  /* for uint64_t */
 #include <fcntl.h>   /* for open */
 #include <errno.h>   /* for returning errors after open */
diff --git a/src/include/field.h b/src/include/field.h
new file mode 100644
index 0000000..6231aba
--- /dev/null
+++ b/src/include/field.h
@@ -0,0 +1,30 @@
+/**
+ * @file field.h
+ * @brief Field switch code.
+ * @copyright
+ *   Copyright (c) 2014 Cryptography Research, Inc.  \n
+ *   Released under the MIT License.  See LICENSE.txt for license information.
+ * @author Mike Hamburg
+ */
+#ifndef __FIELD_H__
+#define __FIELD_H__
+#include "magic.h"
+
+#include "p448.h"
+
+#define field_t              p448_t
+#define field_mul            p448_mul
+#define field_add            p448_add
+#define field_sub            p448_sub
+#define field_bias           p448_bias
+#define field_copy           p448_copy
+#define field_weak_reduce    p448_weak_reduce
+#define field_strong_reduce  p448_strong_reduce
+#define field_cond_swap      p448_cond_swap
+#define field_cond_neg       p448_cond_neg
+#define field_serialize      p448_serialize
+#define field_deserialize    p448_deserialize
+#define field_is_zero        p448_is_zero
+#define simultaneous_invert  simultaneous_invert_p448 /* FUTURE: consistency */
+
+#endif /* __FIELD_H__ */
diff --git a/src/include/intrinsics.h b/src/include/intrinsics.h
index 1dac686..1b39eb5 100644
--- a/src/include/intrinsics.h
+++ b/src/include/intrinsics.h
@@ -11,25 +11,27 @@
 #define __CRANDOM_INTRINSICS_H__ 1
 
 #include <sys/types.h>
+#include "config.h"
 
 #if __i386__ || __x86_64__
 #include <immintrin.h>
 #endif
 
+/** @brief Macro to make a function static, forcibly inlined and possibly unused. */
 #define INTRINSIC \
   static __inline__ __attribute__((__gnu_inline__, __always_inline__, unused))
 
-#define GEN    1
-#define SSE2   2
-#define SSSE3  4
-#define AESNI  8
-#define XOP    16
-#define AVX    32
-#define AVX2   64
-#define RDRAND 128
+#define GEN    1     /**< @brief Intrinsics field has been generated. */
+#define SSE2   2     /**< @brief Machine supports SSE2 */
+#define SSSE3  4     /**< @brief Machine supports SSSE3 (for shuffles) */
+#define AESNI  8     /**< @brief Machine supports Intel AES-NI */
+#define XOP    16    /**< @brief Machine supports AMD XOP */
+#define AVX    32    /**< @brief Machine supports Intel AVX (for masking)  */
+#define AVX2   64    /**< @brief Machine supports Intel AVX2 (for bignums) */
+#define RDRAND 128   /**< @brief Machine supports Intel RDRAND */
 
 /**
- * If on x86, read the timestamp counter.  Otherwise, return 0.
+ * @brief If on x86, read the timestamp counter.  Otherwise, return 0.
  */
 INTRINSIC u_int64_t rdtsc() {
   u_int64_t out = 0;
@@ -53,6 +55,8 @@ INTRINSIC u_int64_t opacify(u_int64_t x) {
   return x;
 }
 
+
+/** @cond internal */
 #ifdef __AVX2__
 #  define MIGHT_HAVE_AVX2 1
 #  ifndef MUST_HAVE_AVX2
@@ -92,10 +96,6 @@ INTRINSIC u_int64_t opacify(u_int64_t x) {
 #  define pslldq _mm_slli_epi32
 #  define pshufd _mm_shuffle_epi32
 
-INTRINSIC ssereg sse2_rotate(int r, ssereg a) {
-  return _mm_slli_epi32(a, r) ^ _mm_srli_epi32(a, 32-r);
-}
-
 #else
 #  define MIGHT_HAVE_SSE2 0
 #  define MUST_HAVE_SSE2  0
@@ -127,11 +127,6 @@ INTRINSIC ssereg sse2_rotate(int r, ssereg a) {
 #  ifndef MUST_HAVE_XOP
 #    define MUST_HAVE_XOP 0
 #  endif
-INTRINSIC ssereg xop_rotate(int amount, ssereg x) {
-  ssereg out;
-  __asm__ ("vprotd %1, %2, %0" : "=x"(out) : "x"(x), "g"(amount));
-  return out;
-}
 #else
 #  define MIGHT_HAVE_XOP 0
 #  define MUST_HAVE_XOP 0
@@ -146,6 +141,9 @@ INTRINSIC ssereg xop_rotate(int amount, ssereg x) {
   | RDRAND * MIGHT_HAVE_RDRAND \
   | AVX2   * MIGHT_HAVE_AVX2)
 
+#if CRANDOM_MIGHT_IS_MUST
+#define MUST_MASK MIGHT_MASK
+#else
 #define MUST_MASK \
   ( SSE2   * MUST_HAVE_SSE2   \
   | SSSE3  * MUST_HAVE_SSSE3  \
@@ -154,22 +152,58 @@ INTRINSIC ssereg xop_rotate(int amount, ssereg x) {
   | AVX    * MUST_HAVE_AVX    \
   | RDRAND * MUST_HAVE_RDRAND \
   | AVX2   * MUST_HAVE_AVX2 )
+#endif
+/** @endcond */
+
+#ifdef __SSE2__
+/** Rotate a register by some amount using SSE2. */
+INTRINSIC ssereg sse2_rotate(int r, ssereg a) {
+  return _mm_slli_epi32(a, r) ^ _mm_srli_epi32(a, 32-r);
+}
+#endif
+      
+#ifdef __XOP__
+/** Rotate a register by some amount using AMD XOP. */      
+INTRINSIC ssereg xop_rotate(int amount, ssereg x) {
+  ssereg out;
+  __asm__ ("vprotd %1, %2, %0" : "=x"(out) : "x"(x), "g"(amount));
+  return out;
+}
+#endif
 
+/**
+ * @brief Macro which detects that targets might support this feature,
+ * so that we can include code for it.
+ */
 #define MIGHT_HAVE(feature) ((MIGHT_MASK & feature) == feature)
+
+/**
+ * @brief Macro which detects that targets must support this feature,
+ * so we can omit fallback code.
+ */
 #define MUST_HAVE(feature) ((MUST_MASK & feature) == feature)
 
+/**
+ * @brief Make a function available via the C API.
+ */
 #ifdef __cplusplus
 #  define extern_c extern "C"
 #else
 #  define extern_c
 #endif
 
+/** @cond internal
+ * @brief Detect platform features and return them as a flagfield int.
+ */
 extern_c
 unsigned int crandom_detect_features();
+/** @endcond */
 
 #ifndef likely
-#  define likely(x)       __builtin_expect((x),1)
-#  define unlikely(x)     __builtin_expect((x),0)
+#  define likely(x)       __builtin_expect((x),1) \
+    /**< @brief Tell the compiler that a branch is likely, for optimization. */
+#  define unlikely(x)     __builtin_expect((x),0) \
+    /**< @brief Tell the compiler that a branch is unlikely, for optimization. */
 #endif
   
 /**
@@ -187,12 +221,6 @@ compare_and_swap (
     const char *volatile* target,
     const char *old,
     const char *new
-);
-    
-const char *compare_and_swap (
-    const char *volatile* target,
-    const char *old,
-    const char *new
 ) {
     return __sync_val_compare_and_swap(target,old,new);
 }
@@ -208,13 +236,6 @@ const char *compare_and_swap (
  * @param [in] new A value to replace the target on success.
  */
 INTRINSIC int
-bool_compare_and_swap (
-    const char *volatile* target,
-    const char *old,
-    const char *new
-);
-
-int
 bool_compare_and_swap (
     const char *volatile* target,
     const char *old,
@@ -231,6 +252,8 @@ bool_compare_and_swap (
  * MIGHT_HAVE(feature) is set, but MUST_HAVE(feature) is not.
  */
 extern volatile unsigned int crandom_features;
+
+/** @brief Determine if a given CPU feature is available. */
 INTRINSIC int HAVE(unsigned int feature);
 
 int HAVE(unsigned int feature) {
diff --git a/src/include/magic.h b/src/include/magic.h
new file mode 100644
index 0000000..1aac4ce
--- /dev/null
+++ b/src/include/magic.h
@@ -0,0 +1,105 @@
+/**
+ * @file magic.h
+ * @copyright
+ *   Copyright (c) 2014 Cryptography Research, Inc.  \n
+ *   Released under the MIT License.  See LICENSE.txt for license information.
+ * @author Mike Hamburg
+ * @brief Goldilocks magic numbers (group orders, coefficients, algo params etc).
+ */
+
+
+#ifndef __GOLDI_MAGIC_H__
+#define __GOLDI_MAGIC_H__ 1
+
+#include "word.h"
+#include "p448.h"
+#include "ec_point.h"
+
+/* TODO: standardize notation */
+
+
+/** @brief The number of bits in the Goldilocks field. */
+#define GOLDI_FIELD_BITS 448
+
+/** @brief The number of words in the Goldilocks field. */
+#define GOLDI_FIELD_WORDS DIV_CEIL(GOLDI_FIELD_BITS,WORD_BITS)
+
+/** @brief The number of bits in the Goldilocks curve's cofactor (cofactor=4). */
+#define COFACTOR_BITS 2
+
+/** @brief The number of bits in a Goldilocks scalar. */
+#define SCALAR_BITS (GOLDI_FIELD_BITS - COFACTOR_BITS)
+
+/** @brief The number of words in a Goldilocks scalar. */
+#define SCALAR_WORDS WORDS_FOR_BITS(SCALAR_BITS)
+
+/**
+ * @brief sqrt(d-1), used for point formats and twisting.
+ */
+extern const struct p448_t sqrt_d_minus_1;
+
+/**
+ * @brief The base point for Goldilocks.
+ */
+extern const struct affine_t goldilocks_base_point;
+
+/**
+ * @brief The Goldilocks prime subgroup order.
+ */ 
+extern const struct barrett_prime_t curve_prime_order;
+
+/**
+ * @brief Window size for fixed-window signed binary scalarmul.
+ * Table size is 2^(this - 1).
+ */
+#define SCALARMUL_FIXED_WINDOW_SIZE 5
+
+/**
+ * @brief Even/odd adjustments for fixed window with
+ * ROUNDUP(SCALAR_BITS,SCALARMUL_FIXED_WINDOW_SIZE).
+ */
+extern const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS];
+
+/**
+ * @brief Table size for wNAF signed binary (variable-time) scalarmul.
+ * Table size is 2^this.
+ */
+#define SCALARMUL_WNAF_TABLE_BITS 3
+
+/**
+ * @brief Table size for wNAF signed binary (variable-time) linear combo.
+ * Table size is 2^this.
+ */
+#define SCALARMUL_WNAF_COMBO_TABLE_BITS 4
+
+/**
+ * @brief If true, use wider tables for the precomputed combs.
+ */
+#ifndef USE_BIG_COMBS
+#if __ARM_NEON__
+#define USE_BIG_COMBS 1
+#else
+#define USE_BIG_COMBS (WORD_BITS==64)
+#endif
+#endif
+
+/** @brief The number of combs to use for signed comb algo */
+#define COMB_N (USE_BIG_COMBS ? 5  : 8)
+
+/** @brief The number of teeth of the combs for signed comb algo */
+#define COMB_T (USE_BIG_COMBS ? 5  : 4)
+
+/** @brief The spacing of the combs for signed comb algo */
+#define COMB_S (USE_BIG_COMBS ? 18 : 14)
+
+/**
+ * @brief The bit width of the precomputed WNAF tables.  Size is 2^this elements.
+ */
+#define WNAF_PRECMP_BITS 5
+
+/**
+ * @brief crandom magic structure guard constant = "return 4", cf xkcd #221
+ */
+#define CRANDOM_MAGIC 0x72657475726e2034ull
+
+#endif /* __GOLDI_MAGIC_H__ */
diff --git a/src/include/scalarmul.h b/src/include/scalarmul.h
index 8b42fd7..bd97cc9 100644
--- a/src/include/scalarmul.h
+++ b/src/include/scalarmul.h
@@ -10,12 +10,19 @@
 #define __P448_ALGO_H__ 1
 
 #include "ec_point.h"
+#include "field.h"
 #include "intrinsics.h"
+#include "magic.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+/**
+ * A word array containing a scalar
+ */
+typedef word_t scalar_t[SCALAR_WORDS];
+
 /**
  * A precomputed table for fixed-base scalar multiplication.
  *
@@ -26,7 +33,7 @@ struct fixed_base_table_t {
   struct tw_niels_t *table;
   
   /** Adjustments to the scalar in even and odd cases, respectively. */
-  word_t scalar_adjustments[2*(448/WORD_BITS)];  /* MAGIC */
+  word_t scalar_adjustments[2*SCALAR_WORDS];
   
   /** The number of combs in the table. */
   unsigned int n;
@@ -83,8 +90,8 @@ struct fixed_base_table_t {
  */
 mask_t
 montgomery_ladder (
-    struct p448_t *out,
-    const struct p448_t *in,
+    struct field_t *out,
+    const struct field_t *in,
     const word_t *scalar,
     unsigned int nbits,
     unsigned int n_extra_doubles
@@ -103,7 +110,7 @@ montgomery_ladder (
 void
 scalarmul (
     struct tw_extensible_t *working,
-    const word_t scalar[448/WORD_BITS] /* MAGIC */
+    const word_t scalar[SCALAR_WORDS]
     /* TODO? int nbits */
 );
     
@@ -124,8 +131,7 @@ scalarmul (
 void
 scalarmul_vlook (
     struct tw_extensible_t *working,
-    const word_t scalar[448/WORD_BITS] /* MAGIC */
-    /* TODO? int nbits */
+    const word_t scalar[SCALAR_WORDS]
 );
 
 /**
@@ -134,7 +140,7 @@ scalarmul_vlook (
  *
  * This function computes $n$ "comb" tables, each containing
  * 2^(t-1) points in tw_niels_t format.  You must have
- * n * t * s >= 446 for complete coverage.
+ * n * t * s >= SCALAR_BITS = 446 for complete coverage.
  *
  * The scalar multiplication algorithm may adjust the scalar by
  * a multiple of q.  Therefore, we strongly recommend to use base
@@ -205,11 +211,13 @@ scalarmul_fixed_base (
  *
  * @param [inout] working The input and output point.
  * @param [in] scalar The scalar.
+ * @param [in] nbits The number of bits in the scalar
  */ 
 void
 scalarmul_vt (
     struct tw_extensible_t *working,
-    const word_t scalar[448/WORD_BITS] /* MAGIC */
+    const word_t *scalar,
+    unsigned int nbits
 );
 
 
@@ -274,9 +282,9 @@ scalarmul_fixed_base_wnaf_vt (
 void
 linear_combo_var_fixed_vt (
     struct tw_extensible_t *working,
-    const word_t scalar_var[448/WORD_BITS], /* MAGIC */
+    const word_t scalar_var[SCALAR_WORDS],
     unsigned int nbits_var,
-    const word_t scalar_pre[448/WORD_BITS], /* MAGIC */
+    const word_t scalar_pre[SCALAR_WORDS],
     unsigned int nbits_pre,
     const struct tw_niels_t *precmp,
     unsigned int table_bits_pre
@@ -302,10 +310,10 @@ linear_combo_var_fixed_vt (
 mask_t
 linear_combo_combs_vt (
     struct tw_extensible_t *out,
-    const word_t scalar1[448/WORD_BITS],
+    const word_t scalar1[SCALAR_WORDS],
     unsigned int nbits1,
     const struct fixed_base_table_t *table1,
-    const word_t scalar2[448/WORD_BITS],
+    const word_t scalar2[SCALAR_WORDS],
     unsigned int nbits2,
     const struct fixed_base_table_t *table2
 );
diff --git a/src/include/word.h b/src/include/word.h
index 0f6c6e6..d165647 100644
--- a/src/include/word.h
+++ b/src/include/word.h
@@ -26,7 +26,6 @@
 
 #if (__SIZEOF_INT128__ == 16 && __SIZEOF_SIZE_T__ == 8 && (__SIZEOF_LONG__==8 || __POINTER_WIDTH__==64) && !GOLDI_FORCE_32_BIT)
 /* It's a 64-bit machine if:
- * // limits.h thinks so
  * __uint128_t exists
  * size_t is 64 bits
  * Either longs are 64-bits (doesn't happen on Windows)
@@ -61,6 +60,9 @@ typedef int64_t dsword_t;
 #endif
 
 #define WORD_BITS (sizeof(word_t) * 8)
+#define DIV_CEIL(_x,_y) (((_x) + (_y) - 1)/(_y))
+#define ROUND_UP(_x,_y) (DIV_CEIL((_x),(_y))*(_y))
+#define WORDS_FOR_BITS(_x) (DIV_CEIL((_x),WORD_BITS))
 
 typedef word_t mask_t;
 static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -1;
@@ -69,51 +71,80 @@ static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -1;
 
 #ifdef __ARM_NEON__
 typedef uint32x4_t vecmask_t;
-#else
-/* FIXME this only works on clang */
+#elif __clang__
+typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
+typedef int64_t  int64x2_t __attribute__((ext_vector_type(2)));
+typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
+typedef int64_t  int64x4_t __attribute__((ext_vector_type(4)));
+typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
+typedef int32_t  int32x4_t __attribute__((ext_vector_type(4)));
+typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
+typedef int32_t  int32x2_t __attribute__((ext_vector_type(2)));
+typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
+typedef int32_t  int32x8_t __attribute__((ext_vector_type(8)));
+typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
+#else /* GCC-cleanliness */
 typedef uint64_t uint64x2_t __attribute__((vector_size(16)));
 typedef int64_t  int64x2_t __attribute__((vector_size(16)));
 typedef uint64_t uint64x4_t __attribute__((vector_size(32)));
 typedef int64_t  int64x4_t __attribute__((vector_size(32)));
-typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
-typedef int32_t  int32x2_t __attribute__((vector_size(8)));
 typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
 typedef int32_t  int32x4_t __attribute__((vector_size(16)));
+typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
+typedef int32_t  int32x2_t __attribute__((vector_size(8)));
 typedef uint32_t uint32x8_t __attribute__((vector_size(32)));
 typedef int32_t  int32x8_t __attribute__((vector_size(32)));
-/* TODO: vector width for procs like ARM; gcc support */
 typedef word_t vecmask_t __attribute__((vector_size(32)));
 #endif
 
 #if __AVX2__
-typedef uint32x8_t big_register_t;
-typedef uint64x4_t uint64xn_t;
-typedef uint32x8_t uint32xn_t;
-#elif __SSE2__ || __ARM_NEON__
-typedef uint32x4_t big_register_t;
-typedef uint64x2_t uint64xn_t;
-typedef uint32x4_t uint32xn_t;
+    typedef uint32x8_t big_register_t;
+    typedef uint64x4_t uint64xn_t;
+    typedef uint32x8_t uint32xn_t;
+
+    static __inline__ big_register_t
+    br_set_to_mask(mask_t x) {
+        uint32_t y = x;
+        big_register_t ret = {y,y,y,y,y,y,y,y};
+        return ret;
+    }
+#elif __SSE2__
+    typedef uint32x4_t big_register_t;
+    typedef uint64x2_t uint64xn_t;
+    typedef uint32x4_t uint32xn_t;
+
+    /** Copy the low 32 bits of mask x into each of the four lanes. */
+    static __inline__ big_register_t
+    br_set_to_mask(mask_t x) {
+        uint32_t y = x;
+        big_register_t ret = {y,y,y,y};
+        return ret;
+    }
+#elif __ARM_NEON__
+    typedef uint32x4_t big_register_t;
+    typedef uint64x2_t uint64xn_t;
+    typedef uint32x4_t uint32xn_t;
+    static __inline__ big_register_t
+    br_set_to_mask(mask_t x) {
+        return vdupq_n_u32(x);
+    }
 #elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__
-typedef uint64_t big_register_t, uint64xn_t;
-typedef uint32_t uint32xn_t;
-#else
-typedef uint64_t uint64xn_t;
-typedef uint32_t uint32xn_t;
-typedef uint32_t big_register_t;
-#endif
-
+    typedef uint64_t big_register_t, uint64xn_t;
 
-#ifdef __ARM_NEON__
-static __inline__ big_register_t
-br_set_to_mask(mask_t x) {
-    return vdupq_n_u32(x);
-}
+    typedef uint32_t uint32xn_t;
+    static __inline__ big_register_t
+    br_set_to_mask(mask_t x) {
+        return (big_register_t)x;
+    }
 #else
-static __inline__ big_register_t
-br_set_to_mask(mask_t x) {
-    big_register_t out = {x,x,x,x,x,x,x,x};
-    return out;
-}
+    typedef uint64_t uint64xn_t;
+    typedef uint32_t uint32xn_t;
+    typedef uint32_t big_register_t;
+
+    static __inline__ big_register_t
+    br_set_to_mask(mask_t x) {
+        return (big_register_t)x;
+    }
 #endif
 
 #if __AVX2__ || __SSE2__
diff --git a/src/magic.c b/src/magic.c
new file mode 100644
index 0000000..a6336f7
--- /dev/null
+++ b/src/magic.c
@@ -0,0 +1,61 @@
+/* Copyright (c) 2014 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+#include "field.h"
+#include "magic.h"
+#include "barrett_field.h"
+
+/* FUTURE: automatically generate this file. */
+
+const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = {
+    U64LE(0xebec9967f5d3f5c2), /* first of the two scalar adjustments */
+    U64LE(0x0aa09b49b16c9a02),
+    U64LE(0x7f6126aec172cd8e),
+    U64LE(0x00000007b027e54d),
+    U64LE(0x0000000000000000),
+    U64LE(0x0000000000000000),
+    U64LE(0x4000000000000000),
+    /* second adjustment; NOTE(review): presumably even case first, odd second -- confirm */
+    U64LE(0xc873d6d54a7bb0cf),
+    U64LE(0xe933d8d723a70aad),
+    U64LE(0xbb124b65129c96fd),
+    U64LE(0x00000008335dc163),
+    U64LE(0x0000000000000000),
+    U64LE(0x0000000000000000),
+    U64LE(0x0000000000000000)
+};
+
+const struct affine_t goldilocks_base_point = {
+    {{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7),
+       U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa),
+       U58LE(0x5f065c3c59d070), U58LE(0x3a6a26adf73324),
+       U58LE(0x1b4faff4609845), U58LE(0x297ea0ea2692ff)
+    }},
+    {{ 19 }}
+};
+
+static const word_t curve_prime_order_lo[(224+WORD_BITS-1)/WORD_BITS] = {
+    U64LE(0xdc873d6d54a7bb0d),
+    U64LE(0xde933d8d723a70aa),
+    U64LE(0x3bb124b65129c96f),
+    0x8335dc16
+};
+const struct barrett_prime_t curve_prime_order = {
+    GOLDI_FIELD_WORDS,
+    62 % WORD_BITS,
+    sizeof(curve_prime_order_lo)/sizeof(curve_prime_order_lo[0]),
+    curve_prime_order_lo
+};
+
+const struct field_t
+sqrt_d_minus_1 = {{
+    U58LE(0xd2e21836749f46),
+    U58LE(0x888db42b4f0179),
+    U58LE(0x5a189aabdeea38),
+    U58LE(0x51e65ca6f14c06),
+    U58LE(0xa49f7b424d9770),
+    U58LE(0xdcac4628c5f656),
+    U58LE(0x49443b8748734a),
+    U58LE(0x12fec0c0b25b7a)
+}};
diff --git a/src/scalarmul.c b/src/scalarmul.c
index 89891db..e3e5850 100644
--- a/src/scalarmul.c
+++ b/src/scalarmul.c
@@ -13,8 +13,8 @@
 
 mask_t
 montgomery_ladder (
-    struct p448_t *out,
-    const struct p448_t *in,
+    struct field_t *out,
+    const struct field_t *in,
     const word_t *scalar,
     unsigned int nbits,
     unsigned int n_extra_doubles
@@ -28,15 +28,15 @@ montgomery_ladder (
         word_t w = scalar[j];
         for (i=n; i>=0; i--) {
             mask_t flip = -((w>>i)&1);
-            p448_cond_swap(&mont.xa,&mont.xd,flip^pflip);
-            p448_cond_swap(&mont.za,&mont.zd,flip^pflip);
+            field_cond_swap(&mont.xa,&mont.xd,flip^pflip);
+            field_cond_swap(&mont.za,&mont.zd,flip^pflip);
             montgomery_step(&mont);
             pflip = flip;
         }
         n = WORD_BITS-1;
     }
-    p448_cond_swap(&mont.xa,&mont.xd,pflip);
-    p448_cond_swap(&mont.za,&mont.zd,pflip);
+    field_cond_swap(&mont.xa,&mont.xd,pflip);
+    field_cond_swap(&mont.za,&mont.zd,pflip);
     
     assert(n_extra_doubles < INT_MAX);
     for (j=0; j<(int)n_extra_doubles; j++) {
@@ -51,8 +51,8 @@ cond_negate_tw_niels (
     struct tw_niels_t *n,
     mask_t doNegate
 ) {
-    p448_cond_swap(&n->a, &n->b, doNegate);
-    p448_cond_neg(&n->c, doNegate);
+    field_cond_swap(&n->a, &n->b, doNegate);
+    field_cond_neg(&n->c, doNegate);
 }
 
 static __inline__ void
@@ -137,34 +137,18 @@ convert_to_signed_window_form (
 void
 scalarmul (
     struct tw_extensible_t *working,
-    const word_t scalar[448/WORD_BITS]
+    const word_t scalar[SCALAR_WORDS]
 ) {
-    const int nbits=450; /* MAGIC */
-    word_t prepared_data[448*2/WORD_BITS] = {
-        
-        U64LE(0xebec9967f5d3f5c2),
-        U64LE(0x0aa09b49b16c9a02),
-        U64LE(0x7f6126aec172cd8e),
-        U64LE(0x00000007b027e54d),
-        U64LE(0x0000000000000000),
-        U64LE(0x0000000000000000),
-        U64LE(0x4000000000000000),
-            
-        U64LE(0xc873d6d54a7bb0cf),
-        U64LE(0xe933d8d723a70aad),
-        U64LE(0xbb124b65129c96fd),
-        U64LE(0x00000008335dc163),
-        U64LE(0x0000000000000000),
-        U64LE(0x0000000000000000),
-        U64LE(0x0000000000000000)
-    }; /* MAGIC */
-    
-    word_t scalar2[448/WORD_BITS];
-    convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS);
-    
-    const int WINDOW = 5, /* MAGIC */
+    const int WINDOW = SCALARMUL_FIXED_WINDOW_SIZE,
         WINDOW_MASK = (1<<WINDOW)-1, WINDOW_T_MASK = WINDOW_MASK >> 1,
-        NTABLE = 1<<(WINDOW-1);
+        NTABLE = 1<<(WINDOW-1),
+        nbits = ROUND_UP(SCALAR_BITS,WINDOW);
+    
+    word_t scalar2[SCALAR_WORDS];
+    convert_to_signed_window_form (
+        scalar2, scalar, SCALAR_WORDS,
+        SCALARMUL_FIXED_WINDOW_ADJUSTMENT, SCALAR_WORDS
+    );
 
     struct tw_extensible_t tabulator;
     copy_tw_extensible(&tabulator, working);
@@ -197,7 +181,7 @@ scalarmul (
 
         bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS);
         
-        if (i/WORD_BITS < 448/WORD_BITS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) {
+        if (i/WORD_BITS < SCALAR_WORDS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) {
             bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS));
         }
                 
@@ -214,34 +198,19 @@ scalarmul (
 void
 scalarmul_vlook (
     struct tw_extensible_t *working,
-    const word_t scalar[448/WORD_BITS]
-) {
-    const int nbits=450; /* HACK? */
-    word_t prepared_data[448*2/WORD_BITS] = {
-        
-        U64LE(0xebec9967f5d3f5c2),
-        U64LE(0x0aa09b49b16c9a02),
-        U64LE(0x7f6126aec172cd8e),
-        U64LE(0x00000007b027e54d),
-        U64LE(0x0000000000000000),
-        U64LE(0x0000000000000000),
-        U64LE(0x4000000000000000),
-            
-        U64LE(0xc873d6d54a7bb0cf),
-        U64LE(0xe933d8d723a70aad),
-        U64LE(0xbb124b65129c96fd),
-        U64LE(0x00000008335dc163),
-        U64LE(0x0000000000000000),
-        U64LE(0x0000000000000000),
-        U64LE(0x0000000000000000)
-    }; /* MAGIC: split off */
-    
-    word_t scalar2[448/WORD_BITS];
-    convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS);
-    
-    const int WINDOW = 5, /* MAGIC */
+    const word_t scalar[SCALAR_WORDS]
+) {    
+    const int WINDOW = SCALARMUL_FIXED_WINDOW_SIZE,
         WINDOW_MASK = (1<<WINDOW)-1, WINDOW_T_MASK = WINDOW_MASK >> 1,
-        NTABLE = 1<<(WINDOW-1);
+        NTABLE = 1<<(WINDOW-1),
+        nbits = ROUND_UP(SCALAR_BITS,WINDOW);
+    
+    word_t scalar2[SCALAR_WORDS];
+    convert_to_signed_window_form(
+        scalar2, scalar, SCALAR_WORDS,
+        SCALARMUL_FIXED_WINDOW_ADJUSTMENT, SCALAR_WORDS
+    );
+
 
     struct tw_extensible_t tabulator;
     copy_tw_extensible(&tabulator, working);
@@ -274,7 +243,7 @@ scalarmul_vlook (
 
         bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS);
         
-        if (i/WORD_BITS < 448/WORD_BITS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) {
+        if (i/WORD_BITS < SCALAR_WORDS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) {
             bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS));
         }
                 
@@ -304,8 +273,8 @@ schedule_scalar_for_combs (
     
     unsigned int scalar_words = (nbits + WORD_BITS - 1)/WORD_BITS,
         scalar2_words = scalar_words;
-    if (scalar2_words < 448 / WORD_BITS)
-        scalar2_words = 448 / WORD_BITS;
+    if (scalar2_words < SCALAR_WORDS)
+        scalar2_words = SCALAR_WORDS;
     word_t scalar3[scalar2_words];
     
     /* Copy scalar to scalar3, but clear its high bits (if there are any) */
@@ -322,7 +291,7 @@ schedule_scalar_for_combs (
     convert_to_signed_window_form (
         scalar2,
         scalar3, scalar2_words,
-        table->scalar_adjustments , 448 / WORD_BITS
+        table->scalar_adjustments , SCALAR_WORDS
     );
     
     return MASK_SUCCESS;
@@ -331,7 +300,7 @@ schedule_scalar_for_combs (
 mask_t
 scalarmul_fixed_base (
     struct tw_extensible_t *out,
-    const word_t scalar[448/WORD_BITS],
+    const word_t scalar[SCALAR_WORDS],
     unsigned int nbits,
     const struct fixed_base_table_t *table
 ) {
@@ -339,7 +308,7 @@ scalarmul_fixed_base (
     unsigned int n = table->n, t = table->t, s = table->s;
     
     unsigned int scalar2_words = (nbits + WORD_BITS - 1)/WORD_BITS;
-    if (scalar2_words < 448 / WORD_BITS) scalar2_words = 448 / WORD_BITS;
+    if (scalar2_words < SCALAR_WORDS) scalar2_words = SCALAR_WORDS;
     
     word_t scalar2[scalar2_words];
 
@@ -389,10 +358,10 @@ scalarmul_fixed_base (
 mask_t
 linear_combo_combs_vt (
     struct tw_extensible_t *out,
-    const word_t scalar1[448/WORD_BITS],
+    const word_t scalar1[SCALAR_WORDS],
     unsigned int nbits1,
     const struct fixed_base_table_t *table1,
-    const word_t scalar2[448/WORD_BITS],
+    const word_t scalar2[SCALAR_WORDS],
     unsigned int nbits2,
     const struct fixed_base_table_t *table2
 ) { 
@@ -400,10 +369,10 @@ linear_combo_combs_vt (
     unsigned int s1 = table1->s, s2 = table2->s, smax = (s1 > s2) ? s1 : s2;
     
     unsigned int scalar1b_words = (nbits1 + WORD_BITS - 1)/WORD_BITS;
-    if (scalar1b_words < 448 / WORD_BITS) scalar1b_words = 448 / WORD_BITS;
+    if (scalar1b_words < SCALAR_WORDS) scalar1b_words = SCALAR_WORDS;
     
     unsigned int scalar2b_words = (nbits2 + WORD_BITS - 1)/WORD_BITS;
-    if (scalar2b_words < 448 / WORD_BITS) scalar2b_words = 448 / WORD_BITS;
+    if (scalar2b_words < SCALAR_WORDS) scalar2b_words = SCALAR_WORDS;
     
     word_t scalar1b[scalar1b_words], scalar2b[scalar2b_words];
 
@@ -479,7 +448,7 @@ precompute_fixed_base (
   unsigned int s,
   struct tw_niels_t *prealloc
 ) {
-    if (s < 1 || t < 1 || n < 1 || n*t*s < 446) { /* MAGIC */
+    if (s < 1 || t < 1 || n < 1 || n*t*s < SCALAR_BITS) {
         memset(out, 0, sizeof(*out));
         return 0;
     }
@@ -493,8 +462,8 @@ precompute_fixed_base (
     struct tw_pniels_t pn_tmp;
   
     struct tw_pniels_t *doubles = (struct tw_pniels_t *) malloc_vector(sizeof(*doubles) * (t-1));
-    struct p448_t *zs  = (struct p448_t *) malloc_vector(sizeof(*zs) * (n<<(t-1)));
-    struct p448_t *zis = (struct p448_t *) malloc_vector(sizeof(*zis) * (n<<(t-1)));
+    struct field_t *zs  = (struct field_t *) malloc_vector(sizeof(*zs) * (n<<(t-1)));
+    struct field_t *zis = (struct field_t *) malloc_vector(sizeof(*zis) * (n<<(t-1)));
     
     struct tw_niels_t *table = prealloc;
     if (prealloc) {
@@ -519,30 +488,19 @@ precompute_fixed_base (
     
     /* Compute the scalar adjustments, equal to 2^nbits-1 mod q */
     unsigned int adjustment_size = (n*t*s)/WORD_BITS + 1;
-    assert(adjustment_size >= 448/WORD_BITS);
+    assert(adjustment_size >= SCALAR_WORDS);
     word_t adjustment[adjustment_size];
     for (i=0; i<adjustment_size; i++) {
         adjustment[i] = -1;
     }
     
     adjustment[(n*t*s) / WORD_BITS] += ((word_t)1) << ((n*t*s) % WORD_BITS);
-
-    /* MAGIC: factor out somehow */
-    const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = {
-        U64LE(0xdc873d6d54a7bb0d),
-        U64LE(0xde933d8d723a70aa),
-        U64LE(0x3bb124b65129c96f),
-        0x8335dc16
-    };
-    const struct barrett_prime_t goldi_q448 = {
-        448/WORD_BITS, 62 % WORD_BITS, sizeof(goldi_q448_lo)/sizeof(word_t), goldi_q448_lo
-    };
     
     /* The low adjustment is 2^nbits - 1 mod q */
-    barrett_reduce(adjustment, adjustment_size, 0, &goldi_q448);
-    word_t *low_adjustment = &out->scalar_adjustments[(448/WORD_BITS)*(adjustment[0] & 1)],
-        *high_adjustment = &out->scalar_adjustments[(448/WORD_BITS)*((~adjustment[0]) & 1)];
-    for (i=0; i<448/WORD_BITS; i++) {
+    barrett_reduce(adjustment, adjustment_size, 0, &curve_prime_order);
+    word_t *low_adjustment = &out->scalar_adjustments[(SCALAR_WORDS)*(adjustment[0] & 1)],
+        *high_adjustment = &out->scalar_adjustments[(SCALAR_WORDS)*((~adjustment[0]) & 1)];
+    for (i=0; i<SCALAR_WORDS; i++) {
         low_adjustment[i] = adjustment[i];
     }
     
@@ -550,12 +508,12 @@ precompute_fixed_base (
     (void)
     sub_nr_ext_packed(
         high_adjustment,
-        adjustment, 448/WORD_BITS,
-        goldi_q448.p_lo, goldi_q448.nwords_lo,
+        adjustment, SCALAR_WORDS,
+        curve_prime_order.p_lo, curve_prime_order.nwords_lo,
         -1
     );
-    if (goldi_q448.p_shift) {
-        high_adjustment[goldi_q448.nwords_p - 1] += ((word_t)1)<<goldi_q448.p_shift;
+    if (curve_prime_order.p_shift) {
+        high_adjustment[curve_prime_order.nwords_p - 1] += ((word_t)1)<<curve_prime_order.p_shift;
     }
     
     /* OK, now compute the tables */
@@ -591,7 +549,7 @@ precompute_fixed_base (
 
             convert_tw_extensible_to_tw_pniels(&pn_tmp, &start);
             copy_tw_niels(&table[idx], &pn_tmp.n);
-            p448_copy(&zs[idx], &pn_tmp.z);
+            field_copy(&zs[idx], &pn_tmp.z);
 			
             if (j >= (1u<<(t-1)) - 1) break;
             int delta = (j+1) ^ ((j+1)>>1) ^ gray;
@@ -611,24 +569,24 @@ precompute_fixed_base (
         }
     }
 	
-    simultaneous_invert_p448(zis, zs, n<<(t-1));
+    simultaneous_invert(zis, zs, n<<(t-1));
 
-    p448_t product;
+    field_t product;
     for (i=0; i<n<<(t-1); i++) {
-        p448_mul(&product, &table[i].a, &zis[i]);
-        p448_strong_reduce(&product);
-        p448_copy(&table[i].a, &product);
+        field_mul(&product, &table[i].a, &zis[i]);
+        field_strong_reduce(&product);
+        field_copy(&table[i].a, &product);
         
-        p448_mul(&product, &table[i].b, &zis[i]);
-        p448_strong_reduce(&product);
-        p448_copy(&table[i].b, &product);
+        field_mul(&product, &table[i].b, &zis[i]);
+        field_strong_reduce(&product);
+        field_copy(&table[i].b, &product);
         
-        p448_mul(&product, &table[i].c, &zis[i]);
-        p448_strong_reduce(&product);
-        p448_copy(&table[i].c, &product);
+        field_mul(&product, &table[i].c, &zis[i]);
+        field_strong_reduce(&product);
+        field_copy(&table[i].c, &product);
     }
 	
-	mask_t ret = ~p448_is_zero(&zis[0]);
+	mask_t ret = ~field_is_zero(&zis[0]);
 
     free(doubles);
     free(zs);
@@ -664,8 +622,8 @@ precompute_fixed_base_wnaf (
     unsigned int tbits
 ) {
     int i;
-    struct p448_t *zs  = (struct p448_t *) malloc_vector(sizeof(*zs)<<tbits);
-    struct p448_t *zis = (struct p448_t *) malloc_vector(sizeof(*zis)<<tbits);
+    struct field_t *zs  = (struct field_t *) malloc_vector(sizeof(*zs)<<tbits);
+    struct field_t *zis = (struct field_t *) malloc_vector(sizeof(*zis)<<tbits);
 
     if (!zs || !zis) {
         free(zs);
@@ -679,7 +637,7 @@ precompute_fixed_base_wnaf (
     struct tw_pniels_t twop, tmp;
     
     convert_tw_extensible_to_tw_pniels(&tmp, &base);
-    p448_copy(&zs[0], &tmp.z);
+    field_copy(&zs[0], &tmp.z);
     copy_tw_niels(&out[0], &tmp.n);
 
     if (tbits > 0) {
@@ -688,32 +646,32 @@ precompute_fixed_base_wnaf (
         add_tw_pniels_to_tw_extensible(&base, &tmp);
         
         convert_tw_extensible_to_tw_pniels(&tmp, &base);
-        p448_copy(&zs[1], &tmp.z);
+        field_copy(&zs[1], &tmp.z);
         copy_tw_niels(&out[1], &tmp.n);
 
         for (i=2; i < 1<<tbits; i++) {
             add_tw_pniels_to_tw_extensible(&base, &twop);
             convert_tw_extensible_to_tw_pniels(&tmp, &base);
-            p448_copy(&zs[i], &tmp.z);
+            field_copy(&zs[i], &tmp.z);
             copy_tw_niels(&out[i], &tmp.n);
         }
     }
     
-    simultaneous_invert_p448(zis, zs, 1<<tbits);
+    simultaneous_invert(zis, zs, 1<<tbits);
 
-    p448_t product;
+    field_t product;
     for (i=0; i<1<<tbits; i++) {
-        p448_mul(&product, &out[i].a, &zis[i]);
-        p448_strong_reduce(&product);
-        p448_copy(&out[i].a, &product);
+        field_mul(&product, &out[i].a, &zis[i]);
+        field_strong_reduce(&product);
+        field_copy(&out[i].a, &product);
         
-        p448_mul(&product, &out[i].b, &zis[i]);
-        p448_strong_reduce(&product);
-        p448_copy(&out[i].b, &product);
+        field_mul(&product, &out[i].b, &zis[i]);
+        field_strong_reduce(&product);
+        field_copy(&out[i].b, &product);
         
-        p448_mul(&product, &out[i].c, &zis[i]);
-        p448_strong_reduce(&product);
-        p448_copy(&out[i].c, &product);
+        field_mul(&product, &out[i].c, &zis[i]);
+        field_strong_reduce(&product);
+        field_copy(&out[i].c, &product);
     }
 
     free(zs);
@@ -757,7 +715,7 @@ recode_wnaf(
          * There's also the stopper with power -1, for a total of +3.
          */
         if (current >= (2<<tableBits) || current <= -1 - (2<<tableBits)) {
-            int delta = (current + 1) >> 1; // |delta| < 2^tablebits
+            int delta = (current + 1) >> 1; /* |delta| < 2^tablebits */
             current = -(current & 1);
 
             for (j=i; (delta & 1) == 0; j++) {
@@ -813,10 +771,10 @@ prepare_wnaf_table(
 void
 scalarmul_vt (
     struct tw_extensible_t *working,
-    const word_t scalar[448/WORD_BITS]
+    const word_t scalar[SCALAR_WORDS],
+    unsigned int nbits
 ) {
-    /* HACK: not 448? */
-    const int nbits=448, table_bits = 3;
+    const int table_bits = SCALARMUL_WNAF_TABLE_BITS;
     struct smvt_control control[nbits/(table_bits+1)+3];
     
     int control_bits = recode_wnaf(control, scalar, nbits, table_bits);
@@ -854,7 +812,7 @@ scalarmul_vt (
 void
 scalarmul_fixed_base_wnaf_vt (
     struct tw_extensible_t *working,
-    const word_t scalar[448/WORD_BITS],
+    const word_t scalar[SCALAR_WORDS],
     unsigned int nbits,
     const struct tw_niels_t *precmp,
     unsigned int table_bits
@@ -895,14 +853,14 @@ scalarmul_fixed_base_wnaf_vt (
 void
 linear_combo_var_fixed_vt(
     struct tw_extensible_t *working,
-    const word_t scalar_var[448/WORD_BITS],
+    const word_t scalar_var[SCALAR_WORDS],
     unsigned int nbits_var,
-    const word_t scalar_pre[448/WORD_BITS],
+    const word_t scalar_pre[SCALAR_WORDS],
     unsigned int nbits_pre,
     const struct tw_niels_t *precmp,
     unsigned int table_bits_pre
 ) {
-    const int table_bits_var = 4;
+    const int table_bits_var = SCALARMUL_WNAF_COMBO_TABLE_BITS;
     struct smvt_control control_var[nbits_var/(table_bits_var+1)+3];
     struct smvt_control control_pre[nbits_pre/(table_bits_pre+1)+3];
     
diff --git a/src/sha512.c b/src/sha512.c
index 09d2e4c..82f81ad 100644
--- a/src/sha512.c
+++ b/src/sha512.c
@@ -2,8 +2,8 @@
  * Copyright (c) 2014 Cryptography Research, Inc.
  * Released under the MIT License.  See LICENSE.txt for license information.
  */
-#include "sha512.h"
 #include "word.h"
+#include "sha512.h"
 
 #include <string.h>
 #include <assert.h>
@@ -163,9 +163,11 @@ sha512_final (
         sha512_process_block(ctx);
         fill = 0;
     }
-    memset(ctx->block + fill, 0, 120-fill);
-    uint64_t size = htobe64((ctx->nbytes * 8));
-    memcpy(&ctx->block[120], &size, sizeof(size));
+    memset(ctx->block + fill, 0, 112-fill);
+    
+    uint64_t highCount = 0, lowCount = htobe64((ctx->nbytes * 8));
+    memcpy(&ctx->block[112],&highCount,8);
+    memcpy(&ctx->block[120],&lowCount,8);
     sha512_process_block(ctx);
     for (i=0; i<8; i++) {
         ctx->chain[i] = htobe64(ctx->chain[i]);
diff --git a/test/bench.c b/test/bench.c
index 2e90e9a..b80be14 100644
--- a/test/bench.c
+++ b/test/bench.c
@@ -100,6 +100,9 @@ int main(int argc, char **argv) {
     for (i=0; i<32; i++) initial_seed[i] = i;
     struct crandom_state_t crand;
     crandom_init_from_buffer(&crand, initial_seed);
+    /* To measure the performance drop from the crandom debuffering change,
+     * uncomment: ignore_result(crandom_init_from_file(&crand, "/dev/urandom", 10000, 1));
+     */
     
     word_t sk[448/WORD_BITS],tk[448/WORD_BITS];
     q448_randomize(&crand, sk);
@@ -248,14 +251,14 @@ int main(int argc, char **argv) {
     
     when = now();
     for (i=0; i<nbase*100; i++) {
-        barrett_reduce(lsk,sizeof(lsk)/sizeof(word_t),0,&goldi_q448);
+        barrett_reduce(lsk,sizeof(lsk)/sizeof(word_t),0,&curve_prime_order);
     }
     when = now() - when;
     printf("barrett red: %5.1fns\n", when * 1e9 / i);
     
     when = now();
     for (i=0; i<nbase*10; i++) {
-        barrett_mac(lsk,448/WORD_BITS,lsk,448/WORD_BITS,lsk,448/WORD_BITS,&goldi_q448);
+        barrett_mac(lsk,448/WORD_BITS,lsk,448/WORD_BITS,lsk,448/WORD_BITS,&curve_prime_order);
     }
     when = now() - when;
     printf("barrett mac: %5.1fns\n", when * 1e9 / i);
@@ -334,7 +337,7 @@ int main(int argc, char **argv) {
     when = now();
     for (i=0; i<nbase/10; i++) {
         q448_randomize(&crand, sk);
-        scalarmul_vt(&ext,sk);
+        scalarmul_vt(&ext,sk,446);
     }
     when = now() - when;
     printf("edwards vtm: %5.1fµs\n", when * 1e6 / i);
diff --git a/test/test_scalarmul.c b/test/test_scalarmul.c
index d6f16c7..5627b64 100644
--- a/test/test_scalarmul.c
+++ b/test/test_scalarmul.c
@@ -20,18 +20,6 @@ single_scalarmul_compatibility_test (
     int ret = 0, i;
     mask_t succ, succm;
     
-    const struct p448_t
-    sqrt_d_minus_1 = {{
-        U58LE(0xd2e21836749f46),
-        U58LE(0x888db42b4f0179),
-        U58LE(0x5a189aabdeea38),
-        U58LE(0x51e65ca6f14c06),
-        U58LE(0xa49f7b424d9770),
-        U58LE(0xdcac4628c5f656),
-        U58LE(0x49443b8748734a),
-        U58LE(0x12fec0c0b25b7a)
-    }};
-    
     succ = deserialize_and_twist_approx(&text, &sqrt_d_minus_1, base);
     
     succm = montgomery_ladder(&mont,base,scalar,nbits,1);
@@ -108,7 +96,7 @@ single_scalarmul_compatibility_test (
         untwist_and_double_and_serialize(&vl, &work);
         
         copy_tw_extensible(&work, &text);
-        scalarmul_vt(&work, scalar);
+        scalarmul_vt(&work, scalar, nbits);
         untwist_and_double_and_serialize(&vt, &work);
         
     
@@ -167,20 +155,7 @@ single_linear_combo_test (
     const struct p448_t *base2,
     const word_t *scalar2,
     int nbits2
-) {
-    /* MAGIC */
-    const struct p448_t
-    sqrt_d_minus_1 = {{
-        U58LE(0xd2e21836749f46),
-        U58LE(0x888db42b4f0179),
-        U58LE(0x5a189aabdeea38),
-        U58LE(0x51e65ca6f14c06),
-        U58LE(0xa49f7b424d9770),
-        U58LE(0xdcac4628c5f656),
-        U58LE(0x49443b8748734a),
-        U58LE(0x12fec0c0b25b7a)
-    }};
-    
+) { 
     struct tw_extensible_t text1, text2, working;
     struct tw_pniels_t pn;
     struct p448_t result_comb, result_combo, result_wnaf;