diff --git a/Makefile b/Makefile
index 16eb948..420f010 100644
--- a/Makefile
+++ b/Makefile
@@ -31,13 +31,6 @@ LD = $(CC)
 LDXX = $(CXX)
 ASM ?= $(CC)
 
-ifneq (,$(findstring x86_64,$(MACHINE)))
-ARCH ?= arch_x86_64
-else
-# no i386 port yet
-ARCH ?= arch_ref32
-endif
-
 WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
 	 -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)
 
@@ -55,17 +48,8 @@ endif
 
 TODAY = $(shell date "+%Y-%m-%d")
 
-ifneq (,$(findstring arm,$(MACHINE)))
-ifneq (,$(findstring neon,$(ARCH)))
-ARCHFLAGS += -mfpu=neon
-else
-ARCHFLAGS += -mfpu=vfpv3-d16
-endif
-ARCHFLAGS += -mcpu=cortex-a8 # FIXME
-GENFLAGS += -DN_TESTS_BASE=1000 # sooooo sloooooow
-else
-ARCHFLAGS += -maes -mavx2 -mbmi2 #TODO
-endif
+#FIXME ARCHFLAGS
+ARCHFLAGS ?= -maes -mavx2 -mbmi2 #TODO
 
 ifeq ($(CC),clang)
 WARNFLAGS += -Wgcc-compat
@@ -141,18 +125,18 @@ $(GEN_HEADERS): src/gen_headers/*.py src/public_include/decaf/*
 # Per-field code: call with field, arch
 ################################################################
 define define_field
-ARCH_FOR_$(1) = $(2)
+ARCH_FOR_$(1) ?= $(2)
 COMPONENTS_OF_$(1) = $$(BUILD_OBJ)/$(1)_impl.o $$(BUILD_OBJ)/$(1)_arithmetic.o
 LIBCOMPONENTS += $$(COMPONENTS_OF_$(1))
 
 $$(BUILD_ASM)/$(1)_arithmetic.s: src/$(1)/f_arithmetic.c $$(HEADERS)
-	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \
-	-I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \
+	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$$(ARCH_FOR_$(1)) -I $(BUILD_H)/$(1) \
+	-I $(BUILD_H)/$(1)/$$(ARCH_FOR_$(1)) -I src/include/$$(ARCH_FOR_$(1)) \
 	-S -c -o $$@ $$<
 
-$$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$(2)/f_impl.c $$(HEADERS)
-	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \
-	-I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \
+$$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$$(ARCH_FOR_$(1))/f_impl.c $$(HEADERS)
+	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$$(ARCH_FOR_$(1)) -I $(BUILD_H)/$(1) \
+	-I $(BUILD_H)/$(1)/$$(ARCH_FOR_$(1)) -I src/include/$$(ARCH_FOR_$(1)) \
 	-S -c -o $$@ $$<
 endef
 
diff --git a/src/curve_ed25519/curve_data.inc.c b/src/curve_ed25519/curve_data.inc.c
index b3d0c56..9012b4c 100644
--- a/src/curve_ed25519/curve_data.inc.c
+++ b/src/curve_ed25519/curve_data.inc.c
@@ -5,7 +5,6 @@
 
 #define SCALAR_LIMBS DECAF_255_SCALAR_LIMBS
 #define SCALAR_BITS DECAF_255_SCALAR_BITS
-#define NLIMBS DECAF_255_LIMBS
 #define scalar_t decaf_255_scalar_t
 #define point_t decaf_255_point_t
 #define precomputed_s decaf_255_precomputed_s
diff --git a/src/curve_ed448goldilocks/curve_data.inc.c b/src/curve_ed448goldilocks/curve_data.inc.c
index b42c944..b5c8217 100644
--- a/src/curve_ed448goldilocks/curve_data.inc.c
+++ b/src/curve_ed448goldilocks/curve_data.inc.c
@@ -4,7 +4,6 @@
 
 #define SCALAR_LIMBS DECAF_448_SCALAR_LIMBS
 #define SCALAR_BITS DECAF_448_SCALAR_BITS
-#define NLIMBS DECAF_448_LIMBS
 #define scalar_t decaf_448_scalar_t
 #define point_t decaf_448_point_t
 #define precomputed_s decaf_448_precomputed_s
diff --git a/src/decaf.c b/src/decaf.c
index 2025ca3..a690678 100644
--- a/src/decaf.c
+++ b/src/decaf.c
@@ -10,13 +10,14 @@
 
 #define _XOPEN_SOURCE 600 /* for posix_memalign */
 #define __STDC_WANT_LIB_EXT1__ 1 /* for memset_s */
-#include <decaf.h>
 #include <string.h>
 
 #include "word.h"
 #include "field.h"
 #include "decaf_config.h"
 
+#include <decaf.h>
+
 /* Include the curve data here */
 #include "curve_data.inc.c"
 
@@ -41,7 +42,10 @@ extern const gf SQRT_MINUS_ONE;
 extern const gf SQRT_ONE_MINUS_D; /* TODO: Intern this? */
 #endif
 
-#define WBITS DECAF_WORD_BITS
+/* FIXME: this can be different from DECAF_WORD_BITS, and word_t can be different from decaf_word_t,
+ * e.g. when mixing and matching implementations for different curves.  Homogenize this.
+ */
+#define WBITS WORD_BITS
 
 const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}};
 extern const scalar_t API_NS(sc_r2);
@@ -82,8 +86,8 @@ const size_t API_NS2(alignof,precomputed_s) = 32;
 #define UNROLL
 #endif
 
-#define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++)  { op; }}
-#define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<NLIMBS; i++)  { op; }}
+#define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<sizeof(gf)/sizeof(word_t); i++)  { op; }}
+#define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<sizeof(gf)/sizeof(word_t); i++)  { op; }}
 
 /** Copy x = y */
 static INLINE void
@@ -106,11 +110,11 @@ cond_neg(gf x, decaf_bool_t neg) {
 /** Constant time, if (swap) (x,y) = (y,x); */
 static INLINE void
 cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) {
-    FOR_LIMB_U(i, {
+    UNROLL for (unsigned int i=0; i<sizeof(x->limb)/sizeof(x->limb[0]); i++) {
         decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap;
         x->limb[i] ^= s;
         y->limb[i] ^= s;
-    });
+    }
 }
 
 /** Compare a==b */
@@ -123,9 +127,11 @@ gf_eq(const gf a, const gf b) {
     gf_sub(c,a,b);
     gf_strong_reduce(c);
     decaf_word_t ret=0;
-    FOR_LIMB(i, ret |= c->limb[i] );
-    /* Hope the compiler is too dumb to optimize this, thus noinline */
-    return ((decaf_dword_t)ret - 1) >> WBITS;
+    for (unsigned int i=0; i<sizeof(c->limb)/sizeof(c->limb[0]); i++) {
+        ret |= c->limb[i];
+    }
+
+    return word_is_zero(ret);
 }
 
 /** Inverse square root using addition chain. */
@@ -385,7 +391,7 @@ API_NS(scalar_eq) (
     for (i=0; i<SCALAR_LIMBS; i++) {
         diff |= a->limb[i] ^ b->limb[i];
     }
-    return (((decaf_dword_t)diff)-1)>>WBITS;
+    return word_is_zero(diff);
 }
 
 /* *** API begins here *** */    
@@ -1280,7 +1286,7 @@ API_NS(invert_elligator_nonuniform) (
     const point_t p,
     uint16_t hint_
 ) {
-    uint64_t hint = hint_;
+    decaf_bool_t hint = hint_;
     decaf_bool_t sgn_s = -(hint & 1),
         sgn_t_over_s = -(hint>>1 & 1),
         sgn_r0 = -(hint>>2 & 1),
@@ -1293,13 +1299,13 @@ API_NS(invert_elligator_nonuniform) (
     gf_sub(b,ONE,b); /* t+1 */
     gf_sqr(c,a); /* s^2 */
     decaf_bool_t is_identity = gf_eq(p->t,ZERO);
-    {   /* identity adjustments */
+    {
+        /* identity adjustments */
         /* in case of identity, currently c=0, t=0, b=1, will encode to 1 */
         /* if hint is 0, -> 0 */
         /* if hint is to neg t/s, then go to infinity, effectively set s to 1 */
         cond_sel(c,c,ONE,is_identity & sgn_t_over_s);
-        cond_sel(b,b,ZERO,is_identity & ~sgn_t_over_s & ~sgn_s); /* identity adjust */
-        
+        cond_sel(b,b,ZERO,is_identity & ~sgn_t_over_s & ~sgn_s); /* identity adjust */        
     }
     gf_mulw_sgn(d,c,2*EDWARDS_D-1); /* $d = (2d-a)s^2 */
     gf_add(a,b,d); /* num? */
diff --git a/src/decaf_gen_tables.c b/src/decaf_gen_tables.c
index de917d6..85feced 100644
--- a/src/decaf_gen_tables.c
+++ b/src/decaf_gen_tables.c
@@ -11,9 +11,10 @@
 #define _XOPEN_SOURCE 600 /* for posix_memalign */
 #include <stdio.h>
 #include <stdlib.h>
+
+#include "field.h"
 #include "decaf.h"
 #include "decaf_config.h"
-#include "field.h"
 
 #define GEN_TABLES
 #include "curve_data.inc.c"
@@ -91,8 +92,8 @@ int main(int argc, char **argv) {
     unsigned i;
     
     printf("/** @warning: this file was automatically generated. */\n");
-    printf("#include <decaf.h>\n\n");
     printf("#include \"field.h\"\n\n");
+    printf("#include <decaf.h>\n\n");
     printf("#define API_NS(_id) %s_##_id\n", API_NAME);
     printf("#define API_NS2(_pref,_id) _pref##_%s_##_id\n", API_NAME);
     
diff --git a/src/gen_headers/curve_data.py b/src/gen_headers/curve_data.py
index 772a217..ed0e901 100644
--- a/src/gen_headers/curve_data.py
+++ b/src/gen_headers/curve_data.py
@@ -21,7 +21,6 @@ curve_data = {
         "name" : "IsoEd25519",
         "cxx_ns" : "IsoEd25519",
         "shortname" : "255",
-        "longnum" : "25519",
         "c_ns" : "decaf_255",
         "cofactor" : 8,
         "field" : "p25519",
@@ -32,7 +31,6 @@ curve_data = {
         "name" : "Ed448-Goldilocks",
         "cxx_ns" : "Ed448Goldilocks",
         "shortname" : "448",
-        "longnum" : "448",
         "c_ns" : "decaf_448",
         "cofactor" : 4,
         "field" : "p448",
diff --git a/src/gen_headers/decaf_h.py b/src/gen_headers/decaf_h.py
index 8a6151f..f092e61 100644
--- a/src/gen_headers/decaf_h.py
+++ b/src/gen_headers/decaf_h.py
@@ -13,7 +13,6 @@ extern "C" {
 #endif
 
 /** @cond internal */
-#define %(C_NS)s_LIMBS (%(gf_impl_bits)d/DECAF_WORD_BITS)
 #define %(C_NS)s_SCALAR_LIMBS ((%(scalar_bits)d-1)/DECAF_WORD_BITS+1)
 /** @endcond */
 
@@ -21,13 +20,13 @@ extern "C" {
 #define %(C_NS)s_SCALAR_BITS %(scalar_bits)d
 
 /** @cond internal */
-#ifndef __%(C_NS)s_GF_DEFINED__
-#define __%(C_NS)s_GF_DEFINED__ 1
+#ifndef __DECAF_%(gf_shortname)s_GF_DEFINED__
+#define __DECAF_%(gf_shortname)s_GF_DEFINED__ 1
 /** @brief Galois field element internal structure */
-typedef struct gf_%(longnum)s_s {
-    decaf_word_t limb[%(C_NS)s_LIMBS];
-} __attribute__((aligned(32))) gf_%(longnum)s_s, gf_%(longnum)s_t[1];
-#endif /* __%(C_NS)s_GF_DEFINED__ */
+typedef struct gf_%(gf_shortname)s_s {
+    decaf_word_t limb[%(gf_impl_bits)d/DECAF_WORD_BITS];
+} __attribute__((aligned(32))) gf_%(gf_shortname)s_s, gf_%(gf_shortname)s_t[1];
+#endif /* __DECAF_%(gf_shortname)s_GF_DEFINED__ */
 /** @endcond */
 
 /** Number of bytes in a serialized point. */
@@ -39,7 +38,7 @@ typedef struct gf_%(longnum)s_s {
 /** Twisted Edwards extended homogeneous coordinates */
 typedef struct %(c_ns)s_point_s {
     /** @cond internal */
-    gf_%(longnum)s_t x,y,z,t;
+    gf_%(gf_shortname)s_t x,y,z,t;
     /** @endcond */
 } %(c_ns)s_point_t[1];
 
diff --git a/src/gen_headers/f_field_h.py b/src/gen_headers/f_field_h.py
index a06360b..388faba 100644
--- a/src/gen_headers/f_field_h.py
+++ b/src/gen_headers/f_field_h.py
@@ -10,9 +10,13 @@ f_field_h = gen_file(
 #include <string.h>
 #include <assert.h>
 
-#include "decaf/decaf_%(gf_bits)s.h" /* HACK in genheader */
 #include "word.h"
 
+#define __DECAF_%(gf_shortname)s_GF_DEFINED__ 1
+typedef struct gf_%(gf_shortname)s_s {
+    word_t limb[%(gf_impl_bits)d/sizeof(word_t)/8];
+} __attribute__((aligned(32))) gf_%(gf_shortname)s_s, gf_%(gf_shortname)s_t[1];
+
 #define GF_LIT_LIMB_BITS  %(gf_lit_limb_bits)d
 #define GF_BITS           %(gf_bits)d
 #define gf                gf_%(gf_shortname)s_t
@@ -57,4 +61,4 @@ mask_t gf_deserialize (gf x, const uint8_t serial[(GF_BITS-1)/8+1]);
 #endif
 
 #include "f_impl.h" /* Bring in the inline implementations */
-""")
\ No newline at end of file
+""")
diff --git a/src/include/arch_32/arch_intrinsics.h b/src/include/arch_32/arch_intrinsics.h
new file mode 100644
index 0000000..4e9d159
--- /dev/null
+++ b/src/include/arch_32/arch_intrinsics.h
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+#ifndef __ARCH_ARCH_32_ARCH_INTRINSICS_H__
+#define __ARCH_ARCH_32_ARCH_INTRINSICS_H__
+
+#define WORD_BITS 32
+
+static __inline__ __attribute((always_inline,unused))
+uint32_t word_is_zero(uint32_t a) { /* mask result: 0xFFFFFFFF if a == 0, and 0 otherwise */
+    /* Branch-free as written; let's hope the compiler isn't clever enough to turn it into a branch. */
+    return (((uint64_t)a)-1)>>32; /* 0-1 wraps to all-ones in 64 bits, so the top half is all-ones only for a==0 */
+}
+
+static __inline__ __attribute((always_inline,unused))
+uint64_t widemul(uint32_t a, uint32_t b) { /* full 64-bit product of two 32-bit words */
+    return ((uint64_t)a) * b; /* widen one operand first so the multiply is carried out in 64 bits */
+}
+
+#endif /* __ARCH_ARCH_32_ARCH_INTRINSICS_H__ */
+
diff --git a/src/include/arch_arm_32/arch_intrinsics.h b/src/include/arch_arm_32/arch_intrinsics.h
new file mode 100644
index 0000000..86080b1
--- /dev/null
+++ b/src/include/arch_arm_32/arch_intrinsics.h
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+#ifndef __ARCH_ARM_32_ARCH_INTRINSICS_H__
+#define __ARCH_ARM_32_ARCH_INTRINSICS_H__
+
+#define WORD_BITS 32
+
+static __inline__ __attribute((always_inline,unused))
+uint32_t word_is_zero(uint32_t a) { /* mask result: all-ones if a == 0, and 0 otherwise */
+    uint32_t ret;
+    asm("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc"); /* a-1 borrows only when a==0; sbc of a register with itself spreads that borrow over every bit */
+    return ret;
+}
+
+static __inline__ __attribute((always_inline,unused))
+uint64_t widemul(uint32_t a, uint32_t b) { /* full 64-bit product of two 32-bit words */
+    /* Plain C widening multiply.  Could be UMULL, but it's hard to tell the compiler that the registers must differ. */
+    return ((uint64_t)a) * b; /* widen one operand first so the multiply is carried out in 64 bits */
+}
+
+#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
+
diff --git a/src/include/arch_neon/arch_intrinsics.h b/src/include/arch_neon/arch_intrinsics.h
new file mode 100644
index 0000000..b138796
--- /dev/null
+++ b/src/include/arch_neon/arch_intrinsics.h
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+#ifndef __ARCH_NEON_ARCH_INTRINSICS_H__
+#define __ARCH_NEON_ARCH_INTRINSICS_H__
+
+#define WORD_BITS 32
+
+static __inline__ __attribute((always_inline,unused))
+uint32_t word_is_zero(uint32_t a) { /* mask result: all-ones if a == 0, and 0 otherwise */
+    uint32_t ret;
+    asm("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc"); /* a-1 borrows only when a==0; sbc of a register with itself spreads that borrow over every bit */
+    return ret;
+}
+
+static __inline__ __attribute((always_inline,unused))
+uint64_t widemul(uint32_t a, uint32_t b) { /* full 64-bit product of two 32-bit words */
+    /* Plain C widening multiply.  Could be UMULL, but it's hard to tell the compiler that the registers must differ. */
+    return ((uint64_t)a) * b; /* widen one operand first so the multiply is carried out in 64 bits */
+}
+
+#endif /* __ARCH_NEON_ARCH_INTRINSICS_H__ */
+
diff --git a/src/include/arch_ref64/arch_intrinsics.h b/src/include/arch_ref64/arch_intrinsics.h
new file mode 100644
index 0000000..8413a2e
--- /dev/null
+++ b/src/include/arch_ref64/arch_intrinsics.h
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+#ifndef __ARCH_REF64_ARCH_INTRINSICS_H__
+#define __ARCH_REF64_ARCH_INTRINSICS_H__
+
+#define WORD_BITS 64
+
+static __inline__ __attribute((always_inline,unused))
+uint64_t word_is_zero(uint64_t a) { /* mask result: all-ones if a == 0, and 0 otherwise */
+    /* Branch-free as written; let's hope the compiler isn't clever enough to turn it into a branch. */
+    return (((__uint128_t)a)-1)>>64; /* 0-1 wraps to all-ones in 128 bits, so the top half is all-ones only for a==0 */
+}
+
+static __inline__ __attribute((always_inline,unused))
+__uint128_t widemul(uint64_t a, uint64_t b) { /* full 128-bit product of two 64-bit words */
+    return ((__uint128_t)a) * b; /* must return the wide type: a uint64_t return would silently drop the high 64 bits */
+}
+
+#endif /* __ARCH_REF64_ARCH_INTRINSICS_H__ */
+
diff --git a/src/include/arch_x86_64/arch_intrinsics.h b/src/include/arch_x86_64/arch_intrinsics.h
index d2b03e1..843f337 100644
--- a/src/include/arch_x86_64/arch_intrinsics.h
+++ b/src/include/arch_x86_64/arch_intrinsics.h
@@ -5,6 +5,8 @@
 #ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__
 #define __ARCH_X86_64_ARCH_INTRINSICS_H__
 
+#define WORD_BITS 64
+
 #include <stdint.h>
 
 /* FUTURE: non x86-64 versions of these.
@@ -294,7 +296,7 @@ static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *
   *acc = (((__uint128_t)(d))<<64) | c;
 }
 
-static __inline__ uint64_t is_zero(uint64_t x) {
+static __inline__ uint64_t word_is_zero(uint64_t x) {
   __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
   return ~x;
 }
diff --git a/src/include/field.h b/src/include/field.h
index 0121c39..9850f1c 100644
--- a/src/include/field.h
+++ b/src/include/field.h
@@ -74,7 +74,6 @@ gf_add (
 
 /** Subtract mod p.  Bias by 2 and don't reduce  */
 static inline void gf_sub_nr ( gf c, const gf a, const gf b ) {
-//    FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] );
     gf_sub_RAW(c,a,b);
     gf_bias(c, 2);
     if (DECAF_WORD_BITS==32) gf_weak_reduce(c); // HACK
diff --git a/src/include/word.h b/src/include/word.h
index b44a92e..2261b13 100644
--- a/src/include/word.h
+++ b/src/include/word.h
@@ -8,7 +8,7 @@
 /* for posix_memalign */
 #define _XOPEN_SOURCE 600
 
-#include "arch_config.h"
+#include <stdint.h>
 #include "arch_intrinsics.h"
 
 #include <decaf/common.h>
@@ -21,7 +21,6 @@
 #include <endian.h>
 #endif
 
-#include <stdint.h>
 #include <stdlib.h>
 #include <sys/types.h>
 #include <inttypes.h>
@@ -64,7 +63,7 @@
     #define U56LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
     #define U60LE(x) (x##ull)&((1ull<<30)-1), (x##ull)>>30
     #define letohWORD letoh32
-    #define SC_LIMB(x) (x##ull)
+    #define SC_LIMB(x) ((uint32_t)x##ull),(x##ull>>32)
 #else
     #error "For now, libdecaf only supports 32- and 64-bit architectures."
 #endif
@@ -159,14 +158,6 @@ typedef struct {
 typedef struct {
     uint32xn_t unaligned;
 } __attribute__((packed)) unaligned_uint32xn_t;
-    
-/**
- * Return -1 if x==0, and 0 otherwise.
- */
-static INLINE UNUSED mask_t
-word_is_zero(word_t x) {
-    return (mask_t)((((dword_t)(x)) - 1)>>WORD_BITS);
-}
 
 #if __AVX2__
     static INLINE big_register_t
@@ -185,15 +176,10 @@ word_is_zero(word_t x) {
         return vceqq_u32(x,x^x);
     }
 #else
-    static INLINE mask_t
-    br_is_zero(word_t x) {
-        return (((dword_t)x) - 1)>>WORD_BITS;
-    }
+    #define br_is_zero word_is_zero
 #endif
 
 
-
-
 #ifdef __APPLE__
     static INLINE uint64_t htole64 (uint64_t x) { return x; }
     static INLINE uint64_t letoh64 (uint64_t x) { return x; }
diff --git a/src/p25519/arch_32/f_impl.c b/src/p25519/arch_32/f_impl.c
new file mode 100644
index 0000000..cfc3fb3
--- /dev/null
+++ b/src/p25519/arch_32/f_impl.c
@@ -0,0 +1,178 @@
+/* Copyright (c) 2016 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+#include "f_field.h"
+
+void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { /* cs = as*bs; limbs alternate 26 bits (even index) and 25 bits (odd index) */
+    const uint32_t *a = as->limb, *b = bs->limb, maske = ((1<<26)-1), masko = ((1<<25)-1);
+    
+    uint64_t bh[9]; /* bh[k] = 19*b[k+1]: pre-scaled limbs for partial products that wrap past limb 9 */
+    int i,j;
+    for (i=0; i<9; i++) bh[i] = b[i+1] * 19; /* NOTE(review): product is formed in 32 bits; assumes limbs are small enough not to overflow -- confirm bound */
+    
+    uint32_t *c = cs->limb;
+
+    uint64_t accum = 0; /* running column sum; whatever is left after masking carries into the next limb */
+    for (i=0; i<10; /*i+=2*/) {
+        /* Even case: an odd*odd limb product sits one bit above the column's place value, hence the 2*. */
+        for (j=0; j<i; /*j+=2*/) {
+            accum += widemul(b[i-j], a[j]); j++;
+            accum += widemul(2*b[i-j], a[j]); j++;
+        }
+        accum += widemul(b[0], a[j]); j++;
+        accum += widemul(2*bh[8], a[j]); j++; /* first wrapped term: a[i+1]*b[9], scaled by 19 via bh */
+        for (; j<10; /* j+=2*/) {
+            accum += widemul(bh[i-j+9], a[j]); j++;
+            accum += widemul(2*bh[i-j+9], a[j]); j++;
+        }
+        c[i] = accum & maske;
+        accum >>= 26;
+        i++;
+
+        /* Odd case is easier: all place values are exact. */
+        for (j=0; j<=i; j++) {
+            accum += widemul(b[i-j], a[j]);
+        }
+        for (; j<10; j++) {
+            accum += widemul(bh[i-j+9], a[j]);
+        }
+        c[i] = accum & masko;
+        accum >>= 25;
+        i++;
+    }
+    
+    accum *= 19; /* the carry out of limb 9 re-enters at limb 0 scaled by 19 */
+    accum += c[0];
+    c[0] = accum & maske;
+    accum >>= 26;
+    
+    assert(accum < masko); /* the last carry is added to limb 1 without another carry pass */
+    c[1] += accum;
+}
+
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { /* cs = as*b for a small scalar b; same 26/25-bit limb layout as gf_mul */
+    const uint32_t *a = as->limb, maske = ((1<<26)-1), masko = ((1<<25)-1);
+    uint32_t blo = b & maske, bhi = b>>26, bhi2 = 2*bhi; /* b = blo + bhi*2^26; bits of b above what fits in the 32-bit bhi are dropped */
+    uint32_t *c = cs->limb;
+    uint64_t accum = 0;
+
+    accum = widemul(blo, a[0]) + widemul(bhi*38,a[9]); /* bhi*a[9] wraps past the top limb: 19 for the wrap, 2 for the place-value offset */
+    c[0] = accum & maske;
+    accum >>= 26;
+
+    accum += widemul(blo, a[1]) + widemul(bhi,a[0]);
+    c[1] = accum & masko;
+    accum >>= 25;
+
+    for (int i=2; i<10; /*i+=2*/) {
+        accum += widemul(blo, a[i]) + widemul(bhi2, a[i-1]); /* even limb: bhi times the odd limb below lands one bit high, so use 2*bhi */
+        c[i] = accum & maske;
+        accum >>= 26;
+        i++;
+
+        accum += widemul(blo, a[i]) + widemul(bhi, a[i-1]); /* odd limb: place values line up exactly */
+        c[i] = accum & masko;
+        accum >>= 25;
+        i++;
+    }
+    
+    accum *= 19; /* the carry out of limb 9 re-enters at limb 0 scaled by 19 */
+    accum += c[0];
+    c[0] = accum & maske;
+    accum >>= 26;
+    
+    assert(accum < masko); /* the last carry is added to limb 1 without another carry pass */
+    c[1] += accum;
+}
+
+void gf_sqr (gf_s *__restrict__ cs, const gf as) { /* cs = as^2 */
+    gf_mul(cs,as,as); // PERF: squares through the general multiply; no dedicated squaring routine here
+}
+
+void gf_strong_reduce (gf a) { /* reduce a in place by subtracting p, then adding p back under a mask if that went negative */
+    uint32_t maske = (1<<26)-1, masko = (1<<25)-1;
+
+    /* first, clear high: fold any bits above limb 9's 25 bits into limb 0, scaled by 19 */
+    a->limb[0] += (a->limb[9]>>25)*19;
+    a->limb[9] &= masko;
+
+    /* now the total is less than 2p */
+
+    /* compute total_value - p.  No need to reduce mod p. */
+    int64_t scarry = 0; /* signed running borrow; the >>= below relies on arithmetic right shift of negative values */
+    int i;
+    for (i=0; i<10; /*i+=2*/) {
+        scarry = scarry + a->limb[i] - ((i==0)?maske-18:maske); /* maske-18 = 2^26-19, the low limb of p */
+        a->limb[i] = scarry & maske;
+        scarry >>= 26;
+        i++;
+
+        scarry = scarry + a->limb[i] - masko;
+        a->limb[i] = scarry & masko;
+        scarry >>= 25;
+        i++;
+    }
+
+    /* uncommon case: the value was >= p, so now scarry = 0 and the limbs hold value - p
+     * common case: the value was < p, so now scarry = -1 and the limbs hold value - p + 2^255
+     * so let's add p back in under the scarry mask.  will carry back off the top for 2^255.
+     */
+
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1)); /* scarry must be exactly 0 or -1 */
+
+    uint32_t scarry_masko = scarry & masko, scarry_maske = scarry & maske; /* limbs of p when scarry = -1, all zero when scarry = 0 */
+    uint64_t carry = 0;
+
+    /* add it back */
+    for (i=0; i<10; /*i+=2*/) {
+        carry = carry + a->limb[i] + ((i==0)?(scarry_maske&~18):scarry_maske); /* maske&~18 equals maske-18 because maske is all ones */
+        a->limb[i] = carry & maske;
+        carry >>= 26;
+        i++;
+
+        carry = carry + a->limb[i] + scarry_masko;
+        a->limb[i] = carry & masko;
+        carry >>= 25;
+        i++;
+    }
+
+    assert(word_is_zero(carry + scarry)); /* the carry off the top must cancel the earlier borrow */
+}
+
+#define LIMB_PLACE_VALUE(i) (((i)&1)?25:26) /* bit width of limb i: 26 for even indices, 25 for odd */
+void gf_serialize (uint8_t serial[32], const gf x) { /* write the strongly reduced value of x as 32 bytes, least-significant byte first */
+    gf red;
+    gf_copy(red, x); /* work on a copy so x itself is not modified */
+    gf_strong_reduce(red);
+    unsigned int j=0, fill=0; /* j: next limb to load; fill: number of valid bits currently in buffer */
+    dword_t buffer = 0;
+    for (unsigned int i=0; i<32; i++) {
+        if (fill < 8 && j < sizeof(red->limb)/sizeof(red->limb[0])) { /* not enough bits for a byte: pull in the next limb */
+            buffer |= ((dword_t)red->limb[j]) << fill;
+            fill += LIMB_PLACE_VALUE(j);
+            j++;
+        }
+        serial[i] = buffer; /* emits the low 8 bits */
+        fill -= 8; /* may wrap on the final byte once all limbs are consumed; fill is not read after that */
+        buffer >>= 8;
+    }
+}
+
+mask_t gf_deserialize (gf x, const uint8_t serial[32]) { /* unpack 32 bytes (least-significant first) into 26/25-bit limbs; currently always reports success */
+    unsigned int j=0, fill=0; /* j: next limb to store; fill: number of valid bits currently in buffer */
+    dword_t buffer = 0;
+    for (unsigned int i=0; i<32; i++) {
+        buffer |= ((dword_t)serial[i]) << fill;
+        fill += 8;
+        if (fill >= LIMB_PLACE_VALUE(j) || i == 31) { /* enough bits for limb j, or last byte: flush what is left */
+            assert(j < sizeof(x->limb)/sizeof(x->limb[0]));
+            word_t mask = ((1ull)<<LIMB_PLACE_VALUE(j))-1;
+            x->limb[j] = (i==31) ? buffer : (buffer & mask); // last limb takes the whole remaining buffer unmasked.  FIXME: this can in theory truncate the buffer if it's not in field.
+            buffer >>= LIMB_PLACE_VALUE(j);
+            fill -= LIMB_PLACE_VALUE(j);
+            j++;
+        }
+    }
+    return -1; // all-ones mask = success, returned unconditionally.  FIXME: test whether in field.
+}
diff --git a/src/p25519/arch_32/f_impl.h b/src/p25519/arch_32/f_impl.h
new file mode 100644
index 0000000..5e51bf0
--- /dev/null
+++ b/src/p25519/arch_32/f_impl.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2014-2016 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+#define LIMB(x) (x##ull)&((1ull<<26)-1), (x##ull)>>26 /* expands to two initializers: the low 26 bits of x, then the remaining high bits */
+#define FIELD_LITERAL(a,b,c,d,e) \
+    {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e)}} /* five literals, two limbs each: a ten-limb initializer */
+
+void gf_add_RAW (gf out, const gf a, const gf b) { /* out = a + b, limb by limb, followed by one carry pass */
+    for (unsigned int i=0; i<10; i++) {
+        out->limb[i] = a->limb[i] + b->limb[i]; /* plain 32-bit add; assumes the inputs leave headroom -- TODO confirm */
+    }
+    gf_weak_reduce(out); /* pull each limb back toward its 26/25-bit width */
+}
+
+void gf_sub_RAW (gf out, const gf a, const gf b) { /* out = a - b plus a per-limb bias, followed by one carry pass */
+    uint32_t coe = ((1ull<<26)-1)*2, coo = ((1ull<<25)-1)*2, co0 = coe-36; /* bias limbs: 2*(2^26-1), 2*(2^25-1), and 2*(2^26-19) for limb 0 */
+    for (unsigned int i=0; i<10; i+=2) { /* handles one even and one odd limb per iteration */
+        out->limb[i] = a->limb[i] - b->limb[i] + ((i==0) ? co0 : coe);
+        out->limb[i+1] = a->limb[i+1] - b->limb[i+1] + coo;
+    }
+    gf_weak_reduce(out); /* pull each limb back toward its 26/25-bit width */
+}
+
+void gf_bias (gf a, int amt) { /* no-op in this implementation; gf_sub_RAW in this file adds its own bias */
+    (void) a; /* both parameters are deliberately unused */
+    (void) amt;
+}
+
+void gf_weak_reduce (gf a) { /* one carry pass: each limb keeps its low 26/25 bits and absorbs the overflow of the limb below */
+    uint32_t maske = (1ull<<26) - 1, masko = (1ull<<25) - 1;
+    uint32_t tmp = a->limb[9] >> 25; /* overflow of the top limb, saved before limb 9 is masked */
+    for (unsigned int i=8; i>0; i-=2) { /* walks top-down so each limb's overflow is read before that limb is masked */
+        a->limb[i+1] = (a->limb[i+1] & masko) + (a->limb[i]>>26);
+        a->limb[i] = (a->limb[i] & maske) + (a->limb[i-1]>>25);
+    }
+    a->limb[1] = (a->limb[1] & masko) + (a->limb[0]>>26);
+    a->limb[0] = (a->limb[0] & maske) + tmp*19; /* the top limb's overflow wraps around to limb 0 scaled by 19 */
+}
+
diff --git a/src/p25519/arch_ref64/arch_config.h b/src/p25519/arch_ref64/arch_config.h
deleted file mode 100644
index b9504c3..0000000
--- a/src/p25519/arch_ref64/arch_config.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define WORD_BITS 64
-#define DECAF_255_LIMB_BITS 51
\ No newline at end of file
diff --git a/src/p25519/arch_ref64/f_impl.c b/src/p25519/arch_ref64/f_impl.c
index 7afd485..414fd66 100644
--- a/src/p25519/arch_ref64/f_impl.c
+++ b/src/p25519/arch_ref64/f_impl.c
@@ -4,18 +4,6 @@
 
 #include "f_field.h"
 
-static __inline__ __uint128_t widemul(
-    const uint64_t a,
-    const uint64_t b
-) {
-    return ((__uint128_t)a) * ((__uint128_t)b);
-}
-
-static __inline__ uint64_t is_zero(uint64_t a) {
-    /* let's hope the compiler isn't clever enough to optimize this. */
-    return (((__uint128_t)a)-1)>>64;
-}
-
 void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
     
@@ -95,7 +83,7 @@ void gf_strong_reduce (gf a) {
     * so let's add back in p.  will carry back off the top for 2^255.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     uint64_t scarry_mask = scarry & mask;
     __uint128_t carry = 0;
@@ -107,15 +95,15 @@ void gf_strong_reduce (gf a) {
         carry >>= 51;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
-void gf_serialize (uint8_t serial[32], const struct gf x) {
+void gf_serialize (uint8_t serial[32], const gf x) {
     int i,j;
     gf red;
-    gf_copy(&red, x);
-    gf_strong_reduce(&red);
-    uint64_t *r = red.limb;
+    gf_copy(red, x);
+    gf_strong_reduce(red);
+    uint64_t *r = red->limb;
     uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
     for (i=0; i<4; i++) {
         for (j=0; j<8; j++) {
@@ -149,5 +137,5 @@ mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
     x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask;
     x->limb[4] = ser64[3]>>12;
     
-    return ~is_zero(~ge);
+    return ~word_is_zero(~ge);
 }
diff --git a/src/p25519/arch_x86_64/arch_config.h b/src/p25519/arch_x86_64/arch_config.h
deleted file mode 100644
index 6d2cbd9..0000000
--- a/src/p25519/arch_x86_64/arch_config.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define WORD_BITS 64
-#define DECAF_255_LIMB_BITS 51
diff --git a/src/p25519/arch_x86_64/f_impl.c b/src/p25519/arch_x86_64/f_impl.c
index 168dbd5..0b02519 100644
--- a/src/p25519/arch_x86_64/f_impl.c
+++ b/src/p25519/arch_x86_64/f_impl.c
@@ -194,7 +194,7 @@ void gf_strong_reduce (gf a) {
     * so let's add back in p.  will carry back off the top for 2^255.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     uint64_t scarry_mask = scarry & mask;
     __uint128_t carry = 0;
@@ -206,7 +206,7 @@ void gf_strong_reduce (gf a) {
         carry >>= 51;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
 void gf_serialize (uint8_t serial[32], const gf x) {
@@ -248,5 +248,5 @@ mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
     x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask;
     x->limb[4] = ser64[3]>>12;
     
-    return ~is_zero(~ge);
+    return ~word_is_zero(~ge);
 }
diff --git a/src/p448/arch_32/arch_config.h b/src/p448/arch_32/arch_config.h
deleted file mode 100644
index d4ada31..0000000
--- a/src/p448/arch_32/arch_config.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define WORD_BITS 32
-#define DECAF_448_LIMB_BITS 28
diff --git a/src/p448/arch_32/f_impl.c b/src/p448/arch_32/f_impl.c
index 739b1fb..24e8fe2 100644
--- a/src/p448/arch_32/f_impl.c
+++ b/src/p448/arch_32/f_impl.c
@@ -4,19 +4,6 @@
 
 #include "f_field.h"
 
-static inline mask_t is_zero (word_t x) {
-    dword_t xx = x;
-    xx--;
-    return xx >> WORD_BITS;
-}
-
-static uint64_t widemul (
-    const uint32_t a,
-    const uint32_t b
-) {
-    return ((uint64_t)a)* b;
-}
-
 void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { 
     const uint32_t *a = as->limb, *b = bs->limb;
     uint32_t *c = cs->limb;
@@ -141,7 +128,7 @@ void gf_strong_reduce (gf a) {
     * so let's add back in p.  will carry back off the top for 2^448.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     word_t scarry_mask = scarry & mask;
     dword_t carry = 0;
@@ -153,7 +140,7 @@ void gf_strong_reduce (gf a) {
         carry >>= 28;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
 void gf_serialize (uint8_t *serial, const gf x) {
@@ -195,13 +182,13 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
     }
     
     /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
-    ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask);
+    ge = (ge & (x->limb[8] + 1)) | word_is_zero(x->limb[8] ^ mask);
     
     /* Propagate the rest */
     for (i=9; i<16; i++) {
         ge &= x->limb[i];
     }
     
-    return ~is_zero(ge ^ mask);
+    return ~word_is_zero(ge ^ mask);
 }
 
diff --git a/src/p448/arch_arm_32/arch_config.h b/src/p448/arch_arm_32/arch_config.h
deleted file mode 100644
index d4ada31..0000000
--- a/src/p448/arch_arm_32/arch_config.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define WORD_BITS 32
-#define DECAF_448_LIMB_BITS 28
diff --git a/src/p448/arch_arm_32/f_impl.c b/src/p448/arch_arm_32/f_impl.c
index 62eda0f..b1719ad 100644
--- a/src/p448/arch_arm_32/f_impl.c
+++ b/src/p448/arch_arm_32/f_impl.c
@@ -4,19 +4,6 @@
 
 #include "f_field.h"
 
-static inline mask_t is_zero (word_t x) {
-    dword_t xx = x;
-    xx--;
-    return xx >> WORD_BITS;
-}
-
-static uint64_t widemul (
-    const uint32_t a,
-    const uint32_t b
-) {
-    return ((uint64_t)a)* b;
-}
-
 static inline void __attribute__((gnu_inline,always_inline))
 smlal (
     uint64_t *acc,
@@ -874,7 +861,7 @@ void gf_strong_reduce (
     * so let's add back in p.  will carry back off the top for 2^448.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     word_t scarry_mask = scarry & mask;
     dword_t carry = 0;
@@ -886,7 +873,7 @@ void gf_strong_reduce (
         carry >>= 28;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
 void gf_serialize (
@@ -935,12 +922,12 @@ gf_deserialize (
     }
     
     /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
-    ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask);
+    ge = (ge & (x->limb[8] + 1)) | word_is_zero(x->limb[8] ^ mask);
     
     /* Propagate the rest */
     for (i=9; i<16; i++) {
         ge &= x->limb[i];
     }
     
-    return ~is_zero(ge ^ mask);
+    return ~word_is_zero(ge ^ mask);
 }
diff --git a/src/p448/arch_neon_experimental/f_impl.c b/src/p448/arch_neon/f_impl.c
similarity index 98%
rename from src/p448/arch_neon_experimental/f_impl.c
rename to src/p448/arch_neon/f_impl.c
index 1225f5e..845f31e 100644
--- a/src/p448/arch_neon_experimental/f_impl.c
+++ b/src/p448/arch_neon/f_impl.c
@@ -4,15 +4,6 @@
 
 #include "f_field.h"
 
-static inline mask_t __attribute__((always_inline))
-is_zero (
-    word_t x
-) {
-    dword_t xx = x;
-    xx--;
-    return xx >> WORD_BITS;
-}
-
 static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused))
 xx_vaddup_u64(uint64x2_t x) {
     __asm__ ("vadd.s64 %f0, %e0" : "+w"(x));
@@ -629,7 +620,7 @@ void gf_strong_reduce (gf a) {
     * so let's add back in p.  will carry back off the top for 2^448.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     word_t scarry_mask = scarry & mask;
     dword_t carry = 0;
@@ -641,7 +632,7 @@ void gf_strong_reduce (gf a) {
         carry >>= 28;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
 void gf_serialize (uint8_t *serial, const gf x) {
@@ -684,13 +675,13 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
     }
     
     /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
-    ge = (ge & (x->limb[LIMBPERM(8)] + 1)) | is_zero(x->limb[LIMBPERM(8)] ^ mask);
+    ge = (ge & (x->limb[LIMBPERM(8)] + 1)) | word_is_zero(x->limb[LIMBPERM(8)] ^ mask);
     
     /* Propagate the rest */
     for (i=9; i<16; i++) {
         ge &= x->limb[LIMBPERM(i)];
     }
     
-    return ~is_zero(ge ^ mask);
+    return ~word_is_zero(ge ^ mask);
 }
 
diff --git a/src/p448/arch_neon_experimental/f_impl.h b/src/p448/arch_neon/f_impl.h
similarity index 100%
rename from src/p448/arch_neon_experimental/f_impl.h
rename to src/p448/arch_neon/f_impl.h
diff --git a/src/p448/arch_neon_experimental/arch_config.h b/src/p448/arch_neon_experimental/arch_config.h
deleted file mode 100644
index e65216f..0000000
--- a/src/p448/arch_neon_experimental/arch_config.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#define WORD_BITS 32
-#define DECAF_448_LIMB_BITS 28
-
diff --git a/src/p448/arch_ref64/arch_config.h b/src/p448/arch_ref64/arch_config.h
deleted file mode 100644
index f58980e..0000000
--- a/src/p448/arch_ref64/arch_config.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#define WORD_BITS 64
-#define DECAF_448_LIMB_BITS 56
-
diff --git a/src/p448/arch_ref64/f_impl.c b/src/p448/arch_ref64/f_impl.c
index 74aeeb1..4717b0e 100644
--- a/src/p448/arch_ref64/f_impl.c
+++ b/src/p448/arch_ref64/f_impl.c
@@ -4,18 +4,6 @@
 
 #include "f_field.h"
 
-static __inline__ __uint128_t widemul(
-    const uint64_t a,
-    const uint64_t b
-) {
-    return ((__uint128_t)a) * ((__uint128_t)b);
-}
-
-static __inline__ uint64_t is_zero(uint64_t a) {
-    /* let's hope the compiler isn't clever enough to optimize this. */
-    return (((__uint128_t)a)-1)>>64;
-}
-
 void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     const uint64_t *a = as->limb, *b = bs->limb;
     uint64_t *c = cs->limb;
@@ -337,7 +325,7 @@ void gf_strong_reduce (gf a) {
     * so let's add back in p.  will carry back off the top for 2^448.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     uint64_t scarry_mask = scarry & mask;
     __uint128_t carry = 0;
@@ -349,7 +337,7 @@ void gf_strong_reduce (gf a) {
         carry >>= 56;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
 void gf_serialize (uint8_t *serial, const gf x) {
@@ -389,12 +377,12 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
     }
     
     /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
-    ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
+    ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask);
     
     /* Propagate the rest */
     for (i=5; i<8; i++) {
         ge &= x->limb[i];
     }
     
-    return ~is_zero(ge ^ mask);
+    return ~word_is_zero(ge ^ mask);
 }
diff --git a/src/p448/arch_x86_64/arch_config.h b/src/p448/arch_x86_64/arch_config.h
deleted file mode 100644
index 3f449f4..0000000
--- a/src/p448/arch_x86_64/arch_config.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define WORD_BITS 64
-#define DECAF_448_LIMB_BITS 56
diff --git a/src/p448/arch_x86_64/f_impl.c b/src/p448/arch_x86_64/f_impl.c
index 07744fa..8ebb569 100644
--- a/src/p448/arch_x86_64/f_impl.c
+++ b/src/p448/arch_x86_64/f_impl.c
@@ -315,7 +315,7 @@ void gf_strong_reduce (gf a) {
     * so let's add back in p.  will carry back off the top for 2^448.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     uint64_t scarry_mask = scarry & mask;
     __uint128_t carry = 0;
@@ -327,7 +327,7 @@ void gf_strong_reduce (gf a) {
         carry >>= 56;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
 void gf_serialize (uint8_t *serial, const gf x) {
@@ -367,13 +367,13 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
     }
     
     /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
-    ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
+    ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask);
     
     /* Propagate the rest */
     for (i=5; i<8; i++) {
         ge &= x->limb[i];
     }
     
-    return ~is_zero(ge ^ mask);
+    return ~word_is_zero(ge ^ mask);
 }
 
diff --git a/src/p480/arch_x86_64/arch_config.h b/src/p480/arch_x86_64/arch_config.h
deleted file mode 100644
index 58758cc..0000000
--- a/src/p480/arch_x86_64/arch_config.h
+++ /dev/null
@@ -1 +0,0 @@
-#define WORD_BITS 64
diff --git a/src/p480/arch_x86_64/f_impl.c b/src/p480/arch_x86_64/f_impl.c
index b3c565b..e021241 100644
--- a/src/p480/arch_x86_64/f_impl.c
+++ b/src/p480/arch_x86_64/f_impl.c
@@ -315,7 +315,7 @@ void gf_strong_reduce (gf *a) {
     * so let's add back in p.  will carry back off the top for 2^480.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     uint64_t scarry_mask = scarry & mask;
     __uint128_t carry = 0;
@@ -327,7 +327,7 @@ void gf_strong_reduce (gf *a) {
         carry >>= 60;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
 void gf_serialize (uint8_t *serial, const struct gf *x) {
@@ -381,13 +381,13 @@ mask_t gf_deserialize (gf *x, const uint8_t serial[60]) {
     }
     
     /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
-    ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
+    ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask);
     
     /* Propagate the rest */
     for (i=5; i<8; i++) {
         ge &= x->limb[i];
     }
     
-    return ~is_zero(ge ^ mask);
+    return ~word_is_zero(ge ^ mask);
 }
 
diff --git a/src/p521/arch_ref64/arch_config.h b/src/p521/arch_ref64/arch_config.h
deleted file mode 100644
index 58758cc..0000000
--- a/src/p521/arch_ref64/arch_config.h
+++ /dev/null
@@ -1 +0,0 @@
-#define WORD_BITS 64
diff --git a/src/p521/arch_ref64/f_impl.c b/src/p521/arch_ref64/f_impl.c
index 03c98ee..c3aee6f 100644
--- a/src/p521/arch_ref64/f_impl.c
+++ b/src/p521/arch_ref64/f_impl.c
@@ -4,18 +4,6 @@
 
 #include "f_field.h"
 
-static __inline__ __uint128_t widemul(
-    const uint64_t a,
-    const uint64_t b
-) {
-    return ((__uint128_t)a) * ((__uint128_t)b);
-}
-
-static __inline__ uint64_t is_zero(uint64_t a) {
-    /* let's hope the compiler isn't clever enough to optimize this. */
-    return (((__uint128_t)a)-1)>>64;
-}
-
 void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     uint64_t *c = cs->limb;
     const uint64_t *a = as->limb, *b = bs->limb;
@@ -318,7 +306,7 @@ void gf_strong_reduce (gf a) {
     * so let's add back in p.  will carry back off the top for 2^521.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     uint64_t scarry_mask = scarry & mask;
     __uint128_t carry = 0;
@@ -330,7 +318,7 @@ void gf_strong_reduce (gf a) {
         carry >>= (i==8) ? 57 : 58;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
 void gf_serialize (uint8_t *serial, const struct gf x) {
@@ -367,14 +355,14 @@ mask_t gf_deserialize (gf x, const uint8_t serial[66]) {
     }
     
     /* Check for reduction.  First, high has to be < 2^57 */
-    mask_t good = is_zero(out>>57);
+    mask_t good = word_is_zero(out>>57);
     
     uint64_t and = -1ull;
     for (i=0; i<8; i++) {
         and &= x->limb[i];
     }
     and &= (2*out+1);
-    good &= is_zero((and+1)>>58);
+    good &= word_is_zero((and+1)>>58);
     
     return good;
 }
diff --git a/src/p521/arch_x86_64_r12/arch_config.h b/src/p521/arch_x86_64_r12/arch_config.h
deleted file mode 100644
index 58758cc..0000000
--- a/src/p521/arch_x86_64_r12/arch_config.h
+++ /dev/null
@@ -1 +0,0 @@
-#define WORD_BITS 64
diff --git a/src/p521/arch_x86_64_r12/f_impl.c b/src/p521/arch_x86_64_r12/f_impl.c
index 39d0f1e..2040531 100644
--- a/src/p521/arch_x86_64_r12/f_impl.c
+++ b/src/p521/arch_x86_64_r12/f_impl.c
@@ -8,11 +8,6 @@ typedef struct {
   uint64x3_t lo, hi, hier;
 } nonad_t;
 
-static __inline__ uint64_t is_zero(uint64_t a) {
-    /* let's hope the compiler isn't clever enough to optimize this. */
-    return (((__uint128_t)a)-1)>>64;
-}
-
 static inline __uint128_t widemulu(uint64_t a, uint64_t b) {
     return ((__uint128_t)(a)) * b;
 }
@@ -378,7 +373,7 @@ void gf_strong_reduce (gf *a) {
     * so let's add back in p.  will carry back off the top for 2^521.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     uint64_t scarry_mask = scarry & mask;
     __uint128_t carry = 0;
@@ -390,7 +385,7 @@ void gf_strong_reduce (gf *a) {
         carry >>= (i==8) ? 57 : 58;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 
     a->limb[3] = a->limb[7] = a->limb[11] = 0;
 }
@@ -429,14 +424,14 @@ mask_t gf_deserialize (gf *x, const uint8_t serial[LIMBPERM(66)]) {
     }
     
     /* Check for reduction.  First, high has to be < 2^57 */
-    mask_t good = is_zero(out>>57);
+    mask_t good = word_is_zero(out>>57);
     
     uint64_t and = -1ull;
     for (i=0; i<8; i++) {
         and &= x->limb[LIMBPERM(i)];
     }
     and &= (2*out+1);
-    good &= is_zero((and+1)>>58);
+    good &= word_is_zero((and+1)>>58);
 
     x->limb[3] = x->limb[7] = x->limb[11] = 0;