From a5bed6b351c55dfe14fb9a3396d20b4cb4f7b6d6 Mon Sep 17 00:00:00 2001
From: Mike Hamburg <mike@shiftleft.org>
Date: Thu, 14 Jan 2016 23:35:30 -0800
Subject: [PATCH] Working on getting cross-arch working again.  Several TODOs.

Currently compiles and passes tests on x86_64 with arch_32 and
DECAF_FORCE_32_BIT=1 (as well as the native settings of course),
so that's a start.

Want to make serialization routine cross-arch.  Need to check that
perf is good enough (likely).  Current routine in p25519/arch_32
is almost cross-arch, but has known bugs (FIXMEs).  Needs to take
into account separate p and, for NEON, the LIMBPERM.

Want to decouple arches for each curve/field.  Currently the split
between decaf_word_t and word_t makes this fraught with peril.  Fix
is probably to rename decaf_word_t to decaf_api_word_t and fix it
to either uint32 or uint64, then make internal things separate per
field.  That way we don't have to try arch detection in the header,
which is nice.

Need to make decaf_gen_tables use SC_LIMB.  Might as well get rid
of API_NS there too.
---
 Makefile                                      |  32 +---
 src/curve_ed25519/curve_data.inc.c            |   1 -
 src/curve_ed448goldilocks/curve_data.inc.c    |   1 -
 src/decaf.c                                   |  34 ++--
 src/decaf_gen_tables.c                        |   5 +-
 src/gen_headers/curve_data.py                 |   2 -
 src/gen_headers/decaf_h.py                    |  15 +-
 src/gen_headers/f_field_h.py                  |   8 +-
 src/include/arch_32/arch_intrinsics.h         |  22 +++
 src/include/arch_arm_32/arch_intrinsics.h     |  24 +++
 src/include/arch_neon/arch_intrinsics.h       |  24 +++
 src/include/arch_ref64/arch_intrinsics.h      |  22 +++
 src/include/arch_x86_64/arch_intrinsics.h     |   4 +-
 src/include/field.h                           |   1 -
 src/include/word.h                            |  20 +-
 src/p25519/arch_32/f_impl.c                   | 178 ++++++++++++++++++
 src/p25519/arch_32/f_impl.h                   |  40 ++++
 src/p25519/arch_ref64/arch_config.h           |   2 -
 src/p25519/arch_ref64/f_impl.c                |  26 +--
 src/p25519/arch_x86_64/arch_config.h          |   2 -
 src/p25519/arch_x86_64/f_impl.c               |   6 +-
 src/p448/arch_32/arch_config.h                |   2 -
 src/p448/arch_32/f_impl.c                     |  21 +--
 src/p448/arch_arm_32/arch_config.h            |   2 -
 src/p448/arch_arm_32/f_impl.c                 |  21 +--
 .../f_impl.c                                  |  17 +-
 .../f_impl.h                                  |   0
 src/p448/arch_neon_experimental/arch_config.h |   3 -
 src/p448/arch_ref64/arch_config.h             |   3 -
 src/p448/arch_ref64/f_impl.c                  |  20 +-
 src/p448/arch_x86_64/arch_config.h            |   2 -
 src/p448/arch_x86_64/f_impl.c                 |   8 +-
 src/p480/arch_x86_64/arch_config.h            |   1 -
 src/p480/arch_x86_64/f_impl.c                 |   8 +-
 src/p521/arch_ref64/arch_config.h             |   1 -
 src/p521/arch_ref64/f_impl.c                  |  20 +-
 src/p521/arch_x86_64_r12/arch_config.h        |   1 -
 src/p521/arch_x86_64_r12/f_impl.c             |  13 +-
 38 files changed, 402 insertions(+), 210 deletions(-)
 create mode 100644 src/include/arch_32/arch_intrinsics.h
 create mode 100644 src/include/arch_arm_32/arch_intrinsics.h
 create mode 100644 src/include/arch_neon/arch_intrinsics.h
 create mode 100644 src/include/arch_ref64/arch_intrinsics.h
 create mode 100644 src/p25519/arch_32/f_impl.c
 create mode 100644 src/p25519/arch_32/f_impl.h
 delete mode 100644 src/p25519/arch_ref64/arch_config.h
 delete mode 100644 src/p25519/arch_x86_64/arch_config.h
 delete mode 100644 src/p448/arch_32/arch_config.h
 delete mode 100644 src/p448/arch_arm_32/arch_config.h
 rename src/p448/{arch_neon_experimental => arch_neon}/f_impl.c (98%)
 rename src/p448/{arch_neon_experimental => arch_neon}/f_impl.h (100%)
 delete mode 100644 src/p448/arch_neon_experimental/arch_config.h
 delete mode 100644 src/p448/arch_ref64/arch_config.h
 delete mode 100644 src/p448/arch_x86_64/arch_config.h
 delete mode 100644 src/p480/arch_x86_64/arch_config.h
 delete mode 100644 src/p521/arch_ref64/arch_config.h
 delete mode 100644 src/p521/arch_x86_64_r12/arch_config.h

diff --git a/Makefile b/Makefile
index 16eb948..420f010 100644
--- a/Makefile
+++ b/Makefile
@@ -31,13 +31,6 @@ LD = $(CC)
 LDXX = $(CXX)
 ASM ?= $(CC)
 
-ifneq (,$(findstring x86_64,$(MACHINE)))
-ARCH ?= arch_x86_64
-else
-# no i386 port yet
-ARCH ?= arch_ref32
-endif
-
 WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
 	 -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)
 
@@ -55,17 +48,8 @@ endif
 
 TODAY = $(shell date "+%Y-%m-%d")
 
-ifneq (,$(findstring arm,$(MACHINE)))
-ifneq (,$(findstring neon,$(ARCH)))
-ARCHFLAGS += -mfpu=neon
-else
-ARCHFLAGS += -mfpu=vfpv3-d16
-endif
-ARCHFLAGS += -mcpu=cortex-a8 # FIXME
-GENFLAGS += -DN_TESTS_BASE=1000 # sooooo sloooooow
-else
-ARCHFLAGS += -maes -mavx2 -mbmi2 #TODO
-endif
+#FIXME ARCHFLAGS
+ARCHFLAGS ?= -maes -mavx2 -mbmi2 #TODO
 
 ifeq ($(CC),clang)
 WARNFLAGS += -Wgcc-compat
@@ -141,18 +125,18 @@ $(GEN_HEADERS): src/gen_headers/*.py src/public_include/decaf/*
 # Per-field code: call with field, arch
 ################################################################
 define define_field
-ARCH_FOR_$(1) = $(2)
+ARCH_FOR_$(1) ?= $(2)
 COMPONENTS_OF_$(1) = $$(BUILD_OBJ)/$(1)_impl.o $$(BUILD_OBJ)/$(1)_arithmetic.o
 LIBCOMPONENTS += $$(COMPONENTS_OF_$(1))
 
 $$(BUILD_ASM)/$(1)_arithmetic.s: src/$(1)/f_arithmetic.c $$(HEADERS)
-	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \
-	-I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \
+	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$$(ARCH_FOR_$(1)) -I $(BUILD_H)/$(1) \
+	-I $(BUILD_H)/$(1)/$$(ARCH_FOR_$(1)) -I src/include/$$(ARCH_FOR_$(1)) \
 	-S -c -o $$@ $$<
 
-$$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$(2)/f_impl.c $$(HEADERS)
-	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \
-	-I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \
+$$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$$(ARCH_FOR_$(1))/f_impl.c $$(HEADERS)
+	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$$(ARCH_FOR_$(1)) -I $(BUILD_H)/$(1) \
+	-I $(BUILD_H)/$(1)/$$(ARCH_FOR_$(1)) -I src/include/$$(ARCH_FOR_$(1)) \
 	-S -c -o $$@ $$<
 endef
 
diff --git a/src/curve_ed25519/curve_data.inc.c b/src/curve_ed25519/curve_data.inc.c
index b3d0c56..9012b4c 100644
--- a/src/curve_ed25519/curve_data.inc.c
+++ b/src/curve_ed25519/curve_data.inc.c
@@ -5,7 +5,6 @@
 
 #define SCALAR_LIMBS DECAF_255_SCALAR_LIMBS
 #define SCALAR_BITS DECAF_255_SCALAR_BITS
-#define NLIMBS DECAF_255_LIMBS
 #define scalar_t decaf_255_scalar_t
 #define point_t decaf_255_point_t
 #define precomputed_s decaf_255_precomputed_s
diff --git a/src/curve_ed448goldilocks/curve_data.inc.c b/src/curve_ed448goldilocks/curve_data.inc.c
index b42c944..b5c8217 100644
--- a/src/curve_ed448goldilocks/curve_data.inc.c
+++ b/src/curve_ed448goldilocks/curve_data.inc.c
@@ -4,7 +4,6 @@
 
 #define SCALAR_LIMBS DECAF_448_SCALAR_LIMBS
 #define SCALAR_BITS DECAF_448_SCALAR_BITS
-#define NLIMBS DECAF_448_LIMBS
 #define scalar_t decaf_448_scalar_t
 #define point_t decaf_448_point_t
 #define precomputed_s decaf_448_precomputed_s
diff --git a/src/decaf.c b/src/decaf.c
index 2025ca3..a690678 100644
--- a/src/decaf.c
+++ b/src/decaf.c
@@ -10,13 +10,14 @@
 
 #define _XOPEN_SOURCE 600 /* for posix_memalign */
 #define __STDC_WANT_LIB_EXT1__ 1 /* for memset_s */
-#include <decaf.h>
 #include <string.h>
 
 #include "word.h"
 #include "field.h"
 #include "decaf_config.h"
 
+#include <decaf.h>
+
 /* Include the curve data here */
 #include "curve_data.inc.c"
 
@@ -41,7 +42,10 @@ extern const gf SQRT_MINUS_ONE;
 extern const gf SQRT_ONE_MINUS_D; /* TODO: Intern this? */
 #endif
 
-#define WBITS DECAF_WORD_BITS
+/* FIXME: this can be different from DECAF_WORD_BITS, and word_t can be different from decaf_word_t,
+ * eg when mixing and matching implementations for different curves.  Homogenize this.
+ */
+#define WBITS WORD_BITS
 
 const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}};
 extern const scalar_t API_NS(sc_r2);
@@ -82,8 +86,8 @@ const size_t API_NS2(alignof,precomputed_s) = 32;
 #define UNROLL
 #endif
 
-#define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++)  { op; }}
-#define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<NLIMBS; i++)  { op; }}
+#define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<sizeof(gf)/sizeof(word_t); i++)  { op; }}
+#define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<sizeof(gf)/sizeof(word_t); i++)  { op; }}
 
 /** Copy x = y */
 static INLINE void
@@ -106,11 +110,11 @@ cond_neg(gf x, decaf_bool_t neg) {
 /** Constant time, if (swap) (x,y) = (y,x); */
 static INLINE void
 cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) {
-    FOR_LIMB_U(i, {
+    UNROLL for (unsigned int i=0; i<sizeof(x->limb)/sizeof(x->limb[0]); i++) {
         decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap;
         x->limb[i] ^= s;
         y->limb[i] ^= s;
-    });
+    }
 }
 
 /** Compare a==b */
@@ -123,9 +127,11 @@ gf_eq(const gf a, const gf b) {
     gf_sub(c,a,b);
     gf_strong_reduce(c);
     decaf_word_t ret=0;
-    FOR_LIMB(i, ret |= c->limb[i] );
-    /* Hope the compiler is too dumb to optimize this, thus noinline */
-    return ((decaf_dword_t)ret - 1) >> WBITS;
+    for (unsigned int i=0; i<sizeof(c->limb)/sizeof(c->limb[0]); i++) {
+        ret |= c->limb[i];
+    }
+
+    return word_is_zero(ret);
 }
 
 /** Inverse square root using addition chain. */
@@ -385,7 +391,7 @@ API_NS(scalar_eq) (
     for (i=0; i<SCALAR_LIMBS; i++) {
         diff |= a->limb[i] ^ b->limb[i];
     }
-    return (((decaf_dword_t)diff)-1)>>WBITS;
+    return word_is_zero(diff);
 }
 
 /* *** API begins here *** */    
@@ -1280,7 +1286,7 @@ API_NS(invert_elligator_nonuniform) (
     const point_t p,
     uint16_t hint_
 ) {
-    uint64_t hint = hint_;
+    decaf_bool_t hint = hint_;
     decaf_bool_t sgn_s = -(hint & 1),
         sgn_t_over_s = -(hint>>1 & 1),
         sgn_r0 = -(hint>>2 & 1),
@@ -1293,13 +1299,13 @@ API_NS(invert_elligator_nonuniform) (
     gf_sub(b,ONE,b); /* t+1 */
     gf_sqr(c,a); /* s^2 */
     decaf_bool_t is_identity = gf_eq(p->t,ZERO);
-    {   /* identity adjustments */
+    {
+        /* identity adjustments */
         /* in case of identity, currently c=0, t=0, b=1, will encode to 1 */
         /* if hint is 0, -> 0 */
         /* if hint is to neg t/s, then go to infinity, effectively set s to 1 */
         cond_sel(c,c,ONE,is_identity & sgn_t_over_s);
-        cond_sel(b,b,ZERO,is_identity & ~sgn_t_over_s & ~sgn_s); /* identity adjust */
-        
+        cond_sel(b,b,ZERO,is_identity & ~sgn_t_over_s & ~sgn_s); /* identity adjust */        
     }
     gf_mulw_sgn(d,c,2*EDWARDS_D-1); /* $d = (2d-a)s^2 */
     gf_add(a,b,d); /* num? */
diff --git a/src/decaf_gen_tables.c b/src/decaf_gen_tables.c
index de917d6..85feced 100644
--- a/src/decaf_gen_tables.c
+++ b/src/decaf_gen_tables.c
@@ -11,9 +11,10 @@
 #define _XOPEN_SOURCE 600 /* for posix_memalign */
 #include <stdio.h>
 #include <stdlib.h>
+
+#include "field.h"
 #include "decaf.h"
 #include "decaf_config.h"
-#include "field.h"
 
 #define GEN_TABLES
 #include "curve_data.inc.c"
@@ -91,8 +92,8 @@ int main(int argc, char **argv) {
     unsigned i;
     
     printf("/** @warning: this file was automatically generated. */\n");
-    printf("#include <decaf.h>\n\n");
     printf("#include \"field.h\"\n\n");
+    printf("#include <decaf.h>\n\n");
     printf("#define API_NS(_id) %s_##_id\n", API_NAME);
     printf("#define API_NS2(_pref,_id) _pref##_%s_##_id\n", API_NAME);
     
diff --git a/src/gen_headers/curve_data.py b/src/gen_headers/curve_data.py
index 772a217..ed0e901 100644
--- a/src/gen_headers/curve_data.py
+++ b/src/gen_headers/curve_data.py
@@ -21,7 +21,6 @@ curve_data = {
         "name" : "IsoEd25519",
         "cxx_ns" : "IsoEd25519",
         "shortname" : "255",
-        "longnum" : "25519",
         "c_ns" : "decaf_255",
         "cofactor" : 8,
         "field" : "p25519",
@@ -32,7 +31,6 @@ curve_data = {
         "name" : "Ed448-Goldilocks",
         "cxx_ns" : "Ed448Goldilocks",
         "shortname" : "448",
-        "longnum" : "448",
         "c_ns" : "decaf_448",
         "cofactor" : 4,
         "field" : "p448",
diff --git a/src/gen_headers/decaf_h.py b/src/gen_headers/decaf_h.py
index 8a6151f..f092e61 100644
--- a/src/gen_headers/decaf_h.py
+++ b/src/gen_headers/decaf_h.py
@@ -13,7 +13,6 @@ extern "C" {
 #endif
 
 /** @cond internal */
-#define %(C_NS)s_LIMBS (%(gf_impl_bits)d/DECAF_WORD_BITS)
 #define %(C_NS)s_SCALAR_LIMBS ((%(scalar_bits)d-1)/DECAF_WORD_BITS+1)
 /** @endcond */
 
@@ -21,13 +20,13 @@ extern "C" {
 #define %(C_NS)s_SCALAR_BITS %(scalar_bits)d
 
 /** @cond internal */
-#ifndef __%(C_NS)s_GF_DEFINED__
-#define __%(C_NS)s_GF_DEFINED__ 1
+#ifndef __DECAF_%(gf_shortname)s_GF_DEFINED__
+#define __DECAF_%(gf_shortname)s_GF_DEFINED__ 1
 /** @brief Galois field element internal structure */
-typedef struct gf_%(longnum)s_s {
-    decaf_word_t limb[%(C_NS)s_LIMBS];
-} __attribute__((aligned(32))) gf_%(longnum)s_s, gf_%(longnum)s_t[1];
-#endif /* __%(C_NS)s_GF_DEFINED__ */
+typedef struct gf_%(gf_shortname)s_s {
+    decaf_word_t limb[%(gf_impl_bits)d/DECAF_WORD_BITS];
+} __attribute__((aligned(32))) gf_%(gf_shortname)s_s, gf_%(gf_shortname)s_t[1];
+#endif /* __DECAF_%(gf_shortname)s_GF_DEFINED__ */
 /** @endcond */
 
 /** Number of bytes in a serialized point. */
@@ -39,7 +38,7 @@ typedef struct gf_%(longnum)s_s {
 /** Twisted Edwards extended homogeneous coordinates */
 typedef struct %(c_ns)s_point_s {
     /** @cond internal */
-    gf_%(longnum)s_t x,y,z,t;
+    gf_%(gf_shortname)s_t x,y,z,t;
     /** @endcond */
 } %(c_ns)s_point_t[1];
 
diff --git a/src/gen_headers/f_field_h.py b/src/gen_headers/f_field_h.py
index a06360b..388faba 100644
--- a/src/gen_headers/f_field_h.py
+++ b/src/gen_headers/f_field_h.py
@@ -10,9 +10,13 @@ f_field_h = gen_file(
 #include <string.h>
 #include <assert.h>
 
-#include "decaf/decaf_%(gf_bits)s.h" /* HACK in genheader */
 #include "word.h"
 
+#define __DECAF_%(gf_shortname)s_GF_DEFINED__ 1
+typedef struct gf_%(gf_shortname)s_s {
+    word_t limb[%(gf_impl_bits)d/sizeof(word_t)/8];
+} __attribute__((aligned(32))) gf_%(gf_shortname)s_s, gf_%(gf_shortname)s_t[1];
+
 #define GF_LIT_LIMB_BITS  %(gf_lit_limb_bits)d
 #define GF_BITS           %(gf_bits)d
 #define gf                gf_%(gf_shortname)s_t
@@ -57,4 +61,4 @@ mask_t gf_deserialize (gf x, const uint8_t serial[(GF_BITS-1)/8+1]);
 #endif
 
 #include "f_impl.h" /* Bring in the inline implementations */
-""")
\ No newline at end of file
+""")
diff --git a/src/include/arch_32/arch_intrinsics.h b/src/include/arch_32/arch_intrinsics.h
new file mode 100644
index 0000000..4e9d159
--- /dev/null
+++ b/src/include/arch_32/arch_intrinsics.h
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+#ifndef __ARCH_ARCH_32_ARCH_INTRINSICS_H__
+#define __ARCH_ARCH_32_ARCH_INTRINSICS_H__
+
+#define WORD_BITS 32
+
+static __inline__ __attribute((always_inline,unused))
+uint32_t word_is_zero(uint32_t a) {
+    /* let's hope the compiler isn't clever enough to optimize this. */
+    return (((uint64_t)a)-1)>>32;
+}
+
+static __inline__ __attribute((always_inline,unused))
+uint64_t widemul(uint32_t a, uint32_t b) {
+    return ((uint64_t)a) * b;
+}
+
+#endif /* __ARCH_ARCH_32_ARCH_INTRINSICS_H__ */
+
diff --git a/src/include/arch_arm_32/arch_intrinsics.h b/src/include/arch_arm_32/arch_intrinsics.h
new file mode 100644
index 0000000..86080b1
--- /dev/null
+++ b/src/include/arch_arm_32/arch_intrinsics.h
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+#ifndef __ARCH_ARM_32_ARCH_INTRINSICS_H__
+#define __ARCH_ARM_32_ARCH_INTRINSICS_H__
+
+#define WORD_BITS 32
+
+static __inline__ __attribute((always_inline,unused))
+uint32_t word_is_zero(uint32_t a) {
+    uint32_t ret;
+    asm("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc");
+    return ret;
+}
+
+static __inline__ __attribute((always_inline,unused))
+uint64_t widemul(uint32_t a, uint32_t b) {
+    /* Could be UMULL, but it's hard to express to CC that the registers must be different */
+    return ((uint64_t)a) * b; 
+}
+
+#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
+
diff --git a/src/include/arch_neon/arch_intrinsics.h b/src/include/arch_neon/arch_intrinsics.h
new file mode 100644
index 0000000..b138796
--- /dev/null
+++ b/src/include/arch_neon/arch_intrinsics.h
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+#ifndef __ARCH_NEON_ARCH_INTRINSICS_H__
+#define __ARCH_NEON_ARCH_INTRINSICS_H__
+
+#define WORD_BITS 32
+
+static __inline__ __attribute((always_inline,unused))
+uint32_t word_is_zero(uint32_t a) {
+    uint32_t ret;
+    asm("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc");
+    return ret;
+}
+
+static __inline__ __attribute((always_inline,unused))
+uint64_t widemul(uint32_t a, uint32_t b) {
+    /* Could be UMULL, but it's hard to express to CC that the registers must be different */
+    return ((uint64_t)a) * b; 
+}
+
+#endif /* __ARCH_NEON_ARCH_INTRINSICS_H__ */
+
diff --git a/src/include/arch_ref64/arch_intrinsics.h b/src/include/arch_ref64/arch_intrinsics.h
new file mode 100644
index 0000000..8413a2e
--- /dev/null
+++ b/src/include/arch_ref64/arch_intrinsics.h
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+#ifndef __ARCH_REF64_ARCH_INTRINSICS_H__
+#define __ARCH_REF64_ARCH_INTRINSICS_H__
+
+#define WORD_BITS 64
+
+static __inline__ __attribute((always_inline,unused))
+uint64_t word_is_zero(uint64_t a) {
+    /* let's hope the compiler isn't clever enough to optimize this. */
+    return (((__uint128_t)a)-1)>>64;
+}
+
+static __inline__ __attribute((always_inline,unused))
+uint64_t widemul(uint64_t a, uint64_t b) {
+    return ((__uint128_t)a) * b; 
+}
+
+#endif /* __ARCH_REF64_ARCH_INTRINSICS_H__ */
+
diff --git a/src/include/arch_x86_64/arch_intrinsics.h b/src/include/arch_x86_64/arch_intrinsics.h
index d2b03e1..843f337 100644
--- a/src/include/arch_x86_64/arch_intrinsics.h
+++ b/src/include/arch_x86_64/arch_intrinsics.h
@@ -5,6 +5,8 @@
 #ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__
 #define __ARCH_X86_64_ARCH_INTRINSICS_H__
 
+#define WORD_BITS 64
+
 #include <stdint.h>
 
 /* FUTURE: non x86-64 versions of these.
@@ -294,7 +296,7 @@ static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *
   *acc = (((__uint128_t)(d))<<64) | c;
 }
 
-static __inline__ uint64_t is_zero(uint64_t x) {
+static __inline__ uint64_t word_is_zero(uint64_t x) {
   __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
   return ~x;
 }
diff --git a/src/include/field.h b/src/include/field.h
index 0121c39..9850f1c 100644
--- a/src/include/field.h
+++ b/src/include/field.h
@@ -74,7 +74,6 @@ gf_add (
 
 /** Subtract mod p.  Bias by 2 and don't reduce  */
 static inline void gf_sub_nr ( gf c, const gf a, const gf b ) {
-//    FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] );
     gf_sub_RAW(c,a,b);
     gf_bias(c, 2);
     if (DECAF_WORD_BITS==32) gf_weak_reduce(c); // HACK
diff --git a/src/include/word.h b/src/include/word.h
index b44a92e..2261b13 100644
--- a/src/include/word.h
+++ b/src/include/word.h
@@ -8,7 +8,7 @@
 /* for posix_memalign */
 #define _XOPEN_SOURCE 600
 
-#include "arch_config.h"
+#include <stdint.h>
 #include "arch_intrinsics.h"
 
 #include <decaf/common.h>
@@ -21,7 +21,6 @@
 #include <endian.h>
 #endif
 
-#include <stdint.h>
 #include <stdlib.h>
 #include <sys/types.h>
 #include <inttypes.h>
@@ -64,7 +63,7 @@
     #define U56LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
     #define U60LE(x) (x##ull)&((1ull<<30)-1), (x##ull)>>30
     #define letohWORD letoh32
-    #define SC_LIMB(x) (x##ull)
+    #define SC_LIMB(x) ((uint32_t)x##ull),(x##ull>>32)
 #else
     #error "For now, libdecaf only supports 32- and 64-bit architectures."
 #endif
@@ -159,14 +158,6 @@ typedef struct {
 typedef struct {
     uint32xn_t unaligned;
 } __attribute__((packed)) unaligned_uint32xn_t;
-    
-/**
- * Return -1 if x==0, and 0 otherwise.
- */
-static INLINE UNUSED mask_t
-word_is_zero(word_t x) {
-    return (mask_t)((((dword_t)(x)) - 1)>>WORD_BITS);
-}
 
 #if __AVX2__
     static INLINE big_register_t
@@ -185,15 +176,10 @@ word_is_zero(word_t x) {
         return vceqq_u32(x,x^x);
     }
 #else
-    static INLINE mask_t
-    br_is_zero(word_t x) {
-        return (((dword_t)x) - 1)>>WORD_BITS;
-    }
+    #define br_is_zero word_is_zero
 #endif
 
 
-
-
 #ifdef __APPLE__
     static INLINE uint64_t htole64 (uint64_t x) { return x; }
     static INLINE uint64_t letoh64 (uint64_t x) { return x; }
diff --git a/src/p25519/arch_32/f_impl.c b/src/p25519/arch_32/f_impl.c
new file mode 100644
index 0000000..cfc3fb3
--- /dev/null
+++ b/src/p25519/arch_32/f_impl.c
@@ -0,0 +1,178 @@
+/* Copyright (c) 2016 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+#include "f_field.h"
+
+void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
+    const uint32_t *a = as->limb, *b = bs->limb, maske = ((1<<26)-1), masko = ((1<<25)-1);
+    
+    uint64_t bh[9];
+    int i,j;
+    for (i=0; i<9; i++) bh[i] = b[i+1] * 19;
+    
+    uint32_t *c = cs->limb;
+
+    uint64_t accum = 0;
+    for (i=0; i<10; /*i+=2*/) {
+        /* Even case. */
+        for (j=0; j<i; /*j+=2*/) {
+            accum += widemul(b[i-j], a[j]); j++;
+            accum += widemul(2*b[i-j], a[j]); j++;
+        }
+        accum += widemul(b[0], a[j]); j++;
+        accum += widemul(2*bh[8], a[j]); j++;
+        for (; j<10; /* j+=2*/) {
+            accum += widemul(bh[i-j+9], a[j]); j++;
+            accum += widemul(2*bh[i-j+9], a[j]); j++;
+        }
+        c[i] = accum & maske;
+        accum >>= 26;
+        i++;
+
+        /* Odd case is easier: all place values are exact. */
+        for (j=0; j<=i; j++) {
+            accum += widemul(b[i-j], a[j]);
+        }
+        for (; j<10; j++) {
+            accum += widemul(bh[i-j+9], a[j]);
+        }
+        c[i] = accum & masko;
+        accum >>= 25;
+        i++;
+    }
+    
+    accum *= 19;
+    accum += c[0];
+    c[0] = accum & maske;
+    accum >>= 26;
+    
+    assert(accum < masko);
+    c[1] += accum;
+}
+
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+    const uint32_t *a = as->limb, maske = ((1<<26)-1), masko = ((1<<25)-1);
+    uint32_t blo = b & maske, bhi = b>>26, bhi2 = 2*bhi;
+    uint32_t *c = cs->limb;
+    uint64_t accum = 0;
+
+    accum = widemul(blo, a[0]) + widemul(bhi*38,a[9]);
+    c[0] = accum & maske;
+    accum >>= 26;
+
+    accum += widemul(blo, a[1]) + widemul(bhi,a[0]);
+    c[1] = accum & masko;
+    accum >>= 25;
+
+    for (int i=2; i<10; /*i+=2*/) {
+        accum += widemul(blo, a[i]) + widemul(bhi2, a[i-1]);
+        c[i] = accum & maske;
+        accum >>= 26;
+        i++;
+
+        accum += widemul(blo, a[i]) + widemul(bhi, a[i-1]);
+        c[i] = accum & masko;
+        accum >>= 25;
+        i++;
+    }
+    
+    accum *= 19;
+    accum += c[0];
+    c[0] = accum & maske;
+    accum >>= 26;
+    
+    assert(accum < masko);
+    c[1] += accum;
+}
+
+void gf_sqr (gf_s *__restrict__ cs, const gf as) {
+    gf_mul(cs,as,as); // PERF
+}
+
+void gf_strong_reduce (gf a) {
+    uint32_t maske = (1<<26)-1, masko = (1<<25)-1;
+
+    /* first, clear high */
+    a->limb[0] += (a->limb[9]>>25)*19;
+    a->limb[9] &= masko;
+
+    /* now the total is less than 2p */
+
+    /* compute total_value - p.  No need to reduce mod p. */
+    int64_t scarry = 0;
+    int i;
+    for (i=0; i<10; /*i+=2*/) {
+        scarry = scarry + a->limb[i] - ((i==0)?maske-18:maske);
+        a->limb[i] = scarry & maske;
+        scarry >>= 26;
+        i++;
+
+        scarry = scarry + a->limb[i] - masko;
+        a->limb[i] = scarry & masko;
+        scarry >>= 25;
+        i++;
+    }
+
+    /* uncommon case: it was >= p, so now scarry = 0 and this = x
+     * common case: it was < p, so now scarry = -1 and this = x - p + 2^255
+     * so let's add back in p.  will carry back off the top for 2^255.
+     */
+
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
+
+    uint32_t scarry_masko = scarry & masko, scarry_maske = scarry & maske;
+    uint64_t carry = 0;
+
+    /* add it back */
+    for (i=0; i<10; /*i+=2*/) {
+        carry = carry + a->limb[i] + ((i==0)?(scarry_maske&~18):scarry_maske);
+        a->limb[i] = carry & maske;
+        carry >>= 26;
+        i++;
+
+        carry = carry + a->limb[i] + scarry_masko;
+        a->limb[i] = carry & masko;
+        carry >>= 25;
+        i++;
+    }
+
+    assert(word_is_zero(carry + scarry));
+}
+
+#define LIMB_PLACE_VALUE(i) (((i)&1)?25:26)
+void gf_serialize (uint8_t serial[32], const gf x) {
+    gf red;
+    gf_copy(red, x);
+    gf_strong_reduce(red);
+    unsigned int j=0, fill=0;
+    dword_t buffer = 0;
+    for (unsigned int i=0; i<32; i++) {
+        if (fill < 8 && j < sizeof(red->limb)/sizeof(red->limb[0])) {
+            buffer |= ((dword_t)red->limb[j]) << fill;
+            fill += LIMB_PLACE_VALUE(j);
+            j++;
+        }
+        serial[i] = buffer;
+        fill -= 8;
+        buffer >>= 8;
+    }
+}
+
+mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
+    unsigned int j=0, fill=0;
+    dword_t buffer = 0;
+    for (unsigned int i=0; i<32; i++) {
+        buffer |= ((dword_t)serial[i]) << fill;
+        fill += 8;
+        if (fill >= LIMB_PLACE_VALUE(j) || i == 31) {
+            assert(j < sizeof(x->limb)/sizeof(x->limb[0]));
+            word_t mask = ((1ull)<<LIMB_PLACE_VALUE(j))-1;
+            x->limb[j] = (i==31) ? buffer : (buffer & mask); // FIXME: this can in theory truncate the buffer if it's not in field.
+            buffer >>= LIMB_PLACE_VALUE(j);
+            fill -= LIMB_PLACE_VALUE(j);
+            j++;
+        }
+    }
+    return -1; // FIXME: test whether in field.
+}
diff --git a/src/p25519/arch_32/f_impl.h b/src/p25519/arch_32/f_impl.h
new file mode 100644
index 0000000..5e51bf0
--- /dev/null
+++ b/src/p25519/arch_32/f_impl.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2014-2016 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+#define LIMB(x) (x##ull)&((1ull<<26)-1), (x##ull)>>26
+#define FIELD_LITERAL(a,b,c,d,e) \
+    {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e)}}
+
+void gf_add_RAW (gf out, const gf a, const gf b) {
+    for (unsigned int i=0; i<10; i++) {
+        out->limb[i] = a->limb[i] + b->limb[i];
+    }
+    gf_weak_reduce(out);
+}
+
+void gf_sub_RAW (gf out, const gf a, const gf b) {
+    uint32_t coe = ((1ull<<26)-1)*2, coo = ((1ull<<25)-1)*2, co0 = coe-36;
+    for (unsigned int i=0; i<10; i+=2) {
+        out->limb[i] = a->limb[i] - b->limb[i] + ((i==0) ? co0 : coe);
+        out->limb[i+1] = a->limb[i+1] - b->limb[i+1] + coo;
+    }
+    gf_weak_reduce(out);
+}
+
+void gf_bias (gf a, int amt) {
+    (void) a;
+    (void) amt;
+}
+
+void gf_weak_reduce (gf a) {
+    uint32_t maske = (1ull<<26) - 1, masko = (1ull<<25) - 1;
+    uint32_t tmp = a->limb[9] >> 25;
+    for (unsigned int i=8; i>0; i-=2) {
+        a->limb[i+1] = (a->limb[i+1] & masko) + (a->limb[i]>>26);
+        a->limb[i] = (a->limb[i] & maske) + (a->limb[i-1]>>25);
+    }
+    a->limb[1] = (a->limb[1] & masko) + (a->limb[0]>>26);
+    a->limb[0] = (a->limb[0] & maske) + tmp*19;
+}
+
diff --git a/src/p25519/arch_ref64/arch_config.h b/src/p25519/arch_ref64/arch_config.h
deleted file mode 100644
index b9504c3..0000000
--- a/src/p25519/arch_ref64/arch_config.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define WORD_BITS 64
-#define DECAF_255_LIMB_BITS 51
\ No newline at end of file
diff --git a/src/p25519/arch_ref64/f_impl.c b/src/p25519/arch_ref64/f_impl.c
index 7afd485..414fd66 100644
--- a/src/p25519/arch_ref64/f_impl.c
+++ b/src/p25519/arch_ref64/f_impl.c
@@ -4,18 +4,6 @@
 
 #include "f_field.h"
 
-static __inline__ __uint128_t widemul(
-    const uint64_t a,
-    const uint64_t b
-) {
-    return ((__uint128_t)a) * ((__uint128_t)b);
-}
-
-static __inline__ uint64_t is_zero(uint64_t a) {
-    /* let's hope the compiler isn't clever enough to optimize this. */
-    return (((__uint128_t)a)-1)>>64;
-}
-
 void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
     
@@ -95,7 +83,7 @@ void gf_strong_reduce (gf a) {
     * so let's add back in p.  will carry back off the top for 2^255.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     uint64_t scarry_mask = scarry & mask;
     __uint128_t carry = 0;
@@ -107,15 +95,15 @@ void gf_strong_reduce (gf a) {
         carry >>= 51;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
-void gf_serialize (uint8_t serial[32], const struct gf x) {
+void gf_serialize (uint8_t serial[32], const gf x) {
     int i,j;
     gf red;
-    gf_copy(&red, x);
-    gf_strong_reduce(&red);
-    uint64_t *r = red.limb;
+    gf_copy(red, x);
+    gf_strong_reduce(red);
+    uint64_t *r = red->limb;
     uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
     for (i=0; i<4; i++) {
         for (j=0; j<8; j++) {
@@ -149,5 +137,5 @@ mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
     x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask;
     x->limb[4] = ser64[3]>>12;
     
-    return ~is_zero(~ge);
+    return ~word_is_zero(~ge);
 }
diff --git a/src/p25519/arch_x86_64/arch_config.h b/src/p25519/arch_x86_64/arch_config.h
deleted file mode 100644
index 6d2cbd9..0000000
--- a/src/p25519/arch_x86_64/arch_config.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define WORD_BITS 64
-#define DECAF_255_LIMB_BITS 51
diff --git a/src/p25519/arch_x86_64/f_impl.c b/src/p25519/arch_x86_64/f_impl.c
index 168dbd5..0b02519 100644
--- a/src/p25519/arch_x86_64/f_impl.c
+++ b/src/p25519/arch_x86_64/f_impl.c
@@ -194,7 +194,7 @@ void gf_strong_reduce (gf a) {
     * so let's add back in p.  will carry back off the top for 2^255.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     uint64_t scarry_mask = scarry & mask;
     __uint128_t carry = 0;
@@ -206,7 +206,7 @@ void gf_strong_reduce (gf a) {
         carry >>= 51;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
 void gf_serialize (uint8_t serial[32], const gf x) {
@@ -248,5 +248,5 @@ mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
     x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask;
     x->limb[4] = ser64[3]>>12;
     
-    return ~is_zero(~ge);
+    return ~word_is_zero(~ge);
 }
diff --git a/src/p448/arch_32/arch_config.h b/src/p448/arch_32/arch_config.h
deleted file mode 100644
index d4ada31..0000000
--- a/src/p448/arch_32/arch_config.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define WORD_BITS 32
-#define DECAF_448_LIMB_BITS 28
diff --git a/src/p448/arch_32/f_impl.c b/src/p448/arch_32/f_impl.c
index 739b1fb..24e8fe2 100644
--- a/src/p448/arch_32/f_impl.c
+++ b/src/p448/arch_32/f_impl.c
@@ -4,19 +4,6 @@
 
 #include "f_field.h"
 
-static inline mask_t is_zero (word_t x) {
-    dword_t xx = x;
-    xx--;
-    return xx >> WORD_BITS;
-}
-
-static uint64_t widemul (
-    const uint32_t a,
-    const uint32_t b
-) {
-    return ((uint64_t)a)* b;
-}
-
 void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { 
     const uint32_t *a = as->limb, *b = bs->limb;
     uint32_t *c = cs->limb;
@@ -141,7 +128,7 @@ void gf_strong_reduce (gf a) {
     * so let's add back in p.  will carry back off the top for 2^448.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     word_t scarry_mask = scarry & mask;
     dword_t carry = 0;
@@ -153,7 +140,7 @@ void gf_strong_reduce (gf a) {
         carry >>= 28;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
 void gf_serialize (uint8_t *serial, const gf x) {
@@ -195,13 +182,13 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
     }
     
     /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
-    ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask);
+    ge = (ge & (x->limb[8] + 1)) | word_is_zero(x->limb[8] ^ mask);
     
     /* Propagate the rest */
     for (i=9; i<16; i++) {
         ge &= x->limb[i];
     }
     
-    return ~is_zero(ge ^ mask);
+    return ~word_is_zero(ge ^ mask);
 }
 
diff --git a/src/p448/arch_arm_32/arch_config.h b/src/p448/arch_arm_32/arch_config.h
deleted file mode 100644
index d4ada31..0000000
--- a/src/p448/arch_arm_32/arch_config.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define WORD_BITS 32
-#define DECAF_448_LIMB_BITS 28
diff --git a/src/p448/arch_arm_32/f_impl.c b/src/p448/arch_arm_32/f_impl.c
index 62eda0f..b1719ad 100644
--- a/src/p448/arch_arm_32/f_impl.c
+++ b/src/p448/arch_arm_32/f_impl.c
@@ -4,19 +4,6 @@
 
 #include "f_field.h"
 
-static inline mask_t is_zero (word_t x) {
-    dword_t xx = x;
-    xx--;
-    return xx >> WORD_BITS;
-}
-
-static uint64_t widemul (
-    const uint32_t a,
-    const uint32_t b
-) {
-    return ((uint64_t)a)* b;
-}
-
 static inline void __attribute__((gnu_inline,always_inline))
 smlal (
     uint64_t *acc,
@@ -874,7 +861,7 @@ void gf_strong_reduce (
     * so let's add back in p.  will carry back off the top for 2^448.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     word_t scarry_mask = scarry & mask;
     dword_t carry = 0;
@@ -886,7 +873,7 @@ void gf_strong_reduce (
         carry >>= 28;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
 void gf_serialize (
@@ -935,12 +922,12 @@ gf_deserialize (
     }
     
     /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
-    ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask);
+    ge = (ge & (x->limb[8] + 1)) | word_is_zero(x->limb[8] ^ mask);
     
     /* Propagate the rest */
     for (i=9; i<16; i++) {
         ge &= x->limb[i];
     }
     
-    return ~is_zero(ge ^ mask);
+    return ~word_is_zero(ge ^ mask);
 }
diff --git a/src/p448/arch_neon_experimental/f_impl.c b/src/p448/arch_neon/f_impl.c
similarity index 98%
rename from src/p448/arch_neon_experimental/f_impl.c
rename to src/p448/arch_neon/f_impl.c
index 1225f5e..845f31e 100644
--- a/src/p448/arch_neon_experimental/f_impl.c
+++ b/src/p448/arch_neon/f_impl.c
@@ -4,15 +4,6 @@
 
 #include "f_field.h"
 
-static inline mask_t __attribute__((always_inline))
-is_zero (
-    word_t x
-) {
-    dword_t xx = x;
-    xx--;
-    return xx >> WORD_BITS;
-}
-
 static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused))
 xx_vaddup_u64(uint64x2_t x) {
     __asm__ ("vadd.s64 %f0, %e0" : "+w"(x));
@@ -629,7 +620,7 @@ void gf_strong_reduce (gf a) {
     * so let's add back in p.  will carry back off the top for 2^448.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     word_t scarry_mask = scarry & mask;
     dword_t carry = 0;
@@ -641,7 +632,7 @@ void gf_strong_reduce (gf a) {
         carry >>= 28;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
 void gf_serialize (uint8_t *serial, const gf x) {
@@ -684,13 +675,13 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
     }
     
     /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
-    ge = (ge & (x->limb[LIMBPERM(8)] + 1)) | is_zero(x->limb[LIMBPERM(8)] ^ mask);
+    ge = (ge & (x->limb[LIMBPERM(8)] + 1)) | word_is_zero(x->limb[LIMBPERM(8)] ^ mask);
     
     /* Propagate the rest */
     for (i=9; i<16; i++) {
         ge &= x->limb[LIMBPERM(i)];
     }
     
-    return ~is_zero(ge ^ mask);
+    return ~word_is_zero(ge ^ mask);
 }
 
diff --git a/src/p448/arch_neon_experimental/f_impl.h b/src/p448/arch_neon/f_impl.h
similarity index 100%
rename from src/p448/arch_neon_experimental/f_impl.h
rename to src/p448/arch_neon/f_impl.h
diff --git a/src/p448/arch_neon_experimental/arch_config.h b/src/p448/arch_neon_experimental/arch_config.h
deleted file mode 100644
index e65216f..0000000
--- a/src/p448/arch_neon_experimental/arch_config.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#define WORD_BITS 32
-#define DECAF_448_LIMB_BITS 28
-
diff --git a/src/p448/arch_ref64/arch_config.h b/src/p448/arch_ref64/arch_config.h
deleted file mode 100644
index f58980e..0000000
--- a/src/p448/arch_ref64/arch_config.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#define WORD_BITS 64
-#define DECAF_448_LIMB_BITS 56
-
diff --git a/src/p448/arch_ref64/f_impl.c b/src/p448/arch_ref64/f_impl.c
index 74aeeb1..4717b0e 100644
--- a/src/p448/arch_ref64/f_impl.c
+++ b/src/p448/arch_ref64/f_impl.c
@@ -4,18 +4,6 @@
 
 #include "f_field.h"
 
-static __inline__ __uint128_t widemul(
-    const uint64_t a,
-    const uint64_t b
-) {
-    return ((__uint128_t)a) * ((__uint128_t)b);
-}
-
-static __inline__ uint64_t is_zero(uint64_t a) {
-    /* let's hope the compiler isn't clever enough to optimize this. */
-    return (((__uint128_t)a)-1)>>64;
-}
-
 void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     const uint64_t *a = as->limb, *b = bs->limb;
     uint64_t *c = cs->limb;
@@ -337,7 +325,7 @@ void gf_strong_reduce (gf a) {
     * so let's add back in p.  will carry back off the top for 2^448.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     uint64_t scarry_mask = scarry & mask;
     __uint128_t carry = 0;
@@ -349,7 +337,7 @@ void gf_strong_reduce (gf a) {
         carry >>= 56;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
 void gf_serialize (uint8_t *serial, const gf x) {
@@ -389,12 +377,12 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
     }
     
     /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
-    ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
+    ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask);
     
     /* Propagate the rest */
     for (i=5; i<8; i++) {
         ge &= x->limb[i];
     }
     
-    return ~is_zero(ge ^ mask);
+    return ~word_is_zero(ge ^ mask);
 }
diff --git a/src/p448/arch_x86_64/arch_config.h b/src/p448/arch_x86_64/arch_config.h
deleted file mode 100644
index 3f449f4..0000000
--- a/src/p448/arch_x86_64/arch_config.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define WORD_BITS 64
-#define DECAF_448_LIMB_BITS 56
diff --git a/src/p448/arch_x86_64/f_impl.c b/src/p448/arch_x86_64/f_impl.c
index 07744fa..8ebb569 100644
--- a/src/p448/arch_x86_64/f_impl.c
+++ b/src/p448/arch_x86_64/f_impl.c
@@ -315,7 +315,7 @@ void gf_strong_reduce (gf a) {
     * so let's add back in p.  will carry back off the top for 2^448.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     uint64_t scarry_mask = scarry & mask;
     __uint128_t carry = 0;
@@ -327,7 +327,7 @@ void gf_strong_reduce (gf a) {
         carry >>= 56;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
 void gf_serialize (uint8_t *serial, const gf x) {
@@ -367,13 +367,13 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
     }
     
     /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
-    ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
+    ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask);
     
     /* Propagate the rest */
     for (i=5; i<8; i++) {
         ge &= x->limb[i];
     }
     
-    return ~is_zero(ge ^ mask);
+    return ~word_is_zero(ge ^ mask);
 }
 
diff --git a/src/p480/arch_x86_64/arch_config.h b/src/p480/arch_x86_64/arch_config.h
deleted file mode 100644
index 58758cc..0000000
--- a/src/p480/arch_x86_64/arch_config.h
+++ /dev/null
@@ -1 +0,0 @@
-#define WORD_BITS 64
diff --git a/src/p480/arch_x86_64/f_impl.c b/src/p480/arch_x86_64/f_impl.c
index b3c565b..e021241 100644
--- a/src/p480/arch_x86_64/f_impl.c
+++ b/src/p480/arch_x86_64/f_impl.c
@@ -315,7 +315,7 @@ void gf_strong_reduce (gf *a) {
     * so let's add back in p.  will carry back off the top for 2^480.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     uint64_t scarry_mask = scarry & mask;
     __uint128_t carry = 0;
@@ -327,7 +327,7 @@ void gf_strong_reduce (gf *a) {
         carry >>= 60;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
 void gf_serialize (uint8_t *serial, const struct gf *x) {
@@ -381,13 +381,13 @@ mask_t gf_deserialize (gf *x, const uint8_t serial[60]) {
     }
     
     /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
-    ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
+    ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask);
     
     /* Propagate the rest */
     for (i=5; i<8; i++) {
         ge &= x->limb[i];
     }
     
-    return ~is_zero(ge ^ mask);
+    return ~word_is_zero(ge ^ mask);
 }
 
diff --git a/src/p521/arch_ref64/arch_config.h b/src/p521/arch_ref64/arch_config.h
deleted file mode 100644
index 58758cc..0000000
--- a/src/p521/arch_ref64/arch_config.h
+++ /dev/null
@@ -1 +0,0 @@
-#define WORD_BITS 64
diff --git a/src/p521/arch_ref64/f_impl.c b/src/p521/arch_ref64/f_impl.c
index 03c98ee..c3aee6f 100644
--- a/src/p521/arch_ref64/f_impl.c
+++ b/src/p521/arch_ref64/f_impl.c
@@ -4,18 +4,6 @@
 
 #include "f_field.h"
 
-static __inline__ __uint128_t widemul(
-    const uint64_t a,
-    const uint64_t b
-) {
-    return ((__uint128_t)a) * ((__uint128_t)b);
-}
-
-static __inline__ uint64_t is_zero(uint64_t a) {
-    /* let's hope the compiler isn't clever enough to optimize this. */
-    return (((__uint128_t)a)-1)>>64;
-}
-
 void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     uint64_t *c = cs->limb;
     const uint64_t *a = as->limb, *b = bs->limb;
@@ -318,7 +306,7 @@ void gf_strong_reduce (gf a) {
     * so let's add back in p.  will carry back off the top for 2^521.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     uint64_t scarry_mask = scarry & mask;
     __uint128_t carry = 0;
@@ -330,7 +318,7 @@ void gf_strong_reduce (gf a) {
         carry >>= (i==8) ? 57 : 58;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 }
 
 void gf_serialize (uint8_t *serial, const struct gf x) {
@@ -367,14 +355,14 @@ mask_t gf_deserialize (gf x, const uint8_t serial[66]) {
     }
     
     /* Check for reduction.  First, high has to be < 2^57 */
-    mask_t good = is_zero(out>>57);
+    mask_t good = word_is_zero(out>>57);
     
     uint64_t and = -1ull;
     for (i=0; i<8; i++) {
         and &= x->limb[i];
     }
     and &= (2*out+1);
-    good &= is_zero((and+1)>>58);
+    good &= word_is_zero((and+1)>>58);
     
     return good;
 }
diff --git a/src/p521/arch_x86_64_r12/arch_config.h b/src/p521/arch_x86_64_r12/arch_config.h
deleted file mode 100644
index 58758cc..0000000
--- a/src/p521/arch_x86_64_r12/arch_config.h
+++ /dev/null
@@ -1 +0,0 @@
-#define WORD_BITS 64
diff --git a/src/p521/arch_x86_64_r12/f_impl.c b/src/p521/arch_x86_64_r12/f_impl.c
index 39d0f1e..2040531 100644
--- a/src/p521/arch_x86_64_r12/f_impl.c
+++ b/src/p521/arch_x86_64_r12/f_impl.c
@@ -8,11 +8,6 @@ typedef struct {
   uint64x3_t lo, hi, hier;
 } nonad_t;
 
-static __inline__ uint64_t is_zero(uint64_t a) {
-    /* let's hope the compiler isn't clever enough to optimize this. */
-    return (((__uint128_t)a)-1)>>64;
-}
-
 static inline __uint128_t widemulu(uint64_t a, uint64_t b) {
     return ((__uint128_t)(a)) * b;
 }
@@ -378,7 +373,7 @@ void gf_strong_reduce (gf *a) {
     * so let's add back in p.  will carry back off the top for 2^521.
     */
 
-    assert(is_zero(scarry) | is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
     uint64_t scarry_mask = scarry & mask;
     __uint128_t carry = 0;
@@ -390,7 +385,7 @@ void gf_strong_reduce (gf *a) {
         carry >>= (i==8) ? 57 : 58;
     }
 
-    assert(is_zero(carry + scarry));
+    assert(word_is_zero(carry + scarry));
 
     a->limb[3] = a->limb[7] = a->limb[11] = 0;
 }
@@ -429,14 +424,14 @@ mask_t gf_deserialize (gf *x, const uint8_t serial[LIMBPERM(66)]) {
     }
     
     /* Check for reduction.  First, high has to be < 2^57 */
-    mask_t good = is_zero(out>>57);
+    mask_t good = word_is_zero(out>>57);
     
     uint64_t and = -1ull;
     for (i=0; i<8; i++) {
         and &= x->limb[LIMBPERM(i)];
     }
     and &= (2*out+1);
-    good &= is_zero((and+1)>>58);
+    good &= word_is_zero((and+1)>>58);
 
     x->limb[3] = x->limb[7] = x->limb[11] = 0;