From cdab4953388af472369616fa32c4285dbee1a499 Mon Sep 17 00:00:00 2001
From: Michael Hamburg <mike@shiftleft.org>
Date: Tue, 14 Jul 2015 18:39:28 -0700
Subject: [PATCH] Cross-curve compilation working!  Still a bunch of FIXMEs
 though

---
 Makefile                                   | 146 +++++++++++----------
 src/curve_ed25519/curve_data.inc.c         |  25 +++-
 src/curve_ed25519/field                    |   1 -
 src/curve_ed448goldilocks/curve_data.inc.c |  29 +++-
 src/curve_ed448goldilocks/field            |   1 -
 src/decaf.c                                |  16 ---
 src/decaf_fast.c                           |  99 ++++++++++----
 src/decaf_gen_tables.c                     |  13 +-
 src/p25519/f_field.h                       |   2 +-
 src/p448/arch_32/f_impl.c                  |  28 ++--
 src/p448/arch_32/f_impl.h                  |  64 ++++-----
 src/p448/arch_arm_32/f_impl.c              |  28 ++--
 src/p448/arch_arm_32/f_impl.h              |  64 ++++-----
 src/p448/arch_neon_experimental/f_impl.c   |  28 ++--
 src/p448/arch_neon_experimental/f_impl.h   |  64 ++++-----
 src/p448/arch_ref64/f_impl.c               |  32 ++---
 src/p448/arch_ref64/f_impl.h               |  64 ++++-----
 src/p448/arch_x86_64/f_impl.c              |  32 ++---
 src/p448/arch_x86_64/f_impl.h              |  67 +++++-----
 src/p448/f_field.h                         |   3 +-
 src/public_include/decaf.hxx               |   1 +
 src/public_include/decaf/decaf_255.hxx     |  14 +-
 src/public_include/decaf/decaf_448.h       |  17 ++-
 src/public_include/decaf/decaf_448.hxx     |   8 +-
 test/bench_decaf.cxx                       | 115 +++++++++-------
 test/test_decaf.cxx                        |  17 ++-
 26 files changed, 551 insertions(+), 427 deletions(-)
 delete mode 120000 src/curve_ed25519/field
 delete mode 120000 src/curve_ed448goldilocks/field

diff --git a/Makefile b/Makefile
index 3d34a49..b9221f7 100644
--- a/Makefile
+++ b/Makefile
@@ -39,8 +39,6 @@ else
 ARCH ?= arch_ref32
 endif
 
-FIELD ?= p25519
-
 WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
 	 -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)
 
@@ -79,22 +77,13 @@ SAGES= $(shell ls test/*.sage)
 BUILDPYS= $(SAGES:test/%.sage=$(BUILD_PY)/%.py)
 
 .PHONY: clean all test bench todo doc lib bat sage sagetest
-.PRECIOUS: $(BUILD_ASM)/%.s $(BUILD_ASM)/%_impl.s $(BUILD_ASM)/$(DECAF)_%.s $(BUILD_ASM)/decaf_tables_%.c \
-	$(BUILD_IBIN)/decaf_gen_tables_%
-
-HEADERS= Makefile $(shell find src test -name "*.h") $(shell find . -name "*.hxx") $(BUILD_OBJ)/timestamp
+.PRECIOUS: $(BUILD_ASM)/%.s $(BUILD_C)/%.c $(BUILD_IBIN)/%
 
-# components needed by the table generators
-GENCOMPONENTS=  \
-	$(BUILD_OBJ)/$(DECAF)_ed25519.o $(BUILD_OBJ)/p25519_impl.o  $(BUILD_OBJ)/p25519_arithmetic.o \
-	$(BUILD_OBJ)/utils.o \
-	#$(BUILD_OBJ)/p448_impl.o $(BUILD_OBJ)/p448_arithmetic.o
+HEADERS= Makefile $(shell find src test -name "*.h") $(BUILD_OBJ)/timestamp
+HEADERSXX = $(HEADERS) $(shell find . -name "*.hxx") 
 
 # components needed by the lib
-DECAFCOMPONENTS= $(BUILD_OBJ)/shake.o $(BUILD_OBJ)/decaf_crypto.o $(GENCOMPONENTS)
-ifeq ($(DECAF),decaf_fast)
-DECAFCOMPONENTS += $(BUILD_OBJ)/decaf_tables_ed25519.o
-endif
+LIBCOMPONENTS = $(BUILD_OBJ)/utils.o $(BUILD_OBJ)/shake.o $(BUILD_OBJ)/decaf_crypto.o # and per-field components
 
 BENCHCOMPONENTS = $(BUILD_OBJ)/bench.o $(BUILD_OBJ)/shake.o
 
@@ -105,26 +94,7 @@ scan: clean
 		 -enable-checker deadcode -enable-checker llvm \
 		 -enable-checker osx -enable-checker security -enable-checker unix \
 		make all
-		
-# The shakesum utility is in the public bin directory.
-$(BUILD_BIN)/shakesum: $(BUILD_OBJ)/shakesum.o $(BUILD_OBJ)/shake.o $(BUILD_OBJ)/utils.o
-	$(LD) $(LDFLAGS) -o $@ $^
-
-# The main decaf library, and its symlinks.
-lib: $(BUILD_LIB)/libdecaf.so
-
-$(BUILD_LIB)/libdecaf.so: $(BUILD_LIB)/libdecaf.so.1
-	ln -sf `basename $^` $@
 
-$(BUILD_LIB)/libdecaf.so.1: $(DECAFCOMPONENTS)
-	rm -f $@
-ifeq ($(UNAME),Darwin)
-	libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \
-		  $(DECAFCOMPONENTS)
-else
-	$(LD) $(LDFLAGS) -shared -Wl,-soname,`basename $@` -Wl,--gc-sections -o $@ $(DECAFCOMPONENTS)
-	strip --discard-all $@
-endif
 
 # Internal test programs, which are not part of the final build/bin directory.
 $(BUILD_IBIN)/test: $(BUILD_OBJ)/test_decaf.o lib
@@ -150,50 +120,86 @@ $(BUILD_OBJ)/timestamp:
 $(BUILD_OBJ)/%.o: $(BUILD_ASM)/%.s
 	$(ASM) $(ASFLAGS) -c -o $@ $<
 
-# I don't know why this rule is necessary... bug in make, or obscure pattern matching rule?
-$(BUILD_OBJ)/decaf_gen_tables_%.o: $(BUILD_ASM)/decaf_gen_tables_%.s
-	$(ASM) $(ASFLAGS) -c -o $@ $<
+################################################################
+# Per-field code: call with field, arch
+################################################################
+define define_field
+ARCH_FOR_$(1) = $(2)
+COMPONENTS_OF_$(1) = $$(BUILD_OBJ)/$(1)_impl.o $$(BUILD_OBJ)/$(1)_arithmetic.o
+LIBCOMPONENTS += $$(COMPONENTS_OF_$(1))
+
+$$(BUILD_ASM)/$(1)_arithmetic.s: src/$(1)/f_arithmetic.c $$(HEADERS)
+	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -S -c -o $$@ $$<
+
+$$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$(2)/f_impl.c $$(HEADERS)
+	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -S -c -o $$@ $$<
+endef
+
+################################################################
+# Per-field, per-curve code: call with curve, field
+################################################################
+define define_curve
+$$(BUILD_IBIN)/decaf_gen_tables_$(1): $$(BUILD_OBJ)/decaf_gen_tables_$(1).o $$(BUILD_OBJ)/decaf_fast_$(1).o $$(BUILD_OBJ)/utils.o \
+		$$(COMPONENTS_OF_$(2))
+	$$(LD) $$(LDFLAGS) -o $$@ $$^
+
+$$(BUILD_C)/decaf_tables_$(1).c: $$(BUILD_IBIN)/decaf_gen_tables_$(1)
+	./$$< > $$@ || (rm $$@; exit 1)
+
+$$(BUILD_ASM)/decaf_tables_$(1).s: $$(BUILD_C)/decaf_tables_$(1).c $$(HEADERS)
+	$$(CC) $$(CFLAGS) -S -c -o $$@ $$< \
+		-I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \
+
+$$(BUILD_ASM)/decaf_gen_tables_$(1).s: src/decaf_gen_tables.c $$(HEADERS)
+	$$(CC) $$(CFLAGS) \
+		-I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \
+		-S -c -o $$@ $$<
+
+$$(BUILD_ASM)/decaf_fast_$(1).s: src/decaf_fast.c $$(HEADERS)
+	$$(CC) $$(CFLAGS) \
+		-I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \
+		-S -c -o $$@ $$<
+
+LIBCOMPONENTS += $$(BUILD_OBJ)/decaf_fast_$(1).o $$(BUILD_OBJ)/decaf_tables_$(1).o
+endef
+
+################################################################
+# call code above to generate curves and fields
+$(eval $(call define_field,p25519,arch_x86_64))
+$(eval $(call define_curve,ed25519,p25519))
+$(eval $(call define_field,p448,arch_x86_64))
+$(eval $(call define_curve,ed448goldilocks,p448))
 
-$(BUILD_IBIN)/decaf_gen_tables_%: $(BUILD_OBJ)/decaf_gen_tables_%.o $(GENCOMPONENTS)
+		
+# The shakesum utility is in the public bin directory.
+$(BUILD_BIN)/shakesum: $(BUILD_OBJ)/shakesum.o $(BUILD_OBJ)/shake.o $(BUILD_OBJ)/utils.o
 	$(LD) $(LDFLAGS) -o $@ $^
-	
-$(BUILD_C)/decaf_tables_%.c: $(BUILD_IBIN)/decaf_gen_tables_%
-	./$< > $@
-	
-$(BUILD_ASM)/decaf_tables_%.s: $(BUILD_C)/decaf_tables_%.c $(HEADERS)
-	$(CC) $(CFLAGS) -S -c -o $@ $< \
-		-I src/curve_$*/ -I src/curve_$*/field -I src/curve_$*/field/$(ARCH) \
-	
-$(BUILD_ASM)/decaf_gen_tables_%.s: src/decaf_gen_tables.c $(HEADERS)
-	$(CC) $(CFLAGS) \
-		-I src/curve_$*/ -I src/curve_$*/field -I src/curve_$*/field/$(ARCH) \
-		-S -c -o $@ $<
-	
-$(BUILD_ASM)/decaf_fast_%.s: src/decaf_fast.c $(HEADERS)
-	$(CC) $(CFLAGS) \
-		-I src/curve_$*/ -I src/curve_$*/field -I src/curve_$*/field/$(ARCH) \
-		-S -c -o $@ $<
-	
-$(BUILD_ASM)/%_arithmetic.s: src/%/f_arithmetic.c $(HEADERS)
-	$(CC) $(CFLAGS) \
-		-I src/$* -I src/$*/$(ARCH) \
-		-S -c -o $@ $<
-	
-$(BUILD_ASM)/%_impl.s: src/%/$(ARCH)/f_impl.c $(HEADERS)
-	$(CC) $(CFLAGS) \
-		-I src/$* -I src/$*/$(ARCH) \
-		-S -c -o $@ $<
-	
+
+# The main decaf library, and its symlinks.
+lib: $(BUILD_LIB)/libdecaf.so
+
+$(BUILD_LIB)/libdecaf.so: $(BUILD_LIB)/libdecaf.so.1
+	ln -sf `basename $^` $@
+
+$(BUILD_LIB)/libdecaf.so.1: $(LIBCOMPONENTS)
+	rm -f $@
+ifeq ($(UNAME),Darwin)
+	libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \
+		  $(LIBCOMPONENTS)
+else
+	$(LD) $(LDFLAGS) -shared -Wl,-soname,`basename $@` -Wl,--gc-sections -o $@ $(LIBCOMPONENTS)
+	strip --discard-all $@
+endif
+
+
+
 $(BUILD_ASM)/%.s: src/%.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $<
 	
-$(BUILD_ASM)/%.s: src/%.cxx $(HEADERS)
-	$(CXX) $(CXXFLAGS) -S -c -o $@ $<
-
 $(BUILD_ASM)/%.s: test/%.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $<
 
-$(BUILD_ASM)/%.s: test/%.cxx $(HEADERS)
+$(BUILD_ASM)/%.s: test/%.cxx $(HEADERSXX)
 	$(CXX) $(CXXFLAGS) -S -c -o $@ $<
 
 # The sage test scripts
diff --git a/src/curve_ed25519/curve_data.inc.c b/src/curve_ed25519/curve_data.inc.c
index b669fb0..a9b1a0c 100644
--- a/src/curve_ed25519/curve_data.inc.c
+++ b/src/curve_ed25519/curve_data.inc.c
@@ -1,9 +1,22 @@
-/* Rename table for eventual factoring into .c.inc, MSR ECC style */
+// FIXME move to arch or something
+#define WBITS DECAF_WORD_BITS
+
+#if WBITS == 64
+#define LBITS 51
+typedef __int128_t decaf_sdword_t;
+#define LIMB(x) (x##ull)
+#define SC_LIMB(x) (x##ull)
+#else
+#error "Only supporting 64-bit platforms right now"
+#endif
+
+#define API_NAME "decaf_255"
+#define API_NS(_id) decaf_255_##_id
+#define API_NS2(_pref,_id) _pref##_decaf_255_##_id
+
 #define SCALAR_LIMBS DECAF_255_SCALAR_LIMBS
 #define SCALAR_BITS DECAF_255_SCALAR_BITS
 #define NLIMBS DECAF_255_LIMBS
-#define API_NS(_id) decaf_255_##_id
-#define API_NS2(_pref,_id) _pref##_decaf_255_##_id
 #define scalar_t decaf_255_scalar_t
 #define point_t decaf_255_point_t
 #define precomputed_s decaf_255_precomputed_s
@@ -21,12 +34,14 @@ static const scalar_t sc_p = {{{
     SC_LIMB(0x1000000000000000)
 }}};
 
+#ifdef GEN_TABLES
 /* sqrt(9) = 3 from the curve spec.  Not exported, but used by pregen tool. */
-const unsigned char base_point_ser_for_pregen[SER_BYTES] = {
+static const unsigned char base_point_ser_for_pregen[SER_BYTES] = {
     3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 };
+#endif
 
-const gf SQRT_ONE_MINUS_D = {FIELD_LITERAL(
+static const gf SQRT_ONE_MINUS_D = {FIELD_LITERAL(
     0x6db8831bbddec,
     0x38d7b56c9c165,
     0x016b221394bdc,
diff --git a/src/curve_ed25519/field b/src/curve_ed25519/field
deleted file mode 120000
index 5333fc7..0000000
--- a/src/curve_ed25519/field
+++ /dev/null
@@ -1 +0,0 @@
-../p25519/
\ No newline at end of file
diff --git a/src/curve_ed448goldilocks/curve_data.inc.c b/src/curve_ed448goldilocks/curve_data.inc.c
index 80a674d..77ba847 100644
--- a/src/curve_ed448goldilocks/curve_data.inc.c
+++ b/src/curve_ed448goldilocks/curve_data.inc.c
@@ -1,8 +1,27 @@
+// FIXME move to arch or something
+#define WBITS DECAF_WORD_BITS
+
+#if WBITS == 64
+#define LBITS 56
+typedef __int128_t decaf_sdword_t;
+#define LIMB(x) (x##ull)
+#define SC_LIMB(x) (x##ull)
+#elif WBITS == 32
+typedef int64_t decaf_sdword_t;
+#define LBITS 28
+#define LIMB(x) (x##ull)&((1ull<<LBITS)-1), (x##ull)>>LBITS
+#define SC_LIMB(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
+#else
+#error "Only supporting 32- and 64-bit platforms right now"
+#endif
+
+#define API_NAME "decaf_448"
+#define API_NS(_id) decaf_448_##_id
+#define API_NS2(_pref,_id) _pref##_decaf_448_##_id
+
 #define SCALAR_LIMBS DECAF_448_SCALAR_LIMBS
 #define SCALAR_BITS DECAF_448_SCALAR_BITS
 #define NLIMBS DECAF_448_LIMBS
-#define API_NS(_id) decaf_448_##_id
-#define API_NS2(_pref,_id) _pref##_decaf_448_##_id
 #define scalar_t decaf_448_scalar_t
 #define point_t decaf_448_point_t
 #define precomputed_s decaf_448_precomputed_s
@@ -22,8 +41,10 @@ static const scalar_t sc_p = {{{
     SC_LIMB(0xffffffffffffffff),
     SC_LIMB(0x3fffffffffffffff)
 }}};
- 
+
+#ifdef GEN_TABLES
 /* sqrt(5) = 2phi-1 from the curve spec.  Not exported, but used by pregen tool. */
-const unsigned char base_point_ser_for_pregen[SER_BYTES] = {
+static const unsigned char base_point_ser_for_pregen[SER_BYTES] = {
     -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
 };
+#endif
diff --git a/src/curve_ed448goldilocks/field b/src/curve_ed448goldilocks/field
deleted file mode 120000
index 7efdcab..0000000
--- a/src/curve_ed448goldilocks/field
+++ /dev/null
@@ -1 +0,0 @@
-../p448/
\ No newline at end of file
diff --git a/src/decaf.c b/src/decaf.c
index 199497a..9b342f2 100644
--- a/src/decaf.c
+++ b/src/decaf.c
@@ -13,22 +13,6 @@
 #include <string.h>
 #include <assert.h>
 
-#define WBITS DECAF_WORD_BITS
-
-#if WBITS == 64
-#define LBITS 56
-typedef __int128_t decaf_sdword_t;
-#define LIMB(x) (x##ull)
-#define SC_LIMB(x) (x##ull)
-#elif WBITS == 32
-typedef int64_t decaf_sdword_t;
-#define LBITS 28
-#define LIMB(x) (x##ull)&((1ull<<LBITS)-1), (x##ull)>>LBITS
-#define SC_LIMB(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
-#else
-#error "Only supporting 32- and 64-bit platforms right now"
-#endif
-
 #define sv static void
 #define snv static void __attribute__((noinline))
 #define siv static inline void __attribute__((always_inline))
diff --git a/src/decaf_fast.c b/src/decaf_fast.c
index a7d2b93..9ee3d14 100644
--- a/src/decaf_fast.c
+++ b/src/decaf_fast.c
@@ -15,21 +15,13 @@
 #include "field.h"
 #include "decaf_config.h"
 
-#define WBITS DECAF_WORD_BITS
-#if WBITS == 64
-    typedef __int128_t decaf_sdword_t;
-    #define SC_LIMB(x) (x##ull)
-#elif WBITS == 32
-    typedef int64_t decaf_sdword_t;
-    #define SC_LIMB(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
-#else
-    #error "Only supporting 32- and 64-bit platforms right now"
-#endif
-
-
 /* Include the curve data here */
 #include "curve_data.inc.c"
 
+#if (COFACTOR == 8) && !IMAGINE_TWIST
+/* FUTURE: Curve41417 doesn't have these properties. */
+#error "Currently require IMAGINE_TWIST (and thus p=5 mod 8) for cofactor 8"
+#endif
 
 #if IMAGINE_TWIST && (P_MOD_8 != 5)
 #error "Cannot use IMAGINE_TWIST except for p == 5 mod 8"
@@ -162,6 +154,7 @@ static decaf_word_t hibit(const gf x) {
     return -(y->limb[0]&1);
 }
 
+#if COFACTOR==8
 /** Return high bit of x = low bit of 2x mod p */
 static decaf_word_t lobit(const gf x) {
     gf y;
@@ -169,6 +162,7 @@ static decaf_word_t lobit(const gf x) {
     gf_strong_reduce(y);
     return -(y->limb[0]&1);
 }
+#endif
 
 /** {extra,accum} - sub +? p
  * Must have extra <= 1
@@ -408,27 +402,64 @@ static void deisogenize (
     decaf_bool_t toggle_hibit_t_over_s,
     decaf_bool_t toggle_rotation
 ) {
-    gf c, d, x, t;
+#if COFACTOR == 4 && !IMAGINE_TWIST
+    (void) toggle_rotation;
+    
+    /* TODO: Can shave off one mul here; not important, but would make this consistent with the paper */
+    gf b, d;
+    gf_s *a = s, *c = minus_t_over_s;
+    gf_mulw_sgn ( a, p->y, 1-EDWARDS_D );
+    gf_mul ( c, a, p->t );     /* -dYT, with EDWARDS_D = d-1 */
+    gf_mul ( a, p->x, p->z ); 
+    gf_sub ( d, c, a );  /* aXZ-dYT with a=-1 */
+    gf_add ( a, p->z, p->y ); 
+    gf_sub ( b, p->z, p->y ); 
+    gf_mul ( c, b, a );
+    gf_mulw_sgn ( b, c, -EDWARDS_D ); /* (a-d)(Z+Y)(Z-Y) */
+    decaf_bool_t ok = gf_isqrt_chk ( a, b, DECAF_TRUE ); /* r in the paper */
+    (void)ok; assert(ok);
+    gf_mulw_sgn ( b, a, -EDWARDS_D ); /* u in the paper */
+    gf_mul ( c, b, a ); /* ur */
+    gf_mul ( a, c, d ); /* ur (aZX-dYT) */
+    gf_add ( d, b, b );  /* 2u = -2au since a=-1 */
+    gf_mul ( c, d, p->z ); /* 2uZ */
+    cond_neg ( b, toggle_hibit_t_over_s ^ ~hibit(c) ); /* u <- -u if negative. */
+    cond_neg ( c, toggle_hibit_t_over_s ^ ~hibit(c) ); /* 2uZ <- -2uZ under the same condition as u. */
+    gf_mul ( d, b, p->y ); 
+    gf_add ( s, a, d );
+    cond_neg ( s, toggle_hibit_s ^ hibit(s) );
+#else
+    /* More complicated because of rotation */
+    /* FIXME This code is wrong for certain non-Curve25519 curves; check if it's because of Cofactor==8 or IMAGINE_ROTATION */
+    
+    gf c, d;
     gf_s *b = s, *a = minus_t_over_s;
 
 #if IMAGINE_TWIST
+    gf x, t;
     gf_mul ( x, p->x, SQRT_MINUS_ONE);
     gf_mul ( t, p->t, SQRT_MINUS_ONE);
     gf_sub ( x, ZERO, x );
     gf_sub ( t, ZERO, t );
-#endif
     
-    gf DEBUG;
     gf_add ( a, p->z, x );
     gf_sub ( b, p->z, x );
-    gf_mul ( c, a, b ); /* "zx" = Z^2 - X^2 */
-    gf_cpy(DEBUG,c);
+    gf_mul ( c, a, b ); /* "zx" = Z^2 - aX^2 = Z^2 - X^2 */
+#else
+    const gf_s *x = p->x, *t = p->t;
+    /* Won't hit the cond_sel below because COFACTOR==8 requires IMAGINE_TWIST for now. */
+    
+    gf_sqr ( a, p->z );
+    gf_sqr ( b, p->x );
+    gf_add ( c, a, b ); /* "zx" = Z^2 - aX^2 = Z^2 + X^2 */
+#endif
+    
     gf_mul ( a, p->z, t ); /* "tz" = T*Z */
     gf_sqr ( b, a );
-    gf_mul ( d, b, c ); /* (TZ)^2 * (Z^2-X^2) */
+    gf_mul ( d, b, c ); /* (TZ)^2 * (Z^2-aX^2) */
     decaf_bool_t ok = gf_isqrt_chk ( b, d, DECAF_TRUE );
     (void)ok; assert(ok);
-    gf_mul ( d, b, a ); /* "osx" = 1 / sqrt(z^2-x^2) */
+    gf_mul ( d, b, a ); /* "osx" = 1 / sqrt(z^2-ax^2) */
     gf_mul ( a, b, c ); 
     gf_mul ( b, a, d ); /* 1/tz */
 
@@ -445,6 +476,7 @@ static void deisogenize (
         cond_sel ( x, p->y, x, rotate );
     }
 #else
+    (void)toggle_rotation;
     rotate = 0;
 #endif
     
@@ -458,6 +490,8 @@ static void deisogenize (
     gf_add ( d, d, c );
     gf_mul ( b, d, x ); /* here "x" = y unless rotate */
     cond_neg ( b, toggle_hibit_s ^ hibit(b) );
+    
+#endif
 }
 
 void API_NS(point_encode)( unsigned char ser[SER_BYTES], const point_t p ) {
@@ -472,7 +506,7 @@ void API_NS(point_encode)( unsigned char ser[SER_BYTES], const point_t p ) {
 static decaf_bool_t gf_deser(gf s, const unsigned char ser[SER_BYTES]) {
     return gf_deserialize((gf_s *)s, ser);
 }
-   
+
 decaf_bool_t API_NS(point_decode) (
     point_t p,
     const unsigned char ser[SER_BYTES],
@@ -483,25 +517,32 @@ decaf_bool_t API_NS(point_decode) (
     succ &= allow_identity | ~zero;
     succ &= ~hibit(s);
     gf_sqr ( a, s );
-    gf_sub ( f, ONE, a ); /* f = 1-s^2 = 1-as^2 since a=1 */
+#if IMAGINE_TWIST
+    gf_sub ( f, ONE, a ); /* f = 1-as^2 = 1-s^2*/
+#else
+    gf_add ( f, ONE, a ); /* f = 1-as^2 = 1+s^2 */
+#endif
     succ &= ~ gf_eq( f, ZERO );
     gf_sqr ( b, f ); 
-    gf_mulw_sgn ( c, a, 4-4*EDWARDS_D ); 
+    gf_mulw_sgn ( c, a, 4*IMAGINE_TWIST-4*EDWARDS_D ); 
     gf_add ( c, c, b ); /* t^2 */
-    gf_mul ( d, f, s ); /* s(1-s^2) for denoms */
+    gf_mul ( d, f, s ); /* s(1-as^2) for denoms */
     gf_sqr ( e, d );
     gf_mul ( b, c, e );
     
-    succ &= gf_isqrt_chk ( e, b, DECAF_TRUE ); /* e = 1/(t s (1-s^2)) */
+    succ &= gf_isqrt_chk ( e, b, DECAF_TRUE ); /* e = 1/(t s (1-as^2)) */
     gf_mul ( b, e, d ); /* 1/t */
-    gf_mul ( d, e, c ); /* d = t / (s(1-s^2)) */
+    gf_mul ( d, e, c ); /* d = t / (s(1-as^2)) */
     gf_mul ( e, d, f ); /* t/s */
     decaf_bool_t negtos = hibit(e);
     cond_neg(b, negtos);
     cond_neg(d, negtos);
-    
-    gf_add ( p->z, ONE, a); /* Z = 1+s^2 */
-    succ &= ~gf_eq( p->z, ZERO ); /* FUTURE: unnecessary? */
+
+#if IMAGINE_TWIST
+    gf_add ( p->z, ONE, a); /* Z = 1+as^2 = 1+s^2 */
+#else
+    gf_sub ( p->z, ONE, a); /* Z = 1+as^2 = 1-s^2 */
+#endif
 
 #if COFACTOR == 8
     gf_mul ( a, p->z, d); /* t(1+s^2) / s(1-s^2) = 2/xy */
@@ -745,7 +786,7 @@ static void pt_to_pniels (
 ) {
     gf_sub ( b->n->a, a->y, a->x );
     gf_add ( b->n->b, a->x, a->y );
-    gf_mulw_sgn ( b->n->c, a->t, 2*EFF_D );
+    gf_mulw_sgn ( b->n->c, a->t, 2*TWISTED_D );
     gf_add ( b->z, a->z, a->z );
 }
 
diff --git a/src/decaf_gen_tables.c b/src/decaf_gen_tables.c
index c0aaa29..1db5aa6 100644
--- a/src/decaf_gen_tables.c
+++ b/src/decaf_gen_tables.c
@@ -15,8 +15,8 @@
 #include "decaf_config.h"
 #include "field.h"
 
-#define API_NS(_id) decaf_255_##_id
-#define API_NS2(_pref,_id) _pref##_decaf_255_##_id
+#define GEN_TABLES
+#include "curve_data.inc.c"
 
  /* To satisfy linker. */
 const gf API_NS(precomputed_base_as_fe)[1];
@@ -24,7 +24,6 @@ const API_NS(scalar_t) API_NS(precomputed_scalarmul_adjustment);
 const API_NS(scalar_t) API_NS(point_scalarmul_adjustment);
 const API_NS(scalar_t) API_NS(sc_r2) = {{{0}}};
 const decaf_word_t API_NS(MONTGOMERY_FACTOR) = 0;
-const unsigned char base_point_ser_for_pregen[DECAF_255_SER_BYTES];
 
 const API_NS(point_t) API_NS(point_base);
 
@@ -94,8 +93,8 @@ int main(int argc, char **argv) {
     printf("/** @warning: this file was automatically generated. */\n");
     printf("#include <decaf.h>\n\n");
     printf("#include \"field.h\"\n\n");
-    printf("#define API_NS(_id) decaf_255_##_id\n");
-    printf("#define API_NS2(_pref,_id) _pref##_decaf_255_##_id\n");
+    printf("#define API_NS(_id) %s_##_id\n", API_NAME);
+    printf("#define API_NS2(_pref,_id) _pref##_%s_##_id\n", API_NAME);
     
     output = (const gf_s *)real_point_base;
     printf("const API_NS(point_t) API_NS(point_base) = {{\n");
@@ -136,8 +135,8 @@ int main(int argc, char **argv) {
     scalar_print("API_NS(precomputed_scalarmul_adjustment)", smadj);
     
     API_NS(scalar_copy)(smadj,API_NS(scalar_one));
-    for (i=0; i<DECAF_255_SCALAR_BITS-1 + DECAF_WINDOW_BITS
-            - ((DECAF_255_SCALAR_BITS-1)%DECAF_WINDOW_BITS); i++) {
+    for (i=0; i<SCALAR_BITS-1 + DECAF_WINDOW_BITS
+            - ((SCALAR_BITS-1) % DECAF_WINDOW_BITS); i++) {
         API_NS(scalar_add)(smadj,smadj,smadj);
     }
     API_NS(scalar_sub)(smadj, smadj, API_NS(scalar_one));
diff --git a/src/p25519/f_field.h b/src/p25519/f_field.h
index b210644..d9e94a3 100644
--- a/src/p25519/f_field.h
+++ b/src/p25519/f_field.h
@@ -15,7 +15,7 @@
 #include "f_impl.h"
 #define GF_LIT_LIMB_BITS  51
 #define GF_BITS           255
-#define gf              gf_25519_t
+#define gf                gf_25519_t
 #define gf_s              gf_25519_s
 #define gf_mul            gf_25519_mul
 #define gf_sqr            gf_25519_sqr
diff --git a/src/p448/arch_32/f_impl.c b/src/p448/arch_32/f_impl.c
index f842b5f..b9a5872 100644
--- a/src/p448/arch_32/f_impl.c
+++ b/src/p448/arch_32/f_impl.c
@@ -23,9 +23,9 @@ static uint64_t widemul_32 (
 
 void
 p448_mul (
-    p448_t *__restrict__ cs,
-    const p448_t *as,
-    const p448_t *bs
+    gf_448_s *__restrict__ cs,
+    const gf_448_t as,
+    const gf_448_t bs
 ) { 
     const uint32_t *a = as->limb, *b = bs->limb;
     uint32_t *c = cs->limb;
@@ -84,8 +84,8 @@ p448_mul (
 
 void
 p448_mulw (
-    p448_t *__restrict__ cs,
-    const p448_t *as,
+    gf_448_s *__restrict__ cs,
+    const gf_448_t as,
     uint64_t b
 ) {
     const uint32_t bhi = b>>28, blo = b & ((1<<28)-1);
@@ -128,15 +128,15 @@ p448_mulw (
 
 void
 p448_sqr (
-    p448_t *__restrict__ cs,
-    const p448_t *as
+    gf_448_s *__restrict__ cs,
+    const gf_448_t as
 ) {
     p448_mul(cs,as,as); /* PERF */
 }
 
 void
 p448_strong_reduce (
-    p448_t *a
+    gf_448_t a
 ) {
     word_t mask = (1ull<<28)-1;
 
@@ -180,14 +180,14 @@ p448_strong_reduce (
 void
 p448_serialize (
     uint8_t *serial,
-    const struct p448_t *x
+    const gf_448_t x
 ) {
     int i,j;
-    p448_t red;
-    p448_copy(&red, x);
-    p448_strong_reduce(&red);
+    gf_448_t red;
+    p448_copy(red, x);
+    p448_strong_reduce(red);
     for (i=0; i<8; i++) {
-        uint64_t limb = red.limb[2*i] + (((uint64_t)red.limb[2*i+1])<<28);
+        uint64_t limb = red->limb[2*i] + (((uint64_t)red->limb[2*i+1])<<28);
         for (j=0; j<7; j++) {
             serial[7*i+j] = limb;
             limb >>= 8;
@@ -198,7 +198,7 @@ p448_serialize (
 
 mask_t
 p448_deserialize (
-    p448_t *x,
+    gf_448_t x,
     const uint8_t serial[56]
 ) {
     int i,j;
diff --git a/src/p448/arch_32/f_impl.h b/src/p448/arch_32/f_impl.h
index 89bf763..d1f6f72 100644
--- a/src/p448/arch_32/f_impl.h
+++ b/src/p448/arch_32/f_impl.h
@@ -9,9 +9,9 @@
 #include <stdint.h>
 #include <assert.h>
 
-typedef struct p448_t {
+typedef struct gf_448_s {
   uint32_t limb[16];
-} __attribute__((aligned(32))) p448_t;
+} __attribute__((aligned(32))) gf_448_s, gf_448_t[1];
 
 #define LBITS 28
 #define LIMB(x) (x##ull)&((1ull<<LBITS)-1), (x##ull)>>LBITS
@@ -24,69 +24,69 @@ extern "C" {
 
 static __inline__ void
 p448_add_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) __attribute__((unused,always_inline));
              
 static __inline__ void
 p448_sub_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) __attribute__((unused,always_inline));
              
 static __inline__ void
 p448_copy (
-    p448_t *out,
-    const p448_t *a
+    gf_448_t out,
+    const gf_448_t a
 ) __attribute__((unused,always_inline));
              
 static __inline__ void
 p448_weak_reduce (
-    p448_t *inout
+    gf_448_t inout
 ) __attribute__((unused,always_inline));
              
 void
 p448_strong_reduce (
-    p448_t *inout
+    gf_448_t inout
 );
              
 static __inline__ void
 p448_bias (
-    p448_t *inout,
+    gf_448_t inout,
     int amount
 ) __attribute__((unused,always_inline));
 
 void
 p448_mul (
-    p448_t *__restrict__ out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_s *__restrict__ out,
+    const gf_448_t a,
+    const gf_448_t b
 );
 
 void
 p448_mulw (
-    p448_t *__restrict__ out,
-    const p448_t *a,
+    gf_448_s *__restrict__ out,
+    const gf_448_t a,
     uint64_t b
 );
 
 void
 p448_sqr (
-    p448_t *__restrict__ out,
-    const p448_t *a
+    gf_448_s *__restrict__ out,
+    const gf_448_t a
 );
 
 void
 p448_serialize (
     uint8_t *serial,
-    const struct p448_t *x
+    const gf_448_t x
 );
 
 mask_t
 p448_deserialize (
-    p448_t *x,
+    gf_448_t x,
     const uint8_t serial[56]
 );
 
@@ -94,9 +94,9 @@ p448_deserialize (
 
 void
 p448_add_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) {
     unsigned int i;
     for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
@@ -112,9 +112,9 @@ p448_add_RAW (
 
 void
 p448_sub_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) {
     unsigned int i;
     for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
@@ -130,15 +130,15 @@ p448_sub_RAW (
 
 void
 p448_copy (
-    p448_t *out,
-    const p448_t *a
+    gf_448_t out,
+    const gf_448_t a
 ) {
   *out = *a;
 }
 
 void
 p448_bias (
-    p448_t *a,
+    gf_448_t a,
     int amt
 ) {
     uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt;
@@ -152,7 +152,7 @@ p448_bias (
 
 void
 p448_weak_reduce (
-    p448_t *a
+    gf_448_t a
 ) {
     uint64_t mask = (1ull<<28) - 1;
     uint64_t tmp = a->limb[15] >> 28;
diff --git a/src/p448/arch_arm_32/f_impl.c b/src/p448/arch_arm_32/f_impl.c
index e0edfb9..068774a 100644
--- a/src/p448/arch_arm_32/f_impl.c
+++ b/src/p448/arch_arm_32/f_impl.c
@@ -100,9 +100,9 @@ smull2 (
 
 void
 p448_mul (
-    p448_t *__restrict__ cs,
-    const p448_t *as,
-    const p448_t *bs
+    gf_448_s *__restrict__ cs,
+    const gf_448_t as,
+    const gf_448_t bs
 ) {
     
     const uint32_t *a = as->limb, *b = bs->limb;
@@ -451,8 +451,8 @@ p448_mul (
 
 void
 p448_sqr (
-    p448_t *__restrict__ cs,
-    const p448_t *as
+    gf_448_s *__restrict__ cs,
+    const gf_448_t as
 ) {
     const uint32_t *a = as->limb;
     uint32_t *c = cs->limb;
@@ -749,8 +749,8 @@ p448_sqr (
 
 void
 p448_mulw (
-    p448_t *__restrict__ cs,
-    const p448_t *as,
+    gf_448_s *__restrict__ cs,
+    const gf_448_t as,
     uint64_t b
 ) {
     uint32_t mask = (1ull<<28)-1;  
@@ -863,7 +863,7 @@ p448_mulw (
 
 void
 p448_strong_reduce (
-    p448_t *a
+    gf_448_t a
 ) {
     word_t mask = (1ull<<28)-1;
 
@@ -907,14 +907,14 @@ p448_strong_reduce (
 void
 p448_serialize (
     uint8_t *serial,
-    const struct p448_t *x
+    const gf_448_t x
 ) {
     int i,j;
-    p448_t red;
-    p448_copy(&red, x);
-    p448_strong_reduce(&red);
+    gf_448_t red;
+    p448_copy(red, x);
+    p448_strong_reduce(red);
     for (i=0; i<8; i++) {
-        uint64_t limb = red.limb[2*i] + (((uint64_t)red.limb[2*i+1])<<28);
+        uint64_t limb = red->limb[2*i] + (((uint64_t)red->limb[2*i+1])<<28);
         for (j=0; j<7; j++) {
             serial[7*i+j] = limb;
             limb >>= 8;
@@ -925,7 +925,7 @@ p448_serialize (
 
 mask_t
 p448_deserialize (
-    p448_t *x,
+    gf_448_t x,
     const uint8_t serial[56]
 ) {
     int i,j;
diff --git a/src/p448/arch_arm_32/f_impl.h b/src/p448/arch_arm_32/f_impl.h
index 89bf763..d1f6f72 100644
--- a/src/p448/arch_arm_32/f_impl.h
+++ b/src/p448/arch_arm_32/f_impl.h
@@ -9,9 +9,9 @@
 #include <stdint.h>
 #include <assert.h>
 
-typedef struct p448_t {
+typedef struct gf_448_s {
   uint32_t limb[16];
-} __attribute__((aligned(32))) p448_t;
+} __attribute__((aligned(32))) gf_448_s, gf_448_t[1];
 
 #define LBITS 28
 #define LIMB(x) (x##ull)&((1ull<<LBITS)-1), (x##ull)>>LBITS
@@ -24,69 +24,69 @@ extern "C" {
 
 static __inline__ void
 p448_add_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) __attribute__((unused,always_inline));
              
 static __inline__ void
 p448_sub_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) __attribute__((unused,always_inline));
              
 static __inline__ void
 p448_copy (
-    p448_t *out,
-    const p448_t *a
+    gf_448_t out,
+    const gf_448_t a
 ) __attribute__((unused,always_inline));
              
 static __inline__ void
 p448_weak_reduce (
-    p448_t *inout
+    gf_448_t inout
 ) __attribute__((unused,always_inline));
              
 void
 p448_strong_reduce (
-    p448_t *inout
+    gf_448_t inout
 );
              
 static __inline__ void
 p448_bias (
-    p448_t *inout,
+    gf_448_t inout,
     int amount
 ) __attribute__((unused,always_inline));
 
 void
 p448_mul (
-    p448_t *__restrict__ out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_s *__restrict__ out,
+    const gf_448_t a,
+    const gf_448_t b
 );
 
 void
 p448_mulw (
-    p448_t *__restrict__ out,
-    const p448_t *a,
+    gf_448_s *__restrict__ out,
+    const gf_448_t a,
     uint64_t b
 );
 
 void
 p448_sqr (
-    p448_t *__restrict__ out,
-    const p448_t *a
+    gf_448_s *__restrict__ out,
+    const gf_448_t a
 );
 
 void
 p448_serialize (
     uint8_t *serial,
-    const struct p448_t *x
+    const gf_448_t x
 );
 
 mask_t
 p448_deserialize (
-    p448_t *x,
+    gf_448_t x,
     const uint8_t serial[56]
 );
 
@@ -94,9 +94,9 @@ p448_deserialize (
 
 void
 p448_add_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) {
     unsigned int i;
     for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
@@ -112,9 +112,9 @@ p448_add_RAW (
 
 void
 p448_sub_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) {
     unsigned int i;
     for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
@@ -130,15 +130,15 @@ p448_sub_RAW (
 
 void
 p448_copy (
-    p448_t *out,
-    const p448_t *a
+    gf_448_t out,
+    const gf_448_t a
 ) {
   *out = *a;
 }
 
 void
 p448_bias (
-    p448_t *a,
+    gf_448_t a,
     int amt
 ) {
     uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt;
@@ -152,7 +152,7 @@ p448_bias (
 
 void
 p448_weak_reduce (
-    p448_t *a
+    gf_448_t a
 ) {
     uint64_t mask = (1ull<<28) - 1;
     uint64_t tmp = a->limb[15] >> 28;
diff --git a/src/p448/arch_neon_experimental/f_impl.c b/src/p448/arch_neon_experimental/f_impl.c
index 6e57b8f..371e668 100644
--- a/src/p448/arch_neon_experimental/f_impl.c
+++ b/src/p448/arch_neon_experimental/f_impl.c
@@ -70,9 +70,9 @@ smull2 (
 
 void
 p448_mul (
-    p448_t *__restrict__ cs,
-    const p448_t *as,
-    const p448_t *bs
+    gf_448_s *__restrict__ cs,
+    const gf_448_t as,
+    const gf_448_t bs
 ) {
     #define _bl0 "q0"
     #define _bl0_0 "d0"
@@ -369,8 +369,8 @@ p448_mul (
 
 void
 p448_sqr (
-    p448_t *__restrict__ cs,
-    const p448_t *bs
+    gf_448_s *__restrict__ cs,
+    const gf_448_t bs
 ) {
     int32x2_t *vc = (int32x2_t*) cs->limb;
 
@@ -570,8 +570,8 @@ p448_sqr (
 
 void
 p448_mulw (
-    p448_t *__restrict__ cs,
-    const p448_t *as,
+    gf_448_s *__restrict__ cs,
+    const gf_448_t as,
     uint64_t b
 ) { 
     uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1};
@@ -621,7 +621,7 @@ p448_mulw (
 /* PERF: vectorize? */
 void
 p448_strong_reduce (
-    p448_t *a
+    gf_448_t a
 ) { 
     word_t mask = (1ull<<28)-1;
 
@@ -665,15 +665,15 @@ p448_strong_reduce (
 void
 p448_serialize (
     uint8_t *serial,
-    const struct p448_t *x
+    const gf_448_t x
 ) {
     int i,j;
-    p448_t red;
-    p448_copy(&red, x);
-    p448_strong_reduce(&red);
+    gf_448_t red;
+    p448_copy(red, x);
+    p448_strong_reduce(red);
     
     for (i=0; i<8; i++) {
-        uint64_t limb = red.limb[LIMBPERM(2*i)] + (((uint64_t)red.limb[LIMBPERM(2*i+1)])<<28);
+        uint64_t limb = red->limb[LIMBPERM(2*i)] + (((uint64_t)red->limb[LIMBPERM(2*i+1)])<<28);
         for (j=0; j<7; j++) {
             serial[7*i+j] = limb;
             limb >>= 8;
@@ -684,7 +684,7 @@ p448_serialize (
 
 mask_t
 p448_deserialize (
-    p448_t *x,
+    gf_448_t x,
     const uint8_t serial[56]
 ) {
     int i,j;
diff --git a/src/p448/arch_neon_experimental/f_impl.h b/src/p448/arch_neon_experimental/f_impl.h
index 75bd92e..6a26a6f 100644
--- a/src/p448/arch_neon_experimental/f_impl.h
+++ b/src/p448/arch_neon_experimental/f_impl.h
@@ -9,9 +9,9 @@
 #include <stdint.h>
 #include <assert.h>
 
-typedef struct p448_t {
+typedef struct gf_448_s {
   uint32_t limb[16];
-} __attribute__((aligned(32))) p448_t;
+} __attribute__((aligned(32))) gf_448_s, gf_448_t[1];
 
 #define LIMBPERM(x) (((x)<<1 | (x)>>3) & 15)
 #define USE_NEON_PERM 1
@@ -30,69 +30,69 @@ extern "C" {
 
 static __inline__ void
 p448_add_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) __attribute__((unused,always_inline));
              
 static __inline__ void
 p448_sub_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) __attribute__((unused,always_inline));
              
 static __inline__ void
 p448_copy (
-    p448_t *out,
-    const p448_t *a
+    gf_448_t out,
+    const gf_448_t a
 ) __attribute__((unused,always_inline));
              
 static __inline__ void
 p448_weak_reduce (
-    p448_t *inout
+    gf_448_t inout
 ) __attribute__((unused,always_inline));
              
 void
 p448_strong_reduce (
-    p448_t *inout
+    gf_448_t inout
 );
              
 static __inline__ void
 p448_bias (
-    p448_t *inout,
+    gf_448_t inout,
     int amount
 ) __attribute__((unused,always_inline));
 
 void
 p448_mul (
-    p448_t *__restrict__ out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_s *__restrict__ out,
+    const gf_448_t a,
+    const gf_448_t b
 );
 
 void
 p448_mulw (
-    p448_t *__restrict__ out,
-    const p448_t *a,
+    gf_448_s *__restrict__ out,
+    const gf_448_t a,
     uint64_t b
 );
 
 void
 p448_sqr (
-    p448_t *__restrict__ out,
-    const p448_t *a
+    gf_448_s *__restrict__ out,
+    const gf_448_t a
 );
 
 void
 p448_serialize (
     uint8_t *serial,
-    const struct p448_t *x
+    const gf_448_t x
 );
 
 mask_t
 p448_deserialize (
-    p448_t *x,
+    gf_448_t x,
     const uint8_t serial[56]
 );
 
@@ -100,9 +100,9 @@ p448_deserialize (
 
 void
 p448_add_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) {
     unsigned int i;
     for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
@@ -112,9 +112,9 @@ p448_add_RAW (
 
 void
 p448_sub_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) {
     unsigned int i;
     for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
@@ -130,15 +130,15 @@ p448_sub_RAW (
 
 void
 p448_copy (
-    p448_t *out,
-    const p448_t *a
+    gf_448_t out,
+    const gf_448_t a
 ) {
   *out = *a;
 }
 
 void
 p448_bias (
-    p448_t *a,
+    gf_448_t a,
     int amt
 ) {
     uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt;
@@ -152,7 +152,7 @@ p448_bias (
 
 void
 p448_weak_reduce (
-    p448_t *a
+    gf_448_t a
 ) {
 
     uint32x2_t *aa = (uint32x2_t*) a, vmask = {(1ull<<28)-1, (1ull<<28)-1}, vm2 = {0,-1},
diff --git a/src/p448/arch_ref64/f_impl.c b/src/p448/arch_ref64/f_impl.c
index bf08f49..4685188 100644
--- a/src/p448/arch_ref64/f_impl.c
+++ b/src/p448/arch_ref64/f_impl.c
@@ -18,9 +18,9 @@ static __inline__ uint64_t is_zero(uint64_t a) {
 
 void
 p448_mul (
-    p448_t *__restrict__ cs,
-    const p448_t *as,
-    const p448_t *bs
+    gf_448_s *__restrict__ cs,
+    const gf_448_t as,
+    const gf_448_t bs
 ) {
     const uint64_t *a = as->limb, *b = bs->limb;
     uint64_t *c = cs->limb;
@@ -184,8 +184,8 @@ p448_mul (
 
 void
 p448_mulw (
-    p448_t *__restrict__ cs,
-    const p448_t *as,
+    gf_448_s *__restrict__ cs,
+    const gf_448_t as,
     uint64_t b
 ) {
     const uint64_t *a = as->limb;
@@ -213,8 +213,8 @@ p448_mulw (
 
 void
 p448_sqr (
-    p448_t *__restrict__ cs,
-    const p448_t *as
+    gf_448_s *__restrict__ cs,
+    const gf_448_t as
 ) {
     const uint64_t *a = as->limb;
     uint64_t *c = cs->limb;
@@ -328,7 +328,7 @@ p448_sqr (
 
 void
 p448_strong_reduce (
-    p448_t *a
+    gf_448_t a
 ) {
     uint64_t mask = (1ull<<56)-1;
 
@@ -372,24 +372,24 @@ p448_strong_reduce (
 void
 p448_serialize (
     uint8_t *serial,
-    const struct p448_t *x
+    const gf_448_t x
 ) {
     int i,j;
-    p448_t red;
-    p448_copy(&red, x);
-    p448_strong_reduce(&red);
+    gf_448_t red;
+    p448_copy(red, x);
+    p448_strong_reduce(red);
     for (i=0; i<8; i++) {
         for (j=0; j<7; j++) {
-            serial[7*i+j] = red.limb[i];
-            red.limb[i] >>= 8;
+            serial[7*i+j] = red->limb[i];
+            red->limb[i] >>= 8;
         }
-        assert(red.limb[i] == 0);
+        assert(red->limb[i] == 0);
     }
 }
 
 mask_t
 p448_deserialize (
-    p448_t *x,
+    gf_448_t x,
     const uint8_t serial[56]
 ) {
     int i,j;
diff --git a/src/p448/arch_ref64/f_impl.h b/src/p448/arch_ref64/f_impl.h
index b7ff50d..5fe6590 100644
--- a/src/p448/arch_ref64/f_impl.h
+++ b/src/p448/arch_ref64/f_impl.h
@@ -10,9 +10,9 @@
 
 #include "word.h"
 
-typedef struct p448_t {
+typedef struct gf_448_s {
   uint64_t limb[8];
-} __attribute__((aligned(32))) p448_t;
+} __attribute__((aligned(32))) gf_448_s, gf_448_t[1];
 
 #define LBITS 56
 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
@@ -23,69 +23,69 @@ extern "C" {
 
 static __inline__ void
 p448_add_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) __attribute__((unused));
              
 static __inline__ void
 p448_sub_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) __attribute__((unused));
              
 static __inline__ void
 p448_copy (
-    p448_t *out,
-    const p448_t *a
+    gf_448_t out,
+    const gf_448_t a
 ) __attribute__((unused));
              
 static __inline__ void
 p448_weak_reduce (
-    p448_t *inout
+    gf_448_t inout
 ) __attribute__((unused));
              
 void
 p448_strong_reduce (
-    p448_t *inout
+    gf_448_t inout
 );
 
 static __inline__ void
 p448_bias (
-    p448_t *inout,
+    gf_448_t inout,
     int amount
 ) __attribute__((unused));
          
 void
 p448_mul (
-    p448_t *__restrict__ out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_s *__restrict__ out,
+    const gf_448_t a,
+    const gf_448_t b
 );
 
 void
 p448_mulw (
-    p448_t *__restrict__ out,
-    const p448_t *a,
+    gf_448_s *__restrict__ out,
+    const gf_448_t a,
     uint64_t b
 );
 
 void
 p448_sqr (
-    p448_t *__restrict__ out,
-    const p448_t *a
+    gf_448_s *__restrict__ out,
+    const gf_448_t a
 );
 
 void
 p448_serialize (
     uint8_t *serial,
-    const struct p448_t *x
+    const gf_448_t x
 );
 
 mask_t
 p448_deserialize (
-    p448_t *x,
+    gf_448_t x,
     const uint8_t serial[56]
 );
 
@@ -93,9 +93,9 @@ p448_deserialize (
 
 void
 p448_add_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) {
     unsigned int i;
     for (i=0; i<8; i++) {
@@ -106,9 +106,9 @@ p448_add_RAW (
 
 void
 p448_sub_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) {
     unsigned int i;
     uint64_t co1 = ((1ull<<56)-1)*2, co2 = co1-2;
@@ -120,15 +120,15 @@ p448_sub_RAW (
 
 void
 p448_copy (
-    p448_t *out,
-    const p448_t *a
+    gf_448_t out,
+    const gf_448_t a
 ) {
     memcpy(out,a,sizeof(*a));
 }
 
 void
 p448_bias (
-    p448_t *a,
+    gf_448_t a,
     int amt
 ) {
     (void) a;
@@ -137,7 +137,7 @@ p448_bias (
 
 void
 p448_weak_reduce (
-    p448_t *a
+    gf_448_t a
 ) {
     uint64_t mask = (1ull<<56) - 1;
     uint64_t tmp = a->limb[7] >> 56;
diff --git a/src/p448/arch_x86_64/f_impl.c b/src/p448/arch_x86_64/f_impl.c
index 9df771e..e959dbc 100644
--- a/src/p448/arch_x86_64/f_impl.c
+++ b/src/p448/arch_x86_64/f_impl.c
@@ -7,9 +7,9 @@
 
 void
 p448_mul (
-    p448_t *__restrict__ cs,
-    const p448_t *as,
-    const p448_t *bs
+    gf_448_s *__restrict__ cs,
+    const gf_448_t as,
+    const gf_448_t bs
 ) {
     const uint64_t *a = as->limb, *b = bs->limb;
     uint64_t *c = cs->limb;
@@ -147,8 +147,8 @@ p448_mul (
 
 void
 p448_mulw (
-    p448_t *__restrict__ cs,
-    const p448_t *as,
+    gf_448_s *__restrict__ cs,
+    const gf_448_t as,
     uint64_t b
 ) {
     const uint64_t *a = as->limb;
@@ -192,8 +192,8 @@ p448_mulw (
 
 void
 p448_sqr (
-    p448_t *__restrict__ cs,
-    const p448_t *as
+    gf_448_s *__restrict__ cs,
+    const gf_448_t as
 ) {
     const uint64_t *a = as->limb;
     uint64_t *c = cs->limb;
@@ -307,7 +307,7 @@ p448_sqr (
 
 void
 p448_strong_reduce (
-    p448_t *a
+    gf_448_t a
 ) {
     uint64_t mask = (1ull<<56)-1;
 
@@ -351,24 +351,24 @@ p448_strong_reduce (
 void
 p448_serialize (
     uint8_t *serial,
-    const struct p448_t *x
+    const gf_448_t x
 ) {
     int i,j;
-    p448_t red;
-    p448_copy(&red, x);
-    p448_strong_reduce(&red);
+    gf_448_t red;
+    p448_copy(red, x);
+    p448_strong_reduce(red);
     for (i=0; i<8; i++) {
         for (j=0; j<7; j++) {
-            serial[7*i+j] = red.limb[i];
-            red.limb[i] >>= 8;
+            serial[7*i+j] = red->limb[i];
+            red->limb[i] >>= 8;
         }
-        assert(red.limb[i] == 0);
+        assert(red->limb[i] == 0);
     }
 }
 
 mask_t
 p448_deserialize (
-    p448_t *x,
+    gf_448_t x,
     const uint8_t serial[56]
 ) {
     int i,j;
diff --git a/src/p448/arch_x86_64/f_impl.h b/src/p448/arch_x86_64/f_impl.h
index 6e7c523..aa93b24 100644
--- a/src/p448/arch_x86_64/f_impl.h
+++ b/src/p448/arch_x86_64/f_impl.h
@@ -9,9 +9,12 @@
 
 #include "word.h"
 
-typedef struct p448_t {
+#ifndef __DECAF_448_H__ // HACK FIXME
+#define DECAF_WORD_BITS 64
+typedef struct gf_448_s {
   uint64_t limb[8];
-} __attribute__((aligned(32))) p448_t;
+} __attribute__((aligned(32))) gf_448_s, gf_448_t[1];
+#endif
 
 #define LBITS 56
 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
@@ -22,69 +25,69 @@ extern "C" {
 
 static __inline__ void
 p448_add_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) __attribute__((unused,always_inline));
              
 static __inline__ void
 p448_sub_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) __attribute__((unused,always_inline));
              
 static __inline__ void
 p448_copy (
-    p448_t *out,
-    const p448_t *a
+    gf_448_t out,
+    const gf_448_t a
 ) __attribute__((unused,always_inline));
              
 static __inline__ void
 p448_weak_reduce (
-    p448_t *inout
+    gf_448_t inout
 ) __attribute__((unused,always_inline));
              
 void
 p448_strong_reduce (
-    p448_t *inout
+    gf_448_t inout
 );
 
 static __inline__ void
 p448_bias (
-    p448_t *inout,
+    gf_448_t inout,
     int amount
 ) __attribute__((unused,always_inline));
          
 void
 p448_mul (
-    p448_t *__restrict__ out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_s *__restrict__ out,
+    const gf_448_t a,
+    const gf_448_t b
 );
 
 void
 p448_mulw (
-    p448_t *__restrict__ out,
-    const p448_t *a,
+    gf_448_s *__restrict__ out,
+    const gf_448_t a,
     uint64_t b
 );
 
 void
 p448_sqr (
-    p448_t *__restrict__ out,
-    const p448_t *a
+    gf_448_s *__restrict__ out,
+    const gf_448_t a
 );
 
 void
 p448_serialize (
     uint8_t *serial,
-    const struct p448_t *x
+    const gf_448_t x
 );
 
 mask_t
 p448_deserialize (
-    p448_t *x,
+    gf_448_t x,
     const uint8_t serial[56]
 );
 
@@ -92,9 +95,9 @@ p448_deserialize (
 
 void
 p448_add_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) {
     unsigned int i;
     for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
@@ -110,9 +113,9 @@ p448_add_RAW (
 
 void
 p448_sub_RAW (
-    p448_t *out,
-    const p448_t *a,
-    const p448_t *b
+    gf_448_t out,
+    const gf_448_t a,
+    const gf_448_t b
 ) {
     unsigned int i;
     for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
@@ -128,8 +131,8 @@ p448_sub_RAW (
 
 void
 p448_copy (
-    p448_t *out,
-    const p448_t *a
+    gf_448_t out,
+    const gf_448_t a
 ) {
     unsigned int i;
     for (i=0; i<sizeof(*out)/sizeof(big_register_t); i++) {
@@ -139,7 +142,7 @@ p448_copy (
 
 void
 p448_bias (
-    p448_t *a,
+    gf_448_t a,
     int amt
 ) {
     uint64_t co1 = ((1ull<<56)-1)*amt, co2 = co1-amt;
@@ -166,7 +169,7 @@ p448_bias (
 
 void
 p448_weak_reduce (
-    p448_t *a
+    gf_448_t a
 ) {
     /* PERF: use pshufb/palignr if anyone cares about speed of this */
     uint64_t mask = (1ull<<56) - 1;
diff --git a/src/p448/f_field.h b/src/p448/f_field.h
index 1f55490..59043e0 100644
--- a/src/p448/f_field.h
+++ b/src/p448/f_field.h
@@ -15,7 +15,8 @@
 #include "f_impl.h"
 #define GF_LIT_LIMB_BITS  56
 #define GF_BITS           448
-#define gf              p448_t
+#define gf                gf_448_t
+#define gf_s              gf_448_s
 #define gf_mul            p448_mul
 #define gf_sqr            p448_sqr
 #define gf_add_RAW        p448_add_RAW
diff --git a/src/public_include/decaf.hxx b/src/public_include/decaf.hxx
index 77df0d4..d1a9900 100644
--- a/src/public_include/decaf.hxx
+++ b/src/public_include/decaf.hxx
@@ -3,6 +3,7 @@
 #define __DECAF_HXX__ 1
 
 #include <decaf/decaf_255.hxx> // MAGIC
+#include <decaf/decaf_448.hxx> // MAGIC
 
 #endif /* __DECAF_H__ */
 
diff --git a/src/public_include/decaf/decaf_255.hxx b/src/public_include/decaf/decaf_255.hxx
index 39a43b0..e84a792 100644
--- a/src/public_include/decaf/decaf_255.hxx
+++ b/src/public_include/decaf/decaf_255.hxx
@@ -46,7 +46,13 @@ namespace decaf {
 /**
  * @brief Curve25519/Decaf instantiation of group.
  */
-struct Ed255 {
+struct IsoEd25519 {
+    
+/** The name of the curve */
+static inline const char *name() { return "IsoEd25519"; }
+
+/** The curve's cofactor (removed, but useful for testing) */
+static const int REMOVED_COFACTOR = 8;
 
 /** @cond internal */
 class Point;
@@ -533,17 +539,17 @@ public:
     /** @endcond */
 };
 
-}; /* struct Ed255 */
+}; /* struct IsoEd25519 */
 
 
 
 /** @cond internal */
-inline SecureBuffer Ed255::Scalar::direct_scalarmul (
+inline SecureBuffer IsoEd25519::Scalar::direct_scalarmul (
     const Block &in,
     decaf_bool_t allow_identity,
     decaf_bool_t short_circuit
 ) const throw(CryptoException) {
-    SecureBuffer out(Ed255::Point::SER_BYTES);
+    SecureBuffer out(IsoEd25519::Point::SER_BYTES);
     if (!decaf_255_direct_scalarmul(out, in.data(), s, allow_identity, short_circuit))
         throw CryptoException();
     return out;
diff --git a/src/public_include/decaf/decaf_448.h b/src/public_include/decaf/decaf_448.h
index 6a7345e..bcf7c91 100644
--- a/src/public_include/decaf/decaf_448.h
+++ b/src/public_include/decaf/decaf_448.h
@@ -426,7 +426,7 @@ decaf_bool_t decaf_448_point_valid (
 ) API_VIS WARN_UNUSED NONNULL1 NOINLINE;
 
 /**
- * @brief 2-torque a point, for debugging purposes.
+ * @brief Torque a point, for debugging purposes.
  *
  * @param [out] q The point to torque.
  * @param [in] p The point to torque.
@@ -436,6 +436,21 @@ void decaf_448_point_debugging_torque (
      const decaf_448_point_t p
 ) API_VIS NONNULL2 NOINLINE;
 
+/**
+ * @brief Projectively scale a point, for debugging purposes.
+ * The output will be equal to the input, and will be valid
+ * even if the factor is zero.
+ *
+ * @param [out] q The scaled point.
+ * @param [in] p The point to scale.
+ * @param [in] factor Serialized field element (GF) to scale by.
+ */
+void decaf_448_point_debugging_pscale (
+     decaf_448_point_t q,
+     const decaf_448_point_t p,
+     const unsigned char factor[DECAF_448_SER_BYTES]
+) API_VIS NONNULL2 NOINLINE;
+
 /**
  * @brief Almost-Elligator-like hash to curve.
  *
diff --git a/src/public_include/decaf/decaf_448.hxx b/src/public_include/decaf/decaf_448.hxx
index 8448a18..c043fcb 100644
--- a/src/public_include/decaf/decaf_448.hxx
+++ b/src/public_include/decaf/decaf_448.hxx
@@ -46,7 +46,13 @@ namespace decaf {
 /**
  * @brief Ed448-Goldilocks/Decaf instantiation of group.
  */
-struct Ed448 {
+struct Ed448Goldilocks {
+    
+/** The name of the curve */
+static inline const char *name() { return "Ed448-Goldilocks"; }
+
+/** The curve's cofactor (removed, but useful for testing) */
+static const int REMOVED_COFACTOR = 4;
 
 /** @cond internal */
 class Point;
diff --git a/test/bench_decaf.cxx b/test/bench_decaf.cxx
index b28e1ac..9b35e8e 100644
--- a/test/bench_decaf.cxx
+++ b/test/bench_decaf.cxx
@@ -20,9 +20,6 @@
 #include <algorithm>
 
 using namespace decaf;
-typedef Ed255::Scalar Scalar;
-typedef Ed255::Point Point;
-typedef Ed255::Precomputed Precomputed;
 
 
 static __inline__ void __attribute__((unused)) ignore_result ( int result ) { (void)result; }
@@ -140,6 +137,13 @@ public:
 
 double Benchmark::totalCy = 0, Benchmark::totalS = 0;
 
+
+template<typename Group> struct Benches {
+
+typedef typename Group::Scalar Scalar;
+typedef typename Group::Point Point;
+typedef typename Group::Precomputed Precomputed;
+
 static void tdh (
     SpongeRng &clientRng,
     SpongeRng &serverRng,
@@ -274,6 +278,62 @@ static void spake2ee(
     server.respec(STROBE_KEYED_128);
 }
 
+static void macro() { /* Protocol-level timings for Group: Spake2ee, FHMQV and TripleDH. */
+    printf("\nMacro-benchmarks for %s:\n", Group::name());
+    printf("Protocol benchmarks:\n");
+    SpongeRng clientRng(Block("client rng seed")); /* both RNGs use fixed seed strings */
+    SpongeRng serverRng(Block("server rng seed"));
+    SecureBuffer hashedPassword("hello world");
+    for (Benchmark b("Spake2ee c+s",0.1); b.iter(); ) {
+        spake2ee(clientRng, serverRng, hashedPassword,false);
+    }
+    /* Same call with the final flag set to true; reported under the "aug" label. */
+    for (Benchmark b("Spake2ee c+s aug",0.1); b.iter(); ) {
+        spake2ee(clientRng, serverRng, hashedPassword,true);
+    }
+    /* x and y are drawn from the two RNGs; gx and gy hold Precomputed::base() times each. */
+    Scalar x(clientRng);
+    SecureBuffer gx(Precomputed::base() * x);
+    Scalar y(serverRng);
+    SecureBuffer gy(Precomputed::base() * y);
+    /* Both remaining benchmarks reuse the same x, gx, y and gy. */
+    for (Benchmark b("FHMQV c+s",0.1); b.iter(); ) {
+        fhmqv(clientRng, serverRng,x,gx,y,gy);
+    }
+    
+    for (Benchmark b("TripleDH anon c+s",0.1); b.iter(); ) {
+        tdh(clientRng, serverRng, x,gx,y,gy);
+    }
+}
+
+static void micro() { /* Times individual Scalar, Point and Precomputed operations for Group. */
+    SpongeRng rng(Block("per-curve-benchmarks")); /* only passed to steg_encode below */
+    Precomputed pBase;
+    Point p,q;
+    Scalar s,t;
+    SecureBuffer ep, ep2(Point::SER_BYTES*2); /* ep is assigned by "Point encode"; ep2 is 2*SER_BYTES long */
+    /* Operands start out default-constructed; most operation results are discarded. */
+    printf("\nMicro-benchmarks for %s:\n", Group::name());
+    for (Benchmark b("Scalar add", 1000); b.iter(); ) { s+=t; }
+    for (Benchmark b("Scalar times", 100); b.iter(); ) { s*=t; }
+    for (Benchmark b("Scalar inv", 1); b.iter(); ) { s.inverse(); }
+    for (Benchmark b("Point add", 100); b.iter(); ) { p += q; }
+    for (Benchmark b("Point double", 100); b.iter(); ) { p.double_in_place(); }
+    for (Benchmark b("Point scalarmul"); b.iter(); ) { p * s; }
+    for (Benchmark b("Point encode"); b.iter(); ) { ep = SecureBuffer(p); }
+    for (Benchmark b("Point decode"); b.iter(); ) { p = Point(ep); }
+    for (Benchmark b("Point create/destroy"); b.iter(); ) { Point r; }
+    for (Benchmark b("Point hash nonuniform"); b.iter(); ) { Point::from_hash(ep); } /* "nonuniform" runs use ep */
+    for (Benchmark b("Point hash uniform"); b.iter(); ) { Point::from_hash(ep2); } /* "uniform" runs use the double-length ep2 */
+    for (Benchmark b("Point unhash nonuniform"); b.iter(); ) { ignore_result(p.invert_elligator(ep,0)); }
+    for (Benchmark b("Point unhash uniform"); b.iter(); ) { ignore_result(p.invert_elligator(ep2,0)); }
+    for (Benchmark b("Point steg"); b.iter(); ) { p.steg_encode(rng); }
+    for (Benchmark b("Point double scalarmul"); b.iter(); ) { Point::double_scalarmul(p,s,q,t); }
+    for (Benchmark b("Point precmp scalarmul"); b.iter(); ) { pBase * s; }
+}
+
+}; /* template <typename Group> struct Benches */
+
 int main(int argc, char **argv) {
     bool micro = false;
     if (argc >= 2 && !strcmp(argv[1], "--micro"))
@@ -293,10 +353,6 @@ int main(int argc, char **argv) {
 
 
     if (micro) {
-        Precomputed pBase;
-        Point p,q;
-        Scalar s,t;
-        SecureBuffer ep, ep2(Point::SER_BYTES*2);
         SpongeRng rng(Block("micro-benchmarks"));
         
         printf("\nMicro-benchmarks:\n");
@@ -325,25 +381,12 @@ int main(int argc, char **argv) {
         for (Benchmark b("STROBEk256 1kiB", 10); b.iter(); ) {
             strobe.encrypt_no_auth(TmpBuffer(b1024,1024),TmpBuffer(b1024,1024),b.i>1);
         }
-        for (Benchmark b("Scalar add", 1000); b.iter(); ) { s+=t; }
-        for (Benchmark b("Scalar times", 100); b.iter(); ) { s*=t; }
-        for (Benchmark b("Scalar inv", 1); b.iter(); ) { s.inverse(); }
-        for (Benchmark b("Point add", 100); b.iter(); ) { p += q; }
-        for (Benchmark b("Point double", 100); b.iter(); ) { p.double_in_place(); }
-        for (Benchmark b("Point scalarmul"); b.iter(); ) { p * s; }
-        for (Benchmark b("Point encode"); b.iter(); ) { ep = SecureBuffer(p); }
-        for (Benchmark b("Point decode"); b.iter(); ) { p = Point(ep); }
-        for (Benchmark b("Point create/destroy"); b.iter(); ) { Point r; }
-        for (Benchmark b("Point hash nonuniform"); b.iter(); ) { Point::from_hash(ep); }
-        for (Benchmark b("Point hash uniform"); b.iter(); ) { Point::from_hash(ep2); }
-        for (Benchmark b("Point unhash nonuniform"); b.iter(); ) { ignore_result(p.invert_elligator(ep,0)); }
-        for (Benchmark b("Point unhash uniform"); b.iter(); ) { ignore_result(p.invert_elligator(ep2,0)); }
-        for (Benchmark b("Point steg"); b.iter(); ) { p.steg_encode(rng); }
-        for (Benchmark b("Point double scalarmul"); b.iter(); ) { Point::double_scalarmul(p,s,q,t); }
-        for (Benchmark b("Point precmp scalarmul"); b.iter(); ) { pBase * s; }
         /* TODO: scalarmul for verif, etc */
+        Benches<IsoEd25519>::micro();
+        Benches<Ed448Goldilocks>::micro();
     }
 
+    /* TODO: 255->448 (the benchmarks below still call decaf_255_* directly) */
     printf("\nMacro-benchmarks:\n");
     for (Benchmark b("Keygen"); b.iter(); ) {
         decaf_255_derive_private_key(s1,r1);
@@ -369,31 +412,9 @@ int main(int argc, char **argv) {
         umessage[1]^=umessage[0];
         ignore_result(ret);
     }
-
-    printf("\nProtocol benchmarks:\n");
-    SpongeRng clientRng(Block("client rng seed"));
-    SpongeRng serverRng(Block("server rng seed"));
-    SecureBuffer hashedPassword("hello world");
-    for (Benchmark b("Spake2ee c+s",0.1); b.iter(); ) {
-        spake2ee(clientRng, serverRng, hashedPassword,false);
-    }
     
-    for (Benchmark b("Spake2ee c+s aug",0.1); b.iter(); ) {
-        spake2ee(clientRng, serverRng, hashedPassword,true);
-    }
-    
-    Scalar x(clientRng);
-    SecureBuffer gx(Precomputed::base() * x);
-    Scalar y(serverRng);
-    SecureBuffer gy(Precomputed::base() * y);
-    
-    for (Benchmark b("FHMQV c+s",0.1); b.iter(); ) {
-        fhmqv(clientRng, serverRng,x,gx,y,gy);
-    }
-    
-    for (Benchmark b("TripleDH anon c+s",0.1); b.iter(); ) {
-        tdh(clientRng, serverRng, x,gx,y,gy);
-    }
+    Benches<IsoEd25519>::macro();
+    Benches<Ed448Goldilocks>::macro();
     
     printf("\n");
     Benchmark::calib();
diff --git a/test/test_decaf.cxx b/test/test_decaf.cxx
index 9e98222..2a9103a 100644
--- a/test/test_decaf.cxx
+++ b/test/test_decaf.cxx
@@ -164,7 +164,7 @@ static void test_elligator() {
     decaf::SpongeRng rng(decaf::Block("test_elligator"));
     Test test("Elligator");
     
-    const int NHINTS = 1<<4;
+    const int NHINTS = Group::REMOVED_COFACTOR * 2;
     decaf::SecureBuffer *alts[NHINTS];
     bool successes[NHINTS];
     decaf::SecureBuffer *alts2[NHINTS];
@@ -312,7 +312,7 @@ static void test_ec() {
 
 }; // template<decaf::GroupId GROUP>
 
-
+// FIXME: make this cross-field; test_decaf() is not templated on the group and runs only once.
 static void test_decaf() {
     Test test("Sample crypto");
     decaf::SpongeRng rng(decaf::Block("test_decaf"));
@@ -350,11 +350,18 @@ static void test_decaf() {
 int main(int argc, char **argv) {
     (void) argc; (void) argv;
     
-    Tests<decaf::Ed255>::test_arithmetic();
-    Tests<decaf::Ed255>::test_elligator();
-    Tests<decaf::Ed255>::test_ec();
+    printf("Testing %s:\n", decaf::IsoEd25519::name());
+    Tests<decaf::IsoEd25519>::test_arithmetic();
+    Tests<decaf::IsoEd25519>::test_elligator();
+    Tests<decaf::IsoEd25519>::test_ec();
     test_decaf();
     
+    printf("\n");
+    printf("Testing %s:\n", decaf::Ed448Goldilocks::name());
+    Tests<decaf::Ed448Goldilocks>::test_arithmetic();
+    Tests<decaf::Ed448Goldilocks>::test_elligator();
+    Tests<decaf::Ed448Goldilocks>::test_ec();
+    
     if (passing) printf("Passed all tests.\n");
     
     return passing ? 0 : 1;