Cross-curve compilation working! Still a bunch of FIXMEs though

9 years ago · cdab495338
--- a/+ 76
+++ b/+ 76
@@ -39,8 +39,6 @@ else
 ARCH ?= arch_ref32
 endif

 FIELD ?= p25519

 WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
 	 -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)

@@ -79,22 +77,13 @@ SAGES= $(shell ls test/*.sage)
 BUILDPYS= $(SAGES:test/%.sage=$(BUILD_PY)/%.py)

 .PHONY: clean all test bench todo doc lib bat sage sagetest
 .PRECIOUS: $(BUILD_ASM)/%.s $(BUILD_ASM)/%_impl.s $(BUILD_ASM)/$(DECAF)_%.s $(BUILD_ASM)/decaf_tables_%.c \
 	$(BUILD_IBIN)/decaf_gen_tables_%

 HEADERS= Makefile $(shell find src test -name "*.h") $(shell find . -name "*.hxx") $(BUILD_OBJ)/timestamp
 .PRECIOUS: $(BUILD_ASM)/%.s $(BUILD_C)/%.c $(BUILD_IBIN)/%

 # components needed by the table generators
 GENCOMPONENTS=  \
 	$(BUILD_OBJ)/$(DECAF)_ed25519.o $(BUILD_OBJ)/p25519_impl.o  $(BUILD_OBJ)/p25519_arithmetic.o \
 	$(BUILD_OBJ)/utils.o \
 	#$(BUILD_OBJ)/p448_impl.o $(BUILD_OBJ)/p448_arithmetic.o
 HEADERS= Makefile $(shell find src test -name "*.h") $(BUILD_OBJ)/timestamp
 HEADERSXX = $(HEADERS) $(shell find . -name "*.hxx") 

 # components needed by the lib
 DECAFCOMPONENTS= $(BUILD_OBJ)/shake.o $(BUILD_OBJ)/decaf_crypto.o $(GENCOMPONENTS)
 ifeq ($(DECAF),decaf_fast)
 DECAFCOMPONENTS += $(BUILD_OBJ)/decaf_tables_ed25519.o
 endif
 LIBCOMPONENTS = $(BUILD_OBJ)/utils.o $(BUILD_OBJ)/shake.o $(BUILD_OBJ)/decaf_crypto.o # and per-field components

 BENCHCOMPONENTS = $(BUILD_OBJ)/bench.o $(BUILD_OBJ)/shake.o

@@ -105,26 +94,7 @@ scan: clean
 		 -enable-checker deadcode -enable-checker llvm \
 		 -enable-checker osx -enable-checker security -enable-checker unix \
 		make all
 		
 # The shakesum utility is in the public bin directory.
 $(BUILD_BIN)/shakesum: $(BUILD_OBJ)/shakesum.o $(BUILD_OBJ)/shake.o $(BUILD_OBJ)/utils.o
 	$(LD) $(LDFLAGS) -o $@ $^

 # The main decaf library, and its symlinks.
 lib: $(BUILD_LIB)/libdecaf.so

 $(BUILD_LIB)/libdecaf.so: $(BUILD_LIB)/libdecaf.so.1
 	ln -sf `basename $^` $@

 $(BUILD_LIB)/libdecaf.so.1: $(DECAFCOMPONENTS)
 	rm -f $@
 ifeq ($(UNAME),Darwin)
 	libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \
 		  $(DECAFCOMPONENTS)
 else
 	$(LD) $(LDFLAGS) -shared -Wl,-soname,`basename $@` -Wl,--gc-sections -o $@ $(DECAFCOMPONENTS)
 	strip --discard-all $@
 endif

 # Internal test programs, which are not part of the final build/bin directory.
 $(BUILD_IBIN)/test: $(BUILD_OBJ)/test_decaf.o lib
@@ -150,50 +120,86 @@ $(BUILD_OBJ)/timestamp:
 $(BUILD_OBJ)/%.o: $(BUILD_ASM)/%.s
 	$(ASM) $(ASFLAGS) -c -o $@ $<

 # I don't know why this rule is necessary... bug in make, or obscure pattern matching rule?
 $(BUILD_OBJ)/decaf_gen_tables_%.o: $(BUILD_ASM)/decaf_gen_tables_%.s
 	$(ASM) $(ASFLAGS) -c -o $@ $<
 ################################################################
 # Per-field code: call with field, arch
 #
 # Template meant to be instantiated with
 #     $(eval $(call define_field,<field>,<arch>))
 #
 #   $(1)  field name; sources are read from src/$(1)/
 #   $(2)  architecture directory; f_impl.c is read from src/$(1)/$(2)/
 #
 # What one instantiation defines:
 #   ARCH_FOR_<field>        the arch directory passed as $(2)
 #   COMPONENTS_OF_<field>   the field's two object files
 #                           (<field>_impl.o and <field>_arithmetic.o)
 #   LIBCOMPONENTS           gets COMPONENTS_OF_<field> appended
 #   two rules compiling f_arithmetic.c and f_impl.c to assembly under
 #   $(BUILD_ASM), both with -I src/<field> -I src/<field>/<arch>
 #
 # References written with "$$" are left for $(eval) to expand; $(1) and
 # $(2) are substituted by $(call).
 ################################################################
 define define_field
 ARCH_FOR_$(1) = $(2)
 COMPONENTS_OF_$(1) = $$(BUILD_OBJ)/$(1)_impl.o $$(BUILD_OBJ)/$(1)_arithmetic.o
 LIBCOMPONENTS += $$(COMPONENTS_OF_$(1))

 $$(BUILD_ASM)/$(1)_arithmetic.s: src/$(1)/f_arithmetic.c $$(HEADERS)
 	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -S -c -o $$@ $$<

 $$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$(2)/f_impl.c $$(HEADERS)
 	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -S -c -o $$@ $$<
 endef

 ################################################################
 # Per-field, per-curve code: call with curve, field
 #
 # Template meant to be instantiated with
 #     $(eval $(call define_curve,<curve>,<field>))
 #
 #   $(1)  curve name; headers come from src/curve_$(1)/ and every output
 #         is suffixed with it (decaf_fast_$(1), decaf_tables_$(1), ...)
 #   $(2)  field name; define_field is expected to have been evaluated
 #         for it first, because COMPONENTS_OF_<field> is used in a
 #         prerequisite list and ARCH_FOR_<field> in the include paths
 #
 # Rules produced by one instantiation:
 #   1. link the table generator $(BUILD_IBIN)/decaf_gen_tables_<curve>
 #   2. run it and capture stdout as $(BUILD_C)/decaf_tables_<curve>.c;
 #      the output file is removed if the generator fails
 #   3. compile the generated tables, src/decaf_gen_tables.c and
 #      src/decaf_fast.c to assembly with the curve/field include paths
 # and the curve's decaf_fast and decaf_tables objects are appended to
 # LIBCOMPONENTS.
 #
 # Every recipe ends on its last argument: a trailing "\" would continue
 # the recipe onto whatever line follows it.
 ################################################################
 define define_curve
 $$(BUILD_IBIN)/decaf_gen_tables_$(1): $$(BUILD_OBJ)/decaf_gen_tables_$(1).o $$(BUILD_OBJ)/decaf_fast_$(1).o $$(BUILD_OBJ)/utils.o \
 		$$(COMPONENTS_OF_$(2))
 	$$(LD) $$(LDFLAGS) -o $$@ $$^

 $$(BUILD_C)/decaf_tables_$(1).c: $$(BUILD_IBIN)/decaf_gen_tables_$(1)
 	./$$< > $$@ || (rm $$@; exit 1)

 $$(BUILD_ASM)/decaf_tables_$(1).s: $$(BUILD_C)/decaf_tables_$(1).c $$(HEADERS)
 	$$(CC) $$(CFLAGS) \
 		-I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \
 		-S -c -o $$@ $$<

 $$(BUILD_ASM)/decaf_gen_tables_$(1).s: src/decaf_gen_tables.c $$(HEADERS)
 	$$(CC) $$(CFLAGS) \
 		-I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \
 		-S -c -o $$@ $$<

 $$(BUILD_ASM)/decaf_fast_$(1).s: src/decaf_fast.c $$(HEADERS)
 	$$(CC) $$(CFLAGS) \
 		-I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \
 		-S -c -o $$@ $$<

 LIBCOMPONENTS += $$(BUILD_OBJ)/decaf_fast_$(1).o $$(BUILD_OBJ)/decaf_tables_$(1).o
 endef

 ################################################################
 # call code above to generate curves and fields
 $(eval $(call define_field,p25519,arch_x86_64))
 $(eval $(call define_curve,ed25519,p25519))
 $(eval $(call define_field,p448,arch_x86_64))
 $(eval $(call define_curve,ed448goldilocks,p448))

 $(BUILD_IBIN)/decaf_gen_tables_%: $(BUILD_OBJ)/decaf_gen_tables_%.o $(GENCOMPONENTS)
 		
 # The shakesum utility is in the public bin directory.
 $(BUILD_BIN)/shakesum: $(BUILD_OBJ)/shakesum.o $(BUILD_OBJ)/shake.o $(BUILD_OBJ)/utils.o
 	$(LD) $(LDFLAGS) -o $@ $^
 	
 $(BUILD_C)/decaf_tables_%.c: $(BUILD_IBIN)/decaf_gen_tables_%
 	./$< > $@
 	
 $(BUILD_ASM)/decaf_tables_%.s: $(BUILD_C)/decaf_tables_%.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $< \
 		-I src/curve_$*/ -I src/curve_$*/field -I src/curve_$*/field/$(ARCH) \
 	
 $(BUILD_ASM)/decaf_gen_tables_%.s: src/decaf_gen_tables.c $(HEADERS)
 	$(CC) $(CFLAGS) \
 		-I src/curve_$*/ -I src/curve_$*/field -I src/curve_$*/field/$(ARCH) \
 		-S -c -o $@ $<
 	
 $(BUILD_ASM)/decaf_fast_%.s: src/decaf_fast.c $(HEADERS)
 	$(CC) $(CFLAGS) \
 		-I src/curve_$*/ -I src/curve_$*/field -I src/curve_$*/field/$(ARCH) \
 		-S -c -o $@ $<
 	
 $(BUILD_ASM)/%_arithmetic.s: src/%/f_arithmetic.c $(HEADERS)
 	$(CC) $(CFLAGS) \
 		-I src/$* -I src/$*/$(ARCH) \
 		-S -c -o $@ $<
 	
 $(BUILD_ASM)/%_impl.s: src/%/$(ARCH)/f_impl.c $(HEADERS)
 	$(CC) $(CFLAGS) \
 		-I src/$* -I src/$*/$(ARCH) \
 		-S -c -o $@ $<
 	

 # The main decaf library, and its symlinks.
 # "lib" is the convenience goal; it only depends on the unversioned name.
 lib: $(BUILD_LIB)/libdecaf.so

 # Unversioned name: a symlink whose target is the bare file name
 # (basename) of the versioned library, so the link is relative to
 # $(BUILD_LIB) rather than carrying the build path.
 $(BUILD_LIB)/libdecaf.so: $(BUILD_LIB)/libdecaf.so.1
 	ln -sf `basename $^` $@

 # Versioned shared object built from every object in LIBCOMPONENTS.
 # Any previous output is deleted first.  When UNAME is Darwin the library
 # is produced by libtool -dynamic; otherwise it is linked through $(LD)
 # with -shared and a soname equal to the file's basename, then stripped
 # with --discard-all.
 $(BUILD_LIB)/libdecaf.so.1: $(LIBCOMPONENTS)
 	rm -f $@
 ifeq ($(UNAME),Darwin)
 	libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \
 		  $(LIBCOMPONENTS)
 else
 	$(LD) $(LDFLAGS) -shared -Wl,-soname,`basename $@` -Wl,--gc-sections -o $@ $(LIBCOMPONENTS)
 	strip --discard-all $@
 endif



 $(BUILD_ASM)/%.s: src/%.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $<
 	
 $(BUILD_ASM)/%.s: src/%.cxx $(HEADERS)
 	$(CXX) $(CXXFLAGS) -S -c -o $@ $<

 $(BUILD_ASM)/%.s: test/%.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $<

 $(BUILD_ASM)/%.s: test/%.cxx $(HEADERS)
 $(BUILD_ASM)/%.s: test/%.cxx $(HEADERSXX)
 	$(CXX) $(CXXFLAGS) -S -c -o $@ $<

 # The sage test scripts
--- a/src/curve_ed25519/curve_data.inc.c
+++ b/src/curve_ed25519/curve_data.inc.c
@@ -1,9 +1,22 @@
 /* Rename table for eventual factoring into .c.inc, MSR ECC style */
 // FIXME move to arch or something
 #define WBITS DECAF_WORD_BITS

 #if WBITS == 64
 #define LBITS 51
 typedef __int128_t decaf_sdword_t;
 #define LIMB(x) (x##ull)
 #define SC_LIMB(x) (x##ull)
 #else
 #error "Only supporting 64-bit platforms right now"
 #endif

 #define API_NAME "decaf_255"
 #define API_NS(_id) decaf_255_##_id
 #define API_NS2(_pref,_id) _pref##_decaf_255_##_id

 #define SCALAR_LIMBS DECAF_255_SCALAR_LIMBS
 #define SCALAR_BITS DECAF_255_SCALAR_BITS
 #define NLIMBS DECAF_255_LIMBS
 #define API_NS(_id) decaf_255_##_id
 #define API_NS2(_pref,_id) _pref##_decaf_255_##_id
 #define scalar_t decaf_255_scalar_t
 #define point_t decaf_255_point_t
 #define precomputed_s decaf_255_precomputed_s
@@ -21,12 +34,14 @@ static const scalar_t sc_p = {{{
    SC_LIMB(0x1000000000000000)
 }}};

 #ifdef GEN_TABLES
 /* sqrt(9) = 3 from the curve spec.  Not exported, but used by pregen tool. */
 const unsigned char base_point_ser_for_pregen[SER_BYTES] = {
 static const unsigned char base_point_ser_for_pregen[SER_BYTES] = {
    3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 };
 #endif

 const gf SQRT_ONE_MINUS_D = {FIELD_LITERAL(
 static const gf SQRT_ONE_MINUS_D = {FIELD_LITERAL(
    0x6db8831bbddec,
    0x38d7b56c9c165,
    0x016b221394bdc,
--- a/src/curve_ed25519/field
+++ b/src/curve_ed25519/field
@@ -1 +0,0 @@
 ../p25519/
--- a/src/curve_ed448goldilocks/curve_data.inc.c
+++ b/src/curve_ed448goldilocks/curve_data.inc.c
@@ -1,8 +1,27 @@
 // FIXME move to arch or something
 #define WBITS DECAF_WORD_BITS

 #if WBITS == 64
 #define LBITS 56
 typedef __int128_t decaf_sdword_t;
 #define LIMB(x) (x##ull)
 #define SC_LIMB(x) (x##ull)
 #elif WBITS == 32
 typedef int64_t decaf_sdword_t;
 #define LBITS 28
 #define LIMB(x) (x##ull)&((1ull<<LBITS)-1), (x##ull)>>LBITS
 #define SC_LIMB(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
 #else
 #error "Only supporting 32- and 64-bit platforms right now"
 #endif

 #define API_NAME "decaf_448"
 #define API_NS(_id) decaf_448_##_id
 #define API_NS2(_pref,_id) _pref##_decaf_448_##_id

 #define SCALAR_LIMBS DECAF_448_SCALAR_LIMBS
 #define SCALAR_BITS DECAF_448_SCALAR_BITS
 #define NLIMBS DECAF_448_LIMBS
 #define API_NS(_id) decaf_448_##_id
 #define API_NS2(_pref,_id) _pref##_decaf_448_##_id
 #define scalar_t decaf_448_scalar_t
 #define point_t decaf_448_point_t
 #define precomputed_s decaf_448_precomputed_s
@@ -22,8 +41,10 @@ static const scalar_t sc_p = {{{
    SC_LIMB(0xffffffffffffffff),
    SC_LIMB(0x3fffffffffffffff)
 }}};
 

 #ifdef GEN_TABLES
 /* sqrt(5) = 2phi-1 from the curve spec.  Not exported, but used by pregen tool. */
 const unsigned char base_point_ser_for_pregen[SER_BYTES] = {
 static const unsigned char base_point_ser_for_pregen[SER_BYTES] = {
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
 };
 #endif
--- a/src/curve_ed448goldilocks/field
+++ b/src/curve_ed448goldilocks/field
@@ -1 +0,0 @@
 ../p448/
--- a/src/decaf.c
+++ b/src/decaf.c
@@ -13,22 +13,6 @@
 #include <string.h>
 #include <assert.h>

 #define WBITS DECAF_WORD_BITS

 #if WBITS == 64
 #define LBITS 56
 typedef __int128_t decaf_sdword_t;
 #define LIMB(x) (x##ull)
 #define SC_LIMB(x) (x##ull)
 #elif WBITS == 32
 typedef int64_t decaf_sdword_t;
 #define LBITS 28
 #define LIMB(x) (x##ull)&((1ull<<LBITS)-1), (x##ull)>>LBITS
 #define SC_LIMB(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
 #else
 #error "Only supporting 32- and 64-bit platforms right now"
 #endif

 #define sv static void
 #define snv static void __attribute__((noinline))
 #define siv static inline void __attribute__((always_inline))
--- a/src/decaf_fast.c
+++ b/src/decaf_fast.c
@@ -15,21 +15,13 @@
 #include "field.h"
 #include "decaf_config.h"

 #define WBITS DECAF_WORD_BITS
 #if WBITS == 64
    typedef __int128_t decaf_sdword_t;
    #define SC_LIMB(x) (x##ull)
 #elif WBITS == 32
    typedef int64_t decaf_sdword_t;
    #define SC_LIMB(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
 #else
    #error "Only supporting 32- and 64-bit platforms right now"
 #endif


 /* Include the curve data here */
 #include "curve_data.inc.c"

 #if (COFACTOR == 8) && !IMAGINE_TWIST
 /* FUTURE: Curve41417 doesn't have these properties. */
 #error "Currently require IMAGINE_TWIST (and thus p=5 mod 8) for cofactor 8"
 #endif

 #if IMAGINE_TWIST && (P_MOD_8 != 5)
 #error "Cannot use IMAGINE_TWIST except for p == 5 mod 8"
@@ -162,6 +154,7 @@ static decaf_word_t hibit(const gf x) {
    return -(y->limb[0]&1);
 }

 #if COFACTOR==8
 /** Return high bit of x = low bit of 2x mod p */
 static decaf_word_t lobit(const gf x) {
    gf y;
@@ -169,6 +162,7 @@ static decaf_word_t lobit(const gf x) {
    gf_strong_reduce(y);
    return -(y->limb[0]&1);
 }
 #endif

 /** {extra,accum} - sub +? p
 * Must have extra <= 1
@@ -408,27 +402,64 @@ static void deisogenize (
    decaf_bool_t toggle_hibit_t_over_s,
    decaf_bool_t toggle_rotation
 ) {
    gf c, d, x, t;
 #if COFACTOR == 4 && !IMAGINE_TWIST
    (void) toggle_rotation;
    
    /* TODO: Can shave off one mul here; not important but makes consistent with paper */
    gf b, d;
    gf_s *a = s, *c = minus_t_over_s;
    gf_mulw_sgn ( a, p->y, 1-EDWARDS_D );
    gf_mul ( c, a, p->t );     /* -dYT, with EDWARDS_D = d-1 */
    gf_mul ( a, p->x, p->z ); 
    gf_sub ( d, c, a );  /* aXZ-dYT with a=-1 */
    gf_add ( a, p->z, p->y ); 
    gf_sub ( b, p->z, p->y ); 
    gf_mul ( c, b, a );
    gf_mulw_sgn ( b, c, -EDWARDS_D ); /* (a-d)(Z+Y)(Z-Y) */
    decaf_bool_t ok = gf_isqrt_chk ( a, b, DECAF_TRUE ); /* r in the paper */
    (void)ok; assert(ok);
    gf_mulw_sgn ( b, a, -EDWARDS_D ); /* u in the paper */
    gf_mul ( c, b, a ); /* ur */
    gf_mul ( a, c, d ); /* ur (aZX-dYT) */
    gf_add ( d, b, b );  /* 2u = -2au since a=-1 */
    gf_mul ( c, d, p->z ); /* 2uZ */
    cond_neg ( b, toggle_hibit_t_over_s ^ ~hibit(c) ); /* u <- -u if negative. */
    cond_neg ( c, toggle_hibit_t_over_s ^ ~hibit(c) ); /* u <- -u if negative. */
    gf_mul ( d, b, p->y ); 
    gf_add ( s, a, d );
    cond_neg ( s, toggle_hibit_s ^ hibit(s) );
 #else
    /* More complicated because of rotation */
    /* FIXME This code is wrong for certain non-Curve25519 curves; check if it's because of Cofactor==8 or IMAGINE_ROTATION */
    
    gf c, d;
    gf_s *b = s, *a = minus_t_over_s;

 #if IMAGINE_TWIST
    gf x, t;
    gf_mul ( x, p->x, SQRT_MINUS_ONE);
    gf_mul ( t, p->t, SQRT_MINUS_ONE);
    gf_sub ( x, ZERO, x );
    gf_sub ( t, ZERO, t );
 #endif
    
    gf DEBUG;
    gf_add ( a, p->z, x );
    gf_sub ( b, p->z, x );
    gf_mul ( c, a, b ); /* "zx" = Z^2 - X^2 */
    gf_cpy(DEBUG,c);
    gf_mul ( c, a, b ); /* "zx" = Z^2 - aX^2 = Z^2 - X^2 */
 #else
    const gf_s *x = p->x, *t = p->t;
    /* Won't hit the cond_sel below because COFACTOR==8 requires IMAGINE_TWIST for now. */
    
    gf_sqr ( a, p->z );
    gf_sqr ( b, p->x );
    gf_add ( c, a, b ); /* "zx" = Z^2 - aX^2 = Z^2 + X^2 */
 #endif
    
    gf_mul ( a, p->z, t ); /* "tz" = T*Z */
    gf_sqr ( b, a );
    gf_mul ( d, b, c ); /* (TZ)^2 * (Z^2-X^2) */
    gf_mul ( d, b, c ); /* (TZ)^2 * (Z^2-aX^2) */
    decaf_bool_t ok = gf_isqrt_chk ( b, d, DECAF_TRUE );
    (void)ok; assert(ok);
    gf_mul ( d, b, a ); /* "osx" = 1 / sqrt(z^2-x^2) */
    gf_mul ( d, b, a ); /* "osx" = 1 / sqrt(z^2-ax^2) */
    gf_mul ( a, b, c ); 
    gf_mul ( b, a, d ); /* 1/tz */

@@ -445,6 +476,7 @@ static void deisogenize (
        cond_sel ( x, p->y, x, rotate );
    }
 #else
    (void)toggle_rotation;
    rotate = 0;
 #endif
    
@@ -458,6 +490,8 @@ static void deisogenize (
    gf_add ( d, d, c );
    gf_mul ( b, d, x ); /* here "x" = y unless rotate */
    cond_neg ( b, toggle_hibit_s ^ hibit(b) );
    
 #endif
 }

 void API_NS(point_encode)( unsigned char ser[SER_BYTES], const point_t p ) {
@@ -472,7 +506,7 @@ void API_NS(point_encode)( unsigned char ser[SER_BYTES], const point_t p ) {
 static decaf_bool_t gf_deser(gf s, const unsigned char ser[SER_BYTES]) {
    return gf_deserialize((gf_s *)s, ser);
 }
   

 decaf_bool_t API_NS(point_decode) (
    point_t p,
    const unsigned char ser[SER_BYTES],
@@ -483,25 +517,32 @@ decaf_bool_t API_NS(point_decode) (
    succ &= allow_identity | ~zero;
    succ &= ~hibit(s);
    gf_sqr ( a, s );
    gf_sub ( f, ONE, a ); /* f = 1-s^2 = 1-as^2 since a=1 */
 #if IMAGINE_TWIST
    gf_sub ( f, ONE, a ); /* f = 1-as^2 = 1-s^2*/
 #else
    gf_add ( f, ONE, a ); /* f = 1-as^2 = 1+s^2 */
 #endif
    succ &= ~ gf_eq( f, ZERO );
    gf_sqr ( b, f ); 
    gf_mulw_sgn ( c, a, 4-4*EDWARDS_D ); 
    gf_mulw_sgn ( c, a, 4*IMAGINE_TWIST-4*EDWARDS_D ); 
    gf_add ( c, c, b ); /* t^2 */
    gf_mul ( d, f, s ); /* s(1-s^2) for denoms */
    gf_mul ( d, f, s ); /* s(1-as^2) for denoms */
    gf_sqr ( e, d );
    gf_mul ( b, c, e );
    
    succ &= gf_isqrt_chk ( e, b, DECAF_TRUE ); /* e = 1/(t s (1-s^2)) */
    succ &= gf_isqrt_chk ( e, b, DECAF_TRUE ); /* e = 1/(t s (1-as^2)) */
    gf_mul ( b, e, d ); /* 1/t */
    gf_mul ( d, e, c ); /* d = t / (s(1-s^2)) */
    gf_mul ( d, e, c ); /* d = t / (s(1-as^2)) */
    gf_mul ( e, d, f ); /* t/s */
    decaf_bool_t negtos = hibit(e);
    cond_neg(b, negtos);
    cond_neg(d, negtos);
    
    gf_add ( p->z, ONE, a); /* Z = 1+s^2 */
    succ &= ~gf_eq( p->z, ZERO ); /* FUTURE: unnecessary? */

 #if IMAGINE_TWIST
    gf_add ( p->z, ONE, a); /* Z = 1+as^2 = 1+s^2 (curve a = 1; the variable a holds s^2) */
 #else
    gf_sub ( p->z, ONE, a); /* Z = 1+as^2 = 1-s^2 (curve a = -1; the variable a holds s^2) */
 #endif

 #if COFACTOR == 8
    gf_mul ( a, p->z, d); /* t(1+s^2) / s(1-s^2) = 2/xy */
@@ -745,7 +786,7 @@ static void pt_to_pniels (
 ) {
    gf_sub ( b->n->a, a->y, a->x );
    gf_add ( b->n->b, a->x, a->y );
    gf_mulw_sgn ( b->n->c, a->t, 2*EFF_D );
    gf_mulw_sgn ( b->n->c, a->t, 2*TWISTED_D );
    gf_add ( b->z, a->z, a->z );
 }

--- a/src/decaf_gen_tables.c
+++ b/src/decaf_gen_tables.c
@@ -15,8 +15,8 @@
 #include "decaf_config.h"
 #include "field.h"

 #define API_NS(_id) decaf_255_##_id
 #define API_NS2(_pref,_id) _pref##_decaf_255_##_id
 #define GEN_TABLES
 #include "curve_data.inc.c"

 /* To satisfy linker. */
 const gf API_NS(precomputed_base_as_fe)[1];
@@ -24,7 +24,6 @@ const API_NS(scalar_t) API_NS(precomputed_scalarmul_adjustment);
 const API_NS(scalar_t) API_NS(point_scalarmul_adjustment);
 const API_NS(scalar_t) API_NS(sc_r2) = {{{0}}};
 const decaf_word_t API_NS(MONTGOMERY_FACTOR) = 0;
 const unsigned char base_point_ser_for_pregen[DECAF_255_SER_BYTES];

 const API_NS(point_t) API_NS(point_base);

@@ -94,8 +93,8 @@ int main(int argc, char **argv) {
    printf("/** @warning: this file was automatically generated. */\n");
    printf("#include <decaf.h>\n\n");
    printf("#include \"field.h\"\n\n");
    printf("#define API_NS(_id) decaf_255_##_id\n");
    printf("#define API_NS2(_pref,_id) _pref##_decaf_255_##_id\n");
    printf("#define API_NS(_id) %s_##_id\n", API_NAME);
    printf("#define API_NS2(_pref,_id) _pref##_%s_##_id\n", API_NAME);
    
    output = (const gf_s *)real_point_base;
    printf("const API_NS(point_t) API_NS(point_base) = {{\n");
@@ -136,8 +135,8 @@ int main(int argc, char **argv) {
    scalar_print("API_NS(precomputed_scalarmul_adjustment)", smadj);
    
    API_NS(scalar_copy)(smadj,API_NS(scalar_one));
    for (i=0; i<DECAF_255_SCALAR_BITS-1 + DECAF_WINDOW_BITS
            - ((DECAF_255_SCALAR_BITS-1)%DECAF_WINDOW_BITS); i++) {
    for (i=0; i<SCALAR_BITS-1 + DECAF_WINDOW_BITS
            - ((SCALAR_BITS-1) % DECAF_WINDOW_BITS); i++) {
        API_NS(scalar_add)(smadj,smadj,smadj);
    }
    API_NS(scalar_sub)(smadj, smadj, API_NS(scalar_one));
--- a/src/p25519/f_field.h
+++ b/src/p25519/f_field.h
@@ -15,7 +15,7 @@
 #include "f_impl.h"
 #define GF_LIT_LIMB_BITS  51
 #define GF_BITS           255
 #define gf              gf_25519_t
 #define gf                gf_25519_t
 #define gf_s              gf_25519_s
 #define gf_mul            gf_25519_mul
 #define gf_sqr            gf_25519_sqr
--- a/src/p448/arch_32/f_impl.c
+++ b/src/p448/arch_32/f_impl.c
@@ -23,9 +23,9 @@ static uint64_t widemul_32 (

 void
 p448_mul (
    p448_t *__restrict__ cs,
    const p448_t *as,
    const p448_t *bs
    gf_448_s *__restrict__ cs,
    const gf_448_t as,
    const gf_448_t bs
 ) { 
    const uint32_t *a = as->limb, *b = bs->limb;
    uint32_t *c = cs->limb;
@@ -84,8 +84,8 @@ p448_mul (

 void
 p448_mulw (
    p448_t *__restrict__ cs,
    const p448_t *as,
    gf_448_s *__restrict__ cs,
    const gf_448_t as,
    uint64_t b
 ) {
    const uint32_t bhi = b>>28, blo = b & ((1<<28)-1);
@@ -128,15 +128,15 @@ p448_mulw (

 void
 p448_sqr (
    p448_t *__restrict__ cs,
    const p448_t *as
    gf_448_s *__restrict__ cs,
    const gf_448_t as
 ) {
    p448_mul(cs,as,as); /* PERF */
 }

 void
 p448_strong_reduce (
    p448_t *a
    gf_448_t a
 ) {
    word_t mask = (1ull<<28)-1;

@@ -180,14 +180,14 @@ p448_strong_reduce (
 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
    const gf_448_t x
 ) {
    int i,j;
    p448_t red;
    p448_copy(&red, x);
    p448_strong_reduce(&red);
    gf_448_t red;
    p448_copy(red, x);
    p448_strong_reduce(red);
    for (i=0; i<8; i++) {
        uint64_t limb = red.limb[2*i] + (((uint64_t)red.limb[2*i+1])<<28);
        uint64_t limb = red->limb[2*i] + (((uint64_t)red->limb[2*i+1])<<28);
        for (j=0; j<7; j++) {
            serial[7*i+j] = limb;
            limb >>= 8;
@@ -198,7 +198,7 @@ p448_serialize (

 mask_t
 p448_deserialize (
    p448_t *x,
    gf_448_t x,
    const uint8_t serial[56]
 ) {
    int i,j;
--- a/src/p448/arch_32/f_impl.h
+++ b/src/p448/arch_32/f_impl.h
@@ -9,9 +9,9 @@
 #include <stdint.h>
 #include <assert.h>

 typedef struct p448_t {
 typedef struct gf_448_s {
  uint32_t limb[16];
 } __attribute__((aligned(32))) p448_t;
 } __attribute__((aligned(32))) gf_448_s, gf_448_t[1];

 #define LBITS 28
 #define LIMB(x) (x##ull)&((1ull<<LBITS)-1), (x##ull)>>LBITS
@@ -24,69 +24,69 @@ extern "C" {

 static __inline__ void
 p448_add_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_sub_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_copy (
    p448_t *out,
    const p448_t *a
    gf_448_t out,
    const gf_448_t a
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_weak_reduce (
    p448_t *inout
    gf_448_t inout
 ) __attribute__((unused,always_inline));
             
 void
 p448_strong_reduce (
    p448_t *inout
    gf_448_t inout
 );
             
 static __inline__ void
 p448_bias (
    p448_t *inout,
    gf_448_t inout,
    int amount
 ) __attribute__((unused,always_inline));

 void
 p448_mul (
    p448_t *__restrict__ out,
    const p448_t *a,
    const p448_t *b
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    const gf_448_t b
 );

 void
 p448_mulw (
    p448_t *__restrict__ out,
    const p448_t *a,
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    uint64_t b
 );

 void
 p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
    gf_448_s *__restrict__ out,
    const gf_448_t a
 );

 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
    const gf_448_t x
 );

 mask_t
 p448_deserialize (
    p448_t *x,
    gf_448_t x,
    const uint8_t serial[56]
 );

@@ -94,9 +94,9 @@ p448_deserialize (

 void
 p448_add_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
@@ -112,9 +112,9 @@ p448_add_RAW (

 void
 p448_sub_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
@@ -130,15 +130,15 @@ p448_sub_RAW (

 void
 p448_copy (
    p448_t *out,
    const p448_t *a
    gf_448_t out,
    const gf_448_t a
 ) {
  *out = *a;
 }

 void
 p448_bias (
    p448_t *a,
    gf_448_t a,
    int amt
 ) {
    uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt;
@@ -152,7 +152,7 @@ p448_bias (

 void
 p448_weak_reduce (
    p448_t *a
    gf_448_t a
 ) {
    uint64_t mask = (1ull<<28) - 1;
    uint64_t tmp = a->limb[15] >> 28;
--- a/src/p448/arch_arm_32/f_impl.c
+++ b/src/p448/arch_arm_32/f_impl.c
@@ -100,9 +100,9 @@ smull2 (

 void
 p448_mul (
    p448_t *__restrict__ cs,
    const p448_t *as,
    const p448_t *bs
    gf_448_s *__restrict__ cs,
    const gf_448_t as,
    const gf_448_t bs
 ) {
    
    const uint32_t *a = as->limb, *b = bs->limb;
@@ -451,8 +451,8 @@ p448_mul (

 void
 p448_sqr (
    p448_t *__restrict__ cs,
    const p448_t *as
    gf_448_s *__restrict__ cs,
    const gf_448_t as
 ) {
    const uint32_t *a = as->limb;
    uint32_t *c = cs->limb;
@@ -749,8 +749,8 @@ p448_sqr (

 void
 p448_mulw (
    p448_t *__restrict__ cs,
    const p448_t *as,
    gf_448_s *__restrict__ cs,
    const gf_448_t as,
    uint64_t b
 ) {
    uint32_t mask = (1ull<<28)-1;  
@@ -863,7 +863,7 @@ p448_mulw (

 void
 p448_strong_reduce (
    p448_t *a
    gf_448_t a
 ) {
    word_t mask = (1ull<<28)-1;

@@ -907,14 +907,14 @@ p448_strong_reduce (
 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
    const gf_448_t x
 ) {
    int i,j;
    p448_t red;
    p448_copy(&red, x);
    p448_strong_reduce(&red);
    gf_448_t red;
    p448_copy(red, x);
    p448_strong_reduce(red);
    for (i=0; i<8; i++) {
        uint64_t limb = red.limb[2*i] + (((uint64_t)red.limb[2*i+1])<<28);
        uint64_t limb = red->limb[2*i] + (((uint64_t)red->limb[2*i+1])<<28);
        for (j=0; j<7; j++) {
            serial[7*i+j] = limb;
            limb >>= 8;
@@ -925,7 +925,7 @@ p448_serialize (

 mask_t
 p448_deserialize (
    p448_t *x,
    gf_448_t x,
    const uint8_t serial[56]
 ) {
    int i,j;
--- a/src/p448/arch_arm_32/f_impl.h
+++ b/src/p448/arch_arm_32/f_impl.h
@@ -9,9 +9,9 @@
 #include <stdint.h>
 #include <assert.h>

 typedef struct p448_t {
 typedef struct gf_448_s {
  uint32_t limb[16];
 } __attribute__((aligned(32))) p448_t;
 } __attribute__((aligned(32))) gf_448_s, gf_448_t[1];

 #define LBITS 28
 #define LIMB(x) (x##ull)&((1ull<<LBITS)-1), (x##ull)>>LBITS
@@ -24,69 +24,69 @@ extern "C" {

 static __inline__ void
 p448_add_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_sub_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_copy (
    p448_t *out,
    const p448_t *a
    gf_448_t out,
    const gf_448_t a
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_weak_reduce (
    p448_t *inout
    gf_448_t inout
 ) __attribute__((unused,always_inline));
             
 void
 p448_strong_reduce (
    p448_t *inout
    gf_448_t inout
 );
             
 static __inline__ void
 p448_bias (
    p448_t *inout,
    gf_448_t inout,
    int amount
 ) __attribute__((unused,always_inline));

 void
 p448_mul (
    p448_t *__restrict__ out,
    const p448_t *a,
    const p448_t *b
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    const gf_448_t b
 );

 void
 p448_mulw (
    p448_t *__restrict__ out,
    const p448_t *a,
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    uint64_t b
 );

 void
 p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
    gf_448_s *__restrict__ out,
    const gf_448_t a
 );

 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
    const gf_448_t x
 );

 mask_t
 p448_deserialize (
    p448_t *x,
    gf_448_t x,
    const uint8_t serial[56]
 );

@@ -94,9 +94,9 @@ p448_deserialize (

 void
 p448_add_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
@@ -112,9 +112,9 @@ p448_add_RAW (

 void
 p448_sub_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
@@ -130,15 +130,15 @@ p448_sub_RAW (

 void
 p448_copy (
    p448_t *out,
    const p448_t *a
    gf_448_t out,
    const gf_448_t a
 ) {
  *out = *a;
 }

 void
 p448_bias (
    p448_t *a,
    gf_448_t a,
    int amt
 ) {
    uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt;
@@ -152,7 +152,7 @@ p448_bias (

 void
 p448_weak_reduce (
    p448_t *a
    gf_448_t a
 ) {
    uint64_t mask = (1ull<<28) - 1;
    uint64_t tmp = a->limb[15] >> 28;
--- a/src/p448/arch_neon_experimental/f_impl.c
+++ b/src/p448/arch_neon_experimental/f_impl.c
@@ -70,9 +70,9 @@ smull2 (

 void
 p448_mul (
    p448_t *__restrict__ cs,
    const p448_t *as,
    const p448_t *bs
    gf_448_s *__restrict__ cs,
    const gf_448_t as,
    const gf_448_t bs
 ) {
    #define _bl0 "q0"
    #define _bl0_0 "d0"
@@ -369,8 +369,8 @@ p448_mul (

 void
 p448_sqr (
    p448_t *__restrict__ cs,
    const p448_t *bs
    gf_448_s *__restrict__ cs,
    const gf_448_t bs
 ) {
    int32x2_t *vc = (int32x2_t*) cs->limb;

@@ -570,8 +570,8 @@ p448_sqr (

 void
 p448_mulw (
    p448_t *__restrict__ cs,
    const p448_t *as,
    gf_448_s *__restrict__ cs,
    const gf_448_t as,
    uint64_t b
 ) { 
    uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1};
@@ -621,7 +621,7 @@ p448_mulw (
 /* PERF: vectorize? */
 void
 p448_strong_reduce (
    p448_t *a
    gf_448_t a
 ) { 
    word_t mask = (1ull<<28)-1;

@@ -665,15 +665,15 @@ p448_strong_reduce (
 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
    const gf_448_t x
 ) {
    int i,j;
    p448_t red;
    p448_copy(&red, x);
    p448_strong_reduce(&red);
    gf_448_t red;
    p448_copy(red, x);
    p448_strong_reduce(red);
    
    for (i=0; i<8; i++) {
        uint64_t limb = red.limb[LIMBPERM(2*i)] + (((uint64_t)red.limb[LIMBPERM(2*i+1)])<<28);
        uint64_t limb = red->limb[LIMBPERM(2*i)] + (((uint64_t)red->limb[LIMBPERM(2*i+1)])<<28);
        for (j=0; j<7; j++) {
            serial[7*i+j] = limb;
            limb >>= 8;
@@ -684,7 +684,7 @@ p448_serialize (

 mask_t
 p448_deserialize (
    p448_t *x,
    gf_448_t x,
    const uint8_t serial[56]
 ) {
    int i,j;
--- a/src/p448/arch_neon_experimental/f_impl.h
+++ b/src/p448/arch_neon_experimental/f_impl.h
@@ -9,9 +9,9 @@
 #include <stdint.h>
 #include <assert.h>

 typedef struct p448_t {
 typedef struct gf_448_s {
  uint32_t limb[16];
 } __attribute__((aligned(32))) p448_t;
 } __attribute__((aligned(32))) gf_448_s, gf_448_t[1];

 #define LIMBPERM(x) (((x)<<1 | (x)>>3) & 15)
 #define USE_NEON_PERM 1
@@ -30,69 +30,69 @@ extern "C" {

 static __inline__ void
 p448_add_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_sub_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_copy (
    p448_t *out,
    const p448_t *a
    gf_448_t out,
    const gf_448_t a
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_weak_reduce (
    p448_t *inout
    gf_448_t inout
 ) __attribute__((unused,always_inline));
             
 void
 p448_strong_reduce (
    p448_t *inout
    gf_448_t inout
 );
             
 static __inline__ void
 p448_bias (
    p448_t *inout,
    gf_448_t inout,
    int amount
 ) __attribute__((unused,always_inline));

 void
 p448_mul (
    p448_t *__restrict__ out,
    const p448_t *a,
    const p448_t *b
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    const gf_448_t b
 );

 void
 p448_mulw (
    p448_t *__restrict__ out,
    const p448_t *a,
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    uint64_t b
 );

 void
 p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
    gf_448_s *__restrict__ out,
    const gf_448_t a
 );

 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
    const gf_448_t x
 );

 mask_t
 p448_deserialize (
    p448_t *x,
    gf_448_t x,
    const uint8_t serial[56]
 );

@@ -100,9 +100,9 @@ p448_deserialize (

 void
 p448_add_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
@@ -112,9 +112,9 @@ p448_add_RAW (

 void
 p448_sub_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
@@ -130,15 +130,15 @@ p448_sub_RAW (

 void
 p448_copy (
    p448_t *out,
    const p448_t *a
    gf_448_t out,
    const gf_448_t a
 ) {
  *out = *a;
 }

 void
 p448_bias (
    p448_t *a,
    gf_448_t a,
    int amt
 ) {
    uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt;
@@ -152,7 +152,7 @@ p448_bias (

 void
 p448_weak_reduce (
    p448_t *a
    gf_448_t a
 ) {

    uint32x2_t *aa = (uint32x2_t*) a, vmask = {(1ull<<28)-1, (1ull<<28)-1}, vm2 = {0,-1},
--- a/src/p448/arch_ref64/f_impl.c
+++ b/src/p448/arch_ref64/f_impl.c
@@ -18,9 +18,9 @@ static __inline__ uint64_t is_zero(uint64_t a) {

 void
 p448_mul (
    p448_t *__restrict__ cs,
    const p448_t *as,
    const p448_t *bs
    gf_448_s *__restrict__ cs,
    const gf_448_t as,
    const gf_448_t bs
 ) {
    const uint64_t *a = as->limb, *b = bs->limb;
    uint64_t *c = cs->limb;
@@ -184,8 +184,8 @@ p448_mul (

 void
 p448_mulw (
    p448_t *__restrict__ cs,
    const p448_t *as,
    gf_448_s *__restrict__ cs,
    const gf_448_t as,
    uint64_t b
 ) {
    const uint64_t *a = as->limb;
@@ -213,8 +213,8 @@ p448_mulw (

 void
 p448_sqr (
    p448_t *__restrict__ cs,
    const p448_t *as
    gf_448_s *__restrict__ cs,
    const gf_448_t as
 ) {
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;
@@ -328,7 +328,7 @@ p448_sqr (

 void
 p448_strong_reduce (
    p448_t *a
    gf_448_t a
 ) {
    uint64_t mask = (1ull<<56)-1;

@@ -372,24 +372,24 @@ p448_strong_reduce (
 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
    const gf_448_t x
 ) {
    int i,j;
    p448_t red;
    p448_copy(&red, x);
    p448_strong_reduce(&red);
    gf_448_t red;
    p448_copy(red, x);
    p448_strong_reduce(red);
    for (i=0; i<8; i++) {
        for (j=0; j<7; j++) {
            serial[7*i+j] = red.limb[i];
            red.limb[i] >>= 8;
            serial[7*i+j] = red->limb[i];
            red->limb[i] >>= 8;
        }
        assert(red.limb[i] == 0);
        assert(red->limb[i] == 0);
    }
 }

 mask_t
 p448_deserialize (
    p448_t *x,
    gf_448_t x,
    const uint8_t serial[56]
 ) {
    int i,j;
--- a/src/p448/arch_ref64/f_impl.h
+++ b/src/p448/arch_ref64/f_impl.h
@@ -10,9 +10,9 @@

 #include "word.h"

 typedef struct p448_t {
 typedef struct gf_448_s {
  uint64_t limb[8];
 } __attribute__((aligned(32))) p448_t;
 } __attribute__((aligned(32))) gf_448_s, gf_448_t[1];

 #define LBITS 56
 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
@@ -23,69 +23,69 @@ extern "C" {

 static __inline__ void
 p448_add_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused));
             
 static __inline__ void
 p448_sub_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused));
             
 static __inline__ void
 p448_copy (
    p448_t *out,
    const p448_t *a
    gf_448_t out,
    const gf_448_t a
 ) __attribute__((unused));
             
 static __inline__ void
 p448_weak_reduce (
    p448_t *inout
    gf_448_t inout
 ) __attribute__((unused));
             
 void
 p448_strong_reduce (
    p448_t *inout
    gf_448_t inout
 );

 static __inline__ void
 p448_bias (
    p448_t *inout,
    gf_448_t inout,
    int amount
 ) __attribute__((unused));
         
 void
 p448_mul (
    p448_t *__restrict__ out,
    const p448_t *a,
    const p448_t *b
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    const gf_448_t b
 );

 void
 p448_mulw (
    p448_t *__restrict__ out,
    const p448_t *a,
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    uint64_t b
 );

 void
 p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
    gf_448_s *__restrict__ out,
    const gf_448_t a
 );

 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
    const gf_448_t x
 );

 mask_t
 p448_deserialize (
    p448_t *x,
    gf_448_t x,
    const uint8_t serial[56]
 );

@@ -93,9 +93,9 @@ p448_deserialize (

 void
 p448_add_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
    unsigned int i;
    for (i=0; i<8; i++) {
@@ -106,9 +106,9 @@ p448_add_RAW (

 void
 p448_sub_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
    unsigned int i;
    uint64_t co1 = ((1ull<<56)-1)*2, co2 = co1-2;
@@ -120,15 +120,15 @@ p448_sub_RAW (

 void
 p448_copy (
    p448_t *out,
    const p448_t *a
    gf_448_t out,
    const gf_448_t a
 ) {
    memcpy(out,a,sizeof(*a));
 }

 void
 p448_bias (
    p448_t *a,
    gf_448_t a,
    int amt
 ) {
    (void) a;
@@ -137,7 +137,7 @@ p448_bias (

 void
 p448_weak_reduce (
    p448_t *a
    gf_448_t a
 ) {
    uint64_t mask = (1ull<<56) - 1;
    uint64_t tmp = a->limb[7] >> 56;
--- a/src/p448/arch_x86_64/f_impl.c
+++ b/src/p448/arch_x86_64/f_impl.c
@@ -7,9 +7,9 @@

 void
 p448_mul (
    p448_t *__restrict__ cs,
    const p448_t *as,
    const p448_t *bs
    gf_448_s *__restrict__ cs,
    const gf_448_t as,
    const gf_448_t bs
 ) {
    const uint64_t *a = as->limb, *b = bs->limb;
    uint64_t *c = cs->limb;
@@ -147,8 +147,8 @@ p448_mul (

 void
 p448_mulw (
    p448_t *__restrict__ cs,
    const p448_t *as,
    gf_448_s *__restrict__ cs,
    const gf_448_t as,
    uint64_t b
 ) {
    const uint64_t *a = as->limb;
@@ -192,8 +192,8 @@ p448_mulw (

 void
 p448_sqr (
    p448_t *__restrict__ cs,
    const p448_t *as
    gf_448_s *__restrict__ cs,
    const gf_448_t as
 ) {
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;
@@ -307,7 +307,7 @@ p448_sqr (

 void
 p448_strong_reduce (
    p448_t *a
    gf_448_t a
 ) {
    uint64_t mask = (1ull<<56)-1;

@@ -351,24 +351,24 @@ p448_strong_reduce (
 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
    const gf_448_t x
 ) {
    int i,j;
    p448_t red;
    p448_copy(&red, x);
    p448_strong_reduce(&red);
    gf_448_t red;
    p448_copy(red, x);
    p448_strong_reduce(red);
    for (i=0; i<8; i++) {
        for (j=0; j<7; j++) {
            serial[7*i+j] = red.limb[i];
            red.limb[i] >>= 8;
            serial[7*i+j] = red->limb[i];
            red->limb[i] >>= 8;
        }
        assert(red.limb[i] == 0);
        assert(red->limb[i] == 0);
    }
 }

 mask_t
 p448_deserialize (
    p448_t *x,
    gf_448_t x,
    const uint8_t serial[56]
 ) {
    int i,j;
--- a/src/p448/arch_x86_64/f_impl.h
+++ b/src/p448/arch_x86_64/f_impl.h
@@ -9,9 +9,12 @@

 #include "word.h"

 typedef struct p448_t {
 #ifndef __DECAF_448_H__ // HACK FIXME
 #define DECAF_WORD_BITS 64
 typedef struct gf_448_s {
  uint64_t limb[8];
 } __attribute__((aligned(32))) p448_t;
 } __attribute__((aligned(32))) gf_448_s, gf_448_t[1];
 #endif

 #define LBITS 56
 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
@@ -22,69 +25,69 @@ extern "C" {

 static __inline__ void
 p448_add_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_sub_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_copy (
    p448_t *out,
    const p448_t *a
    gf_448_t out,
    const gf_448_t a
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_weak_reduce (
    p448_t *inout
    gf_448_t inout
 ) __attribute__((unused,always_inline));
             
 void
 p448_strong_reduce (
    p448_t *inout
    gf_448_t inout
 );

 static __inline__ void
 p448_bias (
    p448_t *inout,
    gf_448_t inout,
    int amount
 ) __attribute__((unused,always_inline));
         
 void
 p448_mul (
    p448_t *__restrict__ out,
    const p448_t *a,
    const p448_t *b
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    const gf_448_t b
 );

 void
 p448_mulw (
    p448_t *__restrict__ out,
    const p448_t *a,
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    uint64_t b
 );

 void
 p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
    gf_448_s *__restrict__ out,
    const gf_448_t a
 );

 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
    const gf_448_t x
 );

 mask_t
 p448_deserialize (
    p448_t *x,
    gf_448_t x,
    const uint8_t serial[56]
 );

@@ -92,9 +95,9 @@ p448_deserialize (

 void
 p448_add_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
@@ -110,9 +113,9 @@ p448_add_RAW (

 void
 p448_sub_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
@@ -128,8 +131,8 @@ p448_sub_RAW (

 void
 p448_copy (
    p448_t *out,
    const p448_t *a
    gf_448_t out,
    const gf_448_t a
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(big_register_t); i++) {
@@ -139,7 +142,7 @@ p448_copy (

 void
 p448_bias (
    p448_t *a,
    gf_448_t a,
    int amt
 ) {
    uint64_t co1 = ((1ull<<56)-1)*amt, co2 = co1-amt;
@@ -166,7 +169,7 @@ p448_bias (

 void
 p448_weak_reduce (
    p448_t *a
    gf_448_t a
 ) {
    /* PERF: use pshufb/palignr if anyone cares about speed of this */
    uint64_t mask = (1ull<<56) - 1;
--- a/src/p448/f_field.h
+++ b/src/p448/f_field.h
@@ -15,7 +15,8 @@
 #include "f_impl.h"
 #define GF_LIT_LIMB_BITS  56
 #define GF_BITS           448
 #define gf              p448_t
 #define gf                gf_448_t
 #define gf_s              gf_448_s
 #define gf_mul            p448_mul
 #define gf_sqr            p448_sqr
 #define gf_add_RAW        p448_add_RAW
--- a/src/public_include/decaf.hxx
+++ b/src/public_include/decaf.hxx
@@ -3,6 +3,7 @@
 #define __DECAF_HXX__ 1

 #include <decaf/decaf_255.hxx> // MAGIC
 #include <decaf/decaf_448.hxx> // MAGIC

 #endif /* __DECAF_HXX__ */

--- a/src/public_include/decaf/decaf_255.hxx
+++ b/src/public_include/decaf/decaf_255.hxx
@@ -46,7 +46,13 @@ namespace decaf {
 /**
 * @brief Curve25519/Decaf instantiation of group.
 */
 struct Ed255 {
 struct IsoEd25519 {
    
 /** The name of the curve */
 static inline const char *name() { return "IsoEd25519"; }

 /** The curve's cofactor (removed, but useful for testing) */
 static const int REMOVED_COFACTOR = 8;

 /** @cond internal */
 class Point;
@@ -533,17 +539,17 @@ public:
    /** @endcond */
 };

 }; /* struct Ed255 */
 }; /* struct IsoEd25519 */



 /** @cond internal */
 inline SecureBuffer Ed255::Scalar::direct_scalarmul (
 inline SecureBuffer IsoEd25519::Scalar::direct_scalarmul (
    const Block &in,
    decaf_bool_t allow_identity,
    decaf_bool_t short_circuit
 ) const throw(CryptoException) {
    SecureBuffer out(Ed255::Point::SER_BYTES);
    SecureBuffer out(IsoEd25519::Point::SER_BYTES);
    if (!decaf_255_direct_scalarmul(out, in.data(), s, allow_identity, short_circuit))
        throw CryptoException();
    return out;
--- a/src/public_include/decaf/decaf_448.h
+++ b/src/public_include/decaf/decaf_448.h
@@ -426,7 +426,7 @@ decaf_bool_t decaf_448_point_valid (
 ) API_VIS WARN_UNUSED NONNULL1 NOINLINE;

 /**
 * @brief 2-torque a point, for debugging purposes.
 * @brief Torque a point, for debugging purposes.
 *
 * @param [out] q The torqued point.
 * @param [in] p The point to torque.
@@ -436,6 +436,21 @@ void decaf_448_point_debugging_torque (
     const decaf_448_point_t p
 ) API_VIS NONNULL2 NOINLINE;

 /**
 * @brief Projectively scale a point, for debugging purposes.
 * The output will be equal to the input, and will be valid
 * even if the factor is zero.
 *
 * @param [out] q The scaled point.
 * @param [in] p The point to scale.
 * @param [in] factor Serialized GF factor to scale by.
 */
 void decaf_448_point_debugging_pscale (
     decaf_448_point_t q,
     const decaf_448_point_t p,
     const unsigned char factor[DECAF_448_SER_BYTES]
 ) API_VIS NONNULL2 NOINLINE;

 /**
 * @brief Almost-Elligator-like hash to curve.
 *
--- a/src/public_include/decaf/decaf_448.hxx
+++ b/src/public_include/decaf/decaf_448.hxx
@@ -46,7 +46,13 @@ namespace decaf {
 /**
 * @brief Ed448-Goldilocks/Decaf instantiation of group.
 */
 struct Ed448 {
 struct Ed448Goldilocks {
    
 /** The name of the curve */
 static inline const char *name() { return "Ed448-Goldilocks"; }

 /** The curve's cofactor (removed, but useful for testing) */
 static const int REMOVED_COFACTOR = 4;

 /** @cond internal */
 class Point;
--- a/test/bench_decaf.cxx
+++ b/test/bench_decaf.cxx
@@ -20,9 +20,6 @@
 #include <algorithm>

 using namespace decaf;
 typedef Ed255::Scalar Scalar;
 typedef Ed255::Point Point;
 typedef Ed255::Precomputed Precomputed;


 static __inline__ void __attribute__((unused)) ignore_result ( int result ) { (void)result; }
@@ -140,6 +137,13 @@ public:

 double Benchmark::totalCy = 0, Benchmark::totalS = 0;


 template<typename Group> struct Benches {

 typedef typename Group::Scalar Scalar;
 typedef typename Group::Point Point;
 typedef typename Group::Precomputed Precomputed;

 static void tdh (
    SpongeRng &clientRng,
    SpongeRng &serverRng,
@@ -274,6 +278,62 @@ static void spake2ee(
    server.respec(STROBE_KEYED_128);
 }

 /**
  * Run the protocol-level ("macro") benchmarks for the curve selected by
  * the Group template parameter, printing a header that includes
  * Group::name().
  *
  * Benchmarks run, in order: Spake2ee (twice, with the final flag false
  * then true), FHMQV, and anonymous TripleDH.  Each is driven by a
  * Benchmark object scoped to its own for-loop.
  *
  * NOTE(review): the meaning of Benchmark's second constructor argument
  * (0.1 here) is defined outside this block -- confirm before changing it.
  */
 static void macro() {
    printf("\nMacro-benchmarks for %s:\n", Group::name());
    printf("Protocol benchmarks:\n");
    /* Client and server RNGs are seeded from fixed strings. */
    SpongeRng clientRng(Block("client rng seed"));
    SpongeRng serverRng(Block("server rng seed"));
    SecureBuffer hashedPassword("hello world");
    /* Final argument false; presumably the non-augmented variant (the
     * loop below is labelled "aug") -- verify against spake2ee(). */
    for (Benchmark b("Spake2ee c+s",0.1); b.iter(); ) {
        spake2ee(clientRng, serverRng, hashedPassword,false);
    }
    
    for (Benchmark b("Spake2ee c+s aug",0.1); b.iter(); ) {
        spake2ee(clientRng, serverRng, hashedPassword,true);
    }
    
    /* One scalar per side drawn from that side's RNG; gx and gy hold
     * Precomputed::base() times the scalar, converted to SecureBuffer.
     * They are created once and reused by both loops below. */
    Scalar x(clientRng);
    SecureBuffer gx(Precomputed::base() * x);
    Scalar y(serverRng);
    SecureBuffer gy(Precomputed::base() * y);
    
    for (Benchmark b("FHMQV c+s",0.1); b.iter(); ) {
        fhmqv(clientRng, serverRng,x,gx,y,gy);
    }
    
    for (Benchmark b("TripleDH anon c+s",0.1); b.iter(); ) {
        tdh(clientRng, serverRng, x,gx,y,gy);
    }
 }

 /**
  * Run the per-operation ("micro") benchmarks for the curve selected by
  * the Group template parameter, printing a header that includes
  * Group::name().
  *
  * Each line times one scalar or point operation inside a Benchmark
  * object scoped to its own for-loop.  The loops share state: p, s and
  * ep are modified by earlier loops and read by later ones, so the
  * order of the loops matters.
  *
  * NOTE(review): the meaning of Benchmark's optional second constructor
  * argument (1000, 100, 1 below) is defined outside this block -- confirm
  * before changing it.
  */
 static void micro() {
    SpongeRng rng(Block("per-curve-benchmarks"));
    /* Operands are default-constructed; their initial values depend on
     * constructors that are not visible here. */
    Precomputed pBase;
    Point p,q;
    Scalar s,t;
    /* ep starts empty and is filled by the "Point encode" loop; ep2 is
     * pre-sized to twice Point::SER_BYTES for the "uniform" variants. */
    SecureBuffer ep, ep2(Point::SER_BYTES*2);
    
    printf("\nMicro-benchmarks for %s:\n", Group::name());
    for (Benchmark b("Scalar add", 1000); b.iter(); ) { s+=t; }
    for (Benchmark b("Scalar times", 100); b.iter(); ) { s*=t; }
    /* The result of inverse() is discarded here, as are the results of
     * the scalarmul, from_hash and steg_encode calls below. */
    for (Benchmark b("Scalar inv", 1); b.iter(); ) { s.inverse(); }
    for (Benchmark b("Point add", 100); b.iter(); ) { p += q; }
    for (Benchmark b("Point double", 100); b.iter(); ) { p.double_in_place(); }
    for (Benchmark b("Point scalarmul"); b.iter(); ) { p * s; }
    /* Encode writes ep; the decode, hash and unhash loops then read it. */
    for (Benchmark b("Point encode"); b.iter(); ) { ep = SecureBuffer(p); }
    for (Benchmark b("Point decode"); b.iter(); ) { p = Point(ep); }
    for (Benchmark b("Point create/destroy"); b.iter(); ) { Point r; }
    for (Benchmark b("Point hash nonuniform"); b.iter(); ) { Point::from_hash(ep); }
    for (Benchmark b("Point hash uniform"); b.iter(); ) { Point::from_hash(ep2); }
    /* invert_elligator's return value is deliberately passed to
     * ignore_result(); the second argument (0) is presumably a hint
     * selector -- TODO confirm against the Point API. */
    for (Benchmark b("Point unhash nonuniform"); b.iter(); ) { ignore_result(p.invert_elligator(ep,0)); }
    for (Benchmark b("Point unhash uniform"); b.iter(); ) { ignore_result(p.invert_elligator(ep2,0)); }
    for (Benchmark b("Point steg"); b.iter(); ) { p.steg_encode(rng); }
    for (Benchmark b("Point double scalarmul"); b.iter(); ) { Point::double_scalarmul(p,s,q,t); }
    for (Benchmark b("Point precmp scalarmul"); b.iter(); ) { pBase * s; }
 }

 }; /* template <typename Group> struct Benches */

 int main(int argc, char **argv) {
    bool micro = false;
    if (argc >= 2 && !strcmp(argv[1], "--micro"))
@@ -293,10 +353,6 @@ int main(int argc, char **argv) {


    if (micro) {
        Precomputed pBase;
        Point p,q;
        Scalar s,t;
        SecureBuffer ep, ep2(Point::SER_BYTES*2);
        SpongeRng rng(Block("micro-benchmarks"));
        
        printf("\nMicro-benchmarks:\n");
@@ -325,25 +381,12 @@ int main(int argc, char **argv) {
        for (Benchmark b("STROBEk256 1kiB", 10); b.iter(); ) {
            strobe.encrypt_no_auth(TmpBuffer(b1024,1024),TmpBuffer(b1024,1024),b.i>1);
        }
        for (Benchmark b("Scalar add", 1000); b.iter(); ) { s+=t; }
        for (Benchmark b("Scalar times", 100); b.iter(); ) { s*=t; }
        for (Benchmark b("Scalar inv", 1); b.iter(); ) { s.inverse(); }
        for (Benchmark b("Point add", 100); b.iter(); ) { p += q; }
        for (Benchmark b("Point double", 100); b.iter(); ) { p.double_in_place(); }
        for (Benchmark b("Point scalarmul"); b.iter(); ) { p * s; }
        for (Benchmark b("Point encode"); b.iter(); ) { ep = SecureBuffer(p); }
        for (Benchmark b("Point decode"); b.iter(); ) { p = Point(ep); }
        for (Benchmark b("Point create/destroy"); b.iter(); ) { Point r; }
        for (Benchmark b("Point hash nonuniform"); b.iter(); ) { Point::from_hash(ep); }
        for (Benchmark b("Point hash uniform"); b.iter(); ) { Point::from_hash(ep2); }
        for (Benchmark b("Point unhash nonuniform"); b.iter(); ) { ignore_result(p.invert_elligator(ep,0)); }
        for (Benchmark b("Point unhash uniform"); b.iter(); ) { ignore_result(p.invert_elligator(ep2,0)); }
        for (Benchmark b("Point steg"); b.iter(); ) { p.steg_encode(rng); }
        for (Benchmark b("Point double scalarmul"); b.iter(); ) { Point::double_scalarmul(p,s,q,t); }
        for (Benchmark b("Point precmp scalarmul"); b.iter(); ) { pBase * s; }
        /* TODO: scalarmul for verif, etc */
        Benches<IsoEd25519>::micro();
        Benches<Ed448Goldilocks>::micro();
    }

    /* TODO: 255->448 */
    printf("\nMacro-benchmarks:\n");
    for (Benchmark b("Keygen"); b.iter(); ) {
        decaf_255_derive_private_key(s1,r1);
@@ -369,31 +412,9 @@ int main(int argc, char **argv) {
        umessage[1]^=umessage[0];
        ignore_result(ret);
    }

    printf("\nProtocol benchmarks:\n");
    SpongeRng clientRng(Block("client rng seed"));
    SpongeRng serverRng(Block("server rng seed"));
    SecureBuffer hashedPassword("hello world");
    for (Benchmark b("Spake2ee c+s",0.1); b.iter(); ) {
        spake2ee(clientRng, serverRng, hashedPassword,false);
    }
    
    for (Benchmark b("Spake2ee c+s aug",0.1); b.iter(); ) {
        spake2ee(clientRng, serverRng, hashedPassword,true);
    }
    
    Scalar x(clientRng);
    SecureBuffer gx(Precomputed::base() * x);
    Scalar y(serverRng);
    SecureBuffer gy(Precomputed::base() * y);
    
    for (Benchmark b("FHMQV c+s",0.1); b.iter(); ) {
        fhmqv(clientRng, serverRng,x,gx,y,gy);
    }
    
    for (Benchmark b("TripleDH anon c+s",0.1); b.iter(); ) {
        tdh(clientRng, serverRng, x,gx,y,gy);
    }
    Benches<IsoEd25519>::macro();
    Benches<Ed448Goldilocks>::macro();
    
    printf("\n");
    Benchmark::calib();
--- a/test/test_decaf.cxx
+++ b/test/test_decaf.cxx
@@ -164,7 +164,7 @@ static void test_elligator() {
    decaf::SpongeRng rng(decaf::Block("test_elligator"));
    Test test("Elligator");
    
    const int NHINTS = 1<<4;
    const int NHINTS = Group::REMOVED_COFACTOR * 2;
    decaf::SecureBuffer *alts[NHINTS];
    bool successes[NHINTS];
    decaf::SecureBuffer *alts2[NHINTS];
@@ -312,7 +312,7 @@ static void test_ec() {

 }; // template<typename Group> struct Tests


 // FIXME cross-field
 static void test_decaf() {
    Test test("Sample crypto");
    decaf::SpongeRng rng(decaf::Block("test_decaf"));
@@ -350,11 +350,18 @@ static void test_decaf() {
 int main(int argc, char **argv) {
    (void) argc; (void) argv;
    
    Tests<decaf::Ed255>::test_arithmetic();
    Tests<decaf::Ed255>::test_elligator();
    Tests<decaf::Ed255>::test_ec();
    printf("Testing %s:\n", decaf::IsoEd25519::name());
    Tests<decaf::IsoEd25519>::test_arithmetic();
    Tests<decaf::IsoEd25519>::test_elligator();
    Tests<decaf::IsoEd25519>::test_ec();
    test_decaf();
    
    printf("\n");
    printf("Testing %s:\n", decaf::Ed448Goldilocks::name());
    Tests<decaf::Ed448Goldilocks>::test_arithmetic();
    Tests<decaf::Ed448Goldilocks>::test_elligator();
    Tests<decaf::Ed448Goldilocks>::test_ec();
    
    if (passing) printf("Passed all tests.\n");
    
    return passing ? 0 : 1;