working on multicurve build system

10 years ago · 719fcacc58
--- a/+ 62
+++ b/+ 62
@@ -44,7 +44,7 @@ FIELD ?= p25519
 WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
 	 -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)

 INCFLAGS = -Isrc/include -Isrc/public_include -Isrc/$(FIELD) -Isrc/$(FIELD)/$(ARCH)
 INCFLAGS = -Isrc/include -Isrc/public_include
 LANGFLAGS = -std=c99 -fno-strict-aliasing
 LANGXXFLAGS = -fno-strict-aliasing
 GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC
@@ -83,10 +83,16 @@ BUILDPYS= $(SAGES:test/%.sage=$(BUILD_PY)/%.py)

 HEADERS= Makefile $(shell find src test -name "*.h") $(shell find . -name "*.hxx") $(BUILD_OBJ)/timestamp

 DECAFCOMPONENTS= $(BUILD_OBJ)/$(DECAF).o $(BUILD_OBJ)/shake.o $(BUILD_OBJ)/decaf_crypto.o \
 	$(BUILD_OBJ)/$(FIELD).o $(BUILD_OBJ)/f_arithmetic.o $(BUILD_OBJ)/utils.o
 # components needed by the table generators
 # Every $(BUILD_IBIN)/decaf_gen_tables_% binary links against this list, and
 # it is also folded into DECAFCOMPONENTS.  The entries are currently fixed to
 # the ed25519 / p25519 objects.  Note that the last continuation runs into a
 # commented-out line that holds the (not yet enabled) p448 objects.
 GENCOMPONENTS=  \
 	$(BUILD_OBJ)/$(DECAF)_ed25519.o $(BUILD_OBJ)/p25519_impl.o  $(BUILD_OBJ)/p25519_arithmetic.o \
 	$(BUILD_OBJ)/utils.o \
 	#$(BUILD_OBJ)/p448_impl.o $(BUILD_OBJ)/p448_arithmetic.o

 # components needed by the lib
 DECAFCOMPONENTS= $(BUILD_OBJ)/shake.o $(BUILD_OBJ)/decaf_crypto.o $(GENCOMPONENTS)
 ifeq ($(DECAF),decaf_fast)
 DECAFCOMPONENTS += $(BUILD_OBJ)/decaf_tables.o
 DECAFCOMPONENTS += $(BUILD_OBJ)/decaf_tables_ed25519.o
 endif

 BENCHCOMPONENTS = $(BUILD_OBJ)/bench.o $(BUILD_OBJ)/shake.o
@@ -143,15 +149,39 @@ $(BUILD_OBJ)/timestamp:
 $(BUILD_OBJ)/%.o: $(BUILD_ASM)/%.s
 	$(ASM) $(ASFLAGS) -c -o $@ $<

 $(BUILD_IBIN)/decaf_gen_tables: $(BUILD_OBJ)/decaf_gen_tables.o \
 		$(BUILD_OBJ)/$(DECAF).o $(BUILD_OBJ)/$(FIELD).o $(BUILD_OBJ)/f_arithmetic.o $(BUILD_OBJ)/utils.o
 # Assemble the per-curve table generator objects.  The recipe is the same as
 # the one on the generic $(BUILD_OBJ)/%.o rule; this narrower pattern was added
 # because the build needed it, and the reason is not known (bug in make, or an
 # obscure pattern matching rule?).
 $(BUILD_OBJ)/decaf_gen_tables_%.o: $(BUILD_ASM)/decaf_gen_tables_%.s
 	$(ASM) $(ASFLAGS) -c -o $@ $<

 # Link one table generator per curve: the generator object for the stem plus
 # the shared GENCOMPONENTS objects ($^ expands to all prerequisites).
 $(BUILD_IBIN)/decaf_gen_tables_%: $(BUILD_OBJ)/decaf_gen_tables_%.o $(GENCOMPONENTS)
 	$(LD) $(LDFLAGS) -o $@ $^
 	
 $(BUILD_C)/decaf_tables.c: $(BUILD_IBIN)/decaf_gen_tables
 # Run the per-curve generator and capture its stdout as the table source.
 # The output goes to a temporary name first and is renamed only on success,
 # so a failing generator cannot leave a truncated file that make would then
 # treat as up to date.
 $(BUILD_C)/decaf_tables_%.c: $(BUILD_IBIN)/decaf_gen_tables_%
 	./$< > $@.tmp
 	mv -f $@.tmp $@
 	
 $(BUILD_ASM)/decaf_tables.s: $(BUILD_C)/decaf_tables.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $<
 # Compile a generated per-curve table source to assembly.  The stem ($*)
 # selects the curve's include directories.  The recipe must not end in a
 # backslash: a trailing continuation would splice the following line into
 # this command.
 $(BUILD_ASM)/decaf_tables_%.s: $(BUILD_C)/decaf_tables_%.c $(HEADERS)
 	$(CC) $(CFLAGS) \
 		-I src/curve_$*/ -I src/curve_$*/field -I src/curve_$*/field/$(ARCH) \
 		-S -c -o $@ $<
 	
 # Compile the single table-generator source once per curve; the stem ($*)
 # names the curve whose directories are put on the include path.
 $(BUILD_ASM)/decaf_gen_tables_%.s: src/decaf_gen_tables.c $(HEADERS)
 	$(CC) $(CFLAGS) \
 		-I src/curve_$*/ \
 		-I src/curve_$*/field \
 		-I src/curve_$*/field/$(ARCH) \
 		-S -c -o $@ $<
 	
 # Compile src/decaf_fast.c once per curve.  The source includes
 # "curve_data.inc.c", which resolves through the curve directory selected
 # by the stem ($*).
 $(BUILD_ASM)/decaf_fast_%.s: src/decaf_fast.c $(HEADERS)
 	$(CC) $(CFLAGS) \
 		-I src/curve_$*/ \
 		-I src/curve_$*/field \
 		-I src/curve_$*/field/$(ARCH) \
 		-S -c -o $@ $<
 	
 # Compile a field's generic arithmetic (src/<field>/f_arithmetic.c); the stem
 # ($*) is the field directory name and also supplies the include paths.
 $(BUILD_ASM)/%_arithmetic.s: src/%/f_arithmetic.c $(HEADERS)
 	$(CC) $(CFLAGS) \
 		-I src/$* \
 		-I src/$*/$(ARCH) \
 		-S -c -o $@ $<
 	
 # Compile a field's architecture-specific implementation
 # (src/<field>/$(ARCH)/f_impl.c); the stem ($*) is the field directory name.
 $(BUILD_ASM)/%_impl.s: src/%/$(ARCH)/f_impl.c $(HEADERS)
 	$(CC) $(CFLAGS) \
 		-I src/$* \
 		-I src/$*/$(ARCH) \
 		-S -c -o $@ $<
 	
 $(BUILD_ASM)/%.s: src/%.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $<
@@ -165,12 +195,6 @@ $(BUILD_ASM)/%.s: test/%.c $(HEADERS)
 $(BUILD_ASM)/%.s: test/%.cxx $(HEADERS)
 	$(CXX) $(CXXFLAGS) -S -c -o $@ $<

 $(BUILD_ASM)/%.s: src/$(FIELD)/$(ARCH)/%.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $<

 $(BUILD_ASM)/%.s: src/$(FIELD)/%.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $<

 # The sage test scripts
 sage: $(BUILDPYS)

@@ -191,29 +215,29 @@ $(BUILDPYS): $(SAGES) $(BUILD_OBJ)/timestamp
 $(BUILD_DOC)/timestamp:
 	mkdir -p `dirname $@`
 	touch $@

 doc: Doxyfile $(BUILD_OBJ)/timestamp $(HEADERS) src/*.c src/$(FIELD)/$(ARCH)/*.c src/$(FIELD)/$(ARCH)/*.h
 	doxygen > /dev/null

 # The eBATS benchmarking script
 bat: $(BATNAME)

 $(BATNAME): include/* src/* src/*/* test/batarch.map $(BUILD_C)/decaf_tables.c # TODO tables some other way
 	rm -fr $@
 	for prim in dh sign; do \
          targ="$@/crypto_$$prim/ed448goldilocks_decaf"; \
 	  (while read arch where; do \
 	    mkdir -p $$targ/`basename $$arch`; \
 	    cp include/*.h $(BUILD_C)/decaf_tables.c src/decaf_fast.c src/decaf_crypto.c src/shake.c src/include/*.h src/bat/$$prim.c src/p448/$$where/*.c src/p448/$$where/*.h src/p448/*.c src/p448/*.h $$targ/`basename $$arch`; \
 	    cp src/bat/api_$$prim.h $$targ/`basename $$arch`/api.h; \
 	    perl -p -i -e 's/SYSNAME/'`basename $(BATNAME)`_`basename $$arch`'/g' $$targ/`basename $$arch`/api.h;  \
 	    perl -p -i -e 's/__TODAY__/'$(TODAY)'/g' $$targ/`basename $$arch`/api.h;  \
 	    done \
 	  ) < test/batarch.map; \
 	  echo 'Mike Hamburg' > $$targ/designers; \
 	  echo 'Ed448-Goldilocks Decaf sign and dh' > $$targ/description; \
        done
 	(cd $(BATNAME)/.. && tar czf $(BATBASE).tgz $(BATBASE) )
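 # TODO(review): the doc and bat targets below are commented out.  The bat
 # recipe still copies from src/p448 and $(BUILD_C)/decaf_tables.c, so it
 # presumably awaits porting to the per-curve layout -- confirm, then restore
 # or delete these lines.
 #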
 # doc: Doxyfile $(BUILD_OBJ)/timestamp $(HEADERS) src/*.c src/$(FIELD)/$(ARCH)/*.c src/$(FIELD)/$(ARCH)/*.h
 # 	doxygen > /dev/null

 # # The eBATS benchmarking script
 # bat: $(BATNAME)
 #
 # $(BATNAME): include/* src/* src/*/* test/batarch.map $(BUILD_C)/decaf_tables.c # TODO tables some other way
 # 	rm -fr $@
 # 	for prim in dh sign; do \
 #           targ="$@/crypto_$$prim/ed448goldilocks_decaf"; \
 # 	  (while read arch where; do \
 # 	    mkdir -p $$targ/`basename $$arch`; \
 # 	    cp include/*.h $(BUILD_C)/decaf_tables.c src/decaf_fast.c src/decaf_crypto.c src/shake.c src/include/*.h src/bat/$$prim.c src/p448/$$where/*.c src/p448/$$where/*.h src/p448/*.c src/p448/*.h $$targ/`basename $$arch`; \
 # 	    cp src/bat/api_$$prim.h $$targ/`basename $$arch`/api.h; \
 # 	    perl -p -i -e 's/SYSNAME/'`basename $(BATNAME)`_`basename $$arch`'/g' $$targ/`basename $$arch`/api.h;  \
 # 	    perl -p -i -e 's/__TODAY__/'$(TODAY)'/g' $$targ/`basename $$arch`/api.h;  \
 # 	    done \
 # 	  ) < test/batarch.map; \
 # 	  echo 'Mike Hamburg' > $$targ/designers; \
 # 	  echo 'Ed448-Goldilocks Decaf sign and dh' > $$targ/description; \
 #         done
 # 	(cd $(BATNAME)/.. && tar czf $(BATBASE).tgz $(BATBASE) )
 	
 # Finds todo items in .h and .c files
 TODO_TYPES ?= HACK TODO FIXME BUG XXX PERF FUTURE REMOVE MAGIC
--- a/src/curve_ed25519/curve_data.inc.c
+++ b/src/curve_ed25519/curve_data.inc.c
@@ -0,0 +1,35 @@
 /* Rename table for eventual factoring into .c.inc, MSR ECC style.
  *
  * Maps the curve-neutral names used by the including source file onto the
  * decaf_255 identifiers, so that the same source can be compiled for a
  * different curve by pointing the include path at another curve directory.
  */
 #define SCALAR_LIMBS DECAF_255_SCALAR_LIMBS
 #define SCALAR_BITS DECAF_255_SCALAR_BITS
 #define NLIMBS DECAF_255_LIMBS
 #define API_NS(_id) decaf_255_##_id
 #define API_NS2(_pref,_id) _pref##_decaf_255_##_id
 #define scalar_t decaf_255_scalar_t
 #define point_t decaf_255_point_t
 #define precomputed_s decaf_255_precomputed_s
 #define SER_BYTES DECAF_255_SER_BYTES
 /* Curve-shape switches; the including file tests these with #if. */
 #define IMAGINE_TWIST 1
 /* Field prime modulo 8. */
 #define P_MOD_8 5
 #define COFACTOR 8

 /* Edwards curve parameter d.
  *
  * This is a macro rather than a `static const int` because the including
  * file derives TWISTED_D from it and tests `#if TWISTED_D < 0`.  Inside #if
  * the preprocessor replaces any identifier that is not a macro with 0, so a
  * C object here would make that sign test ignore the real value of d.
  */
 #define EDWARDS_D (-121665)

 /* Modulus for the scalar Montgomery arithmetic (sc_montmul reads
  * sc_p->limb).  Each SC_LIMB argument is written as one 64-bit value; the
  * limbs appear to be listed least-significant first -- TODO confirm.
  */
 static const scalar_t sc_p = {{{
    SC_LIMB(0x5812631a5cf5d3ed),
    SC_LIMB(0x14def9dea2f79cd6),
    SC_LIMB(0),
    SC_LIMB(0x1000000000000000)
 }}};

 /* sqrt(9) = 3 from the curve spec.  Not exported, but used by pregen tool.
  * Serialized form: the first byte is 3 and every other byte is 0.
  * NOTE(review): unlike the API_NS symbols, this name carries no curve prefix,
  * and the ed448goldilocks curve data defines the same name -- confirm the two
  * are never linked into one binary.
  */
 const unsigned char base_point_ser_for_pregen[SER_BYTES] = {
    3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 };

 /* Field constant used by the COFACTOR == 8 branch of deisogenize in the
  * including file, given as five FIELD_LITERAL limbs.  The name suggests
  * sqrt(1 - d); the value itself is not derived anywhere in view -- TODO
  * confirm against the curve parameters.
  * NOTE(review): the symbol has external linkage and no curve prefix.
  */
 const gf SQRT_ONE_MINUS_D = {FIELD_LITERAL(
    0x6db8831bbddec,
    0x38d7b56c9c165,
    0x016b221394bdc,
    0x7540f7816214a,
    0x0a0d85b4032b1
 )};
--- a/src/curve_ed25519/field
+++ b/src/curve_ed25519/field
@@ -0,0 +1 @@
 ../p25519/
--- a/src/curve_ed448goldilocks/curve_data.inc.c
+++ b/src/curve_ed448goldilocks/curve_data.inc.c
@@ -0,0 +1,29 @@
 /* Rename table: maps the curve-neutral names used by the including source
  * file onto the decaf_448 identifiers.
  */
 #define SCALAR_LIMBS DECAF_448_SCALAR_LIMBS
 #define SCALAR_BITS DECAF_448_SCALAR_BITS
 #define NLIMBS DECAF_448_LIMBS
 #define API_NS(_id) decaf_448_##_id
 #define API_NS2(_pref,_id) _pref##_decaf_448_##_id
 #define scalar_t decaf_448_scalar_t
 #define point_t decaf_448_point_t
 #define precomputed_s decaf_448_precomputed_s
 #define SER_BYTES DECAF_448_SER_BYTES
 /* Curve-shape switches; the including file tests these with #if. */
 #define IMAGINE_TWIST 0
 /* Field prime modulo 8. */
 #define P_MOD_8 7
 #define COFACTOR 4

 /* Edwards curve parameter d.
  *
  * This is a macro rather than a `static const int` because the including
  * file derives TWISTED_D from it and tests `#if TWISTED_D < 0`.  Inside #if
  * the preprocessor replaces any identifier that is not a macro with 0, so a
  * C object here would make that sign test ignore the real value of d.
  */
 #define EDWARDS_D (-39081)

 /* Modulus for the scalar Montgomery arithmetic (sc_montmul reads
  * sc_p->limb).  Each SC_LIMB argument is written as one 64-bit value; the
  * limbs appear to be listed least-significant first -- TODO confirm.
  */
 static const scalar_t sc_p = {{{
    SC_LIMB(0x2378c292ab5844f3),
    SC_LIMB(0x216cc2728dc58f55),
    SC_LIMB(0xc44edb49aed63690),
    SC_LIMB(0xffffffff7cca23e9),
    SC_LIMB(0xffffffffffffffff),
    SC_LIMB(0xffffffffffffffff),
    SC_LIMB(0x3fffffffffffffff)
 }}};
 
 /* sqrt(5) = 2phi-1 from the curve spec.  Not exported, but used by pregen tool.
  * Each -1 converts to the byte 0xff in an unsigned char; any bytes of the
  * array beyond the listed initializers are zero-initialized.
  * NOTE(review): unlike the API_NS symbols, this name carries no curve prefix,
  * and the ed25519 curve data defines the same name -- confirm the two are
  * never linked into one binary.
  */
 const unsigned char base_point_ser_for_pregen[SER_BYTES] = {
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
 };
--- a/src/curve_ed448goldilocks/field
+++ b/src/curve_ed448goldilocks/field
@@ -0,0 +1 @@
 ../p448/
--- a/src/decaf_fast.c
+++ b/src/decaf_fast.c
@@ -16,26 +16,35 @@
 #include "decaf_config.h"

 #define WBITS DECAF_WORD_BITS

 /* Rename table for eventual factoring into .c.inc, MSR ECC style */
 #define SCALAR_LIMBS DECAF_255_SCALAR_LIMBS
 #define SCALAR_BITS DECAF_255_SCALAR_BITS
 #define NLIMBS DECAF_255_LIMBS
 #define API_NS(_id) decaf_255_##_id
 #define API_NS2(_pref,_id) _pref##_decaf_255_##_id
 #define scalar_t decaf_255_scalar_t
 #define point_t decaf_255_point_t
 #define precomputed_s decaf_255_precomputed_s
 #define SER_BYTES DECAF_255_SER_BYTES

 #if WBITS == 64
 typedef __int128_t decaf_sdword_t;
 #define SC_LIMB(x) (x##ull)
    typedef __int128_t decaf_sdword_t;
    #define SC_LIMB(x) (x##ull)
 #elif WBITS == 32
 typedef int64_t decaf_sdword_t;
 #define SC_LIMB(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
    typedef int64_t decaf_sdword_t;
    #define SC_LIMB(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
 #else
 #error "Only supporting 32- and 64-bit platforms right now"
    #error "Only supporting 32- and 64-bit platforms right now"
 #endif


 /* Include the curve data here */
 #include "curve_data.inc.c"


 /* Reject curve configurations that this file does not implement. */
 #if IMAGINE_TWIST && (P_MOD_8 != 5)
 #error "Cannot use IMAGINE_TWIST except for p == 5 mod 8"
 #endif
 
 #if (COFACTOR != 8) && (COFACTOR != 4)
 #error "COFACTOR must be 4 or 8"
 #endif
  
 /* SQRT_MINUS_ONE is referenced by the IMAGINE_TWIST code paths, by the
  * P_MOD_8 == 5 branches of point_from_hash_nonuniform and
  * invert_elligator_nonuniform, and by the COFACTOR == 8 branch of
  * point_debugging_torque.  Declare it whenever any of those configurations
  * is active, so that a curve enabling only one of them still compiles.
  */
 #if IMAGINE_TWIST || (P_MOD_8 == 5) || (COFACTOR == 8)
 extern const gf SQRT_MINUS_ONE;
 #endif
 
 /* Referenced by the COFACTOR == 8 branch of deisogenize. */
 #if COFACTOR == 8
 extern const gf SQRT_ONE_MINUS_D; /* TODO: Intern this? */
 #endif

 #define sv static void
@@ -43,23 +52,9 @@ typedef int64_t decaf_sdword_t;
 #define siv static inline void __attribute__((always_inline))
 static const gf ZERO = {{{0}}}, ONE = {{{1}}};

 static const int EDWARDS_D = -121665;

 static const scalar_t sc_p = {{{
    SC_LIMB(0x5812631a5cf5d3ed),
    SC_LIMB(0x14def9dea2f79cd6),
    SC_LIMB(0),
    SC_LIMB(0x1000000000000000)
 }}};

 const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}};
 extern const scalar_t sc_r2;
 extern const decaf_word_t MONTGOMERY_FACTOR;

 /* sqrt(9) = 3 from the curve spec.  Not exported, but used by pregen tool. */
 const unsigned char base_point_ser_for_pregen[SER_BYTES] = {
    3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 };
 extern const scalar_t API_NS(sc_r2);
 extern const decaf_word_t API_NS(MONTGOMERY_FACTOR);

 extern const point_t API_NS(point_base);

@@ -77,6 +72,7 @@ const precomputed_s *API_NS(precomputed_base) =
 const size_t API_NS2(sizeof,precomputed_s) = sizeof(precomputed_s);
 const size_t API_NS2(alignof,precomputed_s) = 32;

 /* FIXME PERF: Vectorize vs unroll */
 #ifdef __clang__
 #if 100*__clang_major__ + __clang_minor__ > 305
 #define UNROLL _Pragma("clang loop unroll(full)") // PERF FIXME: vectorize?
@@ -222,7 +218,7 @@ snv sc_montmul (
        }
        accum[j] = chain;
        
        mand = accum[0] * MONTGOMERY_FACTOR;
        mand = accum[0] * API_NS(MONTGOMERY_FACTOR);
        chain = 0;
        mier = sc_p->limb;
        for (j=0; j<SCALAR_LIMBS; j++) {
@@ -245,7 +241,7 @@ void API_NS(scalar_mul) (
    const scalar_t b
 ) {
    sc_montmul(out,a,b);
    sc_montmul(out,out,sc_r2);
    sc_montmul(out,out,API_NS(sc_r2));
 }

 /* PERF: could implement this */
@@ -263,7 +259,7 @@ decaf_bool_t API_NS(scalar_invert) (
 #if 0
    /* FIELD MAGIC.  TODO PERF: not updated for 25519 */
    scalar_t chain[7], tmp;
    sc_montmul(chain[0],a,sc_r2);
    sc_montmul(chain[0],a,API_NS(sc_r2));
    
    unsigned int i,j;
    /* Addition chain generated by a not-too-clever SAGE script.  First part: compute a^(2^222-1) */
@@ -311,8 +307,8 @@ decaf_bool_t API_NS(scalar_invert) (
 #else
    scalar_t b, ma;
    int i;
    sc_montmul(b,API_NS(scalar_one),sc_r2);
    sc_montmul(ma,a,sc_r2);
    sc_montmul(b,API_NS(scalar_one),API_NS(sc_r2));
    sc_montmul(ma,a,API_NS(sc_r2));
    for (i=SCALAR_BITS-1; i>=0; i--) {
        sc_montsqr(b,b);
            
@@ -403,8 +399,6 @@ const point_t API_NS(point_identity) = {{{{{0}}},{{{1}}},{{{1}}},{{{0}}}}};
 static void gf_encode ( unsigned char ser[SER_BYTES], gf a ) {
    gf_serialize(ser, (gf_s *)a);
 }
 
 extern const gf SQRT_MINUS_ONE, SQRT_ONE_MINUS_D; /* Intern this? */

 static void deisogenize (
    gf_s *__restrict__ s,
@@ -416,11 +410,13 @@ static void deisogenize (
 ) {
    gf c, d, x, t;
    gf_s *b = s, *a = minus_t_over_s;
    

 #if IMAGINE_TWIST
    gf_mul ( x, p->x, SQRT_MINUS_ONE);
    gf_mul ( t, p->t, SQRT_MINUS_ONE);
    gf_sub ( x, ZERO, x );
    gf_sub ( t, ZERO, t );
 #endif
    
    gf DEBUG;
    gf_add ( a, p->z, x );
@@ -437,6 +433,7 @@ static void deisogenize (
    gf_mul ( b, a, d ); /* 1/tz */

    decaf_bool_t rotate;
 #if (COFACTOR == 8)
    {
        gf e;
        gf_sqr(e, p->z);
@@ -447,7 +444,9 @@ static void deisogenize (
        cond_sel ( a, a, SQRT_ONE_MINUS_D, rotate );
        cond_sel ( x, p->y, x, rotate );
    }
    
 #else
    rotate = 0;
 #endif
    
    gf_mul ( c, a, d ); // new "osx"
    gf_mul ( a, c, p->z );
@@ -503,24 +502,45 @@ decaf_bool_t API_NS(point_decode) (
    
    gf_add ( p->z, ONE, a); /* Z = 1+s^2 */
    succ &= ~gf_eq( p->z, ZERO ); /* FUTURE: unnecessary? */
    

 #if COFACTOR == 8
    gf_mul ( a, p->z, d); /* t(1+s^2) / s(1-s^2) = 2/xy */
    succ &= ~lobit(a); /* = ~hibit(a/2), since hibit(x) = lobit(2x) */
 #endif
    
    gf_mul ( a, f, b ); /* y = (1-s^2) / t */
    gf_mul ( p->y, p->z, a ); /* Y = yZ */
    gf_add ( a, s, s );
    gf_mul(p->x, a, SQRT_MINUS_ONE); /* Curve25519 */
 #if IMAGINE_TWIST
    gf_add ( b, s, s );
    gf_mul(p->x, b, SQRT_MINUS_ONE); /* Curve25519 */
 #else
    gf_add ( p->x, s, s );
 #endif
    gf_mul ( p->t, p->x, a ); /* T = 2s (1-as^2)/t */
    
    p->y->limb[0] -= zero;
    
    /* Curve25519 */
    assert(API_NS(point_valid)(p) | ~succ);
    
    return succ;
 }

 /* Derive the constants used by the group law from EDWARDS_D:
  *   TWISTED_D - the value passed to gf_mulw_sgn in point_valid;
  *   EFF_D     - its absolute value, used as 2*EFF_D in point_add, point_sub
  *               and pt_to_pniels;
  *   NEG_D     - 1 when TWISTED_D is negative; selects the order of the
  *               gf_add_nr / gf_sub_nr pair in point_add and point_sub.
  *
  * NOTE(review): `#if TWISTED_D < 0` is evaluated by the preprocessor, which
  * replaces every identifier that is not a macro with 0.  The test sees the
  * real sign only when EDWARDS_D is a #define.  If EDWARDS_D is a C object
  * (e.g. `static const int`), the condition reduces to `(-(0)) < 0` or
  * `((0)-1) < 0` and so depends on IMAGINE_TWIST alone.
  */
 #if IMAGINE_TWIST
 #define TWISTED_D (-(EDWARDS_D))
 #else
 #define TWISTED_D ((EDWARDS_D)-1)
 #endif
 
 #if TWISTED_D < 0
 #define EFF_D (-(TWISTED_D))
 #define NEG_D 1
 #else
 #define EFF_D TWISTED_D
 #define NEG_D 0
 #endif



 void API_NS(point_sub) (
    point_t p,
    const point_t q,
@@ -534,13 +554,18 @@ void API_NS(point_sub) (
    gf_add_nr ( b, q->y, q->x );
    gf_mul ( p->y, d, b );
    gf_mul ( b, r->t, q->t );
    gf_mulw_sgn ( p->x, b, -2*EDWARDS_D );
    gf_mulw_sgn ( p->x, b, 2*EFF_D );
    gf_add_nr ( b, a, p->y );
    gf_sub_nr ( c, p->y, a );
    gf_mul ( a, q->z, r->z );
    gf_add_nr ( a, a, a );
 #if NEG_D
    gf_sub_nr ( p->y, a, p->x );
    gf_add_nr ( a, a, p->x );
 #else
    gf_add_nr ( p->y, a, p->x );
    gf_sub_nr ( a, a, p->x );
 #endif
    gf_mul ( p->z, a, p->y );
    gf_mul ( p->x, p->y, c );
    gf_mul ( p->y, a, b );
@@ -560,13 +585,18 @@ void API_NS(point_add) (
    gf_add_nr ( b, q->y, q->x );
    gf_mul ( p->y, d, b );
    gf_mul ( b, r->t, q->t );
    gf_mulw_sgn ( p->x, b, -2*EDWARDS_D );
    gf_mulw_sgn ( p->x, b, 2*EFF_D );
    gf_add_nr ( b, a, p->y );
    gf_sub_nr ( c, p->y, a );
    gf_mul ( a, q->z, r->z );
    gf_add_nr ( a, a, a );
 #if NEG_D
    gf_add_nr ( p->y, a, p->x );
    gf_sub_nr ( a, a, p->x );
 #else
    gf_sub_nr ( p->y, a, p->x );
    gf_add_nr ( a, a, p->x );
 #endif
    gf_mul ( p->z, a, p->y );
    gf_mul ( p->x, p->y, c );
    gf_mul ( p->y, a, b );
@@ -678,7 +708,7 @@ void API_NS(scalar_decode_long)(

    while (i) {
        i -= SER_BYTES;
        sc_montmul(t1,t1,sc_r2);
        sc_montmul(t1,t1,API_NS(sc_r2));
        ignore_result( API_NS(scalar_decode)(t2, ser+i) );
        API_NS(scalar_add)(t1, t1, t2);
    }
@@ -715,7 +745,7 @@ static void pt_to_pniels (
 ) {
    gf_sub ( b->n->a, a->y, a->x );
    gf_add ( b->n->b, a->x, a->y );
    gf_mulw_sgn ( b->n->c, a->t, -2*EDWARDS_D );
    gf_mulw_sgn ( b->n->c, a->t, 2*EFF_D );
    gf_add ( b->z, a->z, a->z );
 }

@@ -967,13 +997,21 @@ decaf_bool_t API_NS(point_eq) ( const point_t p, const point_t q ) {
    gf_mul ( b, q->y, p->x );
    decaf_bool_t succ = gf_eq(a,b);
    
    /* Interesting note: the 4tor would normally be rotation.
     * But because of the *i twist, it's actually
     * (x,y) <-> (iy,ix)
     */
    gf_mul ( a, p->y, q->y );
    gf_mul ( b, q->x, p->x );
    succ |= gf_eq(a,b);
    #if (COFACTOR == 8) && IMAGINE_TWIST
        gf_mul ( a, p->y, q->y );
        gf_mul ( b, q->x, p->x );
        #if !(IMAGINE_TWIST)
            gf_sub ( a, ZERO, a );
        #else
           /* Interesting note: the 4tor would normally be rotation.
            * But because of the *i twist, it's actually
            * (x,y) <-> (iy,ix)
            */
    
           /* No code, just a comment. */
        #endif
        succ |= gf_eq(a,b);
    #endif
    
    return succ;
 }
@@ -987,8 +1025,12 @@ void API_NS(point_from_hash_nonuniform) (
    gf_deser(r0,ser);
    gf_strong_reduce(r0);
    gf_sqr(a,r0);
    //gf_sub(r,ZERO,a); /*gf_mulw_sgn(r,a,QUADRATIC_NONRESIDUE);*/
        gf_mul(r,a,SQRT_MINUS_ONE);
 #if P_MOD_8 == 5
    /* r = QNR * a */
    gf_mul(r,a,SQRT_MINUS_ONE);
 #else
    gf_sub(r,ZERO,a);
 #endif
    gf_mulw_sgn(dee,ONE,EDWARDS_D);
    gf_mulw_sgn(c,r,EDWARDS_D);
    
@@ -1044,8 +1086,10 @@ void API_NS(point_from_hash_nonuniform) (
    cond_sel(b,c,ONE,gf_eq(c,ZERO)); /* 0,0 -> 1,0 */

    /* isogenize */
 #if IMAGINE_TWIST
    gf_mul(c,a,SQRT_MINUS_ONE);
    gf_cpy(a,c); // TODO rename
 #endif
    
    gf_sqr(c,a); /* s^2 */
    gf_add(a,a,a); /* 2s */
@@ -1061,7 +1105,7 @@ void API_NS(point_from_hash_nonuniform) (

 decaf_bool_t
 API_NS(invert_elligator_nonuniform) (
    unsigned char recovered_hash[DECAF_255_SER_BYTES],
    unsigned char recovered_hash[SER_BYTES],
    const point_t p,
    uint16_t hint_
 ) {
@@ -1087,17 +1131,23 @@ API_NS(invert_elligator_nonuniform) (
        
    }
    gf_mulw_sgn(d,c,2*EDWARDS_D-1); /* $d = (2d-a)s^2 */
    gf_add(a,d,b); /* num? */
    gf_add(a,b,d); /* num? */
    gf_sub(d,d,b); /* den? */
    gf_mul(b,a,d); /* n*d */
    cond_sel(a,d,a,sgn_s);
 #if P_MOD_8 == 5
    gf_mul(d,b,SQRT_MINUS_ONE);
 #else
    gf_sub(d,ZERO,b);
 #endif
    decaf_bool_t succ = gf_isqrt_chk(c,d,DECAF_TRUE);
    gf_mul(b,a,c);
    cond_neg(b, sgn_r0^hibit(b));
    
    succ &= ~(gf_eq(b,ZERO) & sgn_r0);
 #if COFACTOR == 8
    succ &= ~(is_identity & sgn_ed_T); /* NB: there are no preimages of rotated identity. */
 #endif
    
    gf_encode(recovered_hash, b); 
    /* TODO: deal with overflow flag */
@@ -1137,7 +1187,7 @@ decaf_bool_t API_NS(point_valid) (
    gf_sqr(b,p->y);
    gf_sub(a,b,a);
    gf_sqr(b,p->t);
    gf_mulw_sgn(c,b,-EDWARDS_D);
    gf_mulw_sgn(c,b,TWISTED_D);
    gf_sqr(b,p->z);
    gf_add(b,b,c);
    out &= gf_eq(a,b);
@@ -1149,18 +1199,18 @@ void API_NS(point_debugging_torque) (
    point_t q,
    const point_t p
 ) {
 #if 0
    gf_sub(q->x,ZERO,p->x);
    gf_sub(q->y,ZERO,p->y);
    gf_cpy(q->z,p->z);
    gf_cpy(q->t,p->t);
 #else
 #if COFACTOR == 8
    gf tmp;
    gf_mul(tmp,p->x,SQRT_MINUS_ONE);
    gf_mul(q->x,p->y,SQRT_MINUS_ONE);
    gf_cpy(q->y,tmp);
    gf_cpy(q->z,p->z);
    gf_sub(q->t,ZERO,p->t);
 #else
    gf_sub(q->x,ZERO,p->x);
    gf_sub(q->y,ZERO,p->y);
    gf_cpy(q->z,p->z);
    gf_cpy(q->t,p->t);
 #endif
 }

--- a/src/decaf_gen_tables.c
+++ b/src/decaf_gen_tables.c
@@ -22,8 +22,8 @@
 const gf API_NS(precomputed_base_as_fe)[1];
 const API_NS(scalar_t) API_NS(precomputed_scalarmul_adjustment);
 const API_NS(scalar_t) API_NS(point_scalarmul_adjustment);
 const API_NS(scalar_t) sc_r2 = {{{0}}};
 const decaf_word_t MONTGOMERY_FACTOR = 0;
 const API_NS(scalar_t) API_NS(sc_r2) = {{{0}}};
 const decaf_word_t API_NS(MONTGOMERY_FACTOR) = 0;
 const unsigned char base_point_ser_for_pregen[DECAF_255_SER_BYTES];

 const API_NS(point_t) API_NS(point_base);
@@ -147,7 +147,7 @@ int main(int argc, char **argv) {
    for (i=0; i<sizeof(API_NS(scalar_t))*8*2; i++) {
        API_NS(scalar_add)(smadj,smadj,smadj);
    }
    scalar_print("sc_r2", smadj);
    scalar_print("API_NS(sc_r2)", smadj);
    
    
    API_NS(scalar_sub)(smadj,API_NS(scalar_zero),API_NS(scalar_one)); /* get p-1 */
@@ -159,7 +159,7 @@ int main(int argc, char **argv) {
    for (i=0; i<6; i++) {
        w *= w*plo + 2;
    }
    printf("const decaf_word_t MONTGOMERY_FACTOR = (decaf_word_t)0x%016llxull;\n\n", w);
    printf("const decaf_word_t API_NS(MONTGOMERY_FACTOR) = (decaf_word_t)0x%016llxull;\n\n", w);
    
    return 0;
 }
--- a/src/p25519/arch_ref64/f_impl.c
+++ b/src/p25519/arch_ref64/f_impl.c
@@ -2,7 +2,7 @@
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "p25519.h"
 #include "f_impl.h"

 static __inline__ __uint128_t widemul(
    const uint64_t a,
--- a/src/p25519/arch_ref64/f_impl.h
+++ b/src/p25519/arch_ref64/f_impl.h
--- a/src/p25519/arch_x86_64/f_impl.c
+++ b/src/p25519/arch_x86_64/f_impl.c
@@ -2,7 +2,7 @@
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "p25519.h"
 #include "f_impl.h"
 #include "x86-64-arith.h"

 static inline uint64_t shr(__uint128_t x, int n) {
--- a/src/p25519/arch_x86_64/f_impl.h
+++ b/src/p25519/arch_x86_64/f_impl.h
--- a/src/p25519/f_arithmetic.c
+++ b/src/p25519/f_arithmetic.c
@@ -18,14 +18,6 @@ const gf_25519_t P25519_SQRT_MINUS_ONE = {FIELD_LITERAL(
    0x2b8324804fc1d
 )};
    
 const gf_25519_t SQRT_ONE_MINUS_D = {FIELD_LITERAL( // FIXME MAGIC goes elsewhere?
    0x6db8831bbddec,
    0x38d7b56c9c165,
    0x016b221394bdc,
    0x7540f7816214a,
    0x0a0d85b4032b1
 )};
    
 static const gf_25519_t ONE = {FIELD_LITERAL( // FIXME copy-pasted
    1,0,0,0,0
 )}; 
--- a/src/p25519/f_field.h
+++ b/src/p25519/f_field.h
@@ -12,7 +12,7 @@
 #include "constant_time.h"
 #include <string.h>

 #include "p25519.h"
 #include "f_impl.h"
 #define GF_LIT_LIMB_BITS  51
 #define GF_BITS           255
 #define gf              gf_25519_t
--- a/src/p448/arch_32/f_impl.c
+++ b/src/p448/arch_32/f_impl.c
@@ -3,7 +3,7 @@
 */

 #include "word.h"
 #include "p448.h"
 #include "f_impl.h"

 static inline mask_t __attribute__((always_inline))
 is_zero (
--- a/src/p448/arch_32/f_impl.h
+++ b/src/p448/arch_32/f_impl.h
--- a/src/p448/arch_arm_32/f_impl.c
+++ b/src/p448/arch_arm_32/f_impl.c
@@ -3,7 +3,7 @@
 */

 #include "word.h"
 #include "p448.h"
 #include "f_impl.h"

 static inline mask_t __attribute__((always_inline))
 is_zero (
--- a/src/p448/arch_arm_32/f_impl.h
+++ b/src/p448/arch_arm_32/f_impl.h
--- a/src/p448/arch_neon_experimental/f_impl.c
+++ b/src/p448/arch_neon_experimental/f_impl.c
@@ -3,7 +3,7 @@
 */

 #include "word.h"
 #include "p448.h"
 #include "f_impl.h"

 static inline mask_t __attribute__((always_inline))
 is_zero (
--- a/src/p448/arch_neon_experimental/f_impl.h
+++ b/src/p448/arch_neon_experimental/f_impl.h
--- a/src/p448/arch_ref64/f_impl.c
+++ b/src/p448/arch_ref64/f_impl.c
@@ -2,7 +2,7 @@
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "p448.h"
 #include "f_impl.h"

 static __inline__ __uint128_t widemul(
    const uint64_t a,
--- a/src/p448/arch_ref64/f_impl.h
+++ b/src/p448/arch_ref64/f_impl.h
--- a/src/p448/arch_x86_64/f_impl.c
+++ b/src/p448/arch_x86_64/f_impl.c
@@ -2,7 +2,7 @@
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "p448.h"
 #include "f_impl.h"
 #include "x86-64-arith.h"

 void
--- a/src/p448/arch_x86_64/f_impl.h
+++ b/src/p448/arch_x86_64/f_impl.h
--- a/src/p448/f_arithmetic.c
+++ b/src/p448/f_arithmetic.c
@@ -12,10 +12,10 @@

 void 
 gf_isr (
    gf_a_t a,
    const gf_a_t x
    gf a,
    const gf x
 ) {
    gf_a_t L0, L1, L2;
    gf L0, L1, L2;
    gf_sqr  (   L1,     x );
    gf_mul  (   L2,     x,   L1 );
    gf_sqr  (   L1,   L2 );
--- a/src/p448/f_field.h
+++ b/src/p448/f_field.h
@@ -12,7 +12,7 @@
 #include "constant_time.h"
 #include <string.h>

 #include "p448.h"
 #include "f_impl.h"
 #define GF_LIT_LIMB_BITS  56
 #define GF_BITS           448
 #define gf              p448_t
--- a/src/p480/arch_x86_64/f_impl.c
+++ b/src/p480/arch_x86_64/f_impl.c
@@ -2,7 +2,7 @@
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "p480.h"
 #include "f_impl.h"
 #include "x86-64-arith.h"

 void
--- a/src/p480/arch_x86_64/f_impl.h
+++ b/src/p480/arch_x86_64/f_impl.h
--- a/src/p480/f_field.h
+++ b/src/p480/f_field.h
@@ -12,7 +12,7 @@
 #include "constant_time.h"
 #include <string.h>

 #include "p480.h"
 #include "f_impl.h"
 #define GF_LIT_LIMB_BITS  60
 #define GF_BITS           480
 #define gf              p480_t
--- a/src/p521/arch_ref64/f_impl.c
+++ b/src/p521/arch_ref64/f_impl.c
@@ -2,7 +2,7 @@
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "p521.h"
 #include "f_impl.h"

 static __inline__ __uint128_t widemul(
    const uint64_t a,
--- a/src/p521/arch_ref64/f_impl.h
+++ b/src/p521/arch_ref64/f_impl.h
--- a/src/p521/arch_x86_64_r12/f_impl.c
+++ b/src/p521/arch_x86_64_r12/f_impl.c
@@ -2,7 +2,7 @@
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "p521.h"
 #include "f_impl.h"

 typedef struct {
  uint64x3_t lo, hi, hier;
--- a/src/p521/arch_x86_64_r12/f_impl.h
+++ b/src/p521/arch_x86_64_r12/f_impl.h
--- a/src/p521/f_field.h
+++ b/src/p521/f_field.h
@@ -12,7 +12,7 @@
 #include <string.h>
 #include "constant_time.h"

 #include "p521.h"
 #include "f_impl.h"
 #define GF_LIT_LIMB_BITS  58
 #define GF_BITS           521
 #define gf              p521_t