From 5af980b85a299b584062a17278835c0794b0ba45 Mon Sep 17 00:00:00 2001
From: Michael Hamburg <mike@shiftleft.org>
Date: Sat, 11 Jul 2015 23:44:20 -0700
Subject: [PATCH] wipe out the multiple layers of rename between decaf_fast and
 field.  still some serious HACKs in the include priority to avoid multiple
 definitions of struct gf

---
 src/decaf_fast.c                      | 131 ++++-------
 src/decaf_gen_tables.c                |  50 ++--
 src/include/field.h                   |  98 ++++----
 src/p25519/arch_ref64/p25519.c        |  40 ++--
 src/p25519/arch_ref64/p25519.h        | 106 ++++-----
 src/p25519/arch_x86_64/p25519.c       |  40 ++--
 src/p25519/arch_x86_64/p25519.h       | 105 +++++----
 src/p25519/arch_x86_64/x86-64-arith.h | 324 +++++++++++++++++++++++++-
 src/p25519/f_arithmetic.c             |  28 +--
 src/p25519/f_field.h                  |  31 +--
 src/p448/f_arithmetic.c               |  56 ++---
 src/p448/f_field.h                    |  28 +--
 src/p480/f_arithmetic.c               |  56 ++---
 src/p480/f_field.h                    |  28 +--
 src/p521/f_arithmetic.c               |  56 ++---
 src/p521/f_field.h                    |  28 +--
 src/public_include/decaf/decaf_255.h  |   8 +-
 17 files changed, 748 insertions(+), 465 deletions(-)
 mode change 120000 => 100644 src/p25519/arch_x86_64/x86-64-arith.h

diff --git a/src/decaf_fast.c b/src/decaf_fast.c
index 4e1baa2..d0d9d4a 100644
--- a/src/decaf_fast.c
+++ b/src/decaf_fast.c
@@ -27,8 +27,6 @@
 #define point_t decaf_255_point_t
 #define precomputed_s decaf_255_precomputed_s
 #define SER_BYTES DECAF_255_SER_BYTES
-#define gf_s gf_255_s
-#define gf gf_255_t
 
 #if WBITS == 64
 typedef __int128_t decaf_sdword_t;
@@ -72,7 +70,7 @@ typedef struct { niels_t n; gf z; } __attribute__((aligned(32))) pniels_s, pniel
 /* Precomputed base */
 struct precomputed_s { niels_t table [DECAF_COMBS_N<<(DECAF_COMBS_T-1)]; };
 
-extern const field_t API_NS(precomputed_base_as_fe)[];
+extern const gf API_NS(precomputed_base_as_fe)[];
 const precomputed_s *API_NS(precomputed_base) =
     (const precomputed_s *) &API_NS(precomputed_base_as_fe);
 
@@ -95,52 +93,6 @@ const size_t API_NS2(alignof,precomputed_s) = 32;
 /** Copy x = y */
 siv gf_cpy(gf x, const gf y) { x[0] = y[0]; }
 
-/** Mostly-unoptimized multiply, but at least it's unrolled. */
-siv gf_mul (gf c, const gf a, const gf b) {
-    field_mul((field_t *)c, (const field_t *)a, (const field_t *)b);
-}
-
-/** Dedicated square */
-siv gf_sqr (gf c, const gf a) {
-    field_sqr((field_t *)c, (const field_t *)a);
-}
-
-/** Add mod p.  Conservatively always weak-reduce. */
-snv gf_add ( gf_s *__restrict__ c, const gf a, const gf b ) {
-    field_add((field_t *)c, (const field_t *)a, (const field_t *)b);
-}
-
-/** Subtract mod p.  Conservatively always weak-reduce. */
-snv gf_sub ( gf c, const gf a, const gf b ) {
-    field_sub((field_t *)c, (const field_t *)a, (const field_t *)b);
-}
-
-/** Add mod p.  Conservatively always weak-reduce.) */
-siv gf_bias ( gf c, int amt) {
-    field_bias((field_t *)c, amt);
-}
-
-/** Subtract mod p.  Bias by 2 and don't reduce  */
-siv gf_sub_nr ( gf_s *__restrict__ c, const gf a, const gf b ) {
-//    FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] );
-    field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
-    gf_bias(c, 2);
-    if (WBITS==32) field_weak_reduce((field_t*) c); // HACK
-}
-
-/** Subtract mod p. Bias by amt but don't reduce.  */
-siv gf_sub_nr_x ( gf c, const gf a, const gf b, int amt ) {
-    field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
-    gf_bias(c, amt);
-    if (WBITS==32) field_weak_reduce((field_t*) c); // HACK
-}
-
-/** Add mod p.  Don't reduce. */
-siv gf_add_nr ( gf c, const gf a, const gf b ) {
-//    FOR_LIMB_U(i, c->limb[i] = a->limb[i] + b->limb[i]);
-    field_add_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
-}
-
 /** Constant time, x = is_z ? z : y */
 siv cond_sel(gf x, const gf y, const gf z, decaf_bool_t is_z) {
     constant_time_select(x,z,y,sizeof(gf),is_z);
@@ -162,29 +114,11 @@ siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) {
     });
 }
 
-/**
- * Mul by signed int.  Not constant-time WRT the sign of that int.
- * Just uses a full mul (PERF)
- */
-siv gf_mlw(gf c, const gf a, int w) {
-    if (w>0) {
-        field_mulw((field_t *)c, (const field_t *)a, w);
-    } else {
-        field_mulw((field_t *)c, (const field_t *)a, -w);
-        gf_sub(c,ZERO,c);
-    }
-}
-
-/** Canonicalize */
-siv gf_canon ( gf a ) {
-    field_strong_reduce((field_t *)a);
-}
-
 /** Compare a==b */
 static decaf_word_t __attribute__((noinline)) gf_eq(const gf a, const gf b) {
     gf c;
     gf_sub(c,a,b);
-    gf_canon(c);
+    gf_strong_reduce(c);
     decaf_word_t ret=0;
     FOR_LIMB(i, ret |= c->limb[i] );
     /* Hope the compiler is too dumb to optimize this, thus noinline */
@@ -194,7 +128,7 @@ static decaf_word_t __attribute__((noinline)) gf_eq(const gf a, const gf b) {
 /** Inverse square root using addition chain. */
 static decaf_bool_t gf_isqrt_chk(gf y, const gf x, decaf_bool_t allow_zero) {
     gf tmp0, tmp1;
-    field_isr((field_t *)y, (const field_t *)x);
+    gf_isr((gf_s *)y, (const gf_s *)x);
     gf_sqr(tmp0,y);
     gf_mul(tmp1,tmp0,x);
     return gf_eq(tmp1,ONE) | (allow_zero & gf_eq(tmp1,ZERO));
@@ -211,11 +145,24 @@ sv gf_invert(gf y, const gf x) {
     gf_cpy(y, t2);
 }
 
+/**
+ * Multiply by a signed int: c = a*w.  Not constant-time WRT the sign of w,
+ * because it branches on that sign.  Original PERF note: "just uses a full mul".
+ */
+static inline void gf_mulw_sgn(gf c, const gf a, int w) {
+    if (w>0) {
+        gf_mulw(c, a, w);
+    } else {
+        gf_mulw(c, a, -w); /* multiply by the magnitude; w == 0 also takes this branch */
+        gf_sub(c,ZERO,c); /* then negate the product: c = 0 - c */
+    }
+}
+
 /** Return high bit of x = low bit of 2x mod p */
 static decaf_word_t hibit(const gf x) {
     gf y;
     gf_add(y,x,x);
-    gf_canon(y);
+    gf_strong_reduce(y);
     return -(y->limb[0]&1);
 }
 
@@ -223,7 +170,7 @@ static decaf_word_t hibit(const gf x) {
 static decaf_word_t lobit(const gf x) {
     gf y;
     gf_cpy(y,x);
-    gf_canon(y);
+    gf_strong_reduce(y);
     return -(y->limb[0]&1);
 }
 
@@ -454,7 +401,7 @@ decaf_bool_t API_NS(scalar_eq) (
 const point_t API_NS(point_identity) = {{{{{0}}},{{{1}}},{{{1}}},{{{0}}}}};
 
 static void gf_encode ( unsigned char ser[SER_BYTES], gf a ) {
-    field_serialize(ser, (field_t *)a);
+    gf_serialize(ser, (gf_s *)a);
 }
  
 extern const gf SQRT_MINUS_ONE, SQRT_ONE_MINUS_D; /* Intern this? */
@@ -528,7 +475,7 @@ void API_NS(point_encode)( unsigned char ser[SER_BYTES], const point_t p ) {
  * Deserialize a bool, return TRUE if < p.
  */
 static decaf_bool_t gf_deser(gf s, const unsigned char ser[SER_BYTES]) {
-    return field_deserialize((field_t *)s, ser);
+    return gf_deserialize((gf_s *)s, ser);
 }
    
 decaf_bool_t API_NS(point_decode) (
@@ -544,7 +491,7 @@ decaf_bool_t API_NS(point_decode) (
     gf_sub ( f, ONE, a ); /* f = 1-s^2 = 1-as^2 since a=1 */
     succ &= ~ gf_eq( f, ZERO );
     gf_sqr ( b, f ); 
-    gf_mlw ( c, a, 4-4*EDWARDS_D ); 
+    gf_mulw_sgn ( c, a, 4-4*EDWARDS_D ); 
     gf_add ( c, c, b ); /* t^2 */
     gf_mul ( d, f, s ); /* s(1-s^2) for denoms */
     gf_sqr ( e, d );
@@ -596,7 +543,7 @@ void API_NS(point_sub) (
     gf_add_nr ( b, q->y, q->x );
     gf_mul ( p->y, d, b );
     gf_mul ( b, r->t, q->t );
-    gf_mlw ( p->x, b, -2*EDWARDS_D );
+    gf_mulw_sgn ( p->x, b, -2*EDWARDS_D );
     gf_add_nr ( b, a, p->y );
     gf_sub_nr ( c, p->y, a );
     gf_mul ( a, q->z, r->z );
@@ -622,7 +569,7 @@ void API_NS(point_add) (
     gf_add_nr ( b, q->y, q->x );
     gf_mul ( p->y, d, b );
     gf_mul ( b, r->t, q->t );
-    gf_mlw ( p->x, b, -2*EDWARDS_D );
+    gf_mulw_sgn ( p->x, b, -2*EDWARDS_D );
     gf_add_nr ( b, a, p->y );
     gf_sub_nr ( c, p->y, a );
     gf_mul ( a, q->z, r->z );
@@ -646,11 +593,11 @@ snv point_double_internal (
     gf_add_nr ( d, c, a );
     gf_add_nr ( p->t, q->y, q->x );
     gf_sqr ( b, p->t );
-    gf_sub_nr_x ( b, b, d, 3 );
+    gf_subx_nr ( b, b, d, 3 );
     gf_sub_nr ( p->t, a, c );
     gf_sqr ( p->x, q->z );
     gf_add_nr ( p->z, p->x, p->x );
-    gf_sub_nr_x ( a, p->z, p->t, 4 );
+    gf_subx_nr ( a, p->z, p->t, 4 );
     gf_mul ( p->x, a, b );
     gf_mul ( p->z, p->t, a );
     gf_mul ( p->y, p->t, d );
@@ -777,7 +724,7 @@ static void pt_to_pniels (
 ) {
     gf_sub ( b->n->a, a->y, a->x );
     gf_add ( b->n->b, a->x, a->y );
-    gf_mlw ( b->n->c, a->t, -2*EDWARDS_D );
+    gf_mulw_sgn ( b->n->c, a->t, -2*EDWARDS_D );
     gf_add ( b->z, a->z, a->z );
 }
 
@@ -1047,12 +994,12 @@ void API_NS(point_from_hash_nonuniform) (
     // TODO: simplify since we don't return a hint anymore
     gf r0,r,a,b,c,dee,D,N,rN,e;
     gf_deser(r0,ser);
-    gf_canon(r0);
+    gf_strong_reduce(r0);
     gf_sqr(a,r0);
-    //gf_sub(r,ZERO,a); /*gf_mlw(r,a,QUADRATIC_NONRESIDUE);*/
+    //gf_sub(r,ZERO,a); /*gf_mulw_sgn(r,a,QUADRATIC_NONRESIDUE);*/
         gf_mul(r,a,SQRT_MINUS_ONE);
-    gf_mlw(dee,ONE,EDWARDS_D);
-    gf_mlw(c,r,EDWARDS_D);
+    gf_mulw_sgn(dee,ONE,EDWARDS_D);
+    gf_mulw_sgn(c,r,EDWARDS_D);
     
     /* Compute D := (dr+a-d)(dr-ar-d) with a=1 */
     gf_sub(a,c,dee);
@@ -1064,7 +1011,7 @@ void API_NS(point_from_hash_nonuniform) (
     
     /* compute N := (r+1)(a-2d) */
     gf_add(a,r,ONE);
-    gf_mlw(N,a,1-2*EDWARDS_D);
+    gf_mulw_sgn(N,a,1-2*EDWARDS_D);
     
     /* e = +-1/sqrt(+-ND) */
     gf_mul(rN,r,N);
@@ -1078,8 +1025,8 @@ void API_NS(point_from_hash_nonuniform) (
     /* b <- t/s */
     cond_sel(c,r0,r,square); /* r? = sqr ? r : 1 */
     /* In two steps to avoid overflow on 32-bit arch */
-    gf_mlw(a,c,1-2*EDWARDS_D);
-    gf_mlw(b,a,1-2*EDWARDS_D);
+    gf_mulw_sgn(a,c,1-2*EDWARDS_D);
+    gf_mulw_sgn(b,a,1-2*EDWARDS_D);
     gf_sub(c,r,ONE);
     gf_mul(a,b,c); /* = r? * (r-1) * (a-2d)^2 with a=1 */
     gf_mul(b,a,e);
@@ -1148,7 +1095,7 @@ API_NS(invert_elligator_nonuniform) (
         cond_sel(b,b,ZERO,is_identity & ~sgn_t_over_s & ~sgn_s); /* identity adjust */
         
     }
-    gf_mlw(d,c,2*EDWARDS_D-1); /* $d = (2d-a)s^2 */
+    gf_mulw_sgn(d,c,2*EDWARDS_D-1); /* $d = (2d-a)s^2 */
     gf_add(a,d,b); /* num? */
     gf_sub(d,d,b); /* den? */
     gf_mul(b,a,d); /* n*d */
@@ -1199,7 +1146,7 @@ decaf_bool_t API_NS(point_valid) (
     gf_sqr(b,p->y);
     gf_sub(a,b,a);
     gf_sqr(b,p->t);
-    gf_mlw(c,b,-EDWARDS_D);
+    gf_mulw_sgn(c,b,-EDWARDS_D);
     gf_sqr(b,p->z);
     gf_add(b,b,c);
     out &= gf_eq(a,b);
@@ -1281,15 +1228,15 @@ static void batch_normalize_niels (
 
     for (i=0; i<n; i++) {
         gf_mul(product, table[i]->a, zis[i]);
-        gf_canon(product);
+        gf_strong_reduce(product);
         gf_cpy(table[i]->a, product);
         
         gf_mul(product, table[i]->b, zis[i]);
-        gf_canon(product);
+        gf_strong_reduce(product);
         gf_cpy(table[i]->b, product);
         
         gf_mul(product, table[i]->c, zis[i]);
-        gf_canon(product);
+        gf_strong_reduce(product);
         gf_cpy(table[i]->c, product);
     }
 }
@@ -1510,7 +1457,7 @@ sv prepare_wnaf_table(
     }
 }
 
-extern const field_t API_NS(precomputed_wnaf_as_fe)[];
+extern const gf API_NS(precomputed_wnaf_as_fe)[];
 static const niels_t *API_NS(wnaf_base) = (const niels_t *)API_NS(precomputed_wnaf_as_fe);
 const size_t API_NS2(sizeof,precomputed_wnafs) __attribute((visibility("hidden")))
     = sizeof(niels_t)<<DECAF_WNAF_FIXED_TABLE_BITS;
diff --git a/src/decaf_gen_tables.c b/src/decaf_gen_tables.c
index 67c5043..b23e1c7 100644
--- a/src/decaf_gen_tables.c
+++ b/src/decaf_gen_tables.c
@@ -19,7 +19,7 @@
 #define API_NS2(_pref,_id) _pref##_decaf_255_##_id
 
  /* To satisfy linker. */
-const field_t API_NS(precomputed_base_as_fe)[1];
+const gf API_NS(precomputed_base_as_fe)[1];
 const API_NS(scalar_t) API_NS(precomputed_scalarmul_adjustment);
 const API_NS(scalar_t) API_NS(point_scalarmul_adjustment);
 const API_NS(scalar_t) sc_r2 = {{{0}}};
@@ -29,7 +29,7 @@ const unsigned char base_point_ser_for_pregen[DECAF_255_SER_BYTES];
 const API_NS(point_t) API_NS(point_base);
 
 struct niels_s;
-const field_t *API_NS(precomputed_wnaf_as_fe);
+const gf_s *API_NS(precomputed_wnaf_as_fe);
 extern const size_t API_NS2(sizeof,precomputed_wnafs);
 
 void API_NS(precompute_wnafs) (
@@ -48,26 +48,26 @@ static void scalar_print(const char *name, const API_NS(scalar_t) sc) {
     printf("}}};\n\n");
 }
 
-static void field_print(const field_t *f) {
-    const int FIELD_SER_BYTES = (FIELD_BITS + 7) / 8;
-    unsigned char ser[FIELD_SER_BYTES];
-    field_serialize(ser,f);
+static void field_print(const gf f) {
+    const int GF_SER_BYTES = (GF_BITS + 7) / 8;
+    unsigned char ser[GF_SER_BYTES];
+    gf_serialize(ser,f);
     int b=0, i, comma=0;
     unsigned long long limb = 0;
-    printf("FIELD_LITERAL(");
-    for (i=0; i<FIELD_SER_BYTES; i++) {
+    printf("{FIELD_LITERAL(");
+    for (i=0; i<GF_SER_BYTES; i++) {
         limb |= ((uint64_t)ser[i])<<b;
         b += 8;
-        if (b >= FIELD_LIT_LIMB_BITS) {
-            limb &= (1ull<<FIELD_LIT_LIMB_BITS) -1;
-            b -= FIELD_LIT_LIMB_BITS;
+        if (b >= GF_LIT_LIMB_BITS) {
+            limb &= (1ull<<GF_LIT_LIMB_BITS) -1;
+            b -= GF_LIT_LIMB_BITS;
             if (comma) printf(",");
             comma = 1;
             printf("0x%016llx", limb);
             limb = ((uint64_t)ser[i])>>(8-b);
         }
     }
-    printf(")");
+    printf(")}");
     assert(b<8);
 }
 
@@ -88,41 +88,39 @@ int main(int argc, char **argv) {
     if (ret || !preWnaf) return 1;
     API_NS(precompute_wnafs)(preWnaf, real_point_base);
 
-    const field_t *output;
+    const gf_s *output;
     unsigned i;
     
     printf("/** @warning: this file was automatically generated. */\n");
+    printf("#include <decaf.h>\n\n");
     printf("#include \"field.h\"\n\n");
-    printf("#include \"decaf.h\"\n\n");
     printf("#define API_NS(_id) decaf_255_##_id\n");
     printf("#define API_NS2(_pref,_id) _pref##_decaf_255_##_id\n");
     
-    output = (const field_t *)real_point_base;
+    output = (const gf_s *)real_point_base;
     printf("const API_NS(point_t) API_NS(point_base) = {{\n");
-    for (i=0; i < sizeof(API_NS(point_t)); i+=sizeof(field_t)) {
+    for (i=0; i < sizeof(API_NS(point_t)); i+=sizeof(gf)) {
         if (i) printf(",\n  ");
-        printf("{");
         field_print(output++);
-        printf("}");
     }
     printf("\n}};\n");
     
-    output = (const field_t *)pre;
-    printf("const field_t API_NS(precomputed_base_as_fe)[%d]\n", 
-        (int)(API_NS2(sizeof,precomputed_s) / sizeof(field_t)));
+    output = (const gf_s *)pre;
+    printf("const gf API_NS(precomputed_base_as_fe)[%d]\n", 
+        (int)(API_NS2(sizeof,precomputed_s) / sizeof(gf)));
     printf("__attribute__((aligned(%d),visibility(\"hidden\"))) = {\n  ", (int)API_NS2(alignof,precomputed_s));
     
-    for (i=0; i < API_NS2(sizeof,precomputed_s); i+=sizeof(field_t)) {
+    for (i=0; i < API_NS2(sizeof,precomputed_s); i+=sizeof(gf)) {
         if (i) printf(",\n  ");
         field_print(output++);
     }
     printf("\n};\n");
     
-    output = (const field_t *)preWnaf;
-    printf("const field_t API_NS(precomputed_wnaf_as_fe)[%d]\n", 
-        (int)(API_NS2(sizeof,precomputed_wnafs) / sizeof(field_t)));
+    output = (const gf_s *)preWnaf;
+    printf("const gf API_NS(precomputed_wnaf_as_fe)[%d]\n", 
+        (int)(API_NS2(sizeof,precomputed_wnafs) / sizeof(gf)));
     printf("__attribute__((aligned(%d),visibility(\"hidden\"))) = {\n  ", (int)API_NS2(alignof,precomputed_s));
-    for (i=0; i < API_NS2(sizeof,precomputed_wnafs); i+=sizeof(field_t)) {
+    for (i=0; i < API_NS2(sizeof,precomputed_wnafs); i+=sizeof(gf)) {
         if (i) printf(",\n  ");
         field_print(output++);
     }
diff --git a/src/include/field.h b/src/include/field.h
index d5c8fbc..1012416 100644
--- a/src/include/field.h
+++ b/src/include/field.h
@@ -1,23 +1,20 @@
 /**
  * @file field.h
- * @brief Generic field header.
+ * @brief Generic gf header.
  * @copyright
  *   Copyright (c) 2014 Cryptography Research, Inc.  \n
  *   Released under the MIT License.  See LICENSE.txt for license information.
  * @author Mike Hamburg
  */
 
-#ifndef __FIELD_H__
-#define __FIELD_H__
+#ifndef __GF_H__
+#define __GF_H__
 
 #include "constant_time.h"
 #include "f_field.h"
 #include <string.h>
 
-typedef struct field_t field_a_t[1];
-#define field_a_restrict_t struct field_t *__restrict__
-
-#define is32 (GOLDI_BITS == 32 || FIELD_BITS != 448)
+#define is32 (GOLDI_BITS == 32 || GF_BITS != 448)
 #if (is32)
 #define IF32(s) (s)
 #else
@@ -33,9 +30,9 @@ typedef struct field_t field_a_t[1];
  * If x=0, returns 0.
  */
 void
-field_isr (
-    field_a_t       a,
-    const field_a_t x
+gf_isr (
+    gf       a,
+    const gf x
 );
     
 /**
@@ -43,62 +40,75 @@ field_isr (
  */
 static __inline__ void
 __attribute__((unused,always_inline))
-field_sqrn (
-    field_a_restrict_t y,
-    const field_a_t x,
+gf_sqrn (
+    gf_s *__restrict__ y,
+    const gf x,
     int n
 ) {
-    field_a_t tmp;
+    gf tmp;
     assert(n>0);
     if (n&1) {
-        field_sqr(y,x);
+        gf_sqr(y,x);
         n--;
     } else {
-        field_sqr(tmp,x);
-        field_sqr(y,tmp);
+        gf_sqr(tmp,x);
+        gf_sqr(y,tmp);
         n-=2;
     }
     for (; n; n-=2) {
-        field_sqr(tmp,y);
-        field_sqr(y,tmp);
+        gf_sqr(tmp,y);
+        gf_sqr(y,tmp);
     }
 }
 
 static __inline__ void
-field_subx_RAW (
-    field_a_t d,
-    const field_a_t a,
-    const field_a_t b
+gf_subx_RAW (
+    gf d,
+    const gf a,
+    const gf b
 ) {
-    field_sub_RAW ( d, a, b );
-    field_bias( d, 2 );
-    IF32( field_weak_reduce ( d ) );
+    gf_sub_RAW ( d, a, b );
+    gf_bias( d, 2 );
+    IF32( gf_weak_reduce ( d ) );
 }
 
 static __inline__ void
-field_sub (
-    field_a_t d,
-    const field_a_t a,
-    const field_a_t b
+gf_sub (
+    gf d,
+    const gf a,
+    const gf b
 ) {
-    field_sub_RAW ( d, a, b );
-    field_bias( d, 2 );
-    field_weak_reduce ( d );
+    gf_sub_RAW ( d, a, b );
+    gf_bias( d, 2 );
+    gf_weak_reduce ( d );
 }
 
 static __inline__ void
-field_add (
-    field_a_t d,
-    const field_a_t a,
-    const field_a_t b
+gf_add (
+    gf d,
+    const gf a,
+    const gf b
 ) {
-    field_add_RAW ( d, a, b );
-    field_weak_reduce ( d );
+    gf_add_RAW ( d, a, b );
+    gf_weak_reduce ( d );
+}
+
+#define gf_add_nr gf_add_RAW
+
+/** Subtract mod p: c = a - b.  Biases by 2; weak-reduces only on 32-bit words. */
+static inline void gf_sub_nr ( gf c, const gf a, const gf b ) {
+//    FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] );
+    gf_sub_RAW(c,a,b); /* arch-specific raw limb-wise subtraction */
+    gf_bias(c, 2); /* arch-specific bias; presumably keeps limbs from underflowing */
+    if (DECAF_WORD_BITS==32) gf_weak_reduce(c); // HACK: reduction is only done for 32-bit words
+}
+
+/** Subtract mod p: c = a - b.  Biases by caller-chosen amt; weak-reduces only on 32-bit words. */
+static inline void gf_subx_nr ( gf c, const gf a, const gf b, int amt ) {
+    gf_sub_RAW(c,a,b); /* arch-specific raw limb-wise subtraction */
+    gf_bias(c, amt); /* same as gf_sub_nr, but the bias amount is a parameter */
+    if (DECAF_WORD_BITS==32) gf_weak_reduce(c); // HACK: reduction is only done for 32-bit words
+}
 
-/* FIXME: no warnings on RAW routines */
-#define field_add_nr field_add_RAW
-#define field_sub_nr field_sub_RAW
-#define field_subx_nr field_subx_RAW
 
-#endif // __FIELD_H__
+#endif // __GF_H__
diff --git a/src/p25519/arch_ref64/p25519.c b/src/p25519/arch_ref64/p25519.c
index 37cedb0..4381188 100644
--- a/src/p25519/arch_ref64/p25519.c
+++ b/src/p25519/arch_ref64/p25519.c
@@ -17,10 +17,10 @@ static __inline__ uint64_t is_zero(uint64_t a) {
 }
 
 void
-p255_mul (
-    p255_t *__restrict__ cs,
-    const p255_t *as,
-    const p255_t *bs
+gf_25519_mul ( /* cs = as * bs; cs must not alias either input */
+    gf_25519_s *__restrict__ cs, /* restrict needs a real pointer type, not the array typedef */
+    const gf_25519_t as,
+    const gf_25519_t bs
 ) {
     const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
     
@@ -52,9 +52,9 @@ p255_mul (
 }
 
 void
-p255_mulw (
-    p255_t *__restrict__ cs,
-    const p255_t *as,
+gf_25519_mulw ( /* cs = as * b for a single-word multiplier b */
+    gf_25519_s *__restrict__ cs, /* pointer form, matching the prototype in p25519.h */
+    const gf_25519_t as,
     uint64_t b
 ) {
     const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
@@ -79,16 +79,16 @@ p255_mulw (
 }
 
 void
-p255_sqr (
-    p255_t *__restrict__ cs,
-    const p255_t *as
+gf_25519_sqr ( /* cs = as^2; the body simply calls the general multiply */
+    gf_25519_s *__restrict__ cs, /* pointer form, matching the prototype in p25519.h */
+    const gf_25519_t as
 ) {
-    p255_mul(cs,as,as); // TODO
+    gf_25519_mul(cs,as,as); // TODO
 }
 
 void
-p255_strong_reduce (
-    p255_t *a
+gf_25519_strong_reduce (
+    gf_25519_t a /* reduced in place */
 ) {
     uint64_t mask = (1ull<<51)-1;
 
@@ -128,14 +128,14 @@ p255_strong_reduce (
 }
 
 void
-p255_serialize (
+gf_25519_serialize ( /* write x to serial[] as 32 bytes after strong reduction */
     uint8_t serial[32],
-    const struct p255_t *x
+    const gf_25519_t x
 ) {
     int i,j;
-    p255_t red;
-    p255_copy(&red, x);
-    p255_strong_reduce(&red);
+    gf_25519_s red; /* plain struct, so the unchanged `red.limb` line below still compiles */
+    gf_25519_copy(&red, x); /* work on a copy; the input is const */
+    gf_25519_strong_reduce(&red);
     uint64_t *r = red.limb;
     uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
     for (i=0; i<4; i++) {
@@ -147,8 +147,8 @@ p255_serialize (
 }
 
 mask_t
-p255_deserialize (
-    p255_t *x,
+gf_25519_deserialize (
+    gf_25519_t x,
     const uint8_t serial[32]
 ) {
     int i,j;
diff --git a/src/p25519/arch_ref64/p25519.h b/src/p25519/arch_ref64/p25519.h
index be64923..12e9c52 100644
--- a/src/p25519/arch_ref64/p25519.h
+++ b/src/p25519/arch_ref64/p25519.h
@@ -1,8 +1,8 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
  * Released under the MIT License.  See LICENSE.txt for license information.
  */
-#ifndef __P255_H__
-#define __P255_H__ 1
+#ifndef __P25519_H__
+#define __P25519_H__ 1
 
 #include <stdint.h>
 #include <assert.h>
@@ -10,9 +10,9 @@
 
 #include "word.h"
 
-typedef struct p255_t {
+typedef struct gf_25519_s {
   uint64_t limb[5];
-} p255_t;
+} gf_25519_s, gf_25519_t[1];
 
 #define LBITS 51
 #define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}
@@ -32,113 +32,113 @@ extern "C" {
 #endif
 
 static __inline__ void
-p255_add_RAW (
-    p255_t *out,
-    const p255_t *a,
-    const p255_t *b
+gf_25519_add_RAW (
+    gf_25519_t out,
+    const gf_25519_t a,
+    const gf_25519_t b
 ) __attribute__((unused));
              
 static __inline__ void
-p255_sub_RAW (
-    p255_t *out,
-    const p255_t *a,
-    const p255_t *b
+gf_25519_sub_RAW (
+    gf_25519_t out,
+    const gf_25519_t a,
+    const gf_25519_t b
 ) __attribute__((unused));
              
 static __inline__ void
-p255_copy (
-    p255_t *out,
-    const p255_t *a
+gf_25519_copy (
+    gf_25519_t out,
+    const gf_25519_t a
 ) __attribute__((unused));
              
 static __inline__ void
-p255_weak_reduce (
-    p255_t *inout
+gf_25519_weak_reduce (
+    gf_25519_t inout
 ) __attribute__((unused));
              
 void
-p255_strong_reduce (
-    p255_t *inout
+gf_25519_strong_reduce (
+    gf_25519_t inout
 );
 
 static __inline__ void
-p255_bias (
-    p255_t *inout,
+gf_25519_bias (
+    gf_25519_t inout,
     int amount
 ) __attribute__((unused));
          
 void
-p255_mul (
-    p255_t *__restrict__ out,
-    const p255_t *a,
-    const p255_t *b
+gf_25519_mul (
+    gf_25519_s *__restrict__ out,
+    const gf_25519_t a,
+    const gf_25519_t b
 );
 
 void
-p255_mulw (
-    p255_t *__restrict__ out,
-    const p255_t *a,
+gf_25519_mulw (
+    gf_25519_s *__restrict__ out,
+    const gf_25519_t a,
     uint64_t b
 );
 
 void
-p255_sqr (
-    p255_t *__restrict__ out,
-    const p255_t *a
+gf_25519_sqr (
+    gf_25519_s *__restrict__ out,
+    const gf_25519_t a
 );
 
 void
-p255_serialize (
+gf_25519_serialize (
     uint8_t serial[32],
-    const struct p255_t *x
+    const gf_25519_t x
 );
 
 mask_t
-p255_deserialize (
-    p255_t *x,
+gf_25519_deserialize (
+    gf_25519_t x,
     const uint8_t serial[32]
 );
 
 /* -------------- Inline functions begin here -------------- */
 
 void
-p255_add_RAW (
-    p255_t *out,
-    const p255_t *a,
-    const p255_t *b
+gf_25519_add_RAW (
+    gf_25519_t out,
+    const gf_25519_t a,
+    const gf_25519_t b
 ) {
     unsigned int i;
     for (i=0; i<5; i++) {
         out->limb[i] = a->limb[i] + b->limb[i];
     }
-    p255_weak_reduce(out);
+    gf_25519_weak_reduce(out);
 }
 
 void
-p255_sub_RAW (
-    p255_t *out,
-    const p255_t *a,
-    const p255_t *b
+gf_25519_sub_RAW (
+    gf_25519_t out,
+    const gf_25519_t a,
+    const gf_25519_t b
 ) {
     unsigned int i;
     uint64_t co1 = ((1ull<<51)-1)*2, co2 = co1-36;
     for (i=0; i<5; i++) {
         out->limb[i] = a->limb[i] - b->limb[i] + ((i==0) ? co2 : co1);
     }
-    p255_weak_reduce(out);
+    gf_25519_weak_reduce(out);
 }
 
 void
-p255_copy (
-    p255_t *out,
-    const p255_t *a
+gf_25519_copy (
+    gf_25519_t out,
+    const gf_25519_t a
 ) {
     memcpy(out,a,sizeof(*a));
 }
 
 void
-p255_bias (
-    p255_t *a,
+gf_25519_bias (
+    gf_25519_t a,
     int amt
 ) {
     (void) a;
@@ -146,8 +146,8 @@ p255_bias (
 }
 
 void
-p255_weak_reduce (
-    p255_t *a
+gf_25519_weak_reduce (
+    gf_25519_t a
 ) {
     uint64_t mask = (1ull<<51) - 1;
     uint64_t tmp = a->limb[4] >> 51;
@@ -162,4 +162,4 @@ p255_weak_reduce (
 }; /* extern "C" */
 #endif
 
-#endif /* __P255_H__ */
+#endif /* __P25519_H__ */
diff --git a/src/p25519/arch_x86_64/p25519.c b/src/p25519/arch_x86_64/p25519.c
index 464522c..0e09086 100644
--- a/src/p25519/arch_x86_64/p25519.c
+++ b/src/p25519/arch_x86_64/p25519.c
@@ -10,10 +10,10 @@ static inline uint64_t shr(__uint128_t x, int n) {
 }
 
 void
-p255_mul (
-    p255_t *__restrict__ cs,
-    const p255_t *as,
-    const p255_t *bs
+gf_25519_mul (
+    gf_25519_s *__restrict__ cs,
+    const gf_25519_t as,
+    const gf_25519_t bs
 ) {
     const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
     uint64_t *c = cs->limb;
@@ -92,9 +92,9 @@ p255_mul (
 }
 
 void
-p255_sqr (
-    p255_t *__restrict__ cs,
-    const p255_t *as
+gf_25519_sqr (
+    gf_25519_s *__restrict__ cs,
+    const gf_25519_t as
 ) {
     const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
     uint64_t *c = cs->limb;
@@ -156,9 +156,9 @@ p255_sqr (
 }
 
 void
-p255_mulw (
-    p255_t *__restrict__ cs,
-    const p255_t *as,
+gf_25519_mulw (
+    gf_25519_s *__restrict__ cs,
+    const gf_25519_t as,
     uint64_t b
 ) {
     const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
@@ -191,8 +191,8 @@ p255_mulw (
 }
 
 void
-p255_strong_reduce (
-    p255_t *a
+gf_25519_strong_reduce (
+    gf_25519_t a
 ) {
     uint64_t mask = (1ull<<51)-1;
 
@@ -232,15 +232,15 @@ p255_strong_reduce (
 }
 
 void
-p255_serialize (
+gf_25519_serialize (
     uint8_t serial[32],
-    const struct p255_t *x
+    const gf_25519_t x
 ) {
     int i,j;
-    p255_t red;
-    p255_copy(&red, x);
-    p255_strong_reduce(&red);
-    uint64_t *r = red.limb;
+    gf_25519_t red;
+    gf_25519_copy(red, x);
+    gf_25519_strong_reduce(red);
+    uint64_t *r = red->limb;
     uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
     for (i=0; i<4; i++) {
         for (j=0; j<8; j++) {
@@ -251,8 +251,8 @@ p255_serialize (
 }
 
 mask_t
-p255_deserialize (
-    p255_t *x,
+gf_25519_deserialize (
+    gf_25519_t x,
     const uint8_t serial[32]
 ) {
     int i,j;
diff --git a/src/p25519/arch_x86_64/p25519.h b/src/p25519/arch_x86_64/p25519.h
index 4106fcc..203b89a 100644
--- a/src/p25519/arch_x86_64/p25519.h
+++ b/src/p25519/arch_x86_64/p25519.h
@@ -1,8 +1,8 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
  * Released under the MIT License.  See LICENSE.txt for license information.
  */
-#ifndef __P255_H__
-#define __P255_H__ 1
+#ifndef __P25519_H__
+#define __P25519_H__ 1
 
 #include <stdint.h>
 #include <assert.h>
@@ -10,9 +10,12 @@
 
 #include "word.h"
 
-typedef struct p255_t {
+#ifndef __DECAF_255_H__ // HACK FIXME
+#define DECAF_WORD_BITS 64
+typedef struct gf_25519_s {
   uint64_t limb[5];
-} p255_t;
+} gf_25519_s, gf_25519_t[1];
+#endif
 
 #define LBITS 51
 #define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}
@@ -32,80 +35,80 @@ extern "C" {
 #endif
 
 static __inline__ void
-p255_add_RAW (
-    p255_t *out,
-    const p255_t *a,
-    const p255_t *b
+gf_25519_add_RAW (
+    gf_25519_t out,
+    const gf_25519_t a,
+    const gf_25519_t b
 ) __attribute__((unused));
              
 static __inline__ void
-p255_sub_RAW (
-    p255_t *out,
-    const p255_t *a,
-    const p255_t *b
+gf_25519_sub_RAW (
+    gf_25519_t out,
+    const gf_25519_t a,
+    const gf_25519_t b
 ) __attribute__((unused));
              
 static __inline__ void
-p255_copy (
-    p255_t *out,
-    const p255_t *a
+gf_25519_copy (
+    gf_25519_t out,
+    const gf_25519_t a
 ) __attribute__((unused));
              
 static __inline__ void
-p255_weak_reduce (
-    p255_t *inout
+gf_25519_weak_reduce (
+    gf_25519_t inout
 ) __attribute__((unused));
              
 void
-p255_strong_reduce (
-    p255_t *inout
+gf_25519_strong_reduce (
+    gf_25519_t inout
 );
 
 static __inline__ void
-p255_bias (
-    p255_t *inout,
+gf_25519_bias (
+    gf_25519_t inout,
     int amount
 ) __attribute__((unused));
          
 void
-p255_mul (
-    p255_t *__restrict__ out,
-    const p255_t *a,
-    const p255_t *b
+gf_25519_mul (
+    gf_25519_s *__restrict__ out,
+    const gf_25519_t a,
+    const gf_25519_t b
 );
 
 void
-p255_mulw (
-    p255_t *__restrict__ out,
-    const p255_t *a,
+gf_25519_mulw (
+    gf_25519_s *__restrict__ out,
+    const gf_25519_t a,
     uint64_t b
 );
 
 void
-p255_sqr (
-    p255_t *__restrict__ out,
-    const p255_t *a
+gf_25519_sqr (
+    gf_25519_s *__restrict__ out,
+    const gf_25519_t a
 );
 
 void
-p255_serialize (
+gf_25519_serialize (
     uint8_t serial[32],
-    const struct p255_t *x
+    const gf_25519_t x
 );
 
 mask_t
-p255_deserialize (
-    p255_t *x,
+gf_25519_deserialize (
+    gf_25519_t x,
     const uint8_t serial[32]
 );
 
 /* -------------- Inline functions begin here -------------- */
 
 void
-p255_add_RAW (
-    p255_t *out,
-    const p255_t *a,
-    const p255_t *b
+gf_25519_add_RAW (
+    gf_25519_t out,
+    const gf_25519_t a,
+    const gf_25519_t b
 ) {
     unsigned int i;
     for (i=0; i<5; i++) {
@@ -114,10 +117,10 @@ p255_add_RAW (
 }
 
 void
-p255_sub_RAW (
-    p255_t *out,
-    const p255_t *a,
-    const p255_t *b
+gf_25519_sub_RAW (
+    gf_25519_t out,
+    const gf_25519_t a,
+    const gf_25519_t b
 ) {
     unsigned int i;
     uint64_t co1 = ((1ull<<51)-1)*2, co2 = co1-36;
@@ -127,16 +130,16 @@ p255_sub_RAW (
 }
 
 void
-p255_copy (
-    p255_t *out,
-    const p255_t *a
+gf_25519_copy (
+    gf_25519_t out,
+    const gf_25519_t a
 ) {
     memcpy(out,a,sizeof(*a));
 }
 
 void
-p255_bias (
-    p255_t *a,
+gf_25519_bias (
+    gf_25519_t a,
     int amt
 ) {
     a->limb[0] += ((uint64_t)(amt)<<52) - 38*amt;
@@ -147,8 +150,8 @@ p255_bias (
 }
 
 void
-p255_weak_reduce (
-    p255_t *a
+gf_25519_weak_reduce (
+    gf_25519_t a
 ) {
     uint64_t mask = (1ull<<51) - 1;
     uint64_t tmp = a->limb[4] >> 51;
@@ -163,4 +166,4 @@ p255_weak_reduce (
 }; /* extern "C" */
 #endif
 
-#endif /* __P255_H__ */
+#endif /* __P25519_H__ */
diff --git a/src/p25519/arch_x86_64/x86-64-arith.h b/src/p25519/arch_x86_64/x86-64-arith.h
deleted file mode 120000
index 93c6c47..0000000
--- a/src/p25519/arch_x86_64/x86-64-arith.h
+++ /dev/null
@@ -1 +0,0 @@
-../../p448/arch_x86_64/x86-64-arith.h
\ No newline at end of file
diff --git a/src/p25519/arch_x86_64/x86-64-arith.h b/src/p25519/arch_x86_64/x86-64-arith.h
new file mode 100644
index 0000000..00fcc1e
--- /dev/null
+++ b/src/p25519/arch_x86_64/x86-64-arith.h
@@ -0,0 +1,323 @@
+/* Copyright (c) 2014 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+#ifndef __X86_64_ARITH_H__
+#define __X86_64_ARITH_H__
+
+#include <stdint.h>
+
+/* TODO: non x86-64 versions of these.
+ * FUTURE: autogenerate
+ */
+
+static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) { /* returns the full 128-bit product (*a) * (*b) */
+  #ifndef __BMI2__ /* legacy path: mulq multiplies rax by its operand and leaves the result in rdx:rax */
+  uint64_t c,d;
+  __asm__ volatile
+      ("movq %[a], %%rax;"
+       "mulq %[b];"
+       : [c]"=&a"(c), [d]"=d"(d) /* rax is written by movq before %[b] is read, so it must be early-clobber */
+       : [b]"m"(*b), [a]"m"(*a)
+       : "cc");
+  return (((__uint128_t)(d))<<64) | c; /* d = high word, c = low word */
+  #else /* BMI2 path: mulx multiplies rdx by its source operand and does not touch the flags */
+  uint64_t c,d;
+  __asm__ volatile
+      ("movq %[a], %%rdx;"
+       "mulx %[b], %[c], %[d];"
+       : [c]"=r"(c), [d]"=r"(d)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rdx"); /* rdx is overwritten by the movq above */
+  return (((__uint128_t)(d))<<64) | c; /* d = high word, c = low word */
+  #endif
+}
+
+static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) { /* returns the full 128-bit product a * (*b); a in a register, b in memory */
+  #ifndef __BMI2__ /* legacy path: mulq multiplies rax by its operand and leaves the result in rdx:rax */
+  uint64_t c,d;
+  __asm__ volatile
+      ("movq %[a], %%rax;"
+       "mulq %[b];"
+       : [c]"=&a"(c), [d]"=d"(d) /* rax is written by movq before %[b] is read, so it must be early-clobber */
+       : [b]"m"(*b), [a]"r"(a)
+       : "cc");
+  return (((__uint128_t)(d))<<64) | c; /* d = high word, c = low word */
+  #else /* BMI2 path: a is passed directly in rdx, the implicit mulx multiplicand */
+  uint64_t c,d;
+  __asm__ volatile
+      ("mulx %[b], %[c], %[d];"
+       : [c]"=r"(c), [d]"=r"(d)
+       : [b]"m"(*b), [a]"d"(a));
+  return (((__uint128_t)(d))<<64) | c; /* d = high word, c = low word */
+  #endif
+}
+
+static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) { /* returns the full 128-bit product a * b; both operands in registers */
+  #ifndef __BMI2__ /* legacy path: a is loaded into rax by the "a" constraint, mulq leaves the result in rdx:rax */
+  uint64_t c,d;
+  __asm__ volatile
+      ("mulq %[b];"
+       : [c]"=a"(c), [d]"=d"(d)
+       : [b]"r"(b), "a"(a)
+       : "cc");
+  return (((__uint128_t)(d))<<64) | c; /* d = high word, c = low word */
+  #else /* BMI2 path: a is passed in rdx, the implicit mulx multiplicand */
+  uint64_t c,d;
+  __asm__ volatile
+      ("mulx %[b], %[c], %[d];"
+       : [c]"=r"(c), [d]"=r"(d)
+       : [b]"r"(b), [a]"d"(a));
+  return (((__uint128_t)(d))<<64) | c; /* d = high word, c = low word */
+  #endif
+}
+
+static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) { /* returns (2 * (*a) mod 2^64) * (*b) as a 128-bit value */
+  #ifndef __BMI2__ /* legacy path: double *a in rax, then mulq leaves the result in rdx:rax */
+  uint64_t c,d;
+  __asm__ volatile
+      ("movq %[a], %%rax; "
+       "addq %%rax, %%rax; " /* doubling wraps at 64 bits; presumably callers keep *a below 2^63 - verify */
+       "mulq %[b];"
+       : [c]"=&a"(c), [d]"=d"(d) /* rax is written before %[b] is read, so it must be early-clobber */
+       : [b]"m"(*b), [a]"m"(*a)
+       : "cc");
+  return (((__uint128_t)(d))<<64) | c; /* d = high word, c = low word */
+  #else /* BMI2 path: double *a in rdx, the implicit mulx multiplicand */
+  uint64_t c,d;
+  __asm__ volatile
+      ("movq %[a], %%rdx;"
+       "leaq (,%%rdx,2), %%rdx;" /* lea doubles without modifying flags, so no "cc" clobber is listed */
+       "mulx %[b], %[c], %[d];"
+       : [c]"=r"(c), [d]"=r"(d)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rdx");
+  return (((__uint128_t)(d))<<64) | c; /* d = high word, c = low word */
+  #endif
+}
+
+static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { /* multiply-accumulate: *acc += (*a) * (*b), wrapping mod 2^128 */
+  uint64_t lo = *acc, hi = *acc>>64; /* split the accumulator into 64-bit halves for add/adc */
+  
+  #ifdef __BMI2__
+  uint64_t c,d; /* scratch registers for the low/high product words */
+  __asm__ volatile
+      ("movq %[a], %%rdx; "
+       "mulx %[b], %[c], %[d]; "
+       "addq %[c], %[lo]; "
+       "adcq %[d], %[hi]; " /* carry from the low-word add propagates into the high word */
+       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rdx", "cc");
+  #else
+  __asm__ volatile
+      ("movq %[a], %%rax; "
+       "mulq %[b]; " /* rdx:rax = rax * (*b) */
+       "addq %%rax, %[lo]; "
+       "adcq %%rdx, %[hi]; " /* carry from the low-word add propagates into the high word */
+       : [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rax", "rdx", "cc");
+  #endif
+  
+  *acc = (((__uint128_t)(hi))<<64) | lo; /* reassemble the updated accumulator */
+}
+
+static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) { /* adds the one product (*a) * (*b) to both *acc and *acc2 */
+  uint64_t lo = *acc, hi = *acc>>64; /* 64-bit halves of the first accumulator */
+  uint64_t lo2 = *acc2, hi2 = *acc2>>64; /* 64-bit halves of the second accumulator */
+  
+  #ifdef __BMI2__
+  uint64_t c,d; /* scratch registers for the low/high product words */
+  __asm__ volatile
+      ("movq %[a], %%rdx; "
+       "mulx %[b], %[c], %[d]; " /* the product is computed once and reused for both accumulators */
+       "addq %[c], %[lo]; "
+       "adcq %[d], %[hi]; "
+       "addq %[c], %[lo2]; "
+       "adcq %[d], %[hi2]; "
+       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rdx", "cc");
+  #else
+  __asm__ volatile
+      ("movq %[a], %%rax; "
+       "mulq %[b]; " /* rdx:rax = rax * (*b); the product is reused for both accumulators */
+       "addq %%rax, %[lo]; "
+       "adcq %%rdx, %[hi]; "
+       "addq %%rax, %[lo2]; "
+       "adcq %%rdx, %[hi2]; "
+       : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rax", "rdx", "cc");
+  #endif
+  
+  *acc = (((__uint128_t)(hi))<<64) | lo; /* reassemble both updated accumulators */
+  *acc2 = (((__uint128_t)(hi2))<<64) | lo2;
+}
+
+static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) { /* *acc += a * (*b), wrapping mod 2^128; a in a register, b in memory */
+  uint64_t lo = *acc, hi = *acc>>64; /* split the accumulator into 64-bit halves for add/adc */
+  
+  #ifdef __BMI2__
+  uint64_t c,d; /* scratch registers for the low/high product words */
+  __asm__ volatile
+      ("mulx %[b], %[c], %[d]; " /* a arrives in rdx via the "d" constraint */
+       "addq %[c], %[lo]; "
+       "adcq %[d], %[hi]; "
+       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"d"(a)
+       : "cc");
+  #else
+  __asm__ volatile
+      ("movq %[a], %%rax; "
+       "mulq %[b]; " /* rdx:rax = rax * (*b) */
+       "addq %%rax, %[lo]; "
+       "adcq %%rdx, %[hi]; "
+       : [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"r"(a)
+       : "rax", "rdx", "cc");
+  #endif
+  
+  *acc = (((__uint128_t)(hi))<<64) | lo; /* reassemble the updated accumulator */
+}
+
+static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) { /* *acc += a * b, wrapping mod 2^128; both operands in registers */
+  uint64_t lo = *acc, hi = *acc>>64; /* split the accumulator into 64-bit halves for add/adc */
+  
+  #ifdef __BMI2__
+  uint64_t c,d; /* scratch registers for the low/high product words */
+  __asm__ volatile
+      ("mulx %[b], %[c], %[d]; " /* a arrives in rdx via the "d" constraint */
+       "addq %[c], %[lo]; "
+       "adcq %[d], %[hi]; "
+       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"r"(b), [a]"d"(a)
+       : "cc");
+  #else
+  __asm__ volatile
+      ("mulq %[b]; " /* rdx:rax = rax * b */
+       "addq %%rax, %[lo]; "
+       "adcq %%rdx, %[hi]; "
+       : [lo]"+r"(lo), [hi]"+r"(hi), "+a"(a) /* rax is both read (a) and overwritten, so it is a read-write operand */
+       : [b]"r"(b)
+       : "rdx", "cc"); /* a clobber may not name a register already bound to an operand, so rax is not listed */
+  #endif
+  
+  *acc = (((__uint128_t)(hi))<<64) | lo; /* reassemble the updated accumulator */
+}
+
+static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { /* *acc += (2 * (*a) mod 2^64) * (*b), wrapping mod 2^128 */
+  uint64_t lo = *acc, hi = *acc>>64; /* split the accumulator into 64-bit halves for add/adc */
+  
+  #ifdef __BMI2__
+  uint64_t c,d; /* scratch registers for the low/high product words */
+  __asm__ volatile
+      ("movq %[a], %%rdx; "
+       "addq %%rdx, %%rdx; " /* doubling wraps at 64 bits; presumably callers keep *a below 2^63 - verify */
+       "mulx %[b], %[c], %[d]; "
+       "addq %[c], %[lo]; "
+       "adcq %[d], %[hi]; "
+       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rdx", "cc");
+  #else
+  __asm__ volatile
+      ("movq %[a], %%rax; "
+       "addq %%rax, %%rax; " /* doubling wraps at 64 bits; presumably callers keep *a below 2^63 - verify */
+       "mulq %[b]; "
+       "addq %%rax, %[lo]; "
+       "adcq %%rdx, %[hi]; "
+       : [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rax", "rdx", "cc");
+  #endif
+  
+  *acc = (((__uint128_t)(hi))<<64) | lo; /* reassemble the updated accumulator */
+}
+
+static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { /* multiply-subtract: *acc -= (*a) * (*b), wrapping mod 2^128 */
+  uint64_t lo = *acc, hi = *acc>>64; /* split the accumulator into 64-bit halves for sub/sbb */
+  #ifdef __BMI2__
+  uint64_t c,d; /* scratch registers for the low/high product words */
+  __asm__ volatile
+      ("movq %[a], %%rdx; "
+       "mulx %[b], %[c], %[d]; "
+       "subq %[c], %[lo]; "
+       "sbbq %[d], %[hi]; " /* borrow from the low-word subtract propagates into the high word */
+       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rdx", "cc");
+  #else
+  __asm__ volatile
+      ("movq %[a], %%rax; "
+       "mulq %[b]; " /* rdx:rax = rax * (*b) */
+       "subq %%rax, %[lo]; "
+       "sbbq %%rdx, %[hi]; " /* borrow from the low-word subtract propagates into the high word */
+       : [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rax", "rdx", "cc");
+  #endif
+  *acc = (((__uint128_t)(hi))<<64) | lo; /* reassemble the updated accumulator */
+}
+
+static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { /* *acc -= (2 * (*a) mod 2^64) * (*b), wrapping mod 2^128 */
+  uint64_t lo = *acc, hi = *acc>>64; /* split the accumulator into 64-bit halves for sub/sbb */
+  #ifdef __BMI2__
+  uint64_t c,d; /* scratch registers for the low/high product words */
+  __asm__ volatile
+      ("movq %[a], %%rdx; "
+       "addq %%rdx, %%rdx; " /* doubling wraps at 64 bits; presumably callers keep *a below 2^63 - verify */
+       "mulx %[b], %[c], %[d]; "
+       "subq %[c], %[lo]; "
+       "sbbq %[d], %[hi]; "
+       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rdx", "cc");
+  #else
+  __asm__ volatile
+      ("movq %[a], %%rax; "
+       "addq %%rax, %%rax; " /* doubling wraps at 64 bits; presumably callers keep *a below 2^63 - verify */
+       "mulq %[b]; "
+       "subq %%rax, %[lo]; "
+       "sbbq %%rdx, %[hi]; "
+       : [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rax", "rdx", "cc");
+  #endif
+  *acc = (((__uint128_t)(hi))<<64) | lo; /* reassemble the updated accumulator */
+  
+}
+
+static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
+  /* Multiply-reverse-subtract: *acc = (*a) * (*b) - *acc, modulo 2^128.
+   *
+   * Written as plain 128-bit arithmetic rather than inline asm so that the
+   * BMI2-only `mulx` instruction is never emitted unless the compiler is
+   * actually targeting BMI2; hard-coding mulx here would raise an
+   * illegal-instruction fault on x86-64 CPUs without that extension,
+   * whereas every other helper in this header guards it with __BMI2__.
+   */
+  const __uint128_t product = ((__uint128_t)(*a)) * (*b);
+  *acc = product - *acc; /* unsigned subtraction wraps exactly like sub/sbb */
+}
+
+static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) {
+  return (__uint128_t)a * (__uint128_t)b; /* full unsigned 64x64 -> 128-bit product, left to the compiler */
+}
+
+static __inline__ __int128_t widemuls(int64_t a, int64_t b) {
+  return (__int128_t)a * (__int128_t)b; /* full signed 64x64 -> 128-bit product, left to the compiler */
+}
+ 
+static __inline__ uint64_t opacify(uint64_t x) { /* returns x unchanged, routed through an empty asm statement */
+  __asm__ volatile("" : "+r"(x)); /* no instructions; "+r" declares x as read and written, so the compiler cannot assume its value afterwards */
+  return x;
+}
+
+static __inline__ mask_t is_zero(uint64_t x) { /* returns an all-ones mask when x == 0 and 0 otherwise, without branching */
+  __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x) : : "cc"); /* neg sets CF iff x != 0; sbb then yields x = -CF; both modify flags, hence "cc" */
+  return ~x; /* NOTE(review): mask_t is not declared in this header, so the includer must define it first; ~x is converted to mask_t's width */
+}
+
+#endif /* __X86_64_ARITH_H__ */
diff --git a/src/p25519/f_arithmetic.c b/src/p25519/f_arithmetic.c
index c211388..82600db 100644
--- a/src/p25519/f_arithmetic.c
+++ b/src/p25519/f_arithmetic.c
@@ -10,7 +10,7 @@
 
 #include "field.h"
 
-const field_a_t P25519_SQRT_MINUS_ONE = {FIELD_LITERAL(
+const gf_25519_t P25519_SQRT_MINUS_ONE = {FIELD_LITERAL(
     0x61b274a0ea0b0,
     0x0d5a5fc8f189d,
     0x7ef5e9cbd0c60,
@@ -18,7 +18,7 @@ const field_a_t P25519_SQRT_MINUS_ONE = {FIELD_LITERAL(
     0x2b8324804fc1d
 )};
     
-const field_a_t SQRT_ONE_MINUS_D = {FIELD_LITERAL( // FIXME MAGIC goes elsewhere?
+const gf_25519_t SQRT_ONE_MINUS_D = {FIELD_LITERAL( // FIXME MAGIC goes elsewhere?
     0x6db8831bbddec,
     0x38d7b56c9c165,
     0x016b221394bdc,
@@ -26,15 +26,15 @@ const field_a_t SQRT_ONE_MINUS_D = {FIELD_LITERAL( // FIXME MAGIC goes elsewhere
     0x0a0d85b4032b1
 )};
     
-static const field_a_t ONE = {FIELD_LITERAL( // FIXME copy-pasted
+static const gf_25519_t ONE = {FIELD_LITERAL( // FIXME copy-pasted
     1,0,0,0,0
 )}; 
 
 // ARCH MAGIC FIXME copy-pasted from decaf_fast.c
-static mask_t gf_eq(const field_a_t a, const field_a_t b) {
-    field_a_t c;
-    field_sub(c,a,b);
-    field_strong_reduce(c);
+static mask_t gf_eq(const gf_25519_t a, const gf_25519_t b) {
+    gf_25519_t c;
+    gf_sub(c,a,b);
+    gf_strong_reduce(c);
     mask_t ret=0;
     int i;
     for (i=0; i<5; i++) { ret |= c->limb[i]; }
@@ -43,19 +43,19 @@ static mask_t gf_eq(const field_a_t a, const field_a_t b) {
 
 /* Guarantee: a^2 x = 0 if x = 0; else a^2 x = 1 or SQRT_MINUS_ONE; */
 void 
-field_isr (
-    field_a_t a,
-    const field_a_t x
+gf_isr (
+    gf_25519_t a,
+    const gf_25519_t x
 ) {
-    field_a_t st[3], tmp1, tmp2;
+    gf_25519_t st[3], tmp1, tmp2;
     const struct { unsigned char sh, idx; } ops[] = {
         {1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2}
     };
     st[0][0] = st[1][0] = st[2][0] = x[0];
     unsigned int i;
     for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) {
-        field_sqrn(tmp1, st[1^(i&1)], ops[i].sh);
-        field_mul(tmp2, tmp1, st[ops[i].idx]);
+        gf_sqrn(tmp1, st[1^(i&1)], ops[i].sh);
+        gf_mul(tmp2, tmp1, st[ops[i].idx]);
         st[i&1][0] = tmp2[0];
     }
     
@@ -64,5 +64,5 @@ field_isr (
     // ARCH MAGIC FIXME: should be cond_sel
     for (i=0; i<5; i++) tmp1->limb[i] = (ONE->limb[i]            &  mask)
                                       | (SQRT_MINUS_ONE->limb[i] & ~mask);
-    field_mul(a,tmp1,st[0]);
+    gf_mul(a,tmp1,st[0]);
 }
diff --git a/src/p25519/f_field.h b/src/p25519/f_field.h
index e63596a..c2a7ee0 100644
--- a/src/p25519/f_field.h
+++ b/src/p25519/f_field.h
@@ -13,20 +13,21 @@
 #include <string.h>
 
 #include "p25519.h"
-#define FIELD_LIT_LIMB_BITS  51
-#define FIELD_BITS           255
-#define field_t              p255_t
-#define field_mul            p255_mul
-#define field_sqr            p255_sqr
-#define field_add_RAW        p255_add_RAW
-#define field_sub_RAW        p255_sub_RAW
-#define field_mulw           p255_mulw
-#define field_bias           p255_bias
-#define field_isr            p255_isr
-#define field_weak_reduce    p255_weak_reduce
-#define field_strong_reduce  p255_strong_reduce
-#define field_serialize      p255_serialize
-#define field_deserialize    p255_deserialize
-#define SQRT_MINUS_ONE P25519_SQRT_MINUS_ONE
+#define GF_LIT_LIMB_BITS  51
+#define GF_BITS           255
+#define gf              gf_25519_t
+#define gf_s              gf_25519_s
+#define gf_mul            gf_25519_mul
+#define gf_sqr            gf_25519_sqr
+#define gf_add_RAW        gf_25519_add_RAW
+#define gf_sub_RAW        gf_25519_sub_RAW
+#define gf_mulw           gf_25519_mulw
+#define gf_bias           gf_25519_bias
+#define gf_isr            gf_25519_isr
+#define gf_weak_reduce    gf_25519_weak_reduce
+#define gf_strong_reduce  gf_25519_strong_reduce
+#define gf_serialize      gf_25519_serialize
+#define gf_deserialize    gf_25519_deserialize
+#define SQRT_MINUS_ONE    P25519_SQRT_MINUS_ONE
 
 #endif /* __F_FIELD_H__ */
diff --git a/src/p448/f_arithmetic.c b/src/p448/f_arithmetic.c
index 12e2b07..d73832a 100644
--- a/src/p448/f_arithmetic.c
+++ b/src/p448/f_arithmetic.c
@@ -11,33 +11,33 @@
 #include "field.h"
 
 void 
-field_isr (
-    field_a_t a,
-    const field_a_t x
+gf_isr (
+    gf_a_t a,
+    const gf_a_t x
 ) {
-    field_a_t L0, L1, L2;
-    field_sqr  (   L1,     x );
-    field_mul  (   L2,     x,   L1 );
-    field_sqr  (   L1,   L2 );
-    field_mul  (   L2,     x,   L1 );
-    field_sqrn (   L1,   L2,     3 );
-    field_mul  (   L0,   L2,   L1 );
-    field_sqrn (   L1,   L0,     3 );
-    field_mul  (   L0,   L2,   L1 );
-    field_sqrn (   L2,   L0,     9 );
-    field_mul  (   L1,   L0,   L2 );
-    field_sqr  (   L0,   L1 );
-    field_mul  (   L2,     x,   L0 );
-    field_sqrn (   L0,   L2,    18 );
-    field_mul  (   L2,   L1,   L0 );
-    field_sqrn (   L0,   L2,    37 );
-    field_mul  (   L1,   L2,   L0 );
-    field_sqrn (   L0,   L1,    37 );
-    field_mul  (   L1,   L2,   L0 );
-    field_sqrn (   L0,   L1,   111 );
-    field_mul  (   L2,   L1,   L0 );
-    field_sqr  (   L0,   L2 );
-    field_mul  (   L1,     x,   L0 );
-    field_sqrn (   L0,   L1,   223 );
-    field_mul  (     a,   L2,   L0 );
+    gf_a_t L0, L1, L2;
+    gf_sqr  (   L1,     x );
+    gf_mul  (   L2,     x,   L1 );
+    gf_sqr  (   L1,   L2 );
+    gf_mul  (   L2,     x,   L1 );
+    gf_sqrn (   L1,   L2,     3 );
+    gf_mul  (   L0,   L2,   L1 );
+    gf_sqrn (   L1,   L0,     3 );
+    gf_mul  (   L0,   L2,   L1 );
+    gf_sqrn (   L2,   L0,     9 );
+    gf_mul  (   L1,   L0,   L2 );
+    gf_sqr  (   L0,   L1 );
+    gf_mul  (   L2,     x,   L0 );
+    gf_sqrn (   L0,   L2,    18 );
+    gf_mul  (   L2,   L1,   L0 );
+    gf_sqrn (   L0,   L2,    37 );
+    gf_mul  (   L1,   L2,   L0 );
+    gf_sqrn (   L0,   L1,    37 );
+    gf_mul  (   L1,   L2,   L0 );
+    gf_sqrn (   L0,   L1,   111 );
+    gf_mul  (   L2,   L1,   L0 );
+    gf_sqr  (   L0,   L2 );
+    gf_mul  (   L1,     x,   L0 );
+    gf_sqrn (   L0,   L1,   223 );
+    gf_mul  (     a,   L2,   L0 );
 }
diff --git a/src/p448/f_field.h b/src/p448/f_field.h
index cc06ab7..29188e7 100644
--- a/src/p448/f_field.h
+++ b/src/p448/f_field.h
@@ -13,19 +13,19 @@
 #include <string.h>
 
 #include "p448.h"
-#define FIELD_LIT_LIMB_BITS  56
-#define FIELD_BITS           448
-#define field_t              p448_t
-#define field_mul            p448_mul
-#define field_sqr            p448_sqr
-#define field_add_RAW        p448_add_RAW
-#define field_sub_RAW        p448_sub_RAW
-#define field_mulw           p448_mulw
-#define field_bias           p448_bias
-#define field_isr            p448_isr
-#define field_weak_reduce    p448_weak_reduce
-#define field_strong_reduce  p448_strong_reduce
-#define field_serialize      p448_serialize
-#define field_deserialize    p448_deserialize
+#define GF_LIT_LIMB_BITS  56
+#define GF_BITS           448
+#define gf              p448_t
+#define gf_mul            p448_mul
+#define gf_sqr            p448_sqr
+#define gf_add_RAW        p448_add_RAW
+#define gf_sub_RAW        p448_sub_RAW
+#define gf_mulw           p448_mulw
+#define gf_bias           p448_bias
+#define gf_isr            p448_isr
+#define gf_weak_reduce    p448_weak_reduce
+#define gf_strong_reduce  p448_strong_reduce
+#define gf_serialize      p448_serialize
+#define gf_deserialize    p448_deserialize
 
 #endif /* __F_FIELD_H__ */
diff --git a/src/p480/f_arithmetic.c b/src/p480/f_arithmetic.c
index 1166c3c..227cdfe 100644
--- a/src/p480/f_arithmetic.c
+++ b/src/p480/f_arithmetic.c
@@ -11,33 +11,33 @@
 #include "field.h"
 
 void 
-field_isr (
-    field_a_t a,
-    const field_a_t x
+gf_isr (
+    gf_a_t a,
+    const gf_a_t x
 ) {
-    field_a_t L0, L1, L2, L3;
-    field_sqr  (   L2,     x );
-    field_mul  (   L1,     x,   L2 );
-    field_sqrn (   L0,   L1,     2 );
-    field_mul  (   L2,   L1,   L0 );
-    field_sqrn (   L0,   L2,     4 );
-    field_mul  (   L1,   L2,   L0 );
-    field_sqr  (   L0,   L1 );
-    field_mul  (   L2,     x,   L0 );
-    field_sqrn (   L0,   L2,     8 );
-    field_mul  (   L2,   L1,   L0 );
-    field_sqrn (   L0,   L2,    17 );
-    field_mul  (   L1,   L2,   L0 );
-    field_sqrn (   L0,   L1,    17 );
-    field_mul  (   L1,   L2,   L0 );
-    field_sqrn (   L3,   L1,    17 );
-    field_mul  (   L0,   L2,   L3 );
-    field_sqrn (   L2,   L0,    51 );
-    field_mul  (   L0,   L1,   L2 );
-    field_sqrn (   L1,   L0,   119 );
-    field_mul  (   L2,   L0,   L1 );
-    field_sqr  (   L0,   L2 );
-    field_mul  (   L1,     x,   L0 );
-    field_sqrn (   L0,   L1,   239 );
-    field_mul  (     a,   L2,   L0 );
+    gf_a_t L0, L1, L2, L3;
+    gf_sqr  (   L2,     x );
+    gf_mul  (   L1,     x,   L2 );
+    gf_sqrn (   L0,   L1,     2 );
+    gf_mul  (   L2,   L1,   L0 );
+    gf_sqrn (   L0,   L2,     4 );
+    gf_mul  (   L1,   L2,   L0 );
+    gf_sqr  (   L0,   L1 );
+    gf_mul  (   L2,     x,   L0 );
+    gf_sqrn (   L0,   L2,     8 );
+    gf_mul  (   L2,   L1,   L0 );
+    gf_sqrn (   L0,   L2,    17 );
+    gf_mul  (   L1,   L2,   L0 );
+    gf_sqrn (   L0,   L1,    17 );
+    gf_mul  (   L1,   L2,   L0 );
+    gf_sqrn (   L3,   L1,    17 );
+    gf_mul  (   L0,   L2,   L3 );
+    gf_sqrn (   L2,   L0,    51 );
+    gf_mul  (   L0,   L1,   L2 );
+    gf_sqrn (   L1,   L0,   119 );
+    gf_mul  (   L2,   L0,   L1 );
+    gf_sqr  (   L0,   L2 );
+    gf_mul  (   L1,     x,   L0 );
+    gf_sqrn (   L0,   L1,   239 );
+    gf_mul  (     a,   L2,   L0 );
 }
diff --git a/src/p480/f_field.h b/src/p480/f_field.h
index 1c94a98..471e90d 100644
--- a/src/p480/f_field.h
+++ b/src/p480/f_field.h
@@ -13,19 +13,19 @@
 #include <string.h>
 
 #include "p480.h"
-#define FIELD_LIT_LIMB_BITS  60
-#define FIELD_BITS           480
-#define field_t              p480_t
-#define field_mul            p480_mul
-#define field_sqr            p480_sqr
-#define field_add_RAW        p480_add_RAW
-#define field_sub_RAW        p480_sub_RAW
-#define field_mulw           p480_mulw
-#define field_bias           p480_bias
-#define field_isr            p480_isr
-#define field_weak_reduce    p480_weak_reduce
-#define field_strong_reduce  p480_strong_reduce
-#define field_serialize      p480_serialize
-#define field_deserialize    p480_deserialize
+#define GF_LIT_LIMB_BITS  60
+#define GF_BITS           480
+#define gf              p480_t
+#define gf_mul            p480_mul
+#define gf_sqr            p480_sqr
+#define gf_add_RAW        p480_add_RAW
+#define gf_sub_RAW        p480_sub_RAW
+#define gf_mulw           p480_mulw
+#define gf_bias           p480_bias
+#define gf_isr            p480_isr
+#define gf_weak_reduce    p480_weak_reduce
+#define gf_strong_reduce  p480_strong_reduce
+#define gf_serialize      p480_serialize
+#define gf_deserialize    p480_deserialize
 
 #endif /* __F_FIELD_H__ */
diff --git a/src/p521/f_arithmetic.c b/src/p521/f_arithmetic.c
index 7c36478..7ce39d8 100644
--- a/src/p521/f_arithmetic.c
+++ b/src/p521/f_arithmetic.c
@@ -11,33 +11,33 @@
 #include "field.h"
 
 void 
-field_isr (
-    field_a_t a,
-    const field_a_t x
+gf_isr (
+    gf_a_t a,
+    const gf_a_t x
 ) {
-    field_a_t L0, L1, L2;
-    field_sqr  (   L1,     x );
-    field_mul  (   L0,     x,   L1 );
-    field_sqrn (   L2,   L0,     2 );
-    field_mul  (   L1,   L0,   L2 );
-    field_sqrn (   L2,   L1,     4 );
-    field_mul  (   L0,   L1,   L2 );
-    field_sqrn (   L2,   L0,     8 );
-    field_mul  (   L1,   L0,   L2 );
-    field_sqrn (   L2,   L1,    16 );
-    field_mul  (   L0,   L1,   L2 );
-    field_sqrn (   L2,   L0,    32 );
-    field_mul  (   L1,   L0,   L2 );
-    field_sqr  (   L2,   L1 );
-    field_mul  (   L0,     x,   L2 );
-    field_sqrn (   L2,   L0,    64 );
-    field_mul  (   L0,   L1,   L2 );
-    field_sqrn (   L2,   L0,   129 );
-    field_mul  (   L1,   L0,   L2 );
-    field_sqr  (   L2,   L1 );
-    field_mul  (   L0,     x,   L2 );
-    field_sqrn (   L2,   L0,   259 );
-    field_mul  (   L1,   L0,   L2 );
-    field_sqr  (   L0,   L1 );
-    field_mul  (     a,     x,   L0 );
+    gf_a_t L0, L1, L2;
+    gf_sqr  (   L1,     x );
+    gf_mul  (   L0,     x,   L1 );
+    gf_sqrn (   L2,   L0,     2 );
+    gf_mul  (   L1,   L0,   L2 );
+    gf_sqrn (   L2,   L1,     4 );
+    gf_mul  (   L0,   L1,   L2 );
+    gf_sqrn (   L2,   L0,     8 );
+    gf_mul  (   L1,   L0,   L2 );
+    gf_sqrn (   L2,   L1,    16 );
+    gf_mul  (   L0,   L1,   L2 );
+    gf_sqrn (   L2,   L0,    32 );
+    gf_mul  (   L1,   L0,   L2 );
+    gf_sqr  (   L2,   L1 );
+    gf_mul  (   L0,     x,   L2 );
+    gf_sqrn (   L2,   L0,    64 );
+    gf_mul  (   L0,   L1,   L2 );
+    gf_sqrn (   L2,   L0,   129 );
+    gf_mul  (   L1,   L0,   L2 );
+    gf_sqr  (   L2,   L1 );
+    gf_mul  (   L0,     x,   L2 );
+    gf_sqrn (   L2,   L0,   259 );
+    gf_mul  (   L1,   L0,   L2 );
+    gf_sqr  (   L0,   L1 );
+    gf_mul  (     a,     x,   L0 );
 }
diff --git a/src/p521/f_field.h b/src/p521/f_field.h
index ebbb666..6a72ea7 100644
--- a/src/p521/f_field.h
+++ b/src/p521/f_field.h
@@ -13,19 +13,19 @@
 #include "constant_time.h"
 
 #include "p521.h"
-#define FIELD_LIT_LIMB_BITS  58
-#define FIELD_BITS           521
-#define field_t              p521_t
-#define field_mul            p521_mul
-#define field_sqr            p521_sqr
-#define field_add_RAW        p521_add_RAW
-#define field_sub_RAW        p521_sub_RAW
-#define field_mulw           p521_mulw
-#define field_bias           p521_bias
-#define field_isr            p521_isr
-#define field_weak_reduce    p521_weak_reduce
-#define field_strong_reduce  p521_strong_reduce
-#define field_serialize      p521_serialize
-#define field_deserialize    p521_deserialize
+#define GF_LIT_LIMB_BITS  58
+#define GF_BITS           521
+#define gf              p521_t
+#define gf_mul            p521_mul
+#define gf_sqr            p521_sqr
+#define gf_add_RAW        p521_add_RAW
+#define gf_sub_RAW        p521_sub_RAW
+#define gf_mulw           p521_mulw
+#define gf_bias           p521_bias
+#define gf_isr            p521_isr
+#define gf_weak_reduce    p521_weak_reduce
+#define gf_strong_reduce  p521_strong_reduce
+#define gf_serialize      p521_serialize
+#define gf_deserialize    p521_deserialize
 
 #endif /* __F_FIELD_H__ */
diff --git a/src/public_include/decaf/decaf_255.h b/src/public_include/decaf/decaf_255.h
index cbf09c8..fa6d939 100644
--- a/src/public_include/decaf/decaf_255.h
+++ b/src/public_include/decaf/decaf_255.h
@@ -21,11 +21,13 @@ extern "C" {
 #define DECAF_255_SCALAR_BITS 254 // Curve25519: 253
 #define DECAF_255_SCALAR_LIMBS (256/DECAF_WORD_BITS)
 
+#ifndef __DECAF_GF_ALREADY_DEFINED__
 /** Galois field element internal structure */
-typedef struct gf_255_s {
+typedef struct gf_25519_s {
     decaf_word_t limb[DECAF_255_LIMBS];
-} gf_255_s, gf_255_t[1];
+} gf_25519_s, gf_25519_t[1];
 /** @endcond */
+#endif /* __DECAF_GF_ALREADY_DEFINED__ */
 
 /** Number of bytes in a serialized point. */
 #define DECAF_255_SER_BYTES 32
@@ -34,7 +36,7 @@ typedef struct gf_255_s {
 #define DECAF_255_SCALAR_BYTES 32
 
 /** Twisted Edwards (-1,d-1) extended homogeneous coordinates */
-typedef struct decaf_255_point_s { /**@cond internal*/gf_255_t x,y,z,t;/**@endcond*/ } decaf_255_point_t[1];
+typedef struct decaf_255_point_s { /**@cond internal*/gf_25519_t x,y,z,t;/**@endcond*/ } decaf_255_point_t[1];
 
 /** Precomputed table based on a point.  Can be trivial implementation. */
 struct decaf_255_precomputed_s;