From 38455f34f2319686138859353ac669f8816d8683 Mon Sep 17 00:00:00 2001
From: Michael Hamburg <mike@shiftleft.org>
Date: Fri, 15 Jan 2016 13:35:04 -0800
Subject: [PATCH] one ser/deser to rule them all (TODO test on NEON and other
 places with LIMBPERM)

---
 src/decaf.c                       | 126 ++++++++++++++++--------------
 src/gen_headers/f_field_h.py      |  11 ++-
 src/p25519/arch_32/f_impl.c       |  77 +++---------------
 src/p25519/arch_32/f_impl.h       |   5 +-
 src/p25519/arch_ref64/f_impl.c    |  42 ----------
 src/p25519/arch_ref64/f_impl.h    |   2 +
 src/p25519/arch_x86_64/f_impl.c   |  42 ----------
 src/p25519/arch_x86_64/f_impl.h   |   2 +
 src/p25519/f_arithmetic.c         |   7 +-
 src/p448/arch_32/f_impl.c         |  50 ------------
 src/p448/arch_32/f_impl.h         |   2 +
 src/p448/arch_arm_32/f_impl.c     |  60 +-------------
 src/p448/arch_arm_32/f_impl.h     |   2 +
 src/p448/arch_neon/f_impl.c       |   1 -
 src/p448/arch_neon/f_impl.h       |   2 +
 src/p448/arch_ref64/f_impl.h      |   2 +
 src/p448/arch_x86_64/f_impl.c     |  48 ------------
 src/p448/arch_x86_64/f_impl.h     |   1 +
 src/p448/f_arithmetic.c           |   5 ++
 src/p480/arch_x86_64/f_impl.c     |  62 ---------------
 src/p480/arch_x86_64/f_impl.h     |   8 +-
 src/p480/f_arithmetic.c           |   6 ++
 src/p521/arch_ref64/f_impl.c      |  46 -----------
 src/p521/arch_ref64/f_impl.h      |   2 +
 src/p521/arch_x86_64_r12/f_impl.c |  48 ------------
 src/p521/arch_x86_64_r12/f_impl.h |   1 +
 src/p521/f_arithmetic.c           |   6 ++
 27 files changed, 136 insertions(+), 530 deletions(-)

diff --git a/src/decaf.c b/src/decaf.c
index a690678..7a6e08d 100644
--- a/src/decaf.c
+++ b/src/decaf.c
@@ -86,12 +86,45 @@ const size_t API_NS2(alignof,precomputed_s) = 32;
 #define UNROLL
 #endif
 
-#define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<sizeof(gf)/sizeof(word_t); i++)  { op; }}
-#define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<sizeof(gf)/sizeof(word_t); i++)  { op; }}
+#define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++)  { op; }}
+#define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<NLIMBS; i++)  { op; }}
+/** Strongly reduce a copy of x, then pack its limbs into SER_BYTES bytes, least-significant byte first. */
+void gf_serialize (uint8_t serial[SER_BYTES], const gf x) {
+    gf red;
+    gf_copy(red, x);
+    gf_strong_reduce(red);
+    
+    unsigned int j=0, fill=0; /* j: next limb to emit; fill: number of not-yet-written bits held in buffer */
+    dword_t buffer = 0;
+    UNROLL for (unsigned int i=0; i<SER_BYTES; i++) {
+        if (fill < 8 && j < NLIMBS) { /* less than a byte buffered: splice the next limb in above the leftover bits */
+            buffer |= ((dword_t)red->limb[LIMBPERM(j)]) << fill;
+            fill += LIMB_PLACE_VALUE(LIMBPERM(j));
+            j++;
+        }
+        serial[i] = buffer; /* truncating store: only the low 8 bits are kept */
+        fill -= 8; /* unsigned, so this can wrap once the limbs run out; the j < NLIMBS guard keeps that harmless */
+        buffer >>= 8;
+    }
+}
 
-/** Copy x = y */
-static INLINE void
-gf_cpy(gf x, const gf y) { x[0] = y[0]; }
+mask_t gf_deserialize (gf x, const uint8_t serial[SER_BYTES]) { /* Unpack SER_BYTES bytes (least-significant first) into the limbs of x; the result reports whether the value is below MODULUS. */
+    unsigned int j=0, fill=0; /* j: next input byte; fill: unconsumed bits held in buffer */
+    dword_t buffer = 0;
+    dsword_t scarry = 0; /* running borrow of x - MODULUS, taken limb by limb */
+    UNROLL for (unsigned int i=0; i<NLIMBS; i++) {
+        UNROLL while (fill < LIMB_PLACE_VALUE(LIMBPERM(i)) && j < SER_BYTES) { /* refill until a whole limb is buffered or the input is exhausted */
+            buffer |= ((dword_t)serial[j]) << fill;
+            fill += 8;
+            j++;
+        }
+        x->limb[LIMBPERM(i)] = (i<NLIMBS-1) ? buffer & LIMB_MASK(LIMBPERM(i)) : buffer; /* the last limb is stored unmasked, so it takes whatever bits remain */
+        fill -= LIMB_PLACE_VALUE(LIMBPERM(i));
+        buffer >>= LIMB_PLACE_VALUE(LIMBPERM(i));
+        scarry = (scarry + x->limb[LIMBPERM(i)] - MODULUS->limb[LIMBPERM(i)]) >> (8*sizeof(word_t)); /* shifting by the full word width leaves just the borrow out of this limb */
+    }
+    return word_is_zero(buffer) & ~word_is_zero(scarry); /* needs no leftover bits and a final borrow, i.e. x < MODULUS; assumes word_is_zero yields an all-ones mask for zero - TODO confirm */
+}
 
 /** Constant time, x = is_z ? z : y */
 static INLINE void
@@ -120,9 +153,7 @@ cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) {
 /** Compare a==b */
 /* Not static because it's used in inverse square root. */
 decaf_word_t gf_eq(const gf a, const gf b);
-
-decaf_word_t
-gf_eq(const gf a, const gf b) {
+decaf_word_t gf_eq(const gf a, const gf b) {
     gf c;
     gf_sub(c,a,b);
     gf_strong_reduce(c);
@@ -153,13 +184,10 @@ gf_invert(gf y, const gf x) {
     (void)ret; assert(ret);
     gf_sqr(t1, t2);
     gf_mul(t2, t1, x); // not direct to y in case of alias.
-    gf_cpy(y, t2);
+    gf_copy(y, t2);
 }
 
-/**
- * Mul by signed int.  Not constant-time WRT the sign of that int.
- * Just uses a full mul (PERF)
- */
+/** Mul by signed int.  Not constant-time WRT the sign of that int. */
 static INLINE void
 gf_mulw_sgn(gf c, const gf a, int w) {
     if (w>0) {
@@ -182,7 +210,7 @@ static decaf_word_t hibit(const gf x) {
 /** Return high bit of x = low bit of 2x mod p */
 static decaf_word_t lobit(const gf x) {
     gf y;
-    gf_cpy(y,x);
+    gf_copy(y,x);
     gf_strong_reduce(y);
     return -(y->limb[0]&1);
 }
@@ -394,16 +422,9 @@ API_NS(scalar_eq) (
     return word_is_zero(diff);
 }
 
-/* *** API begins here *** */    
-
 /** identity = (0,1) */
 const point_t API_NS(point_identity) = {{{{{0}}},{{{1}}},{{{1}}},{{{0}}}}};
 
-static void
-gf_encode ( unsigned char ser[SER_BYTES], gf a ) {
-    gf_serialize(ser, (gf_s *)a);
-}
-
 static void
 deisogenize (
     gf_s *__restrict__ s,
@@ -508,14 +529,7 @@ deisogenize (
 void API_NS(point_encode)( unsigned char ser[SER_BYTES], const point_t p ) {
     gf s, mtos;
     deisogenize(s,mtos,p,0,0,0);
-    gf_encode ( ser, s );
-}
-
-/**
- * Deserialize a field element, return TRUE if < p.
- */
-static decaf_bool_t gf_deser(gf s, const unsigned char ser[SER_BYTES]) {
-    return gf_deserialize((gf_s *)s, ser);
+    gf_serialize ( ser, s );
 }
 
 decaf_error_t API_NS(point_decode) (
@@ -524,7 +538,7 @@ decaf_error_t API_NS(point_decode) (
     decaf_bool_t allow_identity
 ) {
     gf s, a, b, c, d, e, f;
-    decaf_bool_t succ = gf_deser(s, ser), zero = gf_eq(s, ZERO);
+    decaf_bool_t succ = gf_deserialize(s, ser), zero = gf_eq(s, ZERO);
     allow_identity = ~word_is_zero(allow_identity);
     succ &= allow_identity | ~zero;
     succ &= ~hibit(s);
@@ -592,8 +606,6 @@ decaf_error_t API_NS(point_decode) (
 #define NEG_D 0
 #endif
 
-
-
 void API_NS(point_sub) (
     point_t p,
     const point_t q,
@@ -688,8 +700,8 @@ void API_NS(point_negate) (
    const point_t a
 ) {
     gf_sub(nega->x, ZERO, a->x);
-    gf_cpy(nega->y, a->y);
-    gf_cpy(nega->z, a->z);
+    gf_copy(nega->y, a->y);
+    gf_copy(nega->z, a->z);
     gf_sub(nega->t, ZERO, a->t);
 }
 
@@ -827,7 +839,7 @@ niels_to_pt (
     gf_add ( e->y, n->b, n->a );
     gf_sub ( e->x, n->b, n->a );
     gf_mul ( e->t, e->y, e->x );
-    gf_cpy ( e->z, ONE );
+    gf_copy ( e->z, ONE );
 }
 
 static NOINLINE void
@@ -882,7 +894,7 @@ add_pniels_to_pt (
 ) {
     gf L0;
     gf_mul ( L0, p->z, pn->z );
-    gf_cpy ( p->z, L0 );
+    gf_copy ( p->z, L0 );
     add_niels_to_pt( p, pn->n, before_double );
 }
 
@@ -894,7 +906,7 @@ sub_pniels_from_pt (
 ) {
     gf L0;
     gf_mul ( L0, p->z, pn->z );
-    gf_cpy ( p->z, L0 );
+    gf_copy ( p->z, L0 );
     sub_niels_from_pt( p, pn->n, before_double );
 }
 
@@ -1203,7 +1215,7 @@ void API_NS(point_from_hash_nonuniform) (
     // TODO: simplify since we don't return a hint anymore
     // TODO: test pathological case ur0^2 = 1/(1-d)
     gf r0,r,a,b,c,dee,D,N,rN,e;
-    gf_deser(r0,ser);
+    gf_deserialize(r0,ser);
     gf_strong_reduce(r0);
     gf_sqr(a,r0);
 #if P_MOD_8 == 5
@@ -1265,7 +1277,7 @@ void API_NS(point_from_hash_nonuniform) (
     /* isogenize */
 #if IMAGINE_TWIST
     gf_mul(c,a,SQRT_MINUS_ONE);
-    gf_cpy(a,c);
+    gf_copy(a,c);
 #endif
     
     gf_sqr(c,a); /* s^2 */
@@ -1326,7 +1338,7 @@ API_NS(invert_elligator_nonuniform) (
     succ &= ~(is_identity & sgn_ed_T); /* NB: there are no preimages of rotated identity. */
 #endif
     
-    gf_encode(recovered_hash, b); 
+    gf_serialize(recovered_hash, b); 
     /* TODO: deal with overflow flag */
     return decaf_succeed_if(succ);
 }
@@ -1380,14 +1392,14 @@ void API_NS(point_debugging_torque) (
     gf tmp;
     gf_mul(tmp,p->x,SQRT_MINUS_ONE);
     gf_mul(q->x,p->y,SQRT_MINUS_ONE);
-    gf_cpy(q->y,tmp);
-    gf_cpy(q->z,p->z);
+    gf_copy(q->y,tmp);
+    gf_copy(q->z,p->z);
     gf_sub(q->t,ZERO,p->t);
 #else
     gf_sub(q->x,ZERO,p->x);
     gf_sub(q->y,ZERO,p->y);
-    gf_cpy(q->z,p->z);
-    gf_cpy(q->t,p->t);
+    gf_copy(q->z,p->z);
+    gf_copy(q->t,p->t);
 #endif
 }
 
@@ -1397,16 +1409,16 @@ void API_NS(point_debugging_pscale) (
     const uint8_t factor[SER_BYTES]
 ) {
     gf gfac,tmp;
-    ignore_result(gf_deser(gfac,factor));
+    ignore_result(gf_deserialize(gfac,factor));
     cond_sel(gfac,gfac,ONE,gf_eq(gfac,ZERO));
     gf_mul(tmp,p->x,gfac);
-    gf_cpy(q->x,tmp);
+    gf_copy(q->x,tmp);
     gf_mul(tmp,p->y,gfac);
-    gf_cpy(q->y,tmp);
+    gf_copy(q->y,tmp);
     gf_mul(tmp,p->z,gfac);
-    gf_cpy(q->z,tmp);
+    gf_copy(q->z,tmp);
     gf_mul(tmp,p->t,gfac);
-    gf_cpy(q->t,tmp);
+    gf_copy(q->t,tmp);
 }
 
 static void gf_batch_invert (
@@ -1417,7 +1429,7 @@ static void gf_batch_invert (
     gf t1;
     assert(n>1);
   
-    gf_cpy(out[1], in[0]);
+    gf_copy(out[1], in[0]);
     int i;
     for (i=1; i<(int) (n-1); i++) {
         gf_mul(out[i+1], out[i], in[i]);
@@ -1428,9 +1440,9 @@ static void gf_batch_invert (
 
     for (i=n-1; i>0; i--) {
         gf_mul(t1, out[i], out[0]);
-        gf_cpy(out[i], t1);
+        gf_copy(out[i], t1);
         gf_mul(t1, out[0], in[i]);
-        gf_cpy(out[0], t1);
+        gf_copy(out[0], t1);
     }
 }
 
@@ -1447,15 +1459,15 @@ static void batch_normalize_niels (
     for (i=0; i<n; i++) {
         gf_mul(product, table[i]->a, zis[i]);
         gf_strong_reduce(product);
-        gf_cpy(table[i]->a, product);
+        gf_copy(table[i]->a, product);
         
         gf_mul(product, table[i]->b, zis[i]);
         gf_strong_reduce(product);
-        gf_cpy(table[i]->b, product);
+        gf_copy(table[i]->b, product);
         
         gf_mul(product, table[i]->c, zis[i]);
         gf_strong_reduce(product);
-        gf_cpy(table[i]->c, product);
+        gf_copy(table[i]->c, product);
     }
     
     decaf_bzero(product,sizeof(product));
@@ -1500,7 +1512,7 @@ void API_NS(precompute) (
 
             pt_to_pniels(pn_tmp, start);
             memcpy(table->table[idx], pn_tmp->n, sizeof(pn_tmp->n));
-            gf_cpy(zs[idx], pn_tmp->z);
+            gf_copy(zs[idx], pn_tmp->z);
 			
             if (j >= (1u<<(t-1)) - 1) break;
             int delta = (j+1) ^ ((j+1)>>1) ^ gray;
@@ -1733,7 +1745,7 @@ void API_NS(precompute_wnafs) (
     prepare_wnaf_table(tmp,base,DECAF_WNAF_FIXED_TABLE_BITS);
     for (i=0; i<1<<DECAF_WNAF_FIXED_TABLE_BITS; i++) {
         memcpy(out[i], tmp[i]->n, sizeof(niels_t));
-        gf_cpy(zs[i], tmp[i]->z);
+        gf_copy(zs[i], tmp[i]->z);
     }
     batch_normalize_niels(out, (const gf *)zs, zis, 1<<DECAF_WNAF_FIXED_TABLE_BITS);
     
diff --git a/src/gen_headers/f_field_h.py b/src/gen_headers/f_field_h.py
index 388faba..420b588 100644
--- a/src/gen_headers/f_field_h.py
+++ b/src/gen_headers/f_field_h.py
@@ -13,8 +13,9 @@ f_field_h = gen_file(
 #include "word.h"
 
 #define __DECAF_%(gf_shortname)s_GF_DEFINED__ 1
+#define NLIMBS (%(gf_impl_bits)d/sizeof(word_t)/8)
 typedef struct gf_%(gf_shortname)s_s {
-    word_t limb[%(gf_impl_bits)d/sizeof(word_t)/8];
+    word_t limb[NLIMBS];
 } __attribute__((aligned(32))) gf_%(gf_shortname)s_s, gf_%(gf_shortname)s_t[1];
 
 #define GF_LIT_LIMB_BITS  %(gf_lit_limb_bits)d
@@ -33,6 +34,7 @@ typedef struct gf_%(gf_shortname)s_s {
 #define gf_isr            gf_%(gf_shortname)s_isr
 #define gf_serialize      gf_%(gf_shortname)s_serialize
 #define gf_deserialize    gf_%(gf_shortname)s_deserialize
+#define MODULUS           gf_%(gf_shortname)s_MODULUS
 
 #define SQRT_MINUS_ONE    P%(gf_shortname)s_SQRT_MINUS_ONE /* might not be defined */
 
@@ -42,6 +44,8 @@ typedef struct gf_%(gf_shortname)s_s {
 extern "C" {
 #endif
 
+extern const gf MODULUS; /* declaration only: the object is defined once, in the field's f_arithmetic.c */
+
 /* Defined below in f_impl.h */
 static INLINE_UNUSED void gf_copy (gf out, const gf a) { *out = *a; }
 static INLINE_UNUSED void gf_add_RAW (gf out, const gf a, const gf b);
@@ -61,4 +65,9 @@ mask_t gf_deserialize (gf x, const uint8_t serial[(GF_BITS-1)/8+1]);
 #endif
 
 #include "f_impl.h" /* Bring in the inline implementations */
+
+#ifndef LIMBPERM
+  #define LIMBPERM(i) (i)
+#endif
+#define LIMB_MASK(i) (((1ull)<<LIMB_PLACE_VALUE(i))-1)
 """)
diff --git a/src/p25519/arch_32/f_impl.c b/src/p25519/arch_32/f_impl.c
index cfc3fb3..b8c3a5c 100644
--- a/src/p25519/arch_32/f_impl.c
+++ b/src/p25519/arch_32/f_impl.c
@@ -91,88 +91,37 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
 }
 
 void gf_strong_reduce (gf a) {
-    uint32_t maske = (1<<26)-1, masko = (1<<25)-1;
-
     /* first, clear high */
     a->limb[0] += (a->limb[9]>>25)*19;
-    a->limb[9] &= masko;
+    a->limb[9] &= LIMB_MASK(9);
 
     /* now the total is less than 2p */
 
     /* compute total_value - p.  No need to reduce mod p. */
-    int64_t scarry = 0;
-    int i;
-    for (i=0; i<10; /*i+=2*/) {
-        scarry = scarry + a->limb[i] - ((i==0)?maske-18:maske);
-        a->limb[i] = scarry & maske;
-        scarry >>= 26;
-        i++;
-
-        scarry = scarry + a->limb[i] - masko;
-        a->limb[i] = scarry & masko;
-        scarry >>= 25;
-        i++;
+    dsword_t scarry = 0;
+    for (unsigned int i=0; i<10; i++) {
+        scarry = scarry + a->limb[i] - MODULUS->limb[i];
+        a->limb[i] = scarry & LIMB_MASK(i);
+        scarry >>= LIMB_PLACE_VALUE(i);
     }
 
     /* uncommon case: it was >= p, so now scarry = 0 and this = x
      * common case: it was < p, so now scarry = -1 and this = x - p + 2^255
      * so let's add back in p.  will carry back off the top for 2^255.
      */
-
     assert(word_is_zero(scarry) | word_is_zero(scarry+1));
 
-    uint32_t scarry_masko = scarry & masko, scarry_maske = scarry & maske;
-    uint64_t carry = 0;
+    word_t scarry_0 = scarry;
+    dword_t carry = 0;
 
     /* add it back */
-    for (i=0; i<10; /*i+=2*/) {
-        carry = carry + a->limb[i] + ((i==0)?(scarry_maske&~18):scarry_maske);
-        a->limb[i] = carry & maske;
-        carry >>= 26;
-        i++;
-
-        carry = carry + a->limb[i] + scarry_masko;
-        a->limb[i] = carry & masko;
-        carry >>= 25;
+    /* one limb per iteration: i is advanced only by the loop header, so odd limbs get p added back too */
+    for (unsigned int i=0; i<10; i++) {
+        carry = carry + a->limb[i] + (scarry_0 & MODULUS->limb[i]);
+        a->limb[i] = carry & LIMB_MASK(i);
+        carry >>= LIMB_PLACE_VALUE(i);
-        i++;
     }
 
-    assert(word_is_zero(carry + scarry));
-}
-
-#define LIMB_PLACE_VALUE(i) (((i)&1)?25:26)
-void gf_serialize (uint8_t serial[32], const gf x) {
-    gf red;
-    gf_copy(red, x);
-    gf_strong_reduce(red);
-    unsigned int j=0, fill=0;
-    dword_t buffer = 0;
-    for (unsigned int i=0; i<32; i++) {
-        if (fill < 8 && j < sizeof(red->limb)/sizeof(red->limb[0])) {
-            buffer |= ((dword_t)red->limb[j]) << fill;
-            fill += LIMB_PLACE_VALUE(j);
-            j++;
-        }
-        serial[i] = buffer;
-        fill -= 8;
-        buffer >>= 8;
-    }
+    assert(word_is_zero(carry + scarry_0));
 }
 
-mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
-    unsigned int j=0, fill=0;
-    dword_t buffer = 0;
-    for (unsigned int i=0; i<32; i++) {
-        buffer |= ((dword_t)serial[i]) << fill;
-        fill += 8;
-        if (fill >= LIMB_PLACE_VALUE(j) || i == 31) {
-            assert(j < sizeof(x->limb)/sizeof(x->limb[0]));
-            word_t mask = ((1ull)<<LIMB_PLACE_VALUE(j))-1;
-            x->limb[j] = (i==31) ? buffer : (buffer & mask); // FIXME: this can in theory truncate the buffer if it's not in field.
-            buffer >>= LIMB_PLACE_VALUE(j);
-            fill -= LIMB_PLACE_VALUE(j);
-            j++;
-        }
-    }
-    return -1; // FIXME: test whether in field.
-}
diff --git a/src/p25519/arch_32/f_impl.h b/src/p25519/arch_32/f_impl.h
index 5e51bf0..f917fa0 100644
--- a/src/p25519/arch_32/f_impl.h
+++ b/src/p25519/arch_32/f_impl.h
@@ -3,8 +3,9 @@
  */
 
 #define LIMB(x) (x##ull)&((1ull<<26)-1), (x##ull)>>26
-#define FIELD_LITERAL(a,b,c,d,e) \
-    {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e)}}
+#define FIELD_LITERAL(a,b,c,d,e) {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e)}}
+
+#define LIMB_PLACE_VALUE(i) (((i)&1)?25:26)
 
 void gf_add_RAW (gf out, const gf a, const gf b) {
     for (unsigned int i=0; i<10; i++) {
diff --git a/src/p25519/arch_ref64/f_impl.c b/src/p25519/arch_ref64/f_impl.c
index 414fd66..ec98829 100644
--- a/src/p25519/arch_ref64/f_impl.c
+++ b/src/p25519/arch_ref64/f_impl.c
@@ -97,45 +97,3 @@ void gf_strong_reduce (gf a) {
 
     assert(word_is_zero(carry + scarry));
 }
-
-void gf_serialize (uint8_t serial[32], const gf x) {
-    int i,j;
-    gf red;
-    gf_copy(red, x);
-    gf_strong_reduce(red);
-    uint64_t *r = red->limb;
-    uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
-    for (i=0; i<4; i++) {
-        for (j=0; j<8; j++) {
-            serial[8*i+j] = ser64[i];
-            ser64[i] >>= 8;
-        }
-    }
-}
-
-mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
-    int i,j;
-    uint64_t ser64[4], mask = ((1ull<<51)-1);
-    for (i=0; i<4; i++) {
-        uint64_t out = 0;
-        for (j=0; j<8; j++) {
-            out |= ((uint64_t)serial[8*i+j])<<(8*j);
-        }
-        ser64[i] = out;
-    }
-    
-    /* Test for >= 2^255-19 */
-    uint64_t ge = -(((__uint128_t)ser64[0]+19)>>64);
-    ge &= ser64[1];
-    ge &= ser64[2];
-    ge &= (ser64[3]<<1) + 1;
-    ge |= -(((__uint128_t)ser64[3]+0x8000000000000000)>>64);
-    
-    x->limb[0] = ser64[0] & mask;
-    x->limb[1] = (ser64[0]>>51 | ser64[1]<<13) & mask;
-    x->limb[2] = (ser64[1]>>38 | ser64[2]<<26) & mask;
-    x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask;
-    x->limb[4] = ser64[3]>>12;
-    
-    return ~word_is_zero(~ge);
-}
diff --git a/src/p25519/arch_ref64/f_impl.h b/src/p25519/arch_ref64/f_impl.h
index dcd097d..c4c472f 100644
--- a/src/p25519/arch_ref64/f_impl.h
+++ b/src/p25519/arch_ref64/f_impl.h
@@ -4,6 +4,8 @@
 
 #define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}
 
+#define LIMB_PLACE_VALUE(i) 51
+
 void gf_add_RAW (gf out, const gf a, const gf b) {
     for (unsigned int i=0; i<5; i++) {
         out->limb[i] = a->limb[i] + b->limb[i];
diff --git a/src/p25519/arch_x86_64/f_impl.c b/src/p25519/arch_x86_64/f_impl.c
index 0b02519..81af981 100644
--- a/src/p25519/arch_x86_64/f_impl.c
+++ b/src/p25519/arch_x86_64/f_impl.c
@@ -208,45 +208,3 @@ void gf_strong_reduce (gf a) {
 
     assert(word_is_zero(carry + scarry));
 }
-
-void gf_serialize (uint8_t serial[32], const gf x) {
-    int i,j;
-    gf red;
-    gf_copy(red, x);
-    gf_strong_reduce(red);
-    uint64_t *r = red->limb;
-    uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
-    for (i=0; i<4; i++) {
-        for (j=0; j<8; j++) {
-            serial[8*i+j] = ser64[i];
-            ser64[i] >>= 8;
-        }
-    }
-}
-
-mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
-    int i,j;
-    uint64_t ser64[4], mask = ((1ull<<51)-1);
-    for (i=0; i<4; i++) {
-        uint64_t out = 0;
-        for (j=0; j<8; j++) {
-            out |= ((uint64_t)serial[8*i+j])<<(8*j);
-        }
-        ser64[i] = out;
-    }
-    
-    /* Test for >= 2^255-19 */
-    uint64_t ge = -(((__uint128_t)ser64[0]+19)>>64);
-    ge &= ser64[1];
-    ge &= ser64[2];
-    ge &= (ser64[3]<<1) + 1;
-    ge |= -(((__uint128_t)ser64[3]+0x8000000000000000)>>64);
-    
-    x->limb[0] = ser64[0] & mask;
-    x->limb[1] = (ser64[0]>>51 | ser64[1]<<13) & mask;
-    x->limb[2] = (ser64[1]>>38 | ser64[2]<<26) & mask;
-    x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask;
-    x->limb[4] = ser64[3]>>12;
-    
-    return ~word_is_zero(~ge);
-}
diff --git a/src/p25519/arch_x86_64/f_impl.h b/src/p25519/arch_x86_64/f_impl.h
index 3461a6c..647f966 100644
--- a/src/p25519/arch_x86_64/f_impl.h
+++ b/src/p25519/arch_x86_64/f_impl.h
@@ -4,6 +4,8 @@
 
 #define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}
 
+#define LIMB_PLACE_VALUE(i) 51
+
 void gf_add_RAW (gf out, const gf a, const gf b) {
     for (unsigned int i=0; i<5; i++) {
         out->limb[i] = a->limb[i] + b->limb[i];
diff --git a/src/p25519/f_arithmetic.c b/src/p25519/f_arithmetic.c
index a3749d6..f348307 100644
--- a/src/p25519/f_arithmetic.c
+++ b/src/p25519/f_arithmetic.c
@@ -18,14 +18,17 @@ const gf_25519_t P25519_SQRT_MINUS_ONE = {FIELD_LITERAL(
     0x78595a6804c9e,
     0x2b8324804fc1d
 )};
+
+const gf MODULUS = {FIELD_LITERAL(
+    0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff
+)}; /* p = 2^255 - 19, written as five 51-bit chunks, least significant first */
     
 /* TODO put in header */
 extern const gf_25519_t decaf_255_ONE;
 extern mask_t decaf_255_gf_eq(const gf_25519_t a, const gf_25519_t b);
 
 /* Guarantee: a^2 x = 0 if x = 0; else a^2 x = 1 or SQRT_MINUS_ONE; */
-void 
-gf_isr (
+void gf_isr (
     gf_25519_t a,
     const gf_25519_t x
 ) {
diff --git a/src/p448/arch_32/f_impl.c b/src/p448/arch_32/f_impl.c
index 24e8fe2..1131def 100644
--- a/src/p448/arch_32/f_impl.c
+++ b/src/p448/arch_32/f_impl.c
@@ -142,53 +142,3 @@ void gf_strong_reduce (gf a) {
 
     assert(word_is_zero(carry + scarry));
 }
-
-void gf_serialize (uint8_t *serial, const gf x) {
-    int i,j;
-    gf red;
-    gf_copy(red, x);
-    gf_strong_reduce(red);
-    for (i=0; i<8; i++) {
-        uint64_t limb = red->limb[2*i] + (((uint64_t)red->limb[2*i+1])<<28);
-        for (j=0; j<7; j++) {
-            serial[7*i+j] = limb;
-            limb >>= 8;
-        }
-        assert(limb == 0);
-    }
-}
-
-mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
-    int i,j;
-    for (i=0; i<8; i++) {
-        uint64_t out = 0;
-        for (j=0; j<7; j++) {
-            out |= ((uint64_t)serial[7*i+j])<<(8*j);
-        }
-        x->limb[2*i] = out & ((1ull<<28)-1);
-        x->limb[2*i+1] = out >> 28;
-    }
-    
-    /* Check for reduction.
-     *
-     * The idea is to create a variable ge which is all ones (rather, 56 ones)
-     * if and only if the low $i$ words of $x$ are >= those of p.
-     *
-     * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
-     */
-    uint32_t ge = -1, mask = (1ull<<28)-1;
-    for (i=0; i<8; i++) {
-        ge &= x->limb[i];
-    }
-    
-    /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
-    ge = (ge & (x->limb[8] + 1)) | word_is_zero(x->limb[8] ^ mask);
-    
-    /* Propagate the rest */
-    for (i=9; i<16; i++) {
-        ge &= x->limb[i];
-    }
-    
-    return ~word_is_zero(ge ^ mask);
-}
-
diff --git a/src/p448/arch_32/f_impl.h b/src/p448/arch_32/f_impl.h
index a82452f..330a29c 100644
--- a/src/p448/arch_32/f_impl.h
+++ b/src/p448/arch_32/f_impl.h
@@ -5,6 +5,8 @@
 #define LIMB(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
     {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}}
+    
+#define LIMB_PLACE_VALUE(i) 28
 
 void gf_add_RAW (gf out, const gf a, const gf b) {
     for (unsigned int i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
diff --git a/src/p448/arch_arm_32/f_impl.c b/src/p448/arch_arm_32/f_impl.c
index b1719ad..be58706 100644
--- a/src/p448/arch_arm_32/f_impl.c
+++ b/src/p448/arch_arm_32/f_impl.c
@@ -834,9 +834,7 @@ void gf_mulw (
     c[1] += accum8 >> 28;
 }
 
-void gf_strong_reduce (
-    gf a
-) {
+void gf_strong_reduce (gf a) {
     word_t mask = (1ull<<28)-1;
 
     /* first, clear high */
@@ -875,59 +873,3 @@ void gf_strong_reduce (
 
     assert(word_is_zero(carry + scarry));
 }
-
-void gf_serialize (
-    uint8_t *serial,
-    const gf x
-) {
-    int i,j;
-    gf red;
-    gf_copy(red, x);
-    gf_strong_reduce(red);
-    for (i=0; i<8; i++) {
-        uint64_t limb = red->limb[2*i] + (((uint64_t)red->limb[2*i+1])<<28);
-        for (j=0; j<7; j++) {
-            serial[7*i+j] = limb;
-            limb >>= 8;
-        }
-        assert(limb == 0);
-    }
-}
-
-mask_t
-gf_deserialize (
-    gf x,
-    const uint8_t serial[56]
-) {
-    int i,j;
-    for (i=0; i<8; i++) {
-        uint64_t out = 0;
-        for (j=0; j<7; j++) {
-            out |= ((uint64_t)serial[7*i+j])<<(8*j);
-        }
-        x->limb[2*i] = out & ((1ull<<28)-1);
-        x->limb[2*i+1] = out >> 28;
-    }
-    
-    /* Check for reduction.
-     *
-     * The idea is to create a variable ge which is all ones (rather, 56 ones)
-     * if and only if the low $i$ words of $x$ are >= those of p.
-     *
-     * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
-     */
-    uint32_t ge = -1, mask = (1ull<<28)-1;
-    for (i=0; i<8; i++) {
-        ge &= x->limb[i];
-    }
-    
-    /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
-    ge = (ge & (x->limb[8] + 1)) | word_is_zero(x->limb[8] ^ mask);
-    
-    /* Propagate the rest */
-    for (i=9; i<16; i++) {
-        ge &= x->limb[i];
-    }
-    
-    return ~word_is_zero(ge ^ mask);
-}
diff --git a/src/p448/arch_arm_32/f_impl.h b/src/p448/arch_arm_32/f_impl.h
index 4392012..e193c34 100644
--- a/src/p448/arch_arm_32/f_impl.h
+++ b/src/p448/arch_arm_32/f_impl.h
@@ -5,6 +5,8 @@
 #define LIMB(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
     {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}}
+    
+#define LIMB_PLACE_VALUE(i) 28
 
 void gf_add_RAW (gf out, const gf a, const gf b) {
     for (unsigned int i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
diff --git a/src/p448/arch_neon/f_impl.c b/src/p448/arch_neon/f_impl.c
index 845f31e..94f4df2 100644
--- a/src/p448/arch_neon/f_impl.c
+++ b/src/p448/arch_neon/f_impl.c
@@ -684,4 +684,3 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
     
     return ~word_is_zero(ge ^ mask);
 }
-
diff --git a/src/p448/arch_neon/f_impl.h b/src/p448/arch_neon/f_impl.h
index 15f0cae..3c1845a 100644
--- a/src/p448/arch_neon/f_impl.h
+++ b/src/p448/arch_neon/f_impl.h
@@ -11,6 +11,8 @@
       LIMBLO(b),LIMBLO(f), LIMBHI(b),LIMBHI(f), \
       LIMBLO(c),LIMBLO(g), LIMBHI(c),LIMBHI(g), \
       LIMBLO(d),LIMBLO(h), LIMBHI(d),LIMBHI(h)}}
+    
+#define LIMB_PLACE_VALUE(i) 28
 
 void gf_add_RAW (gf out, const gf a, const gf b) {
     for (unsigned int i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
diff --git a/src/p448/arch_ref64/f_impl.h b/src/p448/arch_ref64/f_impl.h
index 161b919..4caf47b 100644
--- a/src/p448/arch_ref64/f_impl.h
+++ b/src/p448/arch_ref64/f_impl.h
@@ -3,6 +3,8 @@
  */
 
 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
+    
+#define LIMB_PLACE_VALUE(i) 56
 
 void gf_add_RAW (gf out, const gf a, const gf b) {
     for (unsigned int i=0; i<8; i++) {
diff --git a/src/p448/arch_x86_64/f_impl.c b/src/p448/arch_x86_64/f_impl.c
index 8ebb569..9523a50 100644
--- a/src/p448/arch_x86_64/f_impl.c
+++ b/src/p448/arch_x86_64/f_impl.c
@@ -329,51 +329,3 @@ void gf_strong_reduce (gf a) {
 
     assert(word_is_zero(carry + scarry));
 }
-
-void gf_serialize (uint8_t *serial, const gf x) {
-    int i,j;
-    gf red;
-    gf_copy(red, x);
-    gf_strong_reduce(red);
-    for (i=0; i<8; i++) {
-        for (j=0; j<7; j++) {
-            serial[7*i+j] = red->limb[i];
-            red->limb[i] >>= 8;
-        }
-        assert(red->limb[i] == 0);
-    }
-}
-
-mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
-    int i,j;
-    for (i=0; i<8; i++) {
-        word_t out = 0;
-        for (j=0; j<7; j++) {
-            out |= ((word_t)serial[7*i+j])<<(8*j);
-        }
-        x->limb[i] = out;
-    }
-    
-    /* Check for reduction.
-     *
-     * The idea is to create a variable ge which is all ones (rather, 56 ones)
-     * if and only if the low $i$ words of $x$ are >= those of p.
-     *
-     * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
-     */
-    word_t ge = -1, mask = (1ull<<56)-1;
-    for (i=0; i<4; i++) {
-        ge &= x->limb[i];
-    }
-    
-    /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
-    ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask);
-    
-    /* Propagate the rest */
-    for (i=5; i<8; i++) {
-        ge &= x->limb[i];
-    }
-    
-    return ~word_is_zero(ge ^ mask);
-}
-
diff --git a/src/p448/arch_x86_64/f_impl.h b/src/p448/arch_x86_64/f_impl.h
index a62e1b4..f69ba1f 100644
--- a/src/p448/arch_x86_64/f_impl.h
+++ b/src/p448/arch_x86_64/f_impl.h
@@ -3,6 +3,7 @@
  */
 
 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
+#define LIMB_PLACE_VALUE(i) 56
 
 void gf_add_RAW (gf out, const gf a, const gf b) {
     for (unsigned int i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
diff --git a/src/p448/f_arithmetic.c b/src/p448/f_arithmetic.c
index afb6792..d631c81 100644
--- a/src/p448/f_arithmetic.c
+++ b/src/p448/f_arithmetic.c
@@ -10,6 +10,11 @@
 
 #include "field.h"
 
+const gf MODULUS = {FIELD_LITERAL(
+    0xffffffffffffff, 0xffffffffffffff, 0xffffffffffffff, 0xffffffffffffff, 
+    0xfffffffffffffe, 0xffffffffffffff, 0xffffffffffffff, 0xffffffffffffff
+)}; /* p = 2^448 - 2^224 - 1, written as eight 56-bit chunks, least significant first */
+
 void 
 gf_isr (
     gf a,
diff --git a/src/p480/arch_x86_64/f_impl.c b/src/p480/arch_x86_64/f_impl.c
index e021241..7a63a57 100644
--- a/src/p480/arch_x86_64/f_impl.c
+++ b/src/p480/arch_x86_64/f_impl.c
@@ -329,65 +329,3 @@ void gf_strong_reduce (gf *a) {
 
     assert(word_is_zero(carry + scarry));
 }
-
-void gf_serialize (uint8_t *serial, const struct gf *x) {
-    int i,j,k=0;
-    gf red;
-    gf_copy(&red, x);
-    gf_strong_reduce(&red);
-    word_t r = 0;
-    for (i=0; i<8; i+=2) {
-        r = red.limb[i];
-        for (j=0; j<7; j++) {
-            serial[k++] = r;
-            r >>= 8;
-        }
-        assert(r<16);
-        r += red.limb[i+1]<<4;
-        for (j=0; j<8; j++) {
-            serial[k++] = r;
-            r >>= 8;
-        }
-        assert(r==0);
-    }
-}
-
-mask_t gf_deserialize (gf *x, const uint8_t serial[60]) {
-    int i,j,k=0;
-
-    for (i=0; i<8; i+=2) {
-        word_t r = 0;
-        for (j=0; j<8; j++) {
-            r |= ((word_t)serial[k++])<<(8*j);
-        }
-        x->limb[i] = r & ((1ull<<60)-1);
-        r >>= 60;
-        for (j=0; j<7; j++) {
-            r |= ((word_t)serial[k++])<<(8*j+4);
-        }
-        x->limb[i+1] = r;
-    }
-    
-    /* Check for reduction.
-     *
-     * The idea is to create a variable ge which is all ones (rather, 60 ones)
-     * if and only if the low $i$ words of $x$ are >= those of p.
-     *
-     * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
-     */
-    word_t ge = -1, mask = (1ull<<60)-1;
-    for (i=0; i<4; i++) {
-        ge &= x->limb[i];
-    }
-    
-    /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
-    ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask);
-    
-    /* Propagate the rest */
-    for (i=5; i<8; i++) {
-        ge &= x->limb[i];
-    }
-    
-    return ~word_is_zero(ge ^ mask);
-}
-
diff --git a/src/p480/arch_x86_64/f_impl.h b/src/p480/arch_x86_64/f_impl.h
index d501eb3..272125f 100644
--- a/src/p480/arch_x86_64/f_impl.h
+++ b/src/p480/arch_x86_64/f_impl.h
@@ -2,6 +2,8 @@
  * Released under the MIT License.  See LICENSE.txt for license information.
  */
 
+#define LIMB_PLACE_VALUE(i) 60
+
 void gf_add_RAW (gf out, const gf a, const gf b) {
     for (unsigned int i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
         ((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)b)[i];
@@ -26,12 +28,6 @@ void gf_sub_RAW (gf out, const gf a, const gf b) {
     */
 }
 
-void gf_copy (gf out, const gf a) {
-    for (unsigned int i=0; i<sizeof(*out)/sizeof(big_register_t); i++) {
-        ((big_register_t *)out)[i] = ((const big_register_t *)a)[i];
-    }
-}
-
 void gf_bias (
     gf a, int amt
 ) {
diff --git a/src/p480/f_arithmetic.c b/src/p480/f_arithmetic.c
index 227cdfe..e516c93 100644
--- a/src/p480/f_arithmetic.c
+++ b/src/p480/f_arithmetic.c
@@ -10,6 +10,12 @@
 
 #include "field.h"
 
+
+const gf MODULUS = {FIELD_LITERAL( /* field modulus as eight 60-bit limb literals; only the fifth (...ffe) is not all ones */
+    0xfffffffffffffff, 0xfffffffffffffff, 0xfffffffffffffff, 0xfffffffffffffff, 
+    0xffffffffffffffe, 0xfffffffffffffff, 0xfffffffffffffff, 0xfffffffffffffff
+)};
+
 void 
 gf_isr (
     gf_a_t a,
diff --git a/src/p521/arch_ref64/f_impl.c b/src/p521/arch_ref64/f_impl.c
index c3aee6f..0c0bc0c 100644
--- a/src/p521/arch_ref64/f_impl.c
+++ b/src/p521/arch_ref64/f_impl.c
@@ -320,49 +320,3 @@ void gf_strong_reduce (gf a) {
 
     assert(word_is_zero(carry + scarry));
 }
-
-void gf_serialize (uint8_t *serial, const struct gf x) {
-    int i,k=0;
-    gf red;
-    gf_copy(&red, x);
-    gf_strong_reduce(&red);
-    
-    uint64_t r=0;
-    int bits = 0;
-    for (i=0; i<9; i++) {
-        r |= red.limb[i] << bits;
-        for (bits += 58; bits >= 8; bits -= 8) {
-            serial[k++] = r;
-            r >>= 8;
-        }
-        assert(bits <= 6);
-    }
-    assert(bits);
-    serial[k++] = r;
-}
-
-mask_t gf_deserialize (gf x, const uint8_t serial[66]) {
-    int i,k=0,bits=0;
-    __uint128_t out = 0;
-    uint64_t mask = (1ull<<58)-1;
-    for (i=0; i<9; i++) {
-        out >>= 58;
-        for (; bits<58; bits+=8) {
-            out |= ((__uint128_t)serial[k++])<<bits;
-        }
-        x->limb[i] = out & mask;
-        bits -= 58;
-    }
-    
-    /* Check for reduction.  First, high has to be < 2^57 */
-    mask_t good = word_is_zero(out>>57);
-    
-    uint64_t and = -1ull;
-    for (i=0; i<8; i++) {
-        and &= x->limb[i];
-    }
-    and &= (2*out+1);
-    good &= word_is_zero((and+1)>>58);
-    
-    return good;
-}
diff --git a/src/p521/arch_ref64/f_impl.h b/src/p521/arch_ref64/f_impl.h
index e9d631a..42a37e6 100644
--- a/src/p521/arch_ref64/f_impl.h
+++ b/src/p521/arch_ref64/f_impl.h
@@ -2,6 +2,8 @@
  * Released under the MIT License.  See LICENSE.txt for license information.
  */
 
+#define LIMB_PLACE_VALUE(i) 58 /* bits carried by limb i; uniform 58 here (presumably read by the shared ser/deser code -- TODO confirm) */
+
 void gf_add_RAW (gf out, const gf a, const gf b) {
     for (unsigned int i=0; i<9; i++) {
         out->limb[i] = a->limb[i] + b->limb[i];
diff --git a/src/p521/arch_x86_64_r12/f_impl.c b/src/p521/arch_x86_64_r12/f_impl.c
index 2040531..8de3642 100644
--- a/src/p521/arch_x86_64_r12/f_impl.c
+++ b/src/p521/arch_x86_64_r12/f_impl.c
@@ -389,51 +389,3 @@ void gf_strong_reduce (gf *a) {
 
     a->limb[3] = a->limb[7] = a->limb[11] = 0;
 }
-
-void gf_serialize (uint8_t *serial, const struct gf *x) {
-    unsigned int i,k=0;
-    gf red;
-    gf_copy(&red, x);
-    gf_strong_reduce(&red);
-    
-    uint64_t r=0;
-    int bits = 0;
-    for (i=0; i<9; i++) {
-        r |= red.limb[LIMBPERM(i)] << bits;
-        for (bits += 58; bits >= 8; bits -= 8) {
-            serial[k++] = r;
-            r >>= 8;
-        }
-        assert(bits <= 6);
-    }
-    assert(bits);
-    serial[k++] = r;
-}
-
-mask_t gf_deserialize (gf *x, const uint8_t serial[LIMBPERM(66)]) {
-    int i,k=0,bits=0;
-    __uint128_t out = 0;
-    uint64_t mask = (1ull<<58)-1;
-    for (i=0; i<9; i++) {
-        out >>= 58;
-        for (; bits<58; bits+=8) {
-            out |= ((__uint128_t)serial[k++])<<bits;
-        }
-        x->limb[LIMBPERM(i)] = out & mask;
-        bits -= 58;
-    }
-    
-    /* Check for reduction.  First, high has to be < 2^57 */
-    mask_t good = word_is_zero(out>>57);
-    
-    uint64_t and = -1ull;
-    for (i=0; i<8; i++) {
-        and &= x->limb[LIMBPERM(i)];
-    }
-    and &= (2*out+1);
-    good &= word_is_zero((and+1)>>58);
-
-    x->limb[3] = x->limb[7] = x->limb[11] = 0;
-    
-    return good;
-}
diff --git a/src/p521/arch_x86_64_r12/f_impl.h b/src/p521/arch_x86_64_r12/f_impl.h
index 434a114..4f9e965 100644
--- a/src/p521/arch_x86_64_r12/f_impl.h
+++ b/src/p521/arch_x86_64_r12/f_impl.h
@@ -4,6 +4,7 @@
 
 /* FIXME: Currently this file desn't work at all, because the struct is declared [9] and not [12] */
 #define LIMBPERM(x) (((x)%3)*4 + (x)/3)
+#define LIMB_PLACE_VALUE(i) ((((i)&3)==3) ? 0 : 58) /* bits carried by physical limb slot i: slots 3, 7, 11 are zero padding, all others are 58-bit limbs */
 #define USE_P521_3x3_TRANSPOSE
 
 typedef uint64x4_t uint64x3_t; /* fit it in a vector register */
diff --git a/src/p521/f_arithmetic.c b/src/p521/f_arithmetic.c
index 7ce39d8..a0c774a 100644
--- a/src/p521/f_arithmetic.c
+++ b/src/p521/f_arithmetic.c
@@ -10,6 +10,12 @@
 
 #include "field.h"
 
+const gf MODULUS = {FIELD_LITERAL( /* field modulus: eight all-ones 58-bit literals plus one all-ones 57-bit literal (521 one-bits in total) */
+    0x3ffffffffffffff, 0x3ffffffffffffff, 0x3ffffffffffffff,
+    0x3ffffffffffffff, 0x3ffffffffffffff, 0x3ffffffffffffff,
+    0x3ffffffffffffff, 0x3ffffffffffffff, 0x1ffffffffffffff
+)};
+
 void 
 gf_isr (
     gf_a_t a,