p521 testing, 803kcy ecdh

10 years ago · 1d07343067
--- a/HISTORY.txt
+++ b/HISTORY.txt
@@ -1,3 +1,47 @@
 October 27, 2014:
    Added more support for >512-bit primes.  Changed shared secret
    to not overflow the buffer in this case.  Changed hashing to
    SHA512-PRNG; this doesn't change the behavior in the case that
    only one block is required.

    E-521 appears to be working.  Needs more testing, and maybe some
    careful analysis since the carry-handling bounds are awfully tight
    under the current analysis (the "< 5<<57" that it #if0 asserts is
    not actually tight enough under the current analysis; need
    it to be < (1+epsilon) << 59).
    
    So you actually do need to reduce often, at least in the x86_64_r12
    version.
    
    p521/arch_ref64: simple and relatively slow impl.  Like
    p448/arch_ref64, this arch reduces after every add or sub.
    
    p521/arch_x86_64_r12: aggressive, fast implementation.  This impl
    stores 521 bits not in 9 limbs, but 12!  Limbs 3,7,11 are 0, and
    are there only for vector alignment.  (TODO: remove those limbs
    from precomputed tables, so that we don't have to look them up!).
    The carry handling on this build is very tight, and probably could
    stand more analysis.  This is why I have the careful balancing of
    "hexad" and "nonad" multiplies in its Chung-Hasan mul routine.
    
    The 'r12 build is a work in progress, and currently only works on
    clang (because it rearranges vectors in the timesW function).
    
    Timings for the fast, aggressive arch on Haswell:
        mul:    146cy
        sqr:    111cy
        invert: 62kcy
        
        keygen: 270kcy
        ecdh:   803kcy
        sign:   283kcy
        verif:  907kcy
        
    Same rules as other Goldi benchmarks.  Turbo off, HT off,
    timing-channel protected (no dataflow from secrets to branches,
    memory lookups or known vt instructions), compressed points.
    

 October 23, 2014:
    Pushing through changes for curve flexibility.  First up is
    Ed480-Ridinghood, because it has the same number of words.  Next
--- a/src/ec_point.c
+++ b/src/ec_point.c
@@ -12,7 +12,7 @@
 #include "ec_point.h"
 #include "magic.h"

 #define is32 (GOLDI_BITS == 32 || FIELD_BITS == 480)
 #define is32 (GOLDI_BITS == 32 || FIELD_BITS != 448)
 /* TODO XXX PERF FIXME: better detection of overflow conditions */

 /* I wanted to just use if (is32)
--- a/src/goldilocks.c
+++ b/src/goldilocks.c
@@ -243,6 +243,8 @@ goldilocks_shared_secret_core (
    const struct goldilocks_public_key_t *your_pubkey,
    const struct goldilocks_precomputed_public_key_t *pre
 ) {
    uint8_t gxy[GOLDI_FIELD_BYTES];
    
    /* This function doesn't actually need anything in goldilocks_global,
     * so it doesn't check init.
     */
@@ -277,7 +279,7 @@ goldilocks_shared_secret_core (
 #endif
    
    
    field_serialize(shared,&pk);
    field_serialize(gxy,&pk);
    
    /* obliterate records of our failure by adjusting with obliteration key */
    struct sha512_ctx_t ctx;
@@ -305,7 +307,7 @@ goldilocks_shared_secret_core (
 #endif
       
    /* stir in the shared key and finish */
    sha512_update(&ctx, shared, GOLDI_FIELD_BYTES);
    sha512_update(&ctx, gxy, GOLDI_FIELD_BYTES);
    sha512_final(&ctx, shared);
    
    return (GOLDI_ECORRUPT & ~msucc)
--- a/src/include/word.h
+++ b/src/include/word.h
@@ -154,6 +154,14 @@ typedef word_t vecmask_t __attribute__((vector_size(32)));
        return (big_register_t)x;
    }
 #endif

 typedef struct {
    uint64xn_t unaligned;
 } __attribute__((packed)) unaligned_uint64xn_t;

 typedef struct {
    uint32xn_t unaligned;
 } __attribute__((packed)) unaligned_uint32xn_t;
    
 /**
 * Return -1 if x==0, and 0 otherwise.
--- a/src/p521/arch_x86_64_r12/arch_config.h
+++ b/src/p521/arch_x86_64_r12/arch_config.h
--- a/src/p521/arch_x86_64_r12/p521.c
+++ b/src/p521/arch_x86_64_r12/p521.c
@@ -4,41 +4,20 @@

 #include "p521.h"

 typedef uint64x4_t uint64x3_t; /* fit it in a vector register */
 static const uint64x3_t mask58 = { (1ull<<58) - 1, (1ull<<58) - 1, (1ull<<58) - 1, 0 };

 typedef struct {
  uint64x3_t lo, hi;
 } hexad_t;

 /* Currently requires CLANG.  Sorry. */
 static inline uint64x3_t timesW (uint64x3_t u) {
  return u.zxyw + u.zwww;
 }

 /* Store three vectors.  Currently requries AVX2 (TODO: remove) */
 static const uint64x4_t ls_mask_3 = { -1ull, -1ull, -1ull, 0 };
 static void store3 (uint64_t *x, uint64x3_t v) {
  _mm256_maskstore_epi64((long long *) x, ls_mask_3, v);
 }
  uint64x3_t lo, hi, hier;
 } nonad_t;

 static __inline__ uint64_t is_zero(uint64_t a) {
    /* let's hope the compiler isn't clever enough to optimize this. */
    return (((__uint128_t)a)-1)>>64;
 }

 static __inline__ __uint128_t widemul(
    const uint64_t a,
    const uint64_t b
 ) {
    return ((__uint128_t)a) * ((__uint128_t)b);
 }

 static inline __uint128_t widemulu(const uint64_t a, const uint64_t b) {
 static inline __uint128_t widemulu(uint64_t a, uint64_t b) {
    return ((__uint128_t)(a)) * b;
 }

 static inline __int128_t widemuls(const int64_t a, const int64_t b) {
 static inline __int128_t widemuls(int64_t a, int64_t b) {
    return ((__int128_t)(a)) * b;
 }
 
@@ -48,8 +27,9 @@ static inline uint64_t opacify(uint64_t x) {
    return x;
 }

 static inline void hexad_mul (
  hexad_t *hex,
 /* These used to be hexads, leading to 10% better performance, but there were overflow issues */
 static inline void nonad_mul (
  nonad_t *hex,
  const uint64_t *a,
  const uint64_t *b
 ) {
@@ -76,15 +56,13 @@ static inline void hexad_mul (
    lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 },
    hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 };

    hi = hi<<6 | lo>>58;
    lo &= mask58;

    hex->lo = lo;
    hex->hi = hi;
    hex->hier = hi>>52;
    hex->hi = (hi<<12)>>6 | lo>>58;
    hex->lo = lo & mask58;
 }

 static inline void hexad_mul_signed (
  hexad_t *hex,
  nonad_t *hex,
  const int64_t *a,
  const int64_t *b
 ) {
@@ -111,15 +89,18 @@ static inline void hexad_mul_signed (
    lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 },
    hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 };

    hi = hi<<6 | lo>>58;
    lo &= mask58;

    hex->lo = lo;
    hex->hi = hi;
    /*
    hex->hier = (uint64x4_t)((int64x4_t)hi>>52);
    hex->hi = (hi<<12)>>6 | lo>>58;
    hex->lo = lo & mask58;
    */
    
    hex->hi = hi<<6 | lo>>58;
    hex->lo = lo & mask58;
 }

 static inline void hexad_sqr (
  hexad_t *hex,
 static inline void nonad_sqr (
  nonad_t *hex,
  const uint64_t *a
 ) {
    __uint128_t xu, xv, xw;
@@ -143,15 +124,13 @@ static inline void hexad_sqr (
    lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 },
    hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 };

    hi = hi<<6 | lo>>58;
    lo &= mask58;

    hex->lo = lo;
    hex->hi = hi;
    hex->hier = hi>>52;
    hex->hi = (hi<<12)>>6 | lo>>58;
    hex->lo = lo & mask58;
 }

 static inline void hexad_sqr_signed (
  hexad_t *hex,
  nonad_t *hex,
  const int64_t *a
 ) {
    __uint128_t xu, xv, xw;
@@ -175,11 +154,15 @@ static inline void hexad_sqr_signed (
    lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 },
    hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 };

    hi = hi<<6 | lo>>58;
    lo &= mask58;

    hex->lo = lo;
    hex->hi = hi;
    /*
    hex->hier = (uint64x4_t)((int64x4_t)hi>>52);
    hex->hi = (hi<<12)>>6 | lo>>58;
    hex->lo = lo & mask58;
    */
    
    hex->hi = hi<<6 | lo>>58;
    hex->lo = lo & mask58;
 }


@@ -190,51 +173,83 @@ p521_mul (
    const p521_t *as,
    const p521_t *bs
 ) {
    int i;
    
 #if 0
    assert(as->limb[3] == 0 && as->limb[7] == 0 && as->limb[11] == 0);
    assert(bs->limb[3] == 0 && bs->limb[7] == 0 && bs->limb[11] == 0);
    for (i=0; i<12; i++) {
        assert(as->limb[i] < 5ull<<57);
        assert(bs->limb[i] < 5ull<<57);
    }
 #endif
    
    /* Bounds on the hexads and nonads.
     *
     * Limbs < 2<<58 + ep.
     * Nonad mul < 1<<58, 1<<58, tiny
     * -> t0 < (3,2,2)<<58 + tiny
     * t1,t2 < 2<<58 + tiny
     *   * w < (4,2,2)
     * Hexad mul < +- (5,4,3) * 4<<116 -> 2^58 lo, +- (5,4,3) * 4<<58+ep
     * TimesW < (2,1,1)<<58, (6,5,4)*4<<58 + ep
    
     * ot2 = t0 + timesW(t2 + t1 - acdf.hi - bcef.lo);
         == (3,2,2) + (4,2,2) + (4,2,2) +- (6,5,4)*4 - (1) << 58
         in (-25, +35) << 58

    uint64x3_t ot0 = t0 + timesW(t2 + t1 - acdf.hi - bcef.lo);
    uint64x3_t ot1 = t0 + t1 - abde.lo + timesW(t2 - bcef.hi);
    uint64x3_t ot2 = t0 + t1 + t2 - abde.hi - acdf.lo + vhi2;
     
     */
    
    
    uint64_t *c = cs->limb;
    const uint64_t *a = as->limb, *b = bs->limb;

  hexad_t ad, be, cf, abde, bcef, acdf;
  hexad_mul(&ad, &a[0], &b[0]);
  hexad_mul(&be, &a[3], &b[3]);
  hexad_mul(&cf, &a[6], &b[6]);
  
  uint64_t amt = 32;
  uint64x3_t vhi = { amt*((1ull<<58)-1), amt*((1ull<<58)-1), amt*((1ull<<58)-1), 0 },
    nonad_t ad, be, cf, abde, bcef, acdf;
    nonad_mul(&ad, &a[0], &b[0]);
    nonad_mul(&be, &a[4], &b[4]);
    nonad_mul(&cf, &a[8], &b[8]);

    uint64_t amt = 26;
    uint64x3_t vhi = { amt*((1ull<<58)-1), amt*((1ull<<58)-1), amt*((1ull<<58)-1), 0 },
    vhi2 = { 0, 0, -amt<<57, 0 };

  uint64x3_t t0 = cf.lo + be.hi, t1 = ad.lo + timesW(cf.hi) + vhi, t2 = ad.hi + be.lo;

  int64_t ta[3], tb[3];
  // it seems to be faster not to vectorize these loops
  for (int i=0; i<3; i++) {
    ta[i] = a[i]-a[i+3];
    tb[i] = b[i]-b[i+3];
  }
  hexad_mul_signed(&abde,ta,tb);

  for (int i=0; i<3; i++) {
    ta[i] = a[i+3]-a[i+6];
    tb[i] = b[i+3]-b[i+6];
  }
  hexad_mul_signed(&bcef,ta,tb);

  for (int i=0; i<3; i++) {
    ta[i] = a[i]-a[i+6];
    tb[i] = b[i]-b[i+6];
  }
  hexad_mul_signed(&acdf,ta,tb);

  uint64x3_t ot0 = t1 + timesW(t0 + t2 - acdf.hi - bcef.lo);
  uint64x3_t ot1 = t1 + t2 - abde.lo + timesW(t0 - bcef.hi);
  uint64x3_t ot2 = t1 + t2 + t0 - abde.hi - acdf.lo + vhi2;
  
  uint64x3_t out0 = (ot0 & mask58) + timesW(ot2>>58);
  uint64x3_t out1 = (ot1 & mask58) + (ot0>>58);
  uint64x3_t out2 = (ot2 & mask58) + (ot1>>58);
  
  store3(&c[0], out0);
  store3(&c[3], out1);
  store3(&c[6], out2);
    uint64x3_t t2 = cf.lo + be.hi + ad.hier, t0 = ad.lo + timesW(cf.hi + be.hier) + vhi, t1 = ad.hi + be.lo + timesW(cf.hier);

    int64_t ta[4] VECTOR_ALIGNED, tb[4] VECTOR_ALIGNED;
    // it seems to be faster not to vectorize these loops
    for (i=0; i<3; i++) {
        ta[i] = a[i]-a[i+4];
        tb[i] = b[i]-b[i+4];
    }
    hexad_mul_signed(&abde,ta,tb);

    for (i=0; i<3; i++) {
        ta[i] = a[i+4]-a[i+8];
        tb[i] = b[i+4]-b[i+8];
    }
    hexad_mul_signed(&bcef,ta,tb);

    for (i=0; i<3; i++) {
        ta[i] = a[i]-a[i+8];
        tb[i] = b[i]-b[i+8];
    }
    hexad_mul_signed(&acdf,ta,tb);

    uint64x3_t ot0 = t0 + timesW(t2 + t1 - acdf.hi - bcef.lo);
    uint64x3_t ot1 = t0 + t1 - abde.lo + timesW(t2 - bcef.hi);
    uint64x3_t ot2 = t0 + t1 + t2 - abde.hi - acdf.lo + vhi2;

    uint64x3_t out0 = (ot0 & mask58) + timesW(ot2>>58);
    uint64x3_t out1 = (ot1 & mask58) + (ot0>>58);
    uint64x3_t out2 = (ot2 & mask58) + (ot1>>58);

    *(uint64x4_t *)&c[0] = out0;
    *(uint64x4_t *)&c[4] = out1;
    *(uint64x4_t *)&c[8] = out2;
 }


@@ -243,48 +258,58 @@ p521_sqr (
    p521_t *__restrict__ cs,
    const p521_t *as
 ) {
    

    int i;
 #if 0
    assert(as->limb[3] == 0 && as->limb[7] == 0 && as->limb[11] == 0);
    for (i=0; i<12; i++) {
        assert(as->limb[i] < 5ull<<57);
    }
 #endif

    uint64_t *c = cs->limb;
    const uint64_t *a = as->limb;

  hexad_t ad, be, cf, abde, bcef, acdf;
  hexad_sqr(&ad, &a[0]);
  hexad_sqr(&be, &a[3]);
  hexad_sqr(&cf, &a[6]);
  
  uint64_t amt = 32;
  uint64x3_t vhi = { amt*((1ull<<58)-1), amt*((1ull<<58)-1), amt*((1ull<<58)-1), 0 },
    nonad_t ad, be, cf, abde, bcef, acdf;
    nonad_sqr(&ad, &a[0]);
    nonad_sqr(&be, &a[4]);
    nonad_sqr(&cf, &a[8]);

    uint64_t amt = 26;
    uint64x3_t vhi = { amt*((1ull<<58)-1), amt*((1ull<<58)-1), amt*((1ull<<58)-1), 0 },
    vhi2 = { 0, 0, -amt<<57, 0 };
    
    uint64x3_t t2 = cf.lo + be.hi + ad.hier, t0 = ad.lo + timesW(cf.hi + be.hier) + vhi, t1 = ad.hi + be.lo + timesW(cf.hier);

    int64_t ta[4] VECTOR_ALIGNED;
    // it seems to be faster not to vectorize these loops
    for (i=0; i<3; i++) {
        ta[i] = a[i]-a[i+4];
    }
    hexad_sqr_signed(&abde,ta);

    for (i=0; i<3; i++) {
        ta[i] = a[i+4]-a[i+8];
    }
    hexad_sqr_signed(&bcef,ta);

    for (i=0; i<3; i++) {
        ta[i] = a[i]-a[i+8];
    }
    hexad_sqr_signed(&acdf,ta);

    uint64x3_t ot0 = t0 + timesW(t2 + t1 - acdf.hi - bcef.lo);
    uint64x3_t ot1 = t0 + t1 - abde.lo + timesW(t2 - bcef.hi);
    uint64x3_t ot2 = t0 + t1 + t2 - abde.hi - acdf.lo + vhi2;

    uint64x3_t out0 = (ot0 & mask58) + timesW(ot2>>58);
    uint64x3_t out1 = (ot1 & mask58) + (ot0>>58);
    uint64x3_t out2 = (ot2 & mask58) + (ot1>>58);

  uint64x3_t t0 = cf.lo + be.hi, t1 = ad.lo + timesW(cf.hi) + vhi, t2 = ad.hi + be.lo;

  int64_t ta[3];
  // it seems to be faster not to vectorize these loops
  for (int i=0; i<3; i++) {
    ta[i] = a[i]-a[i+3];
  }
  hexad_sqr_signed(&abde,ta);

  for (int i=0; i<3; i++) {
    ta[i] = a[i+3]-a[i+6];
  }
  hexad_sqr_signed(&bcef,ta);

  for (int i=0; i<3; i++) {
    ta[i] = a[i]-a[i+6];
  }
  hexad_sqr_signed(&acdf,ta);

  uint64x3_t ot0 = t1 + timesW(t0 + t2 - acdf.hi - bcef.lo);
  uint64x3_t ot1 = t1 + t2 - abde.lo + timesW(t0 - bcef.hi);
  uint64x3_t ot2 = t1 + t2 + t0 - abde.hi - acdf.lo + vhi2;
  
  uint64x3_t out0 = (ot0 & mask58) + timesW(ot2>>58);
  uint64x3_t out1 = (ot1 & mask58) + (ot0>>58);
  uint64x3_t out2 = (ot2 & mask58) + (ot1>>58);
  
  store3(&c[0], out0);
  store3(&c[3], out1);
  store3(&c[6], out2);
    *(uint64x4_t *)&c[0] = out0;
    *(uint64x4_t *)&c[4] = out1;
    *(uint64x4_t *)&c[8] = out2;
 }

 void
@@ -293,37 +318,59 @@ p521_mulw (
    const p521_t *as,
    uint64_t b
 ) {
    
    

 #if 0
    int i;
    assert(as->limb[3] == 0 && as->limb[7] == 0 && as->limb[11] == 0);
    for (i=0; i<12; i++) {
        assert(as->limb[i] < 1ull<<61);
    }
    assert(b < 1ull<<61);
 #endif
    
    
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum3 = 0, accum6 = 0;
    uint64_t mask = (1ull<<58) - 1;  

    int i;
    for (i=0; i<3; i++) {
        accum0 += widemul(b, a[LIMBPERM(i)]);
        accum3 += widemul(b, a[LIMBPERM(i+3)]);
        accum6 += widemul(b, a[LIMBPERM(i+6)]);
        c[LIMBPERM(i)]   = accum0 & mask; accum0 >>= 58;
        c[LIMBPERM(i+3)] = accum3 & mask; accum3 >>= 58;
        if (i==2) { 
            c[LIMBPERM(i+6)] = accum6 & (mask>>1); accum6 >>= 57;
        } else {
            c[LIMBPERM(i+6)] = accum6 & mask; accum6 >>= 58;
        }
    }
    uint64_t mask = (1ull<<58) - 1;

    accum0 += widemulu(b, a[0]);
    accum3 += widemulu(b, a[1]);
    accum6 += widemulu(b, a[2]);
    c[0] = accum0 & mask; accum0 >>= 58;
    c[1] = accum3 & mask; accum3 >>= 58;
    c[2] = accum6 & mask; accum6 >>= 58;

    accum0 += widemulu(b, a[4]);
    accum3 += widemulu(b, a[5]);
    accum6 += widemulu(b, a[6]);
    c[4] = accum0 & mask; accum0 >>= 58;
    c[5] = accum3 & mask; accum3 >>= 58;
    c[6] = accum6 & mask; accum6 >>= 58;

    accum0 += widemulu(b, a[8]);
    accum3 += widemulu(b, a[9]);
    accum6 += widemulu(b, a[10]);
    c[8] = accum0 & mask; accum0 >>= 58;
    c[9] = accum3 & mask; accum3 >>= 58;
    c[10] = accum6 & (mask>>1); accum6 >>= 57;
    
    accum0 += c[LIMBPERM(3)];
    c[LIMBPERM(3)] = accum0 & mask;
    c[LIMBPERM(4)] += accum0 >> 58;
    accum0 += c[1];
    c[1] = accum0 & mask;
    c[5] += accum0 >> 58;

    accum3 += c[LIMBPERM(6)];
    c[LIMBPERM(6)] = accum3 & mask;
    c[LIMBPERM(7)] += accum3 >> 58;
    accum3 += c[2];
    c[2] = accum3 & mask;
    c[6] += accum3 >> 58;

    accum6 += c[LIMBPERM(0)];
    c[LIMBPERM(0)] = accum6 & mask;
    c[LIMBPERM(1)] += accum6 >> 58;
    accum6 += c[0];
    c[0] = accum6 & mask;
    c[4] += accum6 >> 58;
    
    c[3] = c[7] = c[11] = 0;
 }


@@ -366,6 +413,8 @@ p521_strong_reduce (
    }

    assert(is_zero(carry + scarry));

    a->limb[3] = a->limb[7] = a->limb[11] = 0;
 }

 mask_t
@@ -377,8 +426,8 @@ p521_is_zero (
    p521_strong_reduce(&b);

    uint64_t any = 0;
    int i;
    for (i=0; i<9; i++) {
    unsigned int i;
    for (i=0; i<sizeof(b)/sizeof(b.limb[0]); i++) {
        any |= b.limb[i];
    }
    return is_zero(any);
@@ -389,7 +438,7 @@ p521_serialize (
    uint8_t *serial,
    const struct p521_t *x
 ) {
    int i,k=0;
    unsigned int i,k=0;
    p521_t red;
    p521_copy(&red, x);
    p521_strong_reduce(&red);
@@ -430,10 +479,12 @@ p521_deserialize (
    
    uint64_t and = -1ull;
    for (i=0; i<8; i++) {
        and &= x->limb[i];
        and &= x->limb[LIMBPERM(i)];
    }
    and &= (2*out+1);
    good &= is_zero((and+1)>>58);

    x->limb[3] = x->limb[7] = x->limb[11] = 0;
    
    return good;
 }
--- a/src/p521/arch_x86_64_r12/p521.h
+++ b/src/p521/arch_x86_64_r12/p521.h
@@ -9,13 +9,14 @@
 #include <string.h>

 #include "word.h"
 #include "constant_time.h"

 #define LIMBPERM(x) (((x)%3)*3 + (x)/3)
 #define LIMBPERM(x) (((x)%3)*4 + (x)/3)
 #define USE_P521_3x3_TRANSPOSE

 typedef struct p521_t {
  uint64_t limb[9];
 } p521_t;
  uint64_t limb[12];
 } __attribute__((aligned(32))) p521_t;

 #ifdef __cplusplus
 extern "C" {
@@ -85,12 +86,6 @@ p521_bias (
    p521_t *inout,
    int amount
 ) __attribute__((unused));

 static __inline__ void
 p521_really_bias (
    p521_t *inout,
    int amount
 ) __attribute__((unused));
         
 void
 p521_mul (
@@ -126,6 +121,19 @@ p521_deserialize (

 /* -------------- Inline functions begin here -------------- */

 typedef uint64x4_t uint64x3_t; /* fit it in a vector register */

 static const uint64x3_t mask58 = { (1ull<<58) - 1, (1ull<<58) - 1, (1ull<<58) - 1, 0 };

 /* Currently requires CLANG.  Sorry. */
 static inline uint64x3_t
 __attribute__((unused))
 timesW (
  uint64x3_t u
 ) {
  return u.zxyw + u.zwww;
 }

 void
 p521_set_ui (
    p521_t *out,
@@ -133,7 +141,7 @@ p521_set_ui (
 ) {
    int i;
    out->limb[0] = x;
    for (i=1; i<9; i++) {
    for (i=1; i<12; i++) {
      out->limb[i] = 0;
    }
 }
@@ -145,10 +153,9 @@ p521_add (
    const p521_t *b
 ) {
    unsigned int i;
    for (i=0; i<9; i++) {
        out->limb[i] = a->limb[i] + b->limb[i];
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)b)[i];
    }
    p521_weak_reduce(out);
 }

 void
@@ -158,11 +165,9 @@ p521_sub (
    const p521_t *b
 ) {
    unsigned int i;
    uint64_t co1 = ((1ull<<58)-1)*4, co2 = ((1ull<<57)-1)*4;
    for (i=0; i<9; i++) {
        out->limb[i] = a->limb[i] - b->limb[i] + ((i==8) ? co2 : co1);
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] - ((const uint64xn_t*)b)[i];
    }
    p521_weak_reduce(out);
 }

 void
@@ -171,11 +176,9 @@ p521_neg (
    const p521_t *a
 ) {
    unsigned int i;
    uint64_t co1 = ((1ull<<58)-1)*4, co2 = ((1ull<<57)-1)*4;
    for (i=0; i<9; i++) {
        out->limb[i] = ((i==8) ? co2 : co1) - a->limb[i];
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)out)[i] = -((const uint64xn_t*)a)[i];
    }
    p521_weak_reduce(out);
 }

 void
@@ -183,9 +186,7 @@ p521_addw (
    p521_t *a,
    uint64_t x
 ) {
  a->limb[0] += x;
  a->limb[LIMBPERM(1)] += a->limb[0]>>58;
  a->limb[0] &= (1ull<<58)-1;
    a->limb[0] += x;
 }
             
 void
@@ -193,9 +194,7 @@ p521_subw (
    p521_t *a,
    uint64_t x
 ) {
  a->limb[0] -= x;
  p521_really_bias(a, 1);
  p521_weak_reduce(a);
    a->limb[0] -= x;
 }

 void
@@ -206,38 +205,42 @@ p521_copy (
    memcpy(out,a,sizeof(*a));
 }

 void
 p521_really_bias (
    p521_t *a,
    int amt
 ) {
    uint64_t co1 = ((1ull<<58)-1)*2*amt, co2 = ((1ull<<57)-1)*2*amt;
    int i;
    for (i=0; i<9; i++) {
        a->limb[i] += (i==8) ? co2 : co1;
    }
 }

 void
 p521_bias (
    p521_t *a,
    int amt
 ) {
    (void) a;
    (void) amt;
    uint64_t co0 = ((1ull<<58)-2)*amt, co1 = ((1ull<<58)-1)*amt;
    uint64x4_t vlo = { co0, co1, co1, 0 }, vhi = { co1, co1, co1, 0 };
    ((uint64x4_t*)a)[0] += vlo;
    ((uint64x4_t*)a)[1] += vhi;
    ((uint64x4_t*)a)[2] += vhi;
 }

 void
 p521_weak_reduce (
    p521_t *a
 ) {
    uint64_t mask = (1ull<<58) - 1;
    uint64_t tmp = a->limb[8] >> 57;
 #if 0
    int i;
    for (i=8; i>0; i--) {
        a->limb[LIMBPERM(i)] = (a->limb[LIMBPERM(i)] & ((i==8) ? mask>>1 : mask)) + (a->limb[LIMBPERM(i-1)]>>58);
    assert(a->limb[3] == 0 && a->limb[7] == 0 && a->limb[11] == 0);
    for (i=0; i<12; i++) {
        assert(a->limb[i] < 3ull<<61);
    }
    a->limb[0] = (a->limb[0] & mask) + tmp;
 #endif
    
    uint64x3_t
        ot0 = ((uint64x4_t*)a)[0],
        ot1 = ((uint64x4_t*)a)[1],
        ot2 = ((uint64x4_t*)a)[2];
    
    uint64x3_t out0 = (ot0 & mask58) + timesW(ot2>>58);
    uint64x3_t out1 = (ot1 & mask58) + (ot0>>58);
    uint64x3_t out2 = (ot2 & mask58) + (ot1>>58);

    ((uint64x4_t*)a)[0] = out0;
    ((uint64x4_t*)a)[1] = out1;
    ((uint64x4_t*)a)[2] = out2;
 }

 #ifdef __cplusplus
--- a/src/p521/magic.c
+++ b/src/p521/magic.c
@@ -44,12 +44,15 @@ const struct affine_t goldilocks_base_point = {
        U58LE(0x02a940a2f19ba6c),
        U58LE(0x3331c90d2c6ba52),
        U58LE(0x2878a3bfd9f42fc),
 	0,
        U58LE(0x03ec4cd920e2a8c),
        U58LE(0x0c6203913f6ecc5),
        U58LE(0x06277e432c8a5ac),
 	0,
        U58LE(0x1d568fc99c6059d),
        U58LE(0x1b2063b22fcf270),
        U58LE(0x0752cb45c48648b)
        U58LE(0x0752cb45c48648b),
 	0
 #else
        U58LE(0x02a940a2f19ba6c),
        U58LE(0x03ec4cd920e2a8c),
@@ -85,12 +88,15 @@ sqrt_d_minus_1 = {{
    U58LE(0x1e2be72c1c81990),
    U58LE(0x207dfc238a33e46),
    U58LE(0x2264cfb418c4c30),
    0,
    U58LE(0x1135002ad596c69),
    U58LE(0x0e30107cd79d1f6),
    U58LE(0x0524b9e715937f5),
    0,
    U58LE(0x2ab3a257a22666d),
    U58LE(0x2d80cc2936a1824),
    U58LE(0x0a9ea3ac10d6aed)
    U58LE(0x0a9ea3ac10d6aed),
    0
 #else
    U58LE(0x1e2be72c1c81990),
    U58LE(0x1135002ad596c69),
--- a/test/bench.c
+++ b/test/bench.c
@@ -106,6 +106,10 @@ int main(int argc, char **argv) {
    word_t sk[SCALAR_WORDS],tk[SCALAR_WORDS];
    q448_randomize(&crand, sk);
    
    memset(&a,0,sizeof(a));
    memset(&b,0,sizeof(b));
    memset(&c,0,sizeof(c));
    memset(&d,0,sizeof(d));
    when = now();
    for (i=0; i<nbase*5000; i++) {
        field_mul(&c, &b, &a);
@@ -206,6 +210,7 @@ int main(int argc, char **argv) {
    when = now() - when;
    printf("decompress:  %5.1fµs\n", when * 1e6 / i);
    
    convert_affine_to_extensible(&exta, &affine);
    when = now();
    for (i=0; i<nbase; i++) {
        serialize_extensible(&a, &exta);
@@ -262,6 +267,8 @@ int main(int argc, char **argv) {
    when = now() - when;
    printf("barrett mac: %5.1fns\n", when * 1e9 / i);
    
    memset(&ext,0,sizeof(ext));
    memset(&niels,0,sizeof(niels)); /* avoid assertions in p521 even though this isn't a valid ext or niels */
    when = now();
    for (i=0; i<nbase*100; i++) {
        add_tw_niels_to_tw_extensible(&ext, &niels);
@@ -269,6 +276,7 @@ int main(int argc, char **argv) {
    when = now() - when;
    printf("exti+niels:  %5.1fns\n", when * 1e9 / i);
    
    convert_tw_extensible_to_tw_pniels(&pniels, &ext);
    when = now();
    for (i=0; i<nbase*100; i++) {
        add_tw_pniels_to_tw_extensible(&ext, &pniels);
@@ -297,6 +305,7 @@ int main(int argc, char **argv) {
    when = now() - when;
    printf("a->i isog:   %5.1fns\n", when * 1e9 / i);
    
    memset(&mb,0,sizeof(mb));
    when = now();
    for (i=0; i<nbase*100; i++) {
        montgomery_step(&mb);
--- a/test/test_arithmetic.c
+++ b/test/test_arithmetic.c
@@ -20,6 +20,11 @@ static mask_t mpz_to_field (
    return succ;
 }

 static inline int BRANCH_ON_CONSTANT(int x) {
    __asm__ ("" : "+r"(x));
    return x;
 }

 static mask_t field_assert_eq_gmp(
    const char *descr,
    const struct field_t *a,
@@ -43,8 +48,15 @@ static mask_t field_assert_eq_gmp(
    unsigned int i;
    for (i=0; i<sizeof(*x)/sizeof(x->limb[0]); i++) {
        int radix_bits = 1 + (sizeof(x->limb[0]) * FIELD_BITS - 1) / sizeof(*x);
        word_t yardstick = (i==sizeof(*x)/sizeof(x->limb[0])/2) ?
            (1ull<<radix_bits) - 2 : (1ull<<radix_bits) - 1; // FIELD_MAGIC
        word_t yardstick;

        if (BRANCH_ON_CONSTANT(FIELD_BITS == 521 && sizeof(*x)==12*8)) {
            yardstick = (1ull<<58) - 1;
        } else {
            yardstick = (i==sizeof(*x)/sizeof(x->limb[0])/2) ?
                (1ull<<radix_bits) - 2 : (1ull<<radix_bits) - 1; // FIELD_MAGIC
        }

        if (x->limb[i] < yardstick * lowBound || x->limb[i] > yardstick * highBound) {
            youfail();
            printf("    Limb %d -> " PRIxWORDfull " is out of bounds (%0.2f, %0.2f) for test %s (yardstick = " PRIxWORDfull ")\n",
--- a/test/test_pointops.c
+++ b/test/test_pointops.c
@@ -274,7 +274,7 @@ int test_pointops (void) {

        #if (FIELD_BITS % 8)
            ser[FIELD_BYTES-1] &= (1<<(FIELD_BITS%8)) - 1;
 	#endif
        #endif
        
        /* TODO: we need a field generate, which can return random or pathological. */
        mask_t succ = field_deserialize(&serf, ser);
@@ -295,7 +295,7 @@ int test_pointops (void) {
        }
        
        ret = single_twisting_test(&base);
        if (ret) return ret;
        //if (ret) return ret;
    }
    
    return 0;
--- a/test/test_scalarmul.c
+++ b/test/test_scalarmul.c
@@ -44,6 +44,8 @@ single_scalarmul_compatibility_test (
    struct { int n,t,s; } params[] = {{5,5,18},{3,5,30},{4,4,28},{1,2,224}};
 #elif FIELD_BITS == 480
    struct { int n,t,s; } params[] = {{5,6,16},{6,5,16},{4,5,24},{4,4,30},{1,2,240}};
 #elif FIELD_BITS == 521
    struct { int n,t,s; } params[] = {{5,8,13},{4,5,26},{1,2,(SCALAR_BITS+1)/2}};
 #else
    struct { int n,t,s; } params[] = {{5,5,(SCALAR_BITS+24)/25},{1,2,(SCALAR_BITS+1)/2}};
 #endif