further reduce the code in f_impl.h

9 years ago · 2402788996
--- a/src/p25519/arch_ref64/f_impl.h
+++ b/src/p25519/arch_ref64/f_impl.h
@@ -4,14 +4,12 @@
 #ifndef __P25519_H__
 #define __P25519_H__ 1

 #include "f_field.h"

 #include <stdint.h>
 #include <assert.h>
 #include <string.h>

 #include "decaf/decaf_255.h"
 #include "word.h"

 #define LBITS 51
 #define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}

 #ifdef __cplusplus
@@ -20,54 +18,29 @@ extern "C" {

 /* -------------- Inline functions begin here -------------- */

 /* Diff view with the +/- markers lost: the gf_25519_-prefixed signature and
  * call are interleaved with the generic gf spelling of the same function.
  * NOTE(review): presumably the gf_25519_* lines are the removed side --
  * confirm against the real patch.
  *
  * Stores a->limb[i] + b->limb[i] into out->limb[i] for each of the five
  * limbs, then weak-reduces out.
  */
 void
 gf_25519_add_RAW (
    gf_25519_t out,
    const gf_25519_t a,
    const gf_25519_t b
 ) {
 void gf_add_RAW (gf out, const gf a, const gf b) {
    unsigned int i;
    for (i=0; i<5; i++) {
        out->limb[i] = a->limb[i] + b->limb[i];
    }
    gf_25519_weak_reduce(out);
    gf_weak_reduce(out);
 }

 /* Diff view with the +/- markers lost: the prefixed gf_25519_sub_RAW lines,
  * the generic gf_sub_RAW lines and a gf_25519_copy definition are
  * interleaved here.  NOTE(review): presumably gf_25519_copy and the
  * gf_25519_* spellings are the removed side, leaving gf_sub_RAW ending in
  * gf_weak_reduce(out) -- confirm against the real patch.
  *
  * gf_sub_RAW stores a->limb[i] - b->limb[i] plus a per-limb constant into
  * out->limb[i] for each of the five limbs, then weak-reduces out.
  */
 void
 gf_25519_sub_RAW (
    gf_25519_t out,
    const gf_25519_t a,
    const gf_25519_t b
 ) {
 void gf_sub_RAW (gf out, const gf a, const gf b) {
    unsigned int i;
    /* co1 = 2*(2^51-1); co2 = co1-36 = 2*(2^51-19) and is used for limb 0
     * only.  Together these are the radix-2^51 limbs of 2*(2^255-19);
     * presumably added so the unsigned limb difference cannot wrap below
     * zero -- TODO confirm the input bounds this relies on. */
    uint64_t co1 = ((1ull<<51)-1)*2, co2 = co1-36;
    for (i=0; i<5; i++) {
        out->limb[i] = a->limb[i] - b->limb[i] + ((i==0) ? co2 : co1);
    }
    gf_25519_weak_reduce(out);
 }

 /* gf_25519_copy: byte-copies *a into out with memcpy. */
 void
 gf_25519_copy (
    gf_25519_t out,
    const gf_25519_t a
 ) {
    memcpy(out,a,sizeof(*a));
    gf_weak_reduce(out);
 }

 /* Diff view with the +/- markers lost: the prefixed and the one-line
  * signatures are two spellings of the same function.
  *
  * Intentionally a no-op in this file: both arguments are only cast to void
  * to silence unused-parameter warnings.
  */
 void
 gf_25519_bias (
    gf_25519_t a,
    int amt
 ) {
 void gf_bias (gf a, int amt) {
    (void) a;
    (void) amt;
 }

 void
 gf_25519_weak_reduce (
    gf_25519_t a
 ) {
 void gf_weak_reduce (gf a) {
    uint64_t mask = (1ull<<51) - 1;
    uint64_t tmp = a->limb[4] >> 51;
    int i;
--- a/src/p25519/arch_x86_64/f_impl.h
+++ b/src/p25519/arch_x86_64/f_impl.h
@@ -4,36 +4,24 @@
 #ifndef __P25519_H__
 #define __P25519_H__ 1

 #include "f_field.h"

 #include <stdint.h>
 #include <assert.h>
 #include <string.h>

 #include "decaf/decaf_255.h"
 #include "word.h"

 #define DECAF_255_LIMB_BITS 51
 #define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}

 /* -------------- Inline functions begin here -------------- */

 /* Diff view with the +/- markers lost: the prefixed and the one-line
  * signatures are two spellings of the same function.
  *
  * Stores a->limb[i] + b->limb[i] into out->limb[i] for each of the five
  * limbs.  No reduction call is made in this body.
  */
 void
 gf_25519_add_RAW (
    gf_25519_t out,
    const gf_25519_t a,
    const gf_25519_t b
 ) {
 void gf_add_RAW (gf out, const gf a, const gf b) {
    unsigned int i;
    for (i=0; i<5; i++) {
        out->limb[i] = a->limb[i] + b->limb[i];
    }
 }

 void
 gf_25519_sub_RAW (
    gf_25519_t out,
    const gf_25519_t a,
    const gf_25519_t b
 ) {
 void gf_sub_RAW (gf out, const gf a, const gf b) {
    unsigned int i;
    uint64_t co1 = ((1ull<<51)-1)*2, co2 = co1-36;
    for (i=0; i<5; i++) {
@@ -41,11 +29,7 @@ gf_25519_sub_RAW (
    }
 }

 void
 gf_25519_bias (
    gf_25519_t a,
    int amt
 ) {
 void gf_bias (gf a, int amt) {
    a->limb[0] += ((uint64_t)(amt)<<52) - 38*amt;
    int i;
    for (i=1; i<5; i++) {
@@ -53,10 +37,7 @@ gf_25519_bias (
    }
 }

 void
 gf_25519_weak_reduce (
    gf_25519_t a
 ) {
 void gf_weak_reduce (gf a) {
    uint64_t mask = (1ull<<51) - 1;
    uint64_t tmp = a->limb[4] >> 51;
    int i;
--- a/src/p448/arch_32/f_impl.h
+++ b/src/p448/arch_32/f_impl.h
@@ -4,17 +4,12 @@
 #ifndef __P448_H__
 #define __P448_H__ 1

 #include "word.h"
 #include "f_field.h"

 #include <stdint.h>
 #include <assert.h>

 typedef struct gf_448_s {
  uint32_t limb[16];
 } __attribute__((aligned(32))) gf_448_s, gf_448_t[1];

 #define LBITS 28
 #define LIMB(x) (x##ull)&((1ull<<LBITS)-1), (x##ull)>>LBITS
 #define LIMB(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
    {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}}

@@ -24,12 +19,7 @@ extern "C" {

 /* -------------- Inline functions begin here -------------- */

 void
 gf_448_add_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
 void gf_add_RAW (gf out, const gf a, const gf b) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
        ((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] + ((const uint32xn_t*)b)[i];
@@ -42,12 +32,7 @@ gf_448_add_RAW (
    */
 }

 void
 gf_448_sub_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
 void gf_sub_RAW (gf out, const gf a, const gf b) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
        ((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] - ((const uint32xn_t*)b)[i];
@@ -60,11 +45,7 @@ gf_448_sub_RAW (
    */
 }

 void
 gf_448_bias (
    gf_448_t a,
    int amt
 ) {
 void gf_bias (gf a, int amt) {
    uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt;
    uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1};
    uint32x4_t *aa = (uint32x4_t*) a;
@@ -74,10 +55,7 @@ gf_448_bias (
    aa[3] += lo;
 }

 void
 gf_448_weak_reduce (
    gf_448_t a
 ) {
 void gf_weak_reduce (gf a) {
    uint64_t mask = (1ull<<28) - 1;
    uint64_t tmp = a->limb[15] >> 28;
    int i;
--- a/src/p448/arch_arm_32/f_impl.h
+++ b/src/p448/arch_arm_32/f_impl.h
@@ -4,17 +4,12 @@
 #ifndef __P448_H__
 #define __P448_H__ 1

 #include "word.h"
 #include "f_field.h"

 #include <stdint.h>
 #include <assert.h>

 typedef struct gf_448_s {
  uint32_t limb[16];
 } __attribute__((aligned(32))) gf_448_s, gf_448_t[1];

 #define LBITS 28
 #define LIMB(x) (x##ull)&((1ull<<LBITS)-1), (x##ull)>>LBITS
 #define LIMB(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
    {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}}

@@ -24,12 +19,7 @@ extern "C" {

 /* -------------- Inline functions begin here -------------- */

 void
 gf_448_add_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
 void gf_add_RAW (gf out, const gf a, const gf b) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
        ((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] + ((const uint32xn_t*)b)[i];
@@ -42,12 +32,7 @@ gf_448_add_RAW (
    */
 }

 void
 gf_448_sub_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
 void gf_sub_RAW (gf out, const gf a, const gf b) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
        ((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] - ((const uint32xn_t*)b)[i];
@@ -60,11 +45,7 @@ gf_448_sub_RAW (
    */
 }

 void
 gf_448_bias (
    gf_448_t a,
    int amt
 ) {
 void gf_bias (gf a, int amt) {
    uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt;
    uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1};
    uint32x4_t *aa = (uint32x4_t*) a;
@@ -74,10 +55,7 @@ gf_448_bias (
    aa[3] += lo;
 }

 void
 gf_448_weak_reduce (
    gf_448_t a
 ) {
 void gf_weak_reduce (gf a) {
    uint64_t mask = (1ull<<28) - 1;
    uint64_t tmp = a->limb[15] >> 28;
    int i;
--- a/src/p448/arch_neon_experimental/f_impl.h
+++ b/src/p448/arch_neon_experimental/f_impl.h
@@ -4,20 +4,15 @@
 #ifndef __P448_H__
 #define __P448_H__ 1

 #include "word.h"
 #include "f_field.h"

 #include <stdint.h>
 #include <assert.h>

 typedef struct gf_448_s {
  uint32_t limb[16];
 } __attribute__((aligned(32))) gf_448_s, gf_448_t[1];

 #define LIMBPERM(x) (((x)<<1 | (x)>>3) & 15)
 #define USE_NEON_PERM 1
 #define LBITS 28
 #define LIMBHI(x) ((x##ull)>>LBITS)
 #define LIMBLO(x) ((x##ull)&((1ull<<LBITS)-1))
 #define LIMBHI(x) ((x##ull)>>28)
 #define LIMBLO(x) ((x##ull)&((1ull<<28)-1))
 #  define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
    {{LIMBLO(a),LIMBLO(e), LIMBHI(a),LIMBHI(e), \
      LIMBLO(b),LIMBLO(f), LIMBHI(b),LIMBHI(f), \
@@ -30,24 +25,14 @@ extern "C" {
    
 /* -------------- Inline functions begin here -------------- */

 /* Diff view with the +/- markers lost: the prefixed and the one-line
  * signatures are two spellings of the same function.
  *
  * Adds a and b into out one uint32xn_t chunk at a time, covering
  * sizeof(*out) bytes.  No reduction call is made in this body.
  * NOTE(review): uint32xn_t is declared outside this view (presumably a
  * vector type from word.h) -- confirm.
  */
 void
 gf_448_add_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
 void gf_add_RAW (gf out, const gf a, const gf b) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
        ((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] + ((const uint32xn_t*)b)[i];
    }
 }

 void
 gf_448_sub_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
 void gf_sub_RAW (gf out, const gf a, const gf b) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
        ((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] - ((const uint32xn_t*)b)[i];
@@ -60,11 +45,7 @@ gf_448_sub_RAW (
    */
 }

 void
 gf_448_bias (
    gf_448_t a,
    int amt
 ) {
 void gf_bias (gf a, int amt) {
    uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt;
    uint32x4_t lo = {co1,co2,co1,co1}, hi = {co1,co1,co1,co1};
    uint32x4_t *aa = (uint32x4_t*) a;
@@ -74,10 +55,7 @@ gf_448_bias (
    aa[3] += hi;
 }

 void
 gf_448_weak_reduce (
    gf_448_t a
 ) {
 void gf_weak_reduce (gf a) {

    uint32x2_t *aa = (uint32x2_t*) a, vmask = {(1ull<<28)-1, (1ull<<28)-1}, vm2 = {0,-1},
       tmp = vshr_n_u32(aa[7],28);
--- a/src/p448/arch_ref64/f_impl.h
+++ b/src/p448/arch_ref64/f_impl.h
@@ -4,17 +4,12 @@
 #ifndef __P448_H__
 #define __P448_H__ 1

 #include "f_field.h"

 #include <stdint.h>
 #include <assert.h>
 #include <string.h>

 #include "word.h"

 typedef struct gf_448_s {
  uint64_t limb[8];
 } __attribute__((aligned(32))) gf_448_s, gf_448_t[1];

 #define LBITS 56
 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}

 #ifdef __cplusplus
@@ -23,46 +18,29 @@ extern "C" {

 /* -------------- Inline functions begin here -------------- */

 /* Diff view with the +/- markers lost: the gf_448_-prefixed signature and
  * call are interleaved with the generic gf spelling of the same function.
  * NOTE(review): presumably the gf_448_* lines are the removed side --
  * confirm against the real patch.
  *
  * Stores a->limb[i] + b->limb[i] into out->limb[i] for each of the eight
  * limbs, then weak-reduces out.
  */
 void
 gf_448_add_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
 void gf_add_RAW (gf  out, const gf  a, const gf  b) {
    unsigned int i;
    for (i=0; i<8; i++) {
        out->limb[i] = a->limb[i] + b->limb[i];
    }
    gf_448_weak_reduce(out);
    gf_weak_reduce(out);
 }

 /* Diff view with the +/- markers lost: the gf_448_-prefixed signature and
  * call are interleaved with the generic gf spelling of the same function.
  *
  * Stores a->limb[i] - b->limb[i] plus a per-limb constant into out->limb[i]
  * for each of the eight limbs, then weak-reduces out.
  */
 void
 gf_448_sub_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
 void gf_sub_RAW (gf  out, const gf  a, const gf  b) {
    unsigned int i;
    /* co1 = 2*(2^56-1); co2 = co1-2 = 2*(2^56-2) and is used for limb 4
     * only.  Together these are the radix-2^56 limbs of 2*(2^448-2^224-1);
     * presumably added so the unsigned limb difference cannot wrap below
     * zero -- TODO confirm the input bounds this relies on. */
    uint64_t co1 = ((1ull<<56)-1)*2, co2 = co1-2;
    for (i=0; i<8; i++) {
        out->limb[i] = a->limb[i] - b->limb[i] + ((i==4) ? co2 : co1);
    }
    gf_448_weak_reduce(out);
    gf_weak_reduce(out);
 }

 /* Diff view with the +/- markers lost: the prefixed and the one-line
  * signatures are two spellings of the same function.
  *
  * Intentionally a no-op in this file: both arguments are only cast to void
  * to silence unused-parameter warnings.
  */
 void
 gf_448_bias (
    gf_448_t a,
    int amt
 ) {
 void gf_bias (gf  a, int amt) {
    (void) a;
    (void) amt;
 }

 void
 gf_448_weak_reduce (
    gf_448_t a
 ) {
 void gf_weak_reduce (gf  a) {
    uint64_t mask = (1ull<<56) - 1;
    uint64_t tmp = a->limb[7] >> 56;
    int i;
--- a/src/p448/arch_x86_64/f_impl.h
+++ b/src/p448/arch_x86_64/f_impl.h
@@ -4,13 +4,11 @@
 #ifndef __P448_H__
 #define __P448_H__ 1

 #include "f_field.h"

 #include <stdint.h>
 #include <assert.h>

 #include "decaf/decaf_448.h"
 #include "word.h"

 #define LBITS 56
 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}

 #ifdef __cplusplus
@@ -19,12 +17,7 @@ extern "C" {

 /* -------------- Inline functions begin here -------------- */

 void
 gf_448_add_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
 void gf_add_RAW (gf  out, const gf  a, const gf  b) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)b)[i];
@@ -37,12 +30,7 @@ gf_448_add_RAW (
    */
 }

 void
 gf_448_sub_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) {
 void gf_sub_RAW (gf  out, const gf  a, const gf  b) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] - ((const uint64xn_t*)b)[i];
@@ -55,11 +43,7 @@ gf_448_sub_RAW (
    */
 }

 void
 gf_448_bias (
    gf_448_t a,
    int amt
 ) {
 void gf_bias (gf  a, int amt) {
    uint64_t co1 = ((1ull<<56)-1)*amt, co2 = co1-amt;
    
 #if __AVX2__
@@ -82,10 +66,7 @@ gf_448_bias (
 #endif
 }

 void
 gf_448_weak_reduce (
    gf_448_t a
 ) {
 void gf_weak_reduce (gf  a) {
    /* PERF: use pshufb/palignr if anyone cares about speed of this */
    uint64_t mask = (1ull<<56) - 1;
    uint64_t tmp = a->limb[7] >> 56;
--- a/src/p480/arch_x86_64/f_impl.h
+++ b/src/p480/arch_x86_64/f_impl.h
@@ -1,78 +1,23 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __gf_480_H__
 #define __gf_480_H__ 1
 #ifndef __gf_H__
 #define __gf_H__ 1

 #include "f_field.h"

 #include <stdint.h>
 #include <assert.h>

 #include "word.h"

 typedef struct gf_480_t {
  uint64_t limb[8];
 } __attribute__((aligned(32))) gf_480_t;

 #ifdef __cplusplus
 extern "C" {
 #endif
             
 static __inline__ void
 gf_480_weak_reduce (
    gf_480_t *inout
 ) __attribute__((unused,always_inline));
             
 void
 gf_480_strong_reduce (
    gf_480_t *inout
 );
  
 static __inline__ void
 gf_480_bias (
    gf_480_t *inout,
    int amount
 ) __attribute__((unused,always_inline));
         
 void
 gf_480_mul (
    gf_480_t *__restrict__ out,
    const gf_480_t *a,
    const gf_480_t *b
 );

 void
 gf_480_mulw (
    gf_480_t *__restrict__ out,
    const gf_480_t *a,
    uint64_t b
 );

 void
 gf_480_sqr (
    gf_480_t *__restrict__ out,
    const gf_480_t *a
 );

 void
 gf_480_serialize (
    uint8_t *serial,
    const struct gf_480_t *x
 );

 mask_t
 gf_480_deserialize (
    gf_480_t *x,
    const uint8_t serial[60]
 );

 /* -------------- Inline functions begin here -------------- */

 void
 gf_480_add_RAW (
    gf_480_t *out,
    const gf_480_t *a,
    const gf_480_t *b
 ) {
 void gf_add_RAW (gf  *out, const gf  *a, const gf  *b) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)b)[i];
@@ -85,12 +30,7 @@ gf_480_add_RAW (
    */
 }

 void
 gf_480_sub_RAW (
    gf_480_t *out,
    const gf_480_t *a,
    const gf_480_t *b
 ) {
 void gf_sub_RAW (gf  *out, const gf  *a, const gf  *b) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] - ((const uint64xn_t*)b)[i];
@@ -103,21 +43,15 @@ gf_480_sub_RAW (
    */
 }

 /* Diff view with the +/- markers lost: the prefixed and the one-line
  * signatures are two spellings of the same function.
  *
  * Copies *a into *out one big_register_t chunk at a time, covering
  * sizeof(*out) bytes.
  * NOTE(review): big_register_t is declared outside this view (presumably in
  * word.h) -- confirm.
  */
 void
 gf_480_copy (
    gf_480_t *out,
    const gf_480_t *a
 ) {
 void gf_copy (gf  *out, const gf  *a) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(big_register_t); i++) {
        ((big_register_t *)out)[i] = ((const big_register_t *)a)[i];
    }
 }

 void
 gf_480_bias (
    gf_480_t *a,
    int amt
 void gf_bias (
    gf  *a, int amt
 ) {
    uint64_t co1 = ((1ull<<60)-1)*amt, co2 = co1-amt;
    
@@ -141,10 +75,7 @@ gf_480_bias (
 #endif
 }

 void
 gf_480_weak_reduce (
    gf_480_t *a
 ) {
 void gf_weak_reduce (gf  *a) {
    /* PERF: use pshufb/palignr if anyone cares about speed of this */
    uint64_t mask = (1ull<<60) - 1;
    uint64_t tmp = a->limb[7] >> 60;
@@ -160,4 +91,4 @@ gf_480_weak_reduce (
 }; /* extern "C" */
 #endif

 #endif /* __gf_480_H__ */
 #endif /* __gf_H__ */
--- a/src/p521/arch_ref64/f_impl.h
+++ b/src/p521/arch_ref64/f_impl.h
@@ -4,118 +4,41 @@
 #ifndef __P521_H__
 #define __P521_H__ 1

 #include "f_field.h"

 #include <stdint.h>
 #include <assert.h>
 #include <string.h>

 #include "word.h"

 typedef struct gf_521_t {
  uint64_t limb[9];
 } gf_521_t;

 #ifdef __cplusplus
 extern "C" {
 #endif
             
 static __inline__ void
 gf_521_weak_reduce (
    gf_521_t *inout
 ) __attribute__((unused));
             
 void
 gf_521_strong_reduce (
    gf_521_t *inout
 );

 static __inline__ void
 gf_521_bias (
    gf_521_t *inout,
    int amount
 ) __attribute__((unused));
         
 void
 gf_521_mul (
    gf_521_t *__restrict__ out,
    const gf_521_t *a,
    const gf_521_t *b
 );

 void
 gf_521_mulw (
    gf_521_t *__restrict__ out,
    const gf_521_t *a,
    uint64_t b
 );

 void
 gf_521_sqr (
    gf_521_t *__restrict__ out,
    const gf_521_t *a
 );

 void
 gf_521_serialize (
    uint8_t *serial,
    const struct gf_521_t *x
 );

 mask_t
 gf_521_deserialize (
    gf_521_t *x,
    const uint8_t serial[66]
 );

 /* -------------- Inline functions begin here -------------- */

 /* Diff view with the +/- markers lost: the gf_521_-prefixed signature and
  * call are interleaved with the generic gf spelling of the same function.
  * NOTE(review): presumably the gf_521_* lines are the removed side --
  * confirm against the real patch.
  *
  * Stores a->limb[i] + b->limb[i] into out->limb[i] for each of the nine
  * limbs, then weak-reduces out.
  */
 void
 gf_521_add_RAW (
    gf_521_t *out,
    const gf_521_t *a,
    const gf_521_t *b
 ) {
 void gf_add_RAW (gf  *out, const gf  *a, const gf  *b) {
    unsigned int i;
    for (i=0; i<9; i++) {
        out->limb[i] = a->limb[i] + b->limb[i];
    }
    gf_521_weak_reduce(out);
    gf_weak_reduce(out);
 }

 /* Diff view with the +/- markers lost: the prefixed gf_521_sub_RAW lines,
  * the generic gf_sub_RAW lines and a gf_521_copy definition are interleaved
  * here.  NOTE(review): presumably gf_521_copy and the gf_521_* spellings
  * are the removed side, leaving gf_sub_RAW ending in gf_weak_reduce(out) --
  * confirm against the real patch.
  *
  * gf_sub_RAW stores a->limb[i] - b->limb[i] plus a per-limb constant into
  * out->limb[i] for each of the nine limbs, then weak-reduces out.
  */
 void
 gf_521_sub_RAW (
    gf_521_t *out,
    const gf_521_t *a,
    const gf_521_t *b
 ) {
 void gf_sub_RAW (gf  *out, const gf  *a, const gf  *b) {
    unsigned int i;
    /* co1 = 4*(2^58-1); co2 = 4*(2^57-1) and is used for limb 8 only.
     * Together these are the limbs of 4*(2^521-1) with eight 58-bit limbs
     * and a 57-bit top limb; presumably added so the unsigned limb
     * difference cannot wrap below zero -- TODO confirm the input bounds
     * this relies on. */
    uint64_t co1 = ((1ull<<58)-1)*4, co2 = ((1ull<<57)-1)*4;
    for (i=0; i<9; i++) {
        out->limb[i] = a->limb[i] - b->limb[i] + ((i==8) ? co2 : co1);
    }
    gf_521_weak_reduce(out);
 }

 /* gf_521_copy: byte-copies *a into *out with memcpy. */
 void
 gf_521_copy (
    gf_521_t *out,
    const gf_521_t *a
 ) {
    memcpy(out,a,sizeof(*a));
    gf_weak_reduce(out);
 }

 /* Diff view with the +/- markers lost: the prefixed and the one-line
  * signatures are two spellings of the same function.
  *
  * Intentionally a no-op in this file: both arguments are only cast to void
  * to silence unused-parameter warnings.
  */
 void
 gf_521_bias (
    gf_521_t *a,
    int amt
 ) {
 void gf_bias (gf *a, int amt) {
    (void) a;
    (void) amt;
 }

 void
 gf_521_weak_reduce (
    gf_521_t *a
 ) {
 void gf_weak_reduce (gf  *a) {
    uint64_t mask = (1ull<<58) - 1;
    uint64_t tmp = a->limb[8] >> 57;
    int i;
--- a/src/p521/arch_x86_64_r12/f_impl.h
+++ b/src/p521/arch_x86_64_r12/f_impl.h
@@ -4,20 +4,18 @@
 #ifndef __P521_H__
 #define __P521_H__ 1

 #include "f_field.h"

 #include <stdint.h>
 #include <assert.h>
 #include <string.h>

 #include "word.h"
 #include "constant_time.h"

 /* FIXME: Currently doesn't work at all, because the struct is declared [9] and not [12] */
 #define LIMBPERM(x) (((x)%3)*4 + (x)/3)
 #define USE_P521_3x3_TRANSPOSE

 typedef struct gf_521_s {
  uint64_t limb[12];
 } __attribute__((aligned(32))) gf_521_t;

 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -29,43 +27,25 @@ typedef uint64x4_t uint64x3_t; /* fit it in a vector register */
 static const uint64x3_t mask58 = { (1ull<<58) - 1, (1ull<<58) - 1, (1ull<<58) - 1, 0 };

 /* Currently requires CLANG.  Sorry. */
 /* Diff view with the +/- markers lost: the multi-line definition (with
  * __attribute__((unused))) and the one-line definition are two spellings of
  * the same function, so the return statement appears twice.
  *
  * Returns the lane-wise sum of two swizzles of u: (z+z, x+w, y+w, w+w).
  * NOTE(review): what multiplication this implements is not visible here,
  * and the result is only a 3-lane rotate-plus-carry if lane w is zero --
  * confirm against the callers.
  */
 static inline uint64x3_t
 __attribute__((unused))
 timesW (
  uint64x3_t u
 ) {
  return u.zxyw + u.zwww;
 static inline uint64x3_t timesW (uint64x3_t u) {
    return u.zxyw + u.zwww;
 }

 /* Diff view with the +/- markers lost: the prefixed and the one-line
  * signatures are two spellings of the same function.
  *
  * Adds *a and *b into *out one uint64xn_t chunk at a time, covering
  * sizeof(*out) bytes.  No reduction call is made in this body.
  * NOTE(review): uint64xn_t is declared outside this view (presumably a
  * vector type from word.h) -- confirm.
  */
 void
 gf_521_add_RAW (
    gf_521_t *out,
    const gf_521_t *a,
    const gf_521_t *b
 ) {
 void gf_add_RAW (gf  *out, const gf  *a, const gf  *b) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)b)[i];
    }
 }

 /* Diff view with the +/- markers lost: the prefixed and the one-line
  * signatures are two spellings of the same function.
  *
  * Subtracts *b from *a into *out one uint64xn_t chunk at a time, covering
  * sizeof(*out) bytes.  No bias constant is added and no reduction call is
  * made in this body.
  * NOTE(review): uint64xn_t is declared outside this view (presumably a
  * vector type from word.h) -- confirm.
  */
 void
 gf_521_sub_RAW (
    gf_521_t *out,
    const gf_521_t *a,
    const gf_521_t *b
 ) {
 void gf_sub_RAW (gf  *out, const gf  *a, const gf  *b) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] - ((const uint64xn_t*)b)[i];
    }
 }

 void
 gf_521_bias (
    gf_521_t *a,
    int amt
 ) {
 void gf_bias (gf  *a, int amt) {
    uint64_t co0 = ((1ull<<58)-2)*amt, co1 = ((1ull<<58)-1)*amt;
    uint64x4_t vlo = { co0, co1, co1, 0 }, vhi = { co1, co1, co1, 0 };
    ((uint64x4_t*)a)[0] += vlo;
@@ -73,10 +53,7 @@ gf_521_bias (
    ((uint64x4_t*)a)[2] += vhi;
 }

 void
 gf_521_weak_reduce (
    gf_521_t *a
 ) {
 void gf_weak_reduce (gf  *a) {
 #if 0
    int i;
    assert(a->limb[3] == 0 && a->limb[7] == 0 && a->limb[11] == 0);
@@ -84,7 +61,6 @@ gf_521_weak_reduce (
        assert(a->limb[i] < 3ull<<61);
    }
 #endif
    
    uint64x3_t
        ot0 = ((uint64x4_t*)a)[0],
        ot1 = ((uint64x4_t*)a)[1],