Generate most of f_impl.h. Not yet tested on most architectures :-(

9 years ago · 4218223dd7
--- a/src/gen_headers/f_field_h.py
+++ b/src/gen_headers/f_field_h.py
@@ -9,22 +9,52 @@ f_field_h = gen_file(
 #include "constant_time.h"
 #include <string.h>

 #include "f_impl.h"

 #include "decaf/decaf_%(gf_bits)s.h" /* HACK in genheader */
 #include "word.h"

 #define GF_LIT_LIMB_BITS  %(gf_lit_limb_bits)d
 #define GF_BITS           %(gf_bits)d
 #define gf                gf_%(gf_shortname)s_t
 #define gf_s              gf_%(gf_shortname)s_s
 #define gf_mul            gf_%(gf_shortname)s_mul
 #define gf_sqr            gf_%(gf_shortname)s_sqr
 #define gf_copy           gf_%(gf_shortname)s_copy
 #define gf_add_RAW        gf_%(gf_shortname)s_add_RAW
 #define gf_sub_RAW        gf_%(gf_shortname)s_sub_RAW
 #define gf_mulw           gf_%(gf_shortname)s_mulw
 #define gf_bias           gf_%(gf_shortname)s_bias
 #define gf_isr            gf_%(gf_shortname)s_isr
 #define gf_weak_reduce    gf_%(gf_shortname)s_weak_reduce
 #define gf_strong_reduce  gf_%(gf_shortname)s_strong_reduce
 #define gf_mul            gf_%(gf_shortname)s_mul
 #define gf_sqr            gf_%(gf_shortname)s_sqr
 #define gf_mulw           gf_%(gf_shortname)s_mulw
 #define gf_isr            gf_%(gf_shortname)s_isr
 #define gf_serialize      gf_%(gf_shortname)s_serialize
 #define gf_deserialize    gf_%(gf_shortname)s_deserialize

 #define SQRT_MINUS_ONE    P%(gf_shortname)s_SQRT_MINUS_ONE /* might not be defined */

 #define INLINE_UNUSED __inline__ __attribute__((unused,always_inline))

 #ifdef __cplusplus
 extern "C" {
 #endif

 /* Inline field operations.
  * gf_copy is defined right here: it assigns the object pointed to by a to
  * the object pointed to by out.  The other four are only declared at this
  * point; their bodies come from f_impl.h (included below). */
 static INLINE_UNUSED void gf_copy (gf out, const gf a) { *out = *a; }
 static INLINE_UNUSED void gf_add_RAW (gf out, const gf a, const gf b);
 static INLINE_UNUSED void gf_sub_RAW (gf out, const gf a, const gf b);
 static INLINE_UNUSED void gf_bias (gf inout, int amount);
 static INLINE_UNUSED void gf_weak_reduce (gf inout);

 /* Non-inline field operations: declared here, not defined in this header. */
 void gf_strong_reduce (gf inout);   
 void gf_mul (gf_s *__restrict__ out, const gf a, const gf b);
 void gf_mulw (gf_s *__restrict__ out, const gf a, uint64_t b);
 void gf_sqr (gf_s *__restrict__ out, const gf a);
 void gf_serialize (uint8_t *serial, const gf x);
 /* The serial parameter is declared with (GF_BITS-1)/8+1 bytes, i.e. GF_BITS
  * rounded up to a whole number of bytes. */
 mask_t gf_deserialize (gf x, const uint8_t serial[(GF_BITS-1)/8+1]);

 #ifdef __cplusplus
 } /* extern "C" */
 #endif

 #include "f_impl.h" /* Bring in the inline implementations */
 """)
--- a/src/p25519/arch_ref64/f_impl.c
+++ b/src/p25519/arch_ref64/f_impl.c
@@ -2,7 +2,7 @@
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "f_impl.h"
 #include "f_field.h"

 static __inline__ __uint128_t widemul(
    const uint64_t a,
--- a/src/p25519/arch_ref64/f_impl.h
+++ b/src/p25519/arch_ref64/f_impl.h
@@ -14,88 +14,10 @@
 #define LBITS 51
 #define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}

 /*
 #define FIELD_LITERAL(a,b,c,d) {{ \
    (a##ull) & LMASK, \
    ((a##ull)>>51 | (b##ull)<<13) & LMASK, \
    ((b##ull)>>38 | (c##ull)<<26) & LMASK, \
    ((c##ull)>>25 | (d##ull)<<39) & LMASK, \
    (d##ull)>>12 \
 }}
 */

 #ifdef __cplusplus
 extern "C" {
 #endif

 static __inline__ void
 gf_25519_add_RAW (
    gf_25519_t out,
    const gf_25519_t a,
    const gf_25519_t b
 ) __attribute__((unused));
             
 static __inline__ void
 gf_25519_sub_RAW (
    gf_25519_t out,
    const gf_25519_t a,
    const gf_25519_t b
 ) __attribute__((unused));
             
 static __inline__ void
 gf_25519_copy (
    gf_25519_t out,
    const gf_25519_t a
 ) __attribute__((unused));
             
 static __inline__ void
 gf_25519_weak_reduce (
    gf_25519_t inout
 ) __attribute__((unused));
             
 void
 gf_25519_strong_reduce (
    gf_25519_t inout
 );

 static __inline__ void
 gf_25519_bias (
    gf_25519_t inout,
    int amount
 ) __attribute__((unused));
         
 void
 gf_25519_mul (
    gf_25519_s *__restrict__ out,
    const gf_25519_t a,
    const gf_25519_t b
 );

 void
 gf_25519_mulw (
    gf_25519_s *__restrict__ out,
    const gf_25519_t a,
    uint64_t b
 );

 void
 gf_25519_sqr (
    gf_25519_s *__restrict__ out,
    const gf_25519_t a
 );

 void
 gf_25519_serialize (
    uint8_t serial[32],
    const gf_25519_t x
 );

 mask_t
 gf_25519_deserialize (
    gf_25519_t x,
    const uint8_t serial[32]
 );

 /* -------------- Inline functions begin here -------------- */

 void
--- a/src/p25519/arch_x86_64/f_impl.c
+++ b/src/p25519/arch_x86_64/f_impl.c
@@ -2,7 +2,7 @@
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "f_impl.h"
 #include "f_field.h"
 #include "x86-64-arith.h"

 static inline uint64_t shr(__uint128_t x, int n) {
--- a/src/p25519/arch_x86_64/f_impl.h
+++ b/src/p25519/arch_x86_64/f_impl.h
@@ -14,88 +14,6 @@
 #define DECAF_255_LIMB_BITS 51
 #define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}

 /*
 #define FIELD_LITERAL(a,b,c,d) {{ \
    (a##ull) & LMASK, \
    ((a##ull)>>51 | (b##ull)<<13) & LMASK, \
    ((b##ull)>>38 | (c##ull)<<26) & LMASK, \
    ((c##ull)>>25 | (d##ull)<<39) & LMASK, \
    (d##ull)>>12 \
 }}
 */

 #ifdef __cplusplus
 extern "C" {
 #endif

 static __inline__ void
 gf_25519_add_RAW (
    gf_25519_t out,
    const gf_25519_t a,
    const gf_25519_t b
 ) __attribute__((unused));
             
 static __inline__ void
 gf_25519_sub_RAW (
    gf_25519_t out,
    const gf_25519_t a,
    const gf_25519_t b
 ) __attribute__((unused));
             
 static __inline__ void
 gf_25519_copy (
    gf_25519_t out,
    const gf_25519_t a
 ) __attribute__((unused));
             
 static __inline__ void
 gf_25519_weak_reduce (
    gf_25519_t inout
 ) __attribute__((unused));
             
 void
 gf_25519_strong_reduce (
    gf_25519_t inout
 );

 static __inline__ void
 gf_25519_bias (
    gf_25519_t inout,
    int amount
 ) __attribute__((unused));
         
 void
 gf_25519_mul (
    gf_25519_s *__restrict__ out,
    const gf_25519_t a,
    const gf_25519_t b
 );

 void
 gf_25519_mulw (
    gf_25519_s *__restrict__ out,
    const gf_25519_t a,
    uint64_t b
 );

 void
 gf_25519_sqr (
    gf_25519_s *__restrict__ out,
    const gf_25519_t a
 );

 void
 gf_25519_serialize (
    uint8_t serial[32],
    const gf_25519_t x
 );

 mask_t
 gf_25519_deserialize (
    gf_25519_t x,
    const uint8_t serial[32]
 );

 /* -------------- Inline functions begin here -------------- */

 void
@@ -123,14 +41,6 @@ gf_25519_sub_RAW (
    }
 }

 /* Copy the field element a into out.
  * Implemented as a raw memcpy of sizeof(*a) bytes, so the usual memcpy
  * rule applies: the two objects must not overlap. */
 void
 gf_25519_copy (
    gf_25519_t out,
    const gf_25519_t a
 ) {
    memcpy(out,a,sizeof(*a));
 }

 void
 gf_25519_bias (
    gf_25519_t a,
--- a/src/p448/arch_32/f_impl.c
+++ b/src/p448/arch_32/f_impl.c
@@ -2,8 +2,7 @@
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "word.h"
 #include "f_impl.h"
 #include "f_field.h"

 static inline mask_t __attribute__((always_inline))
 is_zero (
--- a/src/p448/arch_32/f_impl.h
+++ b/src/p448/arch_32/f_impl.h
@@ -22,74 +22,6 @@ typedef struct gf_448_s {
 extern "C" {
 #endif

 static __inline__ void
 gf_448_add_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 gf_448_sub_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 gf_448_copy (
    gf_448_t out,
    const gf_448_t a
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 gf_448_weak_reduce (
    gf_448_t inout
 ) __attribute__((unused,always_inline));
             
 void
 gf_448_strong_reduce (
    gf_448_t inout
 );
             
 static __inline__ void
 gf_448_bias (
    gf_448_t inout,
    int amount
 ) __attribute__((unused,always_inline));

 void
 gf_448_mul (
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    const gf_448_t b
 );

 void
 gf_448_mulw (
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    uint64_t b
 );

 void
 gf_448_sqr (
    gf_448_s *__restrict__ out,
    const gf_448_t a
 );

 void
 gf_448_serialize (
    uint8_t *serial,
    const gf_448_t x
 );

 mask_t
 gf_448_deserialize (
    gf_448_t x,
    const uint8_t serial[56]
 );

 /* -------------- Inline functions begin here -------------- */

 void
@@ -128,14 +60,6 @@ gf_448_sub_RAW (
    */
 }

 /* Copy the field element a into out by assigning the pointed-to object
  * as a whole (*out = *a). */
 void
 gf_448_copy (
    gf_448_t out,
    const gf_448_t a
 ) {
  *out = *a;
 }

 void
 gf_448_bias (
    gf_448_t a,
--- a/src/p448/arch_arm_32/f_impl.c
+++ b/src/p448/arch_arm_32/f_impl.c
@@ -2,8 +2,7 @@
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "word.h"
 #include "f_impl.h"
 #include "f_field.h"

 static inline mask_t __attribute__((always_inline))
 is_zero (
--- a/src/p448/arch_arm_32/f_impl.h
+++ b/src/p448/arch_arm_32/f_impl.h
@@ -22,74 +22,6 @@ typedef struct gf_448_s {
 extern "C" {
 #endif

 static __inline__ void
 gf_448_add_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 gf_448_sub_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 gf_448_copy (
    gf_448_t out,
    const gf_448_t a
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 gf_448_weak_reduce (
    gf_448_t inout
 ) __attribute__((unused,always_inline));
             
 void
 gf_448_strong_reduce (
    gf_448_t inout
 );
             
 static __inline__ void
 gf_448_bias (
    gf_448_t inout,
    int amount
 ) __attribute__((unused,always_inline));

 void
 gf_448_mul (
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    const gf_448_t b
 );

 void
 gf_448_mulw (
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    uint64_t b
 );

 void
 gf_448_sqr (
    gf_448_s *__restrict__ out,
    const gf_448_t a
 );

 void
 gf_448_serialize (
    uint8_t *serial,
    const gf_448_t x
 );

 mask_t
 gf_448_deserialize (
    gf_448_t x,
    const uint8_t serial[56]
 );

 /* -------------- Inline functions begin here -------------- */

 void
@@ -128,14 +60,6 @@ gf_448_sub_RAW (
    */
 }

 /* Copy the field element a into out by assigning the pointed-to object
  * as a whole (*out = *a). */
 void
 gf_448_copy (
    gf_448_t out,
    const gf_448_t a
 ) {
  *out = *a;
 }

 void
 gf_448_bias (
    gf_448_t a,
--- a/src/p448/arch_neon_experimental/f_impl.c
+++ b/src/p448/arch_neon_experimental/f_impl.c
@@ -2,8 +2,7 @@
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "word.h"
 #include "f_impl.h"
 #include "f_field.h"

 static inline mask_t __attribute__((always_inline))
 is_zero (
--- a/src/p448/arch_neon_experimental/f_impl.h
+++ b/src/p448/arch_neon_experimental/f_impl.h
@@ -27,75 +27,7 @@ typedef struct gf_448_s {
 #ifdef __cplusplus
 extern "C" {
 #endif

 static __inline__ void
 gf_448_add_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 gf_448_sub_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 gf_448_copy (
    gf_448_t out,
    const gf_448_t a
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 gf_448_weak_reduce (
    gf_448_t inout
 ) __attribute__((unused,always_inline));
             
 void
 gf_448_strong_reduce (
    gf_448_t inout
 );
             
 static __inline__ void
 gf_448_bias (
    gf_448_t inout,
    int amount
 ) __attribute__((unused,always_inline));

 void
 gf_448_mul (
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    const gf_448_t b
 );

 void
 gf_448_mulw (
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    uint64_t b
 );

 void
 gf_448_sqr (
    gf_448_s *__restrict__ out,
    const gf_448_t a
 );

 void
 gf_448_serialize (
    uint8_t *serial,
    const gf_448_t x
 );

 mask_t
 gf_448_deserialize (
    gf_448_t x,
    const uint8_t serial[56]
 );

    
 /* -------------- Inline functions begin here -------------- */

 void
@@ -128,14 +60,6 @@ gf_448_sub_RAW (
    */
 }

 /* Copy the field element a into out by assigning the pointed-to object
  * as a whole (*out = *a). */
 void
 gf_448_copy (
    gf_448_t out,
    const gf_448_t a
 ) {
  *out = *a;
 }

 void
 gf_448_bias (
    gf_448_t a,
--- a/src/p448/arch_ref64/f_impl.c
+++ b/src/p448/arch_ref64/f_impl.c
@@ -2,7 +2,7 @@
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "f_impl.h"
 #include "f_field.h"

 static __inline__ __uint128_t widemul(
    const uint64_t a,
--- a/src/p448/arch_ref64/f_impl.h
+++ b/src/p448/arch_ref64/f_impl.h
@@ -21,74 +21,6 @@ typedef struct gf_448_s {
 extern "C" {
 #endif

 static __inline__ void
 gf_448_add_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused));
             
 static __inline__ void
 gf_448_sub_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused));
             
 static __inline__ void
 gf_448_copy (
    gf_448_t out,
    const gf_448_t a
 ) __attribute__((unused));
             
 static __inline__ void
 gf_448_weak_reduce (
    gf_448_t inout
 ) __attribute__((unused));
             
 void
 gf_448_strong_reduce (
    gf_448_t inout
 );

 static __inline__ void
 gf_448_bias (
    gf_448_t inout,
    int amount
 ) __attribute__((unused));
         
 void
 gf_448_mul (
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    const gf_448_t b
 );

 void
 gf_448_mulw (
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    uint64_t b
 );

 void
 gf_448_sqr (
    gf_448_s *__restrict__ out,
    const gf_448_t a
 );

 void
 gf_448_serialize (
    uint8_t *serial,
    const gf_448_t x
 );

 mask_t
 gf_448_deserialize (
    gf_448_t x,
    const uint8_t serial[56]
 );

 /* -------------- Inline functions begin here -------------- */

 void
@@ -118,14 +50,6 @@ gf_448_sub_RAW (
    gf_448_weak_reduce(out);
 }

 /* Copy the field element a into out.
  * Implemented as a raw memcpy of sizeof(*a) bytes, so the usual memcpy
  * rule applies: the two objects must not overlap. */
 void
 gf_448_copy (
    gf_448_t out,
    const gf_448_t a
 ) {
    memcpy(out,a,sizeof(*a));
 }

 void
 gf_448_bias (
    gf_448_t a,
--- a/src/p448/arch_x86_64/f_impl.c
+++ b/src/p448/arch_x86_64/f_impl.c
@@ -2,7 +2,7 @@
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "f_impl.h"
 #include "f_field.h"
 #include "x86-64-arith.h"

 void
--- a/src/p448/arch_x86_64/f_impl.h
+++ b/src/p448/arch_x86_64/f_impl.h
@@ -17,74 +17,6 @@
 extern "C" {
 #endif

 static __inline__ void
 gf_448_add_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 gf_448_sub_RAW (
    gf_448_t out,
    const gf_448_t a,
    const gf_448_t b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 gf_448_copy (
    gf_448_t out,
    const gf_448_t a
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 gf_448_weak_reduce (
    gf_448_t inout
 ) __attribute__((unused,always_inline));
             
 void
 gf_448_strong_reduce (
    gf_448_t inout
 );

 static __inline__ void
 gf_448_bias (
    gf_448_t inout,
    int amount
 ) __attribute__((unused,always_inline));
         
 void
 gf_448_mul (
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    const gf_448_t b
 );

 void
 gf_448_mulw (
    gf_448_s *__restrict__ out,
    const gf_448_t a,
    uint64_t b
 );

 void
 gf_448_sqr (
    gf_448_s *__restrict__ out,
    const gf_448_t a
 );

 void
 gf_448_serialize (
    uint8_t *serial,
    const gf_448_t x
 );

 mask_t
 gf_448_deserialize (
    gf_448_t x,
    const uint8_t serial[56]
 );

 /* -------------- Inline functions begin here -------------- */

 void
@@ -123,17 +55,6 @@ gf_448_sub_RAW (
    */
 }

 /* Copy the field element a into out, one big_register_t-sized chunk at a
  * time.  Exactly sizeof(*out)/sizeof(big_register_t) chunks are moved, in
  * ascending order; any bytes beyond the last whole chunk are not touched. */
 void
 gf_448_copy (
     gf_448_t out,
     const gf_448_t a
 ) {
     const big_register_t *src = (const big_register_t *)a;
     big_register_t *dst = (big_register_t *)out;
     unsigned int chunk = 0;

     while (chunk < sizeof(*out)/sizeof(big_register_t)) {
         dst[chunk] = src[chunk];
         chunk++;
     }
 }

 void
 gf_448_bias (
    gf_448_t a,
--- a/src/p480/arch_x86_64/f_impl.c
+++ b/src/p480/arch_x86_64/f_impl.c
@@ -2,14 +2,13 @@
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "f_impl.h"
 #include "x86-64-arith.h"
 #include "f_field.h"

 void
 p480_mul (
    p480_t *__restrict__ cs,
    const p480_t *as,
    const p480_t *bs
 gf_480_mul (
    gf_480_t *__restrict__ cs,
    const gf_480_t *as,
    const gf_480_t *bs
 ) {
    const uint64_t *a = as->limb, *b = bs->limb;
    uint64_t *c = cs->limb;
@@ -146,9 +145,9 @@ p480_mul (
 }

 void
 p480_mulw (
    p480_t *__restrict__ cs,
    const p480_t *as,
 gf_480_mulw (
    gf_480_t *__restrict__ cs,
    const gf_480_t *as,
    uint64_t b
 ) {
    const uint64_t *a = as->limb;
@@ -191,9 +190,9 @@ p480_mulw (
 }

 void
 p480_sqr (
    p480_t *__restrict__ cs,
    const p480_t *as
 gf_480_sqr (
    gf_480_t *__restrict__ cs,
    const gf_480_t *as
 ) {
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;
@@ -306,8 +305,8 @@ p480_sqr (
 }

 void
 p480_strong_reduce (
    p480_t *a
 gf_480_strong_reduce (
    gf_480_t *a
 ) {
    uint64_t mask = (1ull<<60)-1;

@@ -349,14 +348,14 @@ p480_strong_reduce (
 }

 void
 p480_serialize (
 gf_480_serialize (
    uint8_t *serial,
    const struct p480_t *x
    const struct gf_480_t *x
 ) {
    int i,j,k=0;
    p480_t red;
    p480_copy(&red, x);
    p480_strong_reduce(&red);
    gf_480_t red;
    gf_480_copy(&red, x);
    gf_480_strong_reduce(&red);
    word_t r = 0;
    for (i=0; i<8; i+=2) {
        r = red.limb[i];
@@ -375,8 +374,8 @@ p480_serialize (
 }

 mask_t
 p480_deserialize (
    p480_t *x,
 gf_480_deserialize (
    gf_480_t *x,
    const uint8_t serial[60]
 ) {
    int i,j,k=0;
--- a/src/p480/arch_x86_64/f_impl.h
+++ b/src/p480/arch_x86_64/f_impl.h
@@ -1,97 +1,77 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __p480_H__
 #define __p480_H__ 1
 #ifndef __gf_480_H__
 #define __gf_480_H__ 1

 #include <stdint.h>
 #include <assert.h>

 #include "word.h"

 typedef struct p480_t {
 typedef struct gf_480_t {
  uint64_t limb[8];
 } __attribute__((aligned(32))) p480_t;
 } __attribute__((aligned(32))) gf_480_t;

 #ifdef __cplusplus
 extern "C" {
 #endif

 static __inline__ void
 p480_add_RAW (
    p480_t *out,
    const p480_t *a,
    const p480_t *b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p480_sub_RAW (
    p480_t *out,
    const p480_t *a,
    const p480_t *b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p480_copy (
    p480_t *out,
    const p480_t *a
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p480_weak_reduce (
    p480_t *inout
 gf_480_weak_reduce (
    gf_480_t *inout
 ) __attribute__((unused,always_inline));
             
 void
 p480_strong_reduce (
    p480_t *inout
 gf_480_strong_reduce (
    gf_480_t *inout
 );
  
 static __inline__ void
 p480_bias (
    p480_t *inout,
 gf_480_bias (
    gf_480_t *inout,
    int amount
 ) __attribute__((unused,always_inline));
         
 void
 p480_mul (
    p480_t *__restrict__ out,
    const p480_t *a,
    const p480_t *b
 gf_480_mul (
    gf_480_t *__restrict__ out,
    const gf_480_t *a,
    const gf_480_t *b
 );

 void
 p480_mulw (
    p480_t *__restrict__ out,
    const p480_t *a,
 gf_480_mulw (
    gf_480_t *__restrict__ out,
    const gf_480_t *a,
    uint64_t b
 );

 void
 p480_sqr (
    p480_t *__restrict__ out,
    const p480_t *a
 gf_480_sqr (
    gf_480_t *__restrict__ out,
    const gf_480_t *a
 );

 void
 p480_serialize (
 gf_480_serialize (
    uint8_t *serial,
    const struct p480_t *x
    const struct gf_480_t *x
 );

 mask_t
 p480_deserialize (
    p480_t *x,
 gf_480_deserialize (
    gf_480_t *x,
    const uint8_t serial[60]
 );

 /* -------------- Inline functions begin here -------------- */

 void
 p480_add_RAW (
    p480_t *out,
    const p480_t *a,
    const p480_t *b
 gf_480_add_RAW (
    gf_480_t *out,
    const gf_480_t *a,
    const gf_480_t *b
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
@@ -106,10 +86,10 @@ p480_add_RAW (
 }

 void
 p480_sub_RAW (
    p480_t *out,
    const p480_t *a,
    const p480_t *b
 gf_480_sub_RAW (
    gf_480_t *out,
    const gf_480_t *a,
    const gf_480_t *b
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
@@ -124,9 +104,9 @@ p480_sub_RAW (
 }

 void
 p480_copy (
    p480_t *out,
    const p480_t *a
 gf_480_copy (
    gf_480_t *out,
    const gf_480_t *a
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(big_register_t); i++) {
@@ -135,8 +115,8 @@ p480_copy (
 }

 void
 p480_bias (
    p480_t *a,
 gf_480_bias (
    gf_480_t *a,
    int amt
 ) {
    uint64_t co1 = ((1ull<<60)-1)*amt, co2 = co1-amt;
@@ -162,8 +142,8 @@ p480_bias (
 }

 void
 p480_weak_reduce (
    p480_t *a
 gf_480_weak_reduce (
    gf_480_t *a
 ) {
    /* PERF: use pshufb/palignr if anyone cares about speed of this */
    uint64_t mask = (1ull<<60) - 1;
@@ -180,4 +160,4 @@ p480_weak_reduce (
 }; /* extern "C" */
 #endif

 #endif /* __p480_H__ */
 #endif /* __gf_480_H__ */
--- a/src/p521/arch_ref64/f_impl.c
+++ b/src/p521/arch_ref64/f_impl.c
@@ -2,7 +2,7 @@
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "f_impl.h"
 #include "f_field.h"

 static __inline__ __uint128_t widemul(
    const uint64_t a,
@@ -17,10 +17,10 @@ static __inline__ uint64_t is_zero(uint64_t a) {
 }

 void
 p521_mul (
    p521_t *__restrict__ cs,
    const p521_t *as,
    const p521_t *bs
 gf_521_mul (
    gf_521_t *__restrict__ cs,
    const gf_521_t *as,
    const gf_521_t *bs
 ) {
    uint64_t *c = cs->limb;
    const uint64_t *a = as->limb, *b = bs->limb;
@@ -158,9 +158,9 @@ p521_mul (
 }

 void
 p521_mulw (
    p521_t *__restrict__ cs,
    const p521_t *as,
 gf_521_mulw (
    gf_521_t *__restrict__ cs,
    const gf_521_t *as,
    uint64_t b
 ) {
    const uint64_t *a = as->limb;
@@ -197,9 +197,9 @@ p521_mulw (
 }

 void
 p521_sqr (
    p521_t *__restrict__ cs,
    const p521_t *as
 gf_521_sqr (
    gf_521_t *__restrict__ cs,
    const gf_521_t *as
 ) {
    uint64_t *c = cs->limb;
    const uint64_t *a = as->limb;
@@ -306,8 +306,8 @@ p521_sqr (
 }

 void
 p521_strong_reduce (
    p521_t *a
 gf_521_strong_reduce (
    gf_521_t *a
 ) {
    uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1;

@@ -347,14 +347,14 @@ p521_strong_reduce (
 }

 void
 p521_serialize (
 gf_521_serialize (
    uint8_t *serial,
    const struct p521_t *x
    const struct gf_521_t *x
 ) {
    int i,k=0;
    p521_t red;
    p521_copy(&red, x);
    p521_strong_reduce(&red);
    gf_521_t red;
    gf_521_copy(&red, x);
    gf_521_strong_reduce(&red);
    
    uint64_t r=0;
    int bits = 0;
@@ -371,8 +371,8 @@ p521_serialize (
 }

 mask_t
 p521_deserialize (
    p521_t *x,
 gf_521_deserialize (
    gf_521_t *x,
    const uint8_t serial[66]
 ) {
    int i,k=0,bits=0;
--- a/src/p521/arch_ref64/f_impl.h
+++ b/src/p521/arch_ref64/f_impl.h
@@ -10,122 +10,102 @@

 #include "word.h"

 typedef struct p521_t {
 typedef struct gf_521_t {
  uint64_t limb[9];
 } p521_t;
 } gf_521_t;

 #ifdef __cplusplus
 extern "C" {
 #endif

 static __inline__ void
 p521_add_RAW (
    p521_t *out,
    const p521_t *a,
    const p521_t *b
 ) __attribute__((unused));
             
 static __inline__ void
 p521_sub_RAW (
    p521_t *out,
    const p521_t *a,
    const p521_t *b
 ) __attribute__((unused));
             
 static __inline__ void
 p521_copy (
    p521_t *out,
    const p521_t *a
 ) __attribute__((unused));
             
 static __inline__ void
 p521_weak_reduce (
    p521_t *inout
 gf_521_weak_reduce (
    gf_521_t *inout
 ) __attribute__((unused));
             
 void
 p521_strong_reduce (
    p521_t *inout
 gf_521_strong_reduce (
    gf_521_t *inout
 );

 static __inline__ void
 p521_bias (
    p521_t *inout,
 gf_521_bias (
    gf_521_t *inout,
    int amount
 ) __attribute__((unused));
         
 void
 p521_mul (
    p521_t *__restrict__ out,
    const p521_t *a,
    const p521_t *b
 gf_521_mul (
    gf_521_t *__restrict__ out,
    const gf_521_t *a,
    const gf_521_t *b
 );

 void
 p521_mulw (
    p521_t *__restrict__ out,
    const p521_t *a,
 gf_521_mulw (
    gf_521_t *__restrict__ out,
    const gf_521_t *a,
    uint64_t b
 );

 void
 p521_sqr (
    p521_t *__restrict__ out,
    const p521_t *a
 gf_521_sqr (
    gf_521_t *__restrict__ out,
    const gf_521_t *a
 );

 void
 p521_serialize (
 gf_521_serialize (
    uint8_t *serial,
    const struct p521_t *x
    const struct gf_521_t *x
 );

 mask_t
 p521_deserialize (
    p521_t *x,
 gf_521_deserialize (
    gf_521_t *x,
    const uint8_t serial[66]
 );

 /* -------------- Inline functions begin here -------------- */

 void
 p521_add_RAW (
    p521_t *out,
    const p521_t *a,
    const p521_t *b
 gf_521_add_RAW (
    gf_521_t *out,
    const gf_521_t *a,
    const gf_521_t *b
 ) {
    unsigned int i;
    for (i=0; i<9; i++) {
        out->limb[i] = a->limb[i] + b->limb[i];
    }
    p521_weak_reduce(out);
    gf_521_weak_reduce(out);
 }

 void
 p521_sub_RAW (
    p521_t *out,
    const p521_t *a,
    const p521_t *b
 gf_521_sub_RAW (
    gf_521_t *out,
    const gf_521_t *a,
    const gf_521_t *b
 ) {
    unsigned int i;
    uint64_t co1 = ((1ull<<58)-1)*4, co2 = ((1ull<<57)-1)*4;
    for (i=0; i<9; i++) {
        out->limb[i] = a->limb[i] - b->limb[i] + ((i==8) ? co2 : co1);
    }
    p521_weak_reduce(out);
    gf_521_weak_reduce(out);
 }

 void
 p521_copy (
    p521_t *out,
    const p521_t *a
 gf_521_copy (
    gf_521_t *out,
    const gf_521_t *a
 ) {
    memcpy(out,a,sizeof(*a));
 }

 void
 p521_bias (
    p521_t *a,
 gf_521_bias (
    gf_521_t *a,
    int amt
 ) {
    (void) a;
@@ -133,8 +113,8 @@ p521_bias (
 }

 void
 p521_weak_reduce (
    p521_t *a
 gf_521_weak_reduce (
    gf_521_t *a
 ) {
    uint64_t mask = (1ull<<58) - 1;
    uint64_t tmp = a->limb[8] >> 57;
--- a/src/p521/arch_x86_64_r12/f_impl.c
+++ b/src/p521/arch_x86_64_r12/f_impl.c
@@ -2,7 +2,7 @@
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "f_impl.h"
 #include "f_field.h"

 typedef struct {
  uint64x3_t lo, hi, hier;
@@ -168,10 +168,10 @@ static inline void hexad_sqr_signed (


 void
 p521_mul (
    p521_t *__restrict__ cs,
    const p521_t *as,
    const p521_t *bs
 gf_521_mul (
    gf_521_t *__restrict__ cs,
    const gf_521_t *as,
    const gf_521_t *bs
 ) {
    int i;
    
@@ -254,9 +254,9 @@ p521_mul (


 void
 p521_sqr (
    p521_t *__restrict__ cs,
    const p521_t *as
 gf_521_sqr (
    gf_521_t *__restrict__ cs,
    const gf_521_t *as
 ) {
    

@@ -313,9 +313,9 @@ p521_sqr (
 }

 void
 p521_mulw (
    p521_t *__restrict__ cs,
    const p521_t *as,
 gf_521_mulw (
    gf_521_t *__restrict__ cs,
    const gf_521_t *as,
    uint64_t b
 ) {
    
@@ -375,8 +375,8 @@ p521_mulw (


 void
 p521_strong_reduce (
    p521_t *a
 gf_521_strong_reduce (
    gf_521_t *a
 ) {
    uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1;

@@ -418,14 +418,14 @@ p521_strong_reduce (
 }

 void
 p521_serialize (
 gf_521_serialize (
    uint8_t *serial,
    const struct p521_t *x
    const struct gf_521_t *x
 ) {
    unsigned int i,k=0;
    p521_t red;
    p521_copy(&red, x);
    p521_strong_reduce(&red);
    gf_521_t red;
    gf_521_copy(&red, x);
    gf_521_strong_reduce(&red);
    
    uint64_t r=0;
    int bits = 0;
@@ -442,8 +442,8 @@ p521_serialize (
 }

 mask_t
 p521_deserialize (
    p521_t *x,
 gf_521_deserialize (
    gf_521_t *x,
    const uint8_t serial[LIMBPERM(66)]
 ) {
    int i,k=0,bits=0;
--- a/src/p521/arch_x86_64_r12/f_impl.h
+++ b/src/p521/arch_x86_64_r12/f_impl.h
@@ -14,82 +14,14 @@
 #define LIMBPERM(x) (((x)%3)*4 + (x)/3)
 #define USE_P521_3x3_TRANSPOSE

 typedef struct p521_t {
 typedef struct gf_521_s {
  uint64_t limb[12];
 } __attribute__((aligned(32))) p521_t;
 } __attribute__((aligned(32))) gf_521_t;

 #ifdef __cplusplus
 extern "C" {
 #endif

 static __inline__ void
 p521_add_RAW (
    p521_t *out,
    const p521_t *a,
    const p521_t *b
 ) __attribute__((unused));
             
 static __inline__ void
 p521_sub_RAW (
    p521_t *out,
    const p521_t *a,
    const p521_t *b
 ) __attribute__((unused));
             
 static __inline__ void
 p521_copy (
    p521_t *out,
    const p521_t *a
 ) __attribute__((unused));
             
 static __inline__ void
 p521_weak_reduce (
    p521_t *inout
 ) __attribute__((unused));
             
 void
 p521_strong_reduce (
    p521_t *inout
 );

 static __inline__ void
 p521_bias (
    p521_t *inout,
    int amount
 ) __attribute__((unused));
         
 void
 p521_mul (
    p521_t *__restrict__ out,
    const p521_t *a,
    const p521_t *b
 );

 void
 p521_mulw (
    p521_t *__restrict__ out,
    const p521_t *a,
    uint64_t b
 );

 void
 p521_sqr (
    p521_t *__restrict__ out,
    const p521_t *a
 );

 void
 p521_serialize (
    uint8_t *serial,
    const struct p521_t *x
 );

 mask_t
 p521_deserialize (
    p521_t *x,
    const uint8_t serial[66]
 );

 /* -------------- Inline functions begin here -------------- */

 typedef uint64x4_t uint64x3_t; /* fit it in a vector register */
@@ -106,10 +38,10 @@ timesW (
 }

 void
 p521_add_RAW (
    p521_t *out,
    const p521_t *a,
    const p521_t *b
 gf_521_add_RAW (
    gf_521_t *out,
    const gf_521_t *a,
    const gf_521_t *b
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
@@ -118,10 +50,10 @@ p521_add_RAW (
 }

 void
 p521_sub_RAW (
    p521_t *out,
    const p521_t *a,
    const p521_t *b
 gf_521_sub_RAW (
    gf_521_t *out,
    const gf_521_t *a,
    const gf_521_t *b
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
@@ -130,16 +62,8 @@ p521_sub_RAW (
 }

 /* Copy the field element *a into *out.
  * Implemented as a raw memcpy of sizeof(*a) bytes, so the usual memcpy
  * rule applies: the two objects must not overlap. */
 void
 p521_copy (
    p521_t *out,
    const p521_t *a
 ) {
    memcpy(out,a,sizeof(*a));
 }

 void
 p521_bias (
    p521_t *a,
 gf_521_bias (
    gf_521_t *a,
    int amt
 ) {
    uint64_t co0 = ((1ull<<58)-2)*amt, co1 = ((1ull<<58)-1)*amt;
@@ -150,8 +74,8 @@ p521_bias (
 }

 void
 p521_weak_reduce (
    p521_t *a
 gf_521_weak_reduce (
    gf_521_t *a
 ) {
 #if 0
    int i;