
Wipe out the multiple layers of renaming between decaf_fast and field. Still some serious HACKs in the include order to avoid multiple definition of struct gf.

master
Michael Hamburg 10 years ago
commit 5af980b85a
18 changed files with 748 additions and 465 deletions
  1. src/decaf_fast.c  +39 -92
  2. src/decaf_gen_tables.c  +24 -26
  3. src/include/field.h  +54 -44
  4. src/p25519/arch_ref64/p25519.c  +20 -20
  5. src/p25519/arch_ref64/p25519.h  +53 -53
  6. src/p25519/arch_x86_64/p25519.c  +20 -20
  7. src/p25519/arch_x86_64/p25519.h  +54 -51
  8. src/p25519/arch_x86_64/x86-64-arith.h  +0 -1
  9. src/p25519/arch_x86_64/x86-64-arith.h  +323 -0
  10. src/p25519/f_arithmetic.c  +14 -14
  11. src/p25519/f_field.h  +16 -15
  12. src/p448/f_arithmetic.c  +28 -28
  13. src/p448/f_field.h  +14 -14
  14. src/p480/f_arithmetic.c  +28 -28
  15. src/p480/f_field.h  +14 -14
  16. src/p521/f_arithmetic.c  +28 -28
  17. src/p521/f_field.h  +14 -14
  18. src/public_include/decaf/decaf_255.h  +5 -3
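For context on the "serious HACKs in the include order" mentioned in the commit message: both the public header (decaf_255.h) and the per-arch field header (p25519.h) now want to define struct gf_25519_s, so each definition is fenced by a guard and whichever header is included first wins. A minimal sketch of the pattern, pieced together from the hunks below; note that nothing in this diff shows who sets __DECAF_GF_ALREADY_DEFINED__ (presumably an internal header defines it before including decaf_255.h):

/* src/public_include/decaf/decaf_255.h: public definition, skipped if an
 * internal header already provided the struct. */
#ifndef __DECAF_GF_ALREADY_DEFINED__
typedef struct gf_25519_s {
    decaf_word_t limb[DECAF_255_LIMBS];
} gf_25519_s, gf_25519_t[1];
#endif

/* src/p25519/arch_x86_64/p25519.h: arch definition, skipped if the public
 * header was included first (it checks decaf_255.h's include guard). */
#ifndef __DECAF_255_H__ /* HACK FIXME */
#define DECAF_WORD_BITS 64
typedef struct gf_25519_s {
    uint64_t limb[5];
} gf_25519_s, gf_25519_t[1];
#endif

The two definitions are only safe if they are layout-identical; on a 64-bit build both presumably come out as five 64-bit limbs, which is why the commit message flags this as a HACK rather than a clean solution.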

src/decaf_fast.c  +39 -92

@@ -27,8 +27,6 @@
#define point_t decaf_255_point_t
#define precomputed_s decaf_255_precomputed_s
#define SER_BYTES DECAF_255_SER_BYTES
#define gf_s gf_255_s
#define gf gf_255_t

#if WBITS == 64
typedef __int128_t decaf_sdword_t;
@@ -72,7 +70,7 @@ typedef struct { niels_t n; gf z; } __attribute__((aligned(32))) pniels_s, pniel
/* Precomputed base */
struct precomputed_s { niels_t table [DECAF_COMBS_N<<(DECAF_COMBS_T-1)]; };

extern const field_t API_NS(precomputed_base_as_fe)[];
extern const gf API_NS(precomputed_base_as_fe)[];
const precomputed_s *API_NS(precomputed_base) =
(const precomputed_s *) &API_NS(precomputed_base_as_fe);

@@ -95,52 +93,6 @@ const size_t API_NS2(alignof,precomputed_s) = 32;
/** Copy x = y */
siv gf_cpy(gf x, const gf y) { x[0] = y[0]; }

/** Mostly-unoptimized multiply, but at least it's unrolled. */
siv gf_mul (gf c, const gf a, const gf b) {
field_mul((field_t *)c, (const field_t *)a, (const field_t *)b);
}

/** Dedicated square */
siv gf_sqr (gf c, const gf a) {
field_sqr((field_t *)c, (const field_t *)a);
}

/** Add mod p. Conservatively always weak-reduce. */
snv gf_add ( gf_s *__restrict__ c, const gf a, const gf b ) {
field_add((field_t *)c, (const field_t *)a, (const field_t *)b);
}

/** Subtract mod p. Conservatively always weak-reduce. */
snv gf_sub ( gf c, const gf a, const gf b ) {
field_sub((field_t *)c, (const field_t *)a, (const field_t *)b);
}

/** Add mod p. Conservatively always weak-reduce.) */
siv gf_bias ( gf c, int amt) {
field_bias((field_t *)c, amt);
}

/** Subtract mod p. Bias by 2 and don't reduce */
siv gf_sub_nr ( gf_s *__restrict__ c, const gf a, const gf b ) {
// FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] );
field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
gf_bias(c, 2);
if (WBITS==32) field_weak_reduce((field_t*) c); // HACK
}

/** Subtract mod p. Bias by amt but don't reduce. */
siv gf_sub_nr_x ( gf c, const gf a, const gf b, int amt ) {
field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
gf_bias(c, amt);
if (WBITS==32) field_weak_reduce((field_t*) c); // HACK
}

/** Add mod p. Don't reduce. */
siv gf_add_nr ( gf c, const gf a, const gf b ) {
// FOR_LIMB_U(i, c->limb[i] = a->limb[i] + b->limb[i]);
field_add_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
}

/** Constant time, x = is_z ? z : y */
siv cond_sel(gf x, const gf y, const gf z, decaf_bool_t is_z) {
constant_time_select(x,z,y,sizeof(gf),is_z);
@@ -162,29 +114,11 @@ siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) {
});
}

/**
* Mul by signed int. Not constant-time WRT the sign of that int.
* Just uses a full mul (PERF)
*/
siv gf_mlw(gf c, const gf a, int w) {
if (w>0) {
field_mulw((field_t *)c, (const field_t *)a, w);
} else {
field_mulw((field_t *)c, (const field_t *)a, -w);
gf_sub(c,ZERO,c);
}
}

/** Canonicalize */
siv gf_canon ( gf a ) {
field_strong_reduce((field_t *)a);
}

/** Compare a==b */
static decaf_word_t __attribute__((noinline)) gf_eq(const gf a, const gf b) {
gf c;
gf_sub(c,a,b);
gf_canon(c);
gf_strong_reduce(c);
decaf_word_t ret=0;
FOR_LIMB(i, ret |= c->limb[i] );
/* Hope the compiler is too dumb to optimize this, thus noinline */
@@ -194,7 +128,7 @@ static decaf_word_t __attribute__((noinline)) gf_eq(const gf a, const gf b) {
/** Inverse square root using addition chain. */
static decaf_bool_t gf_isqrt_chk(gf y, const gf x, decaf_bool_t allow_zero) {
gf tmp0, tmp1;
field_isr((field_t *)y, (const field_t *)x);
gf_isr((gf_s *)y, (const gf_s *)x);
gf_sqr(tmp0,y);
gf_mul(tmp1,tmp0,x);
return gf_eq(tmp1,ONE) | (allow_zero & gf_eq(tmp1,ZERO));
@@ -211,11 +145,24 @@ sv gf_invert(gf y, const gf x) {
gf_cpy(y, t2);
}

/**
* Mul by signed int. Not constant-time WRT the sign of that int.
* Just uses a full mul (PERF)
*/
static inline void gf_mulw_sgn(gf c, const gf a, int w) {
if (w>0) {
gf_mulw(c, a, w);
} else {
gf_mulw(c, a, -w);
gf_sub(c,ZERO,c);
}
}

/** Return high bit of x = low bit of 2x mod p */
static decaf_word_t hibit(const gf x) {
gf y;
gf_add(y,x,x);
gf_canon(y);
gf_strong_reduce(y);
return -(y->limb[0]&1);
}

@@ -223,7 +170,7 @@ static decaf_word_t hibit(const gf x) {
static decaf_word_t lobit(const gf x) {
gf y;
gf_cpy(y,x);
gf_canon(y);
gf_strong_reduce(y);
return -(y->limb[0]&1);
}

@@ -454,7 +401,7 @@ decaf_bool_t API_NS(scalar_eq) (
const point_t API_NS(point_identity) = {{{{{0}}},{{{1}}},{{{1}}},{{{0}}}}};

static void gf_encode ( unsigned char ser[SER_BYTES], gf a ) {
field_serialize(ser, (field_t *)a);
gf_serialize(ser, (gf_s *)a);
}
extern const gf SQRT_MINUS_ONE, SQRT_ONE_MINUS_D; /* Intern this? */
@@ -528,7 +475,7 @@ void API_NS(point_encode)( unsigned char ser[SER_BYTES], const point_t p ) {
* Deserialize a bool, return TRUE if < p.
*/
static decaf_bool_t gf_deser(gf s, const unsigned char ser[SER_BYTES]) {
return field_deserialize((field_t *)s, ser);
return gf_deserialize((gf_s *)s, ser);
}
decaf_bool_t API_NS(point_decode) (
@@ -544,7 +491,7 @@ decaf_bool_t API_NS(point_decode) (
gf_sub ( f, ONE, a ); /* f = 1-s^2 = 1-as^2 since a=1 */
succ &= ~ gf_eq( f, ZERO );
gf_sqr ( b, f );
gf_mlw ( c, a, 4-4*EDWARDS_D );
gf_mulw_sgn ( c, a, 4-4*EDWARDS_D );
gf_add ( c, c, b ); /* t^2 */
gf_mul ( d, f, s ); /* s(1-s^2) for denoms */
gf_sqr ( e, d );
@@ -596,7 +543,7 @@ void API_NS(point_sub) (
gf_add_nr ( b, q->y, q->x );
gf_mul ( p->y, d, b );
gf_mul ( b, r->t, q->t );
gf_mlw ( p->x, b, -2*EDWARDS_D );
gf_mulw_sgn ( p->x, b, -2*EDWARDS_D );
gf_add_nr ( b, a, p->y );
gf_sub_nr ( c, p->y, a );
gf_mul ( a, q->z, r->z );
@@ -622,7 +569,7 @@ void API_NS(point_add) (
gf_add_nr ( b, q->y, q->x );
gf_mul ( p->y, d, b );
gf_mul ( b, r->t, q->t );
gf_mlw ( p->x, b, -2*EDWARDS_D );
gf_mulw_sgn ( p->x, b, -2*EDWARDS_D );
gf_add_nr ( b, a, p->y );
gf_sub_nr ( c, p->y, a );
gf_mul ( a, q->z, r->z );
@@ -646,11 +593,11 @@ snv point_double_internal (
gf_add_nr ( d, c, a );
gf_add_nr ( p->t, q->y, q->x );
gf_sqr ( b, p->t );
gf_sub_nr_x ( b, b, d, 3 );
gf_subx_nr ( b, b, d, 3 );
gf_sub_nr ( p->t, a, c );
gf_sqr ( p->x, q->z );
gf_add_nr ( p->z, p->x, p->x );
gf_sub_nr_x ( a, p->z, p->t, 4 );
gf_subx_nr ( a, p->z, p->t, 4 );
gf_mul ( p->x, a, b );
gf_mul ( p->z, p->t, a );
gf_mul ( p->y, p->t, d );
@@ -777,7 +724,7 @@ static void pt_to_pniels (
) {
gf_sub ( b->n->a, a->y, a->x );
gf_add ( b->n->b, a->x, a->y );
gf_mlw ( b->n->c, a->t, -2*EDWARDS_D );
gf_mulw_sgn ( b->n->c, a->t, -2*EDWARDS_D );
gf_add ( b->z, a->z, a->z );
}

@@ -1047,12 +994,12 @@ void API_NS(point_from_hash_nonuniform) (
// TODO: simplify since we don't return a hint anymore
gf r0,r,a,b,c,dee,D,N,rN,e;
gf_deser(r0,ser);
gf_canon(r0);
gf_strong_reduce(r0);
gf_sqr(a,r0);
//gf_sub(r,ZERO,a); /*gf_mlw(r,a,QUADRATIC_NONRESIDUE);*/
//gf_sub(r,ZERO,a); /*gf_mulw_sgn(r,a,QUADRATIC_NONRESIDUE);*/
gf_mul(r,a,SQRT_MINUS_ONE);
gf_mlw(dee,ONE,EDWARDS_D);
gf_mlw(c,r,EDWARDS_D);
gf_mulw_sgn(dee,ONE,EDWARDS_D);
gf_mulw_sgn(c,r,EDWARDS_D);
/* Compute D := (dr+a-d)(dr-ar-d) with a=1 */
gf_sub(a,c,dee);
@@ -1064,7 +1011,7 @@ void API_NS(point_from_hash_nonuniform) (
/* compute N := (r+1)(a-2d) */
gf_add(a,r,ONE);
gf_mlw(N,a,1-2*EDWARDS_D);
gf_mulw_sgn(N,a,1-2*EDWARDS_D);
/* e = +-1/sqrt(+-ND) */
gf_mul(rN,r,N);
@@ -1078,8 +1025,8 @@ void API_NS(point_from_hash_nonuniform) (
/* b <- t/s */
cond_sel(c,r0,r,square); /* r? = sqr ? r : 1 */
/* In two steps to avoid overflow on 32-bit arch */
gf_mlw(a,c,1-2*EDWARDS_D);
gf_mlw(b,a,1-2*EDWARDS_D);
gf_mulw_sgn(a,c,1-2*EDWARDS_D);
gf_mulw_sgn(b,a,1-2*EDWARDS_D);
gf_sub(c,r,ONE);
gf_mul(a,b,c); /* = r? * (r-1) * (a-2d)^2 with a=1 */
gf_mul(b,a,e);
@@ -1148,7 +1095,7 @@ API_NS(invert_elligator_nonuniform) (
cond_sel(b,b,ZERO,is_identity & ~sgn_t_over_s & ~sgn_s); /* identity adjust */
}
gf_mlw(d,c,2*EDWARDS_D-1); /* $d = (2d-a)s^2 */
gf_mulw_sgn(d,c,2*EDWARDS_D-1); /* $d = (2d-a)s^2 */
gf_add(a,d,b); /* num? */
gf_sub(d,d,b); /* den? */
gf_mul(b,a,d); /* n*d */
@@ -1199,7 +1146,7 @@ decaf_bool_t API_NS(point_valid) (
gf_sqr(b,p->y);
gf_sub(a,b,a);
gf_sqr(b,p->t);
gf_mlw(c,b,-EDWARDS_D);
gf_mulw_sgn(c,b,-EDWARDS_D);
gf_sqr(b,p->z);
gf_add(b,b,c);
out &= gf_eq(a,b);
@@ -1281,15 +1228,15 @@ static void batch_normalize_niels (

for (i=0; i<n; i++) {
gf_mul(product, table[i]->a, zis[i]);
gf_canon(product);
gf_strong_reduce(product);
gf_cpy(table[i]->a, product);
gf_mul(product, table[i]->b, zis[i]);
gf_canon(product);
gf_strong_reduce(product);
gf_cpy(table[i]->b, product);
gf_mul(product, table[i]->c, zis[i]);
gf_canon(product);
gf_strong_reduce(product);
gf_cpy(table[i]->c, product);
}
}
@@ -1510,7 +1457,7 @@ sv prepare_wnaf_table(
}
}

extern const field_t API_NS(precomputed_wnaf_as_fe)[];
extern const gf API_NS(precomputed_wnaf_as_fe)[];
static const niels_t *API_NS(wnaf_base) = (const niels_t *)API_NS(precomputed_wnaf_as_fe);
const size_t API_NS2(sizeof,precomputed_wnafs) __attribute((visibility("hidden")))
= sizeof(niels_t)<<DECAF_WNAF_FIXED_TABLE_BITS;


src/decaf_gen_tables.c  +24 -26

@@ -19,7 +19,7 @@
#define API_NS2(_pref,_id) _pref##_decaf_255_##_id

/* To satisfy linker. */
const field_t API_NS(precomputed_base_as_fe)[1];
const gf API_NS(precomputed_base_as_fe)[1];
const API_NS(scalar_t) API_NS(precomputed_scalarmul_adjustment);
const API_NS(scalar_t) API_NS(point_scalarmul_adjustment);
const API_NS(scalar_t) sc_r2 = {{{0}}};
@@ -29,7 +29,7 @@ const unsigned char base_point_ser_for_pregen[DECAF_255_SER_BYTES];
const API_NS(point_t) API_NS(point_base);

struct niels_s;
const field_t *API_NS(precomputed_wnaf_as_fe);
const gf_s *API_NS(precomputed_wnaf_as_fe);
extern const size_t API_NS2(sizeof,precomputed_wnafs);

void API_NS(precompute_wnafs) (
@@ -48,26 +48,26 @@ static void scalar_print(const char *name, const API_NS(scalar_t) sc) {
printf("}}};\n\n");
}

static void field_print(const field_t *f) {
const int FIELD_SER_BYTES = (FIELD_BITS + 7) / 8;
unsigned char ser[FIELD_SER_BYTES];
field_serialize(ser,f);
static void field_print(const gf f) {
const int GF_SER_BYTES = (GF_BITS + 7) / 8;
unsigned char ser[GF_SER_BYTES];
gf_serialize(ser,f);
int b=0, i, comma=0;
unsigned long long limb = 0;
printf("FIELD_LITERAL(");
for (i=0; i<FIELD_SER_BYTES; i++) {
printf("{FIELD_LITERAL(");
for (i=0; i<GF_SER_BYTES; i++) {
limb |= ((uint64_t)ser[i])<<b;
b += 8;
if (b >= FIELD_LIT_LIMB_BITS) {
limb &= (1ull<<FIELD_LIT_LIMB_BITS) -1;
b -= FIELD_LIT_LIMB_BITS;
if (b >= GF_LIT_LIMB_BITS) {
limb &= (1ull<<GF_LIT_LIMB_BITS) -1;
b -= GF_LIT_LIMB_BITS;
if (comma) printf(",");
comma = 1;
printf("0x%016llx", limb);
limb = ((uint64_t)ser[i])>>(8-b);
}
}
printf(")");
printf(")}");
assert(b<8);
}

@@ -88,41 +88,39 @@ int main(int argc, char **argv) {
if (ret || !preWnaf) return 1;
API_NS(precompute_wnafs)(preWnaf, real_point_base);

const field_t *output;
const gf_s *output;
unsigned i;
printf("/** @warning: this file was automatically generated. */\n");
printf("#include <decaf.h>\n\n");
printf("#include \"field.h\"\n\n");
printf("#include \"decaf.h\"\n\n");
printf("#define API_NS(_id) decaf_255_##_id\n");
printf("#define API_NS2(_pref,_id) _pref##_decaf_255_##_id\n");
output = (const field_t *)real_point_base;
output = (const gf_s *)real_point_base;
printf("const API_NS(point_t) API_NS(point_base) = {{\n");
for (i=0; i < sizeof(API_NS(point_t)); i+=sizeof(field_t)) {
for (i=0; i < sizeof(API_NS(point_t)); i+=sizeof(gf)) {
if (i) printf(",\n ");
printf("{");
field_print(output++);
printf("}");
}
printf("\n}};\n");
output = (const field_t *)pre;
printf("const field_t API_NS(precomputed_base_as_fe)[%d]\n",
(int)(API_NS2(sizeof,precomputed_s) / sizeof(field_t)));
output = (const gf_s *)pre;
printf("const gf API_NS(precomputed_base_as_fe)[%d]\n",
(int)(API_NS2(sizeof,precomputed_s) / sizeof(gf)));
printf("__attribute__((aligned(%d),visibility(\"hidden\"))) = {\n ", (int)API_NS2(alignof,precomputed_s));
for (i=0; i < API_NS2(sizeof,precomputed_s); i+=sizeof(field_t)) {
for (i=0; i < API_NS2(sizeof,precomputed_s); i+=sizeof(gf)) {
if (i) printf(",\n ");
field_print(output++);
}
printf("\n};\n");
output = (const field_t *)preWnaf;
printf("const field_t API_NS(precomputed_wnaf_as_fe)[%d]\n",
(int)(API_NS2(sizeof,precomputed_wnafs) / sizeof(field_t)));
output = (const gf_s *)preWnaf;
printf("const gf API_NS(precomputed_wnaf_as_fe)[%d]\n",
(int)(API_NS2(sizeof,precomputed_wnafs) / sizeof(gf)));
printf("__attribute__((aligned(%d),visibility(\"hidden\"))) = {\n ", (int)API_NS2(alignof,precomputed_s));
for (i=0; i < API_NS2(sizeof,precomputed_wnafs); i+=sizeof(field_t)) {
for (i=0; i < API_NS2(sizeof,precomputed_wnafs); i+=sizeof(gf)) {
if (i) printf(",\n ");
field_print(output++);
}


src/include/field.h  +54 -44

@@ -1,23 +1,20 @@
/**
* @file field.h
* @brief Generic field header.
* @brief Generic gf header.
* @copyright
* Copyright (c) 2014 Cryptography Research, Inc. \n
* Released under the MIT License. See LICENSE.txt for license information.
* @author Mike Hamburg
*/

#ifndef __FIELD_H__
#define __FIELD_H__
#ifndef __GF_H__
#define __GF_H__

#include "constant_time.h"
#include "f_field.h"
#include <string.h>

typedef struct field_t field_a_t[1];
#define field_a_restrict_t struct field_t *__restrict__

#define is32 (GOLDI_BITS == 32 || FIELD_BITS != 448)
#define is32 (GOLDI_BITS == 32 || GF_BITS != 448)
#if (is32)
#define IF32(s) (s)
#else
@@ -33,9 +30,9 @@ typedef struct field_t field_a_t[1];
* If x=0, returns 0.
*/
void
field_isr (
field_a_t a,
const field_a_t x
gf_isr (
gf a,
const gf x
);
/**
@@ -43,62 +40,75 @@ field_isr (
*/
static __inline__ void
__attribute__((unused,always_inline))
field_sqrn (
field_a_restrict_t y,
const field_a_t x,
gf_sqrn (
gf_s *__restrict__ y,
const gf x,
int n
) {
field_a_t tmp;
gf tmp;
assert(n>0);
if (n&1) {
field_sqr(y,x);
gf_sqr(y,x);
n--;
} else {
field_sqr(tmp,x);
field_sqr(y,tmp);
gf_sqr(tmp,x);
gf_sqr(y,tmp);
n-=2;
}
for (; n; n-=2) {
field_sqr(tmp,y);
field_sqr(y,tmp);
gf_sqr(tmp,y);
gf_sqr(y,tmp);
}
}

static __inline__ void
field_subx_RAW (
field_a_t d,
const field_a_t a,
const field_a_t b
gf_subx_RAW (
gf d,
const gf a,
const gf b
) {
field_sub_RAW ( d, a, b );
field_bias( d, 2 );
IF32( field_weak_reduce ( d ) );
gf_sub_RAW ( d, a, b );
gf_bias( d, 2 );
IF32( gf_weak_reduce ( d ) );
}

static __inline__ void
field_sub (
field_a_t d,
const field_a_t a,
const field_a_t b
gf_sub (
gf d,
const gf a,
const gf b
) {
field_sub_RAW ( d, a, b );
field_bias( d, 2 );
field_weak_reduce ( d );
gf_sub_RAW ( d, a, b );
gf_bias( d, 2 );
gf_weak_reduce ( d );
}

static __inline__ void
field_add (
field_a_t d,
const field_a_t a,
const field_a_t b
gf_add (
gf d,
const gf a,
const gf b
) {
field_add_RAW ( d, a, b );
field_weak_reduce ( d );
gf_add_RAW ( d, a, b );
gf_weak_reduce ( d );
}

#define gf_add_nr gf_add_RAW

/** Subtract mod p. Bias by 2 and don't reduce */
static inline void gf_sub_nr ( gf c, const gf a, const gf b ) {
// FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] );
gf_sub_RAW(c,a,b);
gf_bias(c, 2);
if (DECAF_WORD_BITS==32) gf_weak_reduce(c); // HACK
}

/** Subtract mod p. Bias by amt but don't reduce. */
static inline void gf_subx_nr ( gf c, const gf a, const gf b, int amt ) {
gf_sub_RAW(c,a,b);
gf_bias(c, amt);
if (DECAF_WORD_BITS==32) gf_weak_reduce(c); // HACK
}

/* FIXME: no warnings on RAW routines */
#define field_add_nr field_add_RAW
#define field_sub_nr field_sub_RAW
#define field_subx_nr field_subx_RAW

#endif // __FIELD_H__
#endif // __GF_H__

src/p25519/arch_ref64/p25519.c  +20 -20

@@ -17,10 +17,10 @@ static __inline__ uint64_t is_zero(uint64_t a) {
}

void
p255_mul (
p255_t *__restrict__ cs,
const p255_t *as,
const p255_t *bs
gf_25519_mul (
gf_25519_t __restrict__ cs,
const gf_25519_t as,
const gf_25519_t bs
) {
const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
@@ -52,9 +52,9 @@ p255_mul (
}

void
p255_mulw (
p255_t *__restrict__ cs,
const p255_t *as,
gf_25519_mulw (
gf_25519_t __restrict__ cs,
const gf_25519_t as,
uint64_t b
) {
const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
@@ -79,16 +79,16 @@ p255_mulw (
}

void
p255_sqr (
p255_t *__restrict__ cs,
const p255_t *as
gf_25519_sqr (
gf_25519_t __restrict__ cs,
const gf_25519_t as
) {
p255_mul(cs,as,as); // TODO
gf_25519_mul(cs,as,as); // TODO
}

void
p255_strong_reduce (
p255_t *a
gf_25519_strong_reduce (
gf_25519_t a
) {
uint64_t mask = (1ull<<51)-1;

@@ -128,14 +128,14 @@ p255_strong_reduce (
}

void
p255_serialize (
gf_25519_serialize (
uint8_t serial[32],
const struct p255_t *x
const gf_25519_t x
) {
int i,j;
p255_t red;
p255_copy(&red, x);
p255_strong_reduce(&red);
gf_25519_t red;
gf_25519_copy(&red, x);
gf_25519_strong_reduce(&red);
uint64_t *r = red.limb;
uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
for (i=0; i<4; i++) {
@@ -147,8 +147,8 @@ p255_serialize (
}

mask_t
p255_deserialize (
p255_t *x,
gf_25519_deserialize (
gf_25519_t x,
const uint8_t serial[32]
) {
int i,j;


src/p25519/arch_ref64/p25519.h  +53 -53

@@ -1,8 +1,8 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#ifndef __P255_H__
#define __P255_H__ 1
#ifndef __P25519_H__
#define __P25519_H__ 1

#include <stdint.h>
#include <assert.h>
@@ -10,9 +10,9 @@

#include "word.h"

typedef struct p255_t {
typedef struct gf_25519_s {
uint64_t limb[5];
} p255_t;
} gf_25519_s, gf_25519_t[1];

#define LBITS 51
#define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}
@@ -32,113 +32,113 @@ extern "C" {
#endif

static __inline__ void
p255_add_RAW (
p255_t *out,
const p255_t *a,
const p255_t *b
gf_25519_add_RAW (
gf_25519_t out,
const gf_25519_t a,
const gf_25519_t b
) __attribute__((unused));
static __inline__ void
p255_sub_RAW (
p255_t *out,
const p255_t *a,
const p255_t *b
gf_25519_sub_RAW (
gf_25519_t out,
const gf_25519_t a,
const gf_25519_t b
) __attribute__((unused));
static __inline__ void
p255_copy (
p255_t *out,
const p255_t *a
gf_25519_copy (
gf_25519_t out,
const gf_25519_t a
) __attribute__((unused));
static __inline__ void
p255_weak_reduce (
p255_t *inout
gf_25519_weak_reduce (
gf_25519_t inout
) __attribute__((unused));
void
p255_strong_reduce (
p255_t *inout
gf_25519_strong_reduce (
gf_25519_t inout
);

static __inline__ void
p255_bias (
p255_t *inout,
gf_25519_bias (
gf_25519_t inout,
int amount
) __attribute__((unused));
void
p255_mul (
p255_t *__restrict__ out,
const p255_t *a,
const p255_t *b
gf_25519_mul (
gf_25519_s *__restrict__ out,
const gf_25519_t a,
const gf_25519_t b
);

void
p255_mulw (
p255_t *__restrict__ out,
const p255_t *a,
gf_25519_mulw (
gf_25519_s *__restrict__ out,
const gf_25519_t a,
uint64_t b
);

void
p255_sqr (
p255_t *__restrict__ out,
const p255_t *a
gf_25519_sqr (
gf_25519_s *__restrict__ out,
const gf_25519_t a
);

void
p255_serialize (
gf_25519_serialize (
uint8_t serial[32],
const struct p255_t *x
const gf_25519_t x
);

mask_t
p255_deserialize (
p255_t *x,
gf_25519_deserialize (
gf_25519_t x,
const uint8_t serial[32]
);

/* -------------- Inline functions begin here -------------- */

void
p255_add_RAW (
p255_t *out,
const p255_t *a,
const p255_t *b
gf_25519_add_RAW (
gf_25519_t out,
const gf_25519_t a,
const gf_25519_t b
) {
unsigned int i;
for (i=0; i<5; i++) {
out->limb[i] = a->limb[i] + b->limb[i];
}
p255_weak_reduce(out);
gf_25519_weak_reduce(out);
}

void
p255_sub_RAW (
p255_t *out,
const p255_t *a,
const p255_t *b
gf_25519_sub_RAW (
gf_25519_t out,
const gf_25519_t a,
const gf_25519_t b
) {
unsigned int i;
uint64_t co1 = ((1ull<<51)-1)*2, co2 = co1-36;
for (i=0; i<5; i++) {
out->limb[i] = a->limb[i] - b->limb[i] + ((i==0) ? co2 : co1);
}
p255_weak_reduce(out);
gf_25519_weak_reduce(out);
}

void
p255_copy (
p255_t *out,
const p255_t *a
gf_25519_copy (
gf_25519_t out,
const gf_25519_t a
) {
memcpy(out,a,sizeof(*a));
}

void
p255_bias (
p255_t *a,
gf_25519_bias (
gf_25519_t a,
int amt
) {
(void) a;
@@ -146,8 +146,8 @@ p255_bias (
}

void
p255_weak_reduce (
p255_t *a
gf_25519_weak_reduce (
gf_25519_t a
) {
uint64_t mask = (1ull<<51) - 1;
uint64_t tmp = a->limb[4] >> 51;
@@ -162,4 +162,4 @@ p255_weak_reduce (
}; /* extern "C" */
#endif

#endif /* __P255_H__ */
#endif /* __P25519_H__ */

src/p25519/arch_x86_64/p25519.c  +20 -20

@@ -10,10 +10,10 @@ static inline uint64_t shr(__uint128_t x, int n) {
}

void
p255_mul (
p255_t *__restrict__ cs,
const p255_t *as,
const p255_t *bs
gf_25519_mul (
gf_25519_s *__restrict__ cs,
const gf_25519_t as,
const gf_25519_t bs
) {
const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
uint64_t *c = cs->limb;
@@ -92,9 +92,9 @@ p255_mul (
}

void
p255_sqr (
p255_t *__restrict__ cs,
const p255_t *as
gf_25519_sqr (
gf_25519_s *__restrict__ cs,
const gf_25519_t as
) {
const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
uint64_t *c = cs->limb;
@@ -156,9 +156,9 @@ p255_sqr (
}

void
p255_mulw (
p255_t *__restrict__ cs,
const p255_t *as,
gf_25519_mulw (
gf_25519_s *__restrict__ cs,
const gf_25519_t as,
uint64_t b
) {
const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
@@ -191,8 +191,8 @@ p255_mulw (
}

void
p255_strong_reduce (
p255_t *a
gf_25519_strong_reduce (
gf_25519_t a
) {
uint64_t mask = (1ull<<51)-1;

@@ -232,15 +232,15 @@ p255_strong_reduce (
}

void
p255_serialize (
gf_25519_serialize (
uint8_t serial[32],
const struct p255_t *x
const gf_25519_t x
) {
int i,j;
p255_t red;
p255_copy(&red, x);
p255_strong_reduce(&red);
uint64_t *r = red.limb;
gf_25519_t red;
gf_25519_copy(red, x);
gf_25519_strong_reduce(red);
uint64_t *r = red->limb;
uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
for (i=0; i<4; i++) {
for (j=0; j<8; j++) {
@@ -251,8 +251,8 @@ p255_serialize (
}

mask_t
p255_deserialize (
p255_t *x,
gf_25519_deserialize (
gf_25519_t x,
const uint8_t serial[32]
) {
int i,j;


src/p25519/arch_x86_64/p25519.h  +54 -51

@@ -1,8 +1,8 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#ifndef __P255_H__
#define __P255_H__ 1
#ifndef __P25519_H__
#define __P25519_H__ 1

#include <stdint.h>
#include <assert.h>
@@ -10,9 +10,12 @@

#include "word.h"

typedef struct p255_t {
#ifndef __DECAF_255_H__ // HACK FIXME
#define DECAF_WORD_BITS 64
typedef struct gf_25519_s {
uint64_t limb[5];
} p255_t;
} gf_25519_s, gf_25519_t[1];
#endif

#define LBITS 51
#define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}
@@ -32,80 +35,80 @@ extern "C" {
#endif

static __inline__ void
p255_add_RAW (
p255_t *out,
const p255_t *a,
const p255_t *b
gf_25519_add_RAW (
gf_25519_t out,
const gf_25519_t a,
const gf_25519_t b
) __attribute__((unused));
static __inline__ void
p255_sub_RAW (
p255_t *out,
const p255_t *a,
const p255_t *b
gf_25519_sub_RAW (
gf_25519_t out,
const gf_25519_t a,
const gf_25519_t b
) __attribute__((unused));
static __inline__ void
p255_copy (
p255_t *out,
const p255_t *a
gf_25519_copy (
gf_25519_t out,
const gf_25519_t a
) __attribute__((unused));
static __inline__ void
p255_weak_reduce (
p255_t *inout
gf_25519_weak_reduce (
gf_25519_t inout
) __attribute__((unused));
void
p255_strong_reduce (
p255_t *inout
gf_25519_strong_reduce (
gf_25519_t inout
);

static __inline__ void
p255_bias (
p255_t *inout,
gf_25519_bias (
gf_25519_t inout,
int amount
) __attribute__((unused));
void
p255_mul (
p255_t *__restrict__ out,
const p255_t *a,
const p255_t *b
gf_25519_mul (
gf_25519_s *__restrict__ out,
const gf_25519_t a,
const gf_25519_t b
);

void
p255_mulw (
p255_t *__restrict__ out,
const p255_t *a,
gf_25519_mulw (
gf_25519_s *__restrict__ out,
const gf_25519_t a,
uint64_t b
);

void
p255_sqr (
p255_t *__restrict__ out,
const p255_t *a
gf_25519_sqr (
gf_25519_s *__restrict__ out,
const gf_25519_t a
);

void
p255_serialize (
gf_25519_serialize (
uint8_t serial[32],
const struct p255_t *x
const gf_25519_t x
);

mask_t
p255_deserialize (
p255_t *x,
gf_25519_deserialize (
gf_25519_t x,
const uint8_t serial[32]
);

/* -------------- Inline functions begin here -------------- */

void
p255_add_RAW (
p255_t *out,
const p255_t *a,
const p255_t *b
gf_25519_add_RAW (
gf_25519_t out,
const gf_25519_t a,
const gf_25519_t b
) {
unsigned int i;
for (i=0; i<5; i++) {
@@ -114,10 +117,10 @@ p255_add_RAW (
}

void
p255_sub_RAW (
p255_t *out,
const p255_t *a,
const p255_t *b
gf_25519_sub_RAW (
gf_25519_t out,
const gf_25519_t a,
const gf_25519_t b
) {
unsigned int i;
uint64_t co1 = ((1ull<<51)-1)*2, co2 = co1-36;
@@ -127,16 +130,16 @@ p255_sub_RAW (
}

void
p255_copy (
p255_t *out,
const p255_t *a
gf_25519_copy (
gf_25519_t out,
const gf_25519_t a
) {
memcpy(out,a,sizeof(*a));
}

void
p255_bias (
p255_t *a,
gf_25519_bias (
gf_25519_t a,
int amt
) {
a->limb[0] += ((uint64_t)(amt)<<52) - 38*amt;
@@ -147,8 +150,8 @@ p255_bias (
}

void
p255_weak_reduce (
p255_t *a
gf_25519_weak_reduce (
gf_25519_t a
) {
uint64_t mask = (1ull<<51) - 1;
uint64_t tmp = a->limb[4] >> 51;
@@ -163,4 +166,4 @@ p255_weak_reduce (
}; /* extern "C" */
#endif

#endif /* __P255_H__ */
#endif /* __P25519_H__ */

src/p25519/arch_x86_64/x86-64-arith.h  +0 -1

@@ -1 +0,0 @@
../../p448/arch_x86_64/x86-64-arith.h

src/p25519/arch_x86_64/x86-64-arith.h  +323 -0

@@ -0,0 +1,323 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

#ifndef __X86_64_ARITH_H__
#define __X86_64_ARITH_H__

#include <stdint.h>

/* TODO: non x86-64 versions of these.
* FUTURE: autogenerate
*/

static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
#ifndef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rax;"
"mulq %[b];"
: [c]"=a"(c), [d]"=d"(d)
: [b]"m"(*b), [a]"m"(*a)
: "cc");
return (((__uint128_t)(d))<<64) | c;
#else
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx;"
"mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"m"(*b), [a]"m"(*a)
: "rdx");
return (((__uint128_t)(d))<<64) | c;
#endif
}

static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
#ifndef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rax;"
"mulq %[b];"
: [c]"=a"(c), [d]"=d"(d)
: [b]"m"(*b), [a]"r"(a)
: "cc");
return (((__uint128_t)(d))<<64) | c;
#else
uint64_t c,d;
__asm__ volatile
("mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"m"(*b), [a]"d"(a));
return (((__uint128_t)(d))<<64) | c;
#endif
}

static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) {
#ifndef __BMI2__
uint64_t c,d;
__asm__ volatile
("mulq %[b];"
: [c]"=a"(c), [d]"=d"(d)
: [b]"r"(b), "a"(a)
: "cc");
return (((__uint128_t)(d))<<64) | c;
#else
uint64_t c,d;
__asm__ volatile
("mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"r"(b), [a]"d"(a));
return (((__uint128_t)(d))<<64) | c;
#endif
}

static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
#ifndef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rax; "
"addq %%rax, %%rax; "
"mulq %[b];"
: [c]"=a"(c), [d]"=d"(d)
: [b]"m"(*b), [a]"m"(*a)
: "cc");
return (((__uint128_t)(d))<<64) | c;
#else
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx;"
"leaq (,%%rdx,2), %%rdx;"
"mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"m"(*b), [a]"m"(*a)
: "rdx");
return (((__uint128_t)(d))<<64) | c;
#endif
}

static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}

static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
uint64_t lo2 = *acc2, hi2 = *acc2>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
"addq %[c], %[lo2]; "
"adcq %[d], %[hi2]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
"addq %%rax, %[lo2]; "
"adcq %%rdx, %[hi2]; "
: [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
*acc2 = (((__uint128_t)(hi2))<<64) | lo2;
}

static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"d"(a)
: "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"r"(a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}

static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"r"(b), [a]"d"(a)
: "cc");
#else
__asm__ volatile
("mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"r"(b), "a"(a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}

static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"addq %%rdx, %%rdx; "
"mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"addq %%rax, %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}

static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"subq %[c], %[lo]; "
"sbbq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"subq %%rax, %[lo]; "
"sbbq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}

static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"addq %%rdx, %%rdx; "
"mulx %[b], %[c], %[d]; "
"subq %[c], %[lo]; "
"sbbq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"addq %%rax, %%rax; "
"mulq %[b]; "
"subq %%rax, %[lo]; "
"sbbq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}

static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t c,d, lo = *acc, hi = *acc>>64;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"subq %[lo], %[c]; "
"sbbq %[hi], %[d]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
*acc = (((__uint128_t)(d))<<64) | c;
}

static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) {
return ((__uint128_t)(a)) * b;
}

static __inline__ __int128_t widemuls(int64_t a, int64_t b) {
return ((__int128_t)(a)) * b;
}
static __inline__ uint64_t opacify(uint64_t x) {
__asm__ volatile("" : "+r"(x));
return x;
}

static __inline__ mask_t is_zero(uint64_t x) {
__asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
return ~x;
}

#endif /* __X86_64_ARITH_H__ */

src/p25519/f_arithmetic.c  +14 -14

@@ -10,7 +10,7 @@

#include "field.h"

const field_a_t P25519_SQRT_MINUS_ONE = {FIELD_LITERAL(
const gf_25519_t P25519_SQRT_MINUS_ONE = {FIELD_LITERAL(
0x61b274a0ea0b0,
0x0d5a5fc8f189d,
0x7ef5e9cbd0c60,
@@ -18,7 +18,7 @@ const field_a_t P25519_SQRT_MINUS_ONE = {FIELD_LITERAL(
0x2b8324804fc1d
)};
const field_a_t SQRT_ONE_MINUS_D = {FIELD_LITERAL( // FIXME MAGIC goes elsewhere?
const gf_25519_t SQRT_ONE_MINUS_D = {FIELD_LITERAL( // FIXME MAGIC goes elsewhere?
0x6db8831bbddec,
0x38d7b56c9c165,
0x016b221394bdc,
@@ -26,15 +26,15 @@ const field_a_t SQRT_ONE_MINUS_D = {FIELD_LITERAL( // FIXME MAGIC goes elsewhere
0x0a0d85b4032b1
)};
static const field_a_t ONE = {FIELD_LITERAL( // FIXME copy-pasted
static const gf_25519_t ONE = {FIELD_LITERAL( // FIXME copy-pasted
1,0,0,0,0
)};

// ARCH MAGIC FIXME copy-pasted from decaf_fast.c
static mask_t gf_eq(const field_a_t a, const field_a_t b) {
field_a_t c;
field_sub(c,a,b);
field_strong_reduce(c);
static mask_t gf_eq(const gf_25519_t a, const gf_25519_t b) {
gf_25519_t c;
gf_sub(c,a,b);
gf_strong_reduce(c);
mask_t ret=0;
int i;
for (i=0; i<5; i++) { ret |= c->limb[i]; }
@@ -43,19 +43,19 @@ static mask_t gf_eq(const field_a_t a, const field_a_t b) {

/* Guarantee: a^2 x = 0 if x = 0; else a^2 x = 1 or SQRT_MINUS_ONE; */
void
field_isr (
field_a_t a,
const field_a_t x
gf_isr (
gf_25519_t a,
const gf_25519_t x
) {
field_a_t st[3], tmp1, tmp2;
gf_25519_t st[3], tmp1, tmp2;
const struct { unsigned char sh, idx; } ops[] = {
{1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2}
};
st[0][0] = st[1][0] = st[2][0] = x[0];
unsigned int i;
for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) {
field_sqrn(tmp1, st[1^(i&1)], ops[i].sh);
field_mul(tmp2, tmp1, st[ops[i].idx]);
gf_sqrn(tmp1, st[1^(i&1)], ops[i].sh);
gf_mul(tmp2, tmp1, st[ops[i].idx]);
st[i&1][0] = tmp2[0];
}
@@ -64,5 +64,5 @@ field_isr (
// ARCH MAGIC FIXME: should be cond_sel
for (i=0; i<5; i++) tmp1->limb[i] = (ONE->limb[i] & mask)
| (SQRT_MINUS_ONE->limb[i] & ~mask);
field_mul(a,tmp1,st[0]);
gf_mul(a,tmp1,st[0]);
}

src/p25519/f_field.h  +16 -15

@@ -13,20 +13,21 @@
#include <string.h>

#include "p25519.h"
#define FIELD_LIT_LIMB_BITS 51
#define FIELD_BITS 255
#define field_t p255_t
#define field_mul p255_mul
#define field_sqr p255_sqr
#define field_add_RAW p255_add_RAW
#define field_sub_RAW p255_sub_RAW
#define field_mulw p255_mulw
#define field_bias p255_bias
#define field_isr p255_isr
#define field_weak_reduce p255_weak_reduce
#define field_strong_reduce p255_strong_reduce
#define field_serialize p255_serialize
#define field_deserialize p255_deserialize
#define SQRT_MINUS_ONE P25519_SQRT_MINUS_ONE
#define GF_LIT_LIMB_BITS 51
#define GF_BITS 255
#define gf gf_25519_t
#define gf_s gf_25519_s
#define gf_mul gf_25519_mul
#define gf_sqr gf_25519_sqr
#define gf_add_RAW gf_25519_add_RAW
#define gf_sub_RAW gf_25519_sub_RAW
#define gf_mulw gf_25519_mulw
#define gf_bias gf_25519_bias
#define gf_isr gf_25519_isr
#define gf_weak_reduce gf_25519_weak_reduce
#define gf_strong_reduce gf_25519_strong_reduce
#define gf_serialize gf_25519_serialize
#define gf_deserialize gf_25519_deserialize
#define SQRT_MINUS_ONE P25519_SQRT_MINUS_ONE

#endif /* __F_FIELD_H__ */

src/p448/f_arithmetic.c  +28 -28

@@ -11,33 +11,33 @@
#include "field.h"

void
field_isr (
field_a_t a,
const field_a_t x
gf_isr (
gf_a_t a,
const gf_a_t x
) {
field_a_t L0, L1, L2;
field_sqr ( L1, x );
field_mul ( L2, x, L1 );
field_sqr ( L1, L2 );
field_mul ( L2, x, L1 );
field_sqrn ( L1, L2, 3 );
field_mul ( L0, L2, L1 );
field_sqrn ( L1, L0, 3 );
field_mul ( L0, L2, L1 );
field_sqrn ( L2, L0, 9 );
field_mul ( L1, L0, L2 );
field_sqr ( L0, L1 );
field_mul ( L2, x, L0 );
field_sqrn ( L0, L2, 18 );
field_mul ( L2, L1, L0 );
field_sqrn ( L0, L2, 37 );
field_mul ( L1, L2, L0 );
field_sqrn ( L0, L1, 37 );
field_mul ( L1, L2, L0 );
field_sqrn ( L0, L1, 111 );
field_mul ( L2, L1, L0 );
field_sqr ( L0, L2 );
field_mul ( L1, x, L0 );
field_sqrn ( L0, L1, 223 );
field_mul ( a, L2, L0 );
gf_a_t L0, L1, L2;
gf_sqr ( L1, x );
gf_mul ( L2, x, L1 );
gf_sqr ( L1, L2 );
gf_mul ( L2, x, L1 );
gf_sqrn ( L1, L2, 3 );
gf_mul ( L0, L2, L1 );
gf_sqrn ( L1, L0, 3 );
gf_mul ( L0, L2, L1 );
gf_sqrn ( L2, L0, 9 );
gf_mul ( L1, L0, L2 );
gf_sqr ( L0, L1 );
gf_mul ( L2, x, L0 );
gf_sqrn ( L0, L2, 18 );
gf_mul ( L2, L1, L0 );
gf_sqrn ( L0, L2, 37 );
gf_mul ( L1, L2, L0 );
gf_sqrn ( L0, L1, 37 );
gf_mul ( L1, L2, L0 );
gf_sqrn ( L0, L1, 111 );
gf_mul ( L2, L1, L0 );
gf_sqr ( L0, L2 );
gf_mul ( L1, x, L0 );
gf_sqrn ( L0, L1, 223 );
gf_mul ( a, L2, L0 );
}

src/p448/f_field.h  +14 -14

@@ -13,19 +13,19 @@
#include <string.h>

#include "p448.h"
#define FIELD_LIT_LIMB_BITS 56
#define FIELD_BITS 448
#define field_t p448_t
#define field_mul p448_mul
#define field_sqr p448_sqr
#define field_add_RAW p448_add_RAW
#define field_sub_RAW p448_sub_RAW
#define field_mulw p448_mulw
#define field_bias p448_bias
#define field_isr p448_isr
#define field_weak_reduce p448_weak_reduce
#define field_strong_reduce p448_strong_reduce
#define field_serialize p448_serialize
#define field_deserialize p448_deserialize
#define GF_LIT_LIMB_BITS 56
#define GF_BITS 448
#define gf p448_t
#define gf_mul p448_mul
#define gf_sqr p448_sqr
#define gf_add_RAW p448_add_RAW
#define gf_sub_RAW p448_sub_RAW
#define gf_mulw p448_mulw
#define gf_bias p448_bias
#define gf_isr p448_isr
#define gf_weak_reduce p448_weak_reduce
#define gf_strong_reduce p448_strong_reduce
#define gf_serialize p448_serialize
#define gf_deserialize p448_deserialize

#endif /* __F_FIELD_H__ */

src/p480/f_arithmetic.c  +28 -28

@@ -11,33 +11,33 @@
#include "field.h"

void
field_isr (
field_a_t a,
const field_a_t x
gf_isr (
gf_a_t a,
const gf_a_t x
) {
field_a_t L0, L1, L2, L3;
field_sqr ( L2, x );
field_mul ( L1, x, L2 );
field_sqrn ( L0, L1, 2 );
field_mul ( L2, L1, L0 );
field_sqrn ( L0, L2, 4 );
field_mul ( L1, L2, L0 );
field_sqr ( L0, L1 );
field_mul ( L2, x, L0 );
field_sqrn ( L0, L2, 8 );
field_mul ( L2, L1, L0 );
field_sqrn ( L0, L2, 17 );
field_mul ( L1, L2, L0 );
field_sqrn ( L0, L1, 17 );
field_mul ( L1, L2, L0 );
field_sqrn ( L3, L1, 17 );
field_mul ( L0, L2, L3 );
field_sqrn ( L2, L0, 51 );
field_mul ( L0, L1, L2 );
field_sqrn ( L1, L0, 119 );
field_mul ( L2, L0, L1 );
field_sqr ( L0, L2 );
field_mul ( L1, x, L0 );
field_sqrn ( L0, L1, 239 );
field_mul ( a, L2, L0 );
gf_a_t L0, L1, L2, L3;
gf_sqr ( L2, x );
gf_mul ( L1, x, L2 );
gf_sqrn ( L0, L1, 2 );
gf_mul ( L2, L1, L0 );
gf_sqrn ( L0, L2, 4 );
gf_mul ( L1, L2, L0 );
gf_sqr ( L0, L1 );
gf_mul ( L2, x, L0 );
gf_sqrn ( L0, L2, 8 );
gf_mul ( L2, L1, L0 );
gf_sqrn ( L0, L2, 17 );
gf_mul ( L1, L2, L0 );
gf_sqrn ( L0, L1, 17 );
gf_mul ( L1, L2, L0 );
gf_sqrn ( L3, L1, 17 );
gf_mul ( L0, L2, L3 );
gf_sqrn ( L2, L0, 51 );
gf_mul ( L0, L1, L2 );
gf_sqrn ( L1, L0, 119 );
gf_mul ( L2, L0, L1 );
gf_sqr ( L0, L2 );
gf_mul ( L1, x, L0 );
gf_sqrn ( L0, L1, 239 );
gf_mul ( a, L2, L0 );
}

src/p480/f_field.h  +14 -14

@@ -13,19 +13,19 @@
#include <string.h>

#include "p480.h"
#define FIELD_LIT_LIMB_BITS 60
#define FIELD_BITS 480
#define field_t p480_t
#define field_mul p480_mul
#define field_sqr p480_sqr
#define field_add_RAW p480_add_RAW
#define field_sub_RAW p480_sub_RAW
#define field_mulw p480_mulw
#define field_bias p480_bias
#define field_isr p480_isr
#define field_weak_reduce p480_weak_reduce
#define field_strong_reduce p480_strong_reduce
#define field_serialize p480_serialize
#define field_deserialize p480_deserialize
#define GF_LIT_LIMB_BITS 60
#define GF_BITS 480
#define gf p480_t
#define gf_mul p480_mul
#define gf_sqr p480_sqr
#define gf_add_RAW p480_add_RAW
#define gf_sub_RAW p480_sub_RAW
#define gf_mulw p480_mulw
#define gf_bias p480_bias
#define gf_isr p480_isr
#define gf_weak_reduce p480_weak_reduce
#define gf_strong_reduce p480_strong_reduce
#define gf_serialize p480_serialize
#define gf_deserialize p480_deserialize

#endif /* __F_FIELD_H__ */

src/p521/f_arithmetic.c  +28 -28

@@ -11,33 +11,33 @@
#include "field.h"

void
field_isr (
field_a_t a,
const field_a_t x
gf_isr (
gf_a_t a,
const gf_a_t x
) {
field_a_t L0, L1, L2;
field_sqr ( L1, x );
field_mul ( L0, x, L1 );
field_sqrn ( L2, L0, 2 );
field_mul ( L1, L0, L2 );
field_sqrn ( L2, L1, 4 );
field_mul ( L0, L1, L2 );
field_sqrn ( L2, L0, 8 );
field_mul ( L1, L0, L2 );
field_sqrn ( L2, L1, 16 );
field_mul ( L0, L1, L2 );
field_sqrn ( L2, L0, 32 );
field_mul ( L1, L0, L2 );
field_sqr ( L2, L1 );
field_mul ( L0, x, L2 );
field_sqrn ( L2, L0, 64 );
field_mul ( L0, L1, L2 );
field_sqrn ( L2, L0, 129 );
field_mul ( L1, L0, L2 );
field_sqr ( L2, L1 );
field_mul ( L0, x, L2 );
field_sqrn ( L2, L0, 259 );
field_mul ( L1, L0, L2 );
field_sqr ( L0, L1 );
field_mul ( a, x, L0 );
gf_a_t L0, L1, L2;
gf_sqr ( L1, x );
gf_mul ( L0, x, L1 );
gf_sqrn ( L2, L0, 2 );
gf_mul ( L1, L0, L2 );
gf_sqrn ( L2, L1, 4 );
gf_mul ( L0, L1, L2 );
gf_sqrn ( L2, L0, 8 );
gf_mul ( L1, L0, L2 );
gf_sqrn ( L2, L1, 16 );
gf_mul ( L0, L1, L2 );
gf_sqrn ( L2, L0, 32 );
gf_mul ( L1, L0, L2 );
gf_sqr ( L2, L1 );
gf_mul ( L0, x, L2 );
gf_sqrn ( L2, L0, 64 );
gf_mul ( L0, L1, L2 );
gf_sqrn ( L2, L0, 129 );
gf_mul ( L1, L0, L2 );
gf_sqr ( L2, L1 );
gf_mul ( L0, x, L2 );
gf_sqrn ( L2, L0, 259 );
gf_mul ( L1, L0, L2 );
gf_sqr ( L0, L1 );
gf_mul ( a, x, L0 );
}

src/p521/f_field.h  +14 -14

@@ -13,19 +13,19 @@
#include "constant_time.h"

#include "p521.h"
#define FIELD_LIT_LIMB_BITS 58
#define FIELD_BITS 521
#define field_t p521_t
#define field_mul p521_mul
#define field_sqr p521_sqr
#define field_add_RAW p521_add_RAW
#define field_sub_RAW p521_sub_RAW
#define field_mulw p521_mulw
#define field_bias p521_bias
#define field_isr p521_isr
#define field_weak_reduce p521_weak_reduce
#define field_strong_reduce p521_strong_reduce
#define field_serialize p521_serialize
#define field_deserialize p521_deserialize
#define GF_LIT_LIMB_BITS 58
#define GF_BITS 521
#define gf p521_t
#define gf_mul p521_mul
#define gf_sqr p521_sqr
#define gf_add_RAW p521_add_RAW
#define gf_sub_RAW p521_sub_RAW
#define gf_mulw p521_mulw
#define gf_bias p521_bias
#define gf_isr p521_isr
#define gf_weak_reduce p521_weak_reduce
#define gf_strong_reduce p521_strong_reduce
#define gf_serialize p521_serialize
#define gf_deserialize p521_deserialize

#endif /* __F_FIELD_H__ */

src/public_include/decaf/decaf_255.h  +5 -3

@@ -21,11 +21,13 @@ extern "C" {
#define DECAF_255_SCALAR_BITS 254 // Curve25519: 253
#define DECAF_255_SCALAR_LIMBS (256/DECAF_WORD_BITS)

#ifndef __DECAF_GF_ALREADY_DEFINED__
/** Galois field element internal structure */
typedef struct gf_255_s {
typedef struct gf_25519_s {
decaf_word_t limb[DECAF_255_LIMBS];
} gf_255_s, gf_255_t[1];
} gf_25519_s, gf_25519_t[1];
/** @endcond */
#endif /* __DECAF_GF_ALREADY_DEFINED__ */

/** Number of bytes in a serialized point. */
#define DECAF_255_SER_BYTES 32
@@ -34,7 +36,7 @@ typedef struct gf_255_s {
#define DECAF_255_SCALAR_BYTES 32

/** Twisted Edwards (-1,d-1) extended homogeneous coordinates */
typedef struct decaf_255_point_s { /**@cond internal*/gf_255_t x,y,z,t;/**@endcond*/ } decaf_255_point_t[1];
typedef struct decaf_255_point_s { /**@cond internal*/gf_25519_t x,y,z,t;/**@endcond*/ } decaf_255_point_t[1];

/** Precomputed table based on a point. Can be trivial implementation. */
struct decaf_255_precomputed_s;

