it compiles, but it certainly doesn't work yet

10 years ago · 03ecad0551
--- a/+ 4
+++ b/+ 4
@@ -19,13 +19,13 @@ ASM ?= $(CC)
 DECAF ?= decaf_fast

 ifneq (,$(findstring x86_64,$(MACHINE)))
 ARCH ?= arch_x86_64
 ARCH ?= arch_ref64
 else
 # no i386 port yet
 ARCH ?= arch_arm_32
 ARCH ?= arch_ref32
 endif

 FIELD ?= p255
 FIELD ?= p25519

 WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
 	 -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)
@@ -35,7 +35,7 @@ INCFLAGS = -Isrc/include -Iinclude -Isrc/$(FIELD) -Isrc/$(FIELD)/$(ARCH)
 LANGFLAGS = -std=c99 -fno-strict-aliasing
 LANGXXFLAGS = -fno-strict-aliasing
 GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC
 OFLAGS ?= -O3
 OFLAGS ?= -O2

 TODAY = $(shell date "+%Y-%m-%d")

--- a/include/decaf_255.h
+++ b/include/decaf_255.h
@@ -61,7 +61,7 @@ typedef uint64_t decaf_dword_t;
 /** Galois field element internal structure */
 typedef struct gf_s {
    decaf_word_t limb[DECAF_255_LIMBS];
 } __attribute__((aligned(32))) gf_s, gf[1];
 } gf_s, gf[1];
 /** @endcond */

 /** Number of bytes in a serialized point. */
--- a/include/decaf_crypto.h
+++ b/include/decaf_crypto.h
@@ -18,7 +18,7 @@
 #include "shake.h"

 /** Number of bytes for a symmetric key (expanded to full key) */
 #define DECAF_448_SYMMETRIC_KEY_BYTES 32
 #define DECAF_255_SYMMETRIC_KEY_BYTES 32

 /** @cond internal */
 #define API_VIS __attribute__((visibility("default"))) __attribute__((noinline)) // TODO: synergize with decaf.h
@@ -31,29 +31,29 @@
 /** @endcond */

 /** A symmetric key, the compressed point of a private key. */
 typedef unsigned char decaf_448_symmetric_key_t[DECAF_448_SYMMETRIC_KEY_BYTES];
 typedef unsigned char decaf_255_symmetric_key_t[DECAF_255_SYMMETRIC_KEY_BYTES];

 /** An encoded public key. */
 typedef unsigned char decaf_448_public_key_t[DECAF_448_SER_BYTES];
 typedef unsigned char decaf_255_public_key_t[DECAF_255_SER_BYTES];

 /** A signature. */
 typedef unsigned char decaf_448_signature_t[DECAF_448_SER_BYTES + DECAF_448_SCALAR_BYTES];
 typedef unsigned char decaf_255_signature_t[DECAF_255_SER_BYTES + DECAF_255_SCALAR_BYTES];

 typedef struct {
    /** @cond internal */
    /** The symmetric key from which everything is expanded */
    decaf_448_symmetric_key_t sym;
    decaf_255_symmetric_key_t sym;
    
    /** The scalar x */
    decaf_448_scalar_t secret_scalar;
    decaf_255_scalar_t secret_scalar;
    
    /** x*Base */
    decaf_448_public_key_t pub;
    decaf_255_public_key_t pub;
    /** @endcond */
 } /** Private key structure for pointers. */
  decaf_448_private_key_s,
  decaf_255_private_key_s,
  /** A private key (gmp array[1] style). */
  decaf_448_private_key_t[1];
  decaf_255_private_key_t[1];

 #ifdef __cplusplus
 extern "C" {
@@ -64,16 +64,16 @@ extern "C" {
 * @param [out] priv The derived private key.
 * @param [in] proto The compressed or proto-key, which must be 32 random bytes.
 */
 void decaf_448_derive_private_key (
    decaf_448_private_key_t priv,
    const decaf_448_symmetric_key_t proto
 void decaf_255_derive_private_key (
    decaf_255_private_key_t priv,
    const decaf_255_symmetric_key_t proto
 ) NONNULL2 API_VIS;

 /**
 * @brief Destroy a private key.
 */
 void decaf_448_destroy_private_key (
    decaf_448_private_key_t priv
 void decaf_255_destroy_private_key (
    decaf_255_private_key_t priv
 ) NONNULL1 API_VIS;

 /**
@@ -81,9 +81,9 @@ void decaf_448_destroy_private_key (
 * @param [out] pub The extracted public key.
 * @param [in] priv The private key.
 */
 void decaf_448_private_to_public (
    decaf_448_public_key_t pub,
    const decaf_448_private_key_t priv
 void decaf_255_private_to_public (
    decaf_255_public_key_t pub,
    const decaf_255_private_key_t priv
 ) NONNULL2 API_VIS;
    
 /**
@@ -104,11 +104,11 @@ void decaf_448_private_to_public (
 * and will almost definitely change in the future.
 */
 decaf_bool_t
 decaf_448_shared_secret (
 decaf_255_shared_secret (
    uint8_t *shared,
    size_t shared_bytes,
    const decaf_448_private_key_t my_privkey,
    const decaf_448_public_key_t your_pubkey
    const decaf_255_private_key_t my_privkey,
    const decaf_255_public_key_t your_pubkey
 ) NONNULL134 WARN_UNUSED API_VIS;
   
 /**
@@ -119,9 +119,9 @@ decaf_448_shared_secret (
 * @param [in] shake A SHAKE256 context with the message.
 */ 
 void
 decaf_448_sign_shake (
    decaf_448_signature_t sig,
    const decaf_448_private_key_t priv,
 decaf_255_sign_shake (
    decaf_255_signature_t sig,
    const decaf_255_private_key_t priv,
    const keccak_sponge_t shake
 ) NONNULL3 API_VIS;

@@ -134,9 +134,9 @@ decaf_448_sign_shake (
 * @param [in] message_len The message's length.
 */ 
 void
 decaf_448_sign (
    decaf_448_signature_t sig,
    const decaf_448_private_key_t priv,
 decaf_255_sign (
    decaf_255_signature_t sig,
    const decaf_255_private_key_t priv,
    const unsigned char *message,
    size_t message_len
 ) NONNULL3 API_VIS;
@@ -149,9 +149,9 @@ decaf_448_sign (
 * @param [in] shake A SHAKE256 context with the message.
 */    
 decaf_bool_t
 decaf_448_verify_shake (
    const decaf_448_signature_t sig,
    const decaf_448_public_key_t pub,
 decaf_255_verify_shake (
    const decaf_255_signature_t sig,
    const decaf_255_public_key_t pub,
    const keccak_sponge_t shake
 ) NONNULL3 API_VIS WARN_UNUSED;

@@ -164,9 +164,9 @@ decaf_448_verify_shake (
 * @param [in] message_len The message's length.
 */    
 decaf_bool_t
 decaf_448_verify (
    const decaf_448_signature_t sig,
    const decaf_448_public_key_t pub,
 decaf_255_verify (
    const decaf_255_signature_t sig,
    const decaf_255_public_key_t pub,
    const unsigned char *message,
    size_t message_len
 ) NONNULL3 API_VIS WARN_UNUSED;
--- a/include/shake.hxx
+++ b/include/shake.hxx
@@ -192,18 +192,18 @@ private:
 };

 /**@cond internal*/
 inline Ed448::Scalar::Scalar(SpongeRng &rng) NOEXCEPT {
 inline Ed255::Scalar::Scalar(SpongeRng &rng) NOEXCEPT {
    *this = rng.read(SER_BYTES);
 }

 inline Ed448::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT {
 inline Ed255::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT {
    SecureBuffer buffer((uniform ? 2 : 1) * HASH_BYTES);
    rng.read(buffer);
    set_to_hash(buffer);
 }


 inline SecureBuffer Ed448::Point::steg_encode(SpongeRng &rng) const NOEXCEPT {
 inline SecureBuffer Ed255::Point::steg_encode(SpongeRng &rng) const NOEXCEPT {
    SecureBuffer out(STEG_BYTES);
    bool done;
    do {
--- a/src/decaf_fast.c
+++ b/src/decaf_fast.c
@@ -45,14 +45,22 @@ typedef int64_t decaf_sdword_t;
 #define siv static inline void __attribute__((always_inline))
 static const gf ZERO = {{{0}}}, ONE = {{{1}}}, TWO = {{{2}}};

 static const int EDWARDS_D = 121665;
 static const int EDWARDS_D = -89747;
    // Gonna test with PinkBikeShed until the math works...
    // Curve25519: 121665;

 static const scalar_t sc_p = {{{
    // Gonna test with PinkBikeShed until the math works...
    SC_LIMB(0xb6b98fd8849faf35),
    SC_LIMB(0x16241e6093b2ce59),
    SC_LIMB(0),
    SC_LIMB(0x2000000000000000)
    /* Curve25519:
    SC_LIMB(0x5812631a5cf5d3ed),
    SC_LIMB(0x14def9dea2f79cd6),
    SC_LIMB(0),
    SC_LIMB(0),
    SC_LIMB(0x1000000000000000)
    */
 }}};

 const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}};
@@ -61,7 +69,7 @@ extern const decaf_word_t MONTGOMERY_FACTOR;

 /* sqrt(9) = 3 from the curve spec.  Not exported, but used by pregen tool. */
 const unsigned char base_point_ser_for_pregen[SER_BYTES] = {
    3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    5 /*PinkBikeShed.  Curve25519: 3*/, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 };

 extern const point_t API_NS(point_base);
@@ -82,16 +90,16 @@ const size_t API_NS2(alignof,precomputed_s) = 32;

 #ifdef __clang__
 #if 100*__clang_major__ + __clang_minor__ > 305
 #define VECTORIZE _Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(8)")
 #define UNROLL _Pragma("clang loop unroll(full)") // FIXME: vectorize?
 #endif
 #endif

 #ifndef VECTORIZE
 #define VECTORIZE
 #ifndef UNROLL
 #define UNROLL
 #endif

 #define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++)  { op; }}
 #define FOR_LIMB_V(i,op) { unsigned int i=0; VECTORIZE for (i=0; i<NLIMBS; i++)  { op; }}
 #define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<NLIMBS; i++)  { op; }}

 /** Copy x = y */
 siv gf_cpy(gf x, const gf y) { x[0] = y[0]; }
@@ -138,7 +146,7 @@ siv gf_bias ( gf c, int amt) {

 /** Subtract mod p.  Bias by 2 and don't reduce  */
 siv gf_sub_nr ( gf_s *__restrict__ c, const gf a, const gf b ) {
 //    FOR_LIMB_V(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] );
 //    FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] );
    ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO
    field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
    gf_bias(c, 2);
@@ -155,7 +163,7 @@ siv gf_sub_nr_x ( gf c, const gf a, const gf b, int amt ) {

 /** Add mod p.  Don't reduce. */
 siv gf_add_nr ( gf c, const gf a, const gf b ) {
 //    FOR_LIMB_V(i, c->limb[i] = a->limb[i] + b->limb[i]);
 //    FOR_LIMB_U(i, c->limb[i] = a->limb[i] + b->limb[i]);
    ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO
    field_add_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
 }
@@ -183,7 +191,7 @@ sv cond_neg(gf x, decaf_bool_t neg) {

 /** Constant time, if (swap) (x,y) = (y,x); */
 siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) {
    FOR_LIMB_V(i, {
    FOR_LIMB_U(i, {
        decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap;
        x->limb[i] ^= s;
        y->limb[i] ^= s;
@@ -371,9 +379,27 @@ decaf_bool_t API_NS(scalar_invert) (
    }
    return ~API_NS(scalar_eq)(out,API_NS(scalar_zero));
 #else
 	(void)out;
 	(void)a;
 	return 0;
    decaf_255_scalar_t b, ma;
    int i;
    sc_montmul(b,API_NS(scalar_one),sc_r2);
    sc_montmul(ma,a,sc_r2);
    for (i=SCALAR_BITS-1; i>=0; i--) {
        sc_montsqr(b,b);
            
        decaf_word_t w = sc_p->limb[i/WBITS];
        if (i<WBITS) {
            assert(w >= 2);
            w-=2;
        }
        if (1 & w>>(i%WBITS)) {
            sc_montmul(b,b,ma);
        }
    }

    sc_montmul(out,b,decaf_255_scalar_one);
    API_NS(scalar_destroy)(b);
    API_NS(scalar_destroy)(ma);
    return ~API_NS(scalar_eq)(out,decaf_255_scalar_zero);
 #endif
 }

--- a/src/include/decaf_255_config.h
+++ b/src/include/decaf_255_config.h
@@ -0,0 +1,50 @@
 /**
 * @file decaf_255_config.h
 * @author Mike Hamburg
 *
 * @copyright
 *   Copyright (c) 2015 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 *
 * @brief Configuration for decaf_fast.c
 */
 #ifndef __DECAF_255_CONFIG_H__
 #define __DECAF_255_CONFIG_H__ 1

 /**
 * Use the Montgomery ladder for direct scalarmul.
 *
 * The Montgomery ladder is faster than Edwards scalarmul, but providing
 * the features Decaf supports (cofactor elimination, twist rejection)
 * makes it complicated and adds code.  Removing the ladder saves a few
 * kilobytes at the cost of perhaps 5-10% overhead in direct scalarmul
 * time.
 */
 #define DECAF_USE_MONTGOMERY_LADDER 1

 /** The number of comb tables for fixed base scalarmul. */
 #define DECAF_COMBS_N 3

 /** The number of teeth per comb for fixed base scalarmul. */
 #define DECAF_COMBS_T 5

 /** The comb spacing for fixed base scalarmul. */
 #define DECAF_COMBS_S 17

 /** Performance tuning: the width of the fixed window for scalar mul. */
 #define DECAF_WINDOW_BITS 4

 /**
 * The number of bits used for the precomputed table in variable-time
 * double scalarmul.
 */
 #define DECAF_WNAF_FIXED_TABLE_BITS 5

 /**
 * Performance tuning: bits used for the variable table in variable-time
 * double scalarmul.
 */
 #define DECAF_WNAF_VAR_TABLE_BITS 3


 #endif /* __DECAF_255_CONFIG_H__ */
--- a/src/p25519/arch_ref64/p25519.c
+++ b/src/p25519/arch_ref64/p25519.c
@@ -22,164 +22,33 @@ p255_mul (
    const p255_t *as,
    const p255_t *bs
 ) {
    const uint64_t *a = as->limb, *b = bs->limb;
    const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
    
    uint64_t bh[4];
    int i,j;
    for (i=0; i<4; i++) bh[i] = b[i+1] * 19;
    
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<51) - 1;  

    uint64_t aa[4], bb[4], bbb[4];

    unsigned int i;
    for (i=0; i<4; i++) {
        aa[i]  = a[i] + a[i+4];
        bb[i]  = b[i] + b[i+4];
        bbb[i] = bb[i] + b[i+4];
    }

    int I_HATE_UNROLLED_LOOPS = 0;

    if (I_HATE_UNROLLED_LOOPS) {
        /* The compiler probably won't unroll this,
         * so it's like 80% slower.
         */
        for (i=0; i<4; i++) {
            accum2 = 0;

            unsigned int j;
            for (j=0; j<=i; j++) {
                accum2 += widemul(a[j],   b[i-j]);
                accum1 += widemul(aa[j], bb[i-j]);
                accum0 += widemul(a[j+4], b[i-j+4]);
            }
            for (; j<4; j++) {
                accum2 += widemul(a[j],   b[i-j+8]);
                accum1 += widemul(aa[j], bbb[i-j+4]);
                accum0 += widemul(a[j+4], bb[i-j+4]);
            }

            accum1 -= accum2;
            accum0 += accum2;

            c[i]   = ((uint64_t)(accum0)) & mask;
            c[i+4] = ((uint64_t)(accum1)) & mask;

            accum0 >>= 56;
            accum1 >>= 56;
    __uint128_t accum = 0;
    for (i=0; i<5; i++) {
        for (j=0; j<=i; j++) {
            accum += widemul(b[i-j], a[j]);
        }
    } else {
        accum2  = widemul(a[0],  b[0]);
        accum1 += widemul(aa[0], bb[0]);
        accum0 += widemul(a[4],  b[4]);

        accum2 += widemul(a[1],  b[7]);
        accum1 += widemul(aa[1], bbb[3]);
        accum0 += widemul(a[5],  bb[3]);

        accum2 += widemul(a[2],  b[6]);
        accum1 += widemul(aa[2], bbb[2]);
        accum0 += widemul(a[6],  bb[2]);

        accum2 += widemul(a[3],  b[5]);
        accum1 += widemul(aa[3], bbb[1]);
        accum0 += widemul(a[7],  bb[1]);

        accum1 -= accum2;
        accum0 += accum2;

        c[0] = ((uint64_t)(accum0)) & mask;
        c[4] = ((uint64_t)(accum1)) & mask;

        accum0 >>= 56;
        accum1 >>= 56;

        accum2  = widemul(a[0],  b[1]);
        accum1 += widemul(aa[0], bb[1]);
        accum0 += widemul(a[4],  b[5]);

        accum2 += widemul(a[1],  b[0]);
        accum1 += widemul(aa[1], bb[0]);
        accum0 += widemul(a[5],  b[4]);

        accum2 += widemul(a[2],  b[7]);
        accum1 += widemul(aa[2], bbb[3]);
        accum0 += widemul(a[6],  bb[3]);

        accum2 += widemul(a[3],  b[6]);
        accum1 += widemul(aa[3], bbb[2]);
        accum0 += widemul(a[7],  bb[2]);

        accum1 -= accum2;
        accum0 += accum2;

        c[1] = ((uint64_t)(accum0)) & mask;
        c[5] = ((uint64_t)(accum1)) & mask;

        accum0 >>= 56;
        accum1 >>= 56;

        accum2  = widemul(a[0],  b[2]);
        accum1 += widemul(aa[0], bb[2]);
        accum0 += widemul(a[4],  b[6]);

        accum2 += widemul(a[1],  b[1]);
        accum1 += widemul(aa[1], bb[1]);
        accum0 += widemul(a[5],  b[5]);

        accum2 += widemul(a[2],  b[0]);
        accum1 += widemul(aa[2], bb[0]);
        accum0 += widemul(a[6],  b[4]);

        accum2 += widemul(a[3],  b[7]);
        accum1 += widemul(aa[3], bbb[3]);
        accum0 += widemul(a[7],  bb[3]);

        accum1 -= accum2;
        accum0 += accum2;

        c[2] = ((uint64_t)(accum0)) & mask;
        c[6] = ((uint64_t)(accum1)) & mask;

        accum0 >>= 56;
        accum1 >>= 56;

        accum2  = widemul(a[0],  b[3]);
        accum1 += widemul(aa[0], bb[3]);
        accum0 += widemul(a[4],  b[7]);

        accum2 += widemul(a[1],  b[2]);
        accum1 += widemul(aa[1], bb[2]);
        accum0 += widemul(a[5],  b[6]);

        accum2 += widemul(a[2],  b[1]);
        accum1 += widemul(aa[2], bb[1]);
        accum0 += widemul(a[6],  b[5]);

        accum2 += widemul(a[3],  b[0]);
        accum1 += widemul(aa[3], bb[0]);
        accum0 += widemul(a[7],  b[4]);

        accum1 -= accum2;
        accum0 += accum2;

        c[3] = ((uint64_t)(accum0)) & mask;
        c[7] = ((uint64_t)(accum1)) & mask;

        accum0 >>= 56;
        accum1 >>= 56;
    } /* !I_HATE_UNROLLED_LOOPS */

    accum0 += accum1;
    accum0 += c[4];
    accum1 += c[0];
    c[4] = ((uint64_t)(accum0)) & mask;
    c[0] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    c[5] += ((uint64_t)(accum0));
    c[1] += ((uint64_t)(accum1));
        for (; j<5; j++) {
            accum += widemul(bh[i-j+4], a[j]);
        }
        c[i] = accum & mask;
        accum >>= 51;
    }
    /* PERF: parallelize? eh well this is reference */
    accum *= 19;
    accum += c[0];
    c[0] = accum & mask;
    accum >>= 51;
    
    assert(accum < mask);
    c[1] += accum;
 }

 void
@@ -188,27 +57,25 @@ p255_mulw (
    const p255_t *as,
    uint64_t b
 ) {
    const uint64_t *a = as->limb;
    const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
    int i;
    
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum4 = 0;
    uint64_t mask = (1ull<<56) - 1;  

    int i;
    for (i=0; i<4; i++) {
        accum0 += widemul(b, a[i]);
        accum4 += widemul(b, a[i+4]);
        c[i]   = accum0 & mask; accum0 >>= 56;
        c[i+4] = accum4 & mask; accum4 >>= 56;
    __uint128_t accum = 0;
    for (i=0; i<5; i++) {
        accum += widemul(b, a[i]);
        c[i] = accum & mask;
        accum >>= 51;
    }
    /* PERF: parallelize? eh well this is reference */
    accum *= 19;
    accum += c[0];
    c[0] = accum & mask;
    accum >>= 51;
    
    accum0 += accum4 + c[4];
    c[4] = accum0 & mask;
    c[5] += accum0 >> 56;

    accum4 += c[0];
    c[0] = accum4 & mask;
    c[1] += accum4 >> 56;
    assert(accum < mask);
    c[1] += accum;
 }

 void
@@ -223,23 +90,21 @@ void
 p255_strong_reduce (
    p255_t *a
 ) {
    uint64_t mask = (1ull<<56)-1;
    uint64_t mask = (1ull<<51)-1;

    /* first, clear high */
    a->limb[4] += a->limb[7]>>56;
    a->limb[0] += a->limb[7]>>56;
    a->limb[7] &= mask;
    a->limb[0] += (a->limb[4]>>51)*19;
    a->limb[4] &= mask;

    /* now the total is less than 2^255 - 2^(255-56) + 2^(255-56+8) < 2p */
    /* now the total is less than 2p */

    /* compute total_value - p.  No need to reduce mod p. */

    __int128_t scarry = 0;
    int i;
    for (i=0; i<8; i++) {
        scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask);
    for (i=0; i<5; i++) {
        scarry = scarry + a->limb[i] - ((i==0)?mask-18:mask);
        a->limb[i] = scarry & mask;
        scarry >>= 56;
        scarry >>= 51;
    }

    /* uncommon case: it was >= p, so now scarry = 0 and this = x
@@ -253,10 +118,10 @@ p255_strong_reduce (
    __uint128_t carry = 0;

    /* add it back */
    for (i=0; i<8; i++) {
        carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask);
    for (i=0; i<5; i++) {
        carry = carry + a->limb[i] + ((i==0)?(scarry_mask&~18):scarry_mask);
        a->limb[i] = carry & mask;
        carry >>= 56;
        carry >>= 51;
    }

    assert(is_zero(carry + scarry));
@@ -271,12 +136,13 @@ p255_serialize (
    p255_t red;
    p255_copy(&red, x);
    p255_strong_reduce(&red);
    for (i=0; i<8; i++) {
        for (j=0; j<7; j++) {
            serial[7*i+j] = red.limb[i];
            red.limb[i] >>= 8;
    uint64_t *r = red.limb;
    uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
    for (i=0; i<4; i++) {
        for (j=0; j<8; j++) {
            serial[8*i+j] = ser64[i];
            ser64[i] >>= 8;
        }
        assert(red.limb[i] == 0);
    }
 }

@@ -286,33 +152,27 @@ p255_deserialize (
    const uint8_t serial[32]
 ) {
    int i,j;
    for (i=0; i<8; i++) {
    uint64_t ser64[4], mask = ((1ull<<51)-1);
    for (i=0; i<4; i++) {
        uint64_t out = 0;
        for (j=0; j<7; j++) {
            out |= ((uint64_t)serial[7*i+j])<<(8*j);
        for (j=0; j<8; j++) {
            out |= ((uint64_t)serial[8*i+j])<<(8*j);
        }
        x->limb[i] = out;
        ser64[i] = out;
    }
    
    /* Check for reduction.
     *
     * The idea is to create a variable ge which is all ones (rather, 56 ones)
     * if and only if the low $i$ words of $x$ are >= those of p.
     *
     * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
     */
    uint64_t ge = -1, mask = (1ull<<56)-1;
    for (i=0; i<4; i++) {
        ge &= x->limb[i];
    }
    
    /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
    ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
    /* Test for >= 2^255-19 */
    uint64_t ge = -(((__uint128_t)ser64[0]+19)>>64);
    ge &= ser64[1];
    ge &= ser64[2];
    ge &= (ser64[3]<<1) + 1;
    ge |= -(((__uint128_t)ser64[3]+0x8000000000000000)>>64);
    
    /* Propagate the rest */
    for (i=5; i<8; i++) {
        ge &= x->limb[i];
    }
    x->limb[0] = ser64[0] & mask;
    x->limb[1] = (ser64[0]>>51 | ser64[1]<<13) & mask;
    x->limb[2] = (ser64[1]>>38 | ser64[2]<<26) & mask;
    x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask;
    x->limb[4] = ser64[3]>>12;
    
    return ~is_zero(ge ^ mask);
    return ~is_zero(~ge);
 }
--- a/src/p25519/arch_ref64/p25519.h
+++ b/src/p25519/arch_ref64/p25519.h
@@ -15,7 +15,17 @@ typedef struct p255_t {
 } p255_t;

 #define LBITS 51
 #define FIELD_LITERAL(a,b,c,d,e) {{a,b,c,d,e}}
 #define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}

 /*
 #define FIELD_LITERAL(a,b,c,d) {{ \
    (a##ull) & LMASK, \
    ((a##ull)>>51 | (b##ull)<<13) & LMASK, \
    ((b##ull)>>38 | (c##ull)<<26) & LMASK, \
    ((c##ull)>>25 | (d##ull)<<39) & LMASK, \
    (d##ull)>>12 \
 }}
 */

 #ifdef __cplusplus
 extern "C" {
@@ -140,9 +150,9 @@ p255_weak_reduce (
    p255_t *a
 ) {
    uint64_t mask = (1ull<<51) - 1;
    uint64_t tmp = a->limb[5] >> 51;
    uint64_t tmp = a->limb[4] >> 51;
    int i;
    for (i=7; i>0; i--) {
    for (i=4; i>0; i--) {
        a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>51);
    }
    a->limb[0] = (a->limb[0] & mask) + tmp*19;
--- a/src/p25519/f_arithmetic.c
+++ b/src/p25519/f_arithmetic.c
@@ -10,58 +10,51 @@

 #include "field.h"

 extern field_a_t ONE; // TODO

 static const field_a_t SQRT_MINUS_ONE = FIELD_LITERAL( // FIXME goes elsewhere?
 static const field_a_t SQRT_MINUS_ONE = {FIELD_LITERAL( // FIXME goes elsewhere?
    0x61b274a0ea0b0,
    0x0d5a5fc8f189d,
    0x7ef5e9cbd0c60,
    0x78595a6804c9e,
    0x2b8324804fc1d
 );
 )};
    
 static const field_a_t ONE = {FIELD_LITERAL( // FIXME copy-pasted
    1,0,0,0,0
 )}; 

 void 
 field_isr (
    field_a_t a,
    const field_a_t x
 ) {
    field_a_t st[3], tmp1, tmp2;
    const struct { unsigned char sh, idx } ops[] = {
        {1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2}
    };
    field_cpy(st[0],x);
    field_cpy(st[1],x);
    field_cpy(st[2],x);
 // ARCH MAGIC FIXME copy-pasted from decaf_fast.c
 static mask_t gf_eq(const field_a_t a, const field_a_t b) {
    field_a_t c;
    field_sub(c,a,b);
    field_strong_reduce(c);
    mask_t ret=0;
    int i;
    for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) {
        field_sqrn(tmp1, st[1^i&1], ops[i].sh);
        field_mul(tmp2, tmp1, st[ops[i].idx]);
        field_cpy(st[i&1], tmp2);
    }
    
    mask_t m = field_eq(st[1], ONE);
    cond_sel(tmp1,SQRT_MINUS_ONE,ONE,m);
    field_mul(a,tmp1,st[0]);
 };
    for (i=0; i<5; i++) { ret |= c->limb[i]; }
    return ((__uint128_t)ret - 1) >> 64;
 }

 /* Guarantee: a^2 x = 0 if x = 0; else a^2 x = 1 or SQRT_MINUS_ONE; */
 void 
 field_isr (
    field_a_t a,
    const field_a_t x
 ) {
    field_a_t st[3], tmp1, tmp2;
    const struct { unsigned char sh, idx } ops[] = {
    const struct { unsigned char sh, idx; } ops[] = {
        {1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2}
    };
    field_cpy(st[0],x);
    field_cpy(st[1],x);
    field_cpy(st[2],x);
    int i;
    st[0][0] = st[1][0] = st[2][0] = x[0];
    unsigned int i;
    for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) {
        field_sqrn(tmp1, st[1^i&1], ops[i].sh);
        field_mul(tmp2, tmp1, st[ops[i].idx]);
        field_cpy(st[i&1], tmp2);
        st[i&1][0] = tmp2[0];
    }
    
    mask_t m = field_eq(st[1], ONE);
    mask_t mask = gf_eq(st[1],ONE) | gf_eq(st[1],SQRT_MINUS_ONE);
    
    // ARCH MAGIC FIXME: should be cond_sel
    for (i=0; i<5; i++) tmp1->limb[i] = (ONE->limb[i]            &  mask)
                                      | (SQRT_MINUS_ONE->limb[i] & ~mask);
    field_mul(a,tmp1,st[0]);
 }