From 03ecad0551fe624cb47dbabcce9932c2efe07ffa Mon Sep 17 00:00:00 2001
From: Michael Hamburg <mike@shiftleft.org>
Date: Fri, 19 Jun 2015 14:15:20 -0700
Subject: [PATCH] it compiles, but it certainly doesnt work yet

---
 Makefile                       |   8 +-
 include/decaf_255.h            |   2 +-
 include/decaf_crypto.h         |  64 ++++----
 include/shake.hxx              |   6 +-
 src/decaf_fast.c               |  52 ++++--
 src/include/decaf_255_config.h |  50 ++++++
 src/p25519/arch_ref64/p25519.c | 284 +++++++++------------------------
 src/p25519/arch_ref64/p25519.h |  16 +-
 src/p25519/f_arithmetic.c      |  59 +++----
 9 files changed, 240 insertions(+), 301 deletions(-)
 create mode 100644 src/include/decaf_255_config.h

diff --git a/Makefile b/Makefile
index c3a295e..4ac57aa 100644
--- a/Makefile
+++ b/Makefile
@@ -19,13 +19,13 @@ ASM ?= $(CC)
 DECAF ?= decaf_fast
 
 ifneq (,$(findstring x86_64,$(MACHINE)))
-ARCH ?= arch_x86_64
+ARCH ?= arch_ref64
 else
 # no i386 port yet
-ARCH ?= arch_arm_32
+ARCH ?= arch_ref32
 endif
 
-FIELD ?= p255
+FIELD ?= p25519
 
 WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
 	 -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)
@@ -35,7 +35,7 @@ INCFLAGS = -Isrc/include -Iinclude -Isrc/$(FIELD) -Isrc/$(FIELD)/$(ARCH)
 LANGFLAGS = -std=c99 -fno-strict-aliasing
 LANGXXFLAGS = -fno-strict-aliasing
 GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC
-OFLAGS ?= -O3
+OFLAGS ?= -O2
 
 TODAY = $(shell date "+%Y-%m-%d")
 
diff --git a/include/decaf_255.h b/include/decaf_255.h
index 10e3b74..b978155 100644
--- a/include/decaf_255.h
+++ b/include/decaf_255.h
@@ -61,7 +61,7 @@ typedef uint64_t decaf_dword_t;
 /** Galois field element internal structure */
 typedef struct gf_s {
     decaf_word_t limb[DECAF_255_LIMBS];
-} __attribute__((aligned(32))) gf_s, gf[1];
+} gf_s, gf[1];
 /** @endcond */
 
 /** Number of bytes in a serialized point. */
diff --git a/include/decaf_crypto.h b/include/decaf_crypto.h
index 6e428fc..6e34bdd 100644
--- a/include/decaf_crypto.h
+++ b/include/decaf_crypto.h
@@ -18,7 +18,7 @@
 #include "shake.h"
 
 /** Number of bytes for a symmetric key (expanded to full key) */
-#define DECAF_448_SYMMETRIC_KEY_BYTES 32
+#define DECAF_255_SYMMETRIC_KEY_BYTES 32
 
 /** @cond internal */
 #define API_VIS __attribute__((visibility("default"))) __attribute__((noinline)) // TODO: synergize with decaf.h
@@ -31,29 +31,29 @@
 /** @endcond */
 
 /** A symmetric key, the compressed point of a private key. */
-typedef unsigned char decaf_448_symmetric_key_t[DECAF_448_SYMMETRIC_KEY_BYTES];
+typedef unsigned char decaf_255_symmetric_key_t[DECAF_255_SYMMETRIC_KEY_BYTES];
 
 /** An encoded public key. */
-typedef unsigned char decaf_448_public_key_t[DECAF_448_SER_BYTES];
+typedef unsigned char decaf_255_public_key_t[DECAF_255_SER_BYTES];
 
 /** A signature. */
-typedef unsigned char decaf_448_signature_t[DECAF_448_SER_BYTES + DECAF_448_SCALAR_BYTES];
+typedef unsigned char decaf_255_signature_t[DECAF_255_SER_BYTES + DECAF_255_SCALAR_BYTES];
 
 typedef struct {
     /** @cond intetrnal */
     /** The symmetric key from which everything is expanded */
-    decaf_448_symmetric_key_t sym;
+    decaf_255_symmetric_key_t sym;
     
     /** The scalar x */
-    decaf_448_scalar_t secret_scalar;
+    decaf_255_scalar_t secret_scalar;
     
     /** x*Base */
-    decaf_448_public_key_t pub;
+    decaf_255_public_key_t pub;
     /** @endcond */
 } /** Private key structure for pointers. */
-  decaf_448_private_key_s,
+  decaf_255_private_key_s,
   /** A private key (gmp array[1] style). */
-  decaf_448_private_key_t[1];
+  decaf_255_private_key_t[1];
 
 #ifdef __cplusplus
 extern "C" {
@@ -64,16 +64,16 @@ extern "C" {
  * @param [out] priv The derived private key.
  * @param [in] proto The compressed or proto-key, which must be 32 random bytes.
  */
-void decaf_448_derive_private_key (
-    decaf_448_private_key_t priv,
-    const decaf_448_symmetric_key_t proto
+void decaf_255_derive_private_key (
+    decaf_255_private_key_t priv,
+    const decaf_255_symmetric_key_t proto
 ) NONNULL2 API_VIS;
 
 /**
  * @brief Destroy a private key.
  */
-void decaf_448_destroy_private_key (
-    decaf_448_private_key_t priv
+void decaf_255_destroy_private_key (
+    decaf_255_private_key_t priv
 ) NONNULL1 API_VIS;
 
 /**
@@ -81,9 +81,9 @@ void decaf_448_destroy_private_key (
  * @param [out] pub The extracted private key.
  * @param [in] priv The private key.
  */
-void decaf_448_private_to_public (
-    decaf_448_public_key_t pub,
-    const decaf_448_private_key_t priv
+void decaf_255_private_to_public (
+    decaf_255_public_key_t pub,
+    const decaf_255_private_key_t priv
 ) NONNULL2 API_VIS;
     
 /**
@@ -104,11 +104,11 @@ void decaf_448_private_to_public (
  * and will almost definitely change in the future.
  */
 decaf_bool_t
-decaf_448_shared_secret (
+decaf_255_shared_secret (
     uint8_t *shared,
     size_t shared_bytes,
-    const decaf_448_private_key_t my_privkey,
-    const decaf_448_public_key_t your_pubkey
+    const decaf_255_private_key_t my_privkey,
+    const decaf_255_public_key_t your_pubkey
 ) NONNULL134 WARN_UNUSED API_VIS;
    
 /**
@@ -119,9 +119,9 @@ decaf_448_shared_secret (
  * @param [in] shake A SHAKE256 context with the message.
  */ 
 void
-decaf_448_sign_shake (
-    decaf_448_signature_t sig,
-    const decaf_448_private_key_t priv,
+decaf_255_sign_shake (
+    decaf_255_signature_t sig,
+    const decaf_255_private_key_t priv,
     const keccak_sponge_t shake
 ) NONNULL3 API_VIS;
 
@@ -134,9 +134,9 @@ decaf_448_sign_shake (
  * @param [in] message_len The message's length.
  */ 
 void
-decaf_448_sign (
-    decaf_448_signature_t sig,
-    const decaf_448_private_key_t priv,
+decaf_255_sign (
+    decaf_255_signature_t sig,
+    const decaf_255_private_key_t priv,
     const unsigned char *message,
     size_t message_len
 ) NONNULL3 API_VIS;
@@ -149,9 +149,9 @@ decaf_448_sign (
  * @param [in] shake A SHAKE256 context with the message.
  */    
 decaf_bool_t
-decaf_448_verify_shake (
-    const decaf_448_signature_t sig,
-    const decaf_448_public_key_t pub,
+decaf_255_verify_shake (
+    const decaf_255_signature_t sig,
+    const decaf_255_public_key_t pub,
     const keccak_sponge_t shake
 ) NONNULL3 API_VIS WARN_UNUSED;
 
@@ -164,9 +164,9 @@ decaf_448_verify_shake (
  * @param [in] message_len The message's length.
  */    
 decaf_bool_t
-decaf_448_verify (
-    const decaf_448_signature_t sig,
-    const decaf_448_public_key_t pub,
+decaf_255_verify (
+    const decaf_255_signature_t sig,
+    const decaf_255_public_key_t pub,
     const unsigned char *message,
     size_t message_len
 ) NONNULL3 API_VIS WARN_UNUSED;
diff --git a/include/shake.hxx b/include/shake.hxx
index 95b5d00..97edec5 100644
--- a/include/shake.hxx
+++ b/include/shake.hxx
@@ -192,18 +192,18 @@ private:
 };
 
 /**@cond internal*/
-inline Ed448::Scalar::Scalar(SpongeRng &rng) NOEXCEPT {
+inline Ed255::Scalar::Scalar(SpongeRng &rng) NOEXCEPT {
     *this = rng.read(SER_BYTES);
 }
 
-inline Ed448::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT {
+inline Ed255::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT {
     SecureBuffer buffer((uniform ? 2 : 1) * HASH_BYTES);
     rng.read(buffer);
     set_to_hash(buffer);
 }
 
 
-inline SecureBuffer Ed448::Point::steg_encode(SpongeRng &rng) const NOEXCEPT {
+inline SecureBuffer Ed255::Point::steg_encode(SpongeRng &rng) const NOEXCEPT {
     SecureBuffer out(STEG_BYTES);
     bool done;
     do {
diff --git a/src/decaf_fast.c b/src/decaf_fast.c
index 7a404b9..faa684c 100644
--- a/src/decaf_fast.c
+++ b/src/decaf_fast.c
@@ -45,14 +45,22 @@ typedef int64_t decaf_sdword_t;
 #define siv static inline void __attribute__((always_inline))
 static const gf ZERO = {{{0}}}, ONE = {{{1}}}, TWO = {{{2}}};
 
-static const int EDWARDS_D = 121665;
+static const int EDWARDS_D = -89747;
+    // Gonna test with PinkBikeShed until the math works...
+    // Curve25519: 121665;
 
 static const scalar_t sc_p = {{{
+    // Gonna test with PinkBikeShed until the math works...
+    SC_LIMB(0xb6b98fd8849faf35),
+    SC_LIMB(0x16241e6093b2ce59),
+    SC_LIMB(0),
+    SC_LIMB(0x2000000000000000)
+    /* Curve25519:
     SC_LIMB(0x5812631a5cf5d3ed),
     SC_LIMB(0x14def9dea2f79cd6),
     SC_LIMB(0),
-    SC_LIMB(0),
     SC_LIMB(0x1000000000000000)
+    */
 }}};
 
 const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}};
@@ -61,7 +69,7 @@ extern const decaf_word_t MONTGOMERY_FACTOR;
 
 /* sqrt(9) = 3 from the curve spec.  Not exported, but used by pregen tool. */
 const unsigned char base_point_ser_for_pregen[SER_BYTES] = {
-    3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+    5 /*PinkBikeShed.  Curve25519: 3*/, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 };
 
 extern const point_t API_NS(point_base);
@@ -82,16 +90,16 @@ const size_t API_NS2(alignof,precomputed_s) = 32;
 
 #ifdef __clang__
 #if 100*__clang_major__ + __clang_minor__ > 305
-#define VECTORIZE _Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(8)")
+#define UNROLL _Pragma("clang loop unroll(full)") // FIXME: vectorize?
 #endif
 #endif
 
-#ifndef VECTORIZE
-#define VECTORIZE
+#ifndef UNROLL
+#define UNROLL
 #endif
 
 #define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++)  { op; }}
-#define FOR_LIMB_V(i,op) { unsigned int i=0; VECTORIZE for (i=0; i<NLIMBS; i++)  { op; }}
+#define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<NLIMBS; i++)  { op; }}
 
 /** Copy x = y */
 siv gf_cpy(gf x, const gf y) { x[0] = y[0]; }
@@ -138,7 +146,7 @@ siv gf_bias ( gf c, int amt) {
 
 /** Subtract mod p.  Bias by 2 and don't reduce  */
 siv gf_sub_nr ( gf_s *__restrict__ c, const gf a, const gf b ) {
-//    FOR_LIMB_V(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] );
+//    FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] );
     ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO
     field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
     gf_bias(c, 2);
@@ -155,7 +163,7 @@ siv gf_sub_nr_x ( gf c, const gf a, const gf b, int amt ) {
 
 /** Add mod p.  Don't reduce. */
 siv gf_add_nr ( gf c, const gf a, const gf b ) {
-//    FOR_LIMB_V(i, c->limb[i] = a->limb[i] + b->limb[i]);
+//    FOR_LIMB_U(i, c->limb[i] = a->limb[i] + b->limb[i]);
     ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO
     field_add_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
 }
@@ -183,7 +191,7 @@ sv cond_neg(gf x, decaf_bool_t neg) {
 
 /** Constant time, if (swap) (x,y) = (y,x); */
 siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) {
-    FOR_LIMB_V(i, {
+    FOR_LIMB_U(i, {
         decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap;
         x->limb[i] ^= s;
         y->limb[i] ^= s;
@@ -371,9 +379,27 @@ decaf_bool_t API_NS(scalar_invert) (
     }
     return ~API_NS(scalar_eq)(out,API_NS(scalar_zero));
 #else
-	(void)out;
-	(void)a;
-	return 0;
+    decaf_255_scalar_t b, ma;
+    int i;
+    sc_montmul(b,API_NS(scalar_one),sc_r2);
+    sc_montmul(ma,a,sc_r2);
+    for (i=SCALAR_BITS-1; i>=0; i--) {
+        sc_montsqr(b,b);
+            
+        decaf_word_t w = sc_p->limb[i/WBITS];
+        if (i<WBITS) {
+            assert(w >= 2);
+            w-=2;
+        }
+        if (1 & w>>(i%WBITS)) {
+            sc_montmul(b,b,ma);
+        }
+    }
+
+    sc_montmul(out,b,decaf_255_scalar_one);
+    API_NS(scalar_destroy)(b);
+    API_NS(scalar_destroy)(ma);
+    return ~API_NS(scalar_eq)(out,decaf_255_scalar_zero);
 #endif
 }
 
diff --git a/src/include/decaf_255_config.h b/src/include/decaf_255_config.h
new file mode 100644
index 0000000..be9d978
--- /dev/null
+++ b/src/include/decaf_255_config.h
@@ -0,0 +1,50 @@
+/**
+ * @file decaf_255_config.h
+ * @author Mike Hamburg
+ *
+ * @copyright
+ *   Copyright (c) 2015 Cryptography Research, Inc.  \n
+ *   Released under the MIT License.  See LICENSE.txt for license information.
+ *
+ * @brief Configuration for decaf_fast.c
+ */
+#ifndef __DECAF_255_CONFIG_H__
+#define __DECAF_255_CONFIG_H__ 1
+
+/**
+ * Use the Montgomery ladder for direct scalarmul.
+ *
+ * The Montgomery ladder is faster than Edwards scalarmul, but providing
+ * the features Decaf supports (cofactor elimination, twist rejection)
+ * makes it complicated and adds code.  Removing the ladder saves a few
+ * kilobytes at the cost of perhaps 5-10% overhead in direct scalarmul
+ * time.
+ */
+#define DECAF_USE_MONTGOMERY_LADDER 1
+
+/** The number of comb tables for fixed base scalarmul. */
+#define DECAF_COMBS_N 3
+
+/** The number of teeth per comb for fixed base scalarmul. */
+#define DECAF_COMBS_T 5
+
+/** The comb spacing for fixed base scalarmul. */
+#define DECAF_COMBS_S 17
+
+/** Performance tuning: the width of the fixed window for scalar mul. */
+#define DECAF_WINDOW_BITS 4
+
+/**
+ * The number of bits used for the precomputed table in variable-time
+ * double scalarmul.
+ */
+#define DECAF_WNAF_FIXED_TABLE_BITS 5
+
+/**
+ * Performance tuning: bits used for the variable table in variable-time
+ * double scalarmul.
+ */
+#define DECAF_WNAF_VAR_TABLE_BITS 3
+
+
+#endif /* __DECAF_255_CONFIG_H__ */
diff --git a/src/p25519/arch_ref64/p25519.c b/src/p25519/arch_ref64/p25519.c
index b5892fb..37cedb0 100644
--- a/src/p25519/arch_ref64/p25519.c
+++ b/src/p25519/arch_ref64/p25519.c
@@ -22,164 +22,33 @@ p255_mul (
     const p255_t *as,
     const p255_t *bs
 ) {
-    const uint64_t *a = as->limb, *b = bs->limb;
+    const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
+    
+    uint64_t bh[4];
+    int i,j;
+    for (i=0; i<4; i++) bh[i] = b[i+1] * 19;
+    
     uint64_t *c = cs->limb;
 
-    __uint128_t accum0 = 0, accum1 = 0, accum2;
-    uint64_t mask = (1ull<<51) - 1;  
-
-    uint64_t aa[4], bb[4], bbb[4];
-
-    unsigned int i;
-    for (i=0; i<4; i++) {
-        aa[i]  = a[i] + a[i+4];
-        bb[i]  = b[i] + b[i+4];
-        bbb[i] = bb[i] + b[i+4];
-    }
-
-    int I_HATE_UNROLLED_LOOPS = 0;
-
-    if (I_HATE_UNROLLED_LOOPS) {
-        /* The compiler probably won't unroll this,
-         * so it's like 80% slower.
-         */
-        for (i=0; i<4; i++) {
-            accum2 = 0;
-
-            unsigned int j;
-            for (j=0; j<=i; j++) {
-                accum2 += widemul(a[j],   b[i-j]);
-                accum1 += widemul(aa[j], bb[i-j]);
-                accum0 += widemul(a[j+4], b[i-j+4]);
-            }
-            for (; j<4; j++) {
-                accum2 += widemul(a[j],   b[i-j+8]);
-                accum1 += widemul(aa[j], bbb[i-j+4]);
-                accum0 += widemul(a[j+4], bb[i-j+4]);
-            }
-
-            accum1 -= accum2;
-            accum0 += accum2;
-
-            c[i]   = ((uint64_t)(accum0)) & mask;
-            c[i+4] = ((uint64_t)(accum1)) & mask;
-
-            accum0 >>= 56;
-            accum1 >>= 56;
+    __uint128_t accum = 0;
+    for (i=0; i<5; i++) {
+        for (j=0; j<=i; j++) {
+            accum += widemul(b[i-j], a[j]);
         }
-    } else {
-        accum2  = widemul(a[0],  b[0]);
-        accum1 += widemul(aa[0], bb[0]);
-        accum0 += widemul(a[4],  b[4]);
-
-        accum2 += widemul(a[1],  b[7]);
-        accum1 += widemul(aa[1], bbb[3]);
-        accum0 += widemul(a[5],  bb[3]);
-
-        accum2 += widemul(a[2],  b[6]);
-        accum1 += widemul(aa[2], bbb[2]);
-        accum0 += widemul(a[6],  bb[2]);
-
-        accum2 += widemul(a[3],  b[5]);
-        accum1 += widemul(aa[3], bbb[1]);
-        accum0 += widemul(a[7],  bb[1]);
-
-        accum1 -= accum2;
-        accum0 += accum2;
-
-        c[0] = ((uint64_t)(accum0)) & mask;
-        c[4] = ((uint64_t)(accum1)) & mask;
-
-        accum0 >>= 56;
-        accum1 >>= 56;
-
-        accum2  = widemul(a[0],  b[1]);
-        accum1 += widemul(aa[0], bb[1]);
-        accum0 += widemul(a[4],  b[5]);
-
-        accum2 += widemul(a[1],  b[0]);
-        accum1 += widemul(aa[1], bb[0]);
-        accum0 += widemul(a[5],  b[4]);
-
-        accum2 += widemul(a[2],  b[7]);
-        accum1 += widemul(aa[2], bbb[3]);
-        accum0 += widemul(a[6],  bb[3]);
-
-        accum2 += widemul(a[3],  b[6]);
-        accum1 += widemul(aa[3], bbb[2]);
-        accum0 += widemul(a[7],  bb[2]);
-
-        accum1 -= accum2;
-        accum0 += accum2;
-
-        c[1] = ((uint64_t)(accum0)) & mask;
-        c[5] = ((uint64_t)(accum1)) & mask;
-
-        accum0 >>= 56;
-        accum1 >>= 56;
-
-        accum2  = widemul(a[0],  b[2]);
-        accum1 += widemul(aa[0], bb[2]);
-        accum0 += widemul(a[4],  b[6]);
-
-        accum2 += widemul(a[1],  b[1]);
-        accum1 += widemul(aa[1], bb[1]);
-        accum0 += widemul(a[5],  b[5]);
-
-        accum2 += widemul(a[2],  b[0]);
-        accum1 += widemul(aa[2], bb[0]);
-        accum0 += widemul(a[6],  b[4]);
-
-        accum2 += widemul(a[3],  b[7]);
-        accum1 += widemul(aa[3], bbb[3]);
-        accum0 += widemul(a[7],  bb[3]);
-
-        accum1 -= accum2;
-        accum0 += accum2;
-
-        c[2] = ((uint64_t)(accum0)) & mask;
-        c[6] = ((uint64_t)(accum1)) & mask;
-
-        accum0 >>= 56;
-        accum1 >>= 56;
-
-        accum2  = widemul(a[0],  b[3]);
-        accum1 += widemul(aa[0], bb[3]);
-        accum0 += widemul(a[4],  b[7]);
-
-        accum2 += widemul(a[1],  b[2]);
-        accum1 += widemul(aa[1], bb[2]);
-        accum0 += widemul(a[5],  b[6]);
-
-        accum2 += widemul(a[2],  b[1]);
-        accum1 += widemul(aa[2], bb[1]);
-        accum0 += widemul(a[6],  b[5]);
-
-        accum2 += widemul(a[3],  b[0]);
-        accum1 += widemul(aa[3], bb[0]);
-        accum0 += widemul(a[7],  b[4]);
-
-        accum1 -= accum2;
-        accum0 += accum2;
-
-        c[3] = ((uint64_t)(accum0)) & mask;
-        c[7] = ((uint64_t)(accum1)) & mask;
-
-        accum0 >>= 56;
-        accum1 >>= 56;
-    } /* !I_HATE_UNROLLED_LOOPS */
-
-    accum0 += accum1;
-    accum0 += c[4];
-    accum1 += c[0];
-    c[4] = ((uint64_t)(accum0)) & mask;
-    c[0] = ((uint64_t)(accum1)) & mask;
-
-    accum0 >>= 56;
-    accum1 >>= 56;
-
-    c[5] += ((uint64_t)(accum0));
-    c[1] += ((uint64_t)(accum1));
+        for (; j<5; j++) {
+            accum += widemul(bh[i-j+4], a[j]);
+        }
+        c[i] = accum & mask;
+        accum >>= 51;
+    }
+    /* PERF: parallelize? eh well this is reference */
+    accum *= 19;
+    accum += c[0];
+    c[0] = accum & mask;
+    accum >>= 51;
+    
+    assert(accum < mask);
+    c[1] += accum;
 }
 
 void
@@ -188,27 +57,25 @@ p255_mulw (
     const p255_t *as,
     uint64_t b
 ) {
-    const uint64_t *a = as->limb;
+    const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
+    int i;
+    
     uint64_t *c = cs->limb;
 
-    __uint128_t accum0 = 0, accum4 = 0;
-    uint64_t mask = (1ull<<56) - 1;  
-
-    int i;
-    for (i=0; i<4; i++) {
-        accum0 += widemul(b, a[i]);
-        accum4 += widemul(b, a[i+4]);
-        c[i]   = accum0 & mask; accum0 >>= 56;
-        c[i+4] = accum4 & mask; accum4 >>= 56;
+    __uint128_t accum = 0;
+    for (i=0; i<5; i++) {
+        accum += widemul(b, a[i]);
+        c[i] = accum & mask;
+        accum >>= 51;
     }
+    /* PERF: parallelize? eh well this is reference */
+    accum *= 19;
+    accum += c[0];
+    c[0] = accum & mask;
+    accum >>= 51;
     
-    accum0 += accum4 + c[4];
-    c[4] = accum0 & mask;
-    c[5] += accum0 >> 56;
-
-    accum4 += c[0];
-    c[0] = accum4 & mask;
-    c[1] += accum4 >> 56;
+    assert(accum < mask);
+    c[1] += accum;
 }
 
 void
@@ -223,23 +90,21 @@ void
 p255_strong_reduce (
     p255_t *a
 ) {
-    uint64_t mask = (1ull<<56)-1;
+    uint64_t mask = (1ull<<51)-1;
 
     /* first, clear high */
-    a->limb[4] += a->limb[7]>>56;
-    a->limb[0] += a->limb[7]>>56;
-    a->limb[7] &= mask;
+    a->limb[0] += (a->limb[4]>>51)*19;
+    a->limb[4] &= mask;
 
-    /* now the total is less than 2^255 - 2^(255-56) + 2^(255-56+8) < 2p */
+    /* now the total is less than 2p */
 
     /* compute total_value - p.  No need to reduce mod p. */
-
     __int128_t scarry = 0;
     int i;
-    for (i=0; i<8; i++) {
-        scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask);
+    for (i=0; i<5; i++) {
+        scarry = scarry + a->limb[i] - ((i==0)?mask-18:mask);
         a->limb[i] = scarry & mask;
-        scarry >>= 56;
+        scarry >>= 51;
     }
 
     /* uncommon case: it was >= p, so now scarry = 0 and this = x
@@ -253,10 +118,10 @@ p255_strong_reduce (
     __uint128_t carry = 0;
 
     /* add it back */
-    for (i=0; i<8; i++) {
-        carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask);
+    for (i=0; i<5; i++) {
+        carry = carry + a->limb[i] + ((i==0)?(scarry_mask&~18):scarry_mask);
         a->limb[i] = carry & mask;
-        carry >>= 56;
+        carry >>= 51;
     }
 
     assert(is_zero(carry + scarry));
@@ -271,12 +136,13 @@ p255_serialize (
     p255_t red;
     p255_copy(&red, x);
     p255_strong_reduce(&red);
-    for (i=0; i<8; i++) {
-        for (j=0; j<7; j++) {
-            serial[7*i+j] = red.limb[i];
-            red.limb[i] >>= 8;
+    uint64_t *r = red.limb;
+    uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
+    for (i=0; i<4; i++) {
+        for (j=0; j<8; j++) {
+            serial[8*i+j] = ser64[i];
+            ser64[i] >>= 8;
         }
-        assert(red.limb[i] == 0);
     }
 }
 
@@ -286,33 +152,27 @@ p255_deserialize (
     const uint8_t serial[32]
 ) {
     int i,j;
-    for (i=0; i<8; i++) {
+    uint64_t ser64[4], mask = ((1ull<<51)-1);
+    for (i=0; i<4; i++) {
         uint64_t out = 0;
-        for (j=0; j<7; j++) {
-            out |= ((uint64_t)serial[7*i+j])<<(8*j);
+        for (j=0; j<8; j++) {
+            out |= ((uint64_t)serial[8*i+j])<<(8*j);
         }
-        x->limb[i] = out;
+        ser64[i] = out;
     }
     
-    /* Check for reduction.
-     *
-     * The idea is to create a variable ge which is all ones (rather, 56 ones)
-     * if and only if the low $i$ words of $x$ are >= those of p.
-     *
-     * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
-     */
-    uint64_t ge = -1, mask = (1ull<<56)-1;
-    for (i=0; i<4; i++) {
-        ge &= x->limb[i];
-    }
-    
-    /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
-    ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
+    /* Test for >= 2^255-19 */
+    uint64_t ge = -(((__uint128_t)ser64[0]+19)>>64);
+    ge &= ser64[1];
+    ge &= ser64[2];
+    ge &= (ser64[3]<<1) + 1;
+    ge |= -(((__uint128_t)ser64[3]+0x8000000000000000)>>64);
     
-    /* Propagate the rest */
-    for (i=5; i<8; i++) {
-        ge &= x->limb[i];
-    }
+    x->limb[0] = ser64[0] & mask;
+    x->limb[1] = (ser64[0]>>51 | ser64[1]<<13) & mask;
+    x->limb[2] = (ser64[1]>>38 | ser64[2]<<26) & mask;
+    x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask;
+    x->limb[4] = ser64[3]>>12;
     
-    return ~is_zero(ge ^ mask);
+    return ~is_zero(~ge);
 }
diff --git a/src/p25519/arch_ref64/p25519.h b/src/p25519/arch_ref64/p25519.h
index d291222..be64923 100644
--- a/src/p25519/arch_ref64/p25519.h
+++ b/src/p25519/arch_ref64/p25519.h
@@ -15,7 +15,17 @@ typedef struct p255_t {
 } p255_t;
 
 #define LBITS 51
-#define FIELD_LITERAL(a,b,c,d,e) {{a,b,c,d,e}}
+#define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}
+
+/*
+#define FIELD_LITERAL(a,b,c,d) {{ \
+    (a##ull) & LMASK, \
+    ((a##ull)>>51 | (b##ull)<<13) & LMASK, \
+    ((b##ull)>>38 | (c##ull)<<26) & LMASK, \
+    ((c##ull)>>25 | (d##ull)<<39) & LMASK, \
+    (d##ull)>>12 \
+}}
+*/
 
 #ifdef __cplusplus
 extern "C" {
@@ -140,9 +150,9 @@ p255_weak_reduce (
     p255_t *a
 ) {
     uint64_t mask = (1ull<<51) - 1;
-    uint64_t tmp = a->limb[5] >> 51;
+    uint64_t tmp = a->limb[4] >> 51;
     int i;
-    for (i=7; i>0; i--) {
+    for (i=4; i>0; i--) {
         a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>51);
     }
     a->limb[0] = (a->limb[0] & mask) + tmp*19;
diff --git a/src/p25519/f_arithmetic.c b/src/p25519/f_arithmetic.c
index 07c140b..eab2640 100644
--- a/src/p25519/f_arithmetic.c
+++ b/src/p25519/f_arithmetic.c
@@ -10,58 +10,51 @@
 
 #include "field.h"
 
-extern field_a_t ONE; // TODO
-
-static const field_a_t SQRT_MINUS_ONE = FIELD_LITERAL( // FIXME goes elsewhere?
+static const field_a_t SQRT_MINUS_ONE = {FIELD_LITERAL( // FIXME goes elsewhere?
     0x61b274a0ea0b0,
     0x0d5a5fc8f189d,
     0x7ef5e9cbd0c60,
     0x78595a6804c9e,
     0x2b8324804fc1d
-);
+)};
+    
+static const field_a_t ONE = {FIELD_LITERAL( // FIXME copy-pasted
+    1,0,0,0,0
+)}; 
 
-void 
-field_isr (
-    field_a_t a,
-    const field_a_t x
-) {
-    field_a_t st[3], tmp1, tmp2;
-    const struct { unsigned char sh, idx } ops[] = {
-        {1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2}
-    };
-    field_cpy(st[0],x);
-    field_cpy(st[1],x);
-    field_cpy(st[2],x);
+// ARCH MAGIC FIXME copy-pasted from decaf_fast.c
+static mask_t gf_eq(const field_a_t a, const field_a_t b) {
+    field_a_t c;
+    field_sub(c,a,b);
+    field_strong_reduce(c);
+    mask_t ret=0;
     int i;
-    for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) {
-        field_sqrn(tmp1, st[1^i&1], ops[i].sh);
-        field_mul(tmp2, tmp1, st[ops[i].idx]);
-        field_cpy(st[i&1], tmp2);
-    }
-    
-    mask_t m = field_eq(st[1], ONE);
-    cond_sel(tmp1,SQRT_MINUS_ONE,ONE,m);
-    field_mul(a,tmp1,st[0]);
-};
+    for (i=0; i<5; i++) { ret |= c->limb[i]; }
+    return ((__uint128_t)ret - 1) >> 64;
+}
 
+/* Guarantee: a^2 x = 0 if x = 0; else a^2 x = 1 or SQRT_MINUS_ONE; */
 void 
 field_isr (
     field_a_t a,
     const field_a_t x
 ) {
     field_a_t st[3], tmp1, tmp2;
-    const struct { unsigned char sh, idx } ops[] = {
+    const struct { unsigned char sh, idx; } ops[] = {
         {1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2}
     };
-    field_cpy(st[0],x);
-    field_cpy(st[1],x);
-    field_cpy(st[2],x);
-    int i;
+    st[0][0] = st[1][0] = st[2][0] = x[0];
+    unsigned int i;
     for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) {
         field_sqrn(tmp1, st[1^i&1], ops[i].sh);
         field_mul(tmp2, tmp1, st[ops[i].idx]);
-        field_cpy(st[i&1], tmp2);
+        st[i&1][0] = tmp2[0];
     }
     
-    mask_t m = field_eq(st[1], ONE);
+    mask_t mask = gf_eq(st[1],ONE) | gf_eq(st[1],SQRT_MINUS_ONE);
+    
+    // ARCH MAGIC FIXME: should be cond_sel
+    for (i=0; i<5; i++) tmp1->limb[i] = (ONE->limb[i]            &  mask)
+                                      | (SQRT_MINUS_ONE->limb[i] & ~mask);
+    field_mul(a,tmp1,st[0]);
 }