From 2e23ac747b700526820be43deee85ac8086512a2 Mon Sep 17 00:00:00 2001
From: Michael Hamburg <mike@shiftleft.org>
Date: Fri, 15 Jan 2016 14:13:47 -0800
Subject: [PATCH] move (some of the?) per-field code to src/per_field.c

---
 Makefile                                   |  11 ++-
 src/curve_ed25519/curve_data.inc.c         |   1 -
 src/curve_ed448goldilocks/curve_data.inc.c |   1 -
 src/decaf.c                                | 108 ---------------------
 src/decaf_crypto.c                         |   1 +
 src/gen_headers/f_field_h.py               |   9 +-
 src/include/word.h                         |  11 +++
 src/p25519/f_arithmetic.c                  |  10 +-
 src/per_field.c                            |  99 +++++++++++++++++++
 9 files changed, 129 insertions(+), 122 deletions(-)
 create mode 100644 src/per_field.c

diff --git a/Makefile b/Makefile
index 420f010..abc59cb 100644
--- a/Makefile
+++ b/Makefile
@@ -126,7 +126,7 @@ $(GEN_HEADERS): src/gen_headers/*.py src/public_include/decaf/*
 ################################################################
 define define_field
 ARCH_FOR_$(1) ?= $(2)
-COMPONENTS_OF_$(1) = $$(BUILD_OBJ)/$(1)_impl.o $$(BUILD_OBJ)/$(1)_arithmetic.o
+COMPONENTS_OF_$(1) = $$(BUILD_OBJ)/$(1)_impl.o $$(BUILD_OBJ)/$(1)_arithmetic.o $$(BUILD_OBJ)/$(1)_per_field.o
 LIBCOMPONENTS += $$(COMPONENTS_OF_$(1))
 
 $$(BUILD_ASM)/$(1)_arithmetic.s: src/$(1)/f_arithmetic.c $$(HEADERS)
@@ -138,6 +138,11 @@ $$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$$(ARCH_FOR_$(1))/f_impl.c $$(HEADERS)
 	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$$(ARCH_FOR_$(1)) -I $(BUILD_H)/$(1) \
 	-I $(BUILD_H)/$(1)/$$(ARCH_FOR_$(1)) -I src/include/$$(ARCH_FOR_$(1)) \
 	-S -c -o $$@ $$<
+
+$$(BUILD_ASM)/$(1)_per_field.s: src/per_field.c $$(HEADERS)
+	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$$(ARCH_FOR_$(1)) -I $(BUILD_H)/$(1) \
+	-I $(BUILD_H)/$(1)/$$(ARCH_FOR_$(1)) -I src/include/$$(ARCH_FOR_$(1)) \
+	-S -c -o $$@ $$<
 endef
 
 ################################################################
@@ -171,8 +176,8 @@ $$(BUILD_ASM)/decaf_$(1).s: src/decaf.c $$(HEADERS)
 
 $$(BUILD_ASM)/decaf_crypto_$(1).s: src/decaf_crypto.c $$(HEADERS)
 	$$(CC) $$(CFLAGS) \
-		-I src/curve_$(1)/ \
-		-I $(BUILD_H)/curve_$(1) \
+		-I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) -I src/include/$$(ARCH_FOR_$(2)) \
+		-I $(BUILD_H)/curve_$(1) -I $(BUILD_H)/$(2) -I $(BUILD_H)/$(2)/$$(ARCH_FOR_$(2)) \
 		-S -c -o $$@ $$<
 
 LIBCOMPONENTS += $$(BUILD_OBJ)/decaf_$(1).o $$(BUILD_OBJ)/decaf_tables_$(1).o
diff --git a/src/curve_ed25519/curve_data.inc.c b/src/curve_ed25519/curve_data.inc.c
index 9012b4c..e9b302a 100644
--- a/src/curve_ed25519/curve_data.inc.c
+++ b/src/curve_ed25519/curve_data.inc.c
@@ -8,7 +8,6 @@
 #define scalar_t decaf_255_scalar_t
 #define point_t decaf_255_point_t
 #define precomputed_s decaf_255_precomputed_s
-#define SER_BYTES DECAF_255_SER_BYTES
 #define IMAGINE_TWIST 1
 #define P_MOD_8 5
 #define COFACTOR 8
diff --git a/src/curve_ed448goldilocks/curve_data.inc.c b/src/curve_ed448goldilocks/curve_data.inc.c
index b5c8217..89b0cd0 100644
--- a/src/curve_ed448goldilocks/curve_data.inc.c
+++ b/src/curve_ed448goldilocks/curve_data.inc.c
@@ -7,7 +7,6 @@
 #define scalar_t decaf_448_scalar_t
 #define point_t decaf_448_point_t
 #define precomputed_s decaf_448_precomputed_s
-#define SER_BYTES DECAF_448_SER_BYTES
 #define IMAGINE_TWIST 0
 #define P_MOD_8 7
 #define COFACTOR 4
diff --git a/src/decaf.c b/src/decaf.c
index d2564e9..ca19f23 100644
--- a/src/decaf.c
+++ b/src/decaf.c
@@ -50,17 +50,8 @@ extern const gf SQRT_ONE_MINUS_D; /* TODO: Intern this? */
 const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}};
 extern const scalar_t API_NS(sc_r2);
 extern const decaf_word_t API_NS(MONTGOMERY_FACTOR);
-
 extern const point_t API_NS(point_base);
 
-/* These are externally exposed (but private) instead of static so that
- * f_arithmetic.c can use it
- */
-#define ONE API_NS(ONE)
-#define ZERO API_NS(ZERO)
-#define gf_eq API_NS(gf_eq)
-const gf ZERO = {{{0}}}, ONE = {{{1}}};
-
 /* Projective Niels coordinates */
 typedef struct { gf a, b, c; } niels_s, niels_t[1];
 typedef struct { niels_t n; gf z; } __attribute__((aligned(32))) pniels_s, pniels_t[1]; /* MAGIC alignment */
@@ -75,93 +66,9 @@ const precomputed_s *API_NS(precomputed_base) =
 const size_t API_NS2(sizeof,precomputed_s) = sizeof(precomputed_s);
 const size_t API_NS2(alignof,precomputed_s) = 32;
 
-/* TODO PERF: Vectorize vs unroll */
-#ifdef __clang__
-#if 100*__clang_major__ + __clang_minor__ > 305
-#define UNROLL _Pragma("clang loop unroll(full)") // PERF TODO: vectorize?
-#endif
-#endif
-
-#ifndef UNROLL
-#define UNROLL
-#endif
-
 #define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++)  { op; }}
 #define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<NLIMBS; i++)  { op; }}
 
-/* FUTURE: move this code from per-curve to per-field header
- * (like f_arithmetic.c but same for all fields)
- */
-void gf_serialize (uint8_t serial[SER_BYTES], const gf x) {
-    gf red;
-    gf_copy(red, x);
-    gf_strong_reduce(red);
-    
-    unsigned int j=0, fill=0;
-    dword_t buffer = 0;
-    UNROLL for (unsigned int i=0; i<SER_BYTES; i++) {
-        if (fill < 8 && j < NLIMBS) {
-            buffer |= ((dword_t)red->limb[LIMBPERM(j)]) << fill;
-            fill += LIMB_PLACE_VALUE(LIMBPERM(j));
-            j++;
-        }
-        serial[i] = buffer;
-        fill -= 8;
-        buffer >>= 8;
-    }
-}
-
-mask_t gf_deserialize (gf x, const uint8_t serial[SER_BYTES]) {
-    unsigned int j=0, fill=0;
-    dword_t buffer = 0;
-    dsword_t scarry = 0;
-    UNROLL for (unsigned int i=0; i<NLIMBS; i++) {
-        UNROLL while (fill < LIMB_PLACE_VALUE(LIMBPERM(i)) && j < SER_BYTES) {
-            buffer |= ((dword_t)serial[j]) << fill;
-            fill += 8;
-            j++;
-        }
-        x->limb[LIMBPERM(i)] = (i<NLIMBS-1) ? buffer & LIMB_MASK(LIMBPERM(i)) : buffer;
-        fill -= LIMB_PLACE_VALUE(LIMBPERM(i));
-        buffer >>= LIMB_PLACE_VALUE(LIMBPERM(i));
-        scarry = (scarry + x->limb[LIMBPERM(i)] - MODULUS->limb[LIMBPERM(i)]) >> (8*sizeof(word_t));
-    }
-    return word_is_zero(buffer) & ~word_is_zero(scarry);
-}
-
-void gf_strong_reduce (gf a) {
-    /* first, clear high */
-    gf_weak_reduce(a); /* PERF: only really need one step of this, but whatevs */
-
-    /* now the total is less than 2p */
-
-    /* compute total_value - p.  No need to reduce mod p. */
-    dsword_t scarry = 0;
-    for (unsigned int i=0; i<NLIMBS; i++) {
-        scarry = scarry + a->limb[LIMBPERM(i)] - MODULUS->limb[LIMBPERM(i)];
-        a->limb[i] = scarry & LIMB_MASK(LIMBPERM(i));
-        scarry >>= LIMB_PLACE_VALUE(LIMBPERM(i));
-    }
-
-    /* uncommon case: it was >= p, so now scarry = 0 and this = x
-     * common case: it was < p, so now scarry = -1 and this = x - p + 2^255
-     * so let's add back in p.  will carry back off the top for 2^255.
-     */
-    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
-
-    word_t scarry_0 = scarry;
-    dword_t carry = 0;
-
-    /* add it back */
-    for (unsigned int i=0; i<NLIMBS; i++) {
-        carry = carry + a->limb[LIMBPERM(i)] + (scarry_0 & MODULUS->limb[LIMBPERM(i)]);
-        a->limb[i] = carry & LIMB_MASK(LIMBPERM(i));
-        carry >>= LIMB_PLACE_VALUE(LIMBPERM(i));
-    }
-
-    assert(word_is_zero(carry + scarry_0));
-}
-
 /** Constant time, x = is_z ? z : y */
 static INLINE void
 cond_sel(gf x, const gf y, const gf z, decaf_bool_t is_z) {
@@ -186,21 +93,6 @@ cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) {
     }
 }
 
-/** Compare a==b */
-/* Not static because it's used in inverse square root. */
-decaf_word_t gf_eq(const gf a, const gf b);
-decaf_word_t gf_eq(const gf a, const gf b) {
-    gf c;
-    gf_sub(c,a,b);
-    gf_strong_reduce(c);
-    decaf_word_t ret=0;
-    for (unsigned int i=0; i<sizeof(c->limb)/sizeof(c->limb[0]); i++) {
-        ret |= c->limb[i];
-    }
-
-    return word_is_zero(ret);
-}
-
 /** Inverse square root using addition chain. */
 static decaf_bool_t
 gf_isqrt_chk(gf y, const gf x, decaf_bool_t allow_zero) {
diff --git a/src/decaf_crypto.c b/src/decaf_crypto.c
index 47c42ee..3a74ce0 100644
--- a/src/decaf_crypto.c
+++ b/src/decaf_crypto.c
@@ -8,6 +8,7 @@
  * @brief Example Decaf crypto routines
  */
 
+#include "f_field.h" /* for SER_BYTES; FUTURE: find a better way to do this? */
 #include <decaf/crypto.h>
 #include <string.h>
 
diff --git a/src/gen_headers/f_field_h.py b/src/gen_headers/f_field_h.py
index 420b588..44fbcee 100644
--- a/src/gen_headers/f_field_h.py
+++ b/src/gen_headers/f_field_h.py
@@ -14,14 +14,19 @@ f_field_h = gen_file(
 
 #define __DECAF_%(gf_shortname)s_GF_DEFINED__ 1
 #define NLIMBS (%(gf_impl_bits)d/sizeof(word_t)/8)
+#define SER_BYTES ((%(gf_bits)d-1)/8 + 1)
 typedef struct gf_%(gf_shortname)s_s {
     word_t limb[NLIMBS];
 } __attribute__((aligned(32))) gf_%(gf_shortname)s_s, gf_%(gf_shortname)s_t[1];
 
 #define GF_LIT_LIMB_BITS  %(gf_lit_limb_bits)d
 #define GF_BITS           %(gf_bits)d
+#define ZERO              gf_%(gf_shortname)s_ZERO
+#define ONE               gf_%(gf_shortname)s_ONE
+#define MODULUS           gf_%(gf_shortname)s_MODULUS
 #define gf                gf_%(gf_shortname)s_t
 #define gf_s              gf_%(gf_shortname)s_s
+#define gf_eq             gf_%(gf_shortname)s_eq
 #define gf_copy           gf_%(gf_shortname)s_copy
 #define gf_add_RAW        gf_%(gf_shortname)s_add_RAW
 #define gf_sub_RAW        gf_%(gf_shortname)s_sub_RAW
@@ -34,7 +39,6 @@ typedef struct gf_%(gf_shortname)s_s {
 #define gf_isr            gf_%(gf_shortname)s_isr
 #define gf_serialize      gf_%(gf_shortname)s_serialize
 #define gf_deserialize    gf_%(gf_shortname)s_deserialize
-#define MODULUS           gf_%(gf_shortname)s_MODULUS
 
 #define SQRT_MINUS_ONE    P%(gf_shortname)s_SQRT_MINUS_ONE /* might not be defined */
 
@@ -44,7 +48,7 @@ typedef struct gf_%(gf_shortname)s_s {
 extern "C" {
 #endif
 
-const gf MODULUS;
+extern const gf MODULUS, ZERO, ONE; /* declarations only: MODULUS is defined in f_arithmetic.c, ZERO and ONE in per_field.c */
 
 /* Defined below in f_impl.h */
 static INLINE_UNUSED void gf_copy (gf out, const gf a) { *out = *a; }
@@ -58,6 +62,7 @@ void gf_mul (gf_s *__restrict__ out, const gf a, const gf b);
 void gf_mulw (gf_s *__restrict__ out, const gf a, uint64_t b);
 void gf_sqr (gf_s *__restrict__ out, const gf a);
 void gf_serialize (uint8_t *serial, const gf x);
+mask_t gf_eq (const gf x, const gf y);
 mask_t gf_deserialize (gf x, const uint8_t serial[(GF_BITS-1)/8+1]);
 
 #ifdef __cplusplus
diff --git a/src/include/word.h b/src/include/word.h
index 2261b13..54f2ff8 100644
--- a/src/include/word.h
+++ b/src/include/word.h
@@ -241,4 +241,15 @@ malloc_vector(size_t size) {
     }
 }
 
+/* UNROLL: loop prefix asking clang (versions above 3.5) to fully unroll the loop; expands to nothing elsewhere.  PERF: vectorize vs unroll */
+#ifdef __clang__
+#if 100*__clang_major__ + __clang_minor__ > 305
+#define UNROLL _Pragma("clang loop unroll(full)") // PERF TODO: vectorize?
+#endif
+#endif
+
+#ifndef UNROLL
+#define UNROLL
+#endif
+
 #endif /* __WORD_H__ */
diff --git a/src/p25519/f_arithmetic.c b/src/p25519/f_arithmetic.c
index f348307..9d42892 100644
--- a/src/p25519/f_arithmetic.c
+++ b/src/p25519/f_arithmetic.c
@@ -11,7 +11,7 @@
 #include "field.h"
 #include "constant_time.h"
 
-const gf_25519_t P25519_SQRT_MINUS_ONE = {FIELD_LITERAL(
+const gf_25519_t SQRT_MINUS_ONE = {FIELD_LITERAL(
     0x61b274a0ea0b0,
     0x0d5a5fc8f189d,
     0x7ef5e9cbd0c60,
@@ -22,10 +22,6 @@ const gf_25519_t P25519_SQRT_MINUS_ONE = {FIELD_LITERAL(
 const gf MODULUS = {FIELD_LITERAL(
     0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff
 )};
-    
-/* TODO put in header */
-extern const gf_25519_t decaf_255_ONE;
-extern mask_t decaf_255_gf_eq(const gf_25519_t a, const gf_25519_t b);
 
 /* Guarantee: a^2 x = 0 if x = 0; else a^2 x = 1 or SQRT_MINUS_ONE; */
 void gf_isr (
@@ -44,8 +40,8 @@ void gf_isr (
         st[i&1][0] = tmp2[0];
     }
     
-    mask_t mask = decaf_255_gf_eq(st[1],decaf_255_ONE) | decaf_255_gf_eq(st[1],SQRT_MINUS_ONE);
+    mask_t mask = gf_eq(st[1],ONE) | gf_eq(st[1],SQRT_MINUS_ONE);
     
-    constant_time_select(tmp1, decaf_255_ONE, SQRT_MINUS_ONE, sizeof(tmp1), mask, 0);
+    constant_time_select(tmp1, ONE, SQRT_MINUS_ONE, sizeof(tmp1), mask, 0);
     gf_mul(a,tmp1,st[0]);
 }
diff --git a/src/per_field.c b/src/per_field.c
new file mode 100644
index 0000000..b826b40
--- /dev/null
+++ b/src/per_field.c
@@ -0,0 +1,99 @@
+/**
+ * @cond internal
+ * @file per_field.c
+ * @copyright
+ *   Copyright (c) 2015-2016 Cryptography Research, Inc.  \n
+ *   Released under the MIT License.  See LICENSE.txt for license information.
+ * @author Mike Hamburg
+ * @brief Generic arithmetic which has to be compiled per field.
+ */
+
+#include "field.h"
+
+const gf ZERO = {{{0}}}, ONE = {{{1}}}; /* field constants: first limb is 0 resp. 1, remaining limbs are zero-initialized */
+
+/** Serialize to wire format: strongly reduce a copy of x, then write SER_BYTES bytes, taking limbs in LIMBPERM order and emitting the low byte first. */
+void gf_serialize (uint8_t serial[SER_BYTES], const gf x) {
+    gf red;
+    gf_copy(red, x);
+    gf_strong_reduce(red);
+    
+    unsigned int j=0, fill=0; /* j: next limb to load; fill: number of bits currently held in buffer */
+    dword_t buffer = 0;
+    UNROLL for (unsigned int i=0; i<SER_BYTES; i++) {
+        if (fill < 8 && j < NLIMBS) { /* less than one byte buffered: pull in the next limb */
+            buffer |= ((dword_t)red->limb[LIMBPERM(j)]) << fill;
+            fill += LIMB_PLACE_VALUE(LIMBPERM(j));
+            j++;
+        }
+        serial[i] = buffer; /* narrowing store keeps only the low 8 bits */
+        fill -= 8;
+        buffer >>= 8;
+    }
+}
+
+/** Deserialize from wire format; return -1 on success and 0 on failure.  Success needs no leftover bits and a nonzero final scarry (presumably meaning x < MODULUS). */
+mask_t gf_deserialize (gf x, const uint8_t serial[SER_BYTES]) {
+    unsigned int j=0, fill=0; /* j: next input byte; fill: number of bits currently held in buffer */
+    dword_t buffer = 0;
+    dsword_t scarry = 0; /* running borrow of x - MODULUS, updated limb by limb */
+    UNROLL for (unsigned int i=0; i<NLIMBS; i++) {
+        UNROLL while (fill < LIMB_PLACE_VALUE(LIMBPERM(i)) && j < SER_BYTES) { /* buffer enough bits for this limb */
+            buffer |= ((dword_t)serial[j]) << fill;
+            fill += 8;
+            j++;
+        }
+        x->limb[LIMBPERM(i)] = (i<NLIMBS-1) ? buffer & LIMB_MASK(LIMBPERM(i)) : buffer; /* last limb is stored unmasked */
+        fill -= LIMB_PLACE_VALUE(LIMBPERM(i));
+        buffer >>= LIMB_PLACE_VALUE(LIMBPERM(i));
+        scarry = (scarry + x->limb[LIMBPERM(i)] - MODULUS->limb[LIMBPERM(i)]) >> (8*sizeof(word_t));
+    }
+    return word_is_zero(buffer) & ~word_is_zero(scarry); /* nothing left in buffer AND scarry is nonzero */
+}
+
+/** Reduce a in place to canonical form: weak-reduce, subtract MODULUS, then add MODULUS back (masked) if the subtraction went negative. */
+void gf_strong_reduce (gf a) {
+    /* first, clear high */
+    gf_weak_reduce(a); /* Determined to have negligible perf impact. */
+
+    /* now the total is less than 2p */
+
+    /* compute total_value - p.  No need to reduce mod p. */
+    dsword_t scarry = 0;
+    for (unsigned int i=0; i<NLIMBS; i++) {
+        scarry = scarry + a->limb[LIMBPERM(i)] - MODULUS->limb[LIMBPERM(i)];
+        a->limb[LIMBPERM(i)] = scarry & LIMB_MASK(LIMBPERM(i)); /* write back to the limb just read, not limb[i] */
+        scarry >>= LIMB_PLACE_VALUE(LIMBPERM(i));
+    }
+
+    /* uncommon case: it was >= p, so now scarry = 0 and this = x
+     * common case: it was < p, so now scarry = -1 and this = x - p + 2^k
+     * (k = total bits across all limbs, e.g. 255 for p25519), so add p back in; the carry off the top cancels the 2^k.
+     */
+    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
+
+    word_t scarry_0 = scarry;
+    dword_t carry = 0;
+
+    /* add it back: scarry_0 is all-ones (add p) or zero (add nothing), so no branch on secret data */
+    for (unsigned int i=0; i<NLIMBS; i++) {
+        carry = carry + a->limb[LIMBPERM(i)] + (scarry_0 & MODULUS->limb[LIMBPERM(i)]);
+        a->limb[LIMBPERM(i)] = carry & LIMB_MASK(LIMBPERM(i)); /* same permuted slot as the read above */
+        carry >>= LIMB_PLACE_VALUE(LIMBPERM(i));
+    }
+
+    assert(word_is_zero(carry + scarry_0));
+}
+
+/** Compare a==b: strongly reduce a-b and return word_is_zero() of the OR of all its limbs. */
+mask_t gf_eq(const gf a, const gf b) {
+    gf c;
+    gf_sub(c,a,b);
+    gf_strong_reduce(c);
+    mask_t ret=0;
+    for (unsigned int i=0; i<sizeof(c->limb)/sizeof(c->limb[0]); i++) {
+        ret |= c->limb[i]; /* OR-accumulate every limb; visiting order does not matter here */
+    }
+
+    return word_is_zero(ret);
+}