Working on getting cross-arch working again. Several TODOs.

Currently compiles and passes tests on x86_64 with arch_32 and DECAF_FORCE_32_BIT=1 (as well as the native settings, of course), so that's a start. Want to make the serialization routine cross-arch. Need to check that perf is good enough (likely). The current routine in p25519/arch_32 is almost cross-arch, but has known bugs (FIXMEs). It needs to take into account a separate p and, for NEON, the LIMBPERM. Want to decouple arches for each curve/field. Currently the split between decaf_word_t and word_t makes this fraught with peril. The fix is probably to rename decaf_word_t to decaf_api_word_t and fix it to either uint32 or uint64, then make internal things separate per field. That way we don't have to try arch detection in the header, which is nice. Need to make decaf_gen_tables use SC_LIMB. Might as well get rid of API_NS there too.
8 anos atrás · a5bed6b351
--- a/+ 8
+++ b/+ 8
@@ -31,13 +31,6 @@ LD = $(CC)
 LDXX = $(CXX)
 ASM ?= $(CC)

 ifneq (,$(findstring x86_64,$(MACHINE)))
 ARCH ?= arch_x86_64
 else
 # no i386 port yet
 ARCH ?= arch_ref32
 endif

 WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
 	 -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)

@@ -55,17 +48,8 @@ endif

 TODAY = $(shell date "+%Y-%m-%d")

 ifneq (,$(findstring arm,$(MACHINE)))
 ifneq (,$(findstring neon,$(ARCH)))
 ARCHFLAGS += -mfpu=neon
 else
 ARCHFLAGS += -mfpu=vfpv3-d16
 endif
 ARCHFLAGS += -mcpu=cortex-a8 # FIXME
 GENFLAGS += -DN_TESTS_BASE=1000 # sooooo sloooooow
 else
 ARCHFLAGS += -maes -mavx2 -mbmi2 #TODO
 endif
 #FIXME ARCHFLAGS
 ARCHFLAGS ?= -maes -mavx2 -mbmi2 #TODO

 ifeq ($(CC),clang)
 WARNFLAGS += -Wgcc-compat
@@ -141,18 +125,18 @@ $(GEN_HEADERS): src/gen_headers/*.py src/public_include/decaf/*
 # Per-field code: call with field, arch
 ################################################################
 define define_field
 ARCH_FOR_$(1) = $(2)
 ARCH_FOR_$(1) ?= $(2)
 COMPONENTS_OF_$(1) = $$(BUILD_OBJ)/$(1)_impl.o $$(BUILD_OBJ)/$(1)_arithmetic.o
 LIBCOMPONENTS += $$(COMPONENTS_OF_$(1))

 $$(BUILD_ASM)/$(1)_arithmetic.s: src/$(1)/f_arithmetic.c $$(HEADERS)
 	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \
 	-I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \
 	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$$(ARCH_FOR_$(1)) -I $(BUILD_H)/$(1) \
 	-I $(BUILD_H)/$(1)/$$(ARCH_FOR_$(1)) -I src/include/$$(ARCH_FOR_$(1)) \
 	-S -c -o $$@ $$<

 $$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$(2)/f_impl.c $$(HEADERS)
 	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \
 	-I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \
 $$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$$(ARCH_FOR_$(1))/f_impl.c $$(HEADERS)
 	$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$$(ARCH_FOR_$(1)) -I $(BUILD_H)/$(1) \
 	-I $(BUILD_H)/$(1)/$$(ARCH_FOR_$(1)) -I src/include/$$(ARCH_FOR_$(1)) \
 	-S -c -o $$@ $$<
 endef

--- a/src/curve_ed25519/curve_data.inc.c
+++ b/src/curve_ed25519/curve_data.inc.c
@@ -5,7 +5,6 @@

 #define SCALAR_LIMBS DECAF_255_SCALAR_LIMBS
 #define SCALAR_BITS DECAF_255_SCALAR_BITS
 #define NLIMBS DECAF_255_LIMBS
 #define scalar_t decaf_255_scalar_t
 #define point_t decaf_255_point_t
 #define precomputed_s decaf_255_precomputed_s
--- a/src/curve_ed448goldilocks/curve_data.inc.c
+++ b/src/curve_ed448goldilocks/curve_data.inc.c
@@ -4,7 +4,6 @@

 #define SCALAR_LIMBS DECAF_448_SCALAR_LIMBS
 #define SCALAR_BITS DECAF_448_SCALAR_BITS
 #define NLIMBS DECAF_448_LIMBS
 #define scalar_t decaf_448_scalar_t
 #define point_t decaf_448_point_t
 #define precomputed_s decaf_448_precomputed_s
--- a/src/decaf.c
+++ b/src/decaf.c
@@ -10,13 +10,14 @@

 #define _XOPEN_SOURCE 600 /* for posix_memalign */
 #define __STDC_WANT_LIB_EXT1__ 1 /* for memset_s */
 #include <decaf.h>
 #include <string.h>

 #include "word.h"
 #include "field.h"
 #include "decaf_config.h"

 #include <decaf.h>

 /* Include the curve data here */
 #include "curve_data.inc.c"

@@ -41,7 +42,10 @@ extern const gf SQRT_MINUS_ONE;
 extern const gf SQRT_ONE_MINUS_D; /* TODO: Intern this? */
 #endif

 #define WBITS DECAF_WORD_BITS
 /* FIXME: this can be different from DECAF_WORD_BITS, and word_t can be different from decaf_word_t,
 * eg when mixing and matching implementations for different curves.  Homogenize this.
 */
 #define WBITS WORD_BITS

 const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}};
 extern const scalar_t API_NS(sc_r2);
@@ -82,8 +86,8 @@ const size_t API_NS2(alignof,precomputed_s) = 32;
 #define UNROLL
 #endif

 #define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++)  { op; }}
 #define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<NLIMBS; i++)  { op; }}
 #define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<sizeof(gf)/sizeof(word_t); i++)  { op; }}
 #define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<sizeof(gf)/sizeof(word_t); i++)  { op; }}

 /** Copy x = y */
 static INLINE void
@@ -106,11 +110,11 @@ cond_neg(gf x, decaf_bool_t neg) {
 /** Constant time, if (swap) (x,y) = (y,x); */
 static INLINE void
 cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) {
    FOR_LIMB_U(i, {
    UNROLL for (unsigned int i=0; i<sizeof(x->limb)/sizeof(x->limb[0]); i++) {
        decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap;
        x->limb[i] ^= s;
        y->limb[i] ^= s;
    });
    }
 }

 /** Compare a==b */
@@ -123,9 +127,11 @@ gf_eq(const gf a, const gf b) {
    gf_sub(c,a,b);
    gf_strong_reduce(c);
    decaf_word_t ret=0;
    FOR_LIMB(i, ret |= c->limb[i] );
    /* Hope the compiler is too dumb to optimize this, thus noinline */
    return ((decaf_dword_t)ret - 1) >> WBITS;
    for (unsigned int i=0; i<sizeof(c->limb)/sizeof(c->limb[0]); i++) {
        ret |= c->limb[i];
    }

    return word_is_zero(ret);
 }

 /** Inverse square root using addition chain. */
@@ -385,7 +391,7 @@ API_NS(scalar_eq) (
    for (i=0; i<SCALAR_LIMBS; i++) {
        diff |= a->limb[i] ^ b->limb[i];
    }
    return (((decaf_dword_t)diff)-1)>>WBITS;
    return word_is_zero(diff);
 }

 /* *** API begins here *** */    
@@ -1280,7 +1286,7 @@ API_NS(invert_elligator_nonuniform) (
    const point_t p,
    uint16_t hint_
 ) {
    uint64_t hint = hint_;
    decaf_bool_t hint = hint_;
    decaf_bool_t sgn_s = -(hint & 1),
        sgn_t_over_s = -(hint>>1 & 1),
        sgn_r0 = -(hint>>2 & 1),
@@ -1293,13 +1299,13 @@ API_NS(invert_elligator_nonuniform) (
    gf_sub(b,ONE,b); /* t+1 */
    gf_sqr(c,a); /* s^2 */
    decaf_bool_t is_identity = gf_eq(p->t,ZERO);
    {   /* identity adjustments */
    {
        /* identity adjustments */
        /* in case of identity, currently c=0, t=0, b=1, will encode to 1 */
        /* if hint is 0, -> 0 */
        /* if hint is to neg t/s, then go to infinity, effectively set s to 1 */
        cond_sel(c,c,ONE,is_identity & sgn_t_over_s);
        cond_sel(b,b,ZERO,is_identity & ~sgn_t_over_s & ~sgn_s); /* identity adjust */
        
        cond_sel(b,b,ZERO,is_identity & ~sgn_t_over_s & ~sgn_s); /* identity adjust */        
    }
    gf_mulw_sgn(d,c,2*EDWARDS_D-1); /* $d = (2d-a)s^2 */
    gf_add(a,b,d); /* num? */
--- a/src/decaf_gen_tables.c
+++ b/src/decaf_gen_tables.c
@@ -11,9 +11,10 @@
 #define _XOPEN_SOURCE 600 /* for posix_memalign */
 #include <stdio.h>
 #include <stdlib.h>

 #include "field.h"
 #include "decaf.h"
 #include "decaf_config.h"
 #include "field.h"

 #define GEN_TABLES
 #include "curve_data.inc.c"
@@ -91,8 +92,8 @@ int main(int argc, char **argv) {
    unsigned i;
    
    printf("/** @warning: this file was automatically generated. */\n");
    printf("#include <decaf.h>\n\n");
    printf("#include \"field.h\"\n\n");
    printf("#include <decaf.h>\n\n");
    printf("#define API_NS(_id) %s_##_id\n", API_NAME);
    printf("#define API_NS2(_pref,_id) _pref##_%s_##_id\n", API_NAME);
    
--- a/src/gen_headers/curve_data.py
+++ b/src/gen_headers/curve_data.py
@@ -21,7 +21,6 @@ curve_data = {
        "name" : "IsoEd25519",
        "cxx_ns" : "IsoEd25519",
        "shortname" : "255",
        "longnum" : "25519",
        "c_ns" : "decaf_255",
        "cofactor" : 8,
        "field" : "p25519",
@@ -32,7 +31,6 @@ curve_data = {
        "name" : "Ed448-Goldilocks",
        "cxx_ns" : "Ed448Goldilocks",
        "shortname" : "448",
        "longnum" : "448",
        "c_ns" : "decaf_448",
        "cofactor" : 4,
        "field" : "p448",
--- a/src/gen_headers/decaf_h.py
+++ b/src/gen_headers/decaf_h.py
@@ -13,7 +13,6 @@ extern "C" {
 #endif

 /** @cond internal */
 #define %(C_NS)s_LIMBS (%(gf_impl_bits)d/DECAF_WORD_BITS)
 #define %(C_NS)s_SCALAR_LIMBS ((%(scalar_bits)d-1)/DECAF_WORD_BITS+1)
 /** @endcond */

@@ -21,13 +20,13 @@ extern "C" {
 #define %(C_NS)s_SCALAR_BITS %(scalar_bits)d

 /** @cond internal */
 #ifndef __%(C_NS)s_GF_DEFINED__
 #define __%(C_NS)s_GF_DEFINED__ 1
 #ifndef __DECAF_%(gf_shortname)s_GF_DEFINED__
 #define __DECAF_%(gf_shortname)s_GF_DEFINED__ 1
 /** @brief Galois field element internal structure */
 typedef struct gf_%(longnum)s_s {
    decaf_word_t limb[%(C_NS)s_LIMBS];
 } __attribute__((aligned(32))) gf_%(longnum)s_s, gf_%(longnum)s_t[1];
 #endif /* __%(C_NS)s_GF_DEFINED__ */
 typedef struct gf_%(gf_shortname)s_s {
    decaf_word_t limb[%(gf_impl_bits)d/DECAF_WORD_BITS];
 } __attribute__((aligned(32))) gf_%(gf_shortname)s_s, gf_%(gf_shortname)s_t[1];
 #endif /* __DECAF_%(gf_shortname)s_GF_DEFINED__ */
 /** @endcond */

 /** Number of bytes in a serialized point. */
@@ -39,7 +38,7 @@ typedef struct gf_%(longnum)s_s {
 /** Twisted Edwards extended homogeneous coordinates */
 typedef struct %(c_ns)s_point_s {
    /** @cond internal */
    gf_%(longnum)s_t x,y,z,t;
    gf_%(gf_shortname)s_t x,y,z,t;
    /** @endcond */
 } %(c_ns)s_point_t[1];

--- a/src/gen_headers/f_field_h.py
+++ b/src/gen_headers/f_field_h.py
@@ -10,9 +10,13 @@ f_field_h = gen_file(
 #include <string.h>
 #include <assert.h>

 #include "decaf/decaf_%(gf_bits)s.h" /* HACK in genheader */
 #include "word.h"

 #define __DECAF_%(gf_shortname)s_GF_DEFINED__ 1
 typedef struct gf_%(gf_shortname)s_s {
    word_t limb[%(gf_impl_bits)d/sizeof(word_t)/8];
 } __attribute__((aligned(32))) gf_%(gf_shortname)s_s, gf_%(gf_shortname)s_t[1];

 #define GF_LIT_LIMB_BITS  %(gf_lit_limb_bits)d
 #define GF_BITS           %(gf_bits)d
 #define gf                gf_%(gf_shortname)s_t
@@ -57,4 +61,4 @@ mask_t gf_deserialize (gf x, const uint8_t serial[(GF_BITS-1)/8+1]);
 #endif

 #include "f_impl.h" /* Bring in the inline implementations */
 """)
 """)
--- a/src/include/arch_32/arch_intrinsics.h
+++ b/src/include/arch_32/arch_intrinsics.h
@@ -0,0 +1,22 @@
 /* Copyright (c) 2016 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #ifndef __ARCH_ARCH_32_ARCH_INTRINSICS_H__
 #define __ARCH_ARCH_32_ARCH_INTRINSICS_H__

 #define WORD_BITS 32

 /**
  * Return an all-ones word (0xFFFFFFFF) if a == 0, and 0 otherwise.
  *
  * The value is widened to 64 bits and decremented: only a == 0 wraps
  * around, which leaves the upper 32 bits all set.  No branch is used;
  * whether the compiler keeps it branch-free is not guaranteed.
  */
 static __inline__ __attribute((always_inline,unused))
 uint32_t word_is_zero(uint32_t a) {
    uint64_t widened = a;
    widened -= 1;                      /* wraps to 2^64-1 only when a == 0 */
    return (uint32_t)(widened >> 32);  /* upper half is the mask */
 }

 /**
  * Full 32x32 -> 64 bit unsigned multiplication.
  *
  * The first operand is widened before multiplying so the product is
  * computed in 64 bits and cannot overflow.
  */
 static __inline__ __attribute((always_inline,unused))
 uint64_t widemul(uint32_t a, uint32_t b) {
    uint64_t product = a;
    product *= b;
    return product;
 }

 #endif /* __ARCH_ARCH_32_ARCH_INTRINSICS_H__ */

--- a/src/include/arch_arm_32/arch_intrinsics.h
+++ b/src/include/arch_arm_32/arch_intrinsics.h
@@ -0,0 +1,24 @@
 /* Copyright (c) 2016 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #ifndef __ARCH_ARM_32_ARCH_INTRINSICS_H__
 #define __ARCH_ARM_32_ARCH_INTRINSICS_H__

 #define WORD_BITS 32

 /**
  * Return an all-ones word if a == 0, and 0 otherwise, using ARM flags
  * instead of a wide subtraction.
  *
  * SUBS computes a - 1 and sets the carry flag (carry is clear only when
  * the subtraction borrows, i.e. when a == 0).  SBC then computes
  * ret - ret - (1 - carry), which is -1 on borrow and 0 otherwise.
  * The "cc" clobber tells the compiler the condition flags are modified.
  */
 static __inline__ __attribute((always_inline,unused))
 uint32_t word_is_zero(uint32_t a) {
    uint32_t ret;
    asm("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc");
    return ret;
 }

 /**
  * Full 32x32 -> 64 bit unsigned multiplication.
  *
  * Written in plain C: this could be a UMULL instruction, but it is hard
  * to express to the compiler that the registers must be different.
  */
 static __inline__ __attribute((always_inline,unused))
 uint64_t widemul(uint32_t a, uint32_t b) {
    uint64_t product = a;
    product *= b;
    return product;
 }

 #endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */

--- a/src/include/arch_neon/arch_intrinsics.h
+++ b/src/include/arch_neon/arch_intrinsics.h
@@ -0,0 +1,24 @@
 /* Copyright (c) 2016 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #ifndef __ARCH_NEON_ARCH_INTRINSICS_H__
 #define __ARCH_NEON_ARCH_INTRINSICS_H__

 #define WORD_BITS 32

 /**
  * Return an all-ones word if a == 0, and 0 otherwise, using ARM flags
  * instead of a wide subtraction.
  *
  * SUBS computes a - 1 and sets the carry flag (carry is clear only when
  * the subtraction borrows, i.e. when a == 0).  SBC then computes
  * ret - ret - (1 - carry), which is -1 on borrow and 0 otherwise.
  * The "cc" clobber tells the compiler the condition flags are modified.
  */
 static __inline__ __attribute((always_inline,unused))
 uint32_t word_is_zero(uint32_t a) {
    uint32_t ret;
    asm("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc");
    return ret;
 }

 /**
  * Full 32x32 -> 64 bit unsigned multiplication.
  *
  * Written in plain C: this could be a UMULL instruction, but it is hard
  * to express to the compiler that the registers must be different.
  */
 static __inline__ __attribute((always_inline,unused))
 uint64_t widemul(uint32_t a, uint32_t b) {
    uint64_t product = a;
    product *= b;
    return product;
 }

 #endif /* __ARCH_NEON_ARCH_INTRINSICS_H__ */

--- a/src/include/arch_ref64/arch_intrinsics.h
+++ b/src/include/arch_ref64/arch_intrinsics.h
@@ -0,0 +1,22 @@
 /* Copyright (c) 2016 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #ifndef __ARCH_REF64_ARCH_INTRINSICS_H__
 #define __ARCH_REF64_ARCH_INTRINSICS_H__

 #define WORD_BITS 64

 /**
  * Return an all-ones word if a == 0, and 0 otherwise.
  *
  * The value is widened to 128 bits and decremented: only a == 0 wraps
  * around, which leaves the upper 64 bits all set.  No branch is used;
  * whether the compiler keeps it branch-free is not guaranteed.
  */
 static __inline__ __attribute((always_inline,unused))
 uint64_t word_is_zero(uint64_t a) {
    __uint128_t widened = a;
    widened -= 1;                      /* wraps to 2^128-1 only when a == 0 */
    return (uint64_t)(widened >> 64);  /* upper half is the mask */
 }

 /**
  * Full 64x64 -> 128 bit unsigned multiplication.
  *
  * The result type must be 128 bits wide: returning uint64_t would
  * silently drop the high half of the product, which the field
  * arithmetic accumulates into __uint128_t values.
  *
  * @param a First factor.
  * @param b Second factor.
  * @return The exact 128-bit product a * b.
  */
 static __inline__ __attribute((always_inline,unused))
 __uint128_t widemul(uint64_t a, uint64_t b) {
    return ((__uint128_t)a) * b;
 }

 #endif /* __ARCH_REF64_ARCH_INTRINSICS_H__ */

--- a/src/include/arch_x86_64/arch_intrinsics.h
+++ b/src/include/arch_x86_64/arch_intrinsics.h
@@ -5,6 +5,8 @@
 #ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__
 #define __ARCH_X86_64_ARCH_INTRINSICS_H__

 #define WORD_BITS 64

 #include <stdint.h>

 /* FUTURE: non x86-64 versions of these.
@@ -294,7 +296,7 @@ static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *
  *acc = (((__uint128_t)(d))<<64) | c;
 }

 static __inline__ uint64_t is_zero(uint64_t x) {
 static __inline__ uint64_t word_is_zero(uint64_t x) {
  __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
  return ~x;
 }
--- a/src/include/field.h
+++ b/src/include/field.h
@@ -74,7 +74,6 @@ gf_add (

 /** Subtract mod p.  Bias by 2 and don't reduce  */
 static inline void gf_sub_nr ( gf c, const gf a, const gf b ) {
 //    FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] );
    gf_sub_RAW(c,a,b);
    gf_bias(c, 2);
    if (DECAF_WORD_BITS==32) gf_weak_reduce(c); // HACK
--- a/src/include/word.h
+++ b/src/include/word.h
@@ -8,7 +8,7 @@
 /* for posix_memalign */
 #define _XOPEN_SOURCE 600

 #include "arch_config.h"
 #include <stdint.h>
 #include "arch_intrinsics.h"

 #include <decaf/common.h>
@@ -21,7 +21,6 @@
 #include <endian.h>
 #endif

 #include <stdint.h>
 #include <stdlib.h>
 #include <sys/types.h>
 #include <inttypes.h>
@@ -64,7 +63,7 @@
    #define U56LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
    #define U60LE(x) (x##ull)&((1ull<<30)-1), (x##ull)>>30
    #define letohWORD letoh32
    #define SC_LIMB(x) (x##ull)
    #define SC_LIMB(x) ((uint32_t)x##ull),(x##ull>>32)
 #else
    #error "For now, libdecaf only supports 32- and 64-bit architectures."
 #endif
@@ -159,14 +158,6 @@ typedef struct {
 typedef struct {
    uint32xn_t unaligned;
 } __attribute__((packed)) unaligned_uint32xn_t;
    
 /**
 * Return -1 if x==0, and 0 otherwise.
 */
 static INLINE UNUSED mask_t
 word_is_zero(word_t x) {
    return (mask_t)((((dword_t)(x)) - 1)>>WORD_BITS);
 }

 #if __AVX2__
    static INLINE big_register_t
@@ -185,15 +176,10 @@ word_is_zero(word_t x) {
        return vceqq_u32(x,x^x);
    }
 #else
    static INLINE mask_t
    br_is_zero(word_t x) {
        return (((dword_t)x) - 1)>>WORD_BITS;
    }
    #define br_is_zero word_is_zero
 #endif




 #ifdef __APPLE__
    static INLINE uint64_t htole64 (uint64_t x) { return x; }
    static INLINE uint64_t letoh64 (uint64_t x) { return x; }
--- a/src/p25519/arch_32/f_impl.c
+++ b/src/p25519/arch_32/f_impl.c
@@ -0,0 +1,178 @@
 /* Copyright (c) 2016 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "f_field.h"

 /**
  * Field multiplication: cs = as * bs, reduced limb by limb as it goes.
  *
  * Ten 32-bit limbs are read from each input.  Even-indexed output limbs
  * are masked to 26 bits and odd-indexed ones to 25 bits, with the running
  * accumulator shifted down by the same amount after each limb.  Whatever
  * is left in the accumulator after limb 9 is multiplied by 19 and folded
  * back into limbs 0 and 1.
  *
  * cs is declared __restrict__: the output limbs are written while a and b
  * are still being read, so cs must not alias either input.
  */
 void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
    const uint32_t *a = as->limb, *b = bs->limb, maske = ((1<<26)-1), masko = ((1<<25)-1);
    
    /* bh[k] = 19 * b[k+1]: pre-scaled limbs for the terms that wrap around
     * past limb 9 (bh[i-j+9] stands in for b[i-j+10]).
     * NOTE(review): the product is computed in 32 bits before being stored,
     * and 2*bh[..] is narrowed again if widemul takes uint32_t arguments --
     * confirm the input limb bounds make this safe. */
    uint64_t bh[9];
    int i,j;
    for (i=0; i<9; i++) bh[i] = b[i+1] * 19;
    
    uint32_t *c = cs->limb;

    uint64_t accum = 0;
    /* Each pass produces one even limb and then one odd limb; i is advanced
     * inside the body rather than in the for-header. */
    for (i=0; i<10; /*i+=2*/) {
        /* Even case: terms are consumed in pairs, and the second term of
         * each pair (odd j) is doubled. */
        for (j=0; j<i; /*j+=2*/) {
            accum += widemul(b[i-j], a[j]); j++;
            accum += widemul(2*b[i-j], a[j]); j++;
        }
        /* Here j == i: the last non-wrapping term, then the first wrapping one. */
        accum += widemul(b[0], a[j]); j++;
        accum += widemul(2*bh[8], a[j]); j++;
        for (; j<10; /* j+=2*/) {
            accum += widemul(bh[i-j+9], a[j]); j++;
            accum += widemul(2*bh[i-j+9], a[j]); j++;
        }
        c[i] = accum & maske;
        accum >>= 26;
        i++;

        /* Odd case is easier: all place values are exact. */
        for (j=0; j<=i; j++) {
            accum += widemul(b[i-j], a[j]);
        }
        for (; j<10; j++) {
            accum += widemul(bh[i-j+9], a[j]);
        }
        c[i] = accum & masko;
        accum >>= 25;
        i++;
    }
    
    /* Fold the carry out of limb 9 back into limb 0, scaled by 19. */
    accum *= 19;
    accum += c[0];
    c[0] = accum & maske;
    accum >>= 26;
    
    /* The remaining carry is added to limb 1 without a further mask, so it
     * is asserted to be small. */
    assert(accum < masko);
    c[1] += accum;
 }

 /**
  * Multiply a field element by a small scalar: cs = as * b.
  *
  * b is split into blo (its low 26 bits) and bhi (b >> 26), and each output
  * limb combines blo times the same-index input limb with bhi times the
  * previous input limb.  Even output limbs are masked to 26 bits, odd ones
  * to 25 bits.  The carry out of limb 9 is multiplied by 19 and folded back
  * into limbs 0 and 1.
  *
  * NOTE(review): bhi is held in a uint32_t, so b is presumably expected to
  * be well below 2^58 -- confirm against callers.
  */
 void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
    const uint32_t *a = as->limb, maske = ((1<<26)-1), masko = ((1<<25)-1);
    uint32_t blo = b & maske, bhi = b>>26, bhi2 = 2*bhi;
    uint32_t *c = cs->limb;
    uint64_t accum = 0;

    /* Limb 0: the bhi term wraps around from limb 9, scaled by 38 (= 2*19). */
    accum = widemul(blo, a[0]) + widemul(bhi*38,a[9]);
    c[0] = accum & maske;
    accum >>= 26;

    /* Limb 1: bhi is used undoubled. */
    accum += widemul(blo, a[1]) + widemul(bhi,a[0]);
    c[1] = accum & masko;
    accum >>= 25;

    /* Remaining limbs in even/odd pairs; i is advanced inside the body.
     * Even limbs use the doubled bhi2, odd limbs use bhi. */
    for (int i=2; i<10; /*i+=2*/) {
        accum += widemul(blo, a[i]) + widemul(bhi2, a[i-1]);
        c[i] = accum & maske;
        accum >>= 26;
        i++;

        accum += widemul(blo, a[i]) + widemul(bhi, a[i-1]);
        c[i] = accum & masko;
        accum >>= 25;
        i++;
    }
    
    /* Fold the carry out of limb 9 back into limb 0, scaled by 19. */
    accum *= 19;
    accum += c[0];
    c[0] = accum & maske;
    accum >>= 26;
    
    /* The remaining carry is added to limb 1 without a further mask, so it
     * is asserted to be small. */
    assert(accum < masko);
    c[1] += accum;
 }

 /**
  * Field squaring: cs = as * as.
  *
  * There is no dedicated squaring routine here; this simply calls the
  * general multiplication with both operands equal.
  */
 void gf_sqr (gf_s *__restrict__ cs, const gf as) {
    /* PERF: a specialised squaring would avoid the duplicated cross terms. */
    gf_mul(cs, as, as);
 }

 /**
  * Reduce a in place to a fully reduced representative.
  *
  * Three steps, all without data-dependent branches on the limb values:
  *   1. fold the bits of limb 9 above bit 25 back into limb 0 (times 19);
  *   2. subtract p limb by limb with a signed carry;
  *   3. add p back, masked by the final borrow, so the value is only
  *      changed when step 2 went negative.
  */
 void gf_strong_reduce (gf a) {
    uint32_t maske = (1<<26)-1, masko = (1<<25)-1;

    /* first, clear high */
    a->limb[0] += (a->limb[9]>>25)*19;
    a->limb[9] &= masko;

    /* now the total is less than 2p */

    /* compute total_value - p.  No need to reduce mod p. */
    int64_t scarry = 0;
    int i;
    /* Limbs are handled in even/odd pairs; i is advanced inside the body.
     * The limbs of p are maske-18 for limb 0, maske for the other even
     * limbs and masko for odd limbs. */
    for (i=0; i<10; /*i+=2*/) {
        scarry = scarry + a->limb[i] - ((i==0)?maske-18:maske);
        a->limb[i] = scarry & maske;
        scarry >>= 26;
        i++;

        scarry = scarry + a->limb[i] - masko;
        a->limb[i] = scarry & masko;
        scarry >>= 25;
        i++;
    }

    /* uncommon case: it was >= p, so now scarry = 0 and this = x
     * common case: it was < p, so now scarry = -1 and this = x - p + 2^255
     * so let's add back in p.  will carry back off the top for 2^255.
     */

    assert(word_is_zero(scarry) | word_is_zero(scarry+1));

    /* scarry is 0 or -1, so these are either 0 or the full limb masks. */
    uint32_t scarry_masko = scarry & masko, scarry_maske = scarry & maske;
    uint64_t carry = 0;

    /* add it back */
    for (i=0; i<10; /*i+=2*/) {
        /* maske & ~18 equals maske - 18, the low limb of p. */
        carry = carry + a->limb[i] + ((i==0)?(scarry_maske&~18):scarry_maske);
        a->limb[i] = carry & maske;
        carry >>= 26;
        i++;

        carry = carry + a->limb[i] + scarry_masko;
        a->limb[i] = carry & masko;
        carry >>= 25;
        i++;
    }

    /* The carry off the top must cancel the borrow from the subtraction. */
    assert(word_is_zero(carry + scarry));
 }

 /* Number of bits carried by limb i: 26 for even limbs, 25 for odd ones. */
 #define LIMB_PLACE_VALUE(i) (((i)&1)?25:26)

 /**
  * Serialize x into 32 little-endian bytes.
  *
  * A reduced copy of x is packed through a bit window: whenever fewer
  * than 8 bits are pending and limbs remain, the next limb is OR-ed in
  * above the pending bits; then the low byte is emitted and shifted out.
  */
 void gf_serialize (uint8_t serial[32], const gf x) {
    gf red;
    gf_copy(red, x);
    gf_strong_reduce(red);

    const unsigned int limb_count = sizeof(red->limb)/sizeof(red->limb[0]);
    unsigned int next_limb = 0, pending_bits = 0;
    dword_t window = 0;
    unsigned int out = 0;

    while (out < 32) {
        /* Refill: pull in one more limb when less than a byte is pending. */
        if (pending_bits < 8 && next_limb < limb_count) {
            window |= ((dword_t)red->limb[next_limb]) << pending_bits;
            pending_bits += LIMB_PLACE_VALUE(next_limb);
            next_limb++;
        }
        serial[out] = window;   /* low 8 bits of the window */
        out++;
        pending_bits -= 8;      /* may wrap on the final byte; not used afterwards */
        window >>= 8;
    }
 }

 /**
  * Deserialize 32 little-endian bytes into x and report whether the
  * encoded value is a canonical field element.
  *
  * Bytes are accumulated in a bit window; each time the window holds a
  * full limb (26 bits for even limbs, 25 for odd ones) that limb is
  * stored.  The last limb takes everything left in the window, so it also
  * receives bit 255 of the input.
  *
  * While the limbs are produced, p is subtracted from them with a signed
  * borrow chain (limbs of p: 2^26-19, then alternating 2^25-1 / 2^26-1).
  * The final borrow is negative exactly when the value is below p.
  *
  * @param x      Output field element; written even when the value is
  *               not in the field.
  * @param serial 32 input bytes.
  * @return All-ones mask if the encoded value is less than p, 0 otherwise.
  */
 mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
    const uint32_t maske = (1<<26)-1, masko = (1<<25)-1;
    unsigned int j=0, fill=0;
    dword_t buffer = 0;
    int64_t scarry = 0;   /* running borrow of (value - p) */
    for (unsigned int i=0; i<32; i++) {
        buffer |= ((dword_t)serial[i]) << fill;
        fill += 8;
        if (fill >= LIMB_PLACE_VALUE(j) || i == 31) {
            assert(j < sizeof(x->limb)/sizeof(x->limb[0]));
            word_t mask = ((1ull)<<LIMB_PLACE_VALUE(j))-1;
            /* The final limb keeps all remaining bits (at most 26 here). */
            x->limb[j] = (i==31) ? buffer : (buffer & mask);

            /* Limb j of p = 2^255 - 19. */
            uint32_t p_limb = (j&1) ? masko : ((j==0) ? maske-18 : maske);
            /* Same signed-carry idiom as gf_strong_reduce: the shift of a
             * negative value is relied on to be arithmetic. */
            scarry = (scarry + x->limb[j] - p_limb) >> LIMB_PLACE_VALUE(j);

            buffer >>= LIMB_PLACE_VALUE(j);
            fill -= LIMB_PLACE_VALUE(j);
            j++;
        }
    }
    /* scarry is -1 when value < p, and 0 or 1 otherwise; spread the sign
     * bit into a full mask without branching. */
    return (mask_t)(scarry >> 63);
 }
--- a/src/p25519/arch_32/f_impl.h
+++ b/src/p25519/arch_32/f_impl.h
@@ -0,0 +1,40 @@
 /* Copyright (c) 2014-2016 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #define LIMB(x) (x##ull)&((1ull<<26)-1), (x##ull)>>26
 #define FIELD_LITERAL(a,b,c,d,e) \
    {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e)}}

 /**
  * Limb-wise addition: out = a + b, followed by a weak reduction so the
  * limbs stay small.  No reduction modulo p is performed here.
  */
 void gf_add_RAW (gf out, const gf a, const gf b) {
    unsigned int k = 0;
    while (k < 10) {
        out->limb[k] = a->limb[k] + b->limb[k];
        k++;
    }
    gf_weak_reduce(out);
 }

 /**
  * Limb-wise subtraction: out = a - b, biased so limbs do not go negative.
  *
  * A per-limb constant is added before the weak reduction: twice the
  * 26-bit mask for even limbs, twice the 25-bit mask for odd limbs, and
  * 36 less than the even constant for limb 0.
  */
 void gf_sub_RAW (gf out, const gf a, const gf b) {
    const uint32_t bias_even = ((1ull<<26)-1)*2;
    const uint32_t bias_odd  = ((1ull<<25)-1)*2;
    const uint32_t bias_low  = bias_even-36;
    for (unsigned int k=0; k<10; k++) {
        uint32_t bias;
        if (k == 0) {
            bias = bias_low;
        } else if (k & 1) {
            bias = bias_odd;
        } else {
            bias = bias_even;
        }
        out->limb[k] = a->limb[k] - b->limb[k] + bias;
    }
    gf_weak_reduce(out);
 }

 /**
  * Bias hook: intentionally a no-op in this implementation.
  * Both arguments are explicitly discarded to silence unused-parameter
  * warnings.
  */
 void gf_bias (gf a, int amt) {
    (void) amt;
    (void) a;
 }

 /**
  * Weak reduction: push each limb's overflow bits into the next limb.
  *
  * Even limbs keep 26 bits and odd limbs keep 25.  Limbs are processed
  * from the top down, so every limb receives the overflow of its still
  * unmodified lower neighbour.  The overflow of limb 9, saved up front,
  * wraps around into limb 0 multiplied by 19.
  */
 void gf_weak_reduce (gf a) {
    const uint32_t maske = (1ull<<26) - 1, masko = (1ull<<25) - 1;
    const uint32_t top_overflow = a->limb[9] >> 25;

    for (unsigned int k=9; k>0; k--) {
        if (k & 1) {
            /* odd limb: keep 25 bits, take overflow of the even limb below */
            a->limb[k] = (a->limb[k] & masko) + (a->limb[k-1]>>26);
        } else {
            /* even limb: keep 26 bits, take overflow of the odd limb below */
            a->limb[k] = (a->limb[k] & maske) + (a->limb[k-1]>>25);
        }
    }
    a->limb[0] = (a->limb[0] & maske) + top_overflow*19;
 }

--- a/src/p25519/arch_ref64/arch_config.h
+++ b/src/p25519/arch_ref64/arch_config.h
@@ -1,2 +0,0 @@
 #define WORD_BITS 64
 #define DECAF_255_LIMB_BITS 51
--- a/src/p25519/arch_ref64/f_impl.c
+++ b/src/p25519/arch_ref64/f_impl.c
@@ -4,18 +4,6 @@

 #include "f_field.h"

 static __inline__ __uint128_t widemul(
    const uint64_t a,
    const uint64_t b
 ) {
    return ((__uint128_t)a) * ((__uint128_t)b);
 }

 static __inline__ uint64_t is_zero(uint64_t a) {
    /* let's hope the compiler isn't clever enough to optimize this. */
    return (((__uint128_t)a)-1)>>64;
 }

 void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
    const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
    
@@ -95,7 +83,7 @@ void gf_strong_reduce (gf a) {
    * so let's add back in p.  will carry back off the top for 2^255.
    */

    assert(is_zero(scarry) | is_zero(scarry+1));
    assert(word_is_zero(scarry) | word_is_zero(scarry+1));

    uint64_t scarry_mask = scarry & mask;
    __uint128_t carry = 0;
@@ -107,15 +95,15 @@ void gf_strong_reduce (gf a) {
        carry >>= 51;
    }

    assert(is_zero(carry + scarry));
    assert(word_is_zero(carry + scarry));
 }

 void gf_serialize (uint8_t serial[32], const struct gf x) {
 void gf_serialize (uint8_t serial[32], const gf x) {
    int i,j;
    gf red;
    gf_copy(&red, x);
    gf_strong_reduce(&red);
    uint64_t *r = red.limb;
    gf_copy(red, x);
    gf_strong_reduce(red);
    uint64_t *r = red->limb;
    uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
    for (i=0; i<4; i++) {
        for (j=0; j<8; j++) {
@@ -149,5 +137,5 @@ mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
    x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask;
    x->limb[4] = ser64[3]>>12;
    
    return ~is_zero(~ge);
    return ~word_is_zero(~ge);
 }
--- a/src/p25519/arch_x86_64/arch_config.h
+++ b/src/p25519/arch_x86_64/arch_config.h
@@ -1,2 +0,0 @@
 #define WORD_BITS 64
 #define DECAF_255_LIMB_BITS 51
--- a/src/p25519/arch_x86_64/f_impl.c
+++ b/src/p25519/arch_x86_64/f_impl.c
@@ -194,7 +194,7 @@ void gf_strong_reduce (gf a) {
    * so let's add back in p.  will carry back off the top for 2^255.
    */

    assert(is_zero(scarry) | is_zero(scarry+1));
    assert(word_is_zero(scarry) | word_is_zero(scarry+1));

    uint64_t scarry_mask = scarry & mask;
    __uint128_t carry = 0;
@@ -206,7 +206,7 @@ void gf_strong_reduce (gf a) {
        carry >>= 51;
    }

    assert(is_zero(carry + scarry));
    assert(word_is_zero(carry + scarry));
 }

 void gf_serialize (uint8_t serial[32], const gf x) {
@@ -248,5 +248,5 @@ mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
    x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask;
    x->limb[4] = ser64[3]>>12;
    
    return ~is_zero(~ge);
    return ~word_is_zero(~ge);
 }
--- a/src/p448/arch_32/arch_config.h
+++ b/src/p448/arch_32/arch_config.h
@@ -1,2 +0,0 @@
 #define WORD_BITS 32
 #define DECAF_448_LIMB_BITS 28
--- a/src/p448/arch_32/f_impl.c
+++ b/src/p448/arch_32/f_impl.c
@@ -4,19 +4,6 @@

 #include "f_field.h"

 static inline mask_t is_zero (word_t x) {
    dword_t xx = x;
    xx--;
    return xx >> WORD_BITS;
 }

 static uint64_t widemul (
    const uint32_t a,
    const uint32_t b
 ) {
    return ((uint64_t)a)* b;
 }

 void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { 
    const uint32_t *a = as->limb, *b = bs->limb;
    uint32_t *c = cs->limb;
@@ -141,7 +128,7 @@ void gf_strong_reduce (gf a) {
    * so let's add back in p.  will carry back off the top for 2^448.
    */

    assert(is_zero(scarry) | is_zero(scarry+1));
    assert(word_is_zero(scarry) | word_is_zero(scarry+1));

    word_t scarry_mask = scarry & mask;
    dword_t carry = 0;
@@ -153,7 +140,7 @@ void gf_strong_reduce (gf a) {
        carry >>= 28;
    }

    assert(is_zero(carry + scarry));
    assert(word_is_zero(carry + scarry));
 }

 void gf_serialize (uint8_t *serial, const gf x) {
@@ -195,13 +182,13 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
    }
    
    /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
    ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask);
    ge = (ge & (x->limb[8] + 1)) | word_is_zero(x->limb[8] ^ mask);
    
    /* Propagate the rest */
    for (i=9; i<16; i++) {
        ge &= x->limb[i];
    }
    
    return ~is_zero(ge ^ mask);
    return ~word_is_zero(ge ^ mask);
 }

--- a/src/p448/arch_arm_32/arch_config.h
+++ b/src/p448/arch_arm_32/arch_config.h
@@ -1,2 +0,0 @@
 #define WORD_BITS 32
 #define DECAF_448_LIMB_BITS 28
--- a/src/p448/arch_arm_32/f_impl.c
+++ b/src/p448/arch_arm_32/f_impl.c
@@ -4,19 +4,6 @@

 #include "f_field.h"

 static inline mask_t is_zero (word_t x) {
    dword_t xx = x;
    xx--;
    return xx >> WORD_BITS;
 }

 static uint64_t widemul (
    const uint32_t a,
    const uint32_t b
 ) {
    return ((uint64_t)a)* b;
 }

 static inline void __attribute__((gnu_inline,always_inline))
 smlal (
    uint64_t *acc,
@@ -874,7 +861,7 @@ void gf_strong_reduce (
    * so let's add back in p.  will carry back off the top for 2^448.
    */

    assert(is_zero(scarry) | is_zero(scarry+1));
    assert(word_is_zero(scarry) | word_is_zero(scarry+1));

    word_t scarry_mask = scarry & mask;
    dword_t carry = 0;
@@ -886,7 +873,7 @@ void gf_strong_reduce (
        carry >>= 28;
    }

    assert(is_zero(carry + scarry));
    assert(word_is_zero(carry + scarry));
 }

 void gf_serialize (
@@ -935,12 +922,12 @@ gf_deserialize (
    }
    
    /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
    ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask);
    ge = (ge & (x->limb[8] + 1)) | word_is_zero(x->limb[8] ^ mask);
    
    /* Propagate the rest */
    for (i=9; i<16; i++) {
        ge &= x->limb[i];
    }
    
    return ~is_zero(ge ^ mask);
    return ~word_is_zero(ge ^ mask);
 }
--- a/src/p448/arch_neon_experimental/f_impl.c
+++ b/src/p448/arch_neon_experimental/f_impl.c
@@ -4,15 +4,6 @@

 #include "f_field.h"

 static inline mask_t __attribute__((always_inline))
 is_zero (
    word_t x
 ) {
    dword_t xx = x;
    xx--;
    return xx >> WORD_BITS;
 }

 static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused))
 xx_vaddup_u64(uint64x2_t x) {
    __asm__ ("vadd.s64 %f0, %e0" : "+w"(x));
@@ -629,7 +620,7 @@ void gf_strong_reduce (gf a) {
    * so let's add back in p.  will carry back off the top for 2^448.
    */

    assert(is_zero(scarry) | is_zero(scarry+1));
    assert(word_is_zero(scarry) | word_is_zero(scarry+1));

    word_t scarry_mask = scarry & mask;
    dword_t carry = 0;
@@ -641,7 +632,7 @@ void gf_strong_reduce (gf a) {
        carry >>= 28;
    }

    assert(is_zero(carry + scarry));
    assert(word_is_zero(carry + scarry));
 }

 void gf_serialize (uint8_t *serial, const gf x) {
@@ -684,13 +675,13 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
    }
    
    /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
    ge = (ge & (x->limb[LIMBPERM(8)] + 1)) | is_zero(x->limb[LIMBPERM(8)] ^ mask);
    ge = (ge & (x->limb[LIMBPERM(8)] + 1)) | word_is_zero(x->limb[LIMBPERM(8)] ^ mask);
    
    /* Propagate the rest */
    for (i=9; i<16; i++) {
        ge &= x->limb[LIMBPERM(i)];
    }
    
    return ~is_zero(ge ^ mask);
    return ~word_is_zero(ge ^ mask);
 }

--- a/src/p448/arch_neon_experimental/f_impl.h
+++ b/src/p448/arch_neon_experimental/f_impl.h
--- a/src/p448/arch_neon_experimental/arch_config.h
+++ b/src/p448/arch_neon_experimental/arch_config.h
@@ -1,3 +0,0 @@
 #define WORD_BITS 32
 #define DECAF_448_LIMB_BITS 28

--- a/src/p448/arch_ref64/arch_config.h
+++ b/src/p448/arch_ref64/arch_config.h
@@ -1,3 +0,0 @@
 #define WORD_BITS 64
 #define DECAF_448_LIMB_BITS 56

--- a/src/p448/arch_ref64/f_impl.c
+++ b/src/p448/arch_ref64/f_impl.c
@@ -4,18 +4,6 @@

 #include "f_field.h"

 static __inline__ __uint128_t widemul(
    const uint64_t a,
    const uint64_t b
 ) {
    return ((__uint128_t)a) * ((__uint128_t)b);
 }

 static __inline__ uint64_t is_zero(uint64_t a) {
    /* let's hope the compiler isn't clever enough to optimize this. */
    return (((__uint128_t)a)-1)>>64;
 }

 void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
    const uint64_t *a = as->limb, *b = bs->limb;
    uint64_t *c = cs->limb;
@@ -337,7 +325,7 @@ void gf_strong_reduce (gf a) {
    * so let's add back in p.  will carry back off the top for 2^448.
    */

    assert(is_zero(scarry) | is_zero(scarry+1));
    assert(word_is_zero(scarry) | word_is_zero(scarry+1));

    uint64_t scarry_mask = scarry & mask;
    __uint128_t carry = 0;
@@ -349,7 +337,7 @@ void gf_strong_reduce (gf a) {
        carry >>= 56;
    }

    assert(is_zero(carry + scarry));
    assert(word_is_zero(carry + scarry));
 }

 void gf_serialize (uint8_t *serial, const gf x) {
@@ -389,12 +377,12 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
    }
    
    /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
    ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
    ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask);
    
    /* Propagate the rest */
    for (i=5; i<8; i++) {
        ge &= x->limb[i];
    }
    
    return ~is_zero(ge ^ mask);
    return ~word_is_zero(ge ^ mask);
 }
--- a/src/p448/arch_x86_64/arch_config.h
+++ b/src/p448/arch_x86_64/arch_config.h
@@ -1,2 +0,0 @@
 #define WORD_BITS 64
 #define DECAF_448_LIMB_BITS 56
--- a/src/p448/arch_x86_64/f_impl.c
+++ b/src/p448/arch_x86_64/f_impl.c
@@ -315,7 +315,7 @@ void gf_strong_reduce (gf a) {
    * so let's add back in p.  will carry back off the top for 2^448.
    */

    assert(is_zero(scarry) | is_zero(scarry+1));
    assert(word_is_zero(scarry) | word_is_zero(scarry+1));

    uint64_t scarry_mask = scarry & mask;
    __uint128_t carry = 0;
@@ -327,7 +327,7 @@ void gf_strong_reduce (gf a) {
        carry >>= 56;
    }

    assert(is_zero(carry + scarry));
    assert(word_is_zero(carry + scarry));
 }

 void gf_serialize (uint8_t *serial, const gf x) {
@@ -367,13 +367,13 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
    }
    
    /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
    ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
    ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask);
    
    /* Propagate the rest */
    for (i=5; i<8; i++) {
        ge &= x->limb[i];
    }
    
    return ~is_zero(ge ^ mask);
    return ~word_is_zero(ge ^ mask);
 }

--- a/src/p480/arch_x86_64/arch_config.h
+++ b/src/p480/arch_x86_64/arch_config.h
@@ -1 +0,0 @@
 #define WORD_BITS 64
--- a/src/p480/arch_x86_64/f_impl.c
+++ b/src/p480/arch_x86_64/f_impl.c
@@ -315,7 +315,7 @@ void gf_strong_reduce (gf *a) {
    * so let's add back in p.  will carry back off the top for 2^480.
    */

    assert(is_zero(scarry) | is_zero(scarry+1));
    assert(word_is_zero(scarry) | word_is_zero(scarry+1));

    uint64_t scarry_mask = scarry & mask;
    __uint128_t carry = 0;
@@ -327,7 +327,7 @@ void gf_strong_reduce (gf *a) {
        carry >>= 60;
    }

    assert(is_zero(carry + scarry));
    assert(word_is_zero(carry + scarry));
 }

 void gf_serialize (uint8_t *serial, const struct gf *x) {
@@ -381,13 +381,13 @@ mask_t gf_deserialize (gf *x, const uint8_t serial[60]) {
    }
    
    /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
    ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
    ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask);
    
    /* Propagate the rest */
    for (i=5; i<8; i++) {
        ge &= x->limb[i];
    }
    
    return ~is_zero(ge ^ mask);
    return ~word_is_zero(ge ^ mask);
 }

--- a/src/p521/arch_ref64/arch_config.h
+++ b/src/p521/arch_ref64/arch_config.h
@@ -1 +0,0 @@
 #define WORD_BITS 64
--- a/src/p521/arch_ref64/f_impl.c
+++ b/src/p521/arch_ref64/f_impl.c
@@ -4,18 +4,6 @@

 #include "f_field.h"

 /* 64 x 64 -> 128-bit multiply.  The first operand is widened before the
  * multiplication so the whole product is kept. */
 static __inline__ __uint128_t widemul(const uint64_t a, const uint64_t b) {
    __uint128_t product = a;
    product *= (__uint128_t)b;
    return product;
 }

 /* Returns an all-ones word if a is zero and 0 for any other value. */
 static __inline__ uint64_t is_zero(uint64_t a) {
    __uint128_t widened = a;
    /* 0 - 1 wraps to 2^128 - 1, whose top 64 bits are all set; any nonzero
     * a stays below 2^64 after the decrement.  Hopefully the compiler does
     * not optimize this into something else. */
    widened -= 1;
    return (uint64_t)(widened >> 64);
 }

 void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
    uint64_t *c = cs->limb;
    const uint64_t *a = as->limb, *b = bs->limb;
@@ -318,7 +306,7 @@ void gf_strong_reduce (gf a) {
    * so let's add back in p.  will carry back off the top for 2^521.
    */

    assert(is_zero(scarry) | is_zero(scarry+1));
    assert(word_is_zero(scarry) | word_is_zero(scarry+1));

    uint64_t scarry_mask = scarry & mask;
    __uint128_t carry = 0;
@@ -330,7 +318,7 @@ void gf_strong_reduce (gf a) {
        carry >>= (i==8) ? 57 : 58;
    }

    assert(is_zero(carry + scarry));
    assert(word_is_zero(carry + scarry));
 }

 void gf_serialize (uint8_t *serial, const struct gf x) {
@@ -367,14 +355,14 @@ mask_t gf_deserialize (gf x, const uint8_t serial[66]) {
    }
    
    /* Check for reduction.  First, high has to be < 2^57 */
    mask_t good = is_zero(out>>57);
    mask_t good = word_is_zero(out>>57);
    
    uint64_t and = -1ull;
    for (i=0; i<8; i++) {
        and &= x->limb[i];
    }
    and &= (2*out+1);
    good &= is_zero((and+1)>>58);
    good &= word_is_zero((and+1)>>58);
    
    return good;
 }
--- a/src/p521/arch_x86_64_r12/arch_config.h
+++ b/src/p521/arch_x86_64_r12/arch_config.h
@@ -1 +0,0 @@
 #define WORD_BITS 64
--- a/src/p521/arch_x86_64_r12/f_impl.c
+++ b/src/p521/arch_x86_64_r12/f_impl.c
@@ -8,11 +8,6 @@ typedef struct {
  uint64x3_t lo, hi, hier;
 } nonad_t;

 /* Mask-valued zero check: yields ~0 for a == 0 and 0 for every other a. */
 static __inline__ uint64_t is_zero(uint64_t a) {
    /* The decrement is done in 128 bits so that only a == 0 borrows into
     * the high half; let's hope the compiler leaves this alone. */
    const __uint128_t decremented = ((__uint128_t)a) - 1;
    const uint64_t high_half = (uint64_t)(decremented >> 64);
    return high_half;
 }

 /* Unsigned 64 x 64 -> 128-bit multiply. */
 static inline __uint128_t widemulu(uint64_t a, uint64_t b) {
    __uint128_t product = a;  /* widen first so the multiply is 128-bit */
    product *= b;
    return product;
 }
@@ -378,7 +373,7 @@ void gf_strong_reduce (gf *a) {
    * so let's add back in p.  will carry back off the top for 2^521.
    */

    assert(is_zero(scarry) | is_zero(scarry+1));
    assert(word_is_zero(scarry) | word_is_zero(scarry+1));

    uint64_t scarry_mask = scarry & mask;
    __uint128_t carry = 0;
@@ -390,7 +385,7 @@ void gf_strong_reduce (gf *a) {
        carry >>= (i==8) ? 57 : 58;
    }

    assert(is_zero(carry + scarry));
    assert(word_is_zero(carry + scarry));

    a->limb[3] = a->limb[7] = a->limb[11] = 0;
 }
@@ -429,14 +424,14 @@ mask_t gf_deserialize (gf *x, const uint8_t serial[LIMBPERM(66)]) {
    }
    
    /* Check for reduction.  First, high has to be < 2^57 */
    mask_t good = is_zero(out>>57);
    mask_t good = word_is_zero(out>>57);
    
    uint64_t and = -1ull;
    for (i=0; i<8; i++) {
        and &= x->limb[LIMBPERM(i)];
    }
    and &= (2*out+1);
    good &= is_zero((and+1)>>58);
    good &= word_is_zero((and+1)>>58);

    x->limb[3] = x->limb[7] = x->limb[11] = 0;