diff --git a/Makefile b/Makefile index 16eb948..420f010 100644 --- a/Makefile +++ b/Makefile @@ -31,13 +31,6 @@ LD = $(CC) LDXX = $(CXX) ASM ?= $(CC) -ifneq (,$(findstring x86_64,$(MACHINE))) -ARCH ?= arch_x86_64 -else -# no i386 port yet -ARCH ?= arch_ref32 -endif - WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN) @@ -55,17 +48,8 @@ endif TODAY = $(shell date "+%Y-%m-%d") -ifneq (,$(findstring arm,$(MACHINE))) -ifneq (,$(findstring neon,$(ARCH))) -ARCHFLAGS += -mfpu=neon -else -ARCHFLAGS += -mfpu=vfpv3-d16 -endif -ARCHFLAGS += -mcpu=cortex-a8 # FIXME -GENFLAGS += -DN_TESTS_BASE=1000 # sooooo sloooooow -else -ARCHFLAGS += -maes -mavx2 -mbmi2 #TODO -endif +#FIXME ARCHFLAGS +ARCHFLAGS ?= -maes -mavx2 -mbmi2 #TODO ifeq ($(CC),clang) WARNFLAGS += -Wgcc-compat @@ -141,18 +125,18 @@ $(GEN_HEADERS): src/gen_headers/*.py src/public_include/decaf/* # Per-field code: call with field, arch ################################################################ define define_field -ARCH_FOR_$(1) = $(2) +ARCH_FOR_$(1) ?= $(2) COMPONENTS_OF_$(1) = $$(BUILD_OBJ)/$(1)_impl.o $$(BUILD_OBJ)/$(1)_arithmetic.o LIBCOMPONENTS += $$(COMPONENTS_OF_$(1)) $$(BUILD_ASM)/$(1)_arithmetic.s: src/$(1)/f_arithmetic.c $$(HEADERS) - $$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \ - -I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \ + $$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$$(ARCH_FOR_$(1)) -I $(BUILD_H)/$(1) \ + -I $(BUILD_H)/$(1)/$$(ARCH_FOR_$(1)) -I src/include/$$(ARCH_FOR_$(1)) \ -S -c -o $$@ $$< -$$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$(2)/f_impl.c $$(HEADERS) - $$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \ - -I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \ +$$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$$(ARCH_FOR_$(1))/f_impl.c $$(HEADERS) + $$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$$(ARCH_FOR_$(1)) -I $(BUILD_H)/$(1) \ + -I $(BUILD_H)/$(1)/$$(ARCH_FOR_$(1)) -I src/include/$$(ARCH_FOR_$(1)) \ -S -c -o $$@ $$< endef diff --git a/src/curve_ed25519/curve_data.inc.c b/src/curve_ed25519/curve_data.inc.c index b3d0c56..9012b4c 100644 --- a/src/curve_ed25519/curve_data.inc.c +++ b/src/curve_ed25519/curve_data.inc.c @@ -5,7 +5,6 @@ #define SCALAR_LIMBS DECAF_255_SCALAR_LIMBS #define SCALAR_BITS DECAF_255_SCALAR_BITS -#define NLIMBS DECAF_255_LIMBS #define scalar_t decaf_255_scalar_t #define point_t decaf_255_point_t #define precomputed_s decaf_255_precomputed_s diff --git a/src/curve_ed448goldilocks/curve_data.inc.c b/src/curve_ed448goldilocks/curve_data.inc.c index b42c944..b5c8217 100644 --- a/src/curve_ed448goldilocks/curve_data.inc.c +++ b/src/curve_ed448goldilocks/curve_data.inc.c @@ -4,7 +4,6 @@ #define SCALAR_LIMBS DECAF_448_SCALAR_LIMBS #define SCALAR_BITS DECAF_448_SCALAR_BITS -#define NLIMBS DECAF_448_LIMBS #define scalar_t decaf_448_scalar_t #define point_t decaf_448_point_t #define precomputed_s decaf_448_precomputed_s diff --git a/src/decaf.c b/src/decaf.c index 2025ca3..a690678 100644 --- a/src/decaf.c +++ b/src/decaf.c @@ -10,13 +10,14 @@ #define _XOPEN_SOURCE 600 /* for posix_memalign */ #define __STDC_WANT_LIB_EXT1__ 1 /* for memset_s */ -#include #include #include "word.h" #include "field.h" #include "decaf_config.h" +#include + /* Include the curve data here */ #include "curve_data.inc.c" @@ -41,7 +42,10 @@ extern const gf SQRT_MINUS_ONE; extern const gf SQRT_ONE_MINUS_D; /* TODO: Intern this? */ #endif -#define WBITS DECAF_WORD_BITS +/* FIXME: this can be different from DECAF_WORD_BITS, and word_t can be different from decaf_word_t, + * eg when mixing and matching implementations for different curves. Homogenize this. + */ +#define WBITS WORD_BITS const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}}; extern const scalar_t API_NS(sc_r2); @@ -82,8 +86,8 @@ const size_t API_NS2(alignof,precomputed_s) = 32; #define UNROLL #endif -#define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; ilimb)/sizeof(x->limb[0]); i++) { decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap; x->limb[i] ^= s; y->limb[i] ^= s; - }); + } } /** Compare a==b */ @@ -123,9 +127,11 @@ gf_eq(const gf a, const gf b) { gf_sub(c,a,b); gf_strong_reduce(c); decaf_word_t ret=0; - FOR_LIMB(i, ret |= c->limb[i] ); - /* Hope the compiler is too dumb to optimize this, thus noinline */ - return ((decaf_dword_t)ret - 1) >> WBITS; + for (unsigned int i=0; ilimb)/sizeof(c->limb[0]); i++) { + ret |= c->limb[i]; + } + + return word_is_zero(ret); } /** Inverse square root using addition chain. */ @@ -385,7 +391,7 @@ API_NS(scalar_eq) ( for (i=0; ilimb[i] ^ b->limb[i]; } - return (((decaf_dword_t)diff)-1)>>WBITS; + return word_is_zero(diff); } /* *** API begins here *** */ @@ -1280,7 +1286,7 @@ API_NS(invert_elligator_nonuniform) ( const point_t p, uint16_t hint_ ) { - uint64_t hint = hint_; + decaf_bool_t hint = hint_; decaf_bool_t sgn_s = -(hint & 1), sgn_t_over_s = -(hint>>1 & 1), sgn_r0 = -(hint>>2 & 1), @@ -1293,13 +1299,13 @@ API_NS(invert_elligator_nonuniform) ( gf_sub(b,ONE,b); /* t+1 */ gf_sqr(c,a); /* s^2 */ decaf_bool_t is_identity = gf_eq(p->t,ZERO); - { /* identity adjustments */ + { + /* identity adjustments */ /* in case of identity, currently c=0, t=0, b=1, will encode to 1 */ /* if hint is 0, -> 0 */ /* if hint is to neg t/s, then go to infinity, effectively set s to 1 */ cond_sel(c,c,ONE,is_identity & sgn_t_over_s); - cond_sel(b,b,ZERO,is_identity & ~sgn_t_over_s & ~sgn_s); /* identity adjust */ - + cond_sel(b,b,ZERO,is_identity & ~sgn_t_over_s & ~sgn_s); /* identity adjust */ } gf_mulw_sgn(d,c,2*EDWARDS_D-1); /* $d = (2d-a)s^2 */ gf_add(a,b,d); /* num? */ diff --git a/src/decaf_gen_tables.c b/src/decaf_gen_tables.c index de917d6..85feced 100644 --- a/src/decaf_gen_tables.c +++ b/src/decaf_gen_tables.c @@ -11,9 +11,10 @@ #define _XOPEN_SOURCE 600 /* for posix_memalign */ #include #include + +#include "field.h" #include "decaf.h" #include "decaf_config.h" -#include "field.h" #define GEN_TABLES #include "curve_data.inc.c" @@ -91,8 +92,8 @@ int main(int argc, char **argv) { unsigned i; printf("/** @warning: this file was automatically generated. */\n"); - printf("#include \n\n"); printf("#include \"field.h\"\n\n"); + printf("#include \n\n"); printf("#define API_NS(_id) %s_##_id\n", API_NAME); printf("#define API_NS2(_pref,_id) _pref##_%s_##_id\n", API_NAME); diff --git a/src/gen_headers/curve_data.py b/src/gen_headers/curve_data.py index 772a217..ed0e901 100644 --- a/src/gen_headers/curve_data.py +++ b/src/gen_headers/curve_data.py @@ -21,7 +21,6 @@ curve_data = { "name" : "IsoEd25519", "cxx_ns" : "IsoEd25519", "shortname" : "255", - "longnum" : "25519", "c_ns" : "decaf_255", "cofactor" : 8, "field" : "p25519", @@ -32,7 +31,6 @@ curve_data = { "name" : "Ed448-Goldilocks", "cxx_ns" : "Ed448Goldilocks", "shortname" : "448", - "longnum" : "448", "c_ns" : "decaf_448", "cofactor" : 4, "field" : "p448", diff --git a/src/gen_headers/decaf_h.py b/src/gen_headers/decaf_h.py index 8a6151f..f092e61 100644 --- a/src/gen_headers/decaf_h.py +++ b/src/gen_headers/decaf_h.py @@ -13,7 +13,6 @@ extern "C" { #endif /** @cond internal */ -#define %(C_NS)s_LIMBS (%(gf_impl_bits)d/DECAF_WORD_BITS) #define %(C_NS)s_SCALAR_LIMBS ((%(scalar_bits)d-1)/DECAF_WORD_BITS+1) /** @endcond */ @@ -21,13 +20,13 @@ extern "C" { #define %(C_NS)s_SCALAR_BITS %(scalar_bits)d /** @cond internal */ -#ifndef __%(C_NS)s_GF_DEFINED__ -#define __%(C_NS)s_GF_DEFINED__ 1 +#ifndef __DECAF_%(gf_shortname)s_GF_DEFINED__ +#define __DECAF_%(gf_shortname)s_GF_DEFINED__ 1 /** @brief Galois field element internal structure */ -typedef struct gf_%(longnum)s_s { - decaf_word_t limb[%(C_NS)s_LIMBS]; -} __attribute__((aligned(32))) gf_%(longnum)s_s, gf_%(longnum)s_t[1]; -#endif /* __%(C_NS)s_GF_DEFINED__ */ +typedef struct gf_%(gf_shortname)s_s { + decaf_word_t limb[%(gf_impl_bits)d/DECAF_WORD_BITS]; +} __attribute__((aligned(32))) gf_%(gf_shortname)s_s, gf_%(gf_shortname)s_t[1]; +#endif /* __DECAF_%(gf_shortname)s_GF_DEFINED__ */ /** @endcond */ /** Number of bytes in a serialized point. */ @@ -39,7 +38,7 @@ typedef struct gf_%(longnum)s_s { /** Twisted Edwards extended homogeneous coordinates */ typedef struct %(c_ns)s_point_s { /** @cond internal */ - gf_%(longnum)s_t x,y,z,t; + gf_%(gf_shortname)s_t x,y,z,t; /** @endcond */ } %(c_ns)s_point_t[1]; diff --git a/src/gen_headers/f_field_h.py b/src/gen_headers/f_field_h.py index a06360b..388faba 100644 --- a/src/gen_headers/f_field_h.py +++ b/src/gen_headers/f_field_h.py @@ -10,9 +10,13 @@ f_field_h = gen_file( #include #include -#include "decaf/decaf_%(gf_bits)s.h" /* HACK in genheader */ #include "word.h" +#define __DECAF_%(gf_shortname)s_GF_DEFINED__ 1 +typedef struct gf_%(gf_shortname)s_s { + word_t limb[%(gf_impl_bits)d/sizeof(word_t)/8]; +} __attribute__((aligned(32))) gf_%(gf_shortname)s_s, gf_%(gf_shortname)s_t[1]; + #define GF_LIT_LIMB_BITS %(gf_lit_limb_bits)d #define GF_BITS %(gf_bits)d #define gf gf_%(gf_shortname)s_t @@ -57,4 +61,4 @@ mask_t gf_deserialize (gf x, const uint8_t serial[(GF_BITS-1)/8+1]); #endif #include "f_impl.h" /* Bring in the inline implementations */ -""") \ No newline at end of file +""") diff --git a/src/include/arch_32/arch_intrinsics.h b/src/include/arch_32/arch_intrinsics.h new file mode 100644 index 0000000..4e9d159 --- /dev/null +++ b/src/include/arch_32/arch_intrinsics.h @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#ifndef __ARCH_ARCH_32_ARCH_INTRINSICS_H__ +#define __ARCH_ARCH_32_ARCH_INTRINSICS_H__ + +#define WORD_BITS 32 + +static __inline__ __attribute((always_inline,unused)) +uint32_t word_is_zero(uint32_t a) { + /* let's hope the compiler isn't clever enough to optimize this. */ + return (((uint64_t)a)-1)>>32; +} + +static __inline__ __attribute((always_inline,unused)) +uint64_t widemul(uint32_t a, uint32_t b) { + return ((uint64_t)a) * b; +} + +#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */ + diff --git a/src/include/arch_arm_32/arch_intrinsics.h b/src/include/arch_arm_32/arch_intrinsics.h new file mode 100644 index 0000000..86080b1 --- /dev/null +++ b/src/include/arch_arm_32/arch_intrinsics.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#ifndef __ARCH_ARM_32_ARCH_INTRINSICS_H__ +#define __ARCH_ARM_32_ARCH_INTRINSICS_H__ + +#define WORD_BITS 32 + +static __inline__ __attribute((always_inline,unused)) +uint32_t word_is_zero(uint32_t a) { + uint32_t ret; + asm("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc"); + return ret; +} + +static __inline__ __attribute((always_inline,unused)) +uint64_t widemul(uint32_t a, uint32_t b) { + /* Could be UMULL, but it's hard to express to CC that the registers must be different */ + return ((uint64_t)a) * b; +} + +#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */ + diff --git a/src/include/arch_neon/arch_intrinsics.h b/src/include/arch_neon/arch_intrinsics.h new file mode 100644 index 0000000..b138796 --- /dev/null +++ b/src/include/arch_neon/arch_intrinsics.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#ifndef __ARCH_NEON_ARCH_INTRINSICS_H__ +#define __ARCH_NEON_ARCH_INTRINSICS_H__ + +#define WORD_BITS 32 + +static __inline__ __attribute((always_inline,unused)) +uint32_t word_is_zero(uint32_t a) { + uint32_t ret; + asm("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc"); + return ret; +} + +static __inline__ __attribute((always_inline,unused)) +uint64_t widemul(uint32_t a, uint32_t b) { + /* Could be UMULL, but it's hard to express to CC that the registers must be different */ + return ((uint64_t)a) * b; +} + +#endif /* __ARCH_NEON_ARCH_INTRINSICS_H__ */ + diff --git a/src/include/arch_ref64/arch_intrinsics.h b/src/include/arch_ref64/arch_intrinsics.h new file mode 100644 index 0000000..8413a2e --- /dev/null +++ b/src/include/arch_ref64/arch_intrinsics.h @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#ifndef __ARCH_REF64_ARCH_INTRINSICS_H__ +#define __ARCH_REF64_ARCH_INTRINSICS_H__ + +#define WORD_BITS 64 + +static __inline__ __attribute((always_inline,unused)) +uint64_t word_is_zero(uint64_t a) { + /* let's hope the compiler isn't clever enough to optimize this. */ + return (((__uint128_t)a)-1)>>64; +} + +static __inline__ __attribute((always_inline,unused)) +uint64_t widemul(uint64_t a, uint64_t b) { + return ((__uint128_t)a) * b; +} + +#endif /* ARCH_REF64_ARCH_INTRINSICS_H__ */ + diff --git a/src/include/arch_x86_64/arch_intrinsics.h b/src/include/arch_x86_64/arch_intrinsics.h index d2b03e1..843f337 100644 --- a/src/include/arch_x86_64/arch_intrinsics.h +++ b/src/include/arch_x86_64/arch_intrinsics.h @@ -5,6 +5,8 @@ #ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__ #define __ARCH_X86_64_ARCH_INTRINSICS_H__ +#define WORD_BITS 64 + #include /* FUTURE: non x86-64 versions of these. @@ -294,7 +296,7 @@ static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t * *acc = (((__uint128_t)(d))<<64) | c; } -static __inline__ uint64_t is_zero(uint64_t x) { +static __inline__ uint64_t word_is_zero(uint64_t x) { __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x)); return ~x; } diff --git a/src/include/field.h b/src/include/field.h index 0121c39..9850f1c 100644 --- a/src/include/field.h +++ b/src/include/field.h @@ -74,7 +74,6 @@ gf_add ( /** Subtract mod p. Bias by 2 and don't reduce */ static inline void gf_sub_nr ( gf c, const gf a, const gf b ) { -// FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] ); gf_sub_RAW(c,a,b); gf_bias(c, 2); if (DECAF_WORD_BITS==32) gf_weak_reduce(c); // HACK diff --git a/src/include/word.h b/src/include/word.h index b44a92e..2261b13 100644 --- a/src/include/word.h +++ b/src/include/word.h @@ -8,7 +8,7 @@ /* for posix_memalign */ #define _XOPEN_SOURCE 600 -#include "arch_config.h" +#include #include "arch_intrinsics.h" #include @@ -21,7 +21,6 @@ #include #endif -#include #include #include #include @@ -64,7 +63,7 @@ #define U56LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28 #define U60LE(x) (x##ull)&((1ull<<30)-1), (x##ull)>>30 #define letohWORD letoh32 - #define SC_LIMB(x) (x##ull) + #define SC_LIMB(x) ((uint32_t)x##ull),(x##ull>>32) #else #error "For now, libdecaf only supports 32- and 64-bit architectures." #endif @@ -159,14 +158,6 @@ typedef struct { typedef struct { uint32xn_t unaligned; } __attribute__((packed)) unaligned_uint32xn_t; - -/** - * Return -1 if x==0, and 0 otherwise. - */ -static INLINE UNUSED mask_t -word_is_zero(word_t x) { - return (mask_t)((((dword_t)(x)) - 1)>>WORD_BITS); -} #if __AVX2__ static INLINE big_register_t @@ -185,15 +176,10 @@ word_is_zero(word_t x) { return vceqq_u32(x,x^x); } #else - static INLINE mask_t - br_is_zero(word_t x) { - return (((dword_t)x) - 1)>>WORD_BITS; - } + #define br_is_zero word_is_zero #endif - - #ifdef __APPLE__ static INLINE uint64_t htole64 (uint64_t x) { return x; } static INLINE uint64_t letoh64 (uint64_t x) { return x; } diff --git a/src/p25519/arch_32/f_impl.c b/src/p25519/arch_32/f_impl.c new file mode 100644 index 0000000..cfc3fb3 --- /dev/null +++ b/src/p25519/arch_32/f_impl.c @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#include "f_field.h" + +void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { + const uint32_t *a = as->limb, *b = bs->limb, maske = ((1<<26)-1), masko = ((1<<25)-1); + + uint64_t bh[9]; + int i,j; + for (i=0; i<9; i++) bh[i] = b[i+1] * 19; + + uint32_t *c = cs->limb; + + uint64_t accum = 0; + for (i=0; i<10; /*i+=2*/) { + /* Even case. */ + for (j=0; j>= 26; + i++; + + /* Odd case is easier: all place values are exact. */ + for (j=0; j<=i; j++) { + accum += widemul(b[i-j], a[j]); + } + for (; j<10; j++) { + accum += widemul(bh[i-j+9], a[j]); + } + c[i] = accum & masko; + accum >>= 25; + i++; + } + + accum *= 19; + accum += c[0]; + c[0] = accum & maske; + accum >>= 26; + + assert(accum < masko); + c[1] += accum; +} + +void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { + const uint32_t *a = as->limb, maske = ((1<<26)-1), masko = ((1<<25)-1); + uint32_t blo = b & maske, bhi = b>>26, bhi2 = 2*bhi; + uint32_t *c = cs->limb; + uint64_t accum = 0; + + accum = widemul(blo, a[0]) + widemul(bhi*38,a[9]); + c[0] = accum & maske; + accum >>= 26; + + accum += widemul(blo, a[1]) + widemul(bhi,a[0]); + c[1] = accum & masko; + accum >>= 25; + + for (int i=2; i<10; /*i+=2*/) { + accum += widemul(blo, a[i]) + widemul(bhi2, a[i-1]); + c[i] = accum & maske; + accum >>= 26; + i++; + + accum += widemul(blo, a[i]) + widemul(bhi, a[i-1]); + c[i] = accum & masko; + accum >>= 25; + i++; + } + + accum *= 19; + accum += c[0]; + c[0] = accum & maske; + accum >>= 26; + + assert(accum < masko); + c[1] += accum; +} + +void gf_sqr (gf_s *__restrict__ cs, const gf as) { + gf_mul(cs,as,as); // PERF +} + +void gf_strong_reduce (gf a) { + uint32_t maske = (1<<26)-1, masko = (1<<25)-1; + + /* first, clear high */ + a->limb[0] += (a->limb[9]>>25)*19; + a->limb[9] &= masko; + + /* now the total is less than 2p */ + + /* compute total_value - p. No need to reduce mod p. */ + int64_t scarry = 0; + int i; + for (i=0; i<10; /*i+=2*/) { + scarry = scarry + a->limb[i] - ((i==0)?maske-18:maske); + a->limb[i] = scarry & maske; + scarry >>= 26; + i++; + + scarry = scarry + a->limb[i] - masko; + a->limb[i] = scarry & masko; + scarry >>= 25; + i++; + } + + /* uncommon case: it was >= p, so now scarry = 0 and this = x + * common case: it was < p, so now scarry = -1 and this = x - p + 2^255 + * so let's add back in p. will carry back off the top for 2^255. + */ + + assert(word_is_zero(scarry) | word_is_zero(scarry+1)); + + uint32_t scarry_masko = scarry & masko, scarry_maske = scarry & maske; + uint64_t carry = 0; + + /* add it back */ + for (i=0; i<10; /*i+=2*/) { + carry = carry + a->limb[i] + ((i==0)?(scarry_maske&~18):scarry_maske); + a->limb[i] = carry & maske; + carry >>= 26; + i++; + + carry = carry + a->limb[i] + scarry_masko; + a->limb[i] = carry & masko; + carry >>= 25; + i++; + } + + assert(word_is_zero(carry + scarry)); +} + +#define LIMB_PLACE_VALUE(i) (((i)&1)?25:26) +void gf_serialize (uint8_t serial[32], const gf x) { + gf red; + gf_copy(red, x); + gf_strong_reduce(red); + unsigned int j=0, fill=0; + dword_t buffer = 0; + for (unsigned int i=0; i<32; i++) { + if (fill < 8 && j < sizeof(red->limb)/sizeof(red->limb[0])) { + buffer |= ((dword_t)red->limb[j]) << fill; + fill += LIMB_PLACE_VALUE(j); + j++; + } + serial[i] = buffer; + fill -= 8; + buffer >>= 8; + } +} + +mask_t gf_deserialize (gf x, const uint8_t serial[32]) { + unsigned int j=0, fill=0; + dword_t buffer = 0; + for (unsigned int i=0; i<32; i++) { + buffer |= ((dword_t)serial[i]) << fill; + fill += 8; + if (fill >= LIMB_PLACE_VALUE(j) || i == 31) { + assert(j < sizeof(x->limb)/sizeof(x->limb[0])); + word_t mask = ((1ull)<limb[j] = (i==31) ? buffer : (buffer & mask); // FIXME: this can in theory truncate the buffer if it's not in field. + buffer >>= LIMB_PLACE_VALUE(j); + fill -= LIMB_PLACE_VALUE(j); + j++; + } + } + return -1; // FIXME: test whether in field. +} diff --git a/src/p25519/arch_32/f_impl.h b/src/p25519/arch_32/f_impl.h new file mode 100644 index 0000000..5e51bf0 --- /dev/null +++ b/src/p25519/arch_32/f_impl.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2014-2016 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#define LIMB(x) (x##ull)&((1ull<<26)-1), (x##ull)>>26 +#define FIELD_LITERAL(a,b,c,d,e) \ + {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e)}} + +void gf_add_RAW (gf out, const gf a, const gf b) { + for (unsigned int i=0; i<10; i++) { + out->limb[i] = a->limb[i] + b->limb[i]; + } + gf_weak_reduce(out); +} + +void gf_sub_RAW (gf out, const gf a, const gf b) { + uint32_t coe = ((1ull<<26)-1)*2, coo = ((1ull<<25)-1)*2, co0 = coe-36; + for (unsigned int i=0; i<10; i+=2) { + out->limb[i] = a->limb[i] - b->limb[i] + ((i==0) ? co0 : coe); + out->limb[i+1] = a->limb[i+1] - b->limb[i+1] + coo; + } + gf_weak_reduce(out); +} + +void gf_bias (gf a, int amt) { + (void) a; + (void) amt; +} + +void gf_weak_reduce (gf a) { + uint32_t maske = (1ull<<26) - 1, masko = (1ull<<25) - 1; + uint32_t tmp = a->limb[9] >> 25; + for (unsigned int i=8; i>0; i-=2) { + a->limb[i+1] = (a->limb[i+1] & masko) + (a->limb[i]>>26); + a->limb[i] = (a->limb[i] & maske) + (a->limb[i-1]>>25); + } + a->limb[1] = (a->limb[1] & masko) + (a->limb[0]>>26); + a->limb[0] = (a->limb[0] & maske) + tmp*19; +} + diff --git a/src/p25519/arch_ref64/arch_config.h b/src/p25519/arch_ref64/arch_config.h deleted file mode 100644 index b9504c3..0000000 --- a/src/p25519/arch_ref64/arch_config.h +++ /dev/null @@ -1,2 +0,0 @@ -#define WORD_BITS 64 -#define DECAF_255_LIMB_BITS 51 \ No newline at end of file diff --git a/src/p25519/arch_ref64/f_impl.c b/src/p25519/arch_ref64/f_impl.c index 7afd485..414fd66 100644 --- a/src/p25519/arch_ref64/f_impl.c +++ b/src/p25519/arch_ref64/f_impl.c @@ -4,18 +4,6 @@ #include "f_field.h" -static __inline__ __uint128_t widemul( - const uint64_t a, - const uint64_t b -) { - return ((__uint128_t)a) * ((__uint128_t)b); -} - -static __inline__ uint64_t is_zero(uint64_t a) { - /* let's hope the compiler isn't clever enough to optimize this. */ - return (((__uint128_t)a)-1)>>64; -} - void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); @@ -95,7 +83,7 @@ void gf_strong_reduce (gf a) { * so let's add back in p. will carry back off the top for 2^255. */ - assert(is_zero(scarry) | is_zero(scarry+1)); + assert(word_is_zero(scarry) | word_is_zero(scarry+1)); uint64_t scarry_mask = scarry & mask; __uint128_t carry = 0; @@ -107,15 +95,15 @@ void gf_strong_reduce (gf a) { carry >>= 51; } - assert(is_zero(carry + scarry)); + assert(word_is_zero(carry + scarry)); } -void gf_serialize (uint8_t serial[32], const struct gf x) { +void gf_serialize (uint8_t serial[32], const gf x) { int i,j; gf red; - gf_copy(&red, x); - gf_strong_reduce(&red); - uint64_t *r = red.limb; + gf_copy(red, x); + gf_strong_reduce(red); + uint64_t *r = red->limb; uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12}; for (i=0; i<4; i++) { for (j=0; j<8; j++) { @@ -149,5 +137,5 @@ mask_t gf_deserialize (gf x, const uint8_t serial[32]) { x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask; x->limb[4] = ser64[3]>>12; - return ~is_zero(~ge); + return ~word_is_zero(~ge); } diff --git a/src/p25519/arch_x86_64/arch_config.h b/src/p25519/arch_x86_64/arch_config.h deleted file mode 100644 index 6d2cbd9..0000000 --- a/src/p25519/arch_x86_64/arch_config.h +++ /dev/null @@ -1,2 +0,0 @@ -#define WORD_BITS 64 -#define DECAF_255_LIMB_BITS 51 diff --git a/src/p25519/arch_x86_64/f_impl.c b/src/p25519/arch_x86_64/f_impl.c index 168dbd5..0b02519 100644 --- a/src/p25519/arch_x86_64/f_impl.c +++ b/src/p25519/arch_x86_64/f_impl.c @@ -194,7 +194,7 @@ void gf_strong_reduce (gf a) { * so let's add back in p. will carry back off the top for 2^255. */ - assert(is_zero(scarry) | is_zero(scarry+1)); + assert(word_is_zero(scarry) | word_is_zero(scarry+1)); uint64_t scarry_mask = scarry & mask; __uint128_t carry = 0; @@ -206,7 +206,7 @@ void gf_strong_reduce (gf a) { carry >>= 51; } - assert(is_zero(carry + scarry)); + assert(word_is_zero(carry + scarry)); } void gf_serialize (uint8_t serial[32], const gf x) { @@ -248,5 +248,5 @@ mask_t gf_deserialize (gf x, const uint8_t serial[32]) { x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask; x->limb[4] = ser64[3]>>12; - return ~is_zero(~ge); + return ~word_is_zero(~ge); } diff --git a/src/p448/arch_32/arch_config.h b/src/p448/arch_32/arch_config.h deleted file mode 100644 index d4ada31..0000000 --- a/src/p448/arch_32/arch_config.h +++ /dev/null @@ -1,2 +0,0 @@ -#define WORD_BITS 32 -#define DECAF_448_LIMB_BITS 28 diff --git a/src/p448/arch_32/f_impl.c b/src/p448/arch_32/f_impl.c index 739b1fb..24e8fe2 100644 --- a/src/p448/arch_32/f_impl.c +++ b/src/p448/arch_32/f_impl.c @@ -4,19 +4,6 @@ #include "f_field.h" -static inline mask_t is_zero (word_t x) { - dword_t xx = x; - xx--; - return xx >> WORD_BITS; -} - -static uint64_t widemul ( - const uint32_t a, - const uint32_t b -) { - return ((uint64_t)a)* b; -} - void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { const uint32_t *a = as->limb, *b = bs->limb; uint32_t *c = cs->limb; @@ -141,7 +128,7 @@ void gf_strong_reduce (gf a) { * so let's add back in p. will carry back off the top for 2^448. */ - assert(is_zero(scarry) | is_zero(scarry+1)); + assert(word_is_zero(scarry) | word_is_zero(scarry+1)); word_t scarry_mask = scarry & mask; dword_t carry = 0; @@ -153,7 +140,7 @@ void gf_strong_reduce (gf a) { carry >>= 28; } - assert(is_zero(carry + scarry)); + assert(word_is_zero(carry + scarry)); } void gf_serialize (uint8_t *serial, const gf x) { @@ -195,13 +182,13 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) { } /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ - ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask); + ge = (ge & (x->limb[8] + 1)) | word_is_zero(x->limb[8] ^ mask); /* Propagate the rest */ for (i=9; i<16; i++) { ge &= x->limb[i]; } - return ~is_zero(ge ^ mask); + return ~word_is_zero(ge ^ mask); } diff --git a/src/p448/arch_arm_32/arch_config.h b/src/p448/arch_arm_32/arch_config.h deleted file mode 100644 index d4ada31..0000000 --- a/src/p448/arch_arm_32/arch_config.h +++ /dev/null @@ -1,2 +0,0 @@ -#define WORD_BITS 32 -#define DECAF_448_LIMB_BITS 28 diff --git a/src/p448/arch_arm_32/f_impl.c b/src/p448/arch_arm_32/f_impl.c index 62eda0f..b1719ad 100644 --- a/src/p448/arch_arm_32/f_impl.c +++ b/src/p448/arch_arm_32/f_impl.c @@ -4,19 +4,6 @@ #include "f_field.h" -static inline mask_t is_zero (word_t x) { - dword_t xx = x; - xx--; - return xx >> WORD_BITS; -} - -static uint64_t widemul ( - const uint32_t a, - const uint32_t b -) { - return ((uint64_t)a)* b; -} - static inline void __attribute__((gnu_inline,always_inline)) smlal ( uint64_t *acc, @@ -874,7 +861,7 @@ void gf_strong_reduce ( * so let's add back in p. will carry back off the top for 2^448. */ - assert(is_zero(scarry) | is_zero(scarry+1)); + assert(word_is_zero(scarry) | word_is_zero(scarry+1)); word_t scarry_mask = scarry & mask; dword_t carry = 0; @@ -886,7 +873,7 @@ void gf_strong_reduce ( carry >>= 28; } - assert(is_zero(carry + scarry)); + assert(word_is_zero(carry + scarry)); } void gf_serialize ( @@ -935,12 +922,12 @@ gf_deserialize ( } /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ - ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask); + ge = (ge & (x->limb[8] + 1)) | word_is_zero(x->limb[8] ^ mask); /* Propagate the rest */ for (i=9; i<16; i++) { ge &= x->limb[i]; } - return ~is_zero(ge ^ mask); + return ~word_is_zero(ge ^ mask); } diff --git a/src/p448/arch_neon_experimental/f_impl.c b/src/p448/arch_neon/f_impl.c similarity index 98% rename from src/p448/arch_neon_experimental/f_impl.c rename to src/p448/arch_neon/f_impl.c index 1225f5e..845f31e 100644 --- a/src/p448/arch_neon_experimental/f_impl.c +++ b/src/p448/arch_neon/f_impl.c @@ -4,15 +4,6 @@ #include "f_field.h" -static inline mask_t __attribute__((always_inline)) -is_zero ( - word_t x -) { - dword_t xx = x; - xx--; - return xx >> WORD_BITS; -} - static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused)) xx_vaddup_u64(uint64x2_t x) { __asm__ ("vadd.s64 %f0, %e0" : "+w"(x)); @@ -629,7 +620,7 @@ void gf_strong_reduce (gf a) { * so let's add back in p. will carry back off the top for 2^448. */ - assert(is_zero(scarry) | is_zero(scarry+1)); + assert(word_is_zero(scarry) | word_is_zero(scarry+1)); word_t scarry_mask = scarry & mask; dword_t carry = 0; @@ -641,7 +632,7 @@ void gf_strong_reduce (gf a) { carry >>= 28; } - assert(is_zero(carry + scarry)); + assert(word_is_zero(carry + scarry)); } void gf_serialize (uint8_t *serial, const gf x) { @@ -684,13 +675,13 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) { } /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ - ge = (ge & (x->limb[LIMBPERM(8)] + 1)) | is_zero(x->limb[LIMBPERM(8)] ^ mask); + ge = (ge & (x->limb[LIMBPERM(8)] + 1)) | word_is_zero(x->limb[LIMBPERM(8)] ^ mask); /* Propagate the rest */ for (i=9; i<16; i++) { ge &= x->limb[LIMBPERM(i)]; } - return ~is_zero(ge ^ mask); + return ~word_is_zero(ge ^ mask); } diff --git a/src/p448/arch_neon_experimental/f_impl.h b/src/p448/arch_neon/f_impl.h similarity index 100% rename from src/p448/arch_neon_experimental/f_impl.h rename to src/p448/arch_neon/f_impl.h diff --git a/src/p448/arch_neon_experimental/arch_config.h b/src/p448/arch_neon_experimental/arch_config.h deleted file mode 100644 index e65216f..0000000 --- a/src/p448/arch_neon_experimental/arch_config.h +++ /dev/null @@ -1,3 +0,0 @@ -#define WORD_BITS 32 -#define DECAF_448_LIMB_BITS 28 - diff --git a/src/p448/arch_ref64/arch_config.h b/src/p448/arch_ref64/arch_config.h deleted file mode 100644 index f58980e..0000000 --- a/src/p448/arch_ref64/arch_config.h +++ /dev/null @@ -1,3 +0,0 @@ -#define WORD_BITS 64 -#define DECAF_448_LIMB_BITS 56 - diff --git a/src/p448/arch_ref64/f_impl.c b/src/p448/arch_ref64/f_impl.c index 74aeeb1..4717b0e 100644 --- a/src/p448/arch_ref64/f_impl.c +++ b/src/p448/arch_ref64/f_impl.c @@ -4,18 +4,6 @@ #include "f_field.h" -static __inline__ __uint128_t widemul( - const uint64_t a, - const uint64_t b -) { - return ((__uint128_t)a) * ((__uint128_t)b); -} - -static __inline__ uint64_t is_zero(uint64_t a) { - /* let's hope the compiler isn't clever enough to optimize this. */ - return (((__uint128_t)a)-1)>>64; -} - void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { const uint64_t *a = as->limb, *b = bs->limb; uint64_t *c = cs->limb; @@ -337,7 +325,7 @@ void gf_strong_reduce (gf a) { * so let's add back in p. will carry back off the top for 2^448. */ - assert(is_zero(scarry) | is_zero(scarry+1)); + assert(word_is_zero(scarry) | word_is_zero(scarry+1)); uint64_t scarry_mask = scarry & mask; __uint128_t carry = 0; @@ -349,7 +337,7 @@ void gf_strong_reduce (gf a) { carry >>= 56; } - assert(is_zero(carry + scarry)); + assert(word_is_zero(carry + scarry)); } void gf_serialize (uint8_t *serial, const gf x) { @@ -389,12 +377,12 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) { } /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ - ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask); + ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask); /* Propagate the rest */ for (i=5; i<8; i++) { ge &= x->limb[i]; } - return ~is_zero(ge ^ mask); + return ~word_is_zero(ge ^ mask); } diff --git a/src/p448/arch_x86_64/arch_config.h b/src/p448/arch_x86_64/arch_config.h deleted file mode 100644 index 3f449f4..0000000 --- a/src/p448/arch_x86_64/arch_config.h +++ /dev/null @@ -1,2 +0,0 @@ -#define WORD_BITS 64 -#define DECAF_448_LIMB_BITS 56 diff --git a/src/p448/arch_x86_64/f_impl.c b/src/p448/arch_x86_64/f_impl.c index 07744fa..8ebb569 100644 --- a/src/p448/arch_x86_64/f_impl.c +++ b/src/p448/arch_x86_64/f_impl.c @@ -315,7 +315,7 @@ void gf_strong_reduce (gf a) { * so let's add back in p. will carry back off the top for 2^448. */ - assert(is_zero(scarry) | is_zero(scarry+1)); + assert(word_is_zero(scarry) | word_is_zero(scarry+1)); uint64_t scarry_mask = scarry & mask; __uint128_t carry = 0; @@ -327,7 +327,7 @@ void gf_strong_reduce (gf a) { carry >>= 56; } - assert(is_zero(carry + scarry)); + assert(word_is_zero(carry + scarry)); } void gf_serialize (uint8_t *serial, const gf x) { @@ -367,13 +367,13 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) { } /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ - ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask); + ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask); /* Propagate the rest */ for (i=5; i<8; i++) { ge &= x->limb[i]; } - return ~is_zero(ge ^ mask); + return ~word_is_zero(ge ^ mask); } diff --git a/src/p480/arch_x86_64/arch_config.h b/src/p480/arch_x86_64/arch_config.h deleted file mode 100644 index 58758cc..0000000 --- a/src/p480/arch_x86_64/arch_config.h +++ /dev/null @@ -1 +0,0 @@ -#define WORD_BITS 64 diff --git a/src/p480/arch_x86_64/f_impl.c b/src/p480/arch_x86_64/f_impl.c index b3c565b..e021241 100644 --- a/src/p480/arch_x86_64/f_impl.c +++ b/src/p480/arch_x86_64/f_impl.c @@ -315,7 +315,7 @@ void gf_strong_reduce (gf *a) { * so let's add back in p. will carry back off the top for 2^480. */ - assert(is_zero(scarry) | is_zero(scarry+1)); + assert(word_is_zero(scarry) | word_is_zero(scarry+1)); uint64_t scarry_mask = scarry & mask; __uint128_t carry = 0; @@ -327,7 +327,7 @@ void gf_strong_reduce (gf *a) { carry >>= 60; } - assert(is_zero(carry + scarry)); + assert(word_is_zero(carry + scarry)); } void gf_serialize (uint8_t *serial, const struct gf *x) { @@ -381,13 +381,13 @@ mask_t gf_deserialize (gf *x, const uint8_t serial[60]) { } /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ - ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask); + ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask); /* Propagate the rest */ for (i=5; i<8; i++) { ge &= x->limb[i]; } - return ~is_zero(ge ^ mask); + return ~word_is_zero(ge ^ mask); } diff --git a/src/p521/arch_ref64/arch_config.h b/src/p521/arch_ref64/arch_config.h deleted file mode 100644 index 58758cc..0000000 --- a/src/p521/arch_ref64/arch_config.h +++ /dev/null @@ -1 +0,0 @@ -#define WORD_BITS 64 diff --git a/src/p521/arch_ref64/f_impl.c b/src/p521/arch_ref64/f_impl.c index 03c98ee..c3aee6f 100644 --- a/src/p521/arch_ref64/f_impl.c +++ b/src/p521/arch_ref64/f_impl.c @@ -4,18 +4,6 @@ #include "f_field.h" -static __inline__ __uint128_t widemul( - const uint64_t a, - const uint64_t b -) { - return ((__uint128_t)a) * ((__uint128_t)b); -} - -static __inline__ uint64_t is_zero(uint64_t a) { - /* let's hope the compiler isn't clever enough to optimize this. */ - return (((__uint128_t)a)-1)>>64; -} - void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { uint64_t *c = cs->limb; const uint64_t *a = as->limb, *b = bs->limb; @@ -318,7 +306,7 @@ void gf_strong_reduce (gf a) { * so let's add back in p. will carry back off the top for 2^521. */ - assert(is_zero(scarry) | is_zero(scarry+1)); + assert(word_is_zero(scarry) | word_is_zero(scarry+1)); uint64_t scarry_mask = scarry & mask; __uint128_t carry = 0; @@ -330,7 +318,7 @@ void gf_strong_reduce (gf a) { carry >>= (i==8) ? 57 : 58; } - assert(is_zero(carry + scarry)); + assert(word_is_zero(carry + scarry)); } void gf_serialize (uint8_t *serial, const struct gf x) { @@ -367,14 +355,14 @@ mask_t gf_deserialize (gf x, const uint8_t serial[66]) { } /* Check for reduction. First, high has to be < 2^57 */ - mask_t good = is_zero(out>>57); + mask_t good = word_is_zero(out>>57); uint64_t and = -1ull; for (i=0; i<8; i++) { and &= x->limb[i]; } and &= (2*out+1); - good &= is_zero((and+1)>>58); + good &= word_is_zero((and+1)>>58); return good; } diff --git a/src/p521/arch_x86_64_r12/arch_config.h b/src/p521/arch_x86_64_r12/arch_config.h deleted file mode 100644 index 58758cc..0000000 --- a/src/p521/arch_x86_64_r12/arch_config.h +++ /dev/null @@ -1 +0,0 @@ -#define WORD_BITS 64 diff --git a/src/p521/arch_x86_64_r12/f_impl.c b/src/p521/arch_x86_64_r12/f_impl.c index 39d0f1e..2040531 100644 --- a/src/p521/arch_x86_64_r12/f_impl.c +++ b/src/p521/arch_x86_64_r12/f_impl.c @@ -8,11 +8,6 @@ typedef struct { uint64x3_t lo, hi, hier; } nonad_t; -static __inline__ uint64_t is_zero(uint64_t a) { - /* let's hope the compiler isn't clever enough to optimize this. */ - return (((__uint128_t)a)-1)>>64; -} - static inline __uint128_t widemulu(uint64_t a, uint64_t b) { return ((__uint128_t)(a)) * b; } @@ -378,7 +373,7 @@ void gf_strong_reduce (gf *a) { * so let's add back in p. will carry back off the top for 2^521. */ - assert(is_zero(scarry) | is_zero(scarry+1)); + assert(word_is_zero(scarry) | word_is_zero(scarry+1)); uint64_t scarry_mask = scarry & mask; __uint128_t carry = 0; @@ -390,7 +385,7 @@ void gf_strong_reduce (gf *a) { carry >>= (i==8) ? 57 : 58; } - assert(is_zero(carry + scarry)); + assert(word_is_zero(carry + scarry)); a->limb[3] = a->limb[7] = a->limb[11] = 0; } @@ -429,14 +424,14 @@ mask_t gf_deserialize (gf *x, const uint8_t serial[LIMBPERM(66)]) { } /* Check for reduction. First, high has to be < 2^57 */ - mask_t good = is_zero(out>>57); + mask_t good = word_is_zero(out>>57); uint64_t and = -1ull; for (i=0; i<8; i++) { and &= x->limb[LIMBPERM(i)]; } and &= (2*out+1); - good &= is_zero((and+1)>>58); + good &= word_is_zero((and+1)>>58); x->limb[3] = x->limb[7] = x->limb[11] = 0;