Browse Source

Working on getting cross-arch working again. Several TODOs.

Currently compiles and passes tests on x86_64 with arch_32 and
DECAF_FORCE_32_BIT=1 (as well as the native settings of course),
so that's a start.

Want to make serialization routine cross-arch.  Need to check that
perf is good enough (likely).  Current routine in p25519/arch_32
is almost cross-arch, but has known bugs (FIXMEs).  Needs to take
into account separate p and, for NEON, the LIMBPERM.

Want to decouple arches for each curve/field.  Currently the split
between decaf_word_t and word_t makes this fraught with peril.  Fix
is probably to rename decaf_word_t to decaf_api_word_t and fix it
to either uint32 or uint64, then make internal things separate per
field.  That way we don't have to try arch detection in the header,
which is nice.

Need to make decaf_gen_tables use SC_LIMB.  Might as well get rid
of API_NS there too.
master
Mike Hamburg 9 years ago
parent
commit
a5bed6b351
38 changed files with 402 additions and 210 deletions
  1. +8
    -24
      Makefile
  2. +0
    -1
      src/curve_ed25519/curve_data.inc.c
  3. +0
    -1
      src/curve_ed448goldilocks/curve_data.inc.c
  4. +20
    -14
      src/decaf.c
  5. +3
    -2
      src/decaf_gen_tables.c
  6. +0
    -2
      src/gen_headers/curve_data.py
  7. +7
    -8
      src/gen_headers/decaf_h.py
  8. +6
    -2
      src/gen_headers/f_field_h.py
  9. +22
    -0
      src/include/arch_32/arch_intrinsics.h
  10. +24
    -0
      src/include/arch_arm_32/arch_intrinsics.h
  11. +24
    -0
      src/include/arch_neon/arch_intrinsics.h
  12. +22
    -0
      src/include/arch_ref64/arch_intrinsics.h
  13. +3
    -1
      src/include/arch_x86_64/arch_intrinsics.h
  14. +0
    -1
      src/include/field.h
  15. +3
    -17
      src/include/word.h
  16. +178
    -0
      src/p25519/arch_32/f_impl.c
  17. +40
    -0
      src/p25519/arch_32/f_impl.h
  18. +0
    -2
      src/p25519/arch_ref64/arch_config.h
  19. +7
    -19
      src/p25519/arch_ref64/f_impl.c
  20. +0
    -2
      src/p25519/arch_x86_64/arch_config.h
  21. +3
    -3
      src/p25519/arch_x86_64/f_impl.c
  22. +0
    -2
      src/p448/arch_32/arch_config.h
  23. +4
    -17
      src/p448/arch_32/f_impl.c
  24. +0
    -2
      src/p448/arch_arm_32/arch_config.h
  25. +4
    -17
      src/p448/arch_arm_32/f_impl.c
  26. +4
    -13
      src/p448/arch_neon/f_impl.c
  27. +0
    -0
      src/p448/arch_neon/f_impl.h
  28. +0
    -3
      src/p448/arch_neon_experimental/arch_config.h
  29. +0
    -3
      src/p448/arch_ref64/arch_config.h
  30. +4
    -16
      src/p448/arch_ref64/f_impl.c
  31. +0
    -2
      src/p448/arch_x86_64/arch_config.h
  32. +4
    -4
      src/p448/arch_x86_64/f_impl.c
  33. +0
    -1
      src/p480/arch_x86_64/arch_config.h
  34. +4
    -4
      src/p480/arch_x86_64/f_impl.c
  35. +0
    -1
      src/p521/arch_ref64/arch_config.h
  36. +4
    -16
      src/p521/arch_ref64/f_impl.c
  37. +0
    -1
      src/p521/arch_x86_64_r12/arch_config.h
  38. +4
    -9
      src/p521/arch_x86_64_r12/f_impl.c

+ 8
- 24
Makefile View File

@@ -31,13 +31,6 @@ LD = $(CC)
LDXX = $(CXX)
ASM ?= $(CC)

ifneq (,$(findstring x86_64,$(MACHINE)))
ARCH ?= arch_x86_64
else
# no i386 port yet
ARCH ?= arch_ref32
endif

WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
-Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)

@@ -55,17 +48,8 @@ endif

TODAY = $(shell date "+%Y-%m-%d")

ifneq (,$(findstring arm,$(MACHINE)))
ifneq (,$(findstring neon,$(ARCH)))
ARCHFLAGS += -mfpu=neon
else
ARCHFLAGS += -mfpu=vfpv3-d16
endif
ARCHFLAGS += -mcpu=cortex-a8 # FIXME
GENFLAGS += -DN_TESTS_BASE=1000 # sooooo sloooooow
else
ARCHFLAGS += -maes -mavx2 -mbmi2 #TODO
endif
#FIXME ARCHFLAGS
ARCHFLAGS ?= -maes -mavx2 -mbmi2 #TODO

ifeq ($(CC),clang)
WARNFLAGS += -Wgcc-compat
@@ -141,18 +125,18 @@ $(GEN_HEADERS): src/gen_headers/*.py src/public_include/decaf/*
# Per-field code: call with field, arch
################################################################
define define_field
ARCH_FOR_$(1) = $(2)
ARCH_FOR_$(1) ?= $(2)
COMPONENTS_OF_$(1) = $$(BUILD_OBJ)/$(1)_impl.o $$(BUILD_OBJ)/$(1)_arithmetic.o
LIBCOMPONENTS += $$(COMPONENTS_OF_$(1))

$$(BUILD_ASM)/$(1)_arithmetic.s: src/$(1)/f_arithmetic.c $$(HEADERS)
$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \
-I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \
$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$$(ARCH_FOR_$(1)) -I $(BUILD_H)/$(1) \
-I $(BUILD_H)/$(1)/$$(ARCH_FOR_$(1)) -I src/include/$$(ARCH_FOR_$(1)) \
-S -c -o $$@ $$<

$$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$(2)/f_impl.c $$(HEADERS)
$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \
-I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \
$$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$$(ARCH_FOR_$(1))/f_impl.c $$(HEADERS)
$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$$(ARCH_FOR_$(1)) -I $(BUILD_H)/$(1) \
-I $(BUILD_H)/$(1)/$$(ARCH_FOR_$(1)) -I src/include/$$(ARCH_FOR_$(1)) \
-S -c -o $$@ $$<
endef



+ 0
- 1
src/curve_ed25519/curve_data.inc.c View File

@@ -5,7 +5,6 @@

#define SCALAR_LIMBS DECAF_255_SCALAR_LIMBS
#define SCALAR_BITS DECAF_255_SCALAR_BITS
#define NLIMBS DECAF_255_LIMBS
#define scalar_t decaf_255_scalar_t
#define point_t decaf_255_point_t
#define precomputed_s decaf_255_precomputed_s


+ 0
- 1
src/curve_ed448goldilocks/curve_data.inc.c View File

@@ -4,7 +4,6 @@

#define SCALAR_LIMBS DECAF_448_SCALAR_LIMBS
#define SCALAR_BITS DECAF_448_SCALAR_BITS
#define NLIMBS DECAF_448_LIMBS
#define scalar_t decaf_448_scalar_t
#define point_t decaf_448_point_t
#define precomputed_s decaf_448_precomputed_s


+ 20
- 14
src/decaf.c View File

@@ -10,13 +10,14 @@

#define _XOPEN_SOURCE 600 /* for posix_memalign */
#define __STDC_WANT_LIB_EXT1__ 1 /* for memset_s */
#include <decaf.h>
#include <string.h>

#include "word.h"
#include "field.h"
#include "decaf_config.h"

#include <decaf.h>

/* Include the curve data here */
#include "curve_data.inc.c"

@@ -41,7 +42,10 @@ extern const gf SQRT_MINUS_ONE;
extern const gf SQRT_ONE_MINUS_D; /* TODO: Intern this? */
#endif

#define WBITS DECAF_WORD_BITS
/* FIXME: this can be different from DECAF_WORD_BITS, and word_t can be different from decaf_word_t,
* eg when mixing and matching implementations for different curves. Homogenize this.
*/
#define WBITS WORD_BITS

const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}};
extern const scalar_t API_NS(sc_r2);
@@ -82,8 +86,8 @@ const size_t API_NS2(alignof,precomputed_s) = 32;
#define UNROLL
#endif

#define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++) { op; }}
#define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<NLIMBS; i++) { op; }}
#define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<sizeof(gf)/sizeof(word_t); i++) { op; }}
#define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<sizeof(gf)/sizeof(word_t); i++) { op; }}

/** Copy x = y */
static INLINE void
@@ -106,11 +110,11 @@ cond_neg(gf x, decaf_bool_t neg) {
/** Constant time, if (swap) (x,y) = (y,x); */
static INLINE void
cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) {
FOR_LIMB_U(i, {
UNROLL for (unsigned int i=0; i<sizeof(x->limb)/sizeof(x->limb[0]); i++) {
decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap;
x->limb[i] ^= s;
y->limb[i] ^= s;
});
}
}

/** Compare a==b */
@@ -123,9 +127,11 @@ gf_eq(const gf a, const gf b) {
gf_sub(c,a,b);
gf_strong_reduce(c);
decaf_word_t ret=0;
FOR_LIMB(i, ret |= c->limb[i] );
/* Hope the compiler is too dumb to optimize this, thus noinline */
return ((decaf_dword_t)ret - 1) >> WBITS;
for (unsigned int i=0; i<sizeof(c->limb)/sizeof(c->limb[0]); i++) {
ret |= c->limb[i];
}

return word_is_zero(ret);
}

/** Inverse square root using addition chain. */
@@ -385,7 +391,7 @@ API_NS(scalar_eq) (
for (i=0; i<SCALAR_LIMBS; i++) {
diff |= a->limb[i] ^ b->limb[i];
}
return (((decaf_dword_t)diff)-1)>>WBITS;
return word_is_zero(diff);
}

/* *** API begins here *** */
@@ -1280,7 +1286,7 @@ API_NS(invert_elligator_nonuniform) (
const point_t p,
uint16_t hint_
) {
uint64_t hint = hint_;
decaf_bool_t hint = hint_;
decaf_bool_t sgn_s = -(hint & 1),
sgn_t_over_s = -(hint>>1 & 1),
sgn_r0 = -(hint>>2 & 1),
@@ -1293,13 +1299,13 @@ API_NS(invert_elligator_nonuniform) (
gf_sub(b,ONE,b); /* t+1 */
gf_sqr(c,a); /* s^2 */
decaf_bool_t is_identity = gf_eq(p->t,ZERO);
{ /* identity adjustments */
{
/* identity adjustments */
/* in case of identity, currently c=0, t=0, b=1, will encode to 1 */
/* if hint is 0, -> 0 */
/* if hint is to neg t/s, then go to infinity, effectively set s to 1 */
cond_sel(c,c,ONE,is_identity & sgn_t_over_s);
cond_sel(b,b,ZERO,is_identity & ~sgn_t_over_s & ~sgn_s); /* identity adjust */
cond_sel(b,b,ZERO,is_identity & ~sgn_t_over_s & ~sgn_s); /* identity adjust */
}
gf_mulw_sgn(d,c,2*EDWARDS_D-1); /* $d = (2d-a)s^2 */
gf_add(a,b,d); /* num? */


+ 3
- 2
src/decaf_gen_tables.c View File

@@ -11,9 +11,10 @@
#define _XOPEN_SOURCE 600 /* for posix_memalign */
#include <stdio.h>
#include <stdlib.h>

#include "field.h"
#include "decaf.h"
#include "decaf_config.h"
#include "field.h"

#define GEN_TABLES
#include "curve_data.inc.c"
@@ -91,8 +92,8 @@ int main(int argc, char **argv) {
unsigned i;
printf("/** @warning: this file was automatically generated. */\n");
printf("#include <decaf.h>\n\n");
printf("#include \"field.h\"\n\n");
printf("#include <decaf.h>\n\n");
printf("#define API_NS(_id) %s_##_id\n", API_NAME);
printf("#define API_NS2(_pref,_id) _pref##_%s_##_id\n", API_NAME);


+ 0
- 2
src/gen_headers/curve_data.py View File

@@ -21,7 +21,6 @@ curve_data = {
"name" : "IsoEd25519",
"cxx_ns" : "IsoEd25519",
"shortname" : "255",
"longnum" : "25519",
"c_ns" : "decaf_255",
"cofactor" : 8,
"field" : "p25519",
@@ -32,7 +31,6 @@ curve_data = {
"name" : "Ed448-Goldilocks",
"cxx_ns" : "Ed448Goldilocks",
"shortname" : "448",
"longnum" : "448",
"c_ns" : "decaf_448",
"cofactor" : 4,
"field" : "p448",


+ 7
- 8
src/gen_headers/decaf_h.py View File

@@ -13,7 +13,6 @@ extern "C" {
#endif

/** @cond internal */
#define %(C_NS)s_LIMBS (%(gf_impl_bits)d/DECAF_WORD_BITS)
#define %(C_NS)s_SCALAR_LIMBS ((%(scalar_bits)d-1)/DECAF_WORD_BITS+1)
/** @endcond */

@@ -21,13 +20,13 @@ extern "C" {
#define %(C_NS)s_SCALAR_BITS %(scalar_bits)d

/** @cond internal */
#ifndef __%(C_NS)s_GF_DEFINED__
#define __%(C_NS)s_GF_DEFINED__ 1
#ifndef __DECAF_%(gf_shortname)s_GF_DEFINED__
#define __DECAF_%(gf_shortname)s_GF_DEFINED__ 1
/** @brief Galois field element internal structure */
typedef struct gf_%(longnum)s_s {
decaf_word_t limb[%(C_NS)s_LIMBS];
} __attribute__((aligned(32))) gf_%(longnum)s_s, gf_%(longnum)s_t[1];
#endif /* __%(C_NS)s_GF_DEFINED__ */
typedef struct gf_%(gf_shortname)s_s {
decaf_word_t limb[%(gf_impl_bits)d/DECAF_WORD_BITS];
} __attribute__((aligned(32))) gf_%(gf_shortname)s_s, gf_%(gf_shortname)s_t[1];
#endif /* __DECAF_%(gf_shortname)s_GF_DEFINED__ */
/** @endcond */

/** Number of bytes in a serialized point. */
@@ -39,7 +38,7 @@ typedef struct gf_%(longnum)s_s {
/** Twisted Edwards extended homogeneous coordinates */
typedef struct %(c_ns)s_point_s {
/** @cond internal */
gf_%(longnum)s_t x,y,z,t;
gf_%(gf_shortname)s_t x,y,z,t;
/** @endcond */
} %(c_ns)s_point_t[1];



+ 6
- 2
src/gen_headers/f_field_h.py View File

@@ -10,9 +10,13 @@ f_field_h = gen_file(
#include <string.h>
#include <assert.h>

#include "decaf/decaf_%(gf_bits)s.h" /* HACK in genheader */
#include "word.h"

#define __DECAF_%(gf_shortname)s_GF_DEFINED__ 1
typedef struct gf_%(gf_shortname)s_s {
word_t limb[%(gf_impl_bits)d/sizeof(word_t)/8];
} __attribute__((aligned(32))) gf_%(gf_shortname)s_s, gf_%(gf_shortname)s_t[1];

#define GF_LIT_LIMB_BITS %(gf_lit_limb_bits)d
#define GF_BITS %(gf_bits)d
#define gf gf_%(gf_shortname)s_t
@@ -57,4 +61,4 @@ mask_t gf_deserialize (gf x, const uint8_t serial[(GF_BITS-1)/8+1]);
#endif

#include "f_impl.h" /* Bring in the inline implementations */
""")
""")

+ 22
- 0
src/include/arch_32/arch_intrinsics.h View File

@@ -0,0 +1,22 @@
/* Copyright (c) 2016 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

#ifndef __ARCH_ARCH_32_ARCH_INTRINSICS_H__
#define __ARCH_ARCH_32_ARCH_INTRINSICS_H__

#define WORD_BITS 32

/**
 * Branch-free zero test for a 32-bit word.
 * Returns 0xFFFFFFFF when a == 0 and 0 otherwise.
 */
static __inline__ __attribute((always_inline,unused))
uint32_t word_is_zero(uint32_t a) {
    /* Widen to 64 bits so that subtracting 1 from zero borrows into the
     * upper half; the upper half is then all-ones exactly when a was 0.
     * As before, this relies on the compiler not turning it into a branch. */
    uint64_t widened = a;
    widened -= 1;
    return (uint32_t)(widened >> 32);
}

/** Full 32x32 -> 64 bit unsigned multiply. */
static __inline__ __attribute((always_inline,unused))
uint64_t widemul(uint32_t a, uint32_t b) {
    /* Promote one operand first so the product is computed in 64 bits. */
    const uint64_t wide_a = a;
    return wide_a * b;
}

#endif /* __ARCH_ARCH_32_ARCH_INTRINSICS_H__ */


+ 24
- 0
src/include/arch_arm_32/arch_intrinsics.h View File

@@ -0,0 +1,24 @@
/* Copyright (c) 2016 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

#ifndef __ARCH_ARM_32_ARCH_INTRINSICS_H__
#define __ARCH_ARM_32_ARCH_INTRINSICS_H__

#define WORD_BITS 32

/**
 * Zero test for a 32-bit word, written in ARM inline assembly.
 *
 * "subs" computes a-1 into ret and updates the condition flags; "sbc"
 * then subtracts ret from itself together with the borrow from the first
 * instruction. The intended result is all-ones when a == 0 and 0
 * otherwise, i.e. the same contract as the portable C word_is_zero
 * variants, but without depending on how the compiler lowers the C form.
 */
static __inline__ __attribute((always_inline,unused))
uint32_t word_is_zero(uint32_t a) {
uint32_t ret;
/* "cc" clobber: both instructions touch the condition flags. */
asm("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc");
return ret;
}

/** Full 32x32 -> 64 bit unsigned multiply. */
static __inline__ __attribute((always_inline,unused))
uint64_t widemul(uint32_t a, uint32_t b) {
    /* UMULL would do this directly, but telling the compiler that the
     * registers must be distinct is awkward, so plain C is used instead.
     * Promote one operand first so the product is computed in 64 bits. */
    const uint64_t wide_a = a;
    return wide_a * b;
}

#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */


+ 24
- 0
src/include/arch_neon/arch_intrinsics.h View File

@@ -0,0 +1,24 @@
/* Copyright (c) 2016 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

#ifndef __ARCH_NEON_ARCH_INTRINSICS_H__
#define __ARCH_NEON_ARCH_INTRINSICS_H__

#define WORD_BITS 32

/**
 * Zero test for a 32-bit word, written in ARM inline assembly.
 *
 * "subs" computes a-1 into ret and updates the condition flags; "sbc"
 * then subtracts ret from itself together with the borrow from the first
 * instruction. The intended result is all-ones when a == 0 and 0
 * otherwise, i.e. the same contract as the portable C word_is_zero
 * variants, but without depending on how the compiler lowers the C form.
 */
static __inline__ __attribute((always_inline,unused))
uint32_t word_is_zero(uint32_t a) {
uint32_t ret;
/* "cc" clobber: both instructions touch the condition flags. */
asm("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc");
return ret;
}

/** Full 32x32 -> 64 bit unsigned multiply. */
static __inline__ __attribute((always_inline,unused))
uint64_t widemul(uint32_t a, uint32_t b) {
    /* UMULL would do this directly, but telling the compiler that the
     * registers must be distinct is awkward, so plain C is used instead.
     * Promote one operand first so the product is computed in 64 bits. */
    const uint64_t wide_a = a;
    return wide_a * b;
}

#endif /* __ARCH_NEON_ARCH_INTRINSICS_H__ */


+ 22
- 0
src/include/arch_ref64/arch_intrinsics.h View File

@@ -0,0 +1,22 @@
/* Copyright (c) 2016 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

#ifndef __ARCH_REF64_ARCH_INTRINSICS_H__
#define __ARCH_REF64_ARCH_INTRINSICS_H__

#define WORD_BITS 64

/**
 * Branch-free zero test for a 64-bit word.
 * Returns all-ones (2^64 - 1) when a == 0 and 0 otherwise.
 */
static __inline__ __attribute((always_inline,unused))
uint64_t word_is_zero(uint64_t a) {
    /* Widen to 128 bits so that subtracting 1 from zero borrows into the
     * upper half; the upper half is then all-ones exactly when a was 0.
     * As before, this relies on the compiler not turning it into a branch. */
    __uint128_t widened = a;
    widened -= 1;
    return (uint64_t)(widened >> 64);
}

/**
 * Full 64x64 -> 128 bit unsigned multiply.
 *
 * The return type must be the 128-bit type: declaring it as uint64_t
 * silently discards the high half of the product, which defeats the
 * purpose of a widening multiply (the per-field helpers this function
 * replaces returned __uint128_t).
 *
 * @param a First 64-bit factor.
 * @param b Second 64-bit factor.
 * @return The exact 128-bit product a*b.
 */
static __inline__ __attribute((always_inline,unused))
__uint128_t widemul(uint64_t a, uint64_t b) {
    /* Promote one operand first so the product is computed in 128 bits. */
    return ((__uint128_t)a) * b;
}

#endif /* __ARCH_REF64_ARCH_INTRINSICS_H__ */


+ 3
- 1
src/include/arch_x86_64/arch_intrinsics.h View File

@@ -5,6 +5,8 @@
#ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__
#define __ARCH_X86_64_ARCH_INTRINSICS_H__

#define WORD_BITS 64

#include <stdint.h>

/* FUTURE: non x86-64 versions of these.
@@ -294,7 +296,7 @@ static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *
*acc = (((__uint128_t)(d))<<64) | c;
}

static __inline__ uint64_t is_zero(uint64_t x) {
static __inline__ uint64_t word_is_zero(uint64_t x) {
__asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
return ~x;
}


+ 0
- 1
src/include/field.h View File

@@ -74,7 +74,6 @@ gf_add (

/** Subtract mod p. Bias by 2 and don't reduce */
static inline void gf_sub_nr ( gf c, const gf a, const gf b ) {
// FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] );
gf_sub_RAW(c,a,b);
gf_bias(c, 2);
if (DECAF_WORD_BITS==32) gf_weak_reduce(c); // HACK


+ 3
- 17
src/include/word.h View File

@@ -8,7 +8,7 @@
/* for posix_memalign */
#define _XOPEN_SOURCE 600

#include "arch_config.h"
#include <stdint.h>
#include "arch_intrinsics.h"

#include <decaf/common.h>
@@ -21,7 +21,6 @@
#include <endian.h>
#endif

#include <stdint.h>
#include <stdlib.h>
#include <sys/types.h>
#include <inttypes.h>
@@ -64,7 +63,7 @@
#define U56LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
#define U60LE(x) (x##ull)&((1ull<<30)-1), (x##ull)>>30
#define letohWORD letoh32
#define SC_LIMB(x) (x##ull)
#define SC_LIMB(x) ((uint32_t)x##ull),(x##ull>>32)
#else
#error "For now, libdecaf only supports 32- and 64-bit architectures."
#endif
@@ -159,14 +158,6 @@ typedef struct {
typedef struct {
uint32xn_t unaligned;
} __attribute__((packed)) unaligned_uint32xn_t;
/**
* Return -1 if x==0, and 0 otherwise.
*/
static INLINE UNUSED mask_t
word_is_zero(word_t x) {
return (mask_t)((((dword_t)(x)) - 1)>>WORD_BITS);
}

#if __AVX2__
static INLINE big_register_t
@@ -185,15 +176,10 @@ word_is_zero(word_t x) {
return vceqq_u32(x,x^x);
}
#else
static INLINE mask_t
br_is_zero(word_t x) {
return (((dword_t)x) - 1)>>WORD_BITS;
}
#define br_is_zero word_is_zero
#endif




#ifdef __APPLE__
static INLINE uint64_t htole64 (uint64_t x) { return x; }
static INLINE uint64_t letoh64 (uint64_t x) { return x; }


+ 178
- 0
src/p25519/arch_32/f_impl.c View File

@@ -0,0 +1,178 @@
/* Copyright (c) 2016 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

#include "f_field.h"

/**
 * Field multiplication: writes the product of as and bs into cs.
 *
 * Elements are held as 10 limbs, masked to 26 bits at even indices
 * (maske) and 25 bits at odd indices (masko). Output limb i accumulates
 * every a[j]*b[k] whose indices sum to i, plus the terms that wrap past
 * limb 9, which are folded back in with a factor of 19 (precomputed in
 * bh[]). Products of two odd-indexed limbs are doubled because their
 * combined place value is one bit above the target limb's.
 *
 * cs is __restrict__, so it must not alias as or bs.
 *
 * NOTE(review): the doubled operands (2*b[..], 2*bh[..]) and b[i+1]*19 are
 * formed before the widening multiply; with a 32-bit widemul operand this
 * presumably relies on the inputs being weakly reduced -- confirm.
 */
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
const uint32_t *a = as->limb, *b = bs->limb, maske = ((1<<26)-1), masko = ((1<<25)-1);
uint64_t bh[9];
int i,j;
/* bh[k] = 19*b[k+1]: the multiplier used for terms that wrap past limb 9. */
for (i=0; i<9; i++) bh[i] = b[i+1] * 19;
uint32_t *c = cs->limb;

uint64_t accum = 0;
/* Each pass of the outer loop produces one even and one odd output limb;
 * i and j are advanced by hand inside the body. */
for (i=0; i<10; /*i+=2*/) {
/* Even case. */
/* Terms alternate: (even,even) index pairs are exact, (odd,odd) pairs are doubled. */
for (j=0; j<i; /*j+=2*/) {
accum += widemul(b[i-j], a[j]); j++;
accum += widemul(2*b[i-j], a[j]); j++;
}
/* Here j == i: last non-wrapping term, then the first wrapping (odd,odd) term. */
accum += widemul(b[0], a[j]); j++;
accum += widemul(2*bh[8], a[j]); j++;
for (; j<10; /* j+=2*/) {
accum += widemul(bh[i-j+9], a[j]); j++;
accum += widemul(2*bh[i-j+9], a[j]); j++;
}
c[i] = accum & maske;
accum >>= 26;
i++;

/* Odd case is easier: all place values are exact. */
for (j=0; j<=i; j++) {
accum += widemul(b[i-j], a[j]);
}
for (; j<10; j++) {
accum += widemul(bh[i-j+9], a[j]);
}
c[i] = accum & masko;
accum >>= 25;
i++;
}
/* Fold the carry out of limb 9 back into limb 0 (factor 19), then push
 * the small remaining carry into limb 1 without further propagation. */
accum *= 19;
accum += c[0];
c[0] = accum & maske;
accum >>= 26;
assert(accum < masko);
c[1] += accum;
}

/**
 * Multiply a field element by a small scalar: cs = as * b.
 *
 * b is split at bit 26 into blo (low 26 bits) and bhi (the rest), so that
 * each output limb is blo*a[i] plus bhi times the limb below it. As in
 * gf_mul, limbs are masked to 26 bits at even indices and 25 bits at odd
 * indices; the bhi term is doubled for even output limbs (bhi2) and the
 * wrap-around term for limb 0 uses a[9] with a factor of 38 (= 2*19).
 *
 * cs is __restrict__, so it must not alias as.
 *
 * NOTE(review): bhi is stored in 32 bits and bhi*38 / 2*bhi are computed
 * in 32 bits, so b is presumably expected to be well below 2^52 -- confirm
 * against callers.
 */
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
const uint32_t *a = as->limb, maske = ((1<<26)-1), masko = ((1<<25)-1);
uint32_t blo = b & maske, bhi = b>>26, bhi2 = 2*bhi;
uint32_t *c = cs->limb;
uint64_t accum = 0;

/* Limb 0: the bhi*a[9] term wraps around the top, hence the factor 38. */
accum = widemul(blo, a[0]) + widemul(bhi*38,a[9]);
c[0] = accum & maske;
accum >>= 26;

accum += widemul(blo, a[1]) + widemul(bhi,a[0]);
c[1] = accum & masko;
accum >>= 25;

/* Remaining limbs in even/odd pairs; i is advanced by hand inside the body. */
for (int i=2; i<10; /*i+=2*/) {
accum += widemul(blo, a[i]) + widemul(bhi2, a[i-1]);
c[i] = accum & maske;
accum >>= 26;
i++;

accum += widemul(blo, a[i]) + widemul(bhi, a[i-1]);
c[i] = accum & masko;
accum >>= 25;
i++;
}
/* Fold the carry out of limb 9 back into limb 0 (factor 19), then push
 * the small remaining carry into limb 1 without further propagation. */
accum *= 19;
accum += c[0];
c[0] = accum & maske;
accum >>= 26;
assert(accum < masko);
c[1] += accum;
}

/**
 * Field squaring: cs = as * as.
 *
 * Currently just calls gf_mul with both operands equal; there is no
 * dedicated squaring routine yet (see the PERF marker).
 */
void gf_sqr (gf_s *__restrict__ cs, const gf as) {
gf_mul(cs,as,as); // PERF
}

/**
 * Fully reduce a field element in place.
 *
 * Steps, as implemented below:
 *  1. Fold everything above bit 25 of limb 9 back into limb 0 (factor 19).
 *  2. Subtract p limb by limb with a signed running carry; p's limbs are
 *     maske/masko, except limb 0 which is maske-18.
 *  3. If the subtraction went negative (scarry == -1), add p back using
 *     masks derived from scarry, so no data-dependent branch is taken.
 *
 * The asserts check that the borrow is 0 or -1 and that the final carry
 * cancels it.
 */
void gf_strong_reduce (gf a) {
uint32_t maske = (1<<26)-1, masko = (1<<25)-1;

/* first, clear high */
a->limb[0] += (a->limb[9]>>25)*19;
a->limb[9] &= masko;

/* now the total is less than 2p */

/* compute total_value - p. No need to reduce mod p. */
int64_t scarry = 0;
int i;
/* Even/odd limbs are handled as a pair; i is advanced by hand. */
for (i=0; i<10; /*i+=2*/) {
scarry = scarry + a->limb[i] - ((i==0)?maske-18:maske);
a->limb[i] = scarry & maske;
scarry >>= 26;
i++;

scarry = scarry + a->limb[i] - masko;
a->limb[i] = scarry & masko;
scarry >>= 25;
i++;
}

/* uncommon case: it was >= p, so now scarry = 0 and this = x
* common case: it was < p, so now scarry = -1 and this = x - p + 2^255
* so let's add back in p. will carry back off the top for 2^255.
*/

assert(word_is_zero(scarry) | word_is_zero(scarry+1));

/* scarry is 0 or -1, so these are either 0 or the corresponding limbs of p
 * (limb 0 of p is maske with the bits of 18 cleared, applied below). */
uint32_t scarry_masko = scarry & masko, scarry_maske = scarry & maske;
uint64_t carry = 0;

/* add it back */
for (i=0; i<10; /*i+=2*/) {
carry = carry + a->limb[i] + ((i==0)?(scarry_maske&~18):scarry_maske);
a->limb[i] = carry & maske;
carry >>= 26;
i++;

carry = carry + a->limb[i] + scarry_masko;
a->limb[i] = carry & masko;
carry >>= 25;
i++;
}

assert(word_is_zero(carry + scarry));
}

#define LIMB_PLACE_VALUE(i) (((i)&1)?25:26)
/**
 * Serialize a field element into 32 little-endian bytes.
 *
 * The element is first copied and strongly reduced, then its limbs are
 * streamed through a bit window: a new limb is appended whenever fewer
 * than 8 bits are pending, and one byte is emitted per iteration.
 */
void gf_serialize (uint8_t serial[32], const gf x) {
    gf red;
    gf_copy(red, x);
    gf_strong_reduce(red);

    unsigned int next_limb = 0, bits_held = 0;
    dword_t window = 0;

    for (unsigned int byte = 0; byte < 32; byte++) {
        /* Refill: splice the next limb in above the bits still pending. */
        if (bits_held < 8 && next_limb < sizeof(red->limb)/sizeof(red->limb[0])) {
            window |= ((dword_t)red->limb[next_limb]) << bits_held;
            bits_held += LIMB_PLACE_VALUE(next_limb);
            next_limb++;
        }

        /* Emit the low byte and drop it from the window. */
        serial[byte] = window;
        window >>= 8;
        bits_held -= 8;
    }
}

/**
 * Deserialize 32 little-endian bytes into a field element.
 *
 * Bytes are accumulated into a bit buffer; whenever enough bits for the
 * next limb are available (or the last byte has been read), a limb is
 * peeled off. All limbs are masked to their place value except the one
 * taken on the final byte, which receives whatever is left in the buffer.
 *
 * Known limitations (see the FIXMEs): the input is not checked against
 * the field modulus, and the function currently always returns -1.
 */
mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
unsigned int j=0, fill=0;
dword_t buffer = 0;
for (unsigned int i=0; i<32; i++) {
buffer |= ((dword_t)serial[i]) << fill;
fill += 8;
/* Extract limb j once its bits are all present; force extraction on the last byte. */
if (fill >= LIMB_PLACE_VALUE(j) || i == 31) {
assert(j < sizeof(x->limb)/sizeof(x->limb[0]));
word_t mask = ((1ull)<<LIMB_PLACE_VALUE(j))-1;
x->limb[j] = (i==31) ? buffer : (buffer & mask); // FIXME: this can in theory truncate the buffer if it's not in field.
buffer >>= LIMB_PLACE_VALUE(j);
fill -= LIMB_PLACE_VALUE(j);
j++;
}
}
return -1; // FIXME: test whether in field.
}

+ 40
- 0
src/p25519/arch_32/f_impl.h View File

@@ -0,0 +1,40 @@
/* Copyright (c) 2014-2016 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

/* LIMB(x) expands to two comma-separated initializers: the low 26 bits of
 * the literal x, followed by x>>26. FIELD_LITERAL wraps five such pairs in
 * a doubly-braced initializer, giving 10 limb values in total.
 * NOTE(review): x>>26 is not masked, so each argument is presumably
 * expected to be at most 51 bits wide -- confirm. */
#define LIMB(x) (x##ull)&((1ull<<26)-1), (x##ull)>>26
#define FIELD_LITERAL(a,b,c,d,e) \
{{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e)}}

/**
 * Limb-wise addition: out = a + b, followed by a weak reduction of out.
 */
void gf_add_RAW (gf out, const gf a, const gf b) {
    unsigned int idx = 0;
    while (idx < 10) {
        out->limb[idx] = a->limb[idx] + b->limb[idx];
        idx++;
    }
    /* Bring the limbs back towards their nominal widths. */
    gf_weak_reduce(out);
}

/**
 * Limb-wise subtraction: out = a - b, with a per-limb bias added, followed
 * by a weak reduction of out.
 *
 * The bias is twice the limb mask: 2*(2^26-1) for even limbs, 2*(2^25-1)
 * for odd limbs, and 36 less than the even value for limb 0.
 */
void gf_sub_RAW (gf out, const gf a, const gf b) {
    const uint32_t twice_even = ((1ull<<26)-1)*2;
    const uint32_t twice_odd = ((1ull<<25)-1)*2;
    const uint32_t twice_low = twice_even-36;

    for (unsigned int pair=0; pair<5; pair++) {
        const unsigned int even = 2*pair;
        const unsigned int odd = even+1;
        /* Only the lowest limb uses the adjusted bias. */
        const uint32_t even_bias = (even==0) ? twice_low : twice_even;

        out->limb[even] = a->limb[even] - b->limb[even] + even_bias;
        out->limb[odd] = a->limb[odd] - b->limb[odd] + twice_odd;
    }
    gf_weak_reduce(out);
}

/**
 * Bias hook: intentionally a no-op in this implementation.
 * Both arguments are ignored; the casts to void only silence
 * unused-parameter warnings.
 */
void gf_bias (gf a, int amt) {
(void) a;
(void) amt;
}

/**
 * Weak (single-pass) reduction, in place.
 *
 * Every limb is masked to its nominal width (26 bits for even indices,
 * 25 for odd) and receives the overflow bits of the limb below it. The
 * overflow of limb 9 wraps around into limb 0 multiplied by 19. Carries
 * are moved one position only, so limbs may still slightly exceed their
 * nominal width afterwards.
 */
void gf_weak_reduce (gf a) {
uint32_t maske = (1ull<<26) - 1, masko = (1ull<<25) - 1;
/* Save the top limb's overflow before limb 9 is overwritten below. */
uint32_t tmp = a->limb[9] >> 25;
/* Walk downwards so each limb's overflow is read before that limb is masked. */
for (unsigned int i=8; i>0; i-=2) {
a->limb[i+1] = (a->limb[i+1] & masko) + (a->limb[i]>>26);
a->limb[i] = (a->limb[i] & maske) + (a->limb[i-1]>>25);
}
a->limb[1] = (a->limb[1] & masko) + (a->limb[0]>>26);
a->limb[0] = (a->limb[0] & maske) + tmp*19;
}


+ 0
- 2
src/p25519/arch_ref64/arch_config.h View File

@@ -1,2 +0,0 @@
#define WORD_BITS 64
#define DECAF_255_LIMB_BITS 51

+ 7
- 19
src/p25519/arch_ref64/f_impl.c View File

@@ -4,18 +4,6 @@

#include "f_field.h"

static __inline__ __uint128_t widemul(
const uint64_t a,
const uint64_t b
) {
return ((__uint128_t)a) * ((__uint128_t)b);
}

static __inline__ uint64_t is_zero(uint64_t a) {
/* let's hope the compiler isn't clever enough to optimize this. */
return (((__uint128_t)a)-1)>>64;
}

void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
@@ -95,7 +83,7 @@ void gf_strong_reduce (gf a) {
* so let's add back in p. will carry back off the top for 2^255.
*/

assert(is_zero(scarry) | is_zero(scarry+1));
assert(word_is_zero(scarry) | word_is_zero(scarry+1));

uint64_t scarry_mask = scarry & mask;
__uint128_t carry = 0;
@@ -107,15 +95,15 @@ void gf_strong_reduce (gf a) {
carry >>= 51;
}

assert(is_zero(carry + scarry));
assert(word_is_zero(carry + scarry));
}

void gf_serialize (uint8_t serial[32], const struct gf x) {
void gf_serialize (uint8_t serial[32], const gf x) {
int i,j;
gf red;
gf_copy(&red, x);
gf_strong_reduce(&red);
uint64_t *r = red.limb;
gf_copy(red, x);
gf_strong_reduce(red);
uint64_t *r = red->limb;
uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
for (i=0; i<4; i++) {
for (j=0; j<8; j++) {
@@ -149,5 +137,5 @@ mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask;
x->limb[4] = ser64[3]>>12;
return ~is_zero(~ge);
return ~word_is_zero(~ge);
}

+ 0
- 2
src/p25519/arch_x86_64/arch_config.h View File

@@ -1,2 +0,0 @@
#define WORD_BITS 64
#define DECAF_255_LIMB_BITS 51

+ 3
- 3
src/p25519/arch_x86_64/f_impl.c View File

@@ -194,7 +194,7 @@ void gf_strong_reduce (gf a) {
* so let's add back in p. will carry back off the top for 2^255.
*/

assert(is_zero(scarry) | is_zero(scarry+1));
assert(word_is_zero(scarry) | word_is_zero(scarry+1));

uint64_t scarry_mask = scarry & mask;
__uint128_t carry = 0;
@@ -206,7 +206,7 @@ void gf_strong_reduce (gf a) {
carry >>= 51;
}

assert(is_zero(carry + scarry));
assert(word_is_zero(carry + scarry));
}

void gf_serialize (uint8_t serial[32], const gf x) {
@@ -248,5 +248,5 @@ mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask;
x->limb[4] = ser64[3]>>12;
return ~is_zero(~ge);
return ~word_is_zero(~ge);
}

+ 0
- 2
src/p448/arch_32/arch_config.h View File

@@ -1,2 +0,0 @@
#define WORD_BITS 32
#define DECAF_448_LIMB_BITS 28

+ 4
- 17
src/p448/arch_32/f_impl.c View File

@@ -4,19 +4,6 @@

#include "f_field.h"

static inline mask_t is_zero (word_t x) {
dword_t xx = x;
xx--;
return xx >> WORD_BITS;
}

static uint64_t widemul (
const uint32_t a,
const uint32_t b
) {
return ((uint64_t)a)* b;
}

void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
const uint32_t *a = as->limb, *b = bs->limb;
uint32_t *c = cs->limb;
@@ -141,7 +128,7 @@ void gf_strong_reduce (gf a) {
* so let's add back in p. will carry back off the top for 2^448.
*/

assert(is_zero(scarry) | is_zero(scarry+1));
assert(word_is_zero(scarry) | word_is_zero(scarry+1));

word_t scarry_mask = scarry & mask;
dword_t carry = 0;
@@ -153,7 +140,7 @@ void gf_strong_reduce (gf a) {
carry >>= 28;
}

assert(is_zero(carry + scarry));
assert(word_is_zero(carry + scarry));
}

void gf_serialize (uint8_t *serial, const gf x) {
@@ -195,13 +182,13 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
}
/* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */
ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask);
ge = (ge & (x->limb[8] + 1)) | word_is_zero(x->limb[8] ^ mask);
/* Propagate the rest */
for (i=9; i<16; i++) {
ge &= x->limb[i];
}
return ~is_zero(ge ^ mask);
return ~word_is_zero(ge ^ mask);
}


+ 0
- 2
src/p448/arch_arm_32/arch_config.h View File

@@ -1,2 +0,0 @@
#define WORD_BITS 32
#define DECAF_448_LIMB_BITS 28

+ 4
- 17
src/p448/arch_arm_32/f_impl.c View File

@@ -4,19 +4,6 @@

#include "f_field.h"

static inline mask_t is_zero (word_t x) {
dword_t xx = x;
xx--;
return xx >> WORD_BITS;
}

static uint64_t widemul (
const uint32_t a,
const uint32_t b
) {
return ((uint64_t)a)* b;
}

static inline void __attribute__((gnu_inline,always_inline))
smlal (
uint64_t *acc,
@@ -874,7 +861,7 @@ void gf_strong_reduce (
* so let's add back in p. will carry back off the top for 2^448.
*/

assert(is_zero(scarry) | is_zero(scarry+1));
assert(word_is_zero(scarry) | word_is_zero(scarry+1));

word_t scarry_mask = scarry & mask;
dword_t carry = 0;
@@ -886,7 +873,7 @@ void gf_strong_reduce (
carry >>= 28;
}

assert(is_zero(carry + scarry));
assert(word_is_zero(carry + scarry));
}

void gf_serialize (
@@ -935,12 +922,12 @@ gf_deserialize (
}
/* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */
ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask);
ge = (ge & (x->limb[8] + 1)) | word_is_zero(x->limb[8] ^ mask);
/* Propagate the rest */
for (i=9; i<16; i++) {
ge &= x->limb[i];
}
return ~is_zero(ge ^ mask);
return ~word_is_zero(ge ^ mask);
}

src/p448/arch_neon_experimental/f_impl.c → src/p448/arch_neon/f_impl.c View File

@@ -4,15 +4,6 @@

#include "f_field.h"

static inline mask_t __attribute__((always_inline))
is_zero (
word_t x
) {
dword_t xx = x;
xx--;
return xx >> WORD_BITS;
}

static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused))
xx_vaddup_u64(uint64x2_t x) {
__asm__ ("vadd.s64 %f0, %e0" : "+w"(x));
@@ -629,7 +620,7 @@ void gf_strong_reduce (gf a) {
* so let's add back in p. will carry back off the top for 2^448.
*/

assert(is_zero(scarry) | is_zero(scarry+1));
assert(word_is_zero(scarry) | word_is_zero(scarry+1));

word_t scarry_mask = scarry & mask;
dword_t carry = 0;
@@ -641,7 +632,7 @@ void gf_strong_reduce (gf a) {
carry >>= 28;
}

assert(is_zero(carry + scarry));
assert(word_is_zero(carry + scarry));
}

void gf_serialize (uint8_t *serial, const gf x) {
@@ -684,13 +675,13 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
}
/* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */
ge = (ge & (x->limb[LIMBPERM(8)] + 1)) | is_zero(x->limb[LIMBPERM(8)] ^ mask);
ge = (ge & (x->limb[LIMBPERM(8)] + 1)) | word_is_zero(x->limb[LIMBPERM(8)] ^ mask);
/* Propagate the rest */
for (i=9; i<16; i++) {
ge &= x->limb[LIMBPERM(i)];
}
return ~is_zero(ge ^ mask);
return ~word_is_zero(ge ^ mask);
}


src/p448/arch_neon_experimental/f_impl.h → src/p448/arch_neon/f_impl.h View File


+ 0
- 3
src/p448/arch_neon_experimental/arch_config.h View File

@@ -1,3 +0,0 @@
#define WORD_BITS 32
#define DECAF_448_LIMB_BITS 28


+ 0
- 3
src/p448/arch_ref64/arch_config.h View File

@@ -1,3 +0,0 @@
#define WORD_BITS 64
#define DECAF_448_LIMB_BITS 56


+ 4
- 16
src/p448/arch_ref64/f_impl.c View File

@@ -4,18 +4,6 @@

#include "f_field.h"

static __inline__ __uint128_t widemul(
const uint64_t a,
const uint64_t b
) {
return ((__uint128_t)a) * ((__uint128_t)b);
}

static __inline__ uint64_t is_zero(uint64_t a) {
/* let's hope the compiler isn't clever enough to optimize this. */
return (((__uint128_t)a)-1)>>64;
}

void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
const uint64_t *a = as->limb, *b = bs->limb;
uint64_t *c = cs->limb;
@@ -337,7 +325,7 @@ void gf_strong_reduce (gf a) {
* so let's add back in p. will carry back off the top for 2^448.
*/

assert(is_zero(scarry) | is_zero(scarry+1));
assert(word_is_zero(scarry) | word_is_zero(scarry+1));

uint64_t scarry_mask = scarry & mask;
__uint128_t carry = 0;
@@ -349,7 +337,7 @@ void gf_strong_reduce (gf a) {
carry >>= 56;
}

assert(is_zero(carry + scarry));
assert(word_is_zero(carry + scarry));
}

void gf_serialize (uint8_t *serial, const gf x) {
@@ -389,12 +377,12 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
}
/* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */
ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask);
/* Propagate the rest */
for (i=5; i<8; i++) {
ge &= x->limb[i];
}
return ~is_zero(ge ^ mask);
return ~word_is_zero(ge ^ mask);
}

+ 0
- 2
src/p448/arch_x86_64/arch_config.h View File

@@ -1,2 +0,0 @@
#define WORD_BITS 64
#define DECAF_448_LIMB_BITS 56

+ 4
- 4
src/p448/arch_x86_64/f_impl.c View File

@@ -315,7 +315,7 @@ void gf_strong_reduce (gf a) {
* so let's add back in p. will carry back off the top for 2^448.
*/

assert(is_zero(scarry) | is_zero(scarry+1));
assert(word_is_zero(scarry) | word_is_zero(scarry+1));

uint64_t scarry_mask = scarry & mask;
__uint128_t carry = 0;
@@ -327,7 +327,7 @@ void gf_strong_reduce (gf a) {
carry >>= 56;
}

assert(is_zero(carry + scarry));
assert(word_is_zero(carry + scarry));
}

void gf_serialize (uint8_t *serial, const gf x) {
@@ -367,13 +367,13 @@ mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
}
/* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */
ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask);
/* Propagate the rest */
for (i=5; i<8; i++) {
ge &= x->limb[i];
}
return ~is_zero(ge ^ mask);
return ~word_is_zero(ge ^ mask);
}


+ 0
- 1
src/p480/arch_x86_64/arch_config.h View File

@@ -1 +0,0 @@
#define WORD_BITS 64

+ 4
- 4
src/p480/arch_x86_64/f_impl.c View File

@@ -315,7 +315,7 @@ void gf_strong_reduce (gf *a) {
* so let's add back in p. will carry back off the top for 2^480.
*/

assert(is_zero(scarry) | is_zero(scarry+1));
assert(word_is_zero(scarry) | word_is_zero(scarry+1));

uint64_t scarry_mask = scarry & mask;
__uint128_t carry = 0;
@@ -327,7 +327,7 @@ void gf_strong_reduce (gf *a) {
carry >>= 60;
}

assert(is_zero(carry + scarry));
assert(word_is_zero(carry + scarry));
}

void gf_serialize (uint8_t *serial, const struct gf *x) {
@@ -381,13 +381,13 @@ mask_t gf_deserialize (gf *x, const uint8_t serial[60]) {
}
/* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */
ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
ge = (ge & (x->limb[4] + 1)) | word_is_zero(x->limb[4] ^ mask);
/* Propagate the rest */
for (i=5; i<8; i++) {
ge &= x->limb[i];
}
return ~is_zero(ge ^ mask);
return ~word_is_zero(ge ^ mask);
}


+ 0
- 1
src/p521/arch_ref64/arch_config.h View File

@@ -1 +0,0 @@
#define WORD_BITS 64

+ 4
- 16
src/p521/arch_ref64/f_impl.c View File

@@ -4,18 +4,6 @@

#include "f_field.h"

static __inline__ __uint128_t widemul(
const uint64_t a,
const uint64_t b
) {
return ((__uint128_t)a) * ((__uint128_t)b);
}

static __inline__ uint64_t is_zero(uint64_t a) {
/* let's hope the compiler isn't clever enough to optimize this. */
return (((__uint128_t)a)-1)>>64;
}

void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
uint64_t *c = cs->limb;
const uint64_t *a = as->limb, *b = bs->limb;
@@ -318,7 +306,7 @@ void gf_strong_reduce (gf a) {
* so let's add back in p. will carry back off the top for 2^521.
*/

assert(is_zero(scarry) | is_zero(scarry+1));
assert(word_is_zero(scarry) | word_is_zero(scarry+1));

uint64_t scarry_mask = scarry & mask;
__uint128_t carry = 0;
@@ -330,7 +318,7 @@ void gf_strong_reduce (gf a) {
carry >>= (i==8) ? 57 : 58;
}

assert(is_zero(carry + scarry));
assert(word_is_zero(carry + scarry));
}

void gf_serialize (uint8_t *serial, const struct gf x) {
@@ -367,14 +355,14 @@ mask_t gf_deserialize (gf x, const uint8_t serial[66]) {
}
/* Check for reduction. First, high has to be < 2^57 */
mask_t good = is_zero(out>>57);
mask_t good = word_is_zero(out>>57);
uint64_t and = -1ull;
for (i=0; i<8; i++) {
and &= x->limb[i];
}
and &= (2*out+1);
good &= is_zero((and+1)>>58);
good &= word_is_zero((and+1)>>58);
return good;
}

+ 0
- 1
src/p521/arch_x86_64_r12/arch_config.h View File

@@ -1 +0,0 @@
#define WORD_BITS 64

+ 4
- 9
src/p521/arch_x86_64_r12/f_impl.c View File

@@ -8,11 +8,6 @@ typedef struct {
uint64x3_t lo, hi, hier;
} nonad_t;

static __inline__ uint64_t is_zero(uint64_t a) {
/* let's hope the compiler isn't clever enough to optimize this. */
return (((__uint128_t)a)-1)>>64;
}

static inline __uint128_t widemulu(uint64_t a, uint64_t b) {
return ((__uint128_t)(a)) * b;
}
@@ -378,7 +373,7 @@ void gf_strong_reduce (gf *a) {
* so let's add back in p. will carry back off the top for 2^521.
*/

assert(is_zero(scarry) | is_zero(scarry+1));
assert(word_is_zero(scarry) | word_is_zero(scarry+1));

uint64_t scarry_mask = scarry & mask;
__uint128_t carry = 0;
@@ -390,7 +385,7 @@ void gf_strong_reduce (gf *a) {
carry >>= (i==8) ? 57 : 58;
}

assert(is_zero(carry + scarry));
assert(word_is_zero(carry + scarry));

a->limb[3] = a->limb[7] = a->limb[11] = 0;
}
@@ -429,14 +424,14 @@ mask_t gf_deserialize (gf *x, const uint8_t serial[LIMBPERM(66)]) {
}
/* Check for reduction. First, high has to be < 2^57 */
mask_t good = is_zero(out>>57);
mask_t good = word_is_zero(out>>57);
uint64_t and = -1ull;
for (i=0; i<8; i++) {
and &= x->limb[LIMBPERM(i)];
}
and &= (2*out+1);
good &= is_zero((and+1)>>58);
good &= word_is_zero((and+1)>>58);

x->limb[3] = x->limb[7] = x->limb[11] = 0;


Loading…
Cancel
Save