Browse Source

Internal changes which break compatibility with previous versions

(you knew this would happen).

Added ARM NEON support.

Added support for precomputation on public keys, which speeds up
later signatures and ECDH calls.  See history.txt or the doc for
details.

Reworked internals so that private keys can be derived from any
32-byte secret random value.  This also means that secret keys
can be "compressed" for cold storage.

Added more tests.  Running the tests now requires GMP, though
Goldilocks itself does not.

Linking now uses visibility instead of exported.sym.
master
Michael Hamburg 10 years ago
parent
commit
d4085b9606
25 changed files with 3764 additions and 275 deletions
  1. +43
    -0
      HISTORY.txt
  2. +54
    -15
      Makefile
  3. +170
    -11
      include/goldilocks.h
  4. +20
    -3
      src/arch_arm_32/p448.c
  5. +10
    -2
      src/arch_arm_32/p448.h
  6. +959
    -0
      src/arch_neon/ec_point.c
  7. +150
    -0
      src/arch_neon/neon_emulation.h
  8. +749
    -0
      src/arch_neon/p448.c
  9. +378
    -0
      src/arch_neon/p448.h
  10. +16
    -24
      src/arch_x86_64/p448.c
  11. +19
    -1
      src/arch_x86_64/p448.h
  12. +0
    -6
      src/exported.sym
  13. +293
    -86
      src/goldilocks.c
  14. +2
    -0
      src/include/intrinsics.h
  15. +34
    -6
      src/include/scalarmul.h
  16. +3
    -1
      src/include/sha512.h
  17. +58
    -4
      src/include/word.h
  18. +197
    -72
      src/scalarmul.c
  19. +1
    -13
      src/sha512.c
  20. +68
    -27
      test/bench.c
  21. +14
    -4
      test/test.c
  22. +6
    -0
      test/test.h
  23. +196
    -0
      test/test_arithmetic.c
  24. +195
    -0
      test/test_goldilocks.c
  25. +129
    -0
      test/test_scalarmul.c

+ 43
- 0
HISTORY.txt View File

@@ -1,3 +1,46 @@
May 3, 2104:
Minor changes to internal routines mean that this version is not
compatible with the previous one.

Added ARM NEON code.
Added the ability to precompute multiples of a partner's public key. This
takes slightly longer than a signature verification, but reduces future
verifications with the precomputed key by ~63% and ECDH by ~70%.
goldilocks_precompute_public_key
goldilocks_destroy_precomputed_public_key
goldilocks_verify_precomputed
goldilocks_shared_secret_precomputed
The precomputation feature are is protected by a macro
GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
which can be #defined to 0 to compile these functions out. Unlike most
of Goldilocks' functions, goldilocks_precompute_public_key uses malloc()
(and goldilocks_destroy_precomputed_public_key uses free()).
Changed private keys to be derived from just the symmetric part. This
means that you can compress them to 32 bytes for cold storage, or derive
keypairs from crypto secrets from other systems.
goldilocks_derive_private_key
goldilocks_underive_private_key
goldilocks_private_to_public
Fixed a number of bugs related to vector alignment on Sandy Bridge, which
has AVX but uses SSE2 alignment (because it doesn't have AVX2). Maybe I
should just switch it to use AVX2 alignment?
Beginning to factor out curve-specific magic, so as to build other curves
with the Goldilocks framework. That would enable fair tests against eg
E-521, Ed25519 etc. Still would be a lot of work.
More thorough testing of arithmetic. Now uses GMP for testing framework,
but not in the actual library.
Added some high-level tests for the whole library, including some (bs)
negative testing. Obviously, effective negative testing is a very difficult
proposition in a crypto library.

March 29, 2014:
Added a test directory with various tests. Currently testing SHA512 Monte
Carlo, compatibility of the different scalarmul functions, and some


+ 54
- 15
Makefile View File

@@ -1,23 +1,57 @@
# Copyright (c) 2014 Cryptography Research, Inc.
# Released under the MIT License. See LICENSE.txt for license information.


UNAME := $(shell uname)
MACHINE := $(shell uname -m)

ifeq ($(UNAME),Darwin)
CC = clang
LD = clang
else
CC = gcc
endif
LD = $(CC)

ifneq (,$(findstring x86_64,$(MACHINE)))
ARCH ?= arch_x86_64
else
# no i386 port yet
ARCH ?= arch_arm_32
endif

ARCH = arch_x86_64

WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
-Wgcc-compat -Wmissing-declarations
-Wmissing-declarations -Wunused-function $(EXWARN)
INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH)
LANGFLAGS = -std=c99
GENFLAGS = -ffunction-sections -fdata-sections -fomit-frame-pointer -fPIC
GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC
OFLAGS = -O3
#XFLAGS = -DN_TESTS_BASE=1000
ARCHFLAGS = -mssse3 -maes -mavx2 -DMUST_HAVE_AVX2 -mbmi2
#ARCHFLAGS = -m32 -mcpu=cortex-a9 -mfpu=vfpv3-d16

CFLAGS = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XFLAGS)
LDFLAGS = $(ARCHFLAGS)
ifneq (,$(findstring arm,$(MACHINE)))
ifneq (,$(findstring neon,$(ARCH)))
ARCHFLAGS += -mfpu=neon
else
ARCHFLAGS += -mfpu=vfpv3-d16
endif
ARCHFLAGS += -mcpu=cortex-a9 # FIXME
GENFLAGS = -DN_TESTS_BASE=1000 # sooooo sloooooow
else
ARCHFLAGS += -mssse3 -maes -mavx -mavx2 -DMUST_HAVE_AVX2 -mbmi2 #TODO
endif

ifeq ($(CC),clang)
WARNFLAGS += -Wgcc-compat
endif

ifeq (,$(findstring 64,$(ARCH))$(findstring gcc,$(CC)))
# ARCHFLAGS += -m32
ARCHFLAGS += -DGOLDI_FORCE_32_BIT=1
endif

CFLAGS = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XCFLAGS)
LDFLAGS = $(ARCHFLAGS) $(XLDFLAGS)
ASFLAGS = $(ARCHFLAGS)

.PHONY: clean all test bench todo doc lib
@@ -29,7 +63,7 @@ LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \
build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o

TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \
build/test_pointops.o
build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o

BENCHCOMPONENTS=build/bench.o

@@ -45,15 +79,20 @@ build/bench: $(LIBCOMPONENTS) $(BENCHCOMPONENTS)
$(LD) $(LDFLAGS) -o $@ $^

build/test: $(LIBCOMPONENTS) $(TESTCOMPONENTS)
$(LD) $(LDFLAGS) -o $@ $^
$(LD) $(LDFLAGS) -o $@ $^ -lgmp

lib: build/goldilocks.so

build/goldilocks.so: $(LIBCOMPONENTS)
rm -f $@
ifeq ($(UNAME),Darwin)
libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \
-exported_symbols_list src/exported.sym \
$(LIBCOMPONENTS)
else
$(LD) -shared -Wl,-soname,goldilocks.so.1 -Wl,--gc-sections -o $@ $(LIBCOMPONENTS)
strip --discard-all $@
ln -sf $@ build/goldilocks.so.1
endif

build/timestamp:
mkdir -p build
@@ -80,9 +119,9 @@ doc: Doxyfile doc/timestamp src/*.c src/include/*.h src/$(ARCH)/*.c src/$(ARCH)/

todo::
@(find * -name '*.h'; find * -name '*.c') | xargs egrep --color=auto -w \
'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE'
'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE|MAGIC'
@echo '============================='
@(for i in FIXME BUG XXX TODO HACK PERF FUTURE REMOVE; do \
@(for i in FIXME BUG XXX TODO HACK PERF FUTURE REMOVE MAGIC; do \
(find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i > /dev/null || continue; \
/bin/echo -n $$i' ' | head -c 10; \
(find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i| wc -l; \
@@ -90,7 +129,7 @@ todo::
@echo '============================='
@echo -n 'Total '
@(find * -name '*.h'; find * -name '*.c') | xargs egrep -w \
'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE' | wc -l
'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE|MAGIC' | wc -l

bench: build/bench
./$<


+ 170
- 11
include/goldilocks.h View File

@@ -12,13 +12,42 @@

#include <stdint.h>

#ifndef GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
/** If nonzero, implement precomputation for verify and ECDH. */
#define GOLDI_IMPLEMENT_PRECOMPUTED_KEYS 1
#endif

/** The size of the Goldilocks field, in bits. */
#define GOLDI_FIELD_BITS 448

/** The size of the Goldilocks scalars, in bits. */
#define GOLDI_SCALAR_BITS 446

/** The same size, in bytes. */
#define GOLDI_FIELD_BYTES (GOLDI_FIELD_BITS/8)

/** The size of a Goldilocks public key, in bytes. */
#define GOLDI_PUBLIC_KEY_BYTES GOLDI_FIELD_BYTES

/** The extra bytes in a Goldilocks private key for the symmetric key. */
#define GOLDI_SYMKEY_BYTES 32

/** The size of a shared secret. */
#define GOLDI_SHARED_SECRET_BYTES 64

/** The size of a Goldilocks private key, in bytes. */
#define GOLDI_PRIVATE_KEY_BYTES (2*GOLDI_FIELD_BYTES + GOLDI_SYMKEY_BYTES)

/** The size of a Goldilocks private key, in bytes. */
#define GOLDI_SIGNATURE_BYTES (2*GOLDI_FIELD_BYTES)

/**
* @brief Serialized form of a Goldilocks public key.
*
* @warning This isn't even my final form!
*/
struct goldilocks_public_key_t {
uint8_t opaque[56]; /**< Serialized data. */
uint8_t opaque[GOLDI_PUBLIC_KEY_BYTES]; /**< Serialized data. */
};

/**
@@ -30,7 +59,7 @@ struct goldilocks_public_key_t {
* @warning This isn't even my final form!
*/
struct goldilocks_private_key_t {
uint8_t opaque[144]; /**< Serialized data. */
uint8_t opaque[GOLDI_PRIVATE_KEY_BYTES]; /**< Serialized data. */
};

#ifdef __cplusplus
@@ -72,7 +101,7 @@ static const int GOLDI_EALREADYINIT = 44805;
*/
int
goldilocks_init ()
__attribute__((warn_unused_result));
__attribute__((warn_unused_result,visibility ("default")));


/**
@@ -90,7 +119,40 @@ int
goldilocks_keygen (
struct goldilocks_private_key_t *privkey,
struct goldilocks_public_key_t *pubkey
) __attribute__((warn_unused_result,nonnull(1,2)));
) __attribute__((warn_unused_result,nonnull(1,2),visibility ("default")));

/**
* @brief Derive a key from its compressed form.
* @param [out] privkey The derived private key.
* @param [in] proto The compressed or proto-key, which must be 32 random bytes.
*
* @warning This isn't even my final form!
*
* @retval GOLDI_EOK Success.
* @retval GOLDI_EUNINIT You must call goldilocks_init() first.
*/
int
goldilocks_derive_private_key (
struct goldilocks_private_key_t *privkey,
const unsigned char proto[GOLDI_SYMKEY_BYTES]
) __attribute__((nonnull(1,2),visibility ("default")));

/**
* @brief Compress a private key (by copying out the proto-key)
* @param [out] proto The proto-key.
* @param [in] privkey The private key.
*
* @warning This isn't even my final form!
* @todo test.
*
* @retval GOLDI_EOK Success.
* @retval GOLDI_EUNINIT You must call goldilocks_init() first.
*/
void
goldilocks_underive_private_key (
unsigned char proto[GOLDI_SYMKEY_BYTES],
const struct goldilocks_private_key_t *privkey
) __attribute__((nonnull(1,2),visibility ("default")));

/**
* @brief Extract the public key from a private key.
@@ -107,7 +169,7 @@ int
goldilocks_private_to_public (
struct goldilocks_public_key_t *pubkey,
const struct goldilocks_private_key_t *privkey
) __attribute__((nonnull(1,2)));
) __attribute__((nonnull(1,2),visibility ("default")));

/**
* @brief Generate a Diffie-Hellman shared secret in constant time.
@@ -140,10 +202,10 @@ goldilocks_private_to_public (
*/
int
goldilocks_shared_secret (
uint8_t shared[64],
uint8_t shared[GOLDI_SHARED_SECRET_BYTES],
const struct goldilocks_private_key_t *my_privkey,
const struct goldilocks_public_key_t *your_pubkey
) __attribute__((warn_unused_result,nonnull(1,2,3)));
) __attribute__((warn_unused_result,nonnull(1,2,3),visibility ("default")));
/**
* @brief Sign a message.
@@ -166,11 +228,11 @@ goldilocks_shared_secret (
*/
int
goldilocks_sign (
uint8_t signature_out[56*2],
uint8_t signature_out[GOLDI_SIGNATURE_BYTES],
const uint8_t *message,
uint64_t message_len,
const struct goldilocks_private_key_t *privkey
) __attribute__((nonnull(1,2,4)));
) __attribute__((nonnull(1,2,4),visibility ("default")));

/**
* @brief Verify a signature.
@@ -197,11 +259,108 @@ goldilocks_sign (
*/
int
goldilocks_verify (
const uint8_t signature[56*2],
const uint8_t signature[GOLDI_SIGNATURE_BYTES],
const uint8_t *message,
uint64_t message_len,
const struct goldilocks_public_key_t *pubkey
) __attribute__((warn_unused_result,nonnull(1,2,4)));
) __attribute__((warn_unused_result,nonnull(1,2,4),visibility ("default")));

#if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS

/** A public key which has been expanded by precomputation for higher speed. */
struct goldilocks_precomputed_public_key_t;

/**
* @brief Expand a public key by precomputation.
*
* @todo Give actual error returns, instead of ambiguous NULL.
*
* @warning This isn't even my final form!
*
* @param [in] pub The public key.
* @retval NULL We ran out of memory, or the
*/
struct goldilocks_precomputed_public_key_t *
goldilocks_precompute_public_key (
const struct goldilocks_public_key_t *pub
) __attribute__((warn_unused_result,nonnull(1),visibility ("default")));

/**
* @brief Overwrite an expanded public key with zeros, then destroy it.
*
* If the input is NULL, this function does nothing.
*
* @param [in] precom The public key.
*/
void
goldilocks_destroy_precomputed_public_key (
struct goldilocks_precomputed_public_key_t *precom
) __attribute__((visibility ("default")));

/**
* @brief Verify a signature.
*
* This function is fairly strict. It will correctly detect when
* the signature has the wrong cofactor component, or when the sig
* values aren't less than p or q.
*
* @warning This isn't even my final form!
*
* @param [in] signature The signature.
* @param [in] message The message to be verified.
* @param [in] message_len The length of the message to be verified.
* @param [in] pubkey The signer's public key, expanded by precomputation.
*
* @retval GOLDI_EOK Success.
* @retval GOLDI_EINVAL The public key or signature is corrupt.
* @retval GOLDI_EUNINIT You must call goldilocks_init() first.
*/
int
goldilocks_verify_precomputed (
const uint8_t signature[GOLDI_SIGNATURE_BYTES],
const uint8_t *message,
uint64_t message_len,
const struct goldilocks_precomputed_public_key_t *pubkey
) __attribute__((warn_unused_result,nonnull(1,2,4),visibility ("default")));
/**
* @brief Generate a Diffie-Hellman shared secret in constant time.
* Uses a precomputation on the other party's public key for efficiency.
*
* This function uses some compile-time flags whose merit remains to
* be decided.
*
* If the flag EXPERIMENT_ECDH_OBLITERATE_CT is set, prepend 40 bytes
* of zeros to the secret before hashing. In the case that the other
* party's key is detectably corrupt, instead the symmetric part
* of the secret key is used to produce a pseudorandom value.
*
* If EXPERIMENT_ECDH_STIR_IN_PUBKEYS is set, the sum and product of
* the two parties' public keys is prepended to the hash.
*
* In the current version, this function can safely be run even without
* goldilocks_init(). But this property is not guaranteed for future
* versions, so call it anyway.
*
* @warning This isn't even my final form!
*
* @param [out] shared The shared secret established with the other party.
* @param [in] my_privkey My private key.
* @param [in] your_pubkey The other party's precomputed public key.
*
* @retval GOLDI_EOK Success.
* @retval GOLDI_ECORRUPT My key is corrupt.
* @retval GOLDI_EINVAL The other party's key is corrupt.
* @retval GOLDI_EUNINIT You must call goldilocks_init() first.
*/
int
goldilocks_shared_secret_precomputed (
uint8_t shared[GOLDI_SHARED_SECRET_BYTES],
const struct goldilocks_private_key_t *my_privkey,
const struct goldilocks_precomputed_public_key_t *your_pubkey
) __attribute__((warn_unused_result,nonnull(1,2,3),visibility ("default")));

#endif /* GOLDI_IMPLEMENT_PRECOMPUTED_KEYS */

#ifdef __cplusplus
}; /* extern "C" */


+ 20
- 3
src/arch_arm_32/p448.c View File

@@ -28,6 +28,8 @@ smlal (
const uint32_t a,
const uint32_t b
) {

#ifdef __ARMEL__
uint32_t lo = *acc, hi = (*acc)>>32;
__asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
@@ -35,6 +37,9 @@ smlal (
: [a]"r"(a), [b]"r"(b));
*acc = lo + (((uint64_t)hi)<<32);
#else
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
#endif
}

static inline void __attribute__((gnu_inline,always_inline))
@@ -43,6 +48,7 @@ smlal2 (
const uint32_t a,
const uint32_t b
) {
#ifdef __ARMEL__
uint32_t lo = *acc, hi = (*acc)>>32;
__asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
@@ -50,6 +56,9 @@ smlal2 (
: [a]"r"(a), [b]"r"(2*b));
*acc = lo + (((uint64_t)hi)<<32);
#else
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2);
#endif
}

static inline void __attribute__((gnu_inline,always_inline))
@@ -58,6 +67,7 @@ smull (
const uint32_t a,
const uint32_t b
) {
#ifdef __ARMEL__
uint32_t lo, hi;
__asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]"
@@ -65,6 +75,9 @@ smull (
: [a]"r"(a), [b]"r"(b));
*acc = lo + (((uint64_t)hi)<<32);
#else
*acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
#endif
}

static inline void __attribute__((gnu_inline,always_inline))
@@ -73,6 +86,7 @@ smull2 (
const uint32_t a,
const uint32_t b
) {
#ifdef __ARMEL__
uint32_t lo, hi;
__asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]"
@@ -80,6 +94,9 @@ smull2 (
: [a]"r"(a), [b]"r"(2*b));
*acc = lo + (((uint64_t)hi)<<32);
#else
*acc = (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2);
#endif
}

void
@@ -760,13 +777,13 @@ p448_mulw (
const p448_t *as,
uint64_t b
) {
const uint32_t bhi = b>>28, blo = b & (1<<28)-1;
uint32_t mask = (1ull<<28)-1;
const uint32_t bhi = b>>28, blo = b & mask;
const uint32_t *a = as->limb;
uint32_t *c = cs->limb;

uint64_t accum0, accum8;
uint32_t mask = (1ull<<28)-1;

int i;

@@ -957,7 +974,7 @@ p448_deserialize (
for (j=0; j<7; j++) {
out |= ((uint64_t)serial[7*i+j])<<(8*j);
}
x->limb[2*i] = out & (1ull<<28)-1;
x->limb[2*i] = out & ((1ull<<28)-1);
x->limb[2*i+1] = out >> 28;
}


+ 10
- 2
src/arch_arm_32/p448.h View File

@@ -173,7 +173,7 @@ p448_set_ui (
uint64_t x
) {
int i;
out->limb[0] = x & (1<<28)-1;
out->limb[0] = x & ((1<<28)-1);
out->limb[1] = x>>28;
for (i=2; i<16; i++) {
out->limb[i] = 0;
@@ -188,7 +188,11 @@ p448_cond_swap (
) {
big_register_t *aa = (big_register_t*)a;
big_register_t *bb = (big_register_t*)b;
#if __ARM_NEON__
big_register_t m = vdupq_n_u32(doswap);
#else
big_register_t m = doswap;
#endif

unsigned int i;
for (i=0; i<sizeof(*a)/sizeof(*aa); i++) {
@@ -260,8 +264,12 @@ p448_cond_neg(
struct p448_t negated;
big_register_t *aa = (big_register_t *)a;
big_register_t *nn = (big_register_t*)&negated;
#if __ARM_NEON__
big_register_t m = vdupq_n_u32(doNegate);
#else
big_register_t m = doNegate;
#endif

p448_neg(&negated, a);
p448_bias(&negated, 2);


+ 959
- 0
src/arch_neon/ec_point.c View File

@@ -0,0 +1,959 @@
/**
* @cond internal
* @file ec_point.c
* @copyright
* Copyright (c) 2014 Cryptography Research, Inc. \n
* Released under the MIT License. See LICENSE.txt for license information.
* @author Mike Hamburg
* @warning This file was automatically generated.
*/

#include "ec_point.h"


void
p448_isr (
struct p448_t* a,
const struct p448_t* x
) {
struct p448_t L0, L1, L2;
p448_sqr ( &L1, x );
p448_mul ( &L2, x, &L1 );
p448_sqr ( &L1, &L2 );
p448_mul ( &L2, x, &L1 );
p448_sqrn ( &L1, &L2, 3 );
p448_mul ( &L0, &L2, &L1 );
p448_sqrn ( &L1, &L0, 3 );
p448_mul ( &L0, &L2, &L1 );
p448_sqrn ( &L2, &L0, 9 );
p448_mul ( &L1, &L0, &L2 );
p448_sqr ( &L0, &L1 );
p448_mul ( &L2, x, &L0 );
p448_sqrn ( &L0, &L2, 18 );
p448_mul ( &L2, &L1, &L0 );
p448_sqrn ( &L0, &L2, 37 );
p448_mul ( &L1, &L2, &L0 );
p448_sqrn ( &L0, &L1, 37 );
p448_mul ( &L1, &L2, &L0 );
p448_sqrn ( &L0, &L1, 111 );
p448_mul ( &L2, &L1, &L0 );
p448_sqr ( &L0, &L2 );
p448_mul ( &L1, x, &L0 );
p448_sqrn ( &L0, &L1, 223 );
p448_mul ( a, &L2, &L0 );
}

void
p448_inverse (
struct p448_t* a,
const struct p448_t* x
) {
struct p448_t L0, L1;
p448_isr ( &L0, x );
p448_sqr ( &L1, &L0 );
p448_sqr ( &L0, &L1 );
p448_mul ( a, x, &L0 );
}

void
add_tw_niels_to_tw_extensible (
struct tw_extensible_t* d,
const struct tw_niels_t* e
) {
struct p448_t L0, L1;
p448_sub ( &L1, &d->y, &d->x );
p448_bias ( &L1, 2 );
p448_weak_reduce( &L1 );
p448_mul ( &L0, &e->a, &L1 );
p448_add ( &L1, &d->x, &d->y );
p448_mul ( &d->y, &e->b, &L1 );
p448_mul ( &L1, &d->u, &d->t );
p448_mul ( &d->x, &e->c, &L1 );
p448_add ( &d->u, &L0, &d->y );
p448_sub ( &d->t, &d->y, &L0 );
p448_bias ( &d->t, 2 );
p448_weak_reduce( &d->t );
p448_sub ( &d->y, &d->z, &d->x );
p448_bias ( &d->y, 2 );
p448_weak_reduce( &d->y );
p448_add ( &L0, &d->x, &d->z );
p448_mul ( &d->z, &L0, &d->y );
p448_mul ( &d->x, &d->y, &d->t );
p448_mul ( &d->y, &L0, &d->u );
}

void
sub_tw_niels_from_tw_extensible (
struct tw_extensible_t* d,
const struct tw_niels_t* e
) {
struct p448_t L0, L1;
p448_sub ( &L1, &d->y, &d->x );
p448_bias ( &L1, 2 );
p448_weak_reduce( &L1 );
p448_mul ( &L0, &e->b, &L1 );
p448_add ( &L1, &d->x, &d->y );
p448_mul ( &d->y, &e->a, &L1 );
p448_mul ( &L1, &d->u, &d->t );
p448_mul ( &d->x, &e->c, &L1 );
p448_add ( &d->u, &L0, &d->y );
p448_sub ( &d->t, &d->y, &L0 );
p448_bias ( &d->t, 2 );
p448_weak_reduce( &d->t );
p448_add ( &d->y, &d->x, &d->z );
p448_sub ( &L0, &d->z, &d->x );
p448_bias ( &L0, 2 );
p448_weak_reduce( &L0 );
p448_mul ( &d->z, &L0, &d->y );
p448_mul ( &d->x, &d->y, &d->t );
p448_mul ( &d->y, &L0, &d->u );
}

void
add_tw_pniels_to_tw_extensible (
struct tw_extensible_t* e,
const struct tw_pniels_t* a
) {
struct p448_t L0;
p448_mul ( &L0, &e->z, &a->z );
p448_copy ( &e->z, &L0 );
add_tw_niels_to_tw_extensible( e, &a->n );
}

void
sub_tw_pniels_from_tw_extensible (
struct tw_extensible_t* e,
const struct tw_pniels_t* a
) {
struct p448_t L0;
p448_mul ( &L0, &e->z, &a->z );
p448_copy ( &e->z, &L0 );
sub_tw_niels_from_tw_extensible( e, &a->n );
}

void
double_tw_extensible (
struct tw_extensible_t* a
) {
struct p448_t L0, L1, L2;
p448_sqr ( &L2, &a->x );
p448_sqr ( &L0, &a->y );
p448_add ( &a->u, &L2, &L0 );
p448_add ( &a->t, &a->y, &a->x );
p448_sqr ( &L1, &a->t );
p448_sub ( &a->t, &L1, &a->u );
p448_bias ( &a->t, 3 );
p448_weak_reduce( &a->t );
p448_sub ( &L1, &L0, &L2 );
p448_bias ( &L1, 2 );
p448_weak_reduce( &L1 );
p448_sqr ( &a->x, &a->z );
p448_bias ( &a->x, 1 );
p448_add ( &a->z, &a->x, &a->x );
p448_sub ( &L0, &a->z, &L1 );
p448_weak_reduce( &L0 );
p448_mul ( &a->z, &L1, &L0 );
p448_mul ( &a->x, &L0, &a->t );
p448_mul ( &a->y, &L1, &a->u );
}

void
double_extensible (
struct extensible_t* a
) {
struct p448_t L0, L1, L2;
p448_sqr ( &L2, &a->x );
p448_sqr ( &L0, &a->y );
p448_add ( &L1, &L2, &L0 );
p448_add ( &a->t, &a->y, &a->x );
p448_sqr ( &a->u, &a->t );
p448_sub ( &a->t, &a->u, &L1 );
p448_bias ( &a->t, 3 );
p448_weak_reduce( &a->t );
p448_sub ( &a->u, &L0, &L2 );
p448_bias ( &a->u, 2 );
p448_weak_reduce( &a->u );
p448_sqr ( &a->x, &a->z );
p448_bias ( &a->x, 2 );
p448_add ( &a->z, &a->x, &a->x );
p448_sub ( &L0, &a->z, &L1 );
p448_weak_reduce( &L0 );
p448_mul ( &a->z, &L1, &L0 );
p448_mul ( &a->x, &L0, &a->t );
p448_mul ( &a->y, &L1, &a->u );
}

void
twist_and_double (
struct tw_extensible_t* b,
const struct extensible_t* a
) {
struct p448_t L0;
p448_sqr ( &b->x, &a->x );
p448_sqr ( &b->z, &a->y );
p448_add ( &b->u, &b->x, &b->z );
p448_add ( &b->t, &a->y, &a->x );
p448_sqr ( &L0, &b->t );
p448_sub ( &b->t, &L0, &b->u );
p448_bias ( &b->t, 3 );
p448_weak_reduce( &b->t );
p448_sub ( &L0, &b->z, &b->x );
p448_bias ( &L0, 2 );
p448_weak_reduce( &L0 );
p448_sqr ( &b->x, &a->z );
p448_bias ( &b->x, 2 );
p448_add ( &b->z, &b->x, &b->x );
p448_sub ( &b->y, &b->z, &b->u );
p448_weak_reduce( &b->y );
p448_mul ( &b->z, &L0, &b->y );
p448_mul ( &b->x, &b->y, &b->t );
p448_mul ( &b->y, &L0, &b->u );
}

void
untwist_and_double (
struct extensible_t* b,
const struct tw_extensible_t* a
) {
struct p448_t L0;
p448_sqr ( &b->x, &a->x );
p448_sqr ( &b->z, &a->y );
p448_add ( &L0, &b->x, &b->z );
p448_add ( &b->t, &a->y, &a->x );
p448_sqr ( &b->u, &b->t );
p448_sub ( &b->t, &b->u, &L0 );
p448_bias ( &b->t, 3 );
p448_weak_reduce( &b->t );
p448_sub ( &b->u, &b->z, &b->x );
p448_bias ( &b->u, 2 );
p448_weak_reduce( &b->u );
p448_sqr ( &b->x, &a->z );
p448_bias ( &b->x, 1 );
p448_add ( &b->z, &b->x, &b->x );
p448_sub ( &b->y, &b->z, &b->u );
p448_weak_reduce( &b->y );
p448_mul ( &b->z, &L0, &b->y );
p448_mul ( &b->x, &b->y, &b->t );
p448_mul ( &b->y, &L0, &b->u );
}

void
convert_tw_affine_to_tw_pniels (
struct tw_pniels_t* b,
const struct tw_affine_t* a
) {
p448_sub ( &b->n.a, &a->y, &a->x );
p448_bias ( &b->n.a, 2 );
p448_weak_reduce( &b->n.a );
p448_add ( &b->n.b, &a->x, &a->y );
p448_weak_reduce( &b->n.b );
p448_mul ( &b->n.c, &a->y, &a->x );
p448_mulw ( &b->z, &b->n.c, 78164 );
p448_neg ( &b->n.c, &b->z );
p448_bias ( &b->n.c, 2 );
p448_weak_reduce( &b->n.c );
p448_set_ui( &b->z, 2 );
}

void
convert_tw_affine_to_tw_extensible (
struct tw_extensible_t* b,
const struct tw_affine_t* a
) {
p448_copy ( &b->x, &a->x );
p448_copy ( &b->y, &a->y );
p448_set_ui( &b->z, 1 );
p448_copy ( &b->t, &a->x );
p448_copy ( &b->u, &a->y );
}

void
convert_affine_to_extensible (
struct extensible_t* b,
const struct affine_t* a
) {
p448_copy ( &b->x, &a->x );
p448_copy ( &b->y, &a->y );
p448_set_ui( &b->z, 1 );
p448_copy ( &b->t, &a->x );
p448_copy ( &b->u, &a->y );
}

void
convert_tw_extensible_to_tw_pniels (
struct tw_pniels_t* b,
const struct tw_extensible_t* a
) {
p448_sub ( &b->n.a, &a->y, &a->x );
p448_bias ( &b->n.a, 2 );
p448_weak_reduce( &b->n.a );
p448_add ( &b->n.b, &a->x, &a->y );
p448_weak_reduce( &b->n.b );
p448_mul ( &b->n.c, &a->u, &a->t );
p448_mulw ( &b->z, &b->n.c, 78164 );
p448_neg ( &b->n.c, &b->z );
p448_bias ( &b->n.c, 2 );
p448_weak_reduce( &b->n.c );
p448_add ( &b->z, &a->z, &a->z );
p448_weak_reduce( &b->z );
}

void
convert_tw_pniels_to_tw_extensible (
struct tw_extensible_t* e,
const struct tw_pniels_t* d
) {
p448_add ( &e->u, &d->n.b, &d->n.a );
p448_sub ( &e->t, &d->n.b, &d->n.a );
p448_bias ( &e->t, 2 );
p448_weak_reduce( &e->t );
p448_mul ( &e->x, &d->z, &e->t );
p448_mul ( &e->y, &d->z, &e->u );
p448_sqr ( &e->z, &d->z );
}

void
convert_tw_niels_to_tw_extensible (
struct tw_extensible_t* e,
const struct tw_niels_t* d
) {
p448_add ( &e->y, &d->b, &d->a );
p448_weak_reduce( &e->y );
p448_sub ( &e->x, &d->b, &d->a );
p448_bias ( &e->x, 2 );
p448_weak_reduce( &e->x );
p448_set_ui( &e->z, 1 );
p448_copy ( &e->t, &e->x );
p448_copy ( &e->u, &e->y );
}

void
montgomery_step (
struct montgomery_t* a
) {
struct p448_t L0, L1;
p448_add ( &L0, &a->zd, &a->xd );
p448_sub ( &L1, &a->xd, &a->zd );
p448_bias ( &L1, 2 );
p448_weak_reduce( &L1 );
p448_sub ( &a->zd, &a->xa, &a->za );
p448_bias ( &a->zd, 2 );
p448_weak_reduce( &a->zd );
p448_mul ( &a->xd, &L0, &a->zd );
p448_add ( &a->zd, &a->za, &a->xa );
p448_mul ( &a->za, &L1, &a->zd );
p448_add ( &a->xa, &a->za, &a->xd );
p448_sqr ( &a->zd, &a->xa );
p448_mul ( &a->xa, &a->z0, &a->zd );
p448_sub ( &a->zd, &a->xd, &a->za );
p448_bias ( &a->zd, 2 );
p448_weak_reduce( &a->zd );
p448_sqr ( &a->za, &a->zd );
p448_sqr ( &a->xd, &L0 );
p448_sqr ( &L0, &L1 );
p448_mulw ( &a->zd, &a->xd, 39082 );
p448_sub ( &L1, &a->xd, &L0 );
p448_bias ( &L1, 2 );
p448_weak_reduce( &L1 );
p448_mul ( &a->xd, &L0, &a->zd );
p448_sub ( &L0, &a->zd, &L1 );
p448_bias ( &L0, 2 );
p448_weak_reduce( &L0 );
p448_mul ( &a->zd, &L0, &L1 );
}

void
deserialize_montgomery (
struct montgomery_t* a,
const struct p448_t* sbz
) {
p448_sqr ( &a->z0, sbz );
p448_set_ui( &a->xd, 1 );
p448_set_ui( &a->zd, 0 );
p448_set_ui( &a->xa, 1 );
p448_copy ( &a->za, &a->z0 );
}

mask_t
serialize_montgomery (
struct p448_t* b,
const struct montgomery_t* a,
const struct p448_t* sbz
) {
mask_t L0, L1, L2;
struct p448_t L3, L4, L5, L6;
p448_mul ( &L6, &a->z0, &a->zd );
p448_sub ( &L4, &L6, &a->xd );
p448_bias ( &L4, 2 );
p448_weak_reduce( &L4 );
p448_mul ( &L6, &a->za, &L4 );
p448_mul ( &L5, &a->z0, &a->xd );
p448_sub ( &L4, &L5, &a->zd );
p448_bias ( &L4, 2 );
p448_weak_reduce( &L4 );
p448_mul ( &L3, &a->xa, &L4 );
p448_add ( &L5, &L3, &L6 );
p448_sub ( &L4, &L6, &L3 );
p448_bias ( &L4, 2 );
p448_weak_reduce( &L4 );
p448_mul ( &L6, &L4, &L5 );
p448_copy ( &L5, &a->z0 );
p448_addw ( &L5, 1 );
p448_sqr ( &L4, &L5 );
p448_mulw ( &L5, &L4, 39082 );
p448_neg ( &L4, &L5 );
p448_add ( &L5, &a->z0, &a->z0 );
p448_bias ( &L5, 1 );
p448_add ( &L3, &L5, &L5 );
p448_add ( &L5, &L3, &L4 );
p448_weak_reduce( &L5 );
p448_mul ( &L3, &a->xd, &L5 );
L1 = p448_is_zero( &a->zd );
L2 = - L1;
p448_mask ( &L4, &L3, L1 );
p448_add ( &L5, &L4, &a->zd );
L0 = ~ L1;
p448_mul ( &L4, sbz, &L6 );
p448_addw ( &L4, L2 );
p448_mul ( &L6, &L5, &L4 );
p448_mul ( &L4, &L6, &L5 );
p448_mul ( &L5, &L6, &a->xd );
p448_mul ( &L6, &L4, &L5 );
p448_isr ( &L3, &L6 );
p448_mul ( &L5, &L4, &L3 );
p448_sqr ( &L4, &L3 );
p448_mul ( &L3, &L6, &L4 );
p448_mask ( b, &L5, L0 );
p448_subw ( &L3, 1 );
p448_bias ( &L3, 1 );
L1 = p448_is_zero( &L3 );
L0 = p448_is_zero( sbz );
return L1 | L0;
}

void
serialize_extensible (
struct p448_t* b,
const struct extensible_t* a
) {
struct p448_t L0, L1, L2;
p448_sub ( &L0, &a->y, &a->z );
p448_bias ( &L0, 2 );
p448_weak_reduce( &L0 );
p448_add ( b, &a->z, &a->y );
p448_mul ( &L1, &a->z, &a->x );
p448_mul ( &L2, &L0, &L1 );
p448_mul ( &L1, &L2, &L0 );
p448_mul ( &L0, &L2, b );
p448_mul ( &L2, &L1, &L0 );
p448_isr ( &L0, &L2 );
p448_mul ( b, &L1, &L0 );
p448_sqr ( &L1, &L0 );
p448_mul ( &L0, &L2, &L1 );
}

void
untwist_and_double_and_serialize (
struct p448_t* b,
const struct tw_extensible_t* a
) {
struct p448_t L0, L1, L2, L3;
p448_mul ( &L3, &a->y, &a->x );
p448_add ( b, &a->y, &a->x );
p448_sqr ( &L1, b );
p448_add ( &L2, &L3, &L3 );
p448_sub ( b, &L1, &L2 );
p448_bias ( b, 3 );
p448_weak_reduce( b );
p448_sqr ( &L2, &a->z );
p448_sqr ( &L1, &L2 );
p448_add ( &L2, b, b );
p448_mulw ( b, &L2, 39082 );
p448_neg ( &L2, b );
p448_bias ( &L2, 2 );
p448_mulw ( &L0, &L2, 39082 );
p448_neg ( b, &L0 );
p448_bias ( b, 2 );
p448_mul ( &L0, &L2, &L1 );
p448_mul ( &L2, b, &L0 );
p448_isr ( &L0, &L2 );
p448_mul ( &L1, b, &L0 );
p448_sqr ( b, &L0 );
p448_mul ( &L0, &L2, b );
p448_mul ( b, &L1, &L3 );
}

void
twist_even (
struct tw_extensible_t* b,
const struct extensible_t* a
) {
mask_t L0, L1;
p448_sqr ( &b->y, &a->z );
p448_sqr ( &b->z, &a->x );
p448_sub ( &b->u, &b->y, &b->z );
p448_bias ( &b->u, 2 );
p448_weak_reduce( &b->u );
p448_sub ( &b->z, &a->z, &a->x );
p448_bias ( &b->z, 2 );
p448_weak_reduce( &b->z );
p448_mul ( &b->y, &b->z, &a->y );
p448_sub ( &b->z, &a->z, &a->y );
p448_bias ( &b->z, 2 );
p448_weak_reduce( &b->z );
p448_mul ( &b->x, &b->z, &b->y );
p448_mul ( &b->t, &b->x, &b->u );
p448_mul ( &b->y, &b->x, &b->t );
p448_isr ( &b->t, &b->y );
p448_mul ( &b->u, &b->x, &b->t );
p448_sqr ( &b->x, &b->t );
p448_mul ( &b->t, &b->y, &b->x );
p448_mul ( &b->x, &a->x, &b->u );
p448_mul ( &b->y, &a->y, &b->u );
L1 = p448_is_zero( &b->z );
L0 = - L1;
p448_addw ( &b->y, L0 );
p448_weak_reduce( &b->y );
p448_set_ui( &b->z, 1 );
p448_copy ( &b->t, &b->x );
p448_copy ( &b->u, &b->y );
}

void
test_only_twist (
struct tw_extensible_t* b,
const struct extensible_t* a
) {
mask_t L0, L1;
struct p448_t L2, L3;
p448_sqr ( &b->u, &a->z );
p448_sqr ( &b->y, &a->x );
p448_sub ( &b->z, &b->u, &b->y );
p448_bias ( &b->z, 2 );
p448_add ( &b->y, &b->z, &b->z );
p448_add ( &b->u, &b->y, &b->y );
p448_weak_reduce( &b->u );
p448_sub ( &b->y, &a->z, &a->x );
p448_bias ( &b->y, 2 );
p448_weak_reduce( &b->y );
p448_mul ( &b->x, &b->y, &a->y );
p448_sub ( &b->z, &a->z, &a->y );
p448_bias ( &b->z, 2 );
p448_weak_reduce( &b->z );
p448_mul ( &b->t, &b->z, &b->x );
p448_mul ( &L3, &b->t, &b->u );
p448_mul ( &b->x, &b->t, &L3 );
p448_isr ( &L2, &b->x );
p448_mul ( &b->u, &b->t, &L2 );
p448_sqr ( &L3, &L2 );
p448_mul ( &b->t, &b->x, &L3 );
p448_add ( &b->x, &a->y, &a->x );
p448_weak_reduce( &b->x );
p448_sub ( &L2, &a->x, &a->y );
p448_bias ( &L2, 2 );
p448_weak_reduce( &L2 );
p448_mul ( &L3, &b->t, &L2 );
p448_add ( &L2, &L3, &b->x );
p448_sub ( &b->t, &b->x, &L3 );
p448_bias ( &b->t, 2 );
p448_weak_reduce( &b->t );
p448_mul ( &b->x, &L2, &b->u );
L0 = p448_is_zero( &b->y );
L1 = - L0;
p448_addw ( &b->x, L1 );
p448_weak_reduce( &b->x );
p448_mul ( &b->y, &b->t, &b->u );
L0 = p448_is_zero( &b->z );
L1 = - L0;
p448_addw ( &b->y, L1 );
p448_weak_reduce( &b->y );
L1 = p448_is_zero( &a->y );
L0 = L1 + 1;
p448_set_ui( &b->z, L0 );
p448_copy ( &b->t, &b->x );
p448_copy ( &b->u, &b->y );
}

mask_t
is_square (
const struct p448_t* x
) {
mask_t L0, L1;
struct p448_t L2, L3;
p448_isr ( &L2, x );
p448_sqr ( &L3, &L2 );
p448_mul ( &L2, x, &L3 );
p448_subw ( &L2, 1 );
p448_bias ( &L2, 1 );
L1 = p448_is_zero( &L2 );
L0 = p448_is_zero( x );
return L1 | L0;
}

mask_t
is_even_pt (
const struct extensible_t* a
) {
struct p448_t L0, L1, L2;
p448_sqr ( &L2, &a->z );
p448_sqr ( &L1, &a->x );
p448_sub ( &L0, &L2, &L1 );
p448_bias ( &L0, 2 );
p448_weak_reduce( &L0 );
return is_square ( &L0 );
}

mask_t
is_even_tw (
const struct tw_extensible_t* a
) {
struct p448_t L0, L1, L2;
p448_sqr ( &L2, &a->z );
p448_sqr ( &L1, &a->x );
p448_add ( &L0, &L1, &L2 );
p448_weak_reduce( &L0 );
return is_square ( &L0 );
}

mask_t
deserialize_affine (
struct affine_t* a,
const struct p448_t* sz
) {
struct p448_t L0, L1, L2, L3;
p448_sqr ( &L1, sz );
p448_copy ( &L3, &L1 );
p448_addw ( &L3, 1 );
p448_sqr ( &a->x, &L3 );
p448_mulw ( &L3, &a->x, 39082 );
p448_neg ( &a->x, &L3 );
p448_add ( &L3, &L1, &L1 );
p448_bias ( &L3, 1 );
p448_add ( &a->y, &L3, &L3 );
p448_add ( &L3, &a->y, &a->x );
p448_weak_reduce( &L3 );
p448_copy ( &a->y, &L1 );
p448_subw ( &a->y, 1 );
p448_neg ( &a->x, &a->y );
p448_bias ( &a->x, 2 );
p448_weak_reduce( &a->x );
p448_mul ( &a->y, &a->x, &L3 );
p448_sqr ( &L2, &a->x );
p448_mul ( &L0, &L2, &a->y );
p448_mul ( &a->y, &a->x, &L0 );
p448_isr ( &L3, &a->y );
p448_mul ( &a->y, &L2, &L3 );
p448_sqr ( &L2, &L3 );
p448_mul ( &L3, &L0, &L2 );
p448_mul ( &L0, &a->x, &L3 );
p448_add ( &L2, &a->y, &a->y );
p448_mul ( &a->x, sz, &L2 );
p448_addw ( &L1, 1 );
p448_mul ( &a->y, &L1, &L3 );
p448_subw ( &L0, 1 );
p448_bias ( &L0, 1 );
return p448_is_zero( &L0 );
}

mask_t
deserialize_and_twist_approx (
struct tw_extensible_t* a,
const struct p448_t* sdm1,
const struct p448_t* sz
) {
struct p448_t L0, L1;
p448_sqr ( &a->z, sz );
p448_copy ( &a->y, &a->z );
p448_addw ( &a->y, 1 );
p448_sqr ( &a->x, &a->y );
p448_mulw ( &a->y, &a->x, 39082 );
p448_neg ( &a->x, &a->y );
p448_add ( &a->y, &a->z, &a->z );
p448_bias ( &a->y, 1 );
p448_add ( &a->u, &a->y, &a->y );
p448_add ( &a->y, &a->u, &a->x );
p448_weak_reduce( &a->y );
p448_sqr ( &a->x, &a->z );
p448_subw ( &a->x, 1 );
p448_neg ( &a->u, &a->x );
p448_bias ( &a->u, 2 );
p448_weak_reduce( &a->u );
p448_mul ( &a->x, sdm1, &a->u );
p448_mul ( &L0, &a->x, &a->y );
p448_mul ( &a->t, &L0, &a->y );
p448_mul ( &a->u, &a->x, &a->t );
p448_mul ( &a->t, &a->u, &L0 );
p448_mul ( &a->y, &a->x, &a->t );
p448_isr ( &L0, &a->y );
p448_mul ( &a->y, &a->u, &L0 );
p448_sqr ( &L1, &L0 );
p448_mul ( &a->u, &a->t, &L1 );
p448_mul ( &a->t, &a->x, &a->u );
p448_add ( &a->x, sz, sz );
p448_mul ( &L0, &a->u, &a->x );
p448_copy ( &a->x, &a->z );
p448_subw ( &a->x, 1 );
p448_neg ( &L1, &a->x );
p448_bias ( &L1, 2 );
p448_weak_reduce( &L1 );
p448_mul ( &a->x, &L1, &L0 );
p448_mul ( &L0, &a->u, &a->y );
p448_addw ( &a->z, 1 );
p448_mul ( &a->y, &a->z, &L0 );
p448_subw ( &a->t, 1 );
p448_bias ( &a->t, 1 );
mask_t ret = p448_is_zero( &a->t );
p448_set_ui( &a->z, 1 );
p448_copy ( &a->t, &a->x );
p448_copy ( &a->u, &a->y );
return ret;
}

void
set_identity_extensible (
struct extensible_t* a
) {
p448_set_ui( &a->x, 0 );
p448_set_ui( &a->y, 1 );
p448_set_ui( &a->z, 1 );
p448_set_ui( &a->t, 0 );
p448_set_ui( &a->u, 0 );
}

void
set_identity_tw_extensible (
struct tw_extensible_t* a
) {
p448_set_ui( &a->x, 0 );
p448_set_ui( &a->y, 1 );
p448_set_ui( &a->z, 1 );
p448_set_ui( &a->t, 0 );
p448_set_ui( &a->u, 0 );
}

void
set_identity_affine (
struct affine_t* a
) {
p448_set_ui( &a->x, 0 );
p448_set_ui( &a->y, 1 );
}

mask_t
eq_affine (
const struct affine_t* a,
const struct affine_t* b
) {
mask_t L0, L1;
struct p448_t L2;
p448_sub ( &L2, &a->x, &b->x );
p448_bias ( &L2, 2 );
L1 = p448_is_zero( &L2 );
p448_sub ( &L2, &a->y, &b->y );
p448_bias ( &L2, 2 );
L0 = p448_is_zero( &L2 );
return L1 & L0;
}

mask_t
eq_extensible (
const struct extensible_t* a,
const struct extensible_t* b
) {
mask_t L0, L1;
struct p448_t L2, L3, L4;
p448_mul ( &L4, &b->z, &a->x );
p448_mul ( &L3, &a->z, &b->x );
p448_sub ( &L2, &L4, &L3 );
p448_bias ( &L2, 2 );
L1 = p448_is_zero( &L2 );
p448_mul ( &L4, &b->z, &a->y );
p448_mul ( &L3, &a->z, &b->y );
p448_sub ( &L2, &L4, &L3 );
p448_bias ( &L2, 2 );
L0 = p448_is_zero( &L2 );
return L1 & L0;
}

mask_t
eq_tw_extensible (
const struct tw_extensible_t* a,
const struct tw_extensible_t* b
) {
mask_t L0, L1;
struct p448_t L2, L3, L4;
p448_mul ( &L4, &b->z, &a->x );
p448_mul ( &L3, &a->z, &b->x );
p448_sub ( &L2, &L4, &L3 );
p448_bias ( &L2, 2 );
L1 = p448_is_zero( &L2 );
p448_mul ( &L4, &b->z, &a->y );
p448_mul ( &L3, &a->z, &b->y );
p448_sub ( &L2, &L4, &L3 );
p448_bias ( &L2, 2 );
L0 = p448_is_zero( &L2 );
return L1 & L0;
}

void
elligator_2s_inject (
struct affine_t* a,
const struct p448_t* r
) {
mask_t L0, L1;
struct p448_t L2, L3, L4, L5, L6, L7, L8, L9;
p448_sqr ( &a->x, r );
p448_sqr ( &L3, &a->x );
p448_copy ( &a->y, &L3 );
p448_subw ( &a->y, 1 );
p448_neg ( &L9, &a->y );
p448_bias ( &L9, 2 );
p448_weak_reduce( &L9 );
p448_sqr ( &L2, &L9 );
p448_mulw ( &L8, &L2, 1527402724 );
p448_mulw ( &L7, &L3, 6108985600 );
p448_add ( &a->y, &L7, &L8 );
p448_weak_reduce( &a->y );
p448_mulw ( &L8, &L2, 6109454568 );
p448_sub ( &L7, &a->y, &L8 );
p448_bias ( &L7, 2 );
p448_weak_reduce( &L7 );
p448_mulw ( &L4, &a->y, 78160 );
p448_mul ( &L6, &L7, &L9 );
p448_mul ( &L8, &L6, &L4 );
p448_mul ( &L4, &L7, &L8 );
p448_isr ( &L5, &L4 );
p448_mul ( &L4, &L6, &L5 );
p448_sqr ( &L6, &L5 );
p448_mul ( &L5, &L8, &L6 );
p448_mul ( &L8, &L7, &L5 );
p448_mul ( &L7, &L8, &L5 );
p448_copy ( &L5, &a->x );
p448_subw ( &L5, 1 );
p448_addw ( &a->x, 1 );
p448_mul ( &L6, &a->x, &L8 );
p448_sub ( &a->x, &L5, &L6 );
p448_bias ( &a->x, 3 );
p448_weak_reduce( &a->x );
p448_mul ( &L5, &L4, &a->x );
p448_mulw ( &L4, &L5, 78160 );
p448_neg ( &a->x, &L4 );
p448_bias ( &a->x, 2 );
p448_weak_reduce( &a->x );
p448_add ( &L4, &L3, &L3 );
p448_add ( &L3, &L4, &L2 );
p448_subw ( &L3, 2 );
p448_bias ( &L3, 1 );
p448_weak_reduce( &L3 );
p448_mul ( &L2, &L3, &L8 );
p448_mulw ( &L3, &L2, 3054649120 );
p448_add ( &L2, &L3, &a->y );
p448_mul ( &a->y, &L7, &L2 );
L1 = p448_is_zero( &L9 );
L0 = - L1;
p448_addw ( &a->y, L0 );
p448_weak_reduce( &a->y );
}

mask_t
validate_affine (
const struct affine_t* a
) {
struct p448_t L0, L1, L2, L3;
p448_sqr ( &L0, &a->y );
p448_sqr ( &L2, &a->x );
p448_add ( &L3, &L2, &L0 );
p448_subw ( &L3, 1 );
p448_mulw ( &L1, &L2, 39081 );
p448_neg ( &L2, &L1 );
p448_bias ( &L2, 2 );
p448_mul ( &L1, &L0, &L2 );
p448_sub ( &L0, &L3, &L1 );
p448_bias ( &L0, 3 );
return p448_is_zero( &L0 );
}

mask_t
validate_tw_extensible (
const struct tw_extensible_t* ext
) {
mask_t L0, L1;
struct p448_t L2, L3, L4, L5;
/*
* Check invariant:
* 0 = -x*y + z*t*u
*/
p448_mul ( &L2, &ext->t, &ext->u );
p448_mul ( &L4, &ext->z, &L2 );
p448_addw ( &L4, 0 );
p448_mul ( &L3, &ext->x, &ext->y );
p448_neg ( &L2, &L3 );
p448_add ( &L3, &L2, &L4 );
p448_bias ( &L3, 2 );
L1 = p448_is_zero( &L3 );
/*
* Check invariant:
* 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
*/
p448_sqr ( &L4, &ext->y );
p448_neg ( &L2, &L4 );
p448_addw ( &L2, 0 );
p448_sqr ( &L3, &ext->x );
p448_add ( &L4, &L3, &L2 );
p448_sqr ( &L5, &ext->u );
p448_sqr ( &L3, &ext->t );
p448_mul ( &L2, &L3, &L5 );
p448_mulw ( &L3, &L2, 39081 );
p448_neg ( &L5, &L3 );
p448_add ( &L3, &L5, &L4 );
p448_neg ( &L5, &L2 );
p448_add ( &L4, &L5, &L3 );
p448_sqr ( &L3, &ext->z );
p448_add ( &L2, &L3, &L4 );
p448_bias ( &L2, 4 );
L0 = p448_is_zero( &L2 );
return L1 & L0;
}

mask_t
validate_extensible (
const struct extensible_t* ext
) {
mask_t L0, L1;
struct p448_t L2, L3, L4, L5;
/*
* Check invariant:
* 0 = d*t^2*u^2 - x^2 - y^2 + z^2
*/
p448_sqr ( &L4, &ext->y );
p448_neg ( &L3, &L4 );
p448_addw ( &L3, 0 );
p448_sqr ( &L2, &ext->z );
p448_add ( &L4, &L2, &L3 );
p448_sqr ( &L5, &ext->u );
p448_sqr ( &L2, &ext->t );
p448_mul ( &L3, &L2, &L5 );
p448_mulw ( &L5, &L3, 39081 );
p448_neg ( &L2, &L5 );
p448_add ( &L3, &L2, &L4 );
p448_sqr ( &L2, &ext->x );
p448_neg ( &L4, &L2 );
p448_add ( &L2, &L4, &L3 );
p448_bias ( &L2, 4 );
L1 = p448_is_zero( &L2 );
/*
* Check invariant:
* 0 = -x*y + z*t*u
*/
p448_mul ( &L3, &ext->t, &ext->u );
p448_mul ( &L4, &ext->z, &L3 );
p448_addw ( &L4, 0 );
p448_mul ( &L2, &ext->x, &ext->y );
p448_neg ( &L3, &L2 );
p448_add ( &L2, &L3, &L4 );
p448_bias ( &L2, 2 );
L0 = p448_is_zero( &L2 );
return L1 & L0;
}



+ 150
- 0
src/arch_neon/neon_emulation.h View File

@@ -0,0 +1,150 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

/**
* @file "neon_emulation.h"
* @brief NEON intrinsic emulation using clang's vector extensions.
*
* This lets you test and debug NEON code on x86.
*/
#ifndef __NEON_EMULATION_H__
#define __NEON_EMULATION_H__ 1

#include "word.h"

#include <stdint.h>
#include <assert.h>

static __inline__ int64x2_t vaddw_s32 (int64x2_t a, int32x2_t b) {
a.x += b.x;
a.y += b.y;
return a;
}

static __inline__ int64x2_t __attribute__((gnu_inline,always_inline))
xx_vaddup_s64(int64x2_t x) {
x.y += x.x;
return x;
}

typedef struct { int32x2_t val[2]; } int32x2x2_t;
static inline int32x2x2_t vtrn_s32 (int32x2_t x, int32x2_t y) {
int32x2x2_t out = {{{ x.x, y.x }, {x.y, y.y}}};
return out;
}

static __inline__ void __attribute__((gnu_inline,always_inline))
xx_vtrnq_s64 (
int64x2_t *x,
int64x2_t *y
) {
int64_t tmp = (*x).y;
(*x).y = (*y).x;
(*y).x = tmp;
}

int64x2_t vsraq_n_s64 (
int64x2_t a,
int64x2_t v,
const int x
) {
return a + (v >> x);
}

int64x2_t vshrq_n_s64 (
int64x2_t v,
const int x
) {
return v >> x;
}

static inline int64_t vgetq_lane_s64 (
int64x2_t acc,
const int lane
) {
return lane ? acc.y : acc.x;
}

static inline int32_t vget_lane_s32 (
int32x2_t acc,
const int lane
) {
return lane ? acc.y : acc.x;
}

static inline int64x2_t vmlal_lane_s32 (
int64x2_t acc,
int32x2_t x,
int32x2_t y,
int lane
) {
int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y };
return acc + xx*(lane?yy.yy:yy.xx);
}

static inline int64x2_t vmlsl_lane_s32 (
int64x2_t acc,
int32x2_t x,
int32x2_t y,
int lane
) {
int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y };
return acc - xx*(lane?yy.yy:yy.xx);
}

static inline int64x2_t vqdmlsl_lane_s32 (
int64x2_t acc,
int32x2_t x,
int32x2_t y,
int lane
) {
int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y };
int64x2_t tmp = xx*(lane?yy.yy:yy.xx);
assert(tmp.x >> 63 == tmp.x>>62);
assert(tmp.y >> 63 == tmp.y>>62);
return acc - 2*tmp;
}

static inline int64x2_t vqdmlal_lane_s32 (
int64x2_t acc,
int32x2_t x,
int32x2_t y,
int lane
) {
int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y };
int64x2_t tmp = xx*(lane?yy.yy:yy.xx);
assert(tmp.x >> 63 == tmp.x>>62);
assert(tmp.y >> 63 == tmp.y>>62);
return acc + 2*tmp;
}

static inline int64x2_t vqdmull_lane_s32 (
int32x2_t x,
int32x2_t y,
int lane
) {
int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y };
int64x2_t tmp = xx*(lane?yy.yy:yy.xx);
assert(tmp.x >> 63 == tmp.x>>62);
assert(tmp.y >> 63 == tmp.y>>62);
return 2*tmp;
}

static inline int32x2_t vmovn_s64(
int64x2_t x
) {
int32x2_t y = {x.x,x.y};
return y;
}

static inline int64x2_t vmull_lane_s32 (
int32x2_t x,
int32x2_t y,
int lane
) {
int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y };
return xx*(lane?yy.yy:yy.xx);
}

#endif /* __NEON_EMULATION_H__ */

+ 749
- 0
src/arch_neon/p448.c View File

@@ -0,0 +1,749 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

#include "word.h"
#include "p448.h"

static inline mask_t __attribute__((always_inline))
is_zero (
word_t x
) {
dword_t xx = x;
xx--;
return xx >> WORD_BITS;
}

static uint64_t widemul_32 (
const uint32_t a,
const uint32_t b
) {
return ((uint64_t)a)* b;
}

#ifdef __ARM_NEON__
static __inline__ void __attribute__((gnu_inline,always_inline))
xx_vtrnq_s64 (
int64x2_t *x,
int64x2_t *y
) {
__asm__ __volatile__ ("vswp %f0, %e1" : "+w"(*x), "+w"(*y));
}

static __inline__ int64x2_t __attribute__((gnu_inline,always_inline))
xx_vaddup_s64(int64x2_t x) {
__asm__ ("vadd.s64 %f0, %e0" : "+w"(x));
return x;
}
#else
#include "neon_emulation.h"
#endif // ARM_NEON

static inline void __attribute__((gnu_inline,always_inline))
smlal (
uint64_t *acc,
const uint32_t a,
const uint32_t b
) {
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

static inline void __attribute__((gnu_inline,always_inline))
smlal2 (
uint64_t *acc,
const uint32_t a,
const uint32_t b
) {
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}

static inline void __attribute__((gnu_inline,always_inline))
smull (
uint64_t *acc,
const uint32_t a,
const uint32_t b
) {
*acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

static inline void __attribute__((gnu_inline,always_inline))
smull2 (
uint64_t *acc,
const uint32_t a,
const uint32_t b
) {
*acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}

// static inline int64x2_t copy_now(int64x2_t x) {
// int64x2_t y;
// __asm__ ("vmov %0, %1" : "=w"(y) : "w"(x));
// return y;
// }

void
p448_mul (
p448_t *__restrict__ cs,
const p448_t *as,
const p448_t *bs
) {
const uint32_t *a = as->limb, *b = bs->limb;
uint32_t *c = cs->limb;
const int32x2_t
*val = (const int32x2_t *)a,
*vbl = (const int32x2_t *)b,
*vah = (const int32x2_t *)(&a[8]),
*vbh = (const int32x2_t *)(&b[8]);
int32x2_t
*vcl = (int32x2_t *)c,
*vch = (int32x2_t *)(&c[8]),
vmask = {(1<<28) - 1, (1<<28)-1};

int64x2_t accumx0a, accumx0b;
int64x2_t accumx1a, accumx1b;
int64x2_t accumx2a, accumx2b;
int64x2_t accumx3a, accumx3b;
int64x2_t accumx4a, accumx4b;
int64x2_t accumx5a, accumx5b;
int64x2_t accumx6a, accumx6b;
int64x2_t accumx7a, accumx7b;
int64x2_t carry;
int32x2x2_t trn_res;
int32x2_t delta;
accumx0a = vmull_lane_s32( delta = val[1] + vah[1], vbh[3], 0);
accumx1a = vmull_lane_s32( delta, vbh[3], 1);
accumx2a = vmull_lane_s32( delta = val[2] + vah[2], vbh[3], 0);
accumx3a = vmull_lane_s32( delta, vbh[3], 1);
accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[2], 0);
accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[2], 1);
accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0);
accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1);
accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[1], 0);
accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[1], 1);
accumx2b = vmull_lane_s32( delta = val[0] + vah[0], vbh[1], 0);
accumx3b = vmull_lane_s32( delta, vbh[1], 1);
accumx0b = vmull_lane_s32( delta, vbh[0], 0);
accumx1b = vmull_lane_s32( delta, vbh[0], 1);
accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0);
accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1);
accumx0b = vmlal_lane_s32(accumx0b, vah[1], vbl[3], 0);
accumx1b = vmlal_lane_s32(accumx1b, vah[1], vbl[3], 1);
accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0);
accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1);
accumx0b = vmlal_lane_s32(accumx0b, vah[2], vbl[2], 0);
accumx1b = vmlal_lane_s32(accumx1b, vah[2], vbl[2], 1);
accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0);
accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1);
accumx0b = vmlal_lane_s32(accumx0b, vah[3], vbl[1], 0);
accumx1b = vmlal_lane_s32(accumx1b, vah[3], vbl[1], 1);
accumx2b += accumx2a;
accumx3b += accumx3a;
accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0);
accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1);
accumx0b += accumx0a;
accumx1b += accumx1a;
accumx0a = vmlal_lane_s32(accumx0a, vah[0], vbl[0], 0);
accumx1a = vmlal_lane_s32(accumx1a, vah[0], vbl[0], 1);
accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0);
accumx3a = vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1);
accumx0a = vmlal_lane_s32(accumx0a, val[1], delta = vbl[3] - vbh[3], 0);
accumx1a = vmlal_lane_s32(accumx1a, val[1], delta, 1);
accumx2a = vmlal_lane_s32(accumx2a, val[2], delta, 0);
accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1);
accumx0a = vmlal_lane_s32(accumx0a, val[2], delta = vbl[2] - vbh[2], 0);
accumx1a = vmlal_lane_s32(accumx1a, val[2], delta, 1);
accumx2a = vmlal_lane_s32(accumx2a, val[3], delta, 0);
accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1);
accumx0a = vmlal_lane_s32(accumx0a, val[3], delta = vbl[1] - vbh[1], 0);
accumx1a = vmlal_lane_s32(accumx1a, val[3], delta, 1);
accumx2a += accumx2b;
accumx3a += accumx3b;
accumx2b = vmlal_lane_s32(accumx2b, val[0], delta, 0);
accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1);
accumx0a += accumx0b;
accumx1a += accumx1b;
accumx0b = vmlal_lane_s32(accumx0b, val[0], delta = vbl[0] - vbh[0], 0);
accumx1b = vmlal_lane_s32(accumx1b, val[0], delta, 1);
accumx2b = vmlal_lane_s32(accumx2b, val[1], delta, 0);
accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1);
xx_vtrnq_s64(&accumx0a, &accumx0b);
xx_vtrnq_s64(&accumx1a, &accumx1b);
xx_vtrnq_s64(&accumx2a, &accumx2b);
xx_vtrnq_s64(&accumx3a, &accumx3b);
accumx0b += accumx1a;
accumx0b = vsraq_n_s64(accumx0b,accumx0a,28);
accumx1b = vsraq_n_s64(accumx1b,accumx0b,28);
accumx2a += accumx1b;
accumx2b += accumx3a;
accumx2b = vsraq_n_s64(accumx2b,accumx2a,28);
accumx3b = vsraq_n_s64(accumx3b,accumx2b,28);
trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b));
vcl[0] = trn_res.val[1] & vmask;
vch[0] = trn_res.val[0] & vmask;
trn_res = vtrn_s32(vmovn_s64(accumx2a), vmovn_s64(accumx2b));
vcl[1] = trn_res.val[1] & vmask;
vch[1] = trn_res.val[0] & vmask;
carry = accumx3b;
accumx4a = vmull_lane_s32( delta = val[3] + vah[3], vbh[3], 0);
accumx5a = vmull_lane_s32( delta, vbh[3], 1);
accumx6b = vmull_lane_s32( delta = val[0] + vah[0], vbh[3], 0);
accumx7b = vmull_lane_s32( delta, vbh[3], 1);
accumx4b = accumx4a;
accumx5b = accumx5a;
accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[2], 0);
accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[2], 1);
accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0);
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1);
accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[1], 0);
accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[1], 1);
accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0);
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1);
accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[0], 0);
accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[0], 1);
accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0);
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[0], 1);
accumx4b = vmlal_lane_s32(accumx4b, vah[3], vbl[3], 0);
accumx5b = vmlal_lane_s32(accumx5b, vah[3], vbl[3], 1);
accumx6a = accumx6b;
accumx7a = accumx7b;
accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1);
accumx4a += accumx4b;
accumx5a += accumx5b;
accumx4a = vmlal_lane_s32(accumx4a, vah[0], vbl[2], 0);
accumx5a = vmlal_lane_s32(accumx5a, vah[0], vbl[2], 1);
accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1);
accumx4a = vmlal_lane_s32(accumx4a, vah[1], vbl[1], 0);
accumx5a = vmlal_lane_s32(accumx5a, vah[1], vbl[1], 1);
accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1);
accumx4a = vmlal_lane_s32(accumx4a, vah[2], vbl[0], 0);
accumx5a = vmlal_lane_s32(accumx5a, vah[2], vbl[0], 1);
accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1);
accumx4a = vmlal_lane_s32(accumx4a, val[3], delta = vbl[3] - vbh[3], 0);
accumx5a = vmlal_lane_s32(accumx5a, val[3], delta, 1);
/**/
accumx6b = vmlal_lane_s32(accumx6b, val[0], delta, 0);
accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1);
accumx4b = vmlal_lane_s32(accumx4b, val[0], delta = vbl[2] - vbh[2], 0);
accumx5b = vmlal_lane_s32(accumx5b, val[0], delta, 1);
accumx6b = vmlal_lane_s32(accumx6b, val[1], delta, 0);
accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1);
accumx4b = vmlal_lane_s32(accumx4b, val[1], delta = vbl[1] - vbh[1], 0);
accumx5b = vmlal_lane_s32(accumx5b, val[1], delta, 1);
accumx6b = vmlal_lane_s32(accumx6b, val[2], delta, 0);
accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1);
accumx4b = vmlal_lane_s32(accumx4b, val[2], delta = vbl[0] - vbh[0], 0);
accumx5b = vmlal_lane_s32(accumx5b, val[2], delta, 1);
accumx6b = vmlal_lane_s32(accumx6b, val[3], delta, 0);
accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1);
xx_vtrnq_s64(&accumx4a, &accumx4b);
xx_vtrnq_s64(&accumx5a, &accumx5b);
xx_vtrnq_s64(&accumx6a, &accumx6b);
xx_vtrnq_s64(&accumx7a, &accumx7b);
accumx4a += carry;
accumx4b += accumx5a;
accumx4b = vsraq_n_s64(accumx4b,accumx4a,28);
accumx5b = vsraq_n_s64(accumx5b,accumx4b,28);
accumx6a += accumx5b;
accumx6b += accumx7a;
trn_res = vtrn_s32(vmovn_s64(accumx4a), vmovn_s64(accumx4b));
vcl[2] = trn_res.val[1] & vmask;
vch[2] = trn_res.val[0] & vmask;
accumx6b = vsraq_n_s64(accumx6b,accumx6a,28);
accumx7b = vsraq_n_s64(accumx7b,accumx6b,28);
trn_res = vtrn_s32(vmovn_s64(accumx6a), vmovn_s64(accumx6b));
vcl[3] = trn_res.val[1] & vmask;
vch[3] = trn_res.val[0] & vmask;
accumx7b = xx_vaddup_s64(accumx7b);

int32x2_t t0 = vcl[0], t1 = vch[0];
trn_res = vtrn_s32(t0,t1);
t0 = trn_res.val[0]; t1 = trn_res.val[1];
accumx7b = vaddw_s32(accumx7b, t0);
t0 = vmovn_s64(accumx7b) & vmask;
accumx7b = vshrq_n_s64(accumx7b,28);
accumx7b = vaddw_s32(accumx7b, t1);
t1 = vmovn_s64(accumx7b) & vmask;
trn_res = vtrn_s32(t0,t1);
vcl[0] = trn_res.val[0];
vch[0] = trn_res.val[1];
accumx7b = vshrq_n_s64(accumx7b,28);

t0 = vmovn_s64(accumx7b);
uint32_t
c0 = vget_lane_s32(t0,0),
c1 = vget_lane_s32(t0,1);
c[2] += c0;
c[10] += c1;
}

void
p448_sqr (
p448_t *__restrict__ cs,
const p448_t *as
) {
/* FUTURE possible improvements:
* don't use nega-phi algorithm, so as to avoid extra phi-twiddle at end
* or use phi/nega-phi for everything, montgomery style
* or find some sort of phi algorithm which doesn't have this problem
* break up lanemuls so that only diags get 1mul'd instead of diag 2x2 blocks
*
* These improvements are all pretty minor, but I guess together they might matter?
*/
const uint32_t *b = as->limb;
uint32_t *c = cs->limb;

int32x2_t vbm[4];
const int32x2_t
*vbl = (const int32x2_t *)b,
*vbh = (const int32x2_t *)(&b[8]);
int i;
for (i=0; i<4; i++) {
vbm[i] = vbl[i] - vbh[i];
}
int32x2_t
*vcl = (int32x2_t *)c,
*vch = (int32x2_t *)(&c[8]),
vmask = {(1<<28) - 1, (1<<28)-1};

int64x2_t accumx0a, accumx0b;
int64x2_t accumx1a, accumx1b;
int64x2_t accumx2a, accumx2b;
int64x2_t accumx3a, accumx3b;
int64x2_t accumx4a, accumx4b;
int64x2_t accumx5a, accumx5b;
int64x2_t accumx6a, accumx6b;
int64x2_t accumx7a, accumx7b;
int64x2_t carry;
int32x2x2_t trn_res;
accumx0a = vqdmull_lane_s32( vbh[1], vbh[3], 0);
accumx1a = vqdmull_lane_s32( vbh[1], vbh[3], 1);
accumx2a = vqdmull_lane_s32( vbh[2], vbh[3], 0);
accumx3a = vqdmull_lane_s32( vbh[2], vbh[3], 1);
accumx0a = vmlal_lane_s32(accumx0a, vbh[2], vbh[2], 0);
accumx1a = vmlal_lane_s32(accumx1a, vbh[2], vbh[2], 1);
accumx2b = accumx2a;
accumx3b = accumx3a;
accumx2b = vqdmlal_lane_s32(accumx2b, vbh[0], vbh[1], 0);
accumx3b = vqdmlal_lane_s32(accumx3b, vbh[0], vbh[1], 1);
accumx0b = accumx0a;
accumx1b = accumx1a;
accumx0b = vmlal_lane_s32(accumx0b, vbh[0], vbh[0], 0);
accumx1b = vmlal_lane_s32(accumx1b, vbh[0], vbh[0], 1);
accumx0b = vqdmlal_lane_s32(accumx0b, vbl[1], vbl[3], 0);
accumx1b = vqdmlal_lane_s32(accumx1b, vbl[1], vbl[3], 1);
accumx2b = vqdmlal_lane_s32(accumx2b, vbl[2], vbl[3], 0);
accumx3b = vqdmlal_lane_s32(accumx3b, vbl[2], vbl[3], 1);
accumx0b = vmlal_lane_s32(accumx0b, vbl[2], vbl[2], 0);
accumx1b = vmlal_lane_s32(accumx1b, vbl[2], vbl[2], 1);
accumx2a += accumx2b;
accumx3a += accumx3b;
accumx2a = vqdmlal_lane_s32(accumx2a, vbl[0], vbl[1], 0);
accumx3a = vqdmlal_lane_s32(accumx3a, vbl[0], vbl[1], 1);
accumx0a += accumx0b;
accumx1a += accumx1b;
accumx0a = vmlal_lane_s32(accumx0a, vbl[0], vbl[0], 0);
accumx1a = vmlal_lane_s32(accumx1a, vbl[0], vbl[0], 1);
accumx0a = vqdmlsl_lane_s32(accumx0a, vbm[1], vbm[3], 0);
accumx1a = vqdmlsl_lane_s32(accumx1a, vbm[1], vbm[3], 1);
accumx0a = vmlsl_lane_s32(accumx0a, vbm[2], vbm[2], 0);
accumx1a = vmlsl_lane_s32(accumx1a, vbm[2], vbm[2], 1);
accumx2a = vqdmlsl_lane_s32(accumx2a, vbm[2], vbm[3], 0);
accumx3a = vqdmlsl_lane_s32(accumx3a, vbm[2], vbm[3], 1);
accumx0b += accumx0a;
accumx1b += accumx1a;
accumx0b = vmlsl_lane_s32(accumx0b, vbm[0], vbm[0], 0);
accumx1b = vmlsl_lane_s32(accumx1b, vbm[0], vbm[0], 1);
accumx2b += accumx2a;
accumx3b += accumx3a;
accumx2b = vqdmlsl_lane_s32(accumx2b, vbm[0], vbm[1], 0);
accumx3b = vqdmlsl_lane_s32(accumx3b, vbm[0], vbm[1], 1);
xx_vtrnq_s64(&accumx0b, &accumx0a);
xx_vtrnq_s64(&accumx1b, &accumx1a);
xx_vtrnq_s64(&accumx2b, &accumx2a);
xx_vtrnq_s64(&accumx3b, &accumx3a);
accumx0a += accumx1b;
accumx0a = vsraq_n_s64(accumx0a,accumx0b,28);
accumx1a = vsraq_n_s64(accumx1a,accumx0a,28);
accumx2b += accumx1a;
accumx2a += accumx3b;
accumx2a = vsraq_n_s64(accumx2a,accumx2b,28);
accumx3a = vsraq_n_s64(accumx3a,accumx2a,28);
trn_res = vtrn_s32(vmovn_s64(accumx0b), vmovn_s64(accumx0a));
vcl[0] = trn_res.val[1] & vmask;
vch[0] = trn_res.val[0] & vmask;
trn_res = vtrn_s32(vmovn_s64(accumx2b), vmovn_s64(accumx2a));
vcl[1] = trn_res.val[1] & vmask;
vch[1] = trn_res.val[0] & vmask;
carry = accumx3a;
accumx4a = vmull_lane_s32( vbh[3], vbh[3], 0);
accumx5a = vmull_lane_s32( vbh[3], vbh[3], 1);
accumx6b = vqdmull_lane_s32( vbh[0], vbh[3], 0);
accumx7b = vqdmull_lane_s32( vbh[0], vbh[3], 1);
accumx4b = accumx4a;
accumx5b = accumx5a;
accumx4b = vqdmlal_lane_s32(accumx4b, vbh[0], vbh[2], 0);
accumx5b = vqdmlal_lane_s32(accumx5b, vbh[0], vbh[2], 1);
accumx6b = vqdmlal_lane_s32(accumx6b, vbh[1], vbh[2], 0);
accumx7b = vqdmlal_lane_s32(accumx7b, vbh[1], vbh[2], 1);
accumx4b = vmlal_lane_s32(accumx4b, vbh[1], vbh[1], 0);
accumx5b = vmlal_lane_s32(accumx5b, vbh[1], vbh[1], 1);
accumx4b = vmlal_lane_s32(accumx4b, vbl[3], vbl[3], 0);
accumx5b = vmlal_lane_s32(accumx5b, vbl[3], vbl[3], 1);
accumx6a = accumx6b;
accumx7a = accumx7b;
accumx6a = vqdmlal_lane_s32(accumx6a, vbl[0], vbl[3], 0);
accumx7a = vqdmlal_lane_s32(accumx7a, vbl[0], vbl[3], 1);
accumx4a += accumx4b;
accumx5a += accumx5b;
accumx4a = vqdmlal_lane_s32(accumx4a, vbl[0], vbl[2], 0);
accumx5a = vqdmlal_lane_s32(accumx5a, vbl[0], vbl[2], 1);
accumx6a = vqdmlal_lane_s32(accumx6a, vbl[1], vbl[2], 0);
accumx7a = vqdmlal_lane_s32(accumx7a, vbl[1], vbl[2], 1);
accumx4a = vmlal_lane_s32(accumx4a, vbl[1], vbl[1], 0);
accumx5a = vmlal_lane_s32(accumx5a, vbl[1], vbl[1], 1);
accumx4a = vmlsl_lane_s32(accumx4a, vbm[3], vbm[3], 0);
accumx5a = vmlsl_lane_s32(accumx5a, vbm[3], vbm[3], 1);
accumx6b += accumx6a;
accumx7b += accumx7a;
accumx6b = vqdmlsl_lane_s32(accumx6b, vbm[0], vbm[3], 0);
accumx7b = vqdmlsl_lane_s32(accumx7b, vbm[0], vbm[3], 1);
accumx4b += accumx4a;
accumx5b += accumx5a;
accumx4b = vqdmlsl_lane_s32(accumx4b, vbm[0], vbm[2], 0);
accumx5b = vqdmlsl_lane_s32(accumx5b, vbm[0], vbm[2], 1);
accumx4b = vmlsl_lane_s32(accumx4b, vbm[1], vbm[1], 0);
accumx5b = vmlsl_lane_s32(accumx5b, vbm[1], vbm[1], 1);
accumx6b = vqdmlsl_lane_s32(accumx6b, vbm[1], vbm[2], 0);
accumx7b = vqdmlsl_lane_s32(accumx7b, vbm[1], vbm[2], 1);
xx_vtrnq_s64(&accumx4b, &accumx4a);
xx_vtrnq_s64(&accumx5b, &accumx5a);
xx_vtrnq_s64(&accumx6b, &accumx6a);
xx_vtrnq_s64(&accumx7b, &accumx7a);
accumx4b += carry;
accumx4a += accumx5b;
accumx4a = vsraq_n_s64(accumx4a,accumx4b,28);
accumx5a = vsraq_n_s64(accumx5a,accumx4a,28);
accumx6b += accumx5a;
accumx6a += accumx7b;
trn_res = vtrn_s32(vmovn_s64(accumx4b), vmovn_s64(accumx4a));
vcl[2] = trn_res.val[1] & vmask;
vch[2] = trn_res.val[0] & vmask;
accumx6a = vsraq_n_s64(accumx6a,accumx6b,28);
accumx7a = vsraq_n_s64(accumx7a,accumx6a,28);
trn_res = vtrn_s32(vmovn_s64(accumx6b), vmovn_s64(accumx6a));
vcl[3] = trn_res.val[1] & vmask;
vch[3] = trn_res.val[0] & vmask;
accumx7a = xx_vaddup_s64(accumx7a);

int32x2_t t0 = vcl[0], t1 = vch[0];
trn_res = vtrn_s32(t0,t1);
t0 = trn_res.val[0]; t1 = trn_res.val[1];
accumx7a = vaddw_s32(accumx7a, t0);
t0 = vmovn_s64(accumx7a) & vmask;
accumx7a = vshrq_n_s64(accumx7a,28);
accumx7a = vaddw_s32(accumx7a, t1);
t1 = vmovn_s64(accumx7a) & vmask;
trn_res = vtrn_s32(t0,t1);
vcl[0] = trn_res.val[0];
vch[0] = trn_res.val[1];
accumx7a = vshrq_n_s64(accumx7a,28);

t0 = vmovn_s64(accumx7a);
uint32_t
c0 = vget_lane_s32(t0,0),
c1 = vget_lane_s32(t0,1);
c[2] += c0;
c[10] += c1;
}

void
p448_mulw (
p448_t *__restrict__ cs,
const p448_t *as,
uint64_t b
) {
const uint32_t bhi = b>>28, blo = b & ((1<<28)-1);
const uint32_t *a = as->limb;
uint32_t *c = cs->limb;

uint64_t accum0, accum8;
uint32_t mask = (1ull<<28)-1;

int i;

uint32_t c0, c8, n0, n8;
accum0 = widemul_32(bhi, a[15]);
accum8 = widemul_32(bhi, a[15] + a[7]);
c0 = a[0]; c8 = a[8];
smlal(&accum0, blo, c0);
smlal(&accum8, blo, c8);

c[0] = accum0 & mask; accum0 >>= 28;
c[8] = accum8 & mask; accum8 >>= 28;
i=1;
{
n0 = a[i]; n8 = a[i+8];
smlal(&accum0, bhi, c0);
smlal(&accum8, bhi, c8);
smlal(&accum0, blo, n0);
smlal(&accum8, blo, n8);
c[i] = accum0 & mask; accum0 >>= 28;
c[i+8] = accum8 & mask; accum8 >>= 28;
i++;
}
{
c0 = a[i]; c8 = a[i+8];
smlal(&accum0, bhi, n0);
smlal(&accum8, bhi, n8);
smlal(&accum0, blo, c0);
smlal(&accum8, blo, c8);

c[i] = accum0 & mask; accum0 >>= 28;
c[i+8] = accum8 & mask; accum8 >>= 28;
i++;
}
{
n0 = a[i]; n8 = a[i+8];
smlal(&accum0, bhi, c0);
smlal(&accum8, bhi, c8);
smlal(&accum0, blo, n0);
smlal(&accum8, blo, n8);

c[i] = accum0 & mask; accum0 >>= 28;
c[i+8] = accum8 & mask; accum8 >>= 28;
i++;
}
{
c0 = a[i]; c8 = a[i+8];
smlal(&accum0, bhi, n0);
smlal(&accum8, bhi, n8);
smlal(&accum0, blo, c0);
smlal(&accum8, blo, c8);

c[i] = accum0 & mask; accum0 >>= 28;
c[i+8] = accum8 & mask; accum8 >>= 28;
i++;
}
{
n0 = a[i]; n8 = a[i+8];
smlal(&accum0, bhi, c0);
smlal(&accum8, bhi, c8);
smlal(&accum0, blo, n0);
smlal(&accum8, blo, n8);

c[i] = accum0 & mask; accum0 >>= 28;
c[i+8] = accum8 & mask; accum8 >>= 28;
i++;
}
{
c0 = a[i]; c8 = a[i+8];
smlal(&accum0, bhi, n0);
smlal(&accum8, bhi, n8);
smlal(&accum0, blo, c0);
smlal(&accum8, blo, c8);
c[i] = accum0 & mask; accum0 >>= 28;
c[i+8] = accum8 & mask; accum8 >>= 28;
i++;
}
{
n0 = a[i]; n8 = a[i+8];
smlal(&accum0, bhi, c0);
smlal(&accum8, bhi, c8);
smlal(&accum0, blo, n0);
smlal(&accum8, blo, n8);

c[i] = accum0 & mask; accum0 >>= 28;
c[i+8] = accum8 & mask; accum8 >>= 28;
i++;
}

accum0 += accum8 + c[8];
c[8] = accum0 & mask;
c[9] += accum0 >> 28;

accum8 += c[0];
c[0] = accum8 & mask;
c[1] += accum8 >> 28;
}

void
p448_strong_reduce (
p448_t *a
) {
word_t mask = (1ull<<28)-1;

/* first, clear high */
a->limb[8] += a->limb[15]>>28;
a->limb[0] += a->limb[15]>>28;
a->limb[15] &= mask;

/* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */

/* compute total_value - p. No need to reduce mod p. */

dsword_t scarry = 0;
int i;
for (i=0; i<16; i++) {
scarry = scarry + a->limb[i] - ((i==8)?mask-1:mask);
a->limb[i] = scarry & mask;
scarry >>= 28;
}

/* uncommon case: it was >= p, so now scarry = 0 and this = x
* common case: it was < p, so now scarry = -1 and this = x - p + 2^448
* so let's add back in p. will carry back off the top for 2^448.
*/

assert(is_zero(scarry) | is_zero(scarry+1));

word_t scarry_mask = scarry & mask;
dword_t carry = 0;

/* add it back */
for (i=0; i<16; i++) {
carry = carry + a->limb[i] + ((i==8)?(scarry_mask&~1):scarry_mask);
a->limb[i] = carry & mask;
carry >>= 28;
}

assert(is_zero(carry + scarry));
}

mask_t
p448_is_zero (
const struct p448_t *a
) {
struct p448_t b;
p448_copy(&b,a);
p448_strong_reduce(&b);

uint32_t any = 0;
int i;
for (i=0; i<16; i++) {
any |= b.limb[i];
}
return is_zero(any);
}

void
p448_serialize (
uint8_t *serial,
const struct p448_t *x
) {
int i,j;
p448_t red;
p448_copy(&red, x);
p448_strong_reduce(&red);
for (i=0; i<8; i++) {
uint64_t limb = red.limb[2*i] + (((uint64_t)red.limb[2*i+1])<<28);
for (j=0; j<7; j++) {
serial[7*i+j] = limb;
limb >>= 8;
}
assert(limb == 0);
}
}

mask_t
p448_deserialize (
p448_t *x,
const uint8_t serial[56]
) {
int i,j;
for (i=0; i<8; i++) {
uint64_t out = 0;
for (j=0; j<7; j++) {
out |= ((uint64_t)serial[7*i+j])<<(8*j);
}
x->limb[2*i] = out & ((1ull<<28)-1);
x->limb[2*i+1] = out >> 28;
}
/* Check for reduction.
*
* The idea is to create a variable ge which is all ones (rather, 56 ones)
* if and only if the low $i$ words of $x$ are >= those of p.
*
* Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
*/
uint32_t ge = -1, mask = (1ull<<28)-1;
for (i=0; i<8; i++) {
ge &= x->limb[i];
}
/* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */
ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask);
/* Propagate the rest */
for (i=9; i<16; i++) {
ge &= x->limb[i];
}
return ~is_zero(ge ^ mask);
}

void
simultaneous_invert_p448(
struct p448_t *__restrict__ out,
const struct p448_t *in,
unsigned int n
) {
if (n==0) {
return;
} else if (n==1) {
p448_inverse(out,in);
return;
}
p448_copy(&out[1], &in[0]);
int i;
for (i=1; i<(int) (n-1); i++) {
p448_mul(&out[i+1], &out[i], &in[i]);
}
p448_mul(&out[0], &out[n-1], &in[n-1]);
struct p448_t tmp;
p448_inverse(&tmp, &out[0]);
p448_copy(&out[0], &tmp);
/* at this point, out[0] = product(in[i]) ^ -1
* out[i] = product(in[0]..in[i-1]) if i != 0
*/
for (i=n-1; i>0; i--) {
p448_mul(&tmp, &out[i], &out[0]);
p448_copy(&out[i], &tmp);
p448_mul(&tmp, &out[0], &in[i]);
p448_copy(&out[0], &tmp);
}
}

+ 378
- 0
src/arch_neon/p448.h View File

@@ -0,0 +1,378 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#ifndef __P448_H__
#define __P448_H__ 1

#include "word.h"

#include <stdint.h>
#include <assert.h>

typedef struct p448_t {
uint32_t limb[16];
} __attribute__((aligned(32))) p448_t;

#ifdef __cplusplus
extern "C" {
#endif

static __inline__ void
p448_set_ui (
p448_t *out,
uint64_t x
) __attribute__((unused,always_inline));
static __inline__ void
p448_cond_swap (
p448_t *a,
p448_t *b,
mask_t do_swap
) __attribute__((unused,always_inline));

static __inline__ void
p448_add (
p448_t *out,
const p448_t *a,
const p448_t *b
) __attribute__((unused,always_inline));
static __inline__ void
p448_sub (
p448_t *out,
const p448_t *a,
const p448_t *b
) __attribute__((unused,always_inline));
static __inline__ void
p448_neg (
p448_t *out,
const p448_t *a
) __attribute__((unused,always_inline));
static __inline__ void
p448_cond_neg (
p448_t *a,
mask_t doNegate
) __attribute__((unused,always_inline));

static __inline__ void
p448_addw (
p448_t *a,
uint32_t x
) __attribute__((unused,always_inline));
static __inline__ void
p448_subw (
p448_t *a,
uint32_t x
) __attribute__((unused,always_inline));
static __inline__ void
p448_copy (
p448_t *out,
const p448_t *a
) __attribute__((unused,always_inline));
static __inline__ void
p448_weak_reduce (
p448_t *inout
) __attribute__((unused,always_inline));
void
p448_strong_reduce (
p448_t *inout
);

mask_t
p448_is_zero (
const p448_t *in
);
static __inline__ void
p448_bias (
p448_t *inout,
int amount
) __attribute__((unused,always_inline));

void
p448_mul (
p448_t *__restrict__ out,
const p448_t *a,
const p448_t *b
);

void
p448_mulw (
p448_t *__restrict__ out,
const p448_t *a,
uint64_t b
);

void
p448_sqr (
p448_t *__restrict__ out,
const p448_t *a
);
static __inline__ void
p448_sqrn (
p448_t *__restrict__ y,
const p448_t *x,
int n
) __attribute__((unused,always_inline));

void
p448_serialize (
uint8_t *serial,
const struct p448_t *x
);

mask_t
p448_deserialize (
p448_t *x,
const uint8_t serial[56]
);
static __inline__ void
p448_mask(
struct p448_t *a,
const struct p448_t *b,
mask_t mask
) __attribute__((unused,always_inline));

/**
* Returns 1/x.
*
* If x=0, returns 0.
*/
void
p448_inverse (
struct p448_t* a,
const struct p448_t* x
);
void
simultaneous_invert_p448 (
struct p448_t *__restrict__ out,
const struct p448_t *in,
unsigned int n
);

static inline mask_t
p448_eq (
const struct p448_t *a,
const struct p448_t *b
) __attribute__((always_inline,unused));

/* -------------- Inline functions begin here -------------- */

void
p448_set_ui (
p448_t *out,
uint64_t x
) {
int i;
out->limb[0] = x & ((1<<28)-1);
out->limb[1] = x>>28;
for (i=2; i<16; i++) {
out->limb[i] = 0;
}
}
void
p448_cond_swap (
p448_t *a,
p448_t *b,
mask_t doswap
) {
big_register_t *aa = (big_register_t*)a;
big_register_t *bb = (big_register_t*)b;
big_register_t m = br_set_to_mask(doswap);

unsigned int i;
for (i=0; i<sizeof(*a)/sizeof(*aa); i++) {
big_register_t x = m & (aa[i]^bb[i]);
aa[i] ^= x;
bb[i] ^= x;
}
}

void
p448_add (
p448_t *out,
const p448_t *a,
const p448_t *b
) {
unsigned int i;
for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] + ((const uint32xn_t*)b)[i];
}
/*
unsigned int i;
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
out->limb[i] = a->limb[i] + b->limb[i];
}
*/
}

void
p448_sub (
p448_t *out,
const p448_t *a,
const p448_t *b
) {
unsigned int i;
for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] - ((const uint32xn_t*)b)[i];
}
/*
unsigned int i;
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
out->limb[i] = a->limb[i] - b->limb[i];
}
*/
}

void
p448_neg (
p448_t *out,
const p448_t *a
) {
unsigned int i;
for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
((uint32xn_t*)out)[i] = -((const uint32xn_t*)a)[i];
}
/*
unsigned int i;
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
out->limb[i] = -a->limb[i];
}
*/
}

void
p448_cond_neg(
p448_t *a,
mask_t doNegate
) {
unsigned int i;
struct p448_t negated;
big_register_t *aa = (big_register_t *)a;
big_register_t *nn = (big_register_t*)&negated;
big_register_t m = br_set_to_mask(doNegate);
p448_neg(&negated, a);
p448_bias(&negated, 2);
for (i=0; i<sizeof(*a)/sizeof(*aa); i++) {
aa[i] = (aa[i] & ~m) | (nn[i] & m);
}
}

void
p448_addw (
p448_t *a,
uint32_t x
) {
a->limb[0] += x;
}
void
p448_subw (
p448_t *a,
uint32_t x
) {
a->limb[0] -= x;
}

void
p448_copy (
p448_t *out,
const p448_t *a
) {
*out = *a;
}

void
p448_bias (
p448_t *a,
int amt
) {
uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt;
uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1};
uint32x4_t *aa = (uint32x4_t*) a;
aa[0] += lo;
aa[1] += lo;
aa[2] += hi;
aa[3] += lo;
}

void
p448_weak_reduce (
p448_t *a
) {
uint64_t mask = (1ull<<28) - 1;
uint64_t tmp = a->limb[15] >> 28;
int i;
a->limb[8] += tmp;
for (i=15; i>0; i--) {
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28);
}
a->limb[0] = (a->limb[0] & mask) + tmp;
}

void
p448_sqrn (
p448_t *__restrict__ y,
const p448_t *x,
int n
) {
p448_t tmp;
assert(n>0);
if (n&1) {
p448_sqr(y,x);
n--;
} else {
p448_sqr(&tmp,x);
p448_sqr(y,&tmp);
n-=2;
}
for (; n; n-=2) {
p448_sqr(&tmp,y);
p448_sqr(y,&tmp);
}
}

mask_t
p448_eq (
const struct p448_t *a,
const struct p448_t *b
) {
struct p448_t ra, rb;
p448_copy(&ra, a);
p448_copy(&rb, b);
p448_weak_reduce(&ra);
p448_weak_reduce(&rb);
p448_sub(&ra, &ra, &rb);
p448_bias(&ra, 2);
return p448_is_zero(&ra);
}

void
p448_mask (
struct p448_t *a,
const struct p448_t *b,
mask_t mask
) {
unsigned int i;
for (i=0; i<sizeof(*a)/sizeof(a->limb[0]); i++) {
a->limb[i] = b->limb[i] & mask;
}
}

#ifdef __cplusplus
}; /* extern "C" */
#endif

#endif /* __P448_H__ */

+ 16
- 24
src/arch_x86_64/p448.c View File

@@ -17,13 +17,14 @@ p448_mul (
__uint128_t accum0 = 0, accum1 = 0, accum2;
uint64_t mask = (1ull<<56) - 1;

uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32)));
uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))), bbb[4] __attribute__((aligned(32)));

/* For some reason clang doesn't vectorize this without prompting? */
unsigned int i;
for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];
((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];
((uint64xn_t*)bbb)[i] = ((const uint64xn_t*)bb)[i] + ((const uint64xn_t*)(&b[4]))[i];
}
/*
for (int i=0; i<4; i++) {
@@ -81,18 +82,15 @@ p448_mul (
accum0 >>= 56;
accum1 >>= 56;

accum2 = widemul(&aa[2],&bb[3]);
msb(&accum0, &a[2], &b[3]);
mac(&accum1, &a[6], &b[7]);
accum2 = widemul(&a[2],&b[7]);
mac(&accum0, &a[6], &bb[3]);
mac(&accum1, &aa[2], &bbb[3]);

mac(&accum2, &aa[3], &bb[2]);
msb(&accum0, &a[3], &b[2]);
mac(&accum1, &a[7], &b[6]);
mac(&accum2, &a[3], &b[6]);
mac(&accum0, &a[7], &bb[2]);
mac(&accum1, &aa[3], &bbb[2]);

accum1 += accum2;
accum0 += accum2;

accum2 = widemul(&a[0],&b[1]);
mac(&accum2, &a[0],&b[1]);
mac(&accum1, &aa[0], &bb[1]);
mac(&accum0, &a[4], &b[5]);

@@ -109,14 +107,11 @@ p448_mul (
accum0 >>= 56;
accum1 >>= 56;

accum2 = widemul(&aa[3],&bb[3]);
msb(&accum0, &a[3], &b[3]);
mac(&accum1, &a[7], &b[7]);

accum1 += accum2;
accum0 += accum2;
accum2 = widemul(&a[3],&b[7]);
mac(&accum0, &a[7], &bb[3]);
mac(&accum1, &aa[3], &bbb[3]);

accum2 = widemul(&a[0],&b[2]);
mac(&accum2, &a[0],&b[2]);
mac(&accum1, &aa[0], &bb[2]);
mac(&accum0, &a[4], &b[6]);

@@ -186,11 +181,9 @@ p448_mulw (
c[3] = accum0 & mask; accum0 >>= 56;
c[7] = accum4 & mask; accum4 >>= 56;

c[4] += accum0 + accum4;
c[0] += accum4;
// c[4] += accum0 + accum4;
// c[0] += accum4;
/*
* TODO: double-check that this is not necessary.
accum0 += accum4 + c[4];
c[4] = accum0 & mask;
c[5] += accum0 >> 56;
@@ -198,7 +191,6 @@ p448_mulw (
accum4 += c[0];
c[0] = accum4 & mask;
c[1] += accum4 >> 56;
*/
}

void


+ 19
- 1
src/arch_x86_64/p448.h View File

@@ -290,7 +290,10 @@ p448_copy (
p448_t *out,
const p448_t *a
) {
*out = *a;
unsigned int i;
for (i=0; i<sizeof(*out)/sizeof(big_register_t); i++) {
((big_register_t *)out)[i] = ((const big_register_t *)a)[i];
}
}

void
@@ -299,10 +302,25 @@ p448_bias (
int amt
) {
uint64_t co1 = ((1ull<<56)-1)*amt, co2 = co1-amt;
#if __AVX2__
uint64x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1};
uint64x4_t *aa = (uint64x4_t*) a;
aa[0] += lo;
aa[1] += hi;
#elif __SSE2__
uint64x2_t lo = {co1,co1}, hi = {co2,co1};
uint64x2_t *aa = (uint64x2_t*) a;
aa[0] += lo;
aa[1] += lo;
aa[2] += hi;
aa[3] += lo;
#else
unsigned int i;
for (i=0; i<sizeof(*a)/sizeof(uint64_t); i++) {
a->limb[i] += (i==4) ? co2 : co1;
}
#endif
}

void


+ 0
- 6
src/exported.sym View File

@@ -1,6 +0,0 @@
_goldilocks_init
_goldilocks_keygen
_goldilocks_shared_secret
_goldilocks_sign
_goldilocks_verify
_goldilocks_private_to_public

+ 293
- 86
src/goldilocks.c View File

@@ -32,7 +32,10 @@
#define GOLDILOCKS_RANDOM_RESEEDS_MANDATORY 0
#endif

/* FUTURE: auto */
#define GOLDI_FIELD_WORDS ((GOLDI_FIELD_BITS+WORD_BITS-1)/(WORD_BITS))
#define GOLDI_DIVERSIFY_BYTES 8

/* FUTURE: auto. MAGIC */
const struct affine_t goldilocks_base_point = {
{{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7),
U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa),
@@ -42,11 +45,12 @@ const struct affine_t goldilocks_base_point = {
{{ 19 }}
};

/* These are just unique identifiers */
static const char *G_INITING = "initializing";
static const char *G_INITED = "initialized";
static const char *G_FAILED = "failed to initialize";

/* FUTURE: auto */
/* FUTURE: auto. MAGIC */
static const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = {
U64LE(0xdc873d6d54a7bb0d),
U64LE(0xde933d8d723a70aa),
@@ -54,19 +58,45 @@ static const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = {
0x8335dc16
};
const struct barrett_prime_t goldi_q448 = {
448/WORD_BITS,
GOLDI_FIELD_WORDS,
62 % WORD_BITS,
sizeof(goldi_q448_lo)/sizeof(goldi_q448_lo[0]),
goldi_q448_lo
};

/* FUTURE: auto */
/* MAGIC */
static const struct p448_t
sqrt_d_minus_1 = {{
U58LE(0xd2e21836749f46),
U58LE(0x888db42b4f0179),
U58LE(0x5a189aabdeea38),
U58LE(0x51e65ca6f14c06),
U58LE(0xa49f7b424d9770),
U58LE(0xdcac4628c5f656),
U58LE(0x49443b8748734a),
U58LE(0x12fec0c0b25b7a)
}};

struct goldilocks_precomputed_public_key_t {
struct goldilocks_public_key_t pub;
struct fixed_base_table_t table;
};

#ifndef USE_BIG_TABLES
#if __ARM_NEON__
#define USE_BIG_TABLES 1
#else
#define USE_BIG_TABLES (WORD_BITS==64)
#endif
#endif

/* FUTURE: auto. MAGIC */
struct {
const char * volatile state;
#if GOLDILOCKS_USE_PTHREAD
pthread_mutex_t mutex;
#endif
struct tw_niels_t combs[(WORD_BITS==64) ? 80 : 64];
struct tw_niels_t combs[USE_BIG_TABLES ? 80 : 64];
struct fixed_base_table_t fixed_base;
struct tw_niels_t wnafs[32];
struct crandom_state_t rand;
@@ -107,7 +137,7 @@ goldilocks_init () {
/* Precompute the tables. */
mask_t succ;
int big = (WORD_BITS==64);
int big = USE_BIG_TABLES;
uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14;

succ = precompute_fixed_base(&goldilocks_global.fixed_base, &text, n, t, s, goldilocks_global.combs);
@@ -135,55 +165,77 @@ fail:
return -1;
}

static const struct p448_t
sqrt_d_minus_1 = {{
U58LE(0xd2e21836749f46),
U58LE(0x888db42b4f0179),
U58LE(0x5a189aabdeea38),
U58LE(0x51e65ca6f14c06),
U58LE(0xa49f7b424d9770),
U58LE(0xdcac4628c5f656),
U58LE(0x49443b8748734a),
U58LE(0x12fec0c0b25b7a)
}};

int
goldilocks_keygen (
goldilocks_derive_private_key (
struct goldilocks_private_key_t *privkey,
struct goldilocks_public_key_t *pubkey
const unsigned char proto[GOLDI_SYMKEY_BYTES]
) {
if (!goldilocks_check_init()) {
return GOLDI_EUNINIT;
}
word_t sk[448*2/WORD_BITS];
memcpy(&privkey->opaque[2*GOLDI_FIELD_BYTES], proto, GOLDI_SYMKEY_BYTES);
unsigned char skb[SHA512_OUTPUT_BYTES];
word_t sk[GOLDI_FIELD_WORDS];
assert(sizeof(skb) >= sizeof(sk));
struct sha512_ctx_t ctx;
struct tw_extensible_t exta;
struct p448_t pk;
sha512_init(&ctx);
sha512_update(&ctx, (const unsigned char *)"derivepk", GOLDI_DIVERSIFY_BYTES);
sha512_update(&ctx, proto, GOLDI_SYMKEY_BYTES);
sha512_final(&ctx, (unsigned char *)skb);

barrett_deserialize_and_reduce(sk, skb, SHA512_OUTPUT_BYTES, &goldi_q448);
barrett_serialize(privkey->opaque, sk, GOLDI_FIELD_BYTES);

scalarmul_fixed_base(&exta, sk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base);
untwist_and_double_and_serialize(&pk, &exta);
p448_serialize(&privkey->opaque[GOLDI_FIELD_BYTES], &pk);
return GOLDI_EOK;
}

void
goldilocks_underive_private_key (
unsigned char proto[GOLDI_SYMKEY_BYTES],
const struct goldilocks_private_key_t *privkey
) {
memcpy(proto, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES);
}

int
goldilocks_keygen (
struct goldilocks_private_key_t *privkey,
struct goldilocks_public_key_t *pubkey
) {
if (!goldilocks_check_init()) {
return GOLDI_EUNINIT;
}
unsigned char proto[GOLDI_SYMKEY_BYTES];

#if GOLDILOCKS_USE_PTHREAD
int ml_ret = pthread_mutex_lock(&goldilocks_global.mutex);
if (ml_ret) return ml_ret;
#endif

int ret = crandom_generate(&goldilocks_global.rand, (unsigned char *)sk, sizeof(sk));
int ret2 = crandom_generate(&goldilocks_global.rand, &privkey->opaque[112], 32);
if (!ret) ret = ret2;
int ret = crandom_generate(&goldilocks_global.rand, proto, sizeof(proto));

#if GOLDILOCKS_USE_PTHREAD
ml_ret = pthread_mutex_unlock(&goldilocks_global.mutex);
if (ml_ret) abort();
#endif
barrett_reduce(sk,sizeof(sk)/sizeof(sk[0]),0,&goldi_q448);
barrett_serialize(privkey->opaque, sk, 448/8);
scalarmul_fixed_base(&exta, sk, 448, &goldilocks_global.fixed_base);
//transfer_and_serialize_qtor(&pk, &sqrt_d_minus_1, &exta);
untwist_and_double_and_serialize(&pk, &exta);
int ret2 = goldilocks_derive_private_key(privkey, proto);
if (!ret) ret = ret2;
p448_serialize(pubkey->opaque, &pk);
memcpy(&privkey->opaque[56], pubkey->opaque, 56);
ret2 = goldilocks_private_to_public(pubkey, privkey);
if (!ret) ret = ret2;
return ret ? GOLDI_ENODICE : GOLDI_EOK;
}
@@ -194,7 +246,7 @@ goldilocks_private_to_public (
const struct goldilocks_private_key_t *privkey
) {
struct p448_t pk;
mask_t msucc = p448_deserialize(&pk,&privkey->opaque[56]);
mask_t msucc = p448_deserialize(&pk,&privkey->opaque[GOLDI_FIELD_BYTES]);
if (msucc) {
p448_serialize(pubkey->opaque, &pk);
@@ -204,30 +256,46 @@ goldilocks_private_to_public (
}
}

int
goldilocks_shared_secret (
uint8_t shared[64],
static int
goldilocks_shared_secret_core (
uint8_t shared[GOLDI_SHARED_SECRET_BYTES],
const struct goldilocks_private_key_t *my_privkey,
const struct goldilocks_public_key_t *your_pubkey
const struct goldilocks_public_key_t *your_pubkey,
const struct goldilocks_precomputed_public_key_t *pre
) {
/* This function doesn't actually need anything in goldilocks_global,
* so it doesn't check init.
*/
word_t sk[448/WORD_BITS];
assert(GOLDI_SHARED_SECRET_BYTES == SHA512_OUTPUT_BYTES);
word_t sk[GOLDI_FIELD_WORDS];
struct p448_t pk;
mask_t succ = p448_deserialize(&pk,your_pubkey->opaque), msucc = -1;
#ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS
struct p448_t sum, prod;
msucc &= p448_deserialize(&sum,&my_privkey->opaque[56]);
msucc &= p448_deserialize(&sum,&my_privkey->opaque[GOLDI_FIELD_BYTES]);
p448_mul(&prod,&pk,&sum);
p448_add(&sum,&pk,&sum);
#endif
msucc &= barrett_deserialize(sk,my_privkey->opaque,&goldi_q448);
succ &= montgomery_ladder(&pk,&pk,sk,446,2);
#if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
if (pre) {
struct tw_extensible_t tw;
succ &= scalarmul_fixed_base(&tw, sk, GOLDI_SCALAR_BITS, &pre->table);
untwist_and_double_and_serialize(&pk, &tw);
} else {
succ &= montgomery_ladder(&pk,&pk,sk,GOLDI_SCALAR_BITS,1);
}
#else
(void)pre;
succ &= montgomery_ladder(&pk,&pk,sk,GOLDI_SCALAR_BITS,1);
#endif
p448_serialize(shared,&pk);
@@ -236,28 +304,28 @@ goldilocks_shared_secret (
sha512_init(&ctx);

#ifdef EXPERIMENT_ECDH_OBLITERATE_CT
uint8_t oblit[40];
uint8_t oblit[GOLDI_DIVERSIFY_BYTES + GOLDI_SYMKEY_BYTES];
unsigned i;
for (i=0; i<8; i++) {
for (i=0; i<GOLDI_DIVERSIFY_BYTES; i++) {
oblit[i] = "noshared"[i] & ~(succ&msucc);
}
for (i=0; i<32; i++) {
oblit[8+i] = my_privkey->opaque[112+i] & ~(succ&msucc);
for (i=0; i<GOLDI_SYMKEY_BYTES; i++) {
oblit[GOLDI_DIVERSIFY_BYTES+i] = my_privkey->opaque[2*GOLDI_FIELD_BYTES+i] & ~(succ&msucc);
}
sha512_update(&ctx, oblit, 40);
sha512_update(&ctx, oblit, sizeof(oblit));
#endif
#ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS
/* stir in the sum and product of the pubkeys. */
uint8_t a_pk[56];
uint8_t a_pk[GOLDI_FIELD_BYTES];
p448_serialize(a_pk, &sum);
sha512_update(&ctx, a_pk, 56);
sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES);
p448_serialize(a_pk, &prod);
sha512_update(&ctx, a_pk, 56);
sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES);
#endif
/* stir in the shared key and finish */
sha512_update(&ctx, shared, 56);
sha512_update(&ctx, shared, GOLDI_FIELD_BYTES);
sha512_final(&ctx, shared);
return (GOLDI_ECORRUPT & ~msucc)
@@ -265,9 +333,42 @@ goldilocks_shared_secret (
| (GOLDI_EOK & msucc & succ);
}

int
goldilocks_shared_secret (
uint8_t shared[GOLDI_SHARED_SECRET_BYTES],
const struct goldilocks_private_key_t *my_privkey,
const struct goldilocks_public_key_t *your_pubkey
) {
return goldilocks_shared_secret_core(
shared,
my_privkey,
your_pubkey,
NULL
);
}

static void
goldilocks_derive_challenge(
word_t challenge[GOLDI_FIELD_WORDS],
const unsigned char pubkey[GOLDI_FIELD_BYTES],
const unsigned char gnonce[GOLDI_FIELD_BYTES],
const unsigned char *message,
uint64_t message_len
) {
/* challenge = H(pk, [nonceG], message). */
unsigned char sha_out[SHA512_OUTPUT_BYTES];
struct sha512_ctx_t ctx;
sha512_init(&ctx);
sha512_update(&ctx, pubkey, GOLDI_FIELD_BYTES);
sha512_update(&ctx, gnonce, GOLDI_FIELD_BYTES);
sha512_update(&ctx, message, message_len);
sha512_final(&ctx, sha_out);
barrett_deserialize_and_reduce(challenge, sha_out, sizeof(sha_out), &goldi_q448);
}

int
goldilocks_sign (
uint8_t signature_out[56*2],
uint8_t signature_out[GOLDI_SIGNATURE_BYTES],
const uint8_t *message,
uint64_t message_len,
const struct goldilocks_private_key_t *privkey
@@ -277,7 +378,7 @@ goldilocks_sign (
}
/* challenge = H(pk, [nonceG], message). */
word_t skw[448/WORD_BITS];
word_t skw[GOLDI_FIELD_WORDS];
mask_t succ = barrett_deserialize(skw,privkey->opaque,&goldi_q448);
if (!succ) {
memset(skw,0,sizeof(skw));
@@ -285,48 +386,50 @@ goldilocks_sign (
}
/* Derive a nonce. TODO: use HMAC. FUTURE: factor. */
unsigned char sha_out[512/8];
word_t tk[448/WORD_BITS];
unsigned char sha_out[SHA512_OUTPUT_BYTES];
word_t tk[GOLDI_FIELD_WORDS];
struct sha512_ctx_t ctx;
sha512_init(&ctx);
sha512_update(&ctx, (const unsigned char *)"signonce", 8);
sha512_update(&ctx, &privkey->opaque[112], 32);
sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES);
sha512_update(&ctx, message, message_len);
sha512_update(&ctx, &privkey->opaque[112], 32);
sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES);
sha512_final(&ctx, sha_out);
barrett_deserialize_and_reduce(tk, sha_out, 512/8, &goldi_q448);
barrett_deserialize_and_reduce(tk, sha_out, SHA512_OUTPUT_BYTES, &goldi_q448);
/* 4[nonce]G */
uint8_t signature_tmp[56];
uint8_t signature_tmp[GOLDI_FIELD_BYTES];
struct tw_extensible_t exta;
struct p448_t gsk;
scalarmul_fixed_base(&exta, tk, 448, &goldilocks_global.fixed_base);
scalarmul_fixed_base(&exta, tk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base);
double_tw_extensible(&exta);
untwist_and_double_and_serialize(&gsk, &exta);
p448_serialize(signature_tmp, &gsk);
word_t challenge[448/WORD_BITS];
sha512_update(&ctx, &privkey->opaque[56], 56);
sha512_update(&ctx, signature_tmp, 56);
sha512_update(&ctx, message, message_len);
sha512_final(&ctx, sha_out);
barrett_deserialize_and_reduce(challenge, sha_out, 512/8, &goldi_q448);
word_t challenge[GOLDI_FIELD_WORDS];
goldilocks_derive_challenge (
challenge,
&privkey->opaque[GOLDI_FIELD_BYTES],
signature_tmp,
message,
message_len
);
// reduce challenge and sub.
barrett_negate(challenge,448/WORD_BITS,&goldi_q448);
barrett_negate(challenge,GOLDI_FIELD_WORDS,&goldi_q448);

barrett_mac(
tk,448/WORD_BITS,
challenge,448/WORD_BITS,
skw,448/WORD_BITS,
tk,GOLDI_FIELD_WORDS,
challenge,GOLDI_FIELD_WORDS,
skw,GOLDI_FIELD_WORDS,
&goldi_q448
);
word_t carry = add_nr_ext_packed(tk,tk,448/WORD_BITS,tk,448/WORD_BITS,-1);
barrett_reduce(tk,448/WORD_BITS,carry,&goldi_q448);
word_t carry = add_nr_ext_packed(tk,tk,GOLDI_FIELD_WORDS,tk,GOLDI_FIELD_WORDS,-1);
barrett_reduce(tk,GOLDI_FIELD_WORDS,carry,&goldi_q448);
memcpy(signature_out, signature_tmp, 56);
barrett_serialize(signature_out+56, tk, 448/8);
memcpy(signature_out, signature_tmp, GOLDI_FIELD_BYTES);
barrett_serialize(signature_out+GOLDI_FIELD_BYTES, tk, GOLDI_FIELD_BYTES);
memset((unsigned char *)tk,0,sizeof(tk));
memset((unsigned char *)skw,0,sizeof(skw));
memset((unsigned char *)challenge,0,sizeof(challenge));
@@ -342,7 +445,7 @@ goldilocks_sign (

int
goldilocks_verify (
const uint8_t signature[56*2],
const uint8_t signature[GOLDI_SIGNATURE_BYTES],
const uint8_t *message,
uint64_t message_len,
const struct goldilocks_public_key_t *pubkey
@@ -352,24 +455,16 @@ goldilocks_verify (
}
struct p448_t pk;
word_t s[448/WORD_BITS];
word_t s[GOLDI_FIELD_WORDS];
mask_t succ = p448_deserialize(&pk,pubkey->opaque);
if (!succ) return GOLDI_EINVAL;
succ = barrett_deserialize(s, &signature[56], &goldi_q448);
succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &goldi_q448);
if (!succ) return GOLDI_EINVAL;
/* challenge = H(pk, [nonceG], message). */
unsigned char sha_out[512/8];
word_t challenge[448/WORD_BITS];
struct sha512_ctx_t ctx;
sha512_init(&ctx);
sha512_update(&ctx, pubkey->opaque, 56);
sha512_update(&ctx, signature, 56);
sha512_update(&ctx, message, message_len);
sha512_final(&ctx, sha_out);
barrett_deserialize_and_reduce(challenge, sha_out, 512/8, &goldi_q448);
word_t challenge[GOLDI_FIELD_WORDS];
goldilocks_derive_challenge(challenge, pubkey->opaque, signature, message, message_len);
struct p448_t eph;
struct tw_extensible_t pk_text;
@@ -381,7 +476,102 @@ goldilocks_verify (
succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk);
if (!succ) return GOLDI_EINVAL;
linear_combo_var_fixed_vt( &pk_text, challenge, 446, s, 446, goldilocks_global.wnafs, 5 );
linear_combo_var_fixed_vt( &pk_text,
challenge, GOLDI_SCALAR_BITS,
s, GOLDI_SCALAR_BITS,
goldilocks_global.wnafs, 5 );
untwist_and_double_and_serialize( &pk, &pk_text );
p448_sub(&eph, &eph, &pk);
p448_bias(&eph, 2);
succ = p448_is_zero(&eph);
return succ ? 0 : GOLDI_EINVAL;
}

#if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS

struct goldilocks_precomputed_public_key_t *
goldilocks_precompute_public_key (
const struct goldilocks_public_key_t *pub
) {
struct goldilocks_precomputed_public_key_t *precom;
precom = (struct goldilocks_precomputed_public_key_t *)
malloc(sizeof(*precom));
if (!precom) return NULL;
struct tw_extensible_t pk_text;
struct p448_t pk;
mask_t succ = p448_deserialize(&pk, pub->opaque);
if (!succ) {
free(precom);
return NULL;
}
succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk);
if (!succ) {
free(precom);
return NULL;
}
int big = USE_BIG_TABLES;
uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14;

succ = precompute_fixed_base(&precom->table, &pk_text, n, t, s, NULL);
if (!succ) {
free(precom);
return NULL;
}
memcpy(&precom->pub,pub,sizeof(*pub));
return precom;
}

void
goldilocks_destroy_precomputed_public_key (
struct goldilocks_precomputed_public_key_t *precom
) {
if (!precom) return;
destroy_fixed_base(&precom->table);
memset(&precom->pub.opaque, 0, sizeof(precom->pub));
free(precom);
}

int
goldilocks_verify_precomputed (
const uint8_t signature[GOLDI_SIGNATURE_BYTES],
const uint8_t *message,
uint64_t message_len,
const struct goldilocks_precomputed_public_key_t *pubkey
) {
if (!goldilocks_check_init()) {
return GOLDI_EUNINIT;
}

word_t s[GOLDI_FIELD_WORDS];
mask_t succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &goldi_q448);
if (!succ) return GOLDI_EINVAL;
word_t challenge[GOLDI_FIELD_WORDS];
goldilocks_derive_challenge(challenge, pubkey->pub.opaque, signature, message, message_len);
struct p448_t eph, pk;
struct tw_extensible_t pk_text;
/* deserialize [nonce]G */
succ = p448_deserialize(&eph, signature);
if (!succ) return GOLDI_EINVAL;
succ = linear_combo_combs_vt (
&pk_text,
challenge, GOLDI_SCALAR_BITS, &pubkey->table,
s, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base
);
if (!succ) return GOLDI_EINVAL;
untwist_and_double_and_serialize( &pk, &pk_text );
p448_sub(&eph, &eph, &pk);
@@ -391,3 +581,20 @@ goldilocks_verify (
return succ ? 0 : GOLDI_EINVAL;
}

int
goldilocks_shared_secret_precomputed (
uint8_t shared[GOLDI_SHARED_SECRET_BYTES],
const struct goldilocks_private_key_t *my_privkey,
const struct goldilocks_precomputed_public_key_t *your_pubkey
) {
return goldilocks_shared_secret_core(
shared,
my_privkey,
&your_pubkey->pub,
your_pubkey
);
}

#endif // GOLDI_IMPLEMENT_PRECOMPUTED_KEYS


+ 2
- 0
src/include/intrinsics.h View File

@@ -12,7 +12,9 @@

#include <sys/types.h>

#if __i386__ || __x86_64__
#include <immintrin.h>
#endif

#define INTRINSIC \
static __inline__ __attribute__((__gnu_inline__, __always_inline__, unused))


+ 34
- 6
src/include/scalarmul.h View File

@@ -26,7 +26,7 @@ struct fixed_base_table_t {
struct tw_niels_t *table;
/** Adjustments to the scalar in even and odd cases, respectively. */
word_t scalar_adjustments[2*(448/WORD_BITS)];
word_t scalar_adjustments[2*(448/WORD_BITS)]; /* MAGIC */
/** The number of combs in the table. */
unsigned int n;
@@ -103,7 +103,7 @@ montgomery_ladder (
void
scalarmul (
struct tw_extensible_t *working,
const word_t scalar[448/WORD_BITS]
const word_t scalar[448/WORD_BITS] /* MAGIC */
/* TODO? int nbits */
);
@@ -124,7 +124,7 @@ scalarmul (
void
scalarmul_vlook (
struct tw_extensible_t *working,
const word_t scalar[448/WORD_BITS]
const word_t scalar[448/WORD_BITS] /* MAGIC */
/* TODO? int nbits */
);

@@ -209,7 +209,7 @@ scalarmul_fixed_base (
void
scalarmul_vt (
struct tw_extensible_t *working,
const word_t scalar[448/WORD_BITS]
const word_t scalar[448/WORD_BITS] /* MAGIC */
);


@@ -274,14 +274,42 @@ scalarmul_fixed_base_wnaf_vt (
void
linear_combo_var_fixed_vt (
struct tw_extensible_t *working,
const word_t scalar_var[448/WORD_BITS],
const word_t scalar_var[448/WORD_BITS], /* MAGIC */
unsigned int nbits_var,
const word_t scalar_pre[448/WORD_BITS],
const word_t scalar_pre[448/WORD_BITS], /* MAGIC */
unsigned int nbits_pre,
const struct tw_niels_t *precmp,
unsigned int table_bits_pre
);

/**
* Variable-time scalar linear combination of two fixed points.
*
* @warning This function takes variable time. It is intended for
* signature verification.
*
* @param [out] working The output point.
* @param [in] scalar1 The first scalar.
* @param [in] nbits1 The number of bits in the first scalar.
* @param [in] table1 The first precomputed table.
* @param [in] scalar2 The second scalar.
* @param [in] nbits1 The number of bits in the second scalar.
* @param [in] table1 The second precomputed table.
*
* @retval MASK_SUCCESS Success.
* @retval MASK_FAILURE Failure, because eg the tables are too small.
*/
mask_t
linear_combo_combs_vt (
struct tw_extensible_t *out,
const word_t scalar1[448/WORD_BITS],
unsigned int nbits1,
const struct fixed_base_table_t *table1,
const word_t scalar2[448/WORD_BITS],
unsigned int nbits2,
const struct fixed_base_table_t *table2
);

#ifdef __cplusplus
};
#endif


+ 3
- 1
src/include/sha512.h View File

@@ -10,6 +10,8 @@
extern "C" {
#endif

#define SHA512_OUTPUT_BYTES 64

/**
* SHA512 hashing context.
*
@@ -37,7 +39,7 @@ sha512_update (
void
sha512_final (
struct sha512_ctx_t *ctx,
uint8_t result[64]
uint8_t result[SHA512_OUTPUT_BYTES]
);
#ifdef __cplusplus


+ 58
- 4
src/include/word.h View File

@@ -8,11 +8,22 @@
/* for posix_memalign */
#define _XOPEN_SOURCE 600

#ifndef __APPLE__
#define _BSD_SOURCE
#include <endian.h>
#endif

#include <stdint.h>
#include <stdlib.h>
#include <sys/types.h>
#include <inttypes.h>

#if __ARM_NEON__
#include <arm_neon.h>
#elif __SSE2__
#include <immintrin.h>
#endif

#if (__SIZEOF_INT128__ == 16 && __SIZEOF_SIZE_T__ == 8 && (__SIZEOF_LONG__==8 || __POINTER_WIDTH__==64) && !GOLDI_FORCE_32_BIT)
/* It's a 64-bit machine if:
* // limits.h thinks so
@@ -33,6 +44,7 @@ typedef __int128_t dsword_t;
#define PRIxWORD58 "%014" PRIx64
#define U64LE(x) x##ull
#define U58LE(x) x##ull
#define letohWORD letoh64
#else
typedef uint16_t hword_t;
typedef uint32_t word_t;
@@ -45,15 +57,19 @@ typedef int64_t dsword_t;
#define PRIxWORD58 "%07" PRIx32
#define U64LE(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
#define U58LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
#define letohWORD letoh32
#endif

#define WORD_BITS (sizeof(word_t) * 8)

/* TODO: vector width for procs like ARM; gcc support */
typedef word_t mask_t, vecmask_t __attribute__((ext_vector_type(4)));

typedef word_t mask_t;
static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -1;



#ifdef __ARM_NEON__
typedef uint32x4_t vecmask_t;
#else
/* FIXME this only works on clang */
typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
typedef int64_t int64x2_t __attribute__((ext_vector_type(2)));
@@ -61,8 +77,13 @@ typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
typedef int64_t int64x4_t __attribute__((ext_vector_type(4)));
typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
typedef int32_t int32x2_t __attribute__((ext_vector_type(2)));
typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
typedef int32_t int32x8_t __attribute__((ext_vector_type(8)));
/* TODO: vector width for procs like ARM; gcc support */
typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
#endif

#if __AVX2__
typedef uint32x8_t big_register_t;
@@ -82,11 +103,28 @@ typedef uint32_t big_register_t;
#endif


#if __AVX2__ || __SSE2__ || __ARM_NEON__
#ifdef __ARM_NEON__
static __inline__ big_register_t
br_set_to_mask(mask_t x) {
return vdupq_n_u32(x);
}
#else
static __inline__ big_register_t
br_set_to_mask(mask_t x) {
return (big_register_t)x;
}
#endif

#if __AVX2__ || __SSE2__
static __inline__ big_register_t
br_is_zero(big_register_t x) {
return (big_register_t)(x == (big_register_t)0);
}
#elif __ARM_NEON__
static __inline__ big_register_t
br_is_zero(big_register_t x) {
return vceqq_u32(x,x^x);
}
#else
static __inline__ mask_t
br_is_zero(word_t x) {
@@ -96,6 +134,22 @@ br_is_zero(word_t x) {




#ifdef __APPLE__
static inline uint64_t
htobe64 (uint64_t x) {
__asm__ ("bswapq %0" : "+r"(x));
return x;
}
static inline uint64_t
htole64 (uint64_t x) { return x; }

static inline uint64_t
letoh64 (uint64_t x) { return x; }
#endif



/**
* Allocate memory which is sufficiently aligned to be used for the
* largest vector on the system (for now that's a big_register_t).


+ 197
- 72
src/scalarmul.c View File

@@ -63,14 +63,14 @@ cond_negate_tw_pniels (
cond_negate_tw_niels(&n->n, doNegate);
}

void
static __inline__ void
constant_time_lookup_tw_pniels (
struct tw_pniels_t *out,
const struct tw_pniels_t *in,
int nin,
int idx
) {
big_register_t big_one = 1, big_i = idx;
big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx);
big_register_t *o = (big_register_t *)out;
const big_register_t *i = (const big_register_t *)in;
int j;
@@ -85,14 +85,14 @@ constant_time_lookup_tw_pniels (
}
}

static __inline__ void
static __inline__ void
constant_time_lookup_tw_niels (
struct tw_niels_t *out,
const struct tw_niels_t *in,
int nin,
int idx
) {
big_register_t big_one = 1, big_i = idx;
big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx);
big_register_t *o = (big_register_t *)out;
const big_register_t *i = (const big_register_t *)in;
int j;
@@ -139,64 +139,73 @@ scalarmul (
struct tw_extensible_t *working,
const word_t scalar[448/WORD_BITS]
) {

const int nbits=448; /* HACK? */
const int nbits=450; /* MAGIC */
word_t prepared_data[448*2/WORD_BITS] = {
U64LE(0x9595b847fdf73126),
U64LE(0x9bb9b8a856af5200),
U64LE(0xb3136e22f37d5c4f),
U64LE(0x0000000189a19442),
U64LE(0xebec9967f5d3f5c2),
U64LE(0x0aa09b49b16c9a02),
U64LE(0x7f6126aec172cd8e),
U64LE(0x00000007b027e54d),
U64LE(0x0000000000000000),
U64LE(0x0000000000000000),
U64LE(0x4000000000000000),
U64LE(0x721cf5b5529eec33),
U64LE(0x7a4cf635c8e9c2ab),
U64LE(0xeec492d944a725bf),
U64LE(0x000000020cd77058),
U64LE(0xc873d6d54a7bb0cf),
U64LE(0xe933d8d723a70aad),
U64LE(0xbb124b65129c96fd),
U64LE(0x00000008335dc163),
U64LE(0x0000000000000000),
U64LE(0x0000000000000000),
U64LE(0x0000000000000000)
}; /* TODO: split off */
}; /* MAGIC */
word_t scalar2[448/WORD_BITS];
convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS);
const int WINDOW = 5, /* MAGIC */
WINDOW_MASK = (1<<WINDOW)-1, WINDOW_T_MASK = WINDOW_MASK >> 1,
NTABLE = 1<<(WINDOW-1);

struct tw_extensible_t tabulator;
copy_tw_extensible(&tabulator, working);
double_tw_extensible(&tabulator);

struct tw_pniels_t pn, multiples[8];
struct tw_pniels_t pn, multiples[NTABLE];
convert_tw_extensible_to_tw_pniels(&pn, &tabulator);
convert_tw_extensible_to_tw_pniels(&multiples[0], working);

int i;
for (i=1; i<8; i++) {
int i,j;
for (i=1; i<NTABLE; i++) {
add_tw_pniels_to_tw_extensible(working, &pn);
convert_tw_extensible_to_tw_pniels(&multiples[i], working);
}

i = nbits - 4;
int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF,
inv = (bits>>3)-1;
i = nbits - WINDOW;
int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & WINDOW_MASK,
inv = (bits>>(WINDOW-1))-1;
bits ^= inv;
constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7);
constant_time_lookup_tw_pniels(&pn, multiples, NTABLE, bits & WINDOW_T_MASK);
cond_negate_tw_pniels(&pn, inv);
convert_tw_pniels_to_tw_extensible(working, &pn);

for (i-=4; i>=0; i-=4) {
double_tw_extensible(working);
double_tw_extensible(working);
double_tw_extensible(working);
double_tw_extensible(working);
for (i-=WINDOW; i>=0; i-=WINDOW) {
for (j=0; j<WINDOW; j++) {
double_tw_extensible(working);
}

bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF;
inv = (bits>>3)-1;
bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS);
if (i/WORD_BITS < 448/WORD_BITS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) {
bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS));
}
bits &= WINDOW_MASK;
inv = (bits>>(WINDOW-1))-1;
bits ^= inv;
constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7);
constant_time_lookup_tw_pniels(&pn, multiples, NTABLE, bits & WINDOW_T_MASK);
cond_negate_tw_pniels(&pn, inv);
add_tw_pniels_to_tw_extensible(working, &pn);
}
@@ -207,81 +216,89 @@ scalarmul_vlook (
struct tw_extensible_t *working,
const word_t scalar[448/WORD_BITS]
) {

const int nbits=448; /* HACK? */
const int nbits=450; /* HACK? */
word_t prepared_data[448*2/WORD_BITS] = {
U64LE(0x9595b847fdf73126),
U64LE(0x9bb9b8a856af5200),
U64LE(0xb3136e22f37d5c4f),
U64LE(0x0000000189a19442),
U64LE(0xebec9967f5d3f5c2),
U64LE(0x0aa09b49b16c9a02),
U64LE(0x7f6126aec172cd8e),
U64LE(0x00000007b027e54d),
U64LE(0x0000000000000000),
U64LE(0x0000000000000000),
U64LE(0x4000000000000000),
U64LE(0x721cf5b5529eec33),
U64LE(0x7a4cf635c8e9c2ab),
U64LE(0xeec492d944a725bf),
U64LE(0x000000020cd77058),
U64LE(0xc873d6d54a7bb0cf),
U64LE(0xe933d8d723a70aad),
U64LE(0xbb124b65129c96fd),
U64LE(0x00000008335dc163),
U64LE(0x0000000000000000),
U64LE(0x0000000000000000),
U64LE(0x0000000000000000)
}; /* TODO: split off */
}; /* MAGIC: split off */
word_t scalar2[448/WORD_BITS];
convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS);
const int WINDOW = 5, /* MAGIC */
WINDOW_MASK = (1<<WINDOW)-1, WINDOW_T_MASK = WINDOW_MASK >> 1,
NTABLE = 1<<(WINDOW-1);

struct tw_extensible_t tabulator;
copy_tw_extensible(&tabulator, working);
double_tw_extensible(&tabulator);

struct tw_pniels_t pn, multiples[8];
struct tw_pniels_t pn, multiples[NTABLE];
convert_tw_extensible_to_tw_pniels(&pn, &tabulator);
convert_tw_extensible_to_tw_pniels(&multiples[0], working);

int i;
for (i=1; i<8; i++) {
int i,j;
for (i=1; i<NTABLE; i++) {
add_tw_pniels_to_tw_extensible(working, &pn);
convert_tw_extensible_to_tw_pniels(&multiples[i], working);
}

i = nbits - 4;
int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF,
inv = (bits>>3)-1;
i = nbits - WINDOW;
int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & WINDOW_MASK,
inv = (bits>>(WINDOW-1))-1;
bits ^= inv;

copy_tw_pniels(&pn, &multiples[bits&7]);
copy_tw_pniels(&pn, &multiples[bits & WINDOW_T_MASK]);
cond_negate_tw_pniels(&pn, inv);
convert_tw_pniels_to_tw_extensible(working, &pn);

for (i-=4; i>=0; i-=4) {
double_tw_extensible(working);
double_tw_extensible(working);
double_tw_extensible(working);
double_tw_extensible(working);
for (i-=WINDOW; i>=0; i-=WINDOW) {
for (j=0; j<WINDOW; j++) {
double_tw_extensible(working);
}

bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF;
inv = (bits>>3)-1;
bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS);
if (i/WORD_BITS < 448/WORD_BITS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) {
bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS));
}
bits &= WINDOW_MASK;
inv = (bits>>(WINDOW-1))-1;
bits ^= inv;
copy_tw_pniels(&pn, &multiples[bits&7]);
copy_tw_pniels(&pn, &multiples[bits & WINDOW_T_MASK]);
cond_negate_tw_pniels(&pn, inv);
add_tw_pniels_to_tw_extensible(working, &pn);
}
}


mask_t
scalarmul_fixed_base (
struct tw_extensible_t *out,
const word_t scalar[448/WORD_BITS],
static mask_t
schedule_scalar_for_combs (
word_t *scalar2,
const word_t *scalar,
unsigned int nbits,
const struct fixed_base_table_t *table
) {
unsigned int i;
unsigned int n = table->n, t = table->t, s = table->s;
assert(n >= 1 && t >= 1 && s >= 1);
if (n*t*s < nbits) {
if (n*t*s < nbits || n < 1 || t < 1 || s < 1) {
return MASK_FAILURE;
}
@@ -289,10 +306,9 @@ scalarmul_fixed_base (
scalar2_words = scalar_words;
if (scalar2_words < 448 / WORD_BITS)
scalar2_words = 448 / WORD_BITS;
word_t scalar2[scalar2_words], scalar3[scalar2_words];
word_t scalar3[scalar2_words];
/* Copy scalar to scalar3, but clear its high bits (if there are any) */
unsigned int i,j,k;
for (i=0; i<scalar_words; i++) {
scalar3[i] = scalar[i];
}
@@ -309,6 +325,31 @@ scalarmul_fixed_base (
table->scalar_adjustments , 448 / WORD_BITS
);
return MASK_SUCCESS;
}

mask_t
scalarmul_fixed_base (
struct tw_extensible_t *out,
const word_t scalar[448/WORD_BITS],
unsigned int nbits,
const struct fixed_base_table_t *table
) {
unsigned int i,j,k;
unsigned int n = table->n, t = table->t, s = table->s;
unsigned int scalar2_words = (nbits + WORD_BITS - 1)/WORD_BITS;
if (scalar2_words < 448 / WORD_BITS) scalar2_words = 448 / WORD_BITS;
word_t scalar2[scalar2_words];

mask_t succ = schedule_scalar_for_combs(scalar2, scalar, nbits, table);
if (!succ) return MASK_FAILURE;
#ifdef __clang_analyzer__
assert(t >= 1);
#endif
struct tw_niels_t ni;
for (i=0; i<s; i++) {
@@ -345,6 +386,90 @@ scalarmul_fixed_base (
return MASK_SUCCESS;
}

mask_t
linear_combo_combs_vt (
struct tw_extensible_t *out,
const word_t scalar1[448/WORD_BITS],
unsigned int nbits1,
const struct fixed_base_table_t *table1,
const word_t scalar2[448/WORD_BITS],
unsigned int nbits2,
const struct fixed_base_table_t *table2
) {
unsigned int i,j,k,sc;
unsigned int s1 = table1->s, s2 = table2->s, smax = (s1 > s2) ? s1 : s2;
unsigned int scalar1b_words = (nbits1 + WORD_BITS - 1)/WORD_BITS;
if (scalar1b_words < 448 / WORD_BITS) scalar1b_words = 448 / WORD_BITS;
unsigned int scalar2b_words = (nbits2 + WORD_BITS - 1)/WORD_BITS;
if (scalar2b_words < 448 / WORD_BITS) scalar2b_words = 448 / WORD_BITS;
word_t scalar1b[scalar1b_words], scalar2b[scalar2b_words];

/* Schedule the scalars */
mask_t succ;
succ = schedule_scalar_for_combs(scalar1b, scalar1, nbits1, table1);
if (!succ) return MASK_FAILURE;
succ = schedule_scalar_for_combs(scalar2b, scalar2, nbits2, table2);
if (!succ) return MASK_FAILURE;

#ifdef __clang_analyzer__
assert(table1->t >= 1);
assert(table2->t >= 1);
#endif
struct tw_niels_t ni;
unsigned int swords[2] = {scalar1b_words, scalar2b_words}, started = 0;
word_t *scalars[2] = {scalar1b,scalar2b};
for (i=0; i<smax; i++) {
if (i) double_tw_extensible(out);
for (sc=0; sc<2; sc++) {
const struct fixed_base_table_t *table = sc ? table2 : table1;
int ii = i-smax+table->s;
if (ii < 0) continue;
assert(ii < (int)table->s);
for (j=0; j<table->n; j++) {
int tab = 0;

for (k=0; k<table->t; k++) {
unsigned int bit = (table->s-1-ii) + k*table->s + j*(table->s*table->t);
if (bit < swords[sc] * WORD_BITS) {
tab |= (scalars[sc][bit/WORD_BITS] >> (bit%WORD_BITS) & 1) << k;
}
}
mask_t invert = (tab>>(table->t-1))-1;
tab ^= invert;
tab &= (1<<(table->t-1)) - 1;
copy_tw_niels(&ni, &table->table[tab + (j<<(table->t-1))]);
cond_negate_tw_niels(&ni,invert);
if (started) {
add_tw_niels_to_tw_extensible(out, &ni);
} else {
convert_tw_niels_to_tw_extensible(out, &ni);
started = 1;
}
}
}
assert(started);
}
return MASK_SUCCESS;
}


mask_t
precompute_fixed_base (
struct fixed_base_table_t *out,
@@ -354,7 +479,7 @@ precompute_fixed_base (
unsigned int s,
struct tw_niels_t *prealloc
) {
if (s < 1 || t < 1 || n < 1 || n*t*s < 446) {
if (s < 1 || t < 1 || n < 1 || n*t*s < 446) { /* MAGIC */
memset(out, 0, sizeof(*out));
return 0;
}
@@ -402,7 +527,7 @@ precompute_fixed_base (
adjustment[(n*t*s) / WORD_BITS] += ((word_t)1) << ((n*t*s) % WORD_BITS);

/* FIXME: factor out somehow */
/* MAGIC: factor out somehow */
const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = {
U64LE(0xdc873d6d54a7bb0d),
U64LE(0xde933d8d723a70aa),
@@ -462,13 +587,13 @@ precompute_fixed_base (
/* Gray-code phase */
for (j=0;; j++) {
int gray = j ^ (j>>1);
int idx = ((i+1)<<(t-1))-1 ^ gray;
int idx = (((i+1)<<(t-1))-1) ^ gray;

convert_tw_extensible_to_tw_pniels(&pn_tmp, &start);
copy_tw_niels(&table[idx], &pn_tmp.n);
p448_copy(&zs[idx], &pn_tmp.z);
if (j >= (1<<(t-1)) - 1) break;
if (j >= (1u<<(t-1)) - 1) break;
int delta = (j+1) ^ ((j+1)>>1) ^ gray;

for (k=0; delta>1; k++)
@@ -777,7 +902,7 @@ linear_combo_var_fixed_vt(
const struct tw_niels_t *precmp,
unsigned int table_bits_pre
) {
const int table_bits_var = 3;
const int table_bits_var = 4;
struct smvt_control control_var[nbits_var/(table_bits_var+1)+3];
struct smvt_control control_pre[nbits_pre/(table_bits_pre+1)+3];


+ 1
- 13
src/sha512.c View File

@@ -2,12 +2,8 @@
* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#ifndef __APPLE__
#define _BSD_SOURCE
#include <endian.h>
#endif

#include "sha512.h"
#include "word.h"

#include <string.h>
#include <assert.h>
@@ -20,14 +16,6 @@ rotate_r (
return (x >> d) | (x << (64-d));
}

#ifdef __APPLE__
static inline uint64_t
htobe64 (uint64_t x) {
__asm__ ("bswapq %0" : "+r"(x));
return x;
}
#endif

static const uint64_t
sha512_init_state[8] = {
0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,


+ 68
- 27
test/bench.c View File

@@ -17,23 +17,28 @@
#include "goldilocks.h"
#include "sha512.h"

double now() {
static __inline__ void
ignore_result ( int result ) {
(void)result;
}

static double now() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + tv.tv_usec/1000000.0;
}

void p448_randomize( struct crandom_state_t *crand, struct p448_t *a ) {
static void p448_randomize( struct crandom_state_t *crand, struct p448_t *a ) {
crandom_generate(crand, (unsigned char *)a, sizeof(*a));
p448_strong_reduce(a);
}

void q448_randomize( struct crandom_state_t *crand, word_t sk[448/WORD_BITS] ) {
static void q448_randomize( struct crandom_state_t *crand, word_t sk[448/WORD_BITS] ) {
crandom_generate(crand, (unsigned char *)sk, 448/8);
}

void p448_print( const char *descr, const struct p448_t *a ) {
static void p448_print( const char *descr, const struct p448_t *a ) {
p448_t b;
p448_copy(&b, a);
p448_strong_reduce(&b);
@@ -45,17 +50,21 @@ void p448_print( const char *descr, const struct p448_t *a ) {
printf("\n");
}

void p448_print_full( const char *descr, const struct p448_t *a ) {
static void __attribute__((unused))
p448_print_full (
const char *descr,
const struct p448_t *a
) {
int j;
printf("%s = 0x", descr);
for (j=15; j>=0; j--) {
printf("%02" PRIxWORD "_" PRIxWORD58 " ",
a->limb[j]>>28, a->limb[j]&(1<<28)-1);
a->limb[j]>>28, a->limb[j]&((1<<28)-1));
}
printf("\n");
}

void q448_print( const char *descr, const word_t secret[448/WORD_BITS] ) {
static void q448_print( const char *descr, const word_t secret[448/WORD_BITS] ) {
int j;
printf("%s = 0x", descr);
for (j=448/WORD_BITS-1; j>=0; j--) {
@@ -295,7 +304,7 @@ int main(int argc, char **argv) {
when = now();
for (i=0; i<nbase/10; i++) {
(void)montgomery_ladder(&a,&b,sk,448,0);
ignore_result(montgomery_ladder(&a,&b,sk,448,0));
}
when = now() - when;
printf("full ladder: %5.1fµs\n", when * 1e6 / i);
@@ -310,11 +319,18 @@ int main(int argc, char **argv) {
when = now();
for (i=0; i<nbase/10; i++) {
scalarmul_vlook(&ext,sk);
untwist_and_double_and_serialize(&a,&ext);
}
when = now() - when;
printf("edwards svl: %5.1fµs\n", when * 1e6 / i);
when = now();
for (i=0; i<nbase/10; i++) {
scalarmul(&ext,sk);
untwist_and_double_and_serialize(&a,&ext);
}
when = now() - when;
printf("edwards smc: %5.1fµs\n", when * 1e6 / i);
when = now();
for (i=0; i<nbase/10; i++) {
q448_randomize(&crand, sk);
@@ -326,7 +342,7 @@ int main(int argc, char **argv) {
struct tw_niels_t wnaft[1<<6];
when = now();
for (i=0; i<nbase/10; i++) {
(void)precompute_fixed_base_wnaf(wnaft,&ext,6);
ignore_result(precompute_fixed_base_wnaf(wnaft,&ext,6));
}
when = now() - when;
printf("wnaf6 pre: %5.1fµs\n", when * 1e6 / i);
@@ -341,7 +357,7 @@ int main(int argc, char **argv) {
when = now();
for (i=0; i<nbase/10; i++) {
(void)precompute_fixed_base_wnaf(wnaft,&ext,4);
ignore_result(precompute_fixed_base_wnaf(wnaft,&ext,4));
}
when = now() - when;
printf("wnaf4 pre: %5.1fµs\n", when * 1e6 / i);
@@ -356,7 +372,7 @@ int main(int argc, char **argv) {

when = now();
for (i=0; i<nbase/10; i++) {
(void)precompute_fixed_base_wnaf(wnaft,&ext,5);
ignore_result(precompute_fixed_base_wnaf(wnaft,&ext,5));
}
when = now() - when;
printf("wnaf5 pre: %5.1fµs\n", when * 1e6 / i);
@@ -401,7 +417,7 @@ int main(int argc, char **argv) {
when = now();
for (i=0; i<nbase/10; i++) {
if (i) destroy_fixed_base(&t_5_5_18);
(void)precompute_fixed_base(&t_5_5_18, &ext, 5, 5, 18, NULL);
ignore_result(precompute_fixed_base(&t_5_5_18, &ext, 5, 5, 18, NULL));
}
when = now() - when;
printf("pre(5,5,18): %5.1fµs\n", when * 1e6 / i);
@@ -409,7 +425,7 @@ int main(int argc, char **argv) {
when = now();
for (i=0; i<nbase/10; i++) {
if (i) destroy_fixed_base(&t_3_5_30);
(void)precompute_fixed_base(&t_3_5_30, &ext, 3, 5, 30, NULL);
ignore_result(precompute_fixed_base(&t_3_5_30, &ext, 3, 5, 30, NULL));
}
when = now() - when;
printf("pre(3,5,30): %5.1fµs\n", when * 1e6 / i);
@@ -417,7 +433,7 @@ int main(int argc, char **argv) {
when = now();
for (i=0; i<nbase/10; i++) {
if (i) destroy_fixed_base(&t_5_3_30);
(void)precompute_fixed_base(&t_5_3_30, &ext, 5, 3, 30, NULL);
ignore_result(precompute_fixed_base(&t_5_3_30, &ext, 5, 3, 30, NULL));
}
when = now() - when;
printf("pre(5,3,30): %5.1fµs\n", when * 1e6 / i);
@@ -425,15 +441,15 @@ int main(int argc, char **argv) {
when = now();
for (i=0; i<nbase/10; i++) {
if (i) destroy_fixed_base(&t_15_3_10);
(void)precompute_fixed_base(&t_15_3_10, &ext, 15, 3, 10, NULL);
ignore_result(precompute_fixed_base(&t_15_3_10, &ext, 15, 3, 10, NULL));
}
when = now() - when;
printf("pre(15,3,10): %5.1fµs\n", when * 1e6 / i);
printf("pre(15,3,10):%5.1fµs\n", when * 1e6 / i);
when = now();
for (i=0; i<nbase/10; i++) {
if (i) destroy_fixed_base(&t_8_4_14);
(void)precompute_fixed_base(&t_8_4_14, &ext, 8, 4, 14, NULL);
ignore_result(precompute_fixed_base(&t_8_4_14, &ext, 8, 4, 14, NULL));
}
when = now() - when;
printf("pre(8,4,14): %5.1fµs\n", when * 1e6 / i);
@@ -471,7 +487,7 @@ int main(int argc, char **argv) {
scalarmul_fixed_base(&ext, sk, 448, &t_15_3_10);
}
when = now() - when;
printf("com(15,3,10): %5.1fµs\n", when * 1e6 / i);
printf("com(15,3,10):%5.1fµs\n", when * 1e6 / i);
printf("\nGoldilocks:\n");
@@ -494,7 +510,7 @@ int main(int argc, char **argv) {
printf("keygen: %5.1fµs\n", when * 1e6 / i);
uint8_t ss1[64],ss2[64];
int gres1,gres2;
int gres1=0,gres2=0;
when = now();
for (i=0; i<nbase; i++) {
if (i&1) {
@@ -540,18 +556,43 @@ int main(int argc, char **argv) {
when = now();
for (i=0; i<nbase; i++) {
res = goldilocks_verify(sout,(const unsigned char *)message,message_len,&gpk);
(void)res;
int ver = goldilocks_verify(sout,(const unsigned char *)message,message_len,&gpk);
assert(!ver);
}
when = now() - when;
printf("verify: %5.1fµs\n", when * 1e6 / i);
struct goldilocks_precomputed_public_key_t *pre = NULL;
when = now();
for (i=0; i<nbase; i++) {
goldilocks_destroy_precomputed_public_key(pre);
pre = goldilocks_precompute_public_key(&gpk);
}
when = now() - when;
printf("precompute: %5.1fµs\n", when * 1e6 / i);
when = now();
for (i=0; i<nbase; i++) {
int ver = goldilocks_verify_precomputed(sout,(const unsigned char *)message,message_len,pre);
assert(!ver);
}
when = now() - when;
printf("verify pre: %5.1fµs\n", when * 1e6 / i);
when = now();
for (i=0; i<nbase; i++) {
int ret = goldilocks_shared_secret_precomputed(ss1,&gsk,pre);
assert(!ret);
}
when = now() - when;
printf("ecdh pre: %5.1fµs\n", when * 1e6 / i);
printf("\nTesting...\n");
int failures=0, successes = 0;
for (i=0; i<nbase/10; i++) {
(void)goldilocks_keygen(&gsk,&gpk);
ignore_result(goldilocks_keygen(&gsk,&gpk));
goldilocks_sign(sout,(const unsigned char *)message,message_len,&gsk);
res = goldilocks_verify(sout,(const unsigned char *)message,message_len,&gpk);
if (res) failures++;
@@ -574,9 +615,9 @@ int main(int argc, char **argv) {
y = (hword_t)y;
word_t z=x*y;
(void)montgomery_ladder(&b,&a,&x,WORD_BITS,0);
(void)montgomery_ladder(&c,&b,&y,WORD_BITS,0);
(void)montgomery_ladder(&b,&a,&z,WORD_BITS,0);
ignore_result(montgomery_ladder(&b,&a,&x,WORD_BITS,0));
ignore_result(montgomery_ladder(&c,&b,&y,WORD_BITS,0));
ignore_result(montgomery_ladder(&b,&a,&z,WORD_BITS,0));
p448_sub(&d,&b,&c);
p448_bias(&d,2);
@@ -655,7 +696,7 @@ int main(int argc, char **argv) {
untwist_and_double(&exta,&exv);
serialize_extensible(&b, &exta);

(void)precompute_fixed_base_wnaf(wnaft,&exu,5);
ignore_result(precompute_fixed_base_wnaf(wnaft,&exu,5));
linear_combo_var_fixed_vt(&ext,sk,448,tk,448,wnaft,5);
untwist_and_double(&exta,&exv);
serialize_extensible(&c, &exta);


+ 14
- 4
test/test.c View File

@@ -6,7 +6,7 @@

int failed_tests, n_tests, failed_this_test, running_a_test;

void end_test() {
static void end_test() {
if (!failed_this_test) {
printf("[PASS]\n");
}
@@ -14,7 +14,7 @@ void end_test() {
running_a_test = 0;
}

void begin_test(const char *name) {
static void begin_test(const char *name) {
if (running_a_test) end_test();
printf("%s...%*s",name,(int)(30-strlen(name)),"");
fflush(stdout);
@@ -110,8 +110,9 @@ int main(int argc, char **argv) {
(void) argv;
n_tests = running_a_test = failed_tests = 0;
begin_test("SHA-512 NIST Monte Carlo");
test_sha512_monte_carlo();

begin_test("Arithmetic");
test_arithmetic();

begin_test("EC point operations");
test_pointops();
@@ -122,6 +123,15 @@ int main(int argc, char **argv) {
begin_test("Scalarmul commutativity");
test_scalarmul_commutativity();
begin_test("Linear combo");
test_linear_combo();
begin_test("SHA-512 NIST Monte Carlo");
test_sha512_monte_carlo();
begin_test("Goldilocks complete system");
test_goldilocks();
if (running_a_test) end_test();
printf("\n");
if (failed_tests) {


+ 6
- 0
test/test.h View File

@@ -33,10 +33,16 @@ void youfail();

int test_sha512_monte_carlo();

int test_linear_combo ();

int test_scalarmul_compatibility ();

int test_scalarmul_commutativity ();

int test_arithmetic ();

int test_goldilocks ();

int test_pointops ();

#endif // __GOLDILOCKS_TEST_H__

+ 196
- 0
test/test_arithmetic.c View File

@@ -0,0 +1,196 @@
#include "p448.h"
#include "test.h"
#include <gmp.h>
#include <string.h>
#include <stdio.h>

mpz_t mp_p448;

static mask_t mpz_to_p448 (
struct p448_t *out,
const mpz_t in
) {
uint8_t ser[56];
mpz_t modded;
memset(ser,0,sizeof(ser));
mpz_init(modded);
mpz_mod(modded, in, mp_p448);
mpz_export(ser, NULL, -1, 1, -1, 0, modded);
mask_t succ = p448_deserialize(out, ser);
return succ;
}

static mask_t p448_assert_eq_gmp(
const char *descr,
const struct p448_t *x,
const mpz_t y,
float lowBound,
float highBound
) {
uint8_t xser[56], yser[56];
mpz_t modded;
memset(yser,0,sizeof(yser));
p448_serialize(xser, x);
mpz_init(modded);
mpz_mod(modded, y, mp_p448);
mpz_export(yser, NULL, -1, 1, -1, 0, modded);
unsigned int i;
for (i=0; i<sizeof(*x)/sizeof(x->limb[0]); i++) {
int bits = sizeof(x->limb[0]) * 448 / sizeof(*x);
word_t yardstick = (i==sizeof(*x)/sizeof(x->limb[0])/2) ?
(1ull<<bits) - 2 : (1ull<<bits) - 1;
if (x->limb[i] < yardstick * lowBound || x->limb[i] > yardstick * highBound) {
youfail();
printf(" P448 limb %d -> " PRIxWORDfull " is out of bounds (%0.2f, %0.2f) for test %s (yardstick = " PRIxWORDfull ")\n",
i, x->limb[i], lowBound, highBound, descr, yardstick);
break;
}
}
if (memcmp(xser,yser,56)) {
youfail();
printf(" Failed arithmetic test %s\n", descr);
p448_print(" p448", x);
printf(" gmp = 0x");
int j;
for (j=55; j>=0; j--) {
printf("%02x", yser[j]);
}
printf("\n");
return MASK_FAILURE;
}
mpz_clear(modded);
return MASK_SUCCESS;
}

static mask_t test_add_sub (
const mpz_t x,
const mpz_t y,
word_t word
) {
struct p448_t xx,yy,tt;
mpz_t t;
mask_t succ = MASK_SUCCESS;
succ = mpz_to_p448(&xx,x);
succ &= mpz_to_p448(&yy,y);
mpz_init(t);
p448_add(&tt,&xx,&yy);
mpz_add(t,x,y);
succ &= p448_assert_eq_gmp("add",&tt,t,0,2.1);
p448_sub(&tt,&xx,&yy);
p448_bias(&tt,2);
mpz_sub(t,x,y);
succ &= p448_assert_eq_gmp("sub",&tt,t,0,3.1);
p448_copy(&tt,&xx);
p448_addw(&tt,word);
mpz_add_ui(t,x,word);
succ &= p448_assert_eq_gmp("addw",&tt,t,0,2.1);
p448_copy(&tt,&xx);
p448_subw(&tt,word);
p448_bias(&tt,1);
mpz_sub_ui(t,x,word);
succ &= p448_assert_eq_gmp("subw",&tt,t,0,2.1);
if (!succ) {
p448_print(" x", &xx);
p448_print(" y", &yy);
}
mpz_clear(t);
return succ;
}

static mask_t test_mul_sqr (
const mpz_t x,
const mpz_t y,
word_t word
) {
struct p448_t xx,yy,tt;
mpz_t t;
mask_t succ = MASK_SUCCESS;
succ = mpz_to_p448(&xx,x);
succ &= mpz_to_p448(&yy,y);
mpz_init(t);
p448_mul(&tt,&xx,&yy);
mpz_mul(t,x,y);
succ &= p448_assert_eq_gmp("mul",&tt,t,0,1.1);
p448_mulw(&tt,&xx,word);
mpz_mul_ui(t,x,word);
succ &= p448_assert_eq_gmp("mulw",&tt,t,0,1.1);
p448_sqr(&tt,&xx);
mpz_mul(t,x,x);
succ &= p448_assert_eq_gmp("sqrx",&tt,t,0,1.1);
p448_sqr(&tt,&yy);
mpz_mul(t,y,y);
succ &= p448_assert_eq_gmp("sqy",&tt,t,0,1.1);
if (!succ) {
p448_print(" x", &xx);
p448_print(" y", &yy);
}
mpz_clear(t);
return succ;
}

int test_arithmetic () {
int j, ntests = 100000;
gmp_randstate_t state;
gmp_randinit_mt(state);
uint8_t pser[56];
for (j=0; j<56; j++) {
pser[j] = (j==28) ? 0xFE : 0xFF;
}
mpz_init(mp_p448);
mpz_import(mp_p448, 56, -1, 1, -1, 0, pser);
mpz_t x,y;
mpz_init(x);
mpz_init(y);
mask_t succ = MASK_SUCCESS;
int bits = sizeof(word_t) * 448 / sizeof(p448_t);
for (j=0; j<ntests; j++) {
if (j&1) {
mpz_rrandomb(x, state, 448);
mpz_rrandomb(y, state, 448);
} else {
mpz_urandomb(x, state, 448);
mpz_urandomb(y, state, 448);
}
word_t word = gmp_urandomm_ui (state, 1ull<<bits);
succ &= test_add_sub(x,y,word);
succ &= test_mul_sqr(x,y,word);
// TODO: test neg, cond_neg, set_ui, wrd, srd, inv, ...?
}
mpz_clear(x);
mpz_clear(y);
mpz_clear(mp_p448);
gmp_randclear(state);
return succ ? 0 : 1;
}


+ 195
- 0
test/test_goldilocks.c View File

@@ -0,0 +1,195 @@
#include "test.h"
#include "goldilocks.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int test_goldilocks () {
const char *message1 = "hello world";
const char *message2 = "Jello world";
unsigned char signature[GOLDI_SIGNATURE_BYTES];
unsigned char
ss12[GOLDI_SHARED_SECRET_BYTES],
ss21[GOLDI_SHARED_SECRET_BYTES],
ss21p[GOLDI_SHARED_SECRET_BYTES],
proto[GOLDI_SYMKEY_BYTES];
struct goldilocks_public_key_t pub, pub2;
struct goldilocks_private_key_t priv, priv2;
struct goldilocks_precomputed_public_key_t *pre = NULL;
int i, ret, good = 1;
ret = goldilocks_init();
if (ret) {
youfail();
printf(" Failed init.\n");
}
for (i=0; i<1000 && good; i++) {
ret = goldilocks_keygen(&priv, &pub);
if (ret) {
youfail();
printf(" Failed keygen trial %d.\n", i);
good = 0;
}
goldilocks_destroy_precomputed_public_key( pre );
pre = goldilocks_precompute_public_key ( &pub );
if (!pre) {
youfail();
printf(" Failed precomp-public trial %d.\n", i);
return -1;
}
ret = goldilocks_sign(
signature,
(const unsigned char *)message1,
strlen(message1),
&priv
);
if (ret) {
youfail();
printf(" Failed sign trial %d.\n", i);
good = 0;
}
ret = goldilocks_verify(
signature,
(const unsigned char *)message1,
strlen(message1),
&pub
);
if (ret) {
youfail();
printf(" Failed verify trial %d.\n", i);
good = 0;
}

ret = goldilocks_verify_precomputed (
signature,
(const unsigned char *)message1,
strlen(message1),
pre
);
if (ret) {
youfail();
printf(" Failed verify-pre trial %d.\n", i);
good = 0;
}
/* terrible negative test */
ret = goldilocks_verify(
signature,
(const unsigned char *)message2,
strlen(message1),
&pub
);
if (ret != GOLDI_EINVAL) {
youfail();
printf(" Failed nega-verify trial %d.\n", i);
good = 0;
}
ret = goldilocks_verify_precomputed(
signature,
(const unsigned char *)message2,
strlen(message1),
pre
);
if (ret != GOLDI_EINVAL) {
youfail();
printf(" Failed nega-verify-pre trial %d.\n", i);
good = 0;
}
/* honestly a slightly better negative test */
memset(signature,0,sizeof(signature));
ret = goldilocks_verify(
signature,
(const unsigned char *)message1,
strlen(message1),
&pub
);
if (ret != GOLDI_EINVAL) {
youfail();
printf(" Failed nega-verify-0 trial %d.\n", i);
good = 0;
}
ret = goldilocks_verify_precomputed(
signature,
(const unsigned char *)message1,
strlen(message1),
pre
);
if (ret != GOLDI_EINVAL) {
youfail();
printf(" Failed nega-verify-pre-0 trial %d.\n", i);
good = 0;
}
/* ecdh */
ret = goldilocks_keygen(&priv2, &pub2);
if (ret) {
youfail();
printf(" Failed keygen2 trial %d.\n", i);
good = 0;
}
ret = goldilocks_shared_secret ( ss12, &priv, &pub2 );
if (ret) {
youfail();
printf(" Failed ss12 trial %d.\n", i);
good = 0;
}
ret = goldilocks_shared_secret ( ss21, &priv2, &pub );
if (ret) {
youfail();
printf(" Failed ss21 trial %d.\n", i);
good = 0;
}
ret = goldilocks_shared_secret_precomputed ( ss21p, &priv2, pre );
if (ret) {
youfail();
printf(" Failed ss21p trial %d.\n", i);
good = 0;
}
if (memcmp(ss12,ss21,sizeof(ss12))) {
youfail();
printf(" Failed shared-secret trial %d.\n", i);
good = 0;
}
if (memcmp(ss21,ss21p,sizeof(ss21))) {
youfail();
printf(" Failed shared-secret precomp trial %d.\n", i);
good = 0;
}
/* test derive / underive / priv to pub */
goldilocks_underive_private_key ( proto, &priv );
ret = goldilocks_derive_private_key ( &priv2, proto );
if (ret || memcmp(&priv,&priv2,sizeof(priv))) {
youfail();
printf(" Failed derive round-trip trial %d.\n", i);
good = 0;
}
ret = goldilocks_private_to_public ( &pub2, &priv );
if (ret || memcmp(&pub,&pub2,sizeof(pub))) {
youfail();
printf(" Failed private-to-public trial %d.\n", i);
good = 0;
}
}
goldilocks_destroy_precomputed_public_key( pre );
return good ? 0 : -1;
}

+ 129
- 0
test/test_scalarmul.c View File

@@ -159,6 +159,92 @@ single_scalarmul_compatibility_test (
return ret;
}

static int
single_linear_combo_test (
const struct p448_t *base1,
const word_t *scalar1,
int nbits1,
const struct p448_t *base2,
const word_t *scalar2,
int nbits2
) {
/* MAGIC */
const struct p448_t
sqrt_d_minus_1 = {{
U58LE(0xd2e21836749f46),
U58LE(0x888db42b4f0179),
U58LE(0x5a189aabdeea38),
U58LE(0x51e65ca6f14c06),
U58LE(0xa49f7b424d9770),
U58LE(0xdcac4628c5f656),
U58LE(0x49443b8748734a),
U58LE(0x12fec0c0b25b7a)
}};
struct tw_extensible_t text1, text2, working;
struct tw_pniels_t pn;
struct p448_t result_comb, result_combo, result_wnaf;
mask_t succ =
deserialize_and_twist_approx(&text1, &sqrt_d_minus_1, base1)
& deserialize_and_twist_approx(&text2, &sqrt_d_minus_1, base2);
if (!succ) return 1;
struct fixed_base_table_t t1, t2;
struct tw_niels_t wnaf[32];
memset(&t1,0,sizeof(t1));
memset(&t2,0,sizeof(t2));
succ = precompute_fixed_base(&t1, &text1, 5, 5, 18, NULL);
succ &= precompute_fixed_base(&t2, &text2, 6, 3, 25, NULL);
succ &= precompute_fixed_base_wnaf(wnaf, &text2, 5);
if (!succ) {
destroy_fixed_base(&t1);
destroy_fixed_base(&t2);
return -1;
}
/* use the dedicated wNAF linear combo algorithm */
copy_tw_extensible(&working, &text1);
linear_combo_var_fixed_vt(&working, scalar1, nbits1, scalar2, nbits2, wnaf, 5);
untwist_and_double_and_serialize(&result_wnaf, &working);
/* use the dedicated combs algorithm */
succ &= linear_combo_combs_vt(&working, scalar1, nbits1, &t1, scalar2, nbits2, &t2);
untwist_and_double_and_serialize(&result_combo, &working);
/* use two combs */
succ &= scalarmul_fixed_base(&working, scalar1, nbits1, &t1);
convert_tw_extensible_to_tw_pniels(&pn, &working);
succ &= scalarmul_fixed_base(&working, scalar2, nbits2, &t2);
add_tw_pniels_to_tw_extensible(&working, &pn);
untwist_and_double_and_serialize(&result_comb, &working);
mask_t consistent = MASK_SUCCESS;
consistent &= p448_eq(&result_combo, &result_wnaf);
consistent &= p448_eq(&result_comb, &result_wnaf);
if (!succ || !consistent) {
youfail();
printf(" Failed linear combo consistency test with nbits=%d,%d.\n",nbits1,nbits2);

p448_print(" base1", base1);
scalar_print(" scal1", scalar1, (nbits1+WORD_BITS-1)/WORD_BITS);
p448_print(" base2", base2);
scalar_print(" scal2", scalar2, (nbits1+WORD_BITS-1)/WORD_BITS);
p448_print(" combs", &result_comb);
p448_print(" combo", &result_combo);
p448_print(" wNAFs", &result_wnaf);
return -1;
}
destroy_fixed_base(&t1);
destroy_fixed_base(&t2);
return 0;
}

/* 0 = succeed, 1 = inval, -1 = fail */
static int
single_scalarmul_commutativity_test (
@@ -251,6 +337,49 @@ int test_scalarmul_commutativity () {
return 0;
}

int test_linear_combo () {
int i,j,k,got;
struct crandom_state_t crand;
crandom_init_from_buffer(&crand, "scalarmul_linear_combos_test RNG");
for (i=0; i<=448; i+=7) {
for (j=0; j<=448; j+=7) {
got = 0;
for (k=0; k<128 && !got; k++) {
uint8_t ser[56];
word_t scalar1[7], scalar2[7];
crandom_generate(&crand, (uint8_t *)scalar1, sizeof(scalar1));
crandom_generate(&crand, (uint8_t *)scalar2, sizeof(scalar2));
p448_t base1;
crandom_generate(&crand, ser, sizeof(ser));
mask_t succ = p448_deserialize(&base1, ser);
if (!succ) continue;
p448_t base2;
crandom_generate(&crand, ser, sizeof(ser));
succ = p448_deserialize(&base2, ser);
if (!succ) continue;
int ret = single_linear_combo_test (&base1, scalar1, i, &base2, scalar2, j);
got = !ret;
if (ret == -1) return -1;
}

if (!got) {
youfail();
printf(" Unlikely: rejected 128 scalars in a row.\n");
return -1;
}
}
}
return 0;
}

int test_scalarmul_compatibility () {
int i,j,k,got;


Loading…
Cancel
Save