(you knew this would happen). Added ARM NEON support. Added support for precomputation on public keys, which speeds up later signatures and ECDH calls. See history.txt or the doc for details. Reworked internals so that private keys can be derived from any 32-byte secret random value. This also means that secret keys can be "compressed" for cold storage. Added more tests. Running the tests now requires GMP, though Goldilocks itself does not. Linking now uses visibility instead of exported.sym.master
@@ -1,3 +1,46 @@ | |||
May 3, 2104: | |||
Minor changes to internal routines mean that this version is not | |||
compatible with the previous one. | |||
Added ARM NEON code. | |||
Added the ability to precompute multiples of a partner's public key. This | |||
takes slightly longer than a signature verification, but reduces future | |||
verifications with the precomputed key by ~63% and ECDH by ~70%. | |||
goldilocks_precompute_public_key | |||
goldilocks_destroy_precomputed_public_key | |||
goldilocks_verify_precomputed | |||
goldilocks_shared_secret_precomputed | |||
The precomputation feature are is protected by a macro | |||
GOLDI_IMPLEMENT_PRECOMPUTED_KEYS | |||
which can be #defined to 0 to compile these functions out. Unlike most | |||
of Goldilocks' functions, goldilocks_precompute_public_key uses malloc() | |||
(and goldilocks_destroy_precomputed_public_key uses free()). | |||
Changed private keys to be derived from just the symmetric part. This | |||
means that you can compress them to 32 bytes for cold storage, or derive | |||
keypairs from crypto secrets from other systems. | |||
goldilocks_derive_private_key | |||
goldilocks_underive_private_key | |||
goldilocks_private_to_public | |||
Fixed a number of bugs related to vector alignment on Sandy Bridge, which | |||
has AVX but uses SSE2 alignment (because it doesn't have AVX2). Maybe I | |||
should just switch it to use AVX2 alignment? | |||
Beginning to factor out curve-specific magic, so as to build other curves | |||
with the Goldilocks framework. That would enable fair tests against eg | |||
E-521, Ed25519 etc. Still would be a lot of work. | |||
More thorough testing of arithmetic. Now uses GMP for testing framework, | |||
but not in the actual library. | |||
Added some high-level tests for the whole library, including some (bs) | |||
negative testing. Obviously, effective negative testing is a very difficult | |||
proposition in a crypto library. | |||
March 29, 2014: | |||
Added a test directory with various tests. Currently testing SHA512 Monte | |||
Carlo, compatibility of the different scalarmul functions, and some | |||
@@ -1,23 +1,57 @@ | |||
# Copyright (c) 2014 Cryptography Research, Inc. | |||
# Released under the MIT License. See LICENSE.txt for license information. | |||
UNAME := $(shell uname) | |||
MACHINE := $(shell uname -m) | |||
ifeq ($(UNAME),Darwin) | |||
CC = clang | |||
LD = clang | |||
else | |||
CC = gcc | |||
endif | |||
LD = $(CC) | |||
ifneq (,$(findstring x86_64,$(MACHINE))) | |||
ARCH ?= arch_x86_64 | |||
else | |||
# no i386 port yet | |||
ARCH ?= arch_arm_32 | |||
endif | |||
ARCH = arch_x86_64 | |||
WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ | |||
-Wgcc-compat -Wmissing-declarations | |||
-Wmissing-declarations -Wunused-function $(EXWARN) | |||
INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH) | |||
LANGFLAGS = -std=c99 | |||
GENFLAGS = -ffunction-sections -fdata-sections -fomit-frame-pointer -fPIC | |||
GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC | |||
OFLAGS = -O3 | |||
#XFLAGS = -DN_TESTS_BASE=1000 | |||
ARCHFLAGS = -mssse3 -maes -mavx2 -DMUST_HAVE_AVX2 -mbmi2 | |||
#ARCHFLAGS = -m32 -mcpu=cortex-a9 -mfpu=vfpv3-d16 | |||
CFLAGS = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XFLAGS) | |||
LDFLAGS = $(ARCHFLAGS) | |||
ifneq (,$(findstring arm,$(MACHINE))) | |||
ifneq (,$(findstring neon,$(ARCH))) | |||
ARCHFLAGS += -mfpu=neon | |||
else | |||
ARCHFLAGS += -mfpu=vfpv3-d16 | |||
endif | |||
ARCHFLAGS += -mcpu=cortex-a9 # FIXME | |||
GENFLAGS = -DN_TESTS_BASE=1000 # sooooo sloooooow | |||
else | |||
ARCHFLAGS += -mssse3 -maes -mavx -mavx2 -DMUST_HAVE_AVX2 -mbmi2 #TODO | |||
endif | |||
ifeq ($(CC),clang) | |||
WARNFLAGS += -Wgcc-compat | |||
endif | |||
ifeq (,$(findstring 64,$(ARCH))$(findstring gcc,$(CC))) | |||
# ARCHFLAGS += -m32 | |||
ARCHFLAGS += -DGOLDI_FORCE_32_BIT=1 | |||
endif | |||
CFLAGS = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XCFLAGS) | |||
LDFLAGS = $(ARCHFLAGS) $(XLDFLAGS) | |||
ASFLAGS = $(ARCHFLAGS) | |||
.PHONY: clean all test bench todo doc lib | |||
@@ -29,7 +63,7 @@ LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \ | |||
build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o | |||
TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \ | |||
build/test_pointops.o | |||
build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o | |||
BENCHCOMPONENTS=build/bench.o | |||
@@ -45,15 +79,20 @@ build/bench: $(LIBCOMPONENTS) $(BENCHCOMPONENTS) | |||
$(LD) $(LDFLAGS) -o $@ $^ | |||
build/test: $(LIBCOMPONENTS) $(TESTCOMPONENTS) | |||
$(LD) $(LDFLAGS) -o $@ $^ | |||
$(LD) $(LDFLAGS) -o $@ $^ -lgmp | |||
lib: build/goldilocks.so | |||
build/goldilocks.so: $(LIBCOMPONENTS) | |||
rm -f $@ | |||
ifeq ($(UNAME),Darwin) | |||
libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \ | |||
-exported_symbols_list src/exported.sym \ | |||
$(LIBCOMPONENTS) | |||
else | |||
$(LD) -shared -Wl,-soname,goldilocks.so.1 -Wl,--gc-sections -o $@ $(LIBCOMPONENTS) | |||
strip --discard-all $@ | |||
ln -sf $@ build/goldilocks.so.1 | |||
endif | |||
build/timestamp: | |||
mkdir -p build | |||
@@ -80,9 +119,9 @@ doc: Doxyfile doc/timestamp src/*.c src/include/*.h src/$(ARCH)/*.c src/$(ARCH)/ | |||
todo:: | |||
@(find * -name '*.h'; find * -name '*.c') | xargs egrep --color=auto -w \ | |||
'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE' | |||
'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE|MAGIC' | |||
@echo '=============================' | |||
@(for i in FIXME BUG XXX TODO HACK PERF FUTURE REMOVE; do \ | |||
@(for i in FIXME BUG XXX TODO HACK PERF FUTURE REMOVE MAGIC; do \ | |||
(find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i > /dev/null || continue; \ | |||
/bin/echo -n $$i' ' | head -c 10; \ | |||
(find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i| wc -l; \ | |||
@@ -90,7 +129,7 @@ todo:: | |||
@echo '=============================' | |||
@echo -n 'Total ' | |||
@(find * -name '*.h'; find * -name '*.c') | xargs egrep -w \ | |||
'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE' | wc -l | |||
'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE|MAGIC' | wc -l | |||
bench: build/bench | |||
./$< | |||
@@ -12,13 +12,42 @@ | |||
#include <stdint.h> | |||
#ifndef GOLDI_IMPLEMENT_PRECOMPUTED_KEYS | |||
/** If nonzero, implement precomputation for verify and ECDH. */ | |||
#define GOLDI_IMPLEMENT_PRECOMPUTED_KEYS 1 | |||
#endif | |||
/** The size of the Goldilocks field, in bits. */ | |||
#define GOLDI_FIELD_BITS 448 | |||
/** The size of the Goldilocks scalars, in bits. */ | |||
#define GOLDI_SCALAR_BITS 446 | |||
/** The same size, in bytes. */ | |||
#define GOLDI_FIELD_BYTES (GOLDI_FIELD_BITS/8) | |||
/** The size of a Goldilocks public key, in bytes. */ | |||
#define GOLDI_PUBLIC_KEY_BYTES GOLDI_FIELD_BYTES | |||
/** The extra bytes in a Goldilocks private key for the symmetric key. */ | |||
#define GOLDI_SYMKEY_BYTES 32 | |||
/** The size of a shared secret. */ | |||
#define GOLDI_SHARED_SECRET_BYTES 64 | |||
/** The size of a Goldilocks private key, in bytes. */ | |||
#define GOLDI_PRIVATE_KEY_BYTES (2*GOLDI_FIELD_BYTES + GOLDI_SYMKEY_BYTES) | |||
/** The size of a Goldilocks private key, in bytes. */ | |||
#define GOLDI_SIGNATURE_BYTES (2*GOLDI_FIELD_BYTES) | |||
/** | |||
* @brief Serialized form of a Goldilocks public key. | |||
* | |||
* @warning This isn't even my final form! | |||
*/ | |||
struct goldilocks_public_key_t { | |||
uint8_t opaque[56]; /**< Serialized data. */ | |||
uint8_t opaque[GOLDI_PUBLIC_KEY_BYTES]; /**< Serialized data. */ | |||
}; | |||
/** | |||
@@ -30,7 +59,7 @@ struct goldilocks_public_key_t { | |||
* @warning This isn't even my final form! | |||
*/ | |||
struct goldilocks_private_key_t { | |||
uint8_t opaque[144]; /**< Serialized data. */ | |||
uint8_t opaque[GOLDI_PRIVATE_KEY_BYTES]; /**< Serialized data. */ | |||
}; | |||
#ifdef __cplusplus | |||
@@ -72,7 +101,7 @@ static const int GOLDI_EALREADYINIT = 44805; | |||
*/ | |||
int | |||
goldilocks_init () | |||
__attribute__((warn_unused_result)); | |||
__attribute__((warn_unused_result,visibility ("default"))); | |||
/** | |||
@@ -90,7 +119,40 @@ int | |||
goldilocks_keygen ( | |||
struct goldilocks_private_key_t *privkey, | |||
struct goldilocks_public_key_t *pubkey | |||
) __attribute__((warn_unused_result,nonnull(1,2))); | |||
) __attribute__((warn_unused_result,nonnull(1,2),visibility ("default"))); | |||
/** | |||
* @brief Derive a key from its compressed form. | |||
* @param [out] privkey The derived private key. | |||
* @param [in] proto The compressed or proto-key, which must be 32 random bytes. | |||
* | |||
* @warning This isn't even my final form! | |||
* | |||
* @retval GOLDI_EOK Success. | |||
* @retval GOLDI_EUNINIT You must call goldilocks_init() first. | |||
*/ | |||
int | |||
goldilocks_derive_private_key ( | |||
struct goldilocks_private_key_t *privkey, | |||
const unsigned char proto[GOLDI_SYMKEY_BYTES] | |||
) __attribute__((nonnull(1,2),visibility ("default"))); | |||
/** | |||
* @brief Compress a private key (by copying out the proto-key) | |||
* @param [out] proto The proto-key. | |||
* @param [in] privkey The private key. | |||
* | |||
* @warning This isn't even my final form! | |||
* @todo test. | |||
* | |||
* @retval GOLDI_EOK Success. | |||
* @retval GOLDI_EUNINIT You must call goldilocks_init() first. | |||
*/ | |||
void | |||
goldilocks_underive_private_key ( | |||
unsigned char proto[GOLDI_SYMKEY_BYTES], | |||
const struct goldilocks_private_key_t *privkey | |||
) __attribute__((nonnull(1,2),visibility ("default"))); | |||
/** | |||
* @brief Extract the public key from a private key. | |||
@@ -107,7 +169,7 @@ int | |||
goldilocks_private_to_public ( | |||
struct goldilocks_public_key_t *pubkey, | |||
const struct goldilocks_private_key_t *privkey | |||
) __attribute__((nonnull(1,2))); | |||
) __attribute__((nonnull(1,2),visibility ("default"))); | |||
/** | |||
* @brief Generate a Diffie-Hellman shared secret in constant time. | |||
@@ -140,10 +202,10 @@ goldilocks_private_to_public ( | |||
*/ | |||
int | |||
goldilocks_shared_secret ( | |||
uint8_t shared[64], | |||
uint8_t shared[GOLDI_SHARED_SECRET_BYTES], | |||
const struct goldilocks_private_key_t *my_privkey, | |||
const struct goldilocks_public_key_t *your_pubkey | |||
) __attribute__((warn_unused_result,nonnull(1,2,3))); | |||
) __attribute__((warn_unused_result,nonnull(1,2,3),visibility ("default"))); | |||
/** | |||
* @brief Sign a message. | |||
@@ -166,11 +228,11 @@ goldilocks_shared_secret ( | |||
*/ | |||
int | |||
goldilocks_sign ( | |||
uint8_t signature_out[56*2], | |||
uint8_t signature_out[GOLDI_SIGNATURE_BYTES], | |||
const uint8_t *message, | |||
uint64_t message_len, | |||
const struct goldilocks_private_key_t *privkey | |||
) __attribute__((nonnull(1,2,4))); | |||
) __attribute__((nonnull(1,2,4),visibility ("default"))); | |||
/** | |||
* @brief Verify a signature. | |||
@@ -197,11 +259,108 @@ goldilocks_sign ( | |||
*/ | |||
int | |||
goldilocks_verify ( | |||
const uint8_t signature[56*2], | |||
const uint8_t signature[GOLDI_SIGNATURE_BYTES], | |||
const uint8_t *message, | |||
uint64_t message_len, | |||
const struct goldilocks_public_key_t *pubkey | |||
) __attribute__((warn_unused_result,nonnull(1,2,4))); | |||
) __attribute__((warn_unused_result,nonnull(1,2,4),visibility ("default"))); | |||
#if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS | |||
/** A public key which has been expanded by precomputation for higher speed. */ | |||
struct goldilocks_precomputed_public_key_t; | |||
/** | |||
* @brief Expand a public key by precomputation. | |||
* | |||
* @todo Give actual error returns, instead of ambiguous NULL. | |||
* | |||
* @warning This isn't even my final form! | |||
* | |||
* @param [in] pub The public key. | |||
* @retval NULL We ran out of memory, or the | |||
*/ | |||
struct goldilocks_precomputed_public_key_t * | |||
goldilocks_precompute_public_key ( | |||
const struct goldilocks_public_key_t *pub | |||
) __attribute__((warn_unused_result,nonnull(1),visibility ("default"))); | |||
/** | |||
* @brief Overwrite an expanded public key with zeros, then destroy it. | |||
* | |||
* If the input is NULL, this function does nothing. | |||
* | |||
* @param [in] precom The public key. | |||
*/ | |||
void | |||
goldilocks_destroy_precomputed_public_key ( | |||
struct goldilocks_precomputed_public_key_t *precom | |||
) __attribute__((visibility ("default"))); | |||
/** | |||
* @brief Verify a signature. | |||
* | |||
* This function is fairly strict. It will correctly detect when | |||
* the signature has the wrong cofactor component, or when the sig | |||
* values aren't less than p or q. | |||
* | |||
* @warning This isn't even my final form! | |||
* | |||
* @param [in] signature The signature. | |||
* @param [in] message The message to be verified. | |||
* @param [in] message_len The length of the message to be verified. | |||
* @param [in] pubkey The signer's public key, expanded by precomputation. | |||
* | |||
* @retval GOLDI_EOK Success. | |||
* @retval GOLDI_EINVAL The public key or signature is corrupt. | |||
* @retval GOLDI_EUNINIT You must call goldilocks_init() first. | |||
*/ | |||
int | |||
goldilocks_verify_precomputed ( | |||
const uint8_t signature[GOLDI_SIGNATURE_BYTES], | |||
const uint8_t *message, | |||
uint64_t message_len, | |||
const struct goldilocks_precomputed_public_key_t *pubkey | |||
) __attribute__((warn_unused_result,nonnull(1,2,4),visibility ("default"))); | |||
/** | |||
* @brief Generate a Diffie-Hellman shared secret in constant time. | |||
* Uses a precomputation on the other party's public key for efficiency. | |||
* | |||
* This function uses some compile-time flags whose merit remains to | |||
* be decided. | |||
* | |||
* If the flag EXPERIMENT_ECDH_OBLITERATE_CT is set, prepend 40 bytes | |||
* of zeros to the secret before hashing. In the case that the other | |||
* party's key is detectably corrupt, instead the symmetric part | |||
* of the secret key is used to produce a pseudorandom value. | |||
* | |||
* If EXPERIMENT_ECDH_STIR_IN_PUBKEYS is set, the sum and product of | |||
* the two parties' public keys is prepended to the hash. | |||
* | |||
* In the current version, this function can safely be run even without | |||
* goldilocks_init(). But this property is not guaranteed for future | |||
* versions, so call it anyway. | |||
* | |||
* @warning This isn't even my final form! | |||
* | |||
* @param [out] shared The shared secret established with the other party. | |||
* @param [in] my_privkey My private key. | |||
* @param [in] your_pubkey The other party's precomputed public key. | |||
* | |||
* @retval GOLDI_EOK Success. | |||
* @retval GOLDI_ECORRUPT My key is corrupt. | |||
* @retval GOLDI_EINVAL The other party's key is corrupt. | |||
* @retval GOLDI_EUNINIT You must call goldilocks_init() first. | |||
*/ | |||
int | |||
goldilocks_shared_secret_precomputed ( | |||
uint8_t shared[GOLDI_SHARED_SECRET_BYTES], | |||
const struct goldilocks_private_key_t *my_privkey, | |||
const struct goldilocks_precomputed_public_key_t *your_pubkey | |||
) __attribute__((warn_unused_result,nonnull(1,2,3),visibility ("default"))); | |||
#endif /* GOLDI_IMPLEMENT_PRECOMPUTED_KEYS */ | |||
#ifdef __cplusplus | |||
}; /* extern "C" */ | |||
@@ -28,6 +28,8 @@ smlal ( | |||
const uint32_t a, | |||
const uint32_t b | |||
) { | |||
#ifdef __ARMEL__ | |||
uint32_t lo = *acc, hi = (*acc)>>32; | |||
__asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" | |||
@@ -35,6 +37,9 @@ smlal ( | |||
: [a]"r"(a), [b]"r"(b)); | |||
*acc = lo + (((uint64_t)hi)<<32); | |||
#else | |||
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b; | |||
#endif | |||
} | |||
static inline void __attribute__((gnu_inline,always_inline)) | |||
@@ -43,6 +48,7 @@ smlal2 ( | |||
const uint32_t a, | |||
const uint32_t b | |||
) { | |||
#ifdef __ARMEL__ | |||
uint32_t lo = *acc, hi = (*acc)>>32; | |||
__asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" | |||
@@ -50,6 +56,9 @@ smlal2 ( | |||
: [a]"r"(a), [b]"r"(2*b)); | |||
*acc = lo + (((uint64_t)hi)<<32); | |||
#else | |||
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2); | |||
#endif | |||
} | |||
static inline void __attribute__((gnu_inline,always_inline)) | |||
@@ -58,6 +67,7 @@ smull ( | |||
const uint32_t a, | |||
const uint32_t b | |||
) { | |||
#ifdef __ARMEL__ | |||
uint32_t lo, hi; | |||
__asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]" | |||
@@ -65,6 +75,9 @@ smull ( | |||
: [a]"r"(a), [b]"r"(b)); | |||
*acc = lo + (((uint64_t)hi)<<32); | |||
#else | |||
*acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b; | |||
#endif | |||
} | |||
static inline void __attribute__((gnu_inline,always_inline)) | |||
@@ -73,6 +86,7 @@ smull2 ( | |||
const uint32_t a, | |||
const uint32_t b | |||
) { | |||
#ifdef __ARMEL__ | |||
uint32_t lo, hi; | |||
__asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]" | |||
@@ -80,6 +94,9 @@ smull2 ( | |||
: [a]"r"(a), [b]"r"(2*b)); | |||
*acc = lo + (((uint64_t)hi)<<32); | |||
#else | |||
*acc = (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2); | |||
#endif | |||
} | |||
void | |||
@@ -760,13 +777,13 @@ p448_mulw ( | |||
const p448_t *as, | |||
uint64_t b | |||
) { | |||
const uint32_t bhi = b>>28, blo = b & (1<<28)-1; | |||
uint32_t mask = (1ull<<28)-1; | |||
const uint32_t bhi = b>>28, blo = b & mask; | |||
const uint32_t *a = as->limb; | |||
uint32_t *c = cs->limb; | |||
uint64_t accum0, accum8; | |||
uint32_t mask = (1ull<<28)-1; | |||
int i; | |||
@@ -957,7 +974,7 @@ p448_deserialize ( | |||
for (j=0; j<7; j++) { | |||
out |= ((uint64_t)serial[7*i+j])<<(8*j); | |||
} | |||
x->limb[2*i] = out & (1ull<<28)-1; | |||
x->limb[2*i] = out & ((1ull<<28)-1); | |||
x->limb[2*i+1] = out >> 28; | |||
} | |||
@@ -173,7 +173,7 @@ p448_set_ui ( | |||
uint64_t x | |||
) { | |||
int i; | |||
out->limb[0] = x & (1<<28)-1; | |||
out->limb[0] = x & ((1<<28)-1); | |||
out->limb[1] = x>>28; | |||
for (i=2; i<16; i++) { | |||
out->limb[i] = 0; | |||
@@ -188,7 +188,11 @@ p448_cond_swap ( | |||
) { | |||
big_register_t *aa = (big_register_t*)a; | |||
big_register_t *bb = (big_register_t*)b; | |||
#if __ARM_NEON__ | |||
big_register_t m = vdupq_n_u32(doswap); | |||
#else | |||
big_register_t m = doswap; | |||
#endif | |||
unsigned int i; | |||
for (i=0; i<sizeof(*a)/sizeof(*aa); i++) { | |||
@@ -260,8 +264,12 @@ p448_cond_neg( | |||
struct p448_t negated; | |||
big_register_t *aa = (big_register_t *)a; | |||
big_register_t *nn = (big_register_t*)&negated; | |||
#if __ARM_NEON__ | |||
big_register_t m = vdupq_n_u32(doNegate); | |||
#else | |||
big_register_t m = doNegate; | |||
#endif | |||
p448_neg(&negated, a); | |||
p448_bias(&negated, 2); | |||
@@ -0,0 +1,959 @@ | |||
/** | |||
* @cond internal | |||
* @file ec_point.c | |||
* @copyright | |||
* Copyright (c) 2014 Cryptography Research, Inc. \n | |||
* Released under the MIT License. See LICENSE.txt for license information. | |||
* @author Mike Hamburg | |||
* @warning This file was automatically generated. | |||
*/ | |||
#include "ec_point.h" | |||
void | |||
p448_isr ( | |||
struct p448_t* a, | |||
const struct p448_t* x | |||
) { | |||
struct p448_t L0, L1, L2; | |||
p448_sqr ( &L1, x ); | |||
p448_mul ( &L2, x, &L1 ); | |||
p448_sqr ( &L1, &L2 ); | |||
p448_mul ( &L2, x, &L1 ); | |||
p448_sqrn ( &L1, &L2, 3 ); | |||
p448_mul ( &L0, &L2, &L1 ); | |||
p448_sqrn ( &L1, &L0, 3 ); | |||
p448_mul ( &L0, &L2, &L1 ); | |||
p448_sqrn ( &L2, &L0, 9 ); | |||
p448_mul ( &L1, &L0, &L2 ); | |||
p448_sqr ( &L0, &L1 ); | |||
p448_mul ( &L2, x, &L0 ); | |||
p448_sqrn ( &L0, &L2, 18 ); | |||
p448_mul ( &L2, &L1, &L0 ); | |||
p448_sqrn ( &L0, &L2, 37 ); | |||
p448_mul ( &L1, &L2, &L0 ); | |||
p448_sqrn ( &L0, &L1, 37 ); | |||
p448_mul ( &L1, &L2, &L0 ); | |||
p448_sqrn ( &L0, &L1, 111 ); | |||
p448_mul ( &L2, &L1, &L0 ); | |||
p448_sqr ( &L0, &L2 ); | |||
p448_mul ( &L1, x, &L0 ); | |||
p448_sqrn ( &L0, &L1, 223 ); | |||
p448_mul ( a, &L2, &L0 ); | |||
} | |||
void | |||
p448_inverse ( | |||
struct p448_t* a, | |||
const struct p448_t* x | |||
) { | |||
struct p448_t L0, L1; | |||
p448_isr ( &L0, x ); | |||
p448_sqr ( &L1, &L0 ); | |||
p448_sqr ( &L0, &L1 ); | |||
p448_mul ( a, x, &L0 ); | |||
} | |||
void | |||
add_tw_niels_to_tw_extensible ( | |||
struct tw_extensible_t* d, | |||
const struct tw_niels_t* e | |||
) { | |||
struct p448_t L0, L1; | |||
p448_sub ( &L1, &d->y, &d->x ); | |||
p448_bias ( &L1, 2 ); | |||
p448_weak_reduce( &L1 ); | |||
p448_mul ( &L0, &e->a, &L1 ); | |||
p448_add ( &L1, &d->x, &d->y ); | |||
p448_mul ( &d->y, &e->b, &L1 ); | |||
p448_mul ( &L1, &d->u, &d->t ); | |||
p448_mul ( &d->x, &e->c, &L1 ); | |||
p448_add ( &d->u, &L0, &d->y ); | |||
p448_sub ( &d->t, &d->y, &L0 ); | |||
p448_bias ( &d->t, 2 ); | |||
p448_weak_reduce( &d->t ); | |||
p448_sub ( &d->y, &d->z, &d->x ); | |||
p448_bias ( &d->y, 2 ); | |||
p448_weak_reduce( &d->y ); | |||
p448_add ( &L0, &d->x, &d->z ); | |||
p448_mul ( &d->z, &L0, &d->y ); | |||
p448_mul ( &d->x, &d->y, &d->t ); | |||
p448_mul ( &d->y, &L0, &d->u ); | |||
} | |||
void | |||
sub_tw_niels_from_tw_extensible ( | |||
struct tw_extensible_t* d, | |||
const struct tw_niels_t* e | |||
) { | |||
struct p448_t L0, L1; | |||
p448_sub ( &L1, &d->y, &d->x ); | |||
p448_bias ( &L1, 2 ); | |||
p448_weak_reduce( &L1 ); | |||
p448_mul ( &L0, &e->b, &L1 ); | |||
p448_add ( &L1, &d->x, &d->y ); | |||
p448_mul ( &d->y, &e->a, &L1 ); | |||
p448_mul ( &L1, &d->u, &d->t ); | |||
p448_mul ( &d->x, &e->c, &L1 ); | |||
p448_add ( &d->u, &L0, &d->y ); | |||
p448_sub ( &d->t, &d->y, &L0 ); | |||
p448_bias ( &d->t, 2 ); | |||
p448_weak_reduce( &d->t ); | |||
p448_add ( &d->y, &d->x, &d->z ); | |||
p448_sub ( &L0, &d->z, &d->x ); | |||
p448_bias ( &L0, 2 ); | |||
p448_weak_reduce( &L0 ); | |||
p448_mul ( &d->z, &L0, &d->y ); | |||
p448_mul ( &d->x, &d->y, &d->t ); | |||
p448_mul ( &d->y, &L0, &d->u ); | |||
} | |||
void | |||
add_tw_pniels_to_tw_extensible ( | |||
struct tw_extensible_t* e, | |||
const struct tw_pniels_t* a | |||
) { | |||
struct p448_t L0; | |||
p448_mul ( &L0, &e->z, &a->z ); | |||
p448_copy ( &e->z, &L0 ); | |||
add_tw_niels_to_tw_extensible( e, &a->n ); | |||
} | |||
void | |||
sub_tw_pniels_from_tw_extensible ( | |||
struct tw_extensible_t* e, | |||
const struct tw_pniels_t* a | |||
) { | |||
struct p448_t L0; | |||
p448_mul ( &L0, &e->z, &a->z ); | |||
p448_copy ( &e->z, &L0 ); | |||
sub_tw_niels_from_tw_extensible( e, &a->n ); | |||
} | |||
void | |||
double_tw_extensible ( | |||
struct tw_extensible_t* a | |||
) { | |||
struct p448_t L0, L1, L2; | |||
p448_sqr ( &L2, &a->x ); | |||
p448_sqr ( &L0, &a->y ); | |||
p448_add ( &a->u, &L2, &L0 ); | |||
p448_add ( &a->t, &a->y, &a->x ); | |||
p448_sqr ( &L1, &a->t ); | |||
p448_sub ( &a->t, &L1, &a->u ); | |||
p448_bias ( &a->t, 3 ); | |||
p448_weak_reduce( &a->t ); | |||
p448_sub ( &L1, &L0, &L2 ); | |||
p448_bias ( &L1, 2 ); | |||
p448_weak_reduce( &L1 ); | |||
p448_sqr ( &a->x, &a->z ); | |||
p448_bias ( &a->x, 1 ); | |||
p448_add ( &a->z, &a->x, &a->x ); | |||
p448_sub ( &L0, &a->z, &L1 ); | |||
p448_weak_reduce( &L0 ); | |||
p448_mul ( &a->z, &L1, &L0 ); | |||
p448_mul ( &a->x, &L0, &a->t ); | |||
p448_mul ( &a->y, &L1, &a->u ); | |||
} | |||
void | |||
double_extensible ( | |||
struct extensible_t* a | |||
) { | |||
struct p448_t L0, L1, L2; | |||
p448_sqr ( &L2, &a->x ); | |||
p448_sqr ( &L0, &a->y ); | |||
p448_add ( &L1, &L2, &L0 ); | |||
p448_add ( &a->t, &a->y, &a->x ); | |||
p448_sqr ( &a->u, &a->t ); | |||
p448_sub ( &a->t, &a->u, &L1 ); | |||
p448_bias ( &a->t, 3 ); | |||
p448_weak_reduce( &a->t ); | |||
p448_sub ( &a->u, &L0, &L2 ); | |||
p448_bias ( &a->u, 2 ); | |||
p448_weak_reduce( &a->u ); | |||
p448_sqr ( &a->x, &a->z ); | |||
p448_bias ( &a->x, 2 ); | |||
p448_add ( &a->z, &a->x, &a->x ); | |||
p448_sub ( &L0, &a->z, &L1 ); | |||
p448_weak_reduce( &L0 ); | |||
p448_mul ( &a->z, &L1, &L0 ); | |||
p448_mul ( &a->x, &L0, &a->t ); | |||
p448_mul ( &a->y, &L1, &a->u ); | |||
} | |||
void | |||
twist_and_double ( | |||
struct tw_extensible_t* b, | |||
const struct extensible_t* a | |||
) { | |||
struct p448_t L0; | |||
p448_sqr ( &b->x, &a->x ); | |||
p448_sqr ( &b->z, &a->y ); | |||
p448_add ( &b->u, &b->x, &b->z ); | |||
p448_add ( &b->t, &a->y, &a->x ); | |||
p448_sqr ( &L0, &b->t ); | |||
p448_sub ( &b->t, &L0, &b->u ); | |||
p448_bias ( &b->t, 3 ); | |||
p448_weak_reduce( &b->t ); | |||
p448_sub ( &L0, &b->z, &b->x ); | |||
p448_bias ( &L0, 2 ); | |||
p448_weak_reduce( &L0 ); | |||
p448_sqr ( &b->x, &a->z ); | |||
p448_bias ( &b->x, 2 ); | |||
p448_add ( &b->z, &b->x, &b->x ); | |||
p448_sub ( &b->y, &b->z, &b->u ); | |||
p448_weak_reduce( &b->y ); | |||
p448_mul ( &b->z, &L0, &b->y ); | |||
p448_mul ( &b->x, &b->y, &b->t ); | |||
p448_mul ( &b->y, &L0, &b->u ); | |||
} | |||
void | |||
untwist_and_double ( | |||
struct extensible_t* b, | |||
const struct tw_extensible_t* a | |||
) { | |||
struct p448_t L0; | |||
p448_sqr ( &b->x, &a->x ); | |||
p448_sqr ( &b->z, &a->y ); | |||
p448_add ( &L0, &b->x, &b->z ); | |||
p448_add ( &b->t, &a->y, &a->x ); | |||
p448_sqr ( &b->u, &b->t ); | |||
p448_sub ( &b->t, &b->u, &L0 ); | |||
p448_bias ( &b->t, 3 ); | |||
p448_weak_reduce( &b->t ); | |||
p448_sub ( &b->u, &b->z, &b->x ); | |||
p448_bias ( &b->u, 2 ); | |||
p448_weak_reduce( &b->u ); | |||
p448_sqr ( &b->x, &a->z ); | |||
p448_bias ( &b->x, 1 ); | |||
p448_add ( &b->z, &b->x, &b->x ); | |||
p448_sub ( &b->y, &b->z, &b->u ); | |||
p448_weak_reduce( &b->y ); | |||
p448_mul ( &b->z, &L0, &b->y ); | |||
p448_mul ( &b->x, &b->y, &b->t ); | |||
p448_mul ( &b->y, &L0, &b->u ); | |||
} | |||
void | |||
convert_tw_affine_to_tw_pniels ( | |||
struct tw_pniels_t* b, | |||
const struct tw_affine_t* a | |||
) { | |||
p448_sub ( &b->n.a, &a->y, &a->x ); | |||
p448_bias ( &b->n.a, 2 ); | |||
p448_weak_reduce( &b->n.a ); | |||
p448_add ( &b->n.b, &a->x, &a->y ); | |||
p448_weak_reduce( &b->n.b ); | |||
p448_mul ( &b->n.c, &a->y, &a->x ); | |||
p448_mulw ( &b->z, &b->n.c, 78164 ); | |||
p448_neg ( &b->n.c, &b->z ); | |||
p448_bias ( &b->n.c, 2 ); | |||
p448_weak_reduce( &b->n.c ); | |||
p448_set_ui( &b->z, 2 ); | |||
} | |||
void | |||
convert_tw_affine_to_tw_extensible ( | |||
struct tw_extensible_t* b, | |||
const struct tw_affine_t* a | |||
) { | |||
p448_copy ( &b->x, &a->x ); | |||
p448_copy ( &b->y, &a->y ); | |||
p448_set_ui( &b->z, 1 ); | |||
p448_copy ( &b->t, &a->x ); | |||
p448_copy ( &b->u, &a->y ); | |||
} | |||
void | |||
convert_affine_to_extensible ( | |||
struct extensible_t* b, | |||
const struct affine_t* a | |||
) { | |||
p448_copy ( &b->x, &a->x ); | |||
p448_copy ( &b->y, &a->y ); | |||
p448_set_ui( &b->z, 1 ); | |||
p448_copy ( &b->t, &a->x ); | |||
p448_copy ( &b->u, &a->y ); | |||
} | |||
void | |||
convert_tw_extensible_to_tw_pniels ( | |||
struct tw_pniels_t* b, | |||
const struct tw_extensible_t* a | |||
) { | |||
p448_sub ( &b->n.a, &a->y, &a->x ); | |||
p448_bias ( &b->n.a, 2 ); | |||
p448_weak_reduce( &b->n.a ); | |||
p448_add ( &b->n.b, &a->x, &a->y ); | |||
p448_weak_reduce( &b->n.b ); | |||
p448_mul ( &b->n.c, &a->u, &a->t ); | |||
p448_mulw ( &b->z, &b->n.c, 78164 ); | |||
p448_neg ( &b->n.c, &b->z ); | |||
p448_bias ( &b->n.c, 2 ); | |||
p448_weak_reduce( &b->n.c ); | |||
p448_add ( &b->z, &a->z, &a->z ); | |||
p448_weak_reduce( &b->z ); | |||
} | |||
void | |||
convert_tw_pniels_to_tw_extensible ( | |||
struct tw_extensible_t* e, | |||
const struct tw_pniels_t* d | |||
) { | |||
p448_add ( &e->u, &d->n.b, &d->n.a ); | |||
p448_sub ( &e->t, &d->n.b, &d->n.a ); | |||
p448_bias ( &e->t, 2 ); | |||
p448_weak_reduce( &e->t ); | |||
p448_mul ( &e->x, &d->z, &e->t ); | |||
p448_mul ( &e->y, &d->z, &e->u ); | |||
p448_sqr ( &e->z, &d->z ); | |||
} | |||
void | |||
convert_tw_niels_to_tw_extensible ( | |||
struct tw_extensible_t* e, | |||
const struct tw_niels_t* d | |||
) { | |||
p448_add ( &e->y, &d->b, &d->a ); | |||
p448_weak_reduce( &e->y ); | |||
p448_sub ( &e->x, &d->b, &d->a ); | |||
p448_bias ( &e->x, 2 ); | |||
p448_weak_reduce( &e->x ); | |||
p448_set_ui( &e->z, 1 ); | |||
p448_copy ( &e->t, &e->x ); | |||
p448_copy ( &e->u, &e->y ); | |||
} | |||
void | |||
montgomery_step ( | |||
struct montgomery_t* a | |||
) { | |||
struct p448_t L0, L1; | |||
p448_add ( &L0, &a->zd, &a->xd ); | |||
p448_sub ( &L1, &a->xd, &a->zd ); | |||
p448_bias ( &L1, 2 ); | |||
p448_weak_reduce( &L1 ); | |||
p448_sub ( &a->zd, &a->xa, &a->za ); | |||
p448_bias ( &a->zd, 2 ); | |||
p448_weak_reduce( &a->zd ); | |||
p448_mul ( &a->xd, &L0, &a->zd ); | |||
p448_add ( &a->zd, &a->za, &a->xa ); | |||
p448_mul ( &a->za, &L1, &a->zd ); | |||
p448_add ( &a->xa, &a->za, &a->xd ); | |||
p448_sqr ( &a->zd, &a->xa ); | |||
p448_mul ( &a->xa, &a->z0, &a->zd ); | |||
p448_sub ( &a->zd, &a->xd, &a->za ); | |||
p448_bias ( &a->zd, 2 ); | |||
p448_weak_reduce( &a->zd ); | |||
p448_sqr ( &a->za, &a->zd ); | |||
p448_sqr ( &a->xd, &L0 ); | |||
p448_sqr ( &L0, &L1 ); | |||
p448_mulw ( &a->zd, &a->xd, 39082 ); | |||
p448_sub ( &L1, &a->xd, &L0 ); | |||
p448_bias ( &L1, 2 ); | |||
p448_weak_reduce( &L1 ); | |||
p448_mul ( &a->xd, &L0, &a->zd ); | |||
p448_sub ( &L0, &a->zd, &L1 ); | |||
p448_bias ( &L0, 2 ); | |||
p448_weak_reduce( &L0 ); | |||
p448_mul ( &a->zd, &L0, &L1 ); | |||
} | |||
void | |||
deserialize_montgomery ( | |||
struct montgomery_t* a, | |||
const struct p448_t* sbz | |||
) { | |||
p448_sqr ( &a->z0, sbz ); | |||
p448_set_ui( &a->xd, 1 ); | |||
p448_set_ui( &a->zd, 0 ); | |||
p448_set_ui( &a->xa, 1 ); | |||
p448_copy ( &a->za, &a->z0 ); | |||
} | |||
mask_t | |||
serialize_montgomery ( | |||
struct p448_t* b, | |||
const struct montgomery_t* a, | |||
const struct p448_t* sbz | |||
) { | |||
mask_t L0, L1, L2; | |||
struct p448_t L3, L4, L5, L6; | |||
p448_mul ( &L6, &a->z0, &a->zd ); | |||
p448_sub ( &L4, &L6, &a->xd ); | |||
p448_bias ( &L4, 2 ); | |||
p448_weak_reduce( &L4 ); | |||
p448_mul ( &L6, &a->za, &L4 ); | |||
p448_mul ( &L5, &a->z0, &a->xd ); | |||
p448_sub ( &L4, &L5, &a->zd ); | |||
p448_bias ( &L4, 2 ); | |||
p448_weak_reduce( &L4 ); | |||
p448_mul ( &L3, &a->xa, &L4 ); | |||
p448_add ( &L5, &L3, &L6 ); | |||
p448_sub ( &L4, &L6, &L3 ); | |||
p448_bias ( &L4, 2 ); | |||
p448_weak_reduce( &L4 ); | |||
p448_mul ( &L6, &L4, &L5 ); | |||
p448_copy ( &L5, &a->z0 ); | |||
p448_addw ( &L5, 1 ); | |||
p448_sqr ( &L4, &L5 ); | |||
p448_mulw ( &L5, &L4, 39082 ); | |||
p448_neg ( &L4, &L5 ); | |||
p448_add ( &L5, &a->z0, &a->z0 ); | |||
p448_bias ( &L5, 1 ); | |||
p448_add ( &L3, &L5, &L5 ); | |||
p448_add ( &L5, &L3, &L4 ); | |||
p448_weak_reduce( &L5 ); | |||
p448_mul ( &L3, &a->xd, &L5 ); | |||
L1 = p448_is_zero( &a->zd ); | |||
L2 = - L1; | |||
p448_mask ( &L4, &L3, L1 ); | |||
p448_add ( &L5, &L4, &a->zd ); | |||
L0 = ~ L1; | |||
p448_mul ( &L4, sbz, &L6 ); | |||
p448_addw ( &L4, L2 ); | |||
p448_mul ( &L6, &L5, &L4 ); | |||
p448_mul ( &L4, &L6, &L5 ); | |||
p448_mul ( &L5, &L6, &a->xd ); | |||
p448_mul ( &L6, &L4, &L5 ); | |||
p448_isr ( &L3, &L6 ); | |||
p448_mul ( &L5, &L4, &L3 ); | |||
p448_sqr ( &L4, &L3 ); | |||
p448_mul ( &L3, &L6, &L4 ); | |||
p448_mask ( b, &L5, L0 ); | |||
p448_subw ( &L3, 1 ); | |||
p448_bias ( &L3, 1 ); | |||
L1 = p448_is_zero( &L3 ); | |||
L0 = p448_is_zero( sbz ); | |||
return L1 | L0; | |||
} | |||
void | |||
serialize_extensible ( | |||
struct p448_t* b, | |||
const struct extensible_t* a | |||
) { | |||
struct p448_t L0, L1, L2; | |||
p448_sub ( &L0, &a->y, &a->z ); | |||
p448_bias ( &L0, 2 ); | |||
p448_weak_reduce( &L0 ); | |||
p448_add ( b, &a->z, &a->y ); | |||
p448_mul ( &L1, &a->z, &a->x ); | |||
p448_mul ( &L2, &L0, &L1 ); | |||
p448_mul ( &L1, &L2, &L0 ); | |||
p448_mul ( &L0, &L2, b ); | |||
p448_mul ( &L2, &L1, &L0 ); | |||
p448_isr ( &L0, &L2 ); | |||
p448_mul ( b, &L1, &L0 ); | |||
p448_sqr ( &L1, &L0 ); | |||
p448_mul ( &L0, &L2, &L1 ); | |||
} | |||
void | |||
untwist_and_double_and_serialize ( | |||
struct p448_t* b, | |||
const struct tw_extensible_t* a | |||
) { | |||
struct p448_t L0, L1, L2, L3; | |||
p448_mul ( &L3, &a->y, &a->x ); | |||
p448_add ( b, &a->y, &a->x ); | |||
p448_sqr ( &L1, b ); | |||
p448_add ( &L2, &L3, &L3 ); | |||
p448_sub ( b, &L1, &L2 ); | |||
p448_bias ( b, 3 ); | |||
p448_weak_reduce( b ); | |||
p448_sqr ( &L2, &a->z ); | |||
p448_sqr ( &L1, &L2 ); | |||
p448_add ( &L2, b, b ); | |||
p448_mulw ( b, &L2, 39082 ); | |||
p448_neg ( &L2, b ); | |||
p448_bias ( &L2, 2 ); | |||
p448_mulw ( &L0, &L2, 39082 ); | |||
p448_neg ( b, &L0 ); | |||
p448_bias ( b, 2 ); | |||
p448_mul ( &L0, &L2, &L1 ); | |||
p448_mul ( &L2, b, &L0 ); | |||
p448_isr ( &L0, &L2 ); | |||
p448_mul ( &L1, b, &L0 ); | |||
p448_sqr ( b, &L0 ); | |||
p448_mul ( &L0, &L2, b ); | |||
p448_mul ( b, &L1, &L3 ); | |||
} | |||
void | |||
twist_even ( | |||
struct tw_extensible_t* b, | |||
const struct extensible_t* a | |||
) { | |||
mask_t L0, L1; | |||
p448_sqr ( &b->y, &a->z ); | |||
p448_sqr ( &b->z, &a->x ); | |||
p448_sub ( &b->u, &b->y, &b->z ); | |||
p448_bias ( &b->u, 2 ); | |||
p448_weak_reduce( &b->u ); | |||
p448_sub ( &b->z, &a->z, &a->x ); | |||
p448_bias ( &b->z, 2 ); | |||
p448_weak_reduce( &b->z ); | |||
p448_mul ( &b->y, &b->z, &a->y ); | |||
p448_sub ( &b->z, &a->z, &a->y ); | |||
p448_bias ( &b->z, 2 ); | |||
p448_weak_reduce( &b->z ); | |||
p448_mul ( &b->x, &b->z, &b->y ); | |||
p448_mul ( &b->t, &b->x, &b->u ); | |||
p448_mul ( &b->y, &b->x, &b->t ); | |||
p448_isr ( &b->t, &b->y ); | |||
p448_mul ( &b->u, &b->x, &b->t ); | |||
p448_sqr ( &b->x, &b->t ); | |||
p448_mul ( &b->t, &b->y, &b->x ); | |||
p448_mul ( &b->x, &a->x, &b->u ); | |||
p448_mul ( &b->y, &a->y, &b->u ); | |||
L1 = p448_is_zero( &b->z ); | |||
L0 = - L1; | |||
p448_addw ( &b->y, L0 ); | |||
p448_weak_reduce( &b->y ); | |||
p448_set_ui( &b->z, 1 ); | |||
p448_copy ( &b->t, &b->x ); | |||
p448_copy ( &b->u, &b->y ); | |||
} | |||
void | |||
test_only_twist ( | |||
struct tw_extensible_t* b, | |||
const struct extensible_t* a | |||
) { | |||
mask_t L0, L1; | |||
struct p448_t L2, L3; | |||
p448_sqr ( &b->u, &a->z ); | |||
p448_sqr ( &b->y, &a->x ); | |||
p448_sub ( &b->z, &b->u, &b->y ); | |||
p448_bias ( &b->z, 2 ); | |||
p448_add ( &b->y, &b->z, &b->z ); | |||
p448_add ( &b->u, &b->y, &b->y ); | |||
p448_weak_reduce( &b->u ); | |||
p448_sub ( &b->y, &a->z, &a->x ); | |||
p448_bias ( &b->y, 2 ); | |||
p448_weak_reduce( &b->y ); | |||
p448_mul ( &b->x, &b->y, &a->y ); | |||
p448_sub ( &b->z, &a->z, &a->y ); | |||
p448_bias ( &b->z, 2 ); | |||
p448_weak_reduce( &b->z ); | |||
p448_mul ( &b->t, &b->z, &b->x ); | |||
p448_mul ( &L3, &b->t, &b->u ); | |||
p448_mul ( &b->x, &b->t, &L3 ); | |||
p448_isr ( &L2, &b->x ); | |||
p448_mul ( &b->u, &b->t, &L2 ); | |||
p448_sqr ( &L3, &L2 ); | |||
p448_mul ( &b->t, &b->x, &L3 ); | |||
p448_add ( &b->x, &a->y, &a->x ); | |||
p448_weak_reduce( &b->x ); | |||
p448_sub ( &L2, &a->x, &a->y ); | |||
p448_bias ( &L2, 2 ); | |||
p448_weak_reduce( &L2 ); | |||
p448_mul ( &L3, &b->t, &L2 ); | |||
p448_add ( &L2, &L3, &b->x ); | |||
p448_sub ( &b->t, &b->x, &L3 ); | |||
p448_bias ( &b->t, 2 ); | |||
p448_weak_reduce( &b->t ); | |||
p448_mul ( &b->x, &L2, &b->u ); | |||
L0 = p448_is_zero( &b->y ); | |||
L1 = - L0; | |||
p448_addw ( &b->x, L1 ); | |||
p448_weak_reduce( &b->x ); | |||
p448_mul ( &b->y, &b->t, &b->u ); | |||
L0 = p448_is_zero( &b->z ); | |||
L1 = - L0; | |||
p448_addw ( &b->y, L1 ); | |||
p448_weak_reduce( &b->y ); | |||
L1 = p448_is_zero( &a->y ); | |||
L0 = L1 + 1; | |||
p448_set_ui( &b->z, L0 ); | |||
p448_copy ( &b->t, &b->x ); | |||
p448_copy ( &b->u, &b->y ); | |||
} | |||
mask_t | |||
is_square ( | |||
const struct p448_t* x | |||
) { | |||
mask_t L0, L1; | |||
struct p448_t L2, L3; | |||
p448_isr ( &L2, x ); | |||
p448_sqr ( &L3, &L2 ); | |||
p448_mul ( &L2, x, &L3 ); | |||
p448_subw ( &L2, 1 ); | |||
p448_bias ( &L2, 1 ); | |||
L1 = p448_is_zero( &L2 ); | |||
L0 = p448_is_zero( x ); | |||
return L1 | L0; | |||
} | |||
mask_t | |||
is_even_pt ( | |||
const struct extensible_t* a | |||
) { | |||
struct p448_t L0, L1, L2; | |||
p448_sqr ( &L2, &a->z ); | |||
p448_sqr ( &L1, &a->x ); | |||
p448_sub ( &L0, &L2, &L1 ); | |||
p448_bias ( &L0, 2 ); | |||
p448_weak_reduce( &L0 ); | |||
return is_square ( &L0 ); | |||
} | |||
mask_t | |||
is_even_tw ( | |||
const struct tw_extensible_t* a | |||
) { | |||
struct p448_t L0, L1, L2; | |||
p448_sqr ( &L2, &a->z ); | |||
p448_sqr ( &L1, &a->x ); | |||
p448_add ( &L0, &L1, &L2 ); | |||
p448_weak_reduce( &L0 ); | |||
return is_square ( &L0 ); | |||
} | |||
mask_t | |||
deserialize_affine ( | |||
struct affine_t* a, | |||
const struct p448_t* sz | |||
) { | |||
struct p448_t L0, L1, L2, L3; | |||
p448_sqr ( &L1, sz ); | |||
p448_copy ( &L3, &L1 ); | |||
p448_addw ( &L3, 1 ); | |||
p448_sqr ( &a->x, &L3 ); | |||
p448_mulw ( &L3, &a->x, 39082 ); | |||
p448_neg ( &a->x, &L3 ); | |||
p448_add ( &L3, &L1, &L1 ); | |||
p448_bias ( &L3, 1 ); | |||
p448_add ( &a->y, &L3, &L3 ); | |||
p448_add ( &L3, &a->y, &a->x ); | |||
p448_weak_reduce( &L3 ); | |||
p448_copy ( &a->y, &L1 ); | |||
p448_subw ( &a->y, 1 ); | |||
p448_neg ( &a->x, &a->y ); | |||
p448_bias ( &a->x, 2 ); | |||
p448_weak_reduce( &a->x ); | |||
p448_mul ( &a->y, &a->x, &L3 ); | |||
p448_sqr ( &L2, &a->x ); | |||
p448_mul ( &L0, &L2, &a->y ); | |||
p448_mul ( &a->y, &a->x, &L0 ); | |||
p448_isr ( &L3, &a->y ); | |||
p448_mul ( &a->y, &L2, &L3 ); | |||
p448_sqr ( &L2, &L3 ); | |||
p448_mul ( &L3, &L0, &L2 ); | |||
p448_mul ( &L0, &a->x, &L3 ); | |||
p448_add ( &L2, &a->y, &a->y ); | |||
p448_mul ( &a->x, sz, &L2 ); | |||
p448_addw ( &L1, 1 ); | |||
p448_mul ( &a->y, &L1, &L3 ); | |||
p448_subw ( &L0, 1 ); | |||
p448_bias ( &L0, 1 ); | |||
return p448_is_zero( &L0 ); | |||
} | |||
mask_t | |||
deserialize_and_twist_approx ( | |||
struct tw_extensible_t* a, | |||
const struct p448_t* sdm1, | |||
const struct p448_t* sz | |||
) { | |||
struct p448_t L0, L1; | |||
p448_sqr ( &a->z, sz ); | |||
p448_copy ( &a->y, &a->z ); | |||
p448_addw ( &a->y, 1 ); | |||
p448_sqr ( &a->x, &a->y ); | |||
p448_mulw ( &a->y, &a->x, 39082 ); | |||
p448_neg ( &a->x, &a->y ); | |||
p448_add ( &a->y, &a->z, &a->z ); | |||
p448_bias ( &a->y, 1 ); | |||
p448_add ( &a->u, &a->y, &a->y ); | |||
p448_add ( &a->y, &a->u, &a->x ); | |||
p448_weak_reduce( &a->y ); | |||
p448_sqr ( &a->x, &a->z ); | |||
p448_subw ( &a->x, 1 ); | |||
p448_neg ( &a->u, &a->x ); | |||
p448_bias ( &a->u, 2 ); | |||
p448_weak_reduce( &a->u ); | |||
p448_mul ( &a->x, sdm1, &a->u ); | |||
p448_mul ( &L0, &a->x, &a->y ); | |||
p448_mul ( &a->t, &L0, &a->y ); | |||
p448_mul ( &a->u, &a->x, &a->t ); | |||
p448_mul ( &a->t, &a->u, &L0 ); | |||
p448_mul ( &a->y, &a->x, &a->t ); | |||
p448_isr ( &L0, &a->y ); | |||
p448_mul ( &a->y, &a->u, &L0 ); | |||
p448_sqr ( &L1, &L0 ); | |||
p448_mul ( &a->u, &a->t, &L1 ); | |||
p448_mul ( &a->t, &a->x, &a->u ); | |||
p448_add ( &a->x, sz, sz ); | |||
p448_mul ( &L0, &a->u, &a->x ); | |||
p448_copy ( &a->x, &a->z ); | |||
p448_subw ( &a->x, 1 ); | |||
p448_neg ( &L1, &a->x ); | |||
p448_bias ( &L1, 2 ); | |||
p448_weak_reduce( &L1 ); | |||
p448_mul ( &a->x, &L1, &L0 ); | |||
p448_mul ( &L0, &a->u, &a->y ); | |||
p448_addw ( &a->z, 1 ); | |||
p448_mul ( &a->y, &a->z, &L0 ); | |||
p448_subw ( &a->t, 1 ); | |||
p448_bias ( &a->t, 1 ); | |||
mask_t ret = p448_is_zero( &a->t ); | |||
p448_set_ui( &a->z, 1 ); | |||
p448_copy ( &a->t, &a->x ); | |||
p448_copy ( &a->u, &a->y ); | |||
return ret; | |||
} | |||
void | |||
set_identity_extensible ( | |||
struct extensible_t* a | |||
) { | |||
p448_set_ui( &a->x, 0 ); | |||
p448_set_ui( &a->y, 1 ); | |||
p448_set_ui( &a->z, 1 ); | |||
p448_set_ui( &a->t, 0 ); | |||
p448_set_ui( &a->u, 0 ); | |||
} | |||
void | |||
set_identity_tw_extensible ( | |||
struct tw_extensible_t* a | |||
) { | |||
p448_set_ui( &a->x, 0 ); | |||
p448_set_ui( &a->y, 1 ); | |||
p448_set_ui( &a->z, 1 ); | |||
p448_set_ui( &a->t, 0 ); | |||
p448_set_ui( &a->u, 0 ); | |||
} | |||
void | |||
set_identity_affine ( | |||
struct affine_t* a | |||
) { | |||
p448_set_ui( &a->x, 0 ); | |||
p448_set_ui( &a->y, 1 ); | |||
} | |||
mask_t | |||
eq_affine ( | |||
const struct affine_t* a, | |||
const struct affine_t* b | |||
) { | |||
mask_t L0, L1; | |||
struct p448_t L2; | |||
p448_sub ( &L2, &a->x, &b->x ); | |||
p448_bias ( &L2, 2 ); | |||
L1 = p448_is_zero( &L2 ); | |||
p448_sub ( &L2, &a->y, &b->y ); | |||
p448_bias ( &L2, 2 ); | |||
L0 = p448_is_zero( &L2 ); | |||
return L1 & L0; | |||
} | |||
mask_t | |||
eq_extensible ( | |||
const struct extensible_t* a, | |||
const struct extensible_t* b | |||
) { | |||
mask_t L0, L1; | |||
struct p448_t L2, L3, L4; | |||
p448_mul ( &L4, &b->z, &a->x ); | |||
p448_mul ( &L3, &a->z, &b->x ); | |||
p448_sub ( &L2, &L4, &L3 ); | |||
p448_bias ( &L2, 2 ); | |||
L1 = p448_is_zero( &L2 ); | |||
p448_mul ( &L4, &b->z, &a->y ); | |||
p448_mul ( &L3, &a->z, &b->y ); | |||
p448_sub ( &L2, &L4, &L3 ); | |||
p448_bias ( &L2, 2 ); | |||
L0 = p448_is_zero( &L2 ); | |||
return L1 & L0; | |||
} | |||
mask_t | |||
eq_tw_extensible ( | |||
const struct tw_extensible_t* a, | |||
const struct tw_extensible_t* b | |||
) { | |||
mask_t L0, L1; | |||
struct p448_t L2, L3, L4; | |||
p448_mul ( &L4, &b->z, &a->x ); | |||
p448_mul ( &L3, &a->z, &b->x ); | |||
p448_sub ( &L2, &L4, &L3 ); | |||
p448_bias ( &L2, 2 ); | |||
L1 = p448_is_zero( &L2 ); | |||
p448_mul ( &L4, &b->z, &a->y ); | |||
p448_mul ( &L3, &a->z, &b->y ); | |||
p448_sub ( &L2, &L4, &L3 ); | |||
p448_bias ( &L2, 2 ); | |||
L0 = p448_is_zero( &L2 ); | |||
return L1 & L0; | |||
} | |||
void | |||
elligator_2s_inject ( | |||
struct affine_t* a, | |||
const struct p448_t* r | |||
) { | |||
mask_t L0, L1; | |||
struct p448_t L2, L3, L4, L5, L6, L7, L8, L9; | |||
p448_sqr ( &a->x, r ); | |||
p448_sqr ( &L3, &a->x ); | |||
p448_copy ( &a->y, &L3 ); | |||
p448_subw ( &a->y, 1 ); | |||
p448_neg ( &L9, &a->y ); | |||
p448_bias ( &L9, 2 ); | |||
p448_weak_reduce( &L9 ); | |||
p448_sqr ( &L2, &L9 ); | |||
p448_mulw ( &L8, &L2, 1527402724 ); | |||
p448_mulw ( &L7, &L3, 6108985600 ); | |||
p448_add ( &a->y, &L7, &L8 ); | |||
p448_weak_reduce( &a->y ); | |||
p448_mulw ( &L8, &L2, 6109454568 ); | |||
p448_sub ( &L7, &a->y, &L8 ); | |||
p448_bias ( &L7, 2 ); | |||
p448_weak_reduce( &L7 ); | |||
p448_mulw ( &L4, &a->y, 78160 ); | |||
p448_mul ( &L6, &L7, &L9 ); | |||
p448_mul ( &L8, &L6, &L4 ); | |||
p448_mul ( &L4, &L7, &L8 ); | |||
p448_isr ( &L5, &L4 ); | |||
p448_mul ( &L4, &L6, &L5 ); | |||
p448_sqr ( &L6, &L5 ); | |||
p448_mul ( &L5, &L8, &L6 ); | |||
p448_mul ( &L8, &L7, &L5 ); | |||
p448_mul ( &L7, &L8, &L5 ); | |||
p448_copy ( &L5, &a->x ); | |||
p448_subw ( &L5, 1 ); | |||
p448_addw ( &a->x, 1 ); | |||
p448_mul ( &L6, &a->x, &L8 ); | |||
p448_sub ( &a->x, &L5, &L6 ); | |||
p448_bias ( &a->x, 3 ); | |||
p448_weak_reduce( &a->x ); | |||
p448_mul ( &L5, &L4, &a->x ); | |||
p448_mulw ( &L4, &L5, 78160 ); | |||
p448_neg ( &a->x, &L4 ); | |||
p448_bias ( &a->x, 2 ); | |||
p448_weak_reduce( &a->x ); | |||
p448_add ( &L4, &L3, &L3 ); | |||
p448_add ( &L3, &L4, &L2 ); | |||
p448_subw ( &L3, 2 ); | |||
p448_bias ( &L3, 1 ); | |||
p448_weak_reduce( &L3 ); | |||
p448_mul ( &L2, &L3, &L8 ); | |||
p448_mulw ( &L3, &L2, 3054649120 ); | |||
p448_add ( &L2, &L3, &a->y ); | |||
p448_mul ( &a->y, &L7, &L2 ); | |||
L1 = p448_is_zero( &L9 ); | |||
L0 = - L1; | |||
p448_addw ( &a->y, L0 ); | |||
p448_weak_reduce( &a->y ); | |||
} | |||
mask_t | |||
validate_affine ( | |||
const struct affine_t* a | |||
) { | |||
struct p448_t L0, L1, L2, L3; | |||
p448_sqr ( &L0, &a->y ); | |||
p448_sqr ( &L2, &a->x ); | |||
p448_add ( &L3, &L2, &L0 ); | |||
p448_subw ( &L3, 1 ); | |||
p448_mulw ( &L1, &L2, 39081 ); | |||
p448_neg ( &L2, &L1 ); | |||
p448_bias ( &L2, 2 ); | |||
p448_mul ( &L1, &L0, &L2 ); | |||
p448_sub ( &L0, &L3, &L1 ); | |||
p448_bias ( &L0, 3 ); | |||
return p448_is_zero( &L0 ); | |||
} | |||
mask_t | |||
validate_tw_extensible ( | |||
const struct tw_extensible_t* ext | |||
) { | |||
mask_t L0, L1; | |||
struct p448_t L2, L3, L4, L5; | |||
/* | |||
* Check invariant: | |||
* 0 = -x*y + z*t*u | |||
*/ | |||
p448_mul ( &L2, &ext->t, &ext->u ); | |||
p448_mul ( &L4, &ext->z, &L2 ); | |||
p448_addw ( &L4, 0 ); | |||
p448_mul ( &L3, &ext->x, &ext->y ); | |||
p448_neg ( &L2, &L3 ); | |||
p448_add ( &L3, &L2, &L4 ); | |||
p448_bias ( &L3, 2 ); | |||
L1 = p448_is_zero( &L3 ); | |||
/* | |||
* Check invariant: | |||
* 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 | |||
*/ | |||
p448_sqr ( &L4, &ext->y ); | |||
p448_neg ( &L2, &L4 ); | |||
p448_addw ( &L2, 0 ); | |||
p448_sqr ( &L3, &ext->x ); | |||
p448_add ( &L4, &L3, &L2 ); | |||
p448_sqr ( &L5, &ext->u ); | |||
p448_sqr ( &L3, &ext->t ); | |||
p448_mul ( &L2, &L3, &L5 ); | |||
p448_mulw ( &L3, &L2, 39081 ); | |||
p448_neg ( &L5, &L3 ); | |||
p448_add ( &L3, &L5, &L4 ); | |||
p448_neg ( &L5, &L2 ); | |||
p448_add ( &L4, &L5, &L3 ); | |||
p448_sqr ( &L3, &ext->z ); | |||
p448_add ( &L2, &L3, &L4 ); | |||
p448_bias ( &L2, 4 ); | |||
L0 = p448_is_zero( &L2 ); | |||
return L1 & L0; | |||
} | |||
mask_t | |||
validate_extensible ( | |||
const struct extensible_t* ext | |||
) { | |||
mask_t L0, L1; | |||
struct p448_t L2, L3, L4, L5; | |||
/* | |||
* Check invariant: | |||
* 0 = d*t^2*u^2 - x^2 - y^2 + z^2 | |||
*/ | |||
p448_sqr ( &L4, &ext->y ); | |||
p448_neg ( &L3, &L4 ); | |||
p448_addw ( &L3, 0 ); | |||
p448_sqr ( &L2, &ext->z ); | |||
p448_add ( &L4, &L2, &L3 ); | |||
p448_sqr ( &L5, &ext->u ); | |||
p448_sqr ( &L2, &ext->t ); | |||
p448_mul ( &L3, &L2, &L5 ); | |||
p448_mulw ( &L5, &L3, 39081 ); | |||
p448_neg ( &L2, &L5 ); | |||
p448_add ( &L3, &L2, &L4 ); | |||
p448_sqr ( &L2, &ext->x ); | |||
p448_neg ( &L4, &L2 ); | |||
p448_add ( &L2, &L4, &L3 ); | |||
p448_bias ( &L2, 4 ); | |||
L1 = p448_is_zero( &L2 ); | |||
/* | |||
* Check invariant: | |||
* 0 = -x*y + z*t*u | |||
*/ | |||
p448_mul ( &L3, &ext->t, &ext->u ); | |||
p448_mul ( &L4, &ext->z, &L3 ); | |||
p448_addw ( &L4, 0 ); | |||
p448_mul ( &L2, &ext->x, &ext->y ); | |||
p448_neg ( &L3, &L2 ); | |||
p448_add ( &L2, &L3, &L4 ); | |||
p448_bias ( &L2, 2 ); | |||
L0 = p448_is_zero( &L2 ); | |||
return L1 & L0; | |||
} | |||
@@ -0,0 +1,150 @@ | |||
/* Copyright (c) 2014 Cryptography Research, Inc. | |||
* Released under the MIT License. See LICENSE.txt for license information. | |||
*/ | |||
/** | |||
* @file "neon_emulation.h" | |||
* @brief NEON intrinsic emulation using clang's vector extensions. | |||
* | |||
* This lets you test and debug NEON code on x86. | |||
*/ | |||
#ifndef __NEON_EMULATION_H__ | |||
#define __NEON_EMULATION_H__ 1 | |||
#include "word.h" | |||
#include <stdint.h> | |||
#include <assert.h> | |||
static __inline__ int64x2_t vaddw_s32 (int64x2_t a, int32x2_t b) { | |||
a.x += b.x; | |||
a.y += b.y; | |||
return a; | |||
} | |||
static __inline__ int64x2_t __attribute__((gnu_inline,always_inline)) | |||
xx_vaddup_s64(int64x2_t x) { | |||
x.y += x.x; | |||
return x; | |||
} | |||
typedef struct { int32x2_t val[2]; } int32x2x2_t; | |||
static inline int32x2x2_t vtrn_s32 (int32x2_t x, int32x2_t y) { | |||
int32x2x2_t out = {{{ x.x, y.x }, {x.y, y.y}}}; | |||
return out; | |||
} | |||
static __inline__ void __attribute__((gnu_inline,always_inline)) | |||
xx_vtrnq_s64 ( | |||
int64x2_t *x, | |||
int64x2_t *y | |||
) { | |||
int64_t tmp = (*x).y; | |||
(*x).y = (*y).x; | |||
(*y).x = tmp; | |||
} | |||
int64x2_t vsraq_n_s64 ( | |||
int64x2_t a, | |||
int64x2_t v, | |||
const int x | |||
) { | |||
return a + (v >> x); | |||
} | |||
int64x2_t vshrq_n_s64 ( | |||
int64x2_t v, | |||
const int x | |||
) { | |||
return v >> x; | |||
} | |||
static inline int64_t vgetq_lane_s64 ( | |||
int64x2_t acc, | |||
const int lane | |||
) { | |||
return lane ? acc.y : acc.x; | |||
} | |||
static inline int32_t vget_lane_s32 ( | |||
int32x2_t acc, | |||
const int lane | |||
) { | |||
return lane ? acc.y : acc.x; | |||
} | |||
static inline int64x2_t vmlal_lane_s32 ( | |||
int64x2_t acc, | |||
int32x2_t x, | |||
int32x2_t y, | |||
int lane | |||
) { | |||
int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; | |||
return acc + xx*(lane?yy.yy:yy.xx); | |||
} | |||
static inline int64x2_t vmlsl_lane_s32 ( | |||
int64x2_t acc, | |||
int32x2_t x, | |||
int32x2_t y, | |||
int lane | |||
) { | |||
int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; | |||
return acc - xx*(lane?yy.yy:yy.xx); | |||
} | |||
static inline int64x2_t vqdmlsl_lane_s32 ( | |||
int64x2_t acc, | |||
int32x2_t x, | |||
int32x2_t y, | |||
int lane | |||
) { | |||
int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; | |||
int64x2_t tmp = xx*(lane?yy.yy:yy.xx); | |||
assert(tmp.x >> 63 == tmp.x>>62); | |||
assert(tmp.y >> 63 == tmp.y>>62); | |||
return acc - 2*tmp; | |||
} | |||
static inline int64x2_t vqdmlal_lane_s32 ( | |||
int64x2_t acc, | |||
int32x2_t x, | |||
int32x2_t y, | |||
int lane | |||
) { | |||
int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; | |||
int64x2_t tmp = xx*(lane?yy.yy:yy.xx); | |||
assert(tmp.x >> 63 == tmp.x>>62); | |||
assert(tmp.y >> 63 == tmp.y>>62); | |||
return acc + 2*tmp; | |||
} | |||
static inline int64x2_t vqdmull_lane_s32 ( | |||
int32x2_t x, | |||
int32x2_t y, | |||
int lane | |||
) { | |||
int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; | |||
int64x2_t tmp = xx*(lane?yy.yy:yy.xx); | |||
assert(tmp.x >> 63 == tmp.x>>62); | |||
assert(tmp.y >> 63 == tmp.y>>62); | |||
return 2*tmp; | |||
} | |||
static inline int32x2_t vmovn_s64( | |||
int64x2_t x | |||
) { | |||
int32x2_t y = {x.x,x.y}; | |||
return y; | |||
} | |||
static inline int64x2_t vmull_lane_s32 ( | |||
int32x2_t x, | |||
int32x2_t y, | |||
int lane | |||
) { | |||
int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; | |||
return xx*(lane?yy.yy:yy.xx); | |||
} | |||
#endif /* __NEON_EMULATION_H__ */ |
@@ -0,0 +1,749 @@ | |||
/* Copyright (c) 2014 Cryptography Research, Inc. | |||
* Released under the MIT License. See LICENSE.txt for license information. | |||
*/ | |||
#include "word.h" | |||
#include "p448.h" | |||
static inline mask_t __attribute__((always_inline)) | |||
is_zero ( | |||
word_t x | |||
) { | |||
dword_t xx = x; | |||
xx--; | |||
return xx >> WORD_BITS; | |||
} | |||
static uint64_t widemul_32 ( | |||
const uint32_t a, | |||
const uint32_t b | |||
) { | |||
return ((uint64_t)a)* b; | |||
} | |||
#ifdef __ARM_NEON__ | |||
static __inline__ void __attribute__((gnu_inline,always_inline)) | |||
xx_vtrnq_s64 ( | |||
int64x2_t *x, | |||
int64x2_t *y | |||
) { | |||
__asm__ __volatile__ ("vswp %f0, %e1" : "+w"(*x), "+w"(*y)); | |||
} | |||
static __inline__ int64x2_t __attribute__((gnu_inline,always_inline)) | |||
xx_vaddup_s64(int64x2_t x) { | |||
__asm__ ("vadd.s64 %f0, %e0" : "+w"(x)); | |||
return x; | |||
} | |||
#else | |||
#include "neon_emulation.h" | |||
#endif // ARM_NEON | |||
static inline void __attribute__((gnu_inline,always_inline)) | |||
smlal ( | |||
uint64_t *acc, | |||
const uint32_t a, | |||
const uint32_t b | |||
) { | |||
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b; | |||
} | |||
static inline void __attribute__((gnu_inline,always_inline)) | |||
smlal2 ( | |||
uint64_t *acc, | |||
const uint32_t a, | |||
const uint32_t b | |||
) { | |||
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; | |||
} | |||
static inline void __attribute__((gnu_inline,always_inline)) | |||
smull ( | |||
uint64_t *acc, | |||
const uint32_t a, | |||
const uint32_t b | |||
) { | |||
*acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b; | |||
} | |||
static inline void __attribute__((gnu_inline,always_inline)) | |||
smull2 ( | |||
uint64_t *acc, | |||
const uint32_t a, | |||
const uint32_t b | |||
) { | |||
*acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; | |||
} | |||
// static inline int64x2_t copy_now(int64x2_t x) { | |||
// int64x2_t y; | |||
// __asm__ ("vmov %0, %1" : "=w"(y) : "w"(x)); | |||
// return y; | |||
// } | |||
void | |||
p448_mul ( | |||
p448_t *__restrict__ cs, | |||
const p448_t *as, | |||
const p448_t *bs | |||
) { | |||
const uint32_t *a = as->limb, *b = bs->limb; | |||
uint32_t *c = cs->limb; | |||
const int32x2_t | |||
*val = (const int32x2_t *)a, | |||
*vbl = (const int32x2_t *)b, | |||
*vah = (const int32x2_t *)(&a[8]), | |||
*vbh = (const int32x2_t *)(&b[8]); | |||
int32x2_t | |||
*vcl = (int32x2_t *)c, | |||
*vch = (int32x2_t *)(&c[8]), | |||
vmask = {(1<<28) - 1, (1<<28)-1}; | |||
int64x2_t accumx0a, accumx0b; | |||
int64x2_t accumx1a, accumx1b; | |||
int64x2_t accumx2a, accumx2b; | |||
int64x2_t accumx3a, accumx3b; | |||
int64x2_t accumx4a, accumx4b; | |||
int64x2_t accumx5a, accumx5b; | |||
int64x2_t accumx6a, accumx6b; | |||
int64x2_t accumx7a, accumx7b; | |||
int64x2_t carry; | |||
int32x2x2_t trn_res; | |||
int32x2_t delta; | |||
accumx0a = vmull_lane_s32( delta = val[1] + vah[1], vbh[3], 0); | |||
accumx1a = vmull_lane_s32( delta, vbh[3], 1); | |||
accumx2a = vmull_lane_s32( delta = val[2] + vah[2], vbh[3], 0); | |||
accumx3a = vmull_lane_s32( delta, vbh[3], 1); | |||
accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[2], 0); | |||
accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[2], 1); | |||
accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0); | |||
accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1); | |||
accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[1], 0); | |||
accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[1], 1); | |||
accumx2b = vmull_lane_s32( delta = val[0] + vah[0], vbh[1], 0); | |||
accumx3b = vmull_lane_s32( delta, vbh[1], 1); | |||
accumx0b = vmull_lane_s32( delta, vbh[0], 0); | |||
accumx1b = vmull_lane_s32( delta, vbh[0], 1); | |||
accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0); | |||
accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1); | |||
accumx0b = vmlal_lane_s32(accumx0b, vah[1], vbl[3], 0); | |||
accumx1b = vmlal_lane_s32(accumx1b, vah[1], vbl[3], 1); | |||
accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0); | |||
accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1); | |||
accumx0b = vmlal_lane_s32(accumx0b, vah[2], vbl[2], 0); | |||
accumx1b = vmlal_lane_s32(accumx1b, vah[2], vbl[2], 1); | |||
accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0); | |||
accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1); | |||
accumx0b = vmlal_lane_s32(accumx0b, vah[3], vbl[1], 0); | |||
accumx1b = vmlal_lane_s32(accumx1b, vah[3], vbl[1], 1); | |||
accumx2b += accumx2a; | |||
accumx3b += accumx3a; | |||
accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0); | |||
accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1); | |||
accumx0b += accumx0a; | |||
accumx1b += accumx1a; | |||
accumx0a = vmlal_lane_s32(accumx0a, vah[0], vbl[0], 0); | |||
accumx1a = vmlal_lane_s32(accumx1a, vah[0], vbl[0], 1); | |||
accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0); | |||
accumx3a = vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1); | |||
accumx0a = vmlal_lane_s32(accumx0a, val[1], delta = vbl[3] - vbh[3], 0); | |||
accumx1a = vmlal_lane_s32(accumx1a, val[1], delta, 1); | |||
accumx2a = vmlal_lane_s32(accumx2a, val[2], delta, 0); | |||
accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1); | |||
accumx0a = vmlal_lane_s32(accumx0a, val[2], delta = vbl[2] - vbh[2], 0); | |||
accumx1a = vmlal_lane_s32(accumx1a, val[2], delta, 1); | |||
accumx2a = vmlal_lane_s32(accumx2a, val[3], delta, 0); | |||
accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1); | |||
accumx0a = vmlal_lane_s32(accumx0a, val[3], delta = vbl[1] - vbh[1], 0); | |||
accumx1a = vmlal_lane_s32(accumx1a, val[3], delta, 1); | |||
accumx2a += accumx2b; | |||
accumx3a += accumx3b; | |||
accumx2b = vmlal_lane_s32(accumx2b, val[0], delta, 0); | |||
accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1); | |||
accumx0a += accumx0b; | |||
accumx1a += accumx1b; | |||
accumx0b = vmlal_lane_s32(accumx0b, val[0], delta = vbl[0] - vbh[0], 0); | |||
accumx1b = vmlal_lane_s32(accumx1b, val[0], delta, 1); | |||
accumx2b = vmlal_lane_s32(accumx2b, val[1], delta, 0); | |||
accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1); | |||
xx_vtrnq_s64(&accumx0a, &accumx0b); | |||
xx_vtrnq_s64(&accumx1a, &accumx1b); | |||
xx_vtrnq_s64(&accumx2a, &accumx2b); | |||
xx_vtrnq_s64(&accumx3a, &accumx3b); | |||
accumx0b += accumx1a; | |||
accumx0b = vsraq_n_s64(accumx0b,accumx0a,28); | |||
accumx1b = vsraq_n_s64(accumx1b,accumx0b,28); | |||
accumx2a += accumx1b; | |||
accumx2b += accumx3a; | |||
accumx2b = vsraq_n_s64(accumx2b,accumx2a,28); | |||
accumx3b = vsraq_n_s64(accumx3b,accumx2b,28); | |||
trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b)); | |||
vcl[0] = trn_res.val[1] & vmask; | |||
vch[0] = trn_res.val[0] & vmask; | |||
trn_res = vtrn_s32(vmovn_s64(accumx2a), vmovn_s64(accumx2b)); | |||
vcl[1] = trn_res.val[1] & vmask; | |||
vch[1] = trn_res.val[0] & vmask; | |||
carry = accumx3b; | |||
accumx4a = vmull_lane_s32( delta = val[3] + vah[3], vbh[3], 0); | |||
accumx5a = vmull_lane_s32( delta, vbh[3], 1); | |||
accumx6b = vmull_lane_s32( delta = val[0] + vah[0], vbh[3], 0); | |||
accumx7b = vmull_lane_s32( delta, vbh[3], 1); | |||
accumx4b = accumx4a; | |||
accumx5b = accumx5a; | |||
accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[2], 0); | |||
accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[2], 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1); | |||
accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[1], 0); | |||
accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[1], 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1); | |||
accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[0], 0); | |||
accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[0], 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[0], 1); | |||
accumx4b = vmlal_lane_s32(accumx4b, vah[3], vbl[3], 0); | |||
accumx5b = vmlal_lane_s32(accumx5b, vah[3], vbl[3], 1); | |||
accumx6a = accumx6b; | |||
accumx7a = accumx7b; | |||
accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0); | |||
accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1); | |||
accumx4a += accumx4b; | |||
accumx5a += accumx5b; | |||
accumx4a = vmlal_lane_s32(accumx4a, vah[0], vbl[2], 0); | |||
accumx5a = vmlal_lane_s32(accumx5a, vah[0], vbl[2], 1); | |||
accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0); | |||
accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1); | |||
accumx4a = vmlal_lane_s32(accumx4a, vah[1], vbl[1], 0); | |||
accumx5a = vmlal_lane_s32(accumx5a, vah[1], vbl[1], 1); | |||
accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0); | |||
accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1); | |||
accumx4a = vmlal_lane_s32(accumx4a, vah[2], vbl[0], 0); | |||
accumx5a = vmlal_lane_s32(accumx5a, vah[2], vbl[0], 1); | |||
accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0); | |||
accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1); | |||
accumx4a = vmlal_lane_s32(accumx4a, val[3], delta = vbl[3] - vbh[3], 0); | |||
accumx5a = vmlal_lane_s32(accumx5a, val[3], delta, 1); | |||
/**/ | |||
accumx6b = vmlal_lane_s32(accumx6b, val[0], delta, 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1); | |||
accumx4b = vmlal_lane_s32(accumx4b, val[0], delta = vbl[2] - vbh[2], 0); | |||
accumx5b = vmlal_lane_s32(accumx5b, val[0], delta, 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, val[1], delta, 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1); | |||
accumx4b = vmlal_lane_s32(accumx4b, val[1], delta = vbl[1] - vbh[1], 0); | |||
accumx5b = vmlal_lane_s32(accumx5b, val[1], delta, 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, val[2], delta, 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1); | |||
accumx4b = vmlal_lane_s32(accumx4b, val[2], delta = vbl[0] - vbh[0], 0); | |||
accumx5b = vmlal_lane_s32(accumx5b, val[2], delta, 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, val[3], delta, 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1); | |||
xx_vtrnq_s64(&accumx4a, &accumx4b); | |||
xx_vtrnq_s64(&accumx5a, &accumx5b); | |||
xx_vtrnq_s64(&accumx6a, &accumx6b); | |||
xx_vtrnq_s64(&accumx7a, &accumx7b); | |||
accumx4a += carry; | |||
accumx4b += accumx5a; | |||
accumx4b = vsraq_n_s64(accumx4b,accumx4a,28); | |||
accumx5b = vsraq_n_s64(accumx5b,accumx4b,28); | |||
accumx6a += accumx5b; | |||
accumx6b += accumx7a; | |||
trn_res = vtrn_s32(vmovn_s64(accumx4a), vmovn_s64(accumx4b)); | |||
vcl[2] = trn_res.val[1] & vmask; | |||
vch[2] = trn_res.val[0] & vmask; | |||
accumx6b = vsraq_n_s64(accumx6b,accumx6a,28); | |||
accumx7b = vsraq_n_s64(accumx7b,accumx6b,28); | |||
trn_res = vtrn_s32(vmovn_s64(accumx6a), vmovn_s64(accumx6b)); | |||
vcl[3] = trn_res.val[1] & vmask; | |||
vch[3] = trn_res.val[0] & vmask; | |||
accumx7b = xx_vaddup_s64(accumx7b); | |||
int32x2_t t0 = vcl[0], t1 = vch[0]; | |||
trn_res = vtrn_s32(t0,t1); | |||
t0 = trn_res.val[0]; t1 = trn_res.val[1]; | |||
accumx7b = vaddw_s32(accumx7b, t0); | |||
t0 = vmovn_s64(accumx7b) & vmask; | |||
accumx7b = vshrq_n_s64(accumx7b,28); | |||
accumx7b = vaddw_s32(accumx7b, t1); | |||
t1 = vmovn_s64(accumx7b) & vmask; | |||
trn_res = vtrn_s32(t0,t1); | |||
vcl[0] = trn_res.val[0]; | |||
vch[0] = trn_res.val[1]; | |||
accumx7b = vshrq_n_s64(accumx7b,28); | |||
t0 = vmovn_s64(accumx7b); | |||
uint32_t | |||
c0 = vget_lane_s32(t0,0), | |||
c1 = vget_lane_s32(t0,1); | |||
c[2] += c0; | |||
c[10] += c1; | |||
} | |||
void | |||
p448_sqr ( | |||
p448_t *__restrict__ cs, | |||
const p448_t *as | |||
) { | |||
/* FUTURE possible improvements: | |||
* don't use nega-phi algorithm, so as to avoid extra phi-twiddle at end | |||
* or use phi/nega-phi for everything, montgomery style | |||
* or find some sort of phi algorithm which doesn't have this problem | |||
* break up lanemuls so that only diags get 1mul'd instead of diag 2x2 blocks | |||
* | |||
* These improvements are all pretty minor, but I guess together they might matter? | |||
*/ | |||
const uint32_t *b = as->limb; | |||
uint32_t *c = cs->limb; | |||
int32x2_t vbm[4]; | |||
const int32x2_t | |||
*vbl = (const int32x2_t *)b, | |||
*vbh = (const int32x2_t *)(&b[8]); | |||
int i; | |||
for (i=0; i<4; i++) { | |||
vbm[i] = vbl[i] - vbh[i]; | |||
} | |||
int32x2_t | |||
*vcl = (int32x2_t *)c, | |||
*vch = (int32x2_t *)(&c[8]), | |||
vmask = {(1<<28) - 1, (1<<28)-1}; | |||
int64x2_t accumx0a, accumx0b; | |||
int64x2_t accumx1a, accumx1b; | |||
int64x2_t accumx2a, accumx2b; | |||
int64x2_t accumx3a, accumx3b; | |||
int64x2_t accumx4a, accumx4b; | |||
int64x2_t accumx5a, accumx5b; | |||
int64x2_t accumx6a, accumx6b; | |||
int64x2_t accumx7a, accumx7b; | |||
int64x2_t carry; | |||
int32x2x2_t trn_res; | |||
accumx0a = vqdmull_lane_s32( vbh[1], vbh[3], 0); | |||
accumx1a = vqdmull_lane_s32( vbh[1], vbh[3], 1); | |||
accumx2a = vqdmull_lane_s32( vbh[2], vbh[3], 0); | |||
accumx3a = vqdmull_lane_s32( vbh[2], vbh[3], 1); | |||
accumx0a = vmlal_lane_s32(accumx0a, vbh[2], vbh[2], 0); | |||
accumx1a = vmlal_lane_s32(accumx1a, vbh[2], vbh[2], 1); | |||
accumx2b = accumx2a; | |||
accumx3b = accumx3a; | |||
accumx2b = vqdmlal_lane_s32(accumx2b, vbh[0], vbh[1], 0); | |||
accumx3b = vqdmlal_lane_s32(accumx3b, vbh[0], vbh[1], 1); | |||
accumx0b = accumx0a; | |||
accumx1b = accumx1a; | |||
accumx0b = vmlal_lane_s32(accumx0b, vbh[0], vbh[0], 0); | |||
accumx1b = vmlal_lane_s32(accumx1b, vbh[0], vbh[0], 1); | |||
accumx0b = vqdmlal_lane_s32(accumx0b, vbl[1], vbl[3], 0); | |||
accumx1b = vqdmlal_lane_s32(accumx1b, vbl[1], vbl[3], 1); | |||
accumx2b = vqdmlal_lane_s32(accumx2b, vbl[2], vbl[3], 0); | |||
accumx3b = vqdmlal_lane_s32(accumx3b, vbl[2], vbl[3], 1); | |||
accumx0b = vmlal_lane_s32(accumx0b, vbl[2], vbl[2], 0); | |||
accumx1b = vmlal_lane_s32(accumx1b, vbl[2], vbl[2], 1); | |||
accumx2a += accumx2b; | |||
accumx3a += accumx3b; | |||
accumx2a = vqdmlal_lane_s32(accumx2a, vbl[0], vbl[1], 0); | |||
accumx3a = vqdmlal_lane_s32(accumx3a, vbl[0], vbl[1], 1); | |||
accumx0a += accumx0b; | |||
accumx1a += accumx1b; | |||
accumx0a = vmlal_lane_s32(accumx0a, vbl[0], vbl[0], 0); | |||
accumx1a = vmlal_lane_s32(accumx1a, vbl[0], vbl[0], 1); | |||
accumx0a = vqdmlsl_lane_s32(accumx0a, vbm[1], vbm[3], 0); | |||
accumx1a = vqdmlsl_lane_s32(accumx1a, vbm[1], vbm[3], 1); | |||
accumx0a = vmlsl_lane_s32(accumx0a, vbm[2], vbm[2], 0); | |||
accumx1a = vmlsl_lane_s32(accumx1a, vbm[2], vbm[2], 1); | |||
accumx2a = vqdmlsl_lane_s32(accumx2a, vbm[2], vbm[3], 0); | |||
accumx3a = vqdmlsl_lane_s32(accumx3a, vbm[2], vbm[3], 1); | |||
accumx0b += accumx0a; | |||
accumx1b += accumx1a; | |||
accumx0b = vmlsl_lane_s32(accumx0b, vbm[0], vbm[0], 0); | |||
accumx1b = vmlsl_lane_s32(accumx1b, vbm[0], vbm[0], 1); | |||
accumx2b += accumx2a; | |||
accumx3b += accumx3a; | |||
accumx2b = vqdmlsl_lane_s32(accumx2b, vbm[0], vbm[1], 0); | |||
accumx3b = vqdmlsl_lane_s32(accumx3b, vbm[0], vbm[1], 1); | |||
xx_vtrnq_s64(&accumx0b, &accumx0a); | |||
xx_vtrnq_s64(&accumx1b, &accumx1a); | |||
xx_vtrnq_s64(&accumx2b, &accumx2a); | |||
xx_vtrnq_s64(&accumx3b, &accumx3a); | |||
accumx0a += accumx1b; | |||
accumx0a = vsraq_n_s64(accumx0a,accumx0b,28); | |||
accumx1a = vsraq_n_s64(accumx1a,accumx0a,28); | |||
accumx2b += accumx1a; | |||
accumx2a += accumx3b; | |||
accumx2a = vsraq_n_s64(accumx2a,accumx2b,28); | |||
accumx3a = vsraq_n_s64(accumx3a,accumx2a,28); | |||
trn_res = vtrn_s32(vmovn_s64(accumx0b), vmovn_s64(accumx0a)); | |||
vcl[0] = trn_res.val[1] & vmask; | |||
vch[0] = trn_res.val[0] & vmask; | |||
trn_res = vtrn_s32(vmovn_s64(accumx2b), vmovn_s64(accumx2a)); | |||
vcl[1] = trn_res.val[1] & vmask; | |||
vch[1] = trn_res.val[0] & vmask; | |||
carry = accumx3a; | |||
accumx4a = vmull_lane_s32( vbh[3], vbh[3], 0); | |||
accumx5a = vmull_lane_s32( vbh[3], vbh[3], 1); | |||
accumx6b = vqdmull_lane_s32( vbh[0], vbh[3], 0); | |||
accumx7b = vqdmull_lane_s32( vbh[0], vbh[3], 1); | |||
accumx4b = accumx4a; | |||
accumx5b = accumx5a; | |||
accumx4b = vqdmlal_lane_s32(accumx4b, vbh[0], vbh[2], 0); | |||
accumx5b = vqdmlal_lane_s32(accumx5b, vbh[0], vbh[2], 1); | |||
accumx6b = vqdmlal_lane_s32(accumx6b, vbh[1], vbh[2], 0); | |||
accumx7b = vqdmlal_lane_s32(accumx7b, vbh[1], vbh[2], 1); | |||
accumx4b = vmlal_lane_s32(accumx4b, vbh[1], vbh[1], 0); | |||
accumx5b = vmlal_lane_s32(accumx5b, vbh[1], vbh[1], 1); | |||
accumx4b = vmlal_lane_s32(accumx4b, vbl[3], vbl[3], 0); | |||
accumx5b = vmlal_lane_s32(accumx5b, vbl[3], vbl[3], 1); | |||
accumx6a = accumx6b; | |||
accumx7a = accumx7b; | |||
accumx6a = vqdmlal_lane_s32(accumx6a, vbl[0], vbl[3], 0); | |||
accumx7a = vqdmlal_lane_s32(accumx7a, vbl[0], vbl[3], 1); | |||
accumx4a += accumx4b; | |||
accumx5a += accumx5b; | |||
accumx4a = vqdmlal_lane_s32(accumx4a, vbl[0], vbl[2], 0); | |||
accumx5a = vqdmlal_lane_s32(accumx5a, vbl[0], vbl[2], 1); | |||
accumx6a = vqdmlal_lane_s32(accumx6a, vbl[1], vbl[2], 0); | |||
accumx7a = vqdmlal_lane_s32(accumx7a, vbl[1], vbl[2], 1); | |||
accumx4a = vmlal_lane_s32(accumx4a, vbl[1], vbl[1], 0); | |||
accumx5a = vmlal_lane_s32(accumx5a, vbl[1], vbl[1], 1); | |||
accumx4a = vmlsl_lane_s32(accumx4a, vbm[3], vbm[3], 0); | |||
accumx5a = vmlsl_lane_s32(accumx5a, vbm[3], vbm[3], 1); | |||
accumx6b += accumx6a; | |||
accumx7b += accumx7a; | |||
accumx6b = vqdmlsl_lane_s32(accumx6b, vbm[0], vbm[3], 0); | |||
accumx7b = vqdmlsl_lane_s32(accumx7b, vbm[0], vbm[3], 1); | |||
accumx4b += accumx4a; | |||
accumx5b += accumx5a; | |||
accumx4b = vqdmlsl_lane_s32(accumx4b, vbm[0], vbm[2], 0); | |||
accumx5b = vqdmlsl_lane_s32(accumx5b, vbm[0], vbm[2], 1); | |||
accumx4b = vmlsl_lane_s32(accumx4b, vbm[1], vbm[1], 0); | |||
accumx5b = vmlsl_lane_s32(accumx5b, vbm[1], vbm[1], 1); | |||
accumx6b = vqdmlsl_lane_s32(accumx6b, vbm[1], vbm[2], 0); | |||
accumx7b = vqdmlsl_lane_s32(accumx7b, vbm[1], vbm[2], 1); | |||
xx_vtrnq_s64(&accumx4b, &accumx4a); | |||
xx_vtrnq_s64(&accumx5b, &accumx5a); | |||
xx_vtrnq_s64(&accumx6b, &accumx6a); | |||
xx_vtrnq_s64(&accumx7b, &accumx7a); | |||
accumx4b += carry; | |||
accumx4a += accumx5b; | |||
accumx4a = vsraq_n_s64(accumx4a,accumx4b,28); | |||
accumx5a = vsraq_n_s64(accumx5a,accumx4a,28); | |||
accumx6b += accumx5a; | |||
accumx6a += accumx7b; | |||
trn_res = vtrn_s32(vmovn_s64(accumx4b), vmovn_s64(accumx4a)); | |||
vcl[2] = trn_res.val[1] & vmask; | |||
vch[2] = trn_res.val[0] & vmask; | |||
accumx6a = vsraq_n_s64(accumx6a,accumx6b,28); | |||
accumx7a = vsraq_n_s64(accumx7a,accumx6a,28); | |||
trn_res = vtrn_s32(vmovn_s64(accumx6b), vmovn_s64(accumx6a)); | |||
vcl[3] = trn_res.val[1] & vmask; | |||
vch[3] = trn_res.val[0] & vmask; | |||
accumx7a = xx_vaddup_s64(accumx7a); | |||
int32x2_t t0 = vcl[0], t1 = vch[0]; | |||
trn_res = vtrn_s32(t0,t1); | |||
t0 = trn_res.val[0]; t1 = trn_res.val[1]; | |||
accumx7a = vaddw_s32(accumx7a, t0); | |||
t0 = vmovn_s64(accumx7a) & vmask; | |||
accumx7a = vshrq_n_s64(accumx7a,28); | |||
accumx7a = vaddw_s32(accumx7a, t1); | |||
t1 = vmovn_s64(accumx7a) & vmask; | |||
trn_res = vtrn_s32(t0,t1); | |||
vcl[0] = trn_res.val[0]; | |||
vch[0] = trn_res.val[1]; | |||
accumx7a = vshrq_n_s64(accumx7a,28); | |||
t0 = vmovn_s64(accumx7a); | |||
uint32_t | |||
c0 = vget_lane_s32(t0,0), | |||
c1 = vget_lane_s32(t0,1); | |||
c[2] += c0; | |||
c[10] += c1; | |||
} | |||
void | |||
p448_mulw ( | |||
p448_t *__restrict__ cs, | |||
const p448_t *as, | |||
uint64_t b | |||
) { | |||
const uint32_t bhi = b>>28, blo = b & ((1<<28)-1); | |||
const uint32_t *a = as->limb; | |||
uint32_t *c = cs->limb; | |||
uint64_t accum0, accum8; | |||
uint32_t mask = (1ull<<28)-1; | |||
int i; | |||
uint32_t c0, c8, n0, n8; | |||
accum0 = widemul_32(bhi, a[15]); | |||
accum8 = widemul_32(bhi, a[15] + a[7]); | |||
c0 = a[0]; c8 = a[8]; | |||
smlal(&accum0, blo, c0); | |||
smlal(&accum8, blo, c8); | |||
c[0] = accum0 & mask; accum0 >>= 28; | |||
c[8] = accum8 & mask; accum8 >>= 28; | |||
i=1; | |||
{ | |||
n0 = a[i]; n8 = a[i+8]; | |||
smlal(&accum0, bhi, c0); | |||
smlal(&accum8, bhi, c8); | |||
smlal(&accum0, blo, n0); | |||
smlal(&accum8, blo, n8); | |||
c[i] = accum0 & mask; accum0 >>= 28; | |||
c[i+8] = accum8 & mask; accum8 >>= 28; | |||
i++; | |||
} | |||
{ | |||
c0 = a[i]; c8 = a[i+8]; | |||
smlal(&accum0, bhi, n0); | |||
smlal(&accum8, bhi, n8); | |||
smlal(&accum0, blo, c0); | |||
smlal(&accum8, blo, c8); | |||
c[i] = accum0 & mask; accum0 >>= 28; | |||
c[i+8] = accum8 & mask; accum8 >>= 28; | |||
i++; | |||
} | |||
{ | |||
n0 = a[i]; n8 = a[i+8]; | |||
smlal(&accum0, bhi, c0); | |||
smlal(&accum8, bhi, c8); | |||
smlal(&accum0, blo, n0); | |||
smlal(&accum8, blo, n8); | |||
c[i] = accum0 & mask; accum0 >>= 28; | |||
c[i+8] = accum8 & mask; accum8 >>= 28; | |||
i++; | |||
} | |||
{ | |||
c0 = a[i]; c8 = a[i+8]; | |||
smlal(&accum0, bhi, n0); | |||
smlal(&accum8, bhi, n8); | |||
smlal(&accum0, blo, c0); | |||
smlal(&accum8, blo, c8); | |||
c[i] = accum0 & mask; accum0 >>= 28; | |||
c[i+8] = accum8 & mask; accum8 >>= 28; | |||
i++; | |||
} | |||
{ | |||
n0 = a[i]; n8 = a[i+8]; | |||
smlal(&accum0, bhi, c0); | |||
smlal(&accum8, bhi, c8); | |||
smlal(&accum0, blo, n0); | |||
smlal(&accum8, blo, n8); | |||
c[i] = accum0 & mask; accum0 >>= 28; | |||
c[i+8] = accum8 & mask; accum8 >>= 28; | |||
i++; | |||
} | |||
{ | |||
c0 = a[i]; c8 = a[i+8]; | |||
smlal(&accum0, bhi, n0); | |||
smlal(&accum8, bhi, n8); | |||
smlal(&accum0, blo, c0); | |||
smlal(&accum8, blo, c8); | |||
c[i] = accum0 & mask; accum0 >>= 28; | |||
c[i+8] = accum8 & mask; accum8 >>= 28; | |||
i++; | |||
} | |||
{ | |||
n0 = a[i]; n8 = a[i+8]; | |||
smlal(&accum0, bhi, c0); | |||
smlal(&accum8, bhi, c8); | |||
smlal(&accum0, blo, n0); | |||
smlal(&accum8, blo, n8); | |||
c[i] = accum0 & mask; accum0 >>= 28; | |||
c[i+8] = accum8 & mask; accum8 >>= 28; | |||
i++; | |||
} | |||
accum0 += accum8 + c[8]; | |||
c[8] = accum0 & mask; | |||
c[9] += accum0 >> 28; | |||
accum8 += c[0]; | |||
c[0] = accum8 & mask; | |||
c[1] += accum8 >> 28; | |||
} | |||
void | |||
p448_strong_reduce ( | |||
p448_t *a | |||
) { | |||
word_t mask = (1ull<<28)-1; | |||
/* first, clear high */ | |||
a->limb[8] += a->limb[15]>>28; | |||
a->limb[0] += a->limb[15]>>28; | |||
a->limb[15] &= mask; | |||
/* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */ | |||
/* compute total_value - p. No need to reduce mod p. */ | |||
dsword_t scarry = 0; | |||
int i; | |||
for (i=0; i<16; i++) { | |||
scarry = scarry + a->limb[i] - ((i==8)?mask-1:mask); | |||
a->limb[i] = scarry & mask; | |||
scarry >>= 28; | |||
} | |||
/* uncommon case: it was >= p, so now scarry = 0 and this = x | |||
* common case: it was < p, so now scarry = -1 and this = x - p + 2^448 | |||
* so let's add back in p. will carry back off the top for 2^448. | |||
*/ | |||
assert(is_zero(scarry) | is_zero(scarry+1)); | |||
word_t scarry_mask = scarry & mask; | |||
dword_t carry = 0; | |||
/* add it back */ | |||
for (i=0; i<16; i++) { | |||
carry = carry + a->limb[i] + ((i==8)?(scarry_mask&~1):scarry_mask); | |||
a->limb[i] = carry & mask; | |||
carry >>= 28; | |||
} | |||
assert(is_zero(carry + scarry)); | |||
} | |||
mask_t | |||
p448_is_zero ( | |||
const struct p448_t *a | |||
) { | |||
struct p448_t b; | |||
p448_copy(&b,a); | |||
p448_strong_reduce(&b); | |||
uint32_t any = 0; | |||
int i; | |||
for (i=0; i<16; i++) { | |||
any |= b.limb[i]; | |||
} | |||
return is_zero(any); | |||
} | |||
void | |||
p448_serialize ( | |||
uint8_t *serial, | |||
const struct p448_t *x | |||
) { | |||
int i,j; | |||
p448_t red; | |||
p448_copy(&red, x); | |||
p448_strong_reduce(&red); | |||
for (i=0; i<8; i++) { | |||
uint64_t limb = red.limb[2*i] + (((uint64_t)red.limb[2*i+1])<<28); | |||
for (j=0; j<7; j++) { | |||
serial[7*i+j] = limb; | |||
limb >>= 8; | |||
} | |||
assert(limb == 0); | |||
} | |||
} | |||
mask_t | |||
p448_deserialize ( | |||
p448_t *x, | |||
const uint8_t serial[56] | |||
) { | |||
int i,j; | |||
for (i=0; i<8; i++) { | |||
uint64_t out = 0; | |||
for (j=0; j<7; j++) { | |||
out |= ((uint64_t)serial[7*i+j])<<(8*j); | |||
} | |||
x->limb[2*i] = out & ((1ull<<28)-1); | |||
x->limb[2*i+1] = out >> 28; | |||
} | |||
/* Check for reduction. | |||
* | |||
* The idea is to create a variable ge which is all ones (rather, 56 ones) | |||
* if and only if the low $i$ words of $x$ are >= those of p. | |||
* | |||
* Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) | |||
*/ | |||
uint32_t ge = -1, mask = (1ull<<28)-1; | |||
for (i=0; i<8; i++) { | |||
ge &= x->limb[i]; | |||
} | |||
/* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ | |||
ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask); | |||
/* Propagate the rest */ | |||
for (i=9; i<16; i++) { | |||
ge &= x->limb[i]; | |||
} | |||
return ~is_zero(ge ^ mask); | |||
} | |||
void | |||
simultaneous_invert_p448( | |||
struct p448_t *__restrict__ out, | |||
const struct p448_t *in, | |||
unsigned int n | |||
) { | |||
if (n==0) { | |||
return; | |||
} else if (n==1) { | |||
p448_inverse(out,in); | |||
return; | |||
} | |||
p448_copy(&out[1], &in[0]); | |||
int i; | |||
for (i=1; i<(int) (n-1); i++) { | |||
p448_mul(&out[i+1], &out[i], &in[i]); | |||
} | |||
p448_mul(&out[0], &out[n-1], &in[n-1]); | |||
struct p448_t tmp; | |||
p448_inverse(&tmp, &out[0]); | |||
p448_copy(&out[0], &tmp); | |||
/* at this point, out[0] = product(in[i]) ^ -1 | |||
* out[i] = product(in[0]..in[i-1]) if i != 0 | |||
*/ | |||
for (i=n-1; i>0; i--) { | |||
p448_mul(&tmp, &out[i], &out[0]); | |||
p448_copy(&out[i], &tmp); | |||
p448_mul(&tmp, &out[0], &in[i]); | |||
p448_copy(&out[0], &tmp); | |||
} | |||
} |
@@ -0,0 +1,378 @@ | |||
/* Copyright (c) 2014 Cryptography Research, Inc. | |||
* Released under the MIT License. See LICENSE.txt for license information. | |||
*/ | |||
#ifndef __P448_H__ | |||
#define __P448_H__ 1 | |||
#include "word.h" | |||
#include <stdint.h> | |||
#include <assert.h> | |||
typedef struct p448_t { | |||
uint32_t limb[16]; | |||
} __attribute__((aligned(32))) p448_t; | |||
#ifdef __cplusplus | |||
extern "C" { | |||
#endif | |||
static __inline__ void | |||
p448_set_ui ( | |||
p448_t *out, | |||
uint64_t x | |||
) __attribute__((unused,always_inline)); | |||
static __inline__ void | |||
p448_cond_swap ( | |||
p448_t *a, | |||
p448_t *b, | |||
mask_t do_swap | |||
) __attribute__((unused,always_inline)); | |||
static __inline__ void | |||
p448_add ( | |||
p448_t *out, | |||
const p448_t *a, | |||
const p448_t *b | |||
) __attribute__((unused,always_inline)); | |||
static __inline__ void | |||
p448_sub ( | |||
p448_t *out, | |||
const p448_t *a, | |||
const p448_t *b | |||
) __attribute__((unused,always_inline)); | |||
static __inline__ void | |||
p448_neg ( | |||
p448_t *out, | |||
const p448_t *a | |||
) __attribute__((unused,always_inline)); | |||
static __inline__ void | |||
p448_cond_neg ( | |||
p448_t *a, | |||
mask_t doNegate | |||
) __attribute__((unused,always_inline)); | |||
static __inline__ void | |||
p448_addw ( | |||
p448_t *a, | |||
uint32_t x | |||
) __attribute__((unused,always_inline)); | |||
static __inline__ void | |||
p448_subw ( | |||
p448_t *a, | |||
uint32_t x | |||
) __attribute__((unused,always_inline)); | |||
static __inline__ void | |||
p448_copy ( | |||
p448_t *out, | |||
const p448_t *a | |||
) __attribute__((unused,always_inline)); | |||
static __inline__ void | |||
p448_weak_reduce ( | |||
p448_t *inout | |||
) __attribute__((unused,always_inline)); | |||
void | |||
p448_strong_reduce ( | |||
p448_t *inout | |||
); | |||
mask_t | |||
p448_is_zero ( | |||
const p448_t *in | |||
); | |||
static __inline__ void | |||
p448_bias ( | |||
p448_t *inout, | |||
int amount | |||
) __attribute__((unused,always_inline)); | |||
void | |||
p448_mul ( | |||
p448_t *__restrict__ out, | |||
const p448_t *a, | |||
const p448_t *b | |||
); | |||
void | |||
p448_mulw ( | |||
p448_t *__restrict__ out, | |||
const p448_t *a, | |||
uint64_t b | |||
); | |||
void | |||
p448_sqr ( | |||
p448_t *__restrict__ out, | |||
const p448_t *a | |||
); | |||
static __inline__ void | |||
p448_sqrn ( | |||
p448_t *__restrict__ y, | |||
const p448_t *x, | |||
int n | |||
) __attribute__((unused,always_inline)); | |||
void | |||
p448_serialize ( | |||
uint8_t *serial, | |||
const struct p448_t *x | |||
); | |||
mask_t | |||
p448_deserialize ( | |||
p448_t *x, | |||
const uint8_t serial[56] | |||
); | |||
static __inline__ void | |||
p448_mask( | |||
struct p448_t *a, | |||
const struct p448_t *b, | |||
mask_t mask | |||
) __attribute__((unused,always_inline)); | |||
/** | |||
* Returns 1/x. | |||
* | |||
* If x=0, returns 0. | |||
*/ | |||
void | |||
p448_inverse ( | |||
struct p448_t* a, | |||
const struct p448_t* x | |||
); | |||
void | |||
simultaneous_invert_p448 ( | |||
struct p448_t *__restrict__ out, | |||
const struct p448_t *in, | |||
unsigned int n | |||
); | |||
static inline mask_t | |||
p448_eq ( | |||
const struct p448_t *a, | |||
const struct p448_t *b | |||
) __attribute__((always_inline,unused)); | |||
/* -------------- Inline functions begin here -------------- */ | |||
void | |||
p448_set_ui ( | |||
p448_t *out, | |||
uint64_t x | |||
) { | |||
int i; | |||
out->limb[0] = x & ((1<<28)-1); | |||
out->limb[1] = x>>28; | |||
for (i=2; i<16; i++) { | |||
out->limb[i] = 0; | |||
} | |||
} | |||
void | |||
p448_cond_swap ( | |||
p448_t *a, | |||
p448_t *b, | |||
mask_t doswap | |||
) { | |||
big_register_t *aa = (big_register_t*)a; | |||
big_register_t *bb = (big_register_t*)b; | |||
big_register_t m = br_set_to_mask(doswap); | |||
unsigned int i; | |||
for (i=0; i<sizeof(*a)/sizeof(*aa); i++) { | |||
big_register_t x = m & (aa[i]^bb[i]); | |||
aa[i] ^= x; | |||
bb[i] ^= x; | |||
} | |||
} | |||
void | |||
p448_add ( | |||
p448_t *out, | |||
const p448_t *a, | |||
const p448_t *b | |||
) { | |||
unsigned int i; | |||
for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) { | |||
((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] + ((const uint32xn_t*)b)[i]; | |||
} | |||
/* | |||
unsigned int i; | |||
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) { | |||
out->limb[i] = a->limb[i] + b->limb[i]; | |||
} | |||
*/ | |||
} | |||
void | |||
p448_sub ( | |||
p448_t *out, | |||
const p448_t *a, | |||
const p448_t *b | |||
) { | |||
unsigned int i; | |||
for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) { | |||
((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] - ((const uint32xn_t*)b)[i]; | |||
} | |||
/* | |||
unsigned int i; | |||
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) { | |||
out->limb[i] = a->limb[i] - b->limb[i]; | |||
} | |||
*/ | |||
} | |||
void | |||
p448_neg ( | |||
p448_t *out, | |||
const p448_t *a | |||
) { | |||
unsigned int i; | |||
for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) { | |||
((uint32xn_t*)out)[i] = -((const uint32xn_t*)a)[i]; | |||
} | |||
/* | |||
unsigned int i; | |||
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) { | |||
out->limb[i] = -a->limb[i]; | |||
} | |||
*/ | |||
} | |||
void | |||
p448_cond_neg( | |||
p448_t *a, | |||
mask_t doNegate | |||
) { | |||
unsigned int i; | |||
struct p448_t negated; | |||
big_register_t *aa = (big_register_t *)a; | |||
big_register_t *nn = (big_register_t*)&negated; | |||
big_register_t m = br_set_to_mask(doNegate); | |||
p448_neg(&negated, a); | |||
p448_bias(&negated, 2); | |||
for (i=0; i<sizeof(*a)/sizeof(*aa); i++) { | |||
aa[i] = (aa[i] & ~m) | (nn[i] & m); | |||
} | |||
} | |||
void | |||
p448_addw ( | |||
p448_t *a, | |||
uint32_t x | |||
) { | |||
a->limb[0] += x; | |||
} | |||
void | |||
p448_subw ( | |||
p448_t *a, | |||
uint32_t x | |||
) { | |||
a->limb[0] -= x; | |||
} | |||
void | |||
p448_copy ( | |||
p448_t *out, | |||
const p448_t *a | |||
) { | |||
*out = *a; | |||
} | |||
void | |||
p448_bias ( | |||
p448_t *a, | |||
int amt | |||
) { | |||
uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt; | |||
uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1}; | |||
uint32x4_t *aa = (uint32x4_t*) a; | |||
aa[0] += lo; | |||
aa[1] += lo; | |||
aa[2] += hi; | |||
aa[3] += lo; | |||
} | |||
void | |||
p448_weak_reduce ( | |||
p448_t *a | |||
) { | |||
uint64_t mask = (1ull<<28) - 1; | |||
uint64_t tmp = a->limb[15] >> 28; | |||
int i; | |||
a->limb[8] += tmp; | |||
for (i=15; i>0; i--) { | |||
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); | |||
} | |||
a->limb[0] = (a->limb[0] & mask) + tmp; | |||
} | |||
void | |||
p448_sqrn ( | |||
p448_t *__restrict__ y, | |||
const p448_t *x, | |||
int n | |||
) { | |||
p448_t tmp; | |||
assert(n>0); | |||
if (n&1) { | |||
p448_sqr(y,x); | |||
n--; | |||
} else { | |||
p448_sqr(&tmp,x); | |||
p448_sqr(y,&tmp); | |||
n-=2; | |||
} | |||
for (; n; n-=2) { | |||
p448_sqr(&tmp,y); | |||
p448_sqr(y,&tmp); | |||
} | |||
} | |||
mask_t | |||
p448_eq ( | |||
const struct p448_t *a, | |||
const struct p448_t *b | |||
) { | |||
struct p448_t ra, rb; | |||
p448_copy(&ra, a); | |||
p448_copy(&rb, b); | |||
p448_weak_reduce(&ra); | |||
p448_weak_reduce(&rb); | |||
p448_sub(&ra, &ra, &rb); | |||
p448_bias(&ra, 2); | |||
return p448_is_zero(&ra); | |||
} | |||
void | |||
p448_mask ( | |||
struct p448_t *a, | |||
const struct p448_t *b, | |||
mask_t mask | |||
) { | |||
unsigned int i; | |||
for (i=0; i<sizeof(*a)/sizeof(a->limb[0]); i++) { | |||
a->limb[i] = b->limb[i] & mask; | |||
} | |||
} | |||
#ifdef __cplusplus | |||
}; /* extern "C" */ | |||
#endif | |||
#endif /* __P448_H__ */ |
@@ -17,13 +17,14 @@ p448_mul ( | |||
__uint128_t accum0 = 0, accum1 = 0, accum2; | |||
uint64_t mask = (1ull<<56) - 1; | |||
uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))); | |||
uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))), bbb[4] __attribute__((aligned(32))); | |||
/* For some reason clang doesn't vectorize this without prompting? */ | |||
unsigned int i; | |||
for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) { | |||
((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i]; | |||
((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i]; | |||
((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i]; | |||
((uint64xn_t*)bbb)[i] = ((const uint64xn_t*)bb)[i] + ((const uint64xn_t*)(&b[4]))[i]; | |||
} | |||
/* | |||
for (int i=0; i<4; i++) { | |||
@@ -81,18 +82,15 @@ p448_mul ( | |||
accum0 >>= 56; | |||
accum1 >>= 56; | |||
accum2 = widemul(&aa[2],&bb[3]); | |||
msb(&accum0, &a[2], &b[3]); | |||
mac(&accum1, &a[6], &b[7]); | |||
accum2 = widemul(&a[2],&b[7]); | |||
mac(&accum0, &a[6], &bb[3]); | |||
mac(&accum1, &aa[2], &bbb[3]); | |||
mac(&accum2, &aa[3], &bb[2]); | |||
msb(&accum0, &a[3], &b[2]); | |||
mac(&accum1, &a[7], &b[6]); | |||
mac(&accum2, &a[3], &b[6]); | |||
mac(&accum0, &a[7], &bb[2]); | |||
mac(&accum1, &aa[3], &bbb[2]); | |||
accum1 += accum2; | |||
accum0 += accum2; | |||
accum2 = widemul(&a[0],&b[1]); | |||
mac(&accum2, &a[0],&b[1]); | |||
mac(&accum1, &aa[0], &bb[1]); | |||
mac(&accum0, &a[4], &b[5]); | |||
@@ -109,14 +107,11 @@ p448_mul ( | |||
accum0 >>= 56; | |||
accum1 >>= 56; | |||
accum2 = widemul(&aa[3],&bb[3]); | |||
msb(&accum0, &a[3], &b[3]); | |||
mac(&accum1, &a[7], &b[7]); | |||
accum1 += accum2; | |||
accum0 += accum2; | |||
accum2 = widemul(&a[3],&b[7]); | |||
mac(&accum0, &a[7], &bb[3]); | |||
mac(&accum1, &aa[3], &bbb[3]); | |||
accum2 = widemul(&a[0],&b[2]); | |||
mac(&accum2, &a[0],&b[2]); | |||
mac(&accum1, &aa[0], &bb[2]); | |||
mac(&accum0, &a[4], &b[6]); | |||
@@ -186,11 +181,9 @@ p448_mulw ( | |||
c[3] = accum0 & mask; accum0 >>= 56; | |||
c[7] = accum4 & mask; accum4 >>= 56; | |||
c[4] += accum0 + accum4; | |||
c[0] += accum4; | |||
// c[4] += accum0 + accum4; | |||
// c[0] += accum4; | |||
/* | |||
* TODO: double-check that this is not necessary. | |||
accum0 += accum4 + c[4]; | |||
c[4] = accum0 & mask; | |||
c[5] += accum0 >> 56; | |||
@@ -198,7 +191,6 @@ p448_mulw ( | |||
accum4 += c[0]; | |||
c[0] = accum4 & mask; | |||
c[1] += accum4 >> 56; | |||
*/ | |||
} | |||
void | |||
@@ -290,7 +290,10 @@ p448_copy ( | |||
p448_t *out, | |||
const p448_t *a | |||
) { | |||
*out = *a; | |||
unsigned int i; | |||
for (i=0; i<sizeof(*out)/sizeof(big_register_t); i++) { | |||
((big_register_t *)out)[i] = ((const big_register_t *)a)[i]; | |||
} | |||
} | |||
void | |||
@@ -299,10 +302,25 @@ p448_bias ( | |||
int amt | |||
) { | |||
uint64_t co1 = ((1ull<<56)-1)*amt, co2 = co1-amt; | |||
#if __AVX2__ | |||
uint64x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1}; | |||
uint64x4_t *aa = (uint64x4_t*) a; | |||
aa[0] += lo; | |||
aa[1] += hi; | |||
#elif __SSE2__ | |||
uint64x2_t lo = {co1,co1}, hi = {co2,co1}; | |||
uint64x2_t *aa = (uint64x2_t*) a; | |||
aa[0] += lo; | |||
aa[1] += lo; | |||
aa[2] += hi; | |||
aa[3] += lo; | |||
#else | |||
unsigned int i; | |||
for (i=0; i<sizeof(*a)/sizeof(uint64_t); i++) { | |||
a->limb[i] += (i==4) ? co2 : co1; | |||
} | |||
#endif | |||
} | |||
void | |||
@@ -1,6 +0,0 @@ | |||
_goldilocks_init | |||
_goldilocks_keygen | |||
_goldilocks_shared_secret | |||
_goldilocks_sign | |||
_goldilocks_verify | |||
_goldilocks_private_to_public |
@@ -32,7 +32,10 @@ | |||
#define GOLDILOCKS_RANDOM_RESEEDS_MANDATORY 0 | |||
#endif | |||
/* FUTURE: auto */ | |||
#define GOLDI_FIELD_WORDS ((GOLDI_FIELD_BITS+WORD_BITS-1)/(WORD_BITS)) | |||
#define GOLDI_DIVERSIFY_BYTES 8 | |||
/* FUTURE: auto. MAGIC */ | |||
const struct affine_t goldilocks_base_point = { | |||
{{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7), | |||
U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa), | |||
@@ -42,11 +45,12 @@ const struct affine_t goldilocks_base_point = { | |||
{{ 19 }} | |||
}; | |||
/* These are just unique identifiers */ | |||
static const char *G_INITING = "initializing"; | |||
static const char *G_INITED = "initialized"; | |||
static const char *G_FAILED = "failed to initialize"; | |||
/* FUTURE: auto */ | |||
/* FUTURE: auto. MAGIC */ | |||
static const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = { | |||
U64LE(0xdc873d6d54a7bb0d), | |||
U64LE(0xde933d8d723a70aa), | |||
@@ -54,19 +58,45 @@ static const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = { | |||
0x8335dc16 | |||
}; | |||
const struct barrett_prime_t goldi_q448 = { | |||
448/WORD_BITS, | |||
GOLDI_FIELD_WORDS, | |||
62 % WORD_BITS, | |||
sizeof(goldi_q448_lo)/sizeof(goldi_q448_lo[0]), | |||
goldi_q448_lo | |||
}; | |||
/* FUTURE: auto */ | |||
/* MAGIC */ | |||
static const struct p448_t | |||
sqrt_d_minus_1 = {{ | |||
U58LE(0xd2e21836749f46), | |||
U58LE(0x888db42b4f0179), | |||
U58LE(0x5a189aabdeea38), | |||
U58LE(0x51e65ca6f14c06), | |||
U58LE(0xa49f7b424d9770), | |||
U58LE(0xdcac4628c5f656), | |||
U58LE(0x49443b8748734a), | |||
U58LE(0x12fec0c0b25b7a) | |||
}}; | |||
struct goldilocks_precomputed_public_key_t { | |||
struct goldilocks_public_key_t pub; | |||
struct fixed_base_table_t table; | |||
}; | |||
#ifndef USE_BIG_TABLES | |||
#if __ARM_NEON__ | |||
#define USE_BIG_TABLES 1 | |||
#else | |||
#define USE_BIG_TABLES (WORD_BITS==64) | |||
#endif | |||
#endif | |||
/* FUTURE: auto. MAGIC */ | |||
struct { | |||
const char * volatile state; | |||
#if GOLDILOCKS_USE_PTHREAD | |||
pthread_mutex_t mutex; | |||
#endif | |||
struct tw_niels_t combs[(WORD_BITS==64) ? 80 : 64]; | |||
struct tw_niels_t combs[USE_BIG_TABLES ? 80 : 64]; | |||
struct fixed_base_table_t fixed_base; | |||
struct tw_niels_t wnafs[32]; | |||
struct crandom_state_t rand; | |||
@@ -107,7 +137,7 @@ goldilocks_init () { | |||
/* Precompute the tables. */ | |||
mask_t succ; | |||
int big = (WORD_BITS==64); | |||
int big = USE_BIG_TABLES; | |||
uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14; | |||
succ = precompute_fixed_base(&goldilocks_global.fixed_base, &text, n, t, s, goldilocks_global.combs); | |||
@@ -135,55 +165,77 @@ fail: | |||
return -1; | |||
} | |||
static const struct p448_t | |||
sqrt_d_minus_1 = {{ | |||
U58LE(0xd2e21836749f46), | |||
U58LE(0x888db42b4f0179), | |||
U58LE(0x5a189aabdeea38), | |||
U58LE(0x51e65ca6f14c06), | |||
U58LE(0xa49f7b424d9770), | |||
U58LE(0xdcac4628c5f656), | |||
U58LE(0x49443b8748734a), | |||
U58LE(0x12fec0c0b25b7a) | |||
}}; | |||
int | |||
goldilocks_keygen ( | |||
goldilocks_derive_private_key ( | |||
struct goldilocks_private_key_t *privkey, | |||
struct goldilocks_public_key_t *pubkey | |||
const unsigned char proto[GOLDI_SYMKEY_BYTES] | |||
) { | |||
if (!goldilocks_check_init()) { | |||
return GOLDI_EUNINIT; | |||
} | |||
word_t sk[448*2/WORD_BITS]; | |||
memcpy(&privkey->opaque[2*GOLDI_FIELD_BYTES], proto, GOLDI_SYMKEY_BYTES); | |||
unsigned char skb[SHA512_OUTPUT_BYTES]; | |||
word_t sk[GOLDI_FIELD_WORDS]; | |||
assert(sizeof(skb) >= sizeof(sk)); | |||
struct sha512_ctx_t ctx; | |||
struct tw_extensible_t exta; | |||
struct p448_t pk; | |||
sha512_init(&ctx); | |||
sha512_update(&ctx, (const unsigned char *)"derivepk", GOLDI_DIVERSIFY_BYTES); | |||
sha512_update(&ctx, proto, GOLDI_SYMKEY_BYTES); | |||
sha512_final(&ctx, (unsigned char *)skb); | |||
barrett_deserialize_and_reduce(sk, skb, SHA512_OUTPUT_BYTES, &goldi_q448); | |||
barrett_serialize(privkey->opaque, sk, GOLDI_FIELD_BYTES); | |||
scalarmul_fixed_base(&exta, sk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base); | |||
untwist_and_double_and_serialize(&pk, &exta); | |||
p448_serialize(&privkey->opaque[GOLDI_FIELD_BYTES], &pk); | |||
return GOLDI_EOK; | |||
} | |||
void | |||
goldilocks_underive_private_key ( | |||
unsigned char proto[GOLDI_SYMKEY_BYTES], | |||
const struct goldilocks_private_key_t *privkey | |||
) { | |||
memcpy(proto, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES); | |||
} | |||
int | |||
goldilocks_keygen ( | |||
struct goldilocks_private_key_t *privkey, | |||
struct goldilocks_public_key_t *pubkey | |||
) { | |||
if (!goldilocks_check_init()) { | |||
return GOLDI_EUNINIT; | |||
} | |||
unsigned char proto[GOLDI_SYMKEY_BYTES]; | |||
#if GOLDILOCKS_USE_PTHREAD | |||
int ml_ret = pthread_mutex_lock(&goldilocks_global.mutex); | |||
if (ml_ret) return ml_ret; | |||
#endif | |||
int ret = crandom_generate(&goldilocks_global.rand, (unsigned char *)sk, sizeof(sk)); | |||
int ret2 = crandom_generate(&goldilocks_global.rand, &privkey->opaque[112], 32); | |||
if (!ret) ret = ret2; | |||
int ret = crandom_generate(&goldilocks_global.rand, proto, sizeof(proto)); | |||
#if GOLDILOCKS_USE_PTHREAD | |||
ml_ret = pthread_mutex_unlock(&goldilocks_global.mutex); | |||
if (ml_ret) abort(); | |||
#endif | |||
barrett_reduce(sk,sizeof(sk)/sizeof(sk[0]),0,&goldi_q448); | |||
barrett_serialize(privkey->opaque, sk, 448/8); | |||
scalarmul_fixed_base(&exta, sk, 448, &goldilocks_global.fixed_base); | |||
//transfer_and_serialize_qtor(&pk, &sqrt_d_minus_1, &exta); | |||
untwist_and_double_and_serialize(&pk, &exta); | |||
int ret2 = goldilocks_derive_private_key(privkey, proto); | |||
if (!ret) ret = ret2; | |||
p448_serialize(pubkey->opaque, &pk); | |||
memcpy(&privkey->opaque[56], pubkey->opaque, 56); | |||
ret2 = goldilocks_private_to_public(pubkey, privkey); | |||
if (!ret) ret = ret2; | |||
return ret ? GOLDI_ENODICE : GOLDI_EOK; | |||
} | |||
@@ -194,7 +246,7 @@ goldilocks_private_to_public ( | |||
const struct goldilocks_private_key_t *privkey | |||
) { | |||
struct p448_t pk; | |||
mask_t msucc = p448_deserialize(&pk,&privkey->opaque[56]); | |||
mask_t msucc = p448_deserialize(&pk,&privkey->opaque[GOLDI_FIELD_BYTES]); | |||
if (msucc) { | |||
p448_serialize(pubkey->opaque, &pk); | |||
@@ -204,30 +256,46 @@ goldilocks_private_to_public ( | |||
} | |||
} | |||
int | |||
goldilocks_shared_secret ( | |||
uint8_t shared[64], | |||
static int | |||
goldilocks_shared_secret_core ( | |||
uint8_t shared[GOLDI_SHARED_SECRET_BYTES], | |||
const struct goldilocks_private_key_t *my_privkey, | |||
const struct goldilocks_public_key_t *your_pubkey | |||
const struct goldilocks_public_key_t *your_pubkey, | |||
const struct goldilocks_precomputed_public_key_t *pre | |||
) { | |||
/* This function doesn't actually need anything in goldilocks_global, | |||
* so it doesn't check init. | |||
*/ | |||
word_t sk[448/WORD_BITS]; | |||
assert(GOLDI_SHARED_SECRET_BYTES == SHA512_OUTPUT_BYTES); | |||
word_t sk[GOLDI_FIELD_WORDS]; | |||
struct p448_t pk; | |||
mask_t succ = p448_deserialize(&pk,your_pubkey->opaque), msucc = -1; | |||
#ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS | |||
struct p448_t sum, prod; | |||
msucc &= p448_deserialize(&sum,&my_privkey->opaque[56]); | |||
msucc &= p448_deserialize(&sum,&my_privkey->opaque[GOLDI_FIELD_BYTES]); | |||
p448_mul(&prod,&pk,&sum); | |||
p448_add(&sum,&pk,&sum); | |||
#endif | |||
msucc &= barrett_deserialize(sk,my_privkey->opaque,&goldi_q448); | |||
succ &= montgomery_ladder(&pk,&pk,sk,446,2); | |||
#if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS | |||
if (pre) { | |||
struct tw_extensible_t tw; | |||
succ &= scalarmul_fixed_base(&tw, sk, GOLDI_SCALAR_BITS, &pre->table); | |||
untwist_and_double_and_serialize(&pk, &tw); | |||
} else { | |||
succ &= montgomery_ladder(&pk,&pk,sk,GOLDI_SCALAR_BITS,1); | |||
} | |||
#else | |||
(void)pre; | |||
succ &= montgomery_ladder(&pk,&pk,sk,GOLDI_SCALAR_BITS,1); | |||
#endif | |||
p448_serialize(shared,&pk); | |||
@@ -236,28 +304,28 @@ goldilocks_shared_secret ( | |||
sha512_init(&ctx); | |||
#ifdef EXPERIMENT_ECDH_OBLITERATE_CT | |||
uint8_t oblit[40]; | |||
uint8_t oblit[GOLDI_DIVERSIFY_BYTES + GOLDI_SYMKEY_BYTES]; | |||
unsigned i; | |||
for (i=0; i<8; i++) { | |||
for (i=0; i<GOLDI_DIVERSIFY_BYTES; i++) { | |||
oblit[i] = "noshared"[i] & ~(succ&msucc); | |||
} | |||
for (i=0; i<32; i++) { | |||
oblit[8+i] = my_privkey->opaque[112+i] & ~(succ&msucc); | |||
for (i=0; i<GOLDI_SYMKEY_BYTES; i++) { | |||
oblit[GOLDI_DIVERSIFY_BYTES+i] = my_privkey->opaque[2*GOLDI_FIELD_BYTES+i] & ~(succ&msucc); | |||
} | |||
sha512_update(&ctx, oblit, 40); | |||
sha512_update(&ctx, oblit, sizeof(oblit)); | |||
#endif | |||
#ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS | |||
/* stir in the sum and product of the pubkeys. */ | |||
uint8_t a_pk[56]; | |||
uint8_t a_pk[GOLDI_FIELD_BYTES]; | |||
p448_serialize(a_pk, &sum); | |||
sha512_update(&ctx, a_pk, 56); | |||
sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES); | |||
p448_serialize(a_pk, &prod); | |||
sha512_update(&ctx, a_pk, 56); | |||
sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES); | |||
#endif | |||
/* stir in the shared key and finish */ | |||
sha512_update(&ctx, shared, 56); | |||
sha512_update(&ctx, shared, GOLDI_FIELD_BYTES); | |||
sha512_final(&ctx, shared); | |||
return (GOLDI_ECORRUPT & ~msucc) | |||
@@ -265,9 +333,42 @@ goldilocks_shared_secret ( | |||
| (GOLDI_EOK & msucc & succ); | |||
} | |||
int | |||
goldilocks_shared_secret ( | |||
uint8_t shared[GOLDI_SHARED_SECRET_BYTES], | |||
const struct goldilocks_private_key_t *my_privkey, | |||
const struct goldilocks_public_key_t *your_pubkey | |||
) { | |||
return goldilocks_shared_secret_core( | |||
shared, | |||
my_privkey, | |||
your_pubkey, | |||
NULL | |||
); | |||
} | |||
static void | |||
goldilocks_derive_challenge( | |||
word_t challenge[GOLDI_FIELD_WORDS], | |||
const unsigned char pubkey[GOLDI_FIELD_BYTES], | |||
const unsigned char gnonce[GOLDI_FIELD_BYTES], | |||
const unsigned char *message, | |||
uint64_t message_len | |||
) { | |||
/* challenge = H(pk, [nonceG], message). */ | |||
unsigned char sha_out[SHA512_OUTPUT_BYTES]; | |||
struct sha512_ctx_t ctx; | |||
sha512_init(&ctx); | |||
sha512_update(&ctx, pubkey, GOLDI_FIELD_BYTES); | |||
sha512_update(&ctx, gnonce, GOLDI_FIELD_BYTES); | |||
sha512_update(&ctx, message, message_len); | |||
sha512_final(&ctx, sha_out); | |||
barrett_deserialize_and_reduce(challenge, sha_out, sizeof(sha_out), &goldi_q448); | |||
} | |||
int | |||
goldilocks_sign ( | |||
uint8_t signature_out[56*2], | |||
uint8_t signature_out[GOLDI_SIGNATURE_BYTES], | |||
const uint8_t *message, | |||
uint64_t message_len, | |||
const struct goldilocks_private_key_t *privkey | |||
@@ -277,7 +378,7 @@ goldilocks_sign ( | |||
} | |||
/* challenge = H(pk, [nonceG], message). */ | |||
word_t skw[448/WORD_BITS]; | |||
word_t skw[GOLDI_FIELD_WORDS]; | |||
mask_t succ = barrett_deserialize(skw,privkey->opaque,&goldi_q448); | |||
if (!succ) { | |||
memset(skw,0,sizeof(skw)); | |||
@@ -285,48 +386,50 @@ goldilocks_sign ( | |||
} | |||
/* Derive a nonce. TODO: use HMAC. FUTURE: factor. */ | |||
unsigned char sha_out[512/8]; | |||
word_t tk[448/WORD_BITS]; | |||
unsigned char sha_out[SHA512_OUTPUT_BYTES]; | |||
word_t tk[GOLDI_FIELD_WORDS]; | |||
struct sha512_ctx_t ctx; | |||
sha512_init(&ctx); | |||
sha512_update(&ctx, (const unsigned char *)"signonce", 8); | |||
sha512_update(&ctx, &privkey->opaque[112], 32); | |||
sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES); | |||
sha512_update(&ctx, message, message_len); | |||
sha512_update(&ctx, &privkey->opaque[112], 32); | |||
sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES); | |||
sha512_final(&ctx, sha_out); | |||
barrett_deserialize_and_reduce(tk, sha_out, 512/8, &goldi_q448); | |||
barrett_deserialize_and_reduce(tk, sha_out, SHA512_OUTPUT_BYTES, &goldi_q448); | |||
/* 4[nonce]G */ | |||
uint8_t signature_tmp[56]; | |||
uint8_t signature_tmp[GOLDI_FIELD_BYTES]; | |||
struct tw_extensible_t exta; | |||
struct p448_t gsk; | |||
scalarmul_fixed_base(&exta, tk, 448, &goldilocks_global.fixed_base); | |||
scalarmul_fixed_base(&exta, tk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base); | |||
double_tw_extensible(&exta); | |||
untwist_and_double_and_serialize(&gsk, &exta); | |||
p448_serialize(signature_tmp, &gsk); | |||
word_t challenge[448/WORD_BITS]; | |||
sha512_update(&ctx, &privkey->opaque[56], 56); | |||
sha512_update(&ctx, signature_tmp, 56); | |||
sha512_update(&ctx, message, message_len); | |||
sha512_final(&ctx, sha_out); | |||
barrett_deserialize_and_reduce(challenge, sha_out, 512/8, &goldi_q448); | |||
word_t challenge[GOLDI_FIELD_WORDS]; | |||
goldilocks_derive_challenge ( | |||
challenge, | |||
&privkey->opaque[GOLDI_FIELD_BYTES], | |||
signature_tmp, | |||
message, | |||
message_len | |||
); | |||
// reduce challenge and sub. | |||
barrett_negate(challenge,448/WORD_BITS,&goldi_q448); | |||
barrett_negate(challenge,GOLDI_FIELD_WORDS,&goldi_q448); | |||
barrett_mac( | |||
tk,448/WORD_BITS, | |||
challenge,448/WORD_BITS, | |||
skw,448/WORD_BITS, | |||
tk,GOLDI_FIELD_WORDS, | |||
challenge,GOLDI_FIELD_WORDS, | |||
skw,GOLDI_FIELD_WORDS, | |||
&goldi_q448 | |||
); | |||
word_t carry = add_nr_ext_packed(tk,tk,448/WORD_BITS,tk,448/WORD_BITS,-1); | |||
barrett_reduce(tk,448/WORD_BITS,carry,&goldi_q448); | |||
word_t carry = add_nr_ext_packed(tk,tk,GOLDI_FIELD_WORDS,tk,GOLDI_FIELD_WORDS,-1); | |||
barrett_reduce(tk,GOLDI_FIELD_WORDS,carry,&goldi_q448); | |||
memcpy(signature_out, signature_tmp, 56); | |||
barrett_serialize(signature_out+56, tk, 448/8); | |||
memcpy(signature_out, signature_tmp, GOLDI_FIELD_BYTES); | |||
barrett_serialize(signature_out+GOLDI_FIELD_BYTES, tk, GOLDI_FIELD_BYTES); | |||
memset((unsigned char *)tk,0,sizeof(tk)); | |||
memset((unsigned char *)skw,0,sizeof(skw)); | |||
memset((unsigned char *)challenge,0,sizeof(challenge)); | |||
@@ -342,7 +445,7 @@ goldilocks_sign ( | |||
int | |||
goldilocks_verify ( | |||
const uint8_t signature[56*2], | |||
const uint8_t signature[GOLDI_SIGNATURE_BYTES], | |||
const uint8_t *message, | |||
uint64_t message_len, | |||
const struct goldilocks_public_key_t *pubkey | |||
@@ -352,24 +455,16 @@ goldilocks_verify ( | |||
} | |||
struct p448_t pk; | |||
word_t s[448/WORD_BITS]; | |||
word_t s[GOLDI_FIELD_WORDS]; | |||
mask_t succ = p448_deserialize(&pk,pubkey->opaque); | |||
if (!succ) return GOLDI_EINVAL; | |||
succ = barrett_deserialize(s, &signature[56], &goldi_q448); | |||
succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &goldi_q448); | |||
if (!succ) return GOLDI_EINVAL; | |||
/* challenge = H(pk, [nonceG], message). */ | |||
unsigned char sha_out[512/8]; | |||
word_t challenge[448/WORD_BITS]; | |||
struct sha512_ctx_t ctx; | |||
sha512_init(&ctx); | |||
sha512_update(&ctx, pubkey->opaque, 56); | |||
sha512_update(&ctx, signature, 56); | |||
sha512_update(&ctx, message, message_len); | |||
sha512_final(&ctx, sha_out); | |||
barrett_deserialize_and_reduce(challenge, sha_out, 512/8, &goldi_q448); | |||
word_t challenge[GOLDI_FIELD_WORDS]; | |||
goldilocks_derive_challenge(challenge, pubkey->opaque, signature, message, message_len); | |||
struct p448_t eph; | |||
struct tw_extensible_t pk_text; | |||
@@ -381,7 +476,102 @@ goldilocks_verify ( | |||
succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk); | |||
if (!succ) return GOLDI_EINVAL; | |||
linear_combo_var_fixed_vt( &pk_text, challenge, 446, s, 446, goldilocks_global.wnafs, 5 ); | |||
linear_combo_var_fixed_vt( &pk_text, | |||
challenge, GOLDI_SCALAR_BITS, | |||
s, GOLDI_SCALAR_BITS, | |||
goldilocks_global.wnafs, 5 ); | |||
untwist_and_double_and_serialize( &pk, &pk_text ); | |||
p448_sub(&eph, &eph, &pk); | |||
p448_bias(&eph, 2); | |||
succ = p448_is_zero(&eph); | |||
return succ ? 0 : GOLDI_EINVAL; | |||
} | |||
#if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS | |||
struct goldilocks_precomputed_public_key_t * | |||
goldilocks_precompute_public_key ( | |||
const struct goldilocks_public_key_t *pub | |||
) { | |||
struct goldilocks_precomputed_public_key_t *precom; | |||
precom = (struct goldilocks_precomputed_public_key_t *) | |||
malloc(sizeof(*precom)); | |||
if (!precom) return NULL; | |||
struct tw_extensible_t pk_text; | |||
struct p448_t pk; | |||
mask_t succ = p448_deserialize(&pk, pub->opaque); | |||
if (!succ) { | |||
free(precom); | |||
return NULL; | |||
} | |||
succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk); | |||
if (!succ) { | |||
free(precom); | |||
return NULL; | |||
} | |||
int big = USE_BIG_TABLES; | |||
uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14; | |||
succ = precompute_fixed_base(&precom->table, &pk_text, n, t, s, NULL); | |||
if (!succ) { | |||
free(precom); | |||
return NULL; | |||
} | |||
memcpy(&precom->pub,pub,sizeof(*pub)); | |||
return precom; | |||
} | |||
void | |||
goldilocks_destroy_precomputed_public_key ( | |||
struct goldilocks_precomputed_public_key_t *precom | |||
) { | |||
if (!precom) return; | |||
destroy_fixed_base(&precom->table); | |||
memset(&precom->pub.opaque, 0, sizeof(precom->pub)); | |||
free(precom); | |||
} | |||
int | |||
goldilocks_verify_precomputed ( | |||
const uint8_t signature[GOLDI_SIGNATURE_BYTES], | |||
const uint8_t *message, | |||
uint64_t message_len, | |||
const struct goldilocks_precomputed_public_key_t *pubkey | |||
) { | |||
if (!goldilocks_check_init()) { | |||
return GOLDI_EUNINIT; | |||
} | |||
word_t s[GOLDI_FIELD_WORDS]; | |||
mask_t succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &goldi_q448); | |||
if (!succ) return GOLDI_EINVAL; | |||
word_t challenge[GOLDI_FIELD_WORDS]; | |||
goldilocks_derive_challenge(challenge, pubkey->pub.opaque, signature, message, message_len); | |||
struct p448_t eph, pk; | |||
struct tw_extensible_t pk_text; | |||
/* deserialize [nonce]G */ | |||
succ = p448_deserialize(&eph, signature); | |||
if (!succ) return GOLDI_EINVAL; | |||
succ = linear_combo_combs_vt ( | |||
&pk_text, | |||
challenge, GOLDI_SCALAR_BITS, &pubkey->table, | |||
s, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base | |||
); | |||
if (!succ) return GOLDI_EINVAL; | |||
untwist_and_double_and_serialize( &pk, &pk_text ); | |||
p448_sub(&eph, &eph, &pk); | |||
@@ -391,3 +581,20 @@ goldilocks_verify ( | |||
return succ ? 0 : GOLDI_EINVAL; | |||
} | |||
int | |||
goldilocks_shared_secret_precomputed ( | |||
uint8_t shared[GOLDI_SHARED_SECRET_BYTES], | |||
const struct goldilocks_private_key_t *my_privkey, | |||
const struct goldilocks_precomputed_public_key_t *your_pubkey | |||
) { | |||
return goldilocks_shared_secret_core( | |||
shared, | |||
my_privkey, | |||
&your_pubkey->pub, | |||
your_pubkey | |||
); | |||
} | |||
#endif // GOLDI_IMPLEMENT_PRECOMPUTED_KEYS | |||
@@ -12,7 +12,9 @@ | |||
#include <sys/types.h> | |||
#if __i386__ || __x86_64__ | |||
#include <immintrin.h> | |||
#endif | |||
#define INTRINSIC \ | |||
static __inline__ __attribute__((__gnu_inline__, __always_inline__, unused)) | |||
@@ -26,7 +26,7 @@ struct fixed_base_table_t { | |||
struct tw_niels_t *table; | |||
/** Adjustments to the scalar in even and odd cases, respectively. */ | |||
word_t scalar_adjustments[2*(448/WORD_BITS)]; | |||
word_t scalar_adjustments[2*(448/WORD_BITS)]; /* MAGIC */ | |||
/** The number of combs in the table. */ | |||
unsigned int n; | |||
@@ -103,7 +103,7 @@ montgomery_ladder ( | |||
void | |||
scalarmul ( | |||
struct tw_extensible_t *working, | |||
const word_t scalar[448/WORD_BITS] | |||
const word_t scalar[448/WORD_BITS] /* MAGIC */ | |||
/* TODO? int nbits */ | |||
); | |||
@@ -124,7 +124,7 @@ scalarmul ( | |||
void | |||
scalarmul_vlook ( | |||
struct tw_extensible_t *working, | |||
const word_t scalar[448/WORD_BITS] | |||
const word_t scalar[448/WORD_BITS] /* MAGIC */ | |||
/* TODO? int nbits */ | |||
); | |||
@@ -209,7 +209,7 @@ scalarmul_fixed_base ( | |||
void | |||
scalarmul_vt ( | |||
struct tw_extensible_t *working, | |||
const word_t scalar[448/WORD_BITS] | |||
const word_t scalar[448/WORD_BITS] /* MAGIC */ | |||
); | |||
@@ -274,14 +274,42 @@ scalarmul_fixed_base_wnaf_vt ( | |||
void | |||
linear_combo_var_fixed_vt ( | |||
struct tw_extensible_t *working, | |||
const word_t scalar_var[448/WORD_BITS], | |||
const word_t scalar_var[448/WORD_BITS], /* MAGIC */ | |||
unsigned int nbits_var, | |||
const word_t scalar_pre[448/WORD_BITS], | |||
const word_t scalar_pre[448/WORD_BITS], /* MAGIC */ | |||
unsigned int nbits_pre, | |||
const struct tw_niels_t *precmp, | |||
unsigned int table_bits_pre | |||
); | |||
/** | |||
* Variable-time scalar linear combination of two fixed points. | |||
* | |||
* @warning This function takes variable time. It is intended for | |||
* signature verification. | |||
* | |||
* @param [out] working The output point. | |||
* @param [in] scalar1 The first scalar. | |||
* @param [in] nbits1 The number of bits in the first scalar. | |||
* @param [in] table1 The first precomputed table. | |||
* @param [in] scalar2 The second scalar. | |||
* @param [in] nbits1 The number of bits in the second scalar. | |||
* @param [in] table1 The second precomputed table. | |||
* | |||
* @retval MASK_SUCCESS Success. | |||
* @retval MASK_FAILURE Failure, because eg the tables are too small. | |||
*/ | |||
mask_t | |||
linear_combo_combs_vt ( | |||
struct tw_extensible_t *out, | |||
const word_t scalar1[448/WORD_BITS], | |||
unsigned int nbits1, | |||
const struct fixed_base_table_t *table1, | |||
const word_t scalar2[448/WORD_BITS], | |||
unsigned int nbits2, | |||
const struct fixed_base_table_t *table2 | |||
); | |||
#ifdef __cplusplus | |||
}; | |||
#endif | |||
@@ -10,6 +10,8 @@ | |||
extern "C" { | |||
#endif | |||
#define SHA512_OUTPUT_BYTES 64 | |||
/** | |||
* SHA512 hashing context. | |||
* | |||
@@ -37,7 +39,7 @@ sha512_update ( | |||
void | |||
sha512_final ( | |||
struct sha512_ctx_t *ctx, | |||
uint8_t result[64] | |||
uint8_t result[SHA512_OUTPUT_BYTES] | |||
); | |||
#ifdef __cplusplus | |||
@@ -8,11 +8,22 @@ | |||
/* for posix_memalign */ | |||
#define _XOPEN_SOURCE 600 | |||
#ifndef __APPLE__ | |||
#define _BSD_SOURCE | |||
#include <endian.h> | |||
#endif | |||
#include <stdint.h> | |||
#include <stdlib.h> | |||
#include <sys/types.h> | |||
#include <inttypes.h> | |||
#if __ARM_NEON__ | |||
#include <arm_neon.h> | |||
#elif __SSE2__ | |||
#include <immintrin.h> | |||
#endif | |||
#if (__SIZEOF_INT128__ == 16 && __SIZEOF_SIZE_T__ == 8 && (__SIZEOF_LONG__==8 || __POINTER_WIDTH__==64) && !GOLDI_FORCE_32_BIT) | |||
/* It's a 64-bit machine if: | |||
* // limits.h thinks so | |||
@@ -33,6 +44,7 @@ typedef __int128_t dsword_t; | |||
#define PRIxWORD58 "%014" PRIx64 | |||
#define U64LE(x) x##ull | |||
#define U58LE(x) x##ull | |||
#define letohWORD letoh64 | |||
#else | |||
typedef uint16_t hword_t; | |||
typedef uint32_t word_t; | |||
@@ -45,15 +57,19 @@ typedef int64_t dsword_t; | |||
#define PRIxWORD58 "%07" PRIx32 | |||
#define U64LE(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32 | |||
#define U58LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28 | |||
#define letohWORD letoh32 | |||
#endif | |||
#define WORD_BITS (sizeof(word_t) * 8) | |||
/* TODO: vector width for procs like ARM; gcc support */ | |||
typedef word_t mask_t, vecmask_t __attribute__((ext_vector_type(4))); | |||
typedef word_t mask_t; | |||
static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -1; | |||
#ifdef __ARM_NEON__ | |||
typedef uint32x4_t vecmask_t; | |||
#else | |||
/* FIXME this only works on clang */ | |||
typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2))); | |||
typedef int64_t int64x2_t __attribute__((ext_vector_type(2))); | |||
@@ -61,8 +77,13 @@ typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4))); | |||
typedef int64_t int64x4_t __attribute__((ext_vector_type(4))); | |||
typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4))); | |||
typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); | |||
typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2))); | |||
typedef int32_t int32x2_t __attribute__((ext_vector_type(2))); | |||
typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8))); | |||
typedef int32_t int32x8_t __attribute__((ext_vector_type(8))); | |||
/* TODO: vector width for procs like ARM; gcc support */ | |||
typedef word_t vecmask_t __attribute__((ext_vector_type(4))); | |||
#endif | |||
#if __AVX2__ | |||
typedef uint32x8_t big_register_t; | |||
@@ -82,11 +103,28 @@ typedef uint32_t big_register_t; | |||
#endif | |||
#if __AVX2__ || __SSE2__ || __ARM_NEON__ | |||
#ifdef __ARM_NEON__ | |||
static __inline__ big_register_t | |||
br_set_to_mask(mask_t x) { | |||
return vdupq_n_u32(x); | |||
} | |||
#else | |||
static __inline__ big_register_t | |||
br_set_to_mask(mask_t x) { | |||
return (big_register_t)x; | |||
} | |||
#endif | |||
#if __AVX2__ || __SSE2__ | |||
static __inline__ big_register_t | |||
br_is_zero(big_register_t x) { | |||
return (big_register_t)(x == (big_register_t)0); | |||
} | |||
#elif __ARM_NEON__ | |||
static __inline__ big_register_t | |||
br_is_zero(big_register_t x) { | |||
return vceqq_u32(x,x^x); | |||
} | |||
#else | |||
static __inline__ mask_t | |||
br_is_zero(word_t x) { | |||
@@ -96,6 +134,22 @@ br_is_zero(word_t x) { | |||
#ifdef __APPLE__ | |||
static inline uint64_t | |||
htobe64 (uint64_t x) { | |||
__asm__ ("bswapq %0" : "+r"(x)); | |||
return x; | |||
} | |||
static inline uint64_t | |||
htole64 (uint64_t x) { return x; } | |||
static inline uint64_t | |||
letoh64 (uint64_t x) { return x; } | |||
#endif | |||
/** | |||
* Allocate memory which is sufficiently aligned to be used for the | |||
* largest vector on the system (for now that's a big_register_t). | |||
@@ -63,14 +63,14 @@ cond_negate_tw_pniels ( | |||
cond_negate_tw_niels(&n->n, doNegate); | |||
} | |||
void | |||
static __inline__ void | |||
constant_time_lookup_tw_pniels ( | |||
struct tw_pniels_t *out, | |||
const struct tw_pniels_t *in, | |||
int nin, | |||
int idx | |||
) { | |||
big_register_t big_one = 1, big_i = idx; | |||
big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx); | |||
big_register_t *o = (big_register_t *)out; | |||
const big_register_t *i = (const big_register_t *)in; | |||
int j; | |||
@@ -85,14 +85,14 @@ constant_time_lookup_tw_pniels ( | |||
} | |||
} | |||
static __inline__ void | |||
static __inline__ void | |||
constant_time_lookup_tw_niels ( | |||
struct tw_niels_t *out, | |||
const struct tw_niels_t *in, | |||
int nin, | |||
int idx | |||
) { | |||
big_register_t big_one = 1, big_i = idx; | |||
big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx); | |||
big_register_t *o = (big_register_t *)out; | |||
const big_register_t *i = (const big_register_t *)in; | |||
int j; | |||
@@ -139,64 +139,73 @@ scalarmul ( | |||
struct tw_extensible_t *working, | |||
const word_t scalar[448/WORD_BITS] | |||
) { | |||
const int nbits=448; /* HACK? */ | |||
const int nbits=450; /* MAGIC */ | |||
word_t prepared_data[448*2/WORD_BITS] = { | |||
U64LE(0x9595b847fdf73126), | |||
U64LE(0x9bb9b8a856af5200), | |||
U64LE(0xb3136e22f37d5c4f), | |||
U64LE(0x0000000189a19442), | |||
U64LE(0xebec9967f5d3f5c2), | |||
U64LE(0x0aa09b49b16c9a02), | |||
U64LE(0x7f6126aec172cd8e), | |||
U64LE(0x00000007b027e54d), | |||
U64LE(0x0000000000000000), | |||
U64LE(0x0000000000000000), | |||
U64LE(0x4000000000000000), | |||
U64LE(0x721cf5b5529eec33), | |||
U64LE(0x7a4cf635c8e9c2ab), | |||
U64LE(0xeec492d944a725bf), | |||
U64LE(0x000000020cd77058), | |||
U64LE(0xc873d6d54a7bb0cf), | |||
U64LE(0xe933d8d723a70aad), | |||
U64LE(0xbb124b65129c96fd), | |||
U64LE(0x00000008335dc163), | |||
U64LE(0x0000000000000000), | |||
U64LE(0x0000000000000000), | |||
U64LE(0x0000000000000000) | |||
}; /* TODO: split off */ | |||
}; /* MAGIC */ | |||
word_t scalar2[448/WORD_BITS]; | |||
convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS); | |||
const int WINDOW = 5, /* MAGIC */ | |||
WINDOW_MASK = (1<<WINDOW)-1, WINDOW_T_MASK = WINDOW_MASK >> 1, | |||
NTABLE = 1<<(WINDOW-1); | |||
struct tw_extensible_t tabulator; | |||
copy_tw_extensible(&tabulator, working); | |||
double_tw_extensible(&tabulator); | |||
struct tw_pniels_t pn, multiples[8]; | |||
struct tw_pniels_t pn, multiples[NTABLE]; | |||
convert_tw_extensible_to_tw_pniels(&pn, &tabulator); | |||
convert_tw_extensible_to_tw_pniels(&multiples[0], working); | |||
int i; | |||
for (i=1; i<8; i++) { | |||
int i,j; | |||
for (i=1; i<NTABLE; i++) { | |||
add_tw_pniels_to_tw_extensible(working, &pn); | |||
convert_tw_extensible_to_tw_pniels(&multiples[i], working); | |||
} | |||
i = nbits - 4; | |||
int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF, | |||
inv = (bits>>3)-1; | |||
i = nbits - WINDOW; | |||
int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & WINDOW_MASK, | |||
inv = (bits>>(WINDOW-1))-1; | |||
bits ^= inv; | |||
constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7); | |||
constant_time_lookup_tw_pniels(&pn, multiples, NTABLE, bits & WINDOW_T_MASK); | |||
cond_negate_tw_pniels(&pn, inv); | |||
convert_tw_pniels_to_tw_extensible(working, &pn); | |||
for (i-=4; i>=0; i-=4) { | |||
double_tw_extensible(working); | |||
double_tw_extensible(working); | |||
double_tw_extensible(working); | |||
double_tw_extensible(working); | |||
for (i-=WINDOW; i>=0; i-=WINDOW) { | |||
for (j=0; j<WINDOW; j++) { | |||
double_tw_extensible(working); | |||
} | |||
bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF; | |||
inv = (bits>>3)-1; | |||
bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS); | |||
if (i/WORD_BITS < 448/WORD_BITS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) { | |||
bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS)); | |||
} | |||
bits &= WINDOW_MASK; | |||
inv = (bits>>(WINDOW-1))-1; | |||
bits ^= inv; | |||
constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7); | |||
constant_time_lookup_tw_pniels(&pn, multiples, NTABLE, bits & WINDOW_T_MASK); | |||
cond_negate_tw_pniels(&pn, inv); | |||
add_tw_pniels_to_tw_extensible(working, &pn); | |||
} | |||
@@ -207,81 +216,89 @@ scalarmul_vlook ( | |||
struct tw_extensible_t *working, | |||
const word_t scalar[448/WORD_BITS] | |||
) { | |||
const int nbits=448; /* HACK? */ | |||
const int nbits=450; /* HACK? */ | |||
word_t prepared_data[448*2/WORD_BITS] = { | |||
U64LE(0x9595b847fdf73126), | |||
U64LE(0x9bb9b8a856af5200), | |||
U64LE(0xb3136e22f37d5c4f), | |||
U64LE(0x0000000189a19442), | |||
U64LE(0xebec9967f5d3f5c2), | |||
U64LE(0x0aa09b49b16c9a02), | |||
U64LE(0x7f6126aec172cd8e), | |||
U64LE(0x00000007b027e54d), | |||
U64LE(0x0000000000000000), | |||
U64LE(0x0000000000000000), | |||
U64LE(0x4000000000000000), | |||
U64LE(0x721cf5b5529eec33), | |||
U64LE(0x7a4cf635c8e9c2ab), | |||
U64LE(0xeec492d944a725bf), | |||
U64LE(0x000000020cd77058), | |||
U64LE(0xc873d6d54a7bb0cf), | |||
U64LE(0xe933d8d723a70aad), | |||
U64LE(0xbb124b65129c96fd), | |||
U64LE(0x00000008335dc163), | |||
U64LE(0x0000000000000000), | |||
U64LE(0x0000000000000000), | |||
U64LE(0x0000000000000000) | |||
}; /* TODO: split off */ | |||
}; /* MAGIC: split off */ | |||
word_t scalar2[448/WORD_BITS]; | |||
convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS); | |||
const int WINDOW = 5, /* MAGIC */ | |||
WINDOW_MASK = (1<<WINDOW)-1, WINDOW_T_MASK = WINDOW_MASK >> 1, | |||
NTABLE = 1<<(WINDOW-1); | |||
struct tw_extensible_t tabulator; | |||
copy_tw_extensible(&tabulator, working); | |||
double_tw_extensible(&tabulator); | |||
struct tw_pniels_t pn, multiples[8]; | |||
struct tw_pniels_t pn, multiples[NTABLE]; | |||
convert_tw_extensible_to_tw_pniels(&pn, &tabulator); | |||
convert_tw_extensible_to_tw_pniels(&multiples[0], working); | |||
int i; | |||
for (i=1; i<8; i++) { | |||
int i,j; | |||
for (i=1; i<NTABLE; i++) { | |||
add_tw_pniels_to_tw_extensible(working, &pn); | |||
convert_tw_extensible_to_tw_pniels(&multiples[i], working); | |||
} | |||
i = nbits - 4; | |||
int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF, | |||
inv = (bits>>3)-1; | |||
i = nbits - WINDOW; | |||
int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & WINDOW_MASK, | |||
inv = (bits>>(WINDOW-1))-1; | |||
bits ^= inv; | |||
copy_tw_pniels(&pn, &multiples[bits&7]); | |||
copy_tw_pniels(&pn, &multiples[bits & WINDOW_T_MASK]); | |||
cond_negate_tw_pniels(&pn, inv); | |||
convert_tw_pniels_to_tw_extensible(working, &pn); | |||
for (i-=4; i>=0; i-=4) { | |||
double_tw_extensible(working); | |||
double_tw_extensible(working); | |||
double_tw_extensible(working); | |||
double_tw_extensible(working); | |||
for (i-=WINDOW; i>=0; i-=WINDOW) { | |||
for (j=0; j<WINDOW; j++) { | |||
double_tw_extensible(working); | |||
} | |||
bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & 0xF; | |||
inv = (bits>>3)-1; | |||
bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS); | |||
if (i/WORD_BITS < 448/WORD_BITS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) { | |||
bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS)); | |||
} | |||
bits &= WINDOW_MASK; | |||
inv = (bits>>(WINDOW-1))-1; | |||
bits ^= inv; | |||
copy_tw_pniels(&pn, &multiples[bits&7]); | |||
copy_tw_pniels(&pn, &multiples[bits & WINDOW_T_MASK]); | |||
cond_negate_tw_pniels(&pn, inv); | |||
add_tw_pniels_to_tw_extensible(working, &pn); | |||
} | |||
} | |||
mask_t | |||
scalarmul_fixed_base ( | |||
struct tw_extensible_t *out, | |||
const word_t scalar[448/WORD_BITS], | |||
static mask_t | |||
schedule_scalar_for_combs ( | |||
word_t *scalar2, | |||
const word_t *scalar, | |||
unsigned int nbits, | |||
const struct fixed_base_table_t *table | |||
) { | |||
unsigned int i; | |||
unsigned int n = table->n, t = table->t, s = table->s; | |||
assert(n >= 1 && t >= 1 && s >= 1); | |||
if (n*t*s < nbits) { | |||
if (n*t*s < nbits || n < 1 || t < 1 || s < 1) { | |||
return MASK_FAILURE; | |||
} | |||
@@ -289,10 +306,9 @@ scalarmul_fixed_base ( | |||
scalar2_words = scalar_words; | |||
if (scalar2_words < 448 / WORD_BITS) | |||
scalar2_words = 448 / WORD_BITS; | |||
word_t scalar2[scalar2_words], scalar3[scalar2_words]; | |||
word_t scalar3[scalar2_words]; | |||
/* Copy scalar to scalar3, but clear its high bits (if there are any) */ | |||
unsigned int i,j,k; | |||
for (i=0; i<scalar_words; i++) { | |||
scalar3[i] = scalar[i]; | |||
} | |||
@@ -309,6 +325,31 @@ scalarmul_fixed_base ( | |||
table->scalar_adjustments , 448 / WORD_BITS | |||
); | |||
return MASK_SUCCESS; | |||
} | |||
mask_t | |||
scalarmul_fixed_base ( | |||
struct tw_extensible_t *out, | |||
const word_t scalar[448/WORD_BITS], | |||
unsigned int nbits, | |||
const struct fixed_base_table_t *table | |||
) { | |||
unsigned int i,j,k; | |||
unsigned int n = table->n, t = table->t, s = table->s; | |||
unsigned int scalar2_words = (nbits + WORD_BITS - 1)/WORD_BITS; | |||
if (scalar2_words < 448 / WORD_BITS) scalar2_words = 448 / WORD_BITS; | |||
word_t scalar2[scalar2_words]; | |||
mask_t succ = schedule_scalar_for_combs(scalar2, scalar, nbits, table); | |||
if (!succ) return MASK_FAILURE; | |||
#ifdef __clang_analyzer__ | |||
assert(t >= 1); | |||
#endif | |||
struct tw_niels_t ni; | |||
for (i=0; i<s; i++) { | |||
@@ -345,6 +386,90 @@ scalarmul_fixed_base ( | |||
return MASK_SUCCESS; | |||
} | |||
mask_t | |||
linear_combo_combs_vt ( | |||
struct tw_extensible_t *out, | |||
const word_t scalar1[448/WORD_BITS], | |||
unsigned int nbits1, | |||
const struct fixed_base_table_t *table1, | |||
const word_t scalar2[448/WORD_BITS], | |||
unsigned int nbits2, | |||
const struct fixed_base_table_t *table2 | |||
) { | |||
unsigned int i,j,k,sc; | |||
unsigned int s1 = table1->s, s2 = table2->s, smax = (s1 > s2) ? s1 : s2; | |||
unsigned int scalar1b_words = (nbits1 + WORD_BITS - 1)/WORD_BITS; | |||
if (scalar1b_words < 448 / WORD_BITS) scalar1b_words = 448 / WORD_BITS; | |||
unsigned int scalar2b_words = (nbits2 + WORD_BITS - 1)/WORD_BITS; | |||
if (scalar2b_words < 448 / WORD_BITS) scalar2b_words = 448 / WORD_BITS; | |||
word_t scalar1b[scalar1b_words], scalar2b[scalar2b_words]; | |||
/* Schedule the scalars */ | |||
mask_t succ; | |||
succ = schedule_scalar_for_combs(scalar1b, scalar1, nbits1, table1); | |||
if (!succ) return MASK_FAILURE; | |||
succ = schedule_scalar_for_combs(scalar2b, scalar2, nbits2, table2); | |||
if (!succ) return MASK_FAILURE; | |||
#ifdef __clang_analyzer__ | |||
assert(table1->t >= 1); | |||
assert(table2->t >= 1); | |||
#endif | |||
struct tw_niels_t ni; | |||
unsigned int swords[2] = {scalar1b_words, scalar2b_words}, started = 0; | |||
word_t *scalars[2] = {scalar1b,scalar2b}; | |||
for (i=0; i<smax; i++) { | |||
if (i) double_tw_extensible(out); | |||
for (sc=0; sc<2; sc++) { | |||
const struct fixed_base_table_t *table = sc ? table2 : table1; | |||
int ii = i-smax+table->s; | |||
if (ii < 0) continue; | |||
assert(ii < (int)table->s); | |||
for (j=0; j<table->n; j++) { | |||
int tab = 0; | |||
for (k=0; k<table->t; k++) { | |||
unsigned int bit = (table->s-1-ii) + k*table->s + j*(table->s*table->t); | |||
if (bit < swords[sc] * WORD_BITS) { | |||
tab |= (scalars[sc][bit/WORD_BITS] >> (bit%WORD_BITS) & 1) << k; | |||
} | |||
} | |||
mask_t invert = (tab>>(table->t-1))-1; | |||
tab ^= invert; | |||
tab &= (1<<(table->t-1)) - 1; | |||
copy_tw_niels(&ni, &table->table[tab + (j<<(table->t-1))]); | |||
cond_negate_tw_niels(&ni,invert); | |||
if (started) { | |||
add_tw_niels_to_tw_extensible(out, &ni); | |||
} else { | |||
convert_tw_niels_to_tw_extensible(out, &ni); | |||
started = 1; | |||
} | |||
} | |||
} | |||
assert(started); | |||
} | |||
return MASK_SUCCESS; | |||
} | |||
mask_t | |||
precompute_fixed_base ( | |||
struct fixed_base_table_t *out, | |||
@@ -354,7 +479,7 @@ precompute_fixed_base ( | |||
unsigned int s, | |||
struct tw_niels_t *prealloc | |||
) { | |||
if (s < 1 || t < 1 || n < 1 || n*t*s < 446) { | |||
if (s < 1 || t < 1 || n < 1 || n*t*s < 446) { /* MAGIC */ | |||
memset(out, 0, sizeof(*out)); | |||
return 0; | |||
} | |||
@@ -402,7 +527,7 @@ precompute_fixed_base ( | |||
adjustment[(n*t*s) / WORD_BITS] += ((word_t)1) << ((n*t*s) % WORD_BITS); | |||
/* FIXME: factor out somehow */ | |||
/* MAGIC: factor out somehow */ | |||
const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = { | |||
U64LE(0xdc873d6d54a7bb0d), | |||
U64LE(0xde933d8d723a70aa), | |||
@@ -462,13 +587,13 @@ precompute_fixed_base ( | |||
/* Gray-code phase */ | |||
for (j=0;; j++) { | |||
int gray = j ^ (j>>1); | |||
int idx = ((i+1)<<(t-1))-1 ^ gray; | |||
int idx = (((i+1)<<(t-1))-1) ^ gray; | |||
convert_tw_extensible_to_tw_pniels(&pn_tmp, &start); | |||
copy_tw_niels(&table[idx], &pn_tmp.n); | |||
p448_copy(&zs[idx], &pn_tmp.z); | |||
if (j >= (1<<(t-1)) - 1) break; | |||
if (j >= (1u<<(t-1)) - 1) break; | |||
int delta = (j+1) ^ ((j+1)>>1) ^ gray; | |||
for (k=0; delta>1; k++) | |||
@@ -777,7 +902,7 @@ linear_combo_var_fixed_vt( | |||
const struct tw_niels_t *precmp, | |||
unsigned int table_bits_pre | |||
) { | |||
const int table_bits_var = 3; | |||
const int table_bits_var = 4; | |||
struct smvt_control control_var[nbits_var/(table_bits_var+1)+3]; | |||
struct smvt_control control_pre[nbits_pre/(table_bits_pre+1)+3]; | |||
@@ -2,12 +2,8 @@ | |||
* Copyright (c) 2014 Cryptography Research, Inc. | |||
* Released under the MIT License. See LICENSE.txt for license information. | |||
*/ | |||
#ifndef __APPLE__ | |||
#define _BSD_SOURCE | |||
#include <endian.h> | |||
#endif | |||
#include "sha512.h" | |||
#include "word.h" | |||
#include <string.h> | |||
#include <assert.h> | |||
@@ -20,14 +16,6 @@ rotate_r ( | |||
return (x >> d) | (x << (64-d)); | |||
} | |||
#ifdef __APPLE__ | |||
static inline uint64_t | |||
htobe64 (uint64_t x) { | |||
__asm__ ("bswapq %0" : "+r"(x)); | |||
return x; | |||
} | |||
#endif | |||
static const uint64_t | |||
sha512_init_state[8] = { | |||
0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, | |||
@@ -17,23 +17,28 @@ | |||
#include "goldilocks.h" | |||
#include "sha512.h" | |||
double now() { | |||
static __inline__ void | |||
ignore_result ( int result ) { | |||
(void)result; | |||
} | |||
static double now() { | |||
struct timeval tv; | |||
gettimeofday(&tv, NULL); | |||
return tv.tv_sec + tv.tv_usec/1000000.0; | |||
} | |||
void p448_randomize( struct crandom_state_t *crand, struct p448_t *a ) { | |||
static void p448_randomize( struct crandom_state_t *crand, struct p448_t *a ) { | |||
crandom_generate(crand, (unsigned char *)a, sizeof(*a)); | |||
p448_strong_reduce(a); | |||
} | |||
void q448_randomize( struct crandom_state_t *crand, word_t sk[448/WORD_BITS] ) { | |||
static void q448_randomize( struct crandom_state_t *crand, word_t sk[448/WORD_BITS] ) { | |||
crandom_generate(crand, (unsigned char *)sk, 448/8); | |||
} | |||
void p448_print( const char *descr, const struct p448_t *a ) { | |||
static void p448_print( const char *descr, const struct p448_t *a ) { | |||
p448_t b; | |||
p448_copy(&b, a); | |||
p448_strong_reduce(&b); | |||
@@ -45,17 +50,21 @@ void p448_print( const char *descr, const struct p448_t *a ) { | |||
printf("\n"); | |||
} | |||
void p448_print_full( const char *descr, const struct p448_t *a ) { | |||
static void __attribute__((unused)) | |||
p448_print_full ( | |||
const char *descr, | |||
const struct p448_t *a | |||
) { | |||
int j; | |||
printf("%s = 0x", descr); | |||
for (j=15; j>=0; j--) { | |||
printf("%02" PRIxWORD "_" PRIxWORD58 " ", | |||
a->limb[j]>>28, a->limb[j]&(1<<28)-1); | |||
a->limb[j]>>28, a->limb[j]&((1<<28)-1)); | |||
} | |||
printf("\n"); | |||
} | |||
void q448_print( const char *descr, const word_t secret[448/WORD_BITS] ) { | |||
static void q448_print( const char *descr, const word_t secret[448/WORD_BITS] ) { | |||
int j; | |||
printf("%s = 0x", descr); | |||
for (j=448/WORD_BITS-1; j>=0; j--) { | |||
@@ -295,7 +304,7 @@ int main(int argc, char **argv) { | |||
when = now(); | |||
for (i=0; i<nbase/10; i++) { | |||
(void)montgomery_ladder(&a,&b,sk,448,0); | |||
ignore_result(montgomery_ladder(&a,&b,sk,448,0)); | |||
} | |||
when = now() - when; | |||
printf("full ladder: %5.1fµs\n", when * 1e6 / i); | |||
@@ -310,11 +319,18 @@ int main(int argc, char **argv) { | |||
when = now(); | |||
for (i=0; i<nbase/10; i++) { | |||
scalarmul_vlook(&ext,sk); | |||
untwist_and_double_and_serialize(&a,&ext); | |||
} | |||
when = now() - when; | |||
printf("edwards svl: %5.1fµs\n", when * 1e6 / i); | |||
when = now(); | |||
for (i=0; i<nbase/10; i++) { | |||
scalarmul(&ext,sk); | |||
untwist_and_double_and_serialize(&a,&ext); | |||
} | |||
when = now() - when; | |||
printf("edwards smc: %5.1fµs\n", when * 1e6 / i); | |||
when = now(); | |||
for (i=0; i<nbase/10; i++) { | |||
q448_randomize(&crand, sk); | |||
@@ -326,7 +342,7 @@ int main(int argc, char **argv) { | |||
struct tw_niels_t wnaft[1<<6]; | |||
when = now(); | |||
for (i=0; i<nbase/10; i++) { | |||
(void)precompute_fixed_base_wnaf(wnaft,&ext,6); | |||
ignore_result(precompute_fixed_base_wnaf(wnaft,&ext,6)); | |||
} | |||
when = now() - when; | |||
printf("wnaf6 pre: %5.1fµs\n", when * 1e6 / i); | |||
@@ -341,7 +357,7 @@ int main(int argc, char **argv) { | |||
when = now(); | |||
for (i=0; i<nbase/10; i++) { | |||
(void)precompute_fixed_base_wnaf(wnaft,&ext,4); | |||
ignore_result(precompute_fixed_base_wnaf(wnaft,&ext,4)); | |||
} | |||
when = now() - when; | |||
printf("wnaf4 pre: %5.1fµs\n", when * 1e6 / i); | |||
@@ -356,7 +372,7 @@ int main(int argc, char **argv) { | |||
when = now(); | |||
for (i=0; i<nbase/10; i++) { | |||
(void)precompute_fixed_base_wnaf(wnaft,&ext,5); | |||
ignore_result(precompute_fixed_base_wnaf(wnaft,&ext,5)); | |||
} | |||
when = now() - when; | |||
printf("wnaf5 pre: %5.1fµs\n", when * 1e6 / i); | |||
@@ -401,7 +417,7 @@ int main(int argc, char **argv) { | |||
when = now(); | |||
for (i=0; i<nbase/10; i++) { | |||
if (i) destroy_fixed_base(&t_5_5_18); | |||
(void)precompute_fixed_base(&t_5_5_18, &ext, 5, 5, 18, NULL); | |||
ignore_result(precompute_fixed_base(&t_5_5_18, &ext, 5, 5, 18, NULL)); | |||
} | |||
when = now() - when; | |||
printf("pre(5,5,18): %5.1fµs\n", when * 1e6 / i); | |||
@@ -409,7 +425,7 @@ int main(int argc, char **argv) { | |||
when = now(); | |||
for (i=0; i<nbase/10; i++) { | |||
if (i) destroy_fixed_base(&t_3_5_30); | |||
(void)precompute_fixed_base(&t_3_5_30, &ext, 3, 5, 30, NULL); | |||
ignore_result(precompute_fixed_base(&t_3_5_30, &ext, 3, 5, 30, NULL)); | |||
} | |||
when = now() - when; | |||
printf("pre(3,5,30): %5.1fµs\n", when * 1e6 / i); | |||
@@ -417,7 +433,7 @@ int main(int argc, char **argv) { | |||
when = now(); | |||
for (i=0; i<nbase/10; i++) { | |||
if (i) destroy_fixed_base(&t_5_3_30); | |||
(void)precompute_fixed_base(&t_5_3_30, &ext, 5, 3, 30, NULL); | |||
ignore_result(precompute_fixed_base(&t_5_3_30, &ext, 5, 3, 30, NULL)); | |||
} | |||
when = now() - when; | |||
printf("pre(5,3,30): %5.1fµs\n", when * 1e6 / i); | |||
@@ -425,15 +441,15 @@ int main(int argc, char **argv) { | |||
when = now(); | |||
for (i=0; i<nbase/10; i++) { | |||
if (i) destroy_fixed_base(&t_15_3_10); | |||
(void)precompute_fixed_base(&t_15_3_10, &ext, 15, 3, 10, NULL); | |||
ignore_result(precompute_fixed_base(&t_15_3_10, &ext, 15, 3, 10, NULL)); | |||
} | |||
when = now() - when; | |||
printf("pre(15,3,10): %5.1fµs\n", when * 1e6 / i); | |||
printf("pre(15,3,10):%5.1fµs\n", when * 1e6 / i); | |||
when = now(); | |||
for (i=0; i<nbase/10; i++) { | |||
if (i) destroy_fixed_base(&t_8_4_14); | |||
(void)precompute_fixed_base(&t_8_4_14, &ext, 8, 4, 14, NULL); | |||
ignore_result(precompute_fixed_base(&t_8_4_14, &ext, 8, 4, 14, NULL)); | |||
} | |||
when = now() - when; | |||
printf("pre(8,4,14): %5.1fµs\n", when * 1e6 / i); | |||
@@ -471,7 +487,7 @@ int main(int argc, char **argv) { | |||
scalarmul_fixed_base(&ext, sk, 448, &t_15_3_10); | |||
} | |||
when = now() - when; | |||
printf("com(15,3,10): %5.1fµs\n", when * 1e6 / i); | |||
printf("com(15,3,10):%5.1fµs\n", when * 1e6 / i); | |||
printf("\nGoldilocks:\n"); | |||
@@ -494,7 +510,7 @@ int main(int argc, char **argv) { | |||
printf("keygen: %5.1fµs\n", when * 1e6 / i); | |||
uint8_t ss1[64],ss2[64]; | |||
int gres1,gres2; | |||
int gres1=0,gres2=0; | |||
when = now(); | |||
for (i=0; i<nbase; i++) { | |||
if (i&1) { | |||
@@ -540,18 +556,43 @@ int main(int argc, char **argv) { | |||
when = now(); | |||
for (i=0; i<nbase; i++) { | |||
res = goldilocks_verify(sout,(const unsigned char *)message,message_len,&gpk); | |||
(void)res; | |||
int ver = goldilocks_verify(sout,(const unsigned char *)message,message_len,&gpk); | |||
assert(!ver); | |||
} | |||
when = now() - when; | |||
printf("verify: %5.1fµs\n", when * 1e6 / i); | |||
struct goldilocks_precomputed_public_key_t *pre = NULL; | |||
when = now(); | |||
for (i=0; i<nbase; i++) { | |||
goldilocks_destroy_precomputed_public_key(pre); | |||
pre = goldilocks_precompute_public_key(&gpk); | |||
} | |||
when = now() - when; | |||
printf("precompute: %5.1fµs\n", when * 1e6 / i); | |||
when = now(); | |||
for (i=0; i<nbase; i++) { | |||
int ver = goldilocks_verify_precomputed(sout,(const unsigned char *)message,message_len,pre); | |||
assert(!ver); | |||
} | |||
when = now() - when; | |||
printf("verify pre: %5.1fµs\n", when * 1e6 / i); | |||
when = now(); | |||
for (i=0; i<nbase; i++) { | |||
int ret = goldilocks_shared_secret_precomputed(ss1,&gsk,pre); | |||
assert(!ret); | |||
} | |||
when = now() - when; | |||
printf("ecdh pre: %5.1fµs\n", when * 1e6 / i); | |||
printf("\nTesting...\n"); | |||
int failures=0, successes = 0; | |||
for (i=0; i<nbase/10; i++) { | |||
(void)goldilocks_keygen(&gsk,&gpk); | |||
ignore_result(goldilocks_keygen(&gsk,&gpk)); | |||
goldilocks_sign(sout,(const unsigned char *)message,message_len,&gsk); | |||
res = goldilocks_verify(sout,(const unsigned char *)message,message_len,&gpk); | |||
if (res) failures++; | |||
@@ -574,9 +615,9 @@ int main(int argc, char **argv) { | |||
y = (hword_t)y; | |||
word_t z=x*y; | |||
(void)montgomery_ladder(&b,&a,&x,WORD_BITS,0); | |||
(void)montgomery_ladder(&c,&b,&y,WORD_BITS,0); | |||
(void)montgomery_ladder(&b,&a,&z,WORD_BITS,0); | |||
ignore_result(montgomery_ladder(&b,&a,&x,WORD_BITS,0)); | |||
ignore_result(montgomery_ladder(&c,&b,&y,WORD_BITS,0)); | |||
ignore_result(montgomery_ladder(&b,&a,&z,WORD_BITS,0)); | |||
p448_sub(&d,&b,&c); | |||
p448_bias(&d,2); | |||
@@ -655,7 +696,7 @@ int main(int argc, char **argv) { | |||
untwist_and_double(&exta,&exv); | |||
serialize_extensible(&b, &exta); | |||
(void)precompute_fixed_base_wnaf(wnaft,&exu,5); | |||
ignore_result(precompute_fixed_base_wnaf(wnaft,&exu,5)); | |||
linear_combo_var_fixed_vt(&ext,sk,448,tk,448,wnaft,5); | |||
untwist_and_double(&exta,&exv); | |||
serialize_extensible(&c, &exta); | |||
@@ -6,7 +6,7 @@ | |||
int failed_tests, n_tests, failed_this_test, running_a_test; | |||
void end_test() { | |||
static void end_test() { | |||
if (!failed_this_test) { | |||
printf("[PASS]\n"); | |||
} | |||
@@ -14,7 +14,7 @@ void end_test() { | |||
running_a_test = 0; | |||
} | |||
void begin_test(const char *name) { | |||
static void begin_test(const char *name) { | |||
if (running_a_test) end_test(); | |||
printf("%s...%*s",name,(int)(30-strlen(name)),""); | |||
fflush(stdout); | |||
@@ -110,8 +110,9 @@ int main(int argc, char **argv) { | |||
(void) argv; | |||
n_tests = running_a_test = failed_tests = 0; | |||
begin_test("SHA-512 NIST Monte Carlo"); | |||
test_sha512_monte_carlo(); | |||
begin_test("Arithmetic"); | |||
test_arithmetic(); | |||
begin_test("EC point operations"); | |||
test_pointops(); | |||
@@ -122,6 +123,15 @@ int main(int argc, char **argv) { | |||
begin_test("Scalarmul commutativity"); | |||
test_scalarmul_commutativity(); | |||
begin_test("Linear combo"); | |||
test_linear_combo(); | |||
begin_test("SHA-512 NIST Monte Carlo"); | |||
test_sha512_monte_carlo(); | |||
begin_test("Goldilocks complete system"); | |||
test_goldilocks(); | |||
if (running_a_test) end_test(); | |||
printf("\n"); | |||
if (failed_tests) { | |||
@@ -33,10 +33,16 @@ void youfail(); | |||
int test_sha512_monte_carlo(); | |||
int test_linear_combo (); | |||
int test_scalarmul_compatibility (); | |||
int test_scalarmul_commutativity (); | |||
int test_arithmetic (); | |||
int test_goldilocks (); | |||
int test_pointops (); | |||
#endif // __GOLDILOCKS_TEST_H__ |
@@ -0,0 +1,196 @@ | |||
#include "p448.h" | |||
#include "test.h" | |||
#include <gmp.h> | |||
#include <string.h> | |||
#include <stdio.h> | |||
mpz_t mp_p448; | |||
static mask_t mpz_to_p448 ( | |||
struct p448_t *out, | |||
const mpz_t in | |||
) { | |||
uint8_t ser[56]; | |||
mpz_t modded; | |||
memset(ser,0,sizeof(ser)); | |||
mpz_init(modded); | |||
mpz_mod(modded, in, mp_p448); | |||
mpz_export(ser, NULL, -1, 1, -1, 0, modded); | |||
mask_t succ = p448_deserialize(out, ser); | |||
return succ; | |||
} | |||
static mask_t p448_assert_eq_gmp( | |||
const char *descr, | |||
const struct p448_t *x, | |||
const mpz_t y, | |||
float lowBound, | |||
float highBound | |||
) { | |||
uint8_t xser[56], yser[56]; | |||
mpz_t modded; | |||
memset(yser,0,sizeof(yser)); | |||
p448_serialize(xser, x); | |||
mpz_init(modded); | |||
mpz_mod(modded, y, mp_p448); | |||
mpz_export(yser, NULL, -1, 1, -1, 0, modded); | |||
unsigned int i; | |||
for (i=0; i<sizeof(*x)/sizeof(x->limb[0]); i++) { | |||
int bits = sizeof(x->limb[0]) * 448 / sizeof(*x); | |||
word_t yardstick = (i==sizeof(*x)/sizeof(x->limb[0])/2) ? | |||
(1ull<<bits) - 2 : (1ull<<bits) - 1; | |||
if (x->limb[i] < yardstick * lowBound || x->limb[i] > yardstick * highBound) { | |||
youfail(); | |||
printf(" P448 limb %d -> " PRIxWORDfull " is out of bounds (%0.2f, %0.2f) for test %s (yardstick = " PRIxWORDfull ")\n", | |||
i, x->limb[i], lowBound, highBound, descr, yardstick); | |||
break; | |||
} | |||
} | |||
if (memcmp(xser,yser,56)) { | |||
youfail(); | |||
printf(" Failed arithmetic test %s\n", descr); | |||
p448_print(" p448", x); | |||
printf(" gmp = 0x"); | |||
int j; | |||
for (j=55; j>=0; j--) { | |||
printf("%02x", yser[j]); | |||
} | |||
printf("\n"); | |||
return MASK_FAILURE; | |||
} | |||
mpz_clear(modded); | |||
return MASK_SUCCESS; | |||
} | |||
static mask_t test_add_sub ( | |||
const mpz_t x, | |||
const mpz_t y, | |||
word_t word | |||
) { | |||
struct p448_t xx,yy,tt; | |||
mpz_t t; | |||
mask_t succ = MASK_SUCCESS; | |||
succ = mpz_to_p448(&xx,x); | |||
succ &= mpz_to_p448(&yy,y); | |||
mpz_init(t); | |||
p448_add(&tt,&xx,&yy); | |||
mpz_add(t,x,y); | |||
succ &= p448_assert_eq_gmp("add",&tt,t,0,2.1); | |||
p448_sub(&tt,&xx,&yy); | |||
p448_bias(&tt,2); | |||
mpz_sub(t,x,y); | |||
succ &= p448_assert_eq_gmp("sub",&tt,t,0,3.1); | |||
p448_copy(&tt,&xx); | |||
p448_addw(&tt,word); | |||
mpz_add_ui(t,x,word); | |||
succ &= p448_assert_eq_gmp("addw",&tt,t,0,2.1); | |||
p448_copy(&tt,&xx); | |||
p448_subw(&tt,word); | |||
p448_bias(&tt,1); | |||
mpz_sub_ui(t,x,word); | |||
succ &= p448_assert_eq_gmp("subw",&tt,t,0,2.1); | |||
if (!succ) { | |||
p448_print(" x", &xx); | |||
p448_print(" y", &yy); | |||
} | |||
mpz_clear(t); | |||
return succ; | |||
} | |||
static mask_t test_mul_sqr ( | |||
const mpz_t x, | |||
const mpz_t y, | |||
word_t word | |||
) { | |||
struct p448_t xx,yy,tt; | |||
mpz_t t; | |||
mask_t succ = MASK_SUCCESS; | |||
succ = mpz_to_p448(&xx,x); | |||
succ &= mpz_to_p448(&yy,y); | |||
mpz_init(t); | |||
p448_mul(&tt,&xx,&yy); | |||
mpz_mul(t,x,y); | |||
succ &= p448_assert_eq_gmp("mul",&tt,t,0,1.1); | |||
p448_mulw(&tt,&xx,word); | |||
mpz_mul_ui(t,x,word); | |||
succ &= p448_assert_eq_gmp("mulw",&tt,t,0,1.1); | |||
p448_sqr(&tt,&xx); | |||
mpz_mul(t,x,x); | |||
succ &= p448_assert_eq_gmp("sqrx",&tt,t,0,1.1); | |||
p448_sqr(&tt,&yy); | |||
mpz_mul(t,y,y); | |||
succ &= p448_assert_eq_gmp("sqy",&tt,t,0,1.1); | |||
if (!succ) { | |||
p448_print(" x", &xx); | |||
p448_print(" y", &yy); | |||
} | |||
mpz_clear(t); | |||
return succ; | |||
} | |||
int test_arithmetic () { | |||
int j, ntests = 100000; | |||
gmp_randstate_t state; | |||
gmp_randinit_mt(state); | |||
uint8_t pser[56]; | |||
for (j=0; j<56; j++) { | |||
pser[j] = (j==28) ? 0xFE : 0xFF; | |||
} | |||
mpz_init(mp_p448); | |||
mpz_import(mp_p448, 56, -1, 1, -1, 0, pser); | |||
mpz_t x,y; | |||
mpz_init(x); | |||
mpz_init(y); | |||
mask_t succ = MASK_SUCCESS; | |||
int bits = sizeof(word_t) * 448 / sizeof(p448_t); | |||
for (j=0; j<ntests; j++) { | |||
if (j&1) { | |||
mpz_rrandomb(x, state, 448); | |||
mpz_rrandomb(y, state, 448); | |||
} else { | |||
mpz_urandomb(x, state, 448); | |||
mpz_urandomb(y, state, 448); | |||
} | |||
word_t word = gmp_urandomm_ui (state, 1ull<<bits); | |||
succ &= test_add_sub(x,y,word); | |||
succ &= test_mul_sqr(x,y,word); | |||
// TODO: test neg, cond_neg, set_ui, wrd, srd, inv, ...? | |||
} | |||
mpz_clear(x); | |||
mpz_clear(y); | |||
mpz_clear(mp_p448); | |||
gmp_randclear(state); | |||
return succ ? 0 : 1; | |||
} | |||
@@ -0,0 +1,195 @@ | |||
#include "test.h" | |||
#include "goldilocks.h" | |||
#include <stdio.h> | |||
#include <stdlib.h> | |||
#include <string.h> | |||
int test_goldilocks () { | |||
const char *message1 = "hello world"; | |||
const char *message2 = "Jello world"; | |||
unsigned char signature[GOLDI_SIGNATURE_BYTES]; | |||
unsigned char | |||
ss12[GOLDI_SHARED_SECRET_BYTES], | |||
ss21[GOLDI_SHARED_SECRET_BYTES], | |||
ss21p[GOLDI_SHARED_SECRET_BYTES], | |||
proto[GOLDI_SYMKEY_BYTES]; | |||
struct goldilocks_public_key_t pub, pub2; | |||
struct goldilocks_private_key_t priv, priv2; | |||
struct goldilocks_precomputed_public_key_t *pre = NULL; | |||
int i, ret, good = 1; | |||
ret = goldilocks_init(); | |||
if (ret) { | |||
youfail(); | |||
printf(" Failed init.\n"); | |||
} | |||
for (i=0; i<1000 && good; i++) { | |||
ret = goldilocks_keygen(&priv, &pub); | |||
if (ret) { | |||
youfail(); | |||
printf(" Failed keygen trial %d.\n", i); | |||
good = 0; | |||
} | |||
goldilocks_destroy_precomputed_public_key( pre ); | |||
pre = goldilocks_precompute_public_key ( &pub ); | |||
if (!pre) { | |||
youfail(); | |||
printf(" Failed precomp-public trial %d.\n", i); | |||
return -1; | |||
} | |||
ret = goldilocks_sign( | |||
signature, | |||
(const unsigned char *)message1, | |||
strlen(message1), | |||
&priv | |||
); | |||
if (ret) { | |||
youfail(); | |||
printf(" Failed sign trial %d.\n", i); | |||
good = 0; | |||
} | |||
ret = goldilocks_verify( | |||
signature, | |||
(const unsigned char *)message1, | |||
strlen(message1), | |||
&pub | |||
); | |||
if (ret) { | |||
youfail(); | |||
printf(" Failed verify trial %d.\n", i); | |||
good = 0; | |||
} | |||
ret = goldilocks_verify_precomputed ( | |||
signature, | |||
(const unsigned char *)message1, | |||
strlen(message1), | |||
pre | |||
); | |||
if (ret) { | |||
youfail(); | |||
printf(" Failed verify-pre trial %d.\n", i); | |||
good = 0; | |||
} | |||
/* terrible negative test */ | |||
ret = goldilocks_verify( | |||
signature, | |||
(const unsigned char *)message2, | |||
strlen(message1), | |||
&pub | |||
); | |||
if (ret != GOLDI_EINVAL) { | |||
youfail(); | |||
printf(" Failed nega-verify trial %d.\n", i); | |||
good = 0; | |||
} | |||
ret = goldilocks_verify_precomputed( | |||
signature, | |||
(const unsigned char *)message2, | |||
strlen(message1), | |||
pre | |||
); | |||
if (ret != GOLDI_EINVAL) { | |||
youfail(); | |||
printf(" Failed nega-verify-pre trial %d.\n", i); | |||
good = 0; | |||
} | |||
/* honestly a slightly better negative test */ | |||
memset(signature,0,sizeof(signature)); | |||
ret = goldilocks_verify( | |||
signature, | |||
(const unsigned char *)message1, | |||
strlen(message1), | |||
&pub | |||
); | |||
if (ret != GOLDI_EINVAL) { | |||
youfail(); | |||
printf(" Failed nega-verify-0 trial %d.\n", i); | |||
good = 0; | |||
} | |||
ret = goldilocks_verify_precomputed( | |||
signature, | |||
(const unsigned char *)message1, | |||
strlen(message1), | |||
pre | |||
); | |||
if (ret != GOLDI_EINVAL) { | |||
youfail(); | |||
printf(" Failed nega-verify-pre-0 trial %d.\n", i); | |||
good = 0; | |||
} | |||
/* ecdh */ | |||
ret = goldilocks_keygen(&priv2, &pub2); | |||
if (ret) { | |||
youfail(); | |||
printf(" Failed keygen2 trial %d.\n", i); | |||
good = 0; | |||
} | |||
ret = goldilocks_shared_secret ( ss12, &priv, &pub2 ); | |||
if (ret) { | |||
youfail(); | |||
printf(" Failed ss12 trial %d.\n", i); | |||
good = 0; | |||
} | |||
ret = goldilocks_shared_secret ( ss21, &priv2, &pub ); | |||
if (ret) { | |||
youfail(); | |||
printf(" Failed ss21 trial %d.\n", i); | |||
good = 0; | |||
} | |||
ret = goldilocks_shared_secret_precomputed ( ss21p, &priv2, pre ); | |||
if (ret) { | |||
youfail(); | |||
printf(" Failed ss21p trial %d.\n", i); | |||
good = 0; | |||
} | |||
if (memcmp(ss12,ss21,sizeof(ss12))) { | |||
youfail(); | |||
printf(" Failed shared-secret trial %d.\n", i); | |||
good = 0; | |||
} | |||
if (memcmp(ss21,ss21p,sizeof(ss21))) { | |||
youfail(); | |||
printf(" Failed shared-secret precomp trial %d.\n", i); | |||
good = 0; | |||
} | |||
/* test derive / underive / priv to pub */ | |||
goldilocks_underive_private_key ( proto, &priv ); | |||
ret = goldilocks_derive_private_key ( &priv2, proto ); | |||
if (ret || memcmp(&priv,&priv2,sizeof(priv))) { | |||
youfail(); | |||
printf(" Failed derive round-trip trial %d.\n", i); | |||
good = 0; | |||
} | |||
ret = goldilocks_private_to_public ( &pub2, &priv ); | |||
if (ret || memcmp(&pub,&pub2,sizeof(pub))) { | |||
youfail(); | |||
printf(" Failed private-to-public trial %d.\n", i); | |||
good = 0; | |||
} | |||
} | |||
goldilocks_destroy_precomputed_public_key( pre ); | |||
return good ? 0 : -1; | |||
} |
@@ -159,6 +159,92 @@ single_scalarmul_compatibility_test ( | |||
return ret; | |||
} | |||
static int | |||
single_linear_combo_test ( | |||
const struct p448_t *base1, | |||
const word_t *scalar1, | |||
int nbits1, | |||
const struct p448_t *base2, | |||
const word_t *scalar2, | |||
int nbits2 | |||
) { | |||
/* MAGIC */ | |||
const struct p448_t | |||
sqrt_d_minus_1 = {{ | |||
U58LE(0xd2e21836749f46), | |||
U58LE(0x888db42b4f0179), | |||
U58LE(0x5a189aabdeea38), | |||
U58LE(0x51e65ca6f14c06), | |||
U58LE(0xa49f7b424d9770), | |||
U58LE(0xdcac4628c5f656), | |||
U58LE(0x49443b8748734a), | |||
U58LE(0x12fec0c0b25b7a) | |||
}}; | |||
struct tw_extensible_t text1, text2, working; | |||
struct tw_pniels_t pn; | |||
struct p448_t result_comb, result_combo, result_wnaf; | |||
mask_t succ = | |||
deserialize_and_twist_approx(&text1, &sqrt_d_minus_1, base1) | |||
& deserialize_and_twist_approx(&text2, &sqrt_d_minus_1, base2); | |||
if (!succ) return 1; | |||
struct fixed_base_table_t t1, t2; | |||
struct tw_niels_t wnaf[32]; | |||
memset(&t1,0,sizeof(t1)); | |||
memset(&t2,0,sizeof(t2)); | |||
succ = precompute_fixed_base(&t1, &text1, 5, 5, 18, NULL); | |||
succ &= precompute_fixed_base(&t2, &text2, 6, 3, 25, NULL); | |||
succ &= precompute_fixed_base_wnaf(wnaf, &text2, 5); | |||
if (!succ) { | |||
destroy_fixed_base(&t1); | |||
destroy_fixed_base(&t2); | |||
return -1; | |||
} | |||
/* use the dedicated wNAF linear combo algorithm */ | |||
copy_tw_extensible(&working, &text1); | |||
linear_combo_var_fixed_vt(&working, scalar1, nbits1, scalar2, nbits2, wnaf, 5); | |||
untwist_and_double_and_serialize(&result_wnaf, &working); | |||
/* use the dedicated combs algorithm */ | |||
succ &= linear_combo_combs_vt(&working, scalar1, nbits1, &t1, scalar2, nbits2, &t2); | |||
untwist_and_double_and_serialize(&result_combo, &working); | |||
/* use two combs */ | |||
succ &= scalarmul_fixed_base(&working, scalar1, nbits1, &t1); | |||
convert_tw_extensible_to_tw_pniels(&pn, &working); | |||
succ &= scalarmul_fixed_base(&working, scalar2, nbits2, &t2); | |||
add_tw_pniels_to_tw_extensible(&working, &pn); | |||
untwist_and_double_and_serialize(&result_comb, &working); | |||
mask_t consistent = MASK_SUCCESS; | |||
consistent &= p448_eq(&result_combo, &result_wnaf); | |||
consistent &= p448_eq(&result_comb, &result_wnaf); | |||
if (!succ || !consistent) { | |||
youfail(); | |||
printf(" Failed linear combo consistency test with nbits=%d,%d.\n",nbits1,nbits2); | |||
p448_print(" base1", base1); | |||
scalar_print(" scal1", scalar1, (nbits1+WORD_BITS-1)/WORD_BITS); | |||
p448_print(" base2", base2); | |||
scalar_print(" scal2", scalar2, (nbits1+WORD_BITS-1)/WORD_BITS); | |||
p448_print(" combs", &result_comb); | |||
p448_print(" combo", &result_combo); | |||
p448_print(" wNAFs", &result_wnaf); | |||
return -1; | |||
} | |||
destroy_fixed_base(&t1); | |||
destroy_fixed_base(&t2); | |||
return 0; | |||
} | |||
/* 0 = succeed, 1 = inval, -1 = fail */ | |||
static int | |||
single_scalarmul_commutativity_test ( | |||
@@ -251,6 +337,49 @@ int test_scalarmul_commutativity () { | |||
return 0; | |||
} | |||
int test_linear_combo () { | |||
int i,j,k,got; | |||
struct crandom_state_t crand; | |||
crandom_init_from_buffer(&crand, "scalarmul_linear_combos_test RNG"); | |||
for (i=0; i<=448; i+=7) { | |||
for (j=0; j<=448; j+=7) { | |||
got = 0; | |||
for (k=0; k<128 && !got; k++) { | |||
uint8_t ser[56]; | |||
word_t scalar1[7], scalar2[7]; | |||
crandom_generate(&crand, (uint8_t *)scalar1, sizeof(scalar1)); | |||
crandom_generate(&crand, (uint8_t *)scalar2, sizeof(scalar2)); | |||
p448_t base1; | |||
crandom_generate(&crand, ser, sizeof(ser)); | |||
mask_t succ = p448_deserialize(&base1, ser); | |||
if (!succ) continue; | |||
p448_t base2; | |||
crandom_generate(&crand, ser, sizeof(ser)); | |||
succ = p448_deserialize(&base2, ser); | |||
if (!succ) continue; | |||
int ret = single_linear_combo_test (&base1, scalar1, i, &base2, scalar2, j); | |||
got = !ret; | |||
if (ret == -1) return -1; | |||
} | |||
if (!got) { | |||
youfail(); | |||
printf(" Unlikely: rejected 128 scalars in a row.\n"); | |||
return -1; | |||
} | |||
} | |||
} | |||
return 0; | |||
} | |||
int test_scalarmul_compatibility () { | |||
int i,j,k,got; | |||