Browse Source

Big changes for curve flexibility. For details see HISTORY.txt.

Very experimental Ed480-Ridinghood support is now in.  It's not fully optimized,
but in general the current build is 8-15% slower than Goldilocks.  It only works on
arch_x86_64, though arch_ref64 support ought to be easy.  Support on other arches
will be trickier, which is of course why I chose Goldilocks over Ridinghood in the
first place.

Next up, E-521.  Hopefully.

The code is starting to get spread out over a lot of files.  Some are per field*arch,
some per field, some per curve, some global.  It's hard to do much about this, though,
with a rather ugly .c.inc system.

There's currently no way to make a Ridinghood eBAT.  In fact, I haven't tested eBAT
support in this commit.  I also haven't tested NEON, but at least ARCH_32 works on
Intel.
master
Mike Hamburg 10 years ago
parent
commit
1f480b0f95
49 changed files with 2082 additions and 185 deletions
  1. +41
    -0
      HISTORY.txt
  2. +9
    -4
      Makefile
  3. +7
    -3
      include/goldilocks.h
  4. +376
    -0
      include/ridinghood.h
  5. +12
    -75
      src/ec_point.c
  6. +0
    -2
      src/include/ec_point.h
  7. +46
    -28
      src/include/field.h
  8. +14
    -33
      src/include/magic.h
  9. +8
    -3
      src/include/word.h
  10. +0
    -0
      src/p448/arch_32/arch_config.h
  11. +0
    -0
      src/p448/arch_32/p448.c
  12. +0
    -0
      src/p448/arch_32/p448.h
  13. +0
    -0
      src/p448/arch_arm_32/arch_config.h
  14. +0
    -0
      src/p448/arch_arm_32/p448.c
  15. +0
    -0
      src/p448/arch_arm_32/p448.h
  16. +0
    -0
      src/p448/arch_neon/arch_config.h
  17. +0
    -0
      src/p448/arch_neon/neon_emulation.h
  18. +0
    -0
      src/p448/arch_neon/p448.c
  19. +0
    -0
      src/p448/arch_neon/p448.h
  20. +0
    -0
      src/p448/arch_neon_experimental/arch_config.h
  21. +0
    -0
      src/p448/arch_neon_experimental/p448.c
  22. +0
    -0
      src/p448/arch_neon_experimental/p448.h
  23. +0
    -0
      src/p448/arch_ref64/arch_config.h
  24. +0
    -0
      src/p448/arch_ref64/p448.c
  25. +0
    -0
      src/p448/arch_ref64/p448.h
  26. +0
    -0
      src/p448/arch_x86_64/arch_config.h
  27. +0
    -0
      src/p448/arch_x86_64/p448.c
  28. +0
    -0
      src/p448/arch_x86_64/p448.h
  29. +0
    -0
      src/p448/arch_x86_64/x86-64-arith.h
  30. +43
    -0
      src/p448/f_arithmetic.c
  31. +39
    -0
      src/p448/f_field.h
  32. +35
    -0
      src/p448/f_magic.h
  33. +123
    -0
      src/p448/field.h
  34. +12
    -12
      src/p448/magic.c
  35. +1
    -0
      src/p480/arch_x86_64/arch_config.h
  36. +435
    -0
      src/p480/arch_x86_64/p480.c
  37. +257
    -0
      src/p480/arch_x86_64/p480.h
  38. +279
    -0
      src/p480/arch_x86_64/x86-64-arith.h
  39. +43
    -0
      src/p480/f_arithmetic.c
  40. +39
    -0
      src/p480/f_field.h
  41. +35
    -0
      src/p480/f_magic.h
  42. +68
    -0
      src/p480/magic.c
  43. +43
    -0
      src/p521/f_arithmetic.c
  44. +39
    -0
      src/p521/f_field.h
  45. +5
    -6
      test/bench.c
  46. +4
    -5
      test/test.c
  47. +51
    -12
      test/test_arithmetic.c
  48. +10
    -0
      test/test_pointops.c
  49. +8
    -2
      test/test_scalarmul.c

+ 41
- 0
HISTORY.txt View File

@@ -1,3 +1,44 @@
October 23, 2014:
Pushing through changes for curve flexibility. First up is
Ed480-Ridinghood, because it has the same number of words. Next
is E-521.
Experimental support for Ed480-Ridinghood. To use, compile with
make ... FIELD=p480 -XCFLAGS=-DGOLDI_FIELD_BITS=480
I still need to figure out what to do about the fact that the library
is called "goldilocks", but it will soon support curves that are not
ed448-goldilocks, at least experimentally.
Currently the whole system's header "goldilocks.h" doesn't have
a simpler way to override field size, but it does work (as a hack)
with -DGOLDI_FIELD_BITS=...
There is no support yet for coexistence of multiple fields in one
library. The field routines will have unique names, but scalarmul*
won't, and the top-level goldilocks routines have fixed names.
Current timings on Haswell:
Goldilocks: 178kcy keygen, 536kcy ecdh
Ridinghood: 193kcy keygen, 617kcy ecdh
Note that Ridinghood ECDH is slower than the 480/448 size ratio alone would predict. This is at least
in part because I haven't calculated the overflow handling limits yet
in ec_point.h (this is a disadvantage of dropping the automated
tool for generating that file). So I'm reducing much more often
than I need to. (There's a really loud TODO in ec_point.h for that.)
Also, I haven't tested the limits on these reductions in a while, so
it could be that there are actual (security-critical) bugs in this
area, at least for p448. Now that there's field flexibility, it's
probably a good idea to make a field impl with extra words to check
this.
Furthermore, field_mulw_scc will perform differently on these two
curves based on whether the curve constant is positive or negative.
I should probably go optimize the "hot" routines like montgomery_step
to have separate cases for positive and negative.

September 29, 2014:
Yesterday I put in some more architecture detection, but it should
really be based on the arch directory, because what's in there really


+ 9
- 4
Makefile View File

@@ -20,12 +20,13 @@ else
ARCH ?= arch_arm_32
endif

FIELD ?= p448

WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
-Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)
INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH)
INCFLAGS = -Isrc/include -Iinclude -Isrc/$(FIELD) -Isrc/$(FIELD)/$(ARCH)
LANGFLAGS = -std=c99 -fno-strict-aliasing
GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC
OFLAGS = -O3
@@ -63,7 +64,8 @@ ASFLAGS = $(ARCHFLAGS)
HEADERS= Makefile $(shell find . -name "*.h") build/timestamp

LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \
build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o build/magic.o build/arithmetic.o
build/$(FIELD).o build/ec_point.o build/scalarmul.o build/sha512.o build/magic.o \
build/f_arithmetic.o build/arithmetic.o

TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \
build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o build/magic.o
@@ -113,7 +115,10 @@ build/%.s: src/%.c $(HEADERS)
build/%.s: test/%.c $(HEADERS)
$(CC) $(CFLAGS) -S -c -o $@ $<

build/%.s: src/$(ARCH)/%.c $(HEADERS)
build/%.s: src/$(FIELD)/$(ARCH)/%.c $(HEADERS)
$(CC) $(CFLAGS) -S -c -o $@ $<

build/%.s: src/$(FIELD)/%.c $(HEADERS)
$(CC) $(CFLAGS) -S -c -o $@ $<

doc/timestamp:
@@ -131,7 +136,7 @@ $(BATNAME): include/* src/* src/*/* test/batarch.map
targ="$@/crypto_$$prim/ed448goldilocks"; \
(while read arch where; do \
mkdir -p $$targ/`basename $$arch`; \
cp include/*.h src/*.c src/include/*.h src/bat/$$prim.c src/$$where/*.c src/$$where/*.h $$targ/`basename $$arch`; \
cp include/*.h src/*.c src/include/*.h src/bat/$$prim.c src/p448/$$where/*.c src/p448/$$where/*.h src/p448/*.c src/p448/*.h $$targ/`basename $$arch`; \
cp src/bat/api_$$prim.h $$targ/`basename $$arch`/api.h; \
perl -p -i -e 's/.*endif.*GOLDILOCKS_CONFIG_H/#define SUPERCOP_WONT_LET_ME_OPEN_FILES 1\n\n$$&/' $$targ/`basename $$arch`/config.h; \
perl -p -i -e 's/SYSNAME/'`basename $(BATNAME)`_`basename $$arch`'/g' $$targ/`basename $$arch`/api.h; \


+ 7
- 3
include/goldilocks.h View File

@@ -22,14 +22,18 @@
#define GOLDI_IMPLEMENT_SIGNATURES 1
#endif

/** The size of the Goldilocks field, in bits. */
/** The size of the Goldilocks field, in bits.
* Ifdef'd so you can override when testing experimental Ed480-Ridinghood or E-521.
*/
#ifndef GOLDI_FIELD_BITS
#define GOLDI_FIELD_BITS 448
#endif

/** The size of the Goldilocks scalars, in bits. */
#define GOLDI_SCALAR_BITS 446
#define GOLDI_SCALAR_BITS (GOLDI_FIELD_BITS-2)

/** The same size, in bytes. */
#define GOLDI_FIELD_BYTES (GOLDI_FIELD_BITS/8)
#define GOLDI_FIELD_BYTES ((GOLDI_FIELD_BITS+7)/8)

/** The size of a Goldilocks public key, in bytes. */
#define GOLDI_PUBLIC_KEY_BYTES GOLDI_FIELD_BYTES


+ 376
- 0
include/ridinghood.h View File

@@ -0,0 +1,376 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

/**
* @file ridinghood.h
* @author Mike Hamburg
* @brief Goldilocks high-level functions.
*/
#ifndef __GOLDILOCKS_H__
#define __GOLDILOCKS_H__ 1

#include <stdint.h>

#ifndef GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
/** If nonzero, implement precomputation for verify and ECDH. */
#define GOLDI_IMPLEMENT_PRECOMPUTED_KEYS 1
#endif

#ifndef GOLDI_IMPLEMENT_SIGNATURES
/** If nonzero, implement signatures. */
#define GOLDI_IMPLEMENT_SIGNATURES 1
#endif

/** The size of the Goldilocks field, in bits. */
#define GOLDI_FIELD_BITS 448

/** The size of the Goldilocks scalars, in bits. */
#define GOLDI_SCALAR_BITS 446

/** The same size, in bytes. */
#define GOLDI_FIELD_BYTES (GOLDI_FIELD_BITS/8)

/** The size of a Goldilocks public key, in bytes. */
#define GOLDI_PUBLIC_KEY_BYTES GOLDI_FIELD_BYTES

/** The extra bytes in a Goldilocks private key for the symmetric key. */
#define GOLDI_SYMKEY_BYTES 32

/** The size of a shared secret. */
#define GOLDI_SHARED_SECRET_BYTES 64

/** The size of a Goldilocks private key, in bytes. */
#define GOLDI_PRIVATE_KEY_BYTES (2*GOLDI_FIELD_BYTES + GOLDI_SYMKEY_BYTES)

/** The size of a Goldilocks signature, in bytes. */
#define GOLDI_SIGNATURE_BYTES (2*GOLDI_FIELD_BYTES)

/**
* @brief Serialized form of a Goldilocks public key.
*
* @warning This isn't even my final form!
*/
struct goldilocks_public_key_t {
uint8_t opaque[GOLDI_PUBLIC_KEY_BYTES]; /**< Serialized data. */
};

/**
* @brief Serialized form of a Goldilocks private key.
*
* Contains 56 bytes of actual private key, 56 bytes of
* public key, and 32 bytes of symmetric key for randomization.
*
* @warning This isn't even my final form!
*/
struct goldilocks_private_key_t {
uint8_t opaque[GOLDI_PRIVATE_KEY_BYTES]; /**< Serialized data. */
};

#ifdef __cplusplus
extern "C" {
#endif

/** @brief No error. */
static const int GOLDI_EOK = 0;

/** @brief Error: your key or other state is corrupt. */
static const int GOLDI_ECORRUPT = 44801;

/** @brief Error: other party's key is corrupt. */
static const int GOLDI_EINVAL = 44802;

/** @brief Error: not enough entropy. */
static const int GOLDI_ENODICE = 44804;

/**
 * @brief Error: you need to initialize the library first.
 *
 * NOTE(review): this has the same numeric value as GOLDI_EALREADYINIT below,
 * so a caller comparing a return code against either constant cannot tell
 * the two conditions apart.
 */
static const int GOLDI_EUNINIT = 44805;

/**
 * @brief Error: called init() but we are already initialized.
 *
 * NOTE(review): numerically identical to GOLDI_EUNINIT (both 44805). This
 * looks unintentional; giving one of them a distinct value would need the
 * same change wherever the library itself defines these codes -- confirm
 * before renumbering.
 */
static const int GOLDI_EALREADYINIT = 44805;

/**
* @brief Initialize Goldilocks' precomputed tables and
* random number generator. This function must be called before
* any of the other Goldilocks routines (except
* goldilocks_shared_secret in the current version) and should be
* called only once per process.
*
* There is currently no way to tear down this state. It is possible
* that a future version of this library will not require this function.
*
* @retval GOLDI_EOK Success.
* @retval GOLDI_EALREADYINIT Already initialized.
* @retval GOLDI_ECORRUPT Memory is corrupted, or another thread is already init'ing.
* @retval Nonzero An error occurred.
*/
int
goldilocks_init (void)
__attribute__((warn_unused_result,visibility ("default")));


/**
* @brief Generate a new random keypair.
* @param [out] privkey The generated private key.
* @param [out] pubkey The generated public key.
*
* @warning This isn't even my final form!
*
* @retval GOLDI_EOK Success.
* @retval GOLDI_ENODICE Insufficient entropy.
* @retval GOLDI_EUNINIT You must call goldilocks_init() first.
*/
int
goldilocks_keygen (
struct goldilocks_private_key_t *privkey,
struct goldilocks_public_key_t *pubkey
) __attribute__((warn_unused_result,nonnull(1,2),visibility ("default")));

/**
* @brief Derive a key from its compressed form.
* @param [out] privkey The derived private key.
* @param [in] proto The compressed or proto-key, which must be 32 random bytes.
*
* @warning This isn't even my final form!
*
* @retval GOLDI_EOK Success.
* @retval GOLDI_EUNINIT You must call goldilocks_init() first.
*/
int
goldilocks_derive_private_key (
struct goldilocks_private_key_t *privkey,
const unsigned char proto[GOLDI_SYMKEY_BYTES]
) __attribute__((nonnull(1,2),visibility ("default")));

/**
* @brief Compress a private key (by copying out the proto-key)
* @param [out] proto The proto-key.
* @param [in] privkey The private key.
*
* @warning This isn't even my final form!
* @todo test.
*
* @note This function is declared void; no status code (such as GOLDI_EOK or
* GOLDI_EUNINIT) is reported to the caller.
*/
void
goldilocks_underive_private_key (
unsigned char proto[GOLDI_SYMKEY_BYTES],
const struct goldilocks_private_key_t *privkey
) __attribute__((nonnull(1,2),visibility ("default")));

/**
* @brief Extract the public key from a private key.
*
* This is essentially a memcpy from the public part of the privkey.
*
* @param [out] pubkey The extracted public key.
* @param [in] privkey The private key.
*
* @retval GOLDI_EOK Success.
* @retval GOLDI_ECORRUPT The private key is corrupt.
*/
int
goldilocks_private_to_public (
struct goldilocks_public_key_t *pubkey,
const struct goldilocks_private_key_t *privkey
) __attribute__((nonnull(1,2),visibility ("default")));

/**
* @brief Generate a Diffie-Hellman shared secret in constant time.
*
* This function uses some compile-time flags whose merit remains to
* be decided.
*
* If the flag EXPERIMENT_ECDH_OBLITERATE_CT is set, prepend 40 bytes
* of zeros to the secret before hashing. In the case that the other
* party's key is detectably corrupt, instead the symmetric part
* of the secret key is used to produce a pseudorandom value.
*
* If EXPERIMENT_ECDH_STIR_IN_PUBKEYS is set, the sum and product of
* the two parties' public keys is prepended to the hash.
*
* In the current version, this function can safely be run even without
* goldilocks_init(). But this property is not guaranteed for future
* versions, so call it anyway.
*
* @warning This isn't even my final form!
*
* @param [out] shared The shared secret established with the other party.
* @param [in] my_privkey My private key.
* @param [in] your_pubkey The other party's public key.
*
* @retval GOLDI_EOK Success.
* @retval GOLDI_ECORRUPT My key is corrupt.
* @retval GOLDI_EINVAL The other party's key is corrupt.
* @retval GOLDI_EUNINIT You must call goldilocks_init() first.
*/
int
goldilocks_shared_secret (
uint8_t shared[GOLDI_SHARED_SECRET_BYTES],
const struct goldilocks_private_key_t *my_privkey,
const struct goldilocks_public_key_t *your_pubkey
) __attribute__((warn_unused_result,nonnull(1,2,3),visibility ("default")));

#if GOLDI_IMPLEMENT_SIGNATURES
/**
* @brief Sign a message.
*
* The signature is deterministic, using the symmetric secret found in the
* secret key to form a nonce.
*
* The technique used in signing is a modified Schnorr system, like EdDSA.
*
* @warning This isn't even my final form!
*
* @param [out] signature_out Space for the output signature.
* @param [in] message The message to be signed.
* @param [in] message_len The length of the message to be signed.
* @param [in] privkey My private key.
*
* @retval GOLDI_EOK Success.
* @retval GOLDI_ECORRUPT My key is corrupt.
* @retval GOLDI_EUNINIT You must call goldilocks_init() first.
*/
int
goldilocks_sign (
uint8_t signature_out[GOLDI_SIGNATURE_BYTES],
const uint8_t *message,
uint64_t message_len,
const struct goldilocks_private_key_t *privkey
) __attribute__((nonnull(1,2,4),visibility ("default")));

/**
* @brief Verify a signature.
*
* This function is fairly strict. It will correctly detect when
* the signature has the wrong cofactor component, or when the sig
* values aren't less than p or q.
*
* Currently this function does not detect when the public key is weird,
* eg 0, has cofactor, etc. As a result, a party with a bogus public
* key could create signatures that succeed on some systems and fail on
* others.
*
* @warning This isn't even my final form!
*
* @param [in] signature The signature.
* @param [in] message The message to be verified.
* @param [in] message_len The length of the message to be verified.
* @param [in] pubkey The signer's public key.
*
* @retval GOLDI_EOK Success.
* @retval GOLDI_EINVAL The public key or signature is corrupt.
* @retval GOLDI_EUNINIT You must call goldilocks_init() first.
*/
int
goldilocks_verify (
const uint8_t signature[GOLDI_SIGNATURE_BYTES],
const uint8_t *message,
uint64_t message_len,
const struct goldilocks_public_key_t *pubkey
) __attribute__((warn_unused_result,nonnull(1,2,4),visibility ("default")));
#endif

#if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS

/** A public key which has been expanded by precomputation for higher speed. */
struct goldilocks_precomputed_public_key_t;

/**
* @brief Expand a public key by precomputation.
*
* @todo Give actual error returns, instead of ambiguous NULL.
*
* @warning This isn't even my final form!
*
* @param [in] pub The public key.
* @retval NULL We ran out of memory, or the precomputation failed for another reason; NULL does not say which (see the todo note above).
*/
struct goldilocks_precomputed_public_key_t *
goldilocks_precompute_public_key (
const struct goldilocks_public_key_t *pub
) __attribute__((warn_unused_result,nonnull(1),visibility ("default")));

/**
* @brief Overwrite an expanded public key with zeros, then destroy it.
*
* If the input is NULL, this function does nothing.
*
* @param [in] precom The public key.
*/
void
goldilocks_destroy_precomputed_public_key (
struct goldilocks_precomputed_public_key_t *precom
) __attribute__((visibility ("default")));

/**
* @brief Verify a signature.
*
* This function is fairly strict. It will correctly detect when
* the signature has the wrong cofactor component, or when the sig
* values aren't less than p or q.
*
* @warning This isn't even my final form!
*
* @param [in] signature The signature.
* @param [in] message The message to be verified.
* @param [in] message_len The length of the message to be verified.
* @param [in] pubkey The signer's public key, expanded by precomputation.
*
* @retval GOLDI_EOK Success.
* @retval GOLDI_EINVAL The public key or signature is corrupt.
* @retval GOLDI_EUNINIT You must call goldilocks_init() first.
*/
int
goldilocks_verify_precomputed (
const uint8_t signature[GOLDI_SIGNATURE_BYTES],
const uint8_t *message,
uint64_t message_len,
const struct goldilocks_precomputed_public_key_t *pubkey
) __attribute__((warn_unused_result,nonnull(1,2,4),visibility ("default")));
/**
* @brief Generate a Diffie-Hellman shared secret in constant time.
* Uses a precomputation on the other party's public key for efficiency.
*
* This function uses some compile-time flags whose merit remains to
* be decided.
*
* If the flag EXPERIMENT_ECDH_OBLITERATE_CT is set, prepend 40 bytes
* of zeros to the secret before hashing. In the case that the other
* party's key is detectably corrupt, instead the symmetric part
* of the secret key is used to produce a pseudorandom value.
*
* If EXPERIMENT_ECDH_STIR_IN_PUBKEYS is set, the sum and product of
* the two parties' public keys is prepended to the hash.
*
* In the current version, this function can safely be run even without
* goldilocks_init(). But this property is not guaranteed for future
* versions, so call it anyway.
*
* @warning This isn't even my final form!
*
* @param [out] shared The shared secret established with the other party.
* @param [in] my_privkey My private key.
* @param [in] your_pubkey The other party's precomputed public key.
*
* @retval GOLDI_EOK Success.
* @retval GOLDI_ECORRUPT My key is corrupt.
* @retval GOLDI_EINVAL The other party's key is corrupt.
* @retval GOLDI_EUNINIT You must call goldilocks_init() first.
*/
int
goldilocks_shared_secret_precomputed (
uint8_t shared[GOLDI_SHARED_SECRET_BYTES],
const struct goldilocks_private_key_t *my_privkey,
const struct goldilocks_precomputed_public_key_t *your_pubkey
) __attribute__((warn_unused_result,nonnull(1,2,3),visibility ("default")));

#endif /* GOLDI_IMPLEMENT_PRECOMPUTED_KEYS */

#ifdef __cplusplus
}; /* extern "C" */
#endif

#endif /* __GOLDILOCKS_H__ */

+ 12
- 75
src/ec_point.c View File

@@ -12,7 +12,8 @@
#include "ec_point.h"
#include "magic.h"

#define is32 (GOLDI_BITS == 32)
#define is32 (GOLDI_BITS == 32 || FIELD_BITS == 480)
/* TODO XXX PERF FIXME: better detection of overflow conditions */

/* I wanted to just use if (is32)
* But clang's -Wunreachable-code flags it.
@@ -52,60 +53,6 @@ field_mulw_scc_wr (
field_weak_reduce(out);
}

static __inline__ void
field_sqrn (
field_t *__restrict__ y,
const field_t *x,
int n
) {
field_t tmp;
assert(n>0);
if (n&1) {
field_sqr(y,x);
n--;
} else {
field_sqr(&tmp,x);
field_sqr(y,&tmp);
n-=2;
}
for (; n; n-=2) {
field_sqr(&tmp,y);
field_sqr(y,&tmp);
}
}

void
field_isr ( /* TODO: MAGIC */
struct field_t* a,
const struct field_t* x
) {
struct field_t L0, L1, L2;
field_sqr ( &L1, x );
field_mul ( &L2, x, &L1 );
field_sqr ( &L1, &L2 );
field_mul ( &L2, x, &L1 );
field_sqrn ( &L1, &L2, 3 );
field_mul ( &L0, &L2, &L1 );
field_sqrn ( &L1, &L0, 3 );
field_mul ( &L0, &L2, &L1 );
field_sqrn ( &L2, &L0, 9 );
field_mul ( &L1, &L0, &L2 );
field_sqr ( &L0, &L1 );
field_mul ( &L2, x, &L0 );
field_sqrn ( &L0, &L2, 18 );
field_mul ( &L2, &L1, &L0 );
field_sqrn ( &L0, &L2, 37 );
field_mul ( &L1, &L2, &L0 );
field_sqrn ( &L0, &L1, 37 );
field_mul ( &L1, &L2, &L0 );
field_sqrn ( &L0, &L1, 111 );
field_mul ( &L2, &L1, &L0 );
field_sqr ( &L0, &L2 );
field_mul ( &L1, x, &L0 );
field_sqrn ( &L0, &L1, 223 );
field_mul ( a, &L2, &L0 );
}

void
add_tw_niels_to_tw_extensible (
struct tw_extensible_t* d,
@@ -396,7 +343,7 @@ montgomery_step (
field_sqr ( &a->za, &a->zd );
field_sqr ( &a->xd, &L0 );
field_sqr ( &L0, &L1 );
field_mulw ( &a->zd, &a->xd, 1-EDWARDS_D );
field_mulw_scc ( &a->zd, &a->xd, 1-EDWARDS_D ); /* FIXME PERF MULW */
field_sub ( &L1, &a->xd, &L0 );
field_bias ( &L1, 2 );
IF32( field_weak_reduce( &L1 ) );
@@ -444,11 +391,9 @@ serialize_montgomery (
field_mul ( &L3, &L1, &L2 );
field_copy ( &L2, &a->z0 );
field_addw ( &L2, 1 );
field_sqr ( &L1, &L2 );
field_mulw ( &L2, &L1, 1-EDWARDS_D );
field_neg ( &L1, &L2 );
field_sqr ( &L0, &L2 );
field_mulw_scc_wr ( &L1, &L0, EDWARDS_D-1 );
field_add ( &L2, &a->z0, &a->z0 );
field_bias ( &L2, 1 );
field_add ( &L0, &L2, &L2 );
field_add ( &L2, &L0, &L1 );
IF32( field_weak_reduce( &L2 ) );
@@ -512,13 +457,9 @@ untwist_and_double_and_serialize (
IF32( field_weak_reduce( b ) );
field_sqr ( &L2, &a->z );
field_sqr ( &L1, &L2 );
field_add ( &L2, b, b );
field_mulw ( b, &L2, 1-EDWARDS_D );
field_neg ( &L2, b );
field_bias ( &L2, 2 );
field_mulw ( &L0, &L2, 1-EDWARDS_D );
field_neg ( b, &L0 );
field_bias ( b, 2 );
field_add ( b, b, b );
field_mulw_scc ( &L2, b, EDWARDS_D-1 );
field_mulw_scc ( b, &L2, EDWARDS_D-1 );
field_mul ( &L0, &L2, &L1 );
field_mul ( &L2, b, &L0 );
field_isr ( &L0, &L2 );
@@ -654,10 +595,8 @@ deserialize_affine (
field_copy ( &L3, &L1 );
field_addw ( &L3, 1 );
field_sqr ( &L2, &L3 );
field_mulw ( &L3, &L2, 1-EDWARDS_D );
field_neg ( &a->x, &L3 );
field_add ( &L3, &L1, &L1 );
field_bias ( &L3, 1 );
field_mulw_scc ( &a->x, &L2, EDWARDS_D-1 ); /* PERF MULW */
field_add ( &L3, &L1, &L1 ); /* FIXME: i adjusted the bias here, was it right? */
field_add ( &a->y, &L3, &L3 );
field_add ( &L3, &a->y, &a->x );
IF32( field_weak_reduce( &L3 ) );
@@ -694,11 +633,9 @@ deserialize_and_twist_approx (
field_sqr ( &a->z, sz );
field_copy ( &a->y, &a->z );
field_addw ( &a->y, 1 );
field_sqr ( &a->x, &a->y );
field_mulw ( &a->y, &a->x, 1-EDWARDS_D );
field_neg ( &a->x, &a->y );
field_sqr ( &L0, &a->y );
field_mulw_scc ( &a->x, &L0, EDWARDS_D-1 );
field_add ( &a->y, &a->z, &a->z );
field_bias ( &a->y, 1 );
field_add ( &a->u, &a->y, &a->y );
field_add ( &a->y, &a->u, &a->x );
IF32( field_weak_reduce( &a->y ) );


+ 0
- 2
src/include/ec_point.h View File

@@ -543,8 +543,6 @@ copy_tw_pniels (
field_copy ( &a->z, &ds->z );
}



#ifdef __cplusplus
}; /* extern "C" */
#endif


+ 46
- 28
src/include/field.h View File

@@ -1,40 +1,16 @@
/**
* @file field.h
* @brief Field switch code.
* @brief Generic field header.
* @copyright
* Copyright (c) 2014 Cryptography Research, Inc. \n
* Released under the MIT License. See LICENSE.txt for license information.
* @author Mike Hamburg
*/

#ifndef __FIELD_H__
#define __FIELD_H__

#include <string.h>
#include "constant_time.h"

#include "p448.h"
#define FIELD_BITS 448
#define field_t p448_t
#define field_mul p448_mul
#define field_sqr p448_sqr
#define field_add p448_add
#define field_sub p448_sub
#define field_mulw p448_mulw
#define field_addw p448_addw
#define field_subw p448_subw
#define field_neg p448_neg
#define field_set_ui p448_set_ui
#define field_bias p448_bias
#define field_cond_neg p448_cond_neg
#define field_inverse p448_inverse
#define field_eq p448_eq
#define field_isr p448_isr
#define field_simultaneous_invert p448_simultaneous_invert
#define field_weak_reduce p448_weak_reduce
#define field_strong_reduce p448_strong_reduce
#define field_serialize p448_serialize
#define field_deserialize p448_deserialize
#define field_is_zero p448_is_zero
#include "f_field.h"

/** @brief Bytes in a field element */
#define FIELD_BYTES (1+(FIELD_BITS-1)/8)
@@ -42,6 +18,22 @@
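/** @brief Words in a field element. NOTE(review): the macro below divides a bit count by sizeof(word_t), which is a byte count; dividing by WORD_BITS may have been intended -- confirm. */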
#define FIELD_WORDS (1+(FIELD_BITS-1)/sizeof(word_t))

/* TODO: standardize notation */
/** @brief The number of words in the Goldilocks field. */
#define GOLDI_FIELD_WORDS DIV_CEIL(FIELD_BITS,WORD_BITS)

/** @brief The number of bits in the Goldilocks curve's cofactor (cofactor=4). */
#define COFACTOR_BITS 2

/** @brief The number of bits in a Goldilocks scalar. */
#define SCALAR_BITS (FIELD_BITS - COFACTOR_BITS)

/** @brief The number of bytes in a Goldilocks scalar. */
#define SCALAR_BYTES (1+(SCALAR_BITS)/8)

/** @brief The number of words in a Goldilocks scalar. */
#define SCALAR_WORDS WORDS_FOR_BITS(SCALAR_BITS)

/**
* @brief For GMP tests: little-endian representation of the field modulus.
*/
@@ -119,5 +111,31 @@ field_eq (
const struct field_t *a,
const struct field_t *b
);
/**
 * @brief Repeated squaring: set y to x squared n times, i.e. x^(2^n).
 *
 * @param [out] y Receives the result; declared __restrict__.
 * @param [in] x The value to square.
 * @param [in] n Number of squarings; must be positive (checked by assert).
 */
static __inline__ void
__attribute__((unused,always_inline))
field_sqrn (
    field_t *__restrict__ y,
    const field_t *x,
    int n
) {
    field_t scratch;
    assert(n>0);

    /* Peel off one or two squarings so the remaining count is even. */
    if ((n & 1) == 0) {
        field_sqr(&scratch, x);
        field_sqr(y, &scratch);
        n -= 2;
    } else {
        field_sqr(y, x);
        n -= 1;
    }

    /* Two squarings per pass, bouncing through the scratch element. */
    while (n != 0) {
        field_sqr(&scratch, y);
        field_sqr(y, &scratch);
        n -= 2;
    }
}

#endif /* __FIELD_H__ */
#endif // __FIELD_H__

+ 14
- 33
src/include/magic.h View File

@@ -4,16 +4,24 @@
* Copyright (c) 2014 Cryptography Research, Inc. \n
* Released under the MIT License. See LICENSE.txt for license information.
* @author Mike Hamburg
* @brief Goldilocks magic numbers (group orders, coefficients, algo params etc).
* @brief Curve-independent declarations of magic numbers.
*/


#ifndef __GOLDI_MAGIC_H__
#define __GOLDI_MAGIC_H__ 1

#include "word.h"
#include "p448.h"
#include "ec_point.h"

/**
* @brief If true, use wider tables for the precomputed combs.
*/
#ifndef USE_BIG_COMBS
#if defined(__ARM_NEON__)
#define USE_BIG_COMBS 1
#else
#define USE_BIG_COMBS (WORD_BITS==64)
#endif
#endif

/* TODO: standardize notation */

@@ -32,16 +40,13 @@
/** @brief The number of words in a Goldilocks scalar. */
#define SCALAR_WORDS WORDS_FOR_BITS(SCALAR_BITS)

#include "f_magic.h"

/**
* @brief sqrt(d-1), used for point formats and twisting.
*/
extern const struct field_t sqrt_d_minus_1;

/**
* @brief The Edwards "d" term for this curve.
*/
static const int64_t EDWARDS_D = -39081;

/**
* @brief The base point for Goldilocks.
*/
@@ -76,34 +81,10 @@ extern const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS];
*/
#define SCALARMUL_WNAF_COMBO_TABLE_BITS 4

/**
* @brief If true, use wider tables for the precomputed combs.
*/
#ifndef USE_BIG_COMBS
#if defined(__ARM_NEON__)
#define USE_BIG_COMBS 1
#else
#define USE_BIG_COMBS (WORD_BITS==64)
#endif
#endif

/** @brief The number of combs to use for signed comb algo */
#define COMB_N (USE_BIG_COMBS ? 5 : 8)

/** @brief The number of teeth of the combs for signed comb algo */
#define COMB_T (USE_BIG_COMBS ? 5 : 4)

/** @brief The spacing the of combs for signed comb algo */
#define COMB_S (USE_BIG_COMBS ? 18 : 14)

/**
* @brief The bit width of the precomputed WNAF tables. Size is 2^this elements.
*/
#define WNAF_PRECMP_BITS 5

/**
* @brief crandom magic structure guard constant = "return 4", cf xkcd #221
*/
#define CRANDOM_MAGIC 0x72657475726e2034ull

#endif /* __GOLDI_MAGIC_H__ */

+ 8
- 3
src/include/word.h View File

@@ -37,9 +37,12 @@ typedef int64_t sword_t;
typedef __int128_t dsword_t;
#define PRIxWORD PRIx64
#define PRIxWORDfull "%016" PRIx64
#define PRIxWORD58 "%014" PRIx64
#define PRIxWORD56 "%014" PRIx64
/* Zero-padded 15-hex-digit format for a 64-bit word (by its name, meant for
 * 60-bit limbs). Uses PRIx64: <inttypes.h> defines no PRIx60, so the previous
 * expansion could not compile wherever this macro was used. */
#define PRIxWORD60 "%015" PRIx64
#define U64LE(x) x##ull
#define U58LE(x) x##ull
#define U56LE(x) x##ull
#define U60LE(x) x##ull
#define letohWORD letoh64
#define GOLDI_BITS 64
#else
@@ -51,9 +54,11 @@ typedef int32_t sword_t;
typedef int64_t dsword_t;
#define PRIxWORD PRIx32
#define PRIxWORDfull "%08" PRIx32
#define PRIxWORD58 "%07" PRIx32
#define PRIxWORD56 "%07" PRIx32
#define U64LE(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
#define U58LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
#define U58LE(x) (x##ull)&((1ull<<29)-1), (x##ull)>>29
#define U56LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
#define U60LE(x) (x##ull)&((1ull<<30)-1), (x##ull)>>30
#define letohWORD letoh32
#define GOLDI_BITS 32
#endif


src/arch_32/arch_config.h → src/p448/arch_32/arch_config.h View File


src/arch_32/p448.c → src/p448/arch_32/p448.c View File


src/arch_32/p448.h → src/p448/arch_32/p448.h View File


src/arch_arm_32/arch_config.h → src/p448/arch_arm_32/arch_config.h View File


src/arch_arm_32/p448.c → src/p448/arch_arm_32/p448.c View File


src/arch_arm_32/p448.h → src/p448/arch_arm_32/p448.h View File


src/arch_neon/arch_config.h → src/p448/arch_neon/arch_config.h View File


src/arch_neon/neon_emulation.h → src/p448/arch_neon/neon_emulation.h View File


src/arch_neon/p448.c → src/p448/arch_neon/p448.c View File


src/arch_neon/p448.h → src/p448/arch_neon/p448.h View File


src/arch_neon_experimental/arch_config.h → src/p448/arch_neon_experimental/arch_config.h View File


src/arch_neon_experimental/p448.c → src/p448/arch_neon_experimental/p448.c View File


src/arch_neon_experimental/p448.h → src/p448/arch_neon_experimental/p448.h View File


src/arch_ref64/arch_config.h → src/p448/arch_ref64/arch_config.h View File


src/arch_ref64/p448.c → src/p448/arch_ref64/p448.c View File


src/arch_ref64/p448.h → src/p448/arch_ref64/p448.h View File


src/arch_x86_64/arch_config.h → src/p448/arch_x86_64/arch_config.h View File


src/arch_x86_64/p448.c → src/p448/arch_x86_64/p448.c View File


src/arch_x86_64/p448.h → src/p448/arch_x86_64/p448.h View File


src/arch_x86_64/x86-64-arith.h → src/p448/arch_x86_64/x86-64-arith.h View File


+ 43
- 0
src/p448/f_arithmetic.c View File

@@ -0,0 +1,43 @@
/**
* @cond internal
* @file f_arithmetic.c
* @copyright
* Copyright (c) 2014 Cryptography Research, Inc. \n
* Released under the MIT License. See LICENSE.txt for license information.
* @author Mike Hamburg
* @brief Field-specific arithmetic.
*/

#include "ec_point.h"

/**
 * @brief Raise x to the fixed power 2^446 - 2^222 - 1 using an addition chain
 * of field_sqr / field_sqrn / field_mul calls.
 *
 * Assuming the p448 modulus is p = 2^448 - 2^224 - 1 (not shown in this file),
 * that exponent equals (p-3)/4, which would make the result the usual
 * inverse-square-root candidate for x -- TODO confirm against the field
 * definition.
 *
 * The sequence of operations does not depend on the value of x. The output
 * a is written only by the final multiplication, after the last read of x.
 *
 * @param [out] a Receives x^(2^446 - 2^222 - 1).
 * @param [in] x The field element to exponentiate.
 */
void
field_isr (
struct field_t* a,
const struct field_t* x
) {
/* L0..L2 are scratch elements; the comments track each one's exponent of x. */
struct field_t L0, L1, L2;
field_sqr ( &L1, x );
field_mul ( &L2, x, &L1 );
field_sqr ( &L1, &L2 );
field_mul ( &L2, x, &L1 );
/* L2 = x^(2^3 - 1) */
field_sqrn ( &L1, &L2, 3 );
field_mul ( &L0, &L2, &L1 );
/* L0 = x^(2^6 - 1) */
field_sqrn ( &L1, &L0, 3 );
field_mul ( &L0, &L2, &L1 );
/* L0 = x^(2^9 - 1) */
field_sqrn ( &L2, &L0, 9 );
field_mul ( &L1, &L0, &L2 );
/* L1 = x^(2^18 - 1) */
field_sqr ( &L0, &L1 );
field_mul ( &L2, x, &L0 );
/* L2 = x^(2^19 - 1) */
field_sqrn ( &L0, &L2, 18 );
field_mul ( &L2, &L1, &L0 );
/* L2 = x^(2^37 - 1) */
field_sqrn ( &L0, &L2, 37 );
field_mul ( &L1, &L2, &L0 );
/* L1 = x^(2^74 - 1) */
field_sqrn ( &L0, &L1, 37 );
field_mul ( &L1, &L2, &L0 );
/* L1 = x^(2^111 - 1) */
field_sqrn ( &L0, &L1, 111 );
field_mul ( &L2, &L1, &L0 );
/* L2 = x^(2^222 - 1) */
field_sqr ( &L0, &L2 );
field_mul ( &L1, x, &L0 );
/* L1 = x^(2^223 - 1) */
field_sqrn ( &L0, &L1, 223 );
/* a = x^((2^446 - 2^223) + (2^222 - 1)) = x^(2^446 - 2^222 - 1) */
field_mul ( a, &L2, &L0 );
}

+ 39
- 0
src/p448/f_field.h View File

@@ -0,0 +1,39 @@
/**
* @file f_field.h
* @brief Field-specific code.
* @copyright
* Copyright (c) 2014 Cryptography Research, Inc. \n
* Released under the MIT License. See LICENSE.txt for license information.
* @author Mike Hamburg
*/
#ifndef __F_FIELD_H__
#define __F_FIELD_H__ 1

#include <string.h>
#include "constant_time.h"

#include "p448.h"
/* Size of the field in bits, followed by aliases that map each generic
 * field_* name onto the corresponding p448_* symbol. */
#define FIELD_BITS 448
#define field_t p448_t
#define field_mul p448_mul
#define field_sqr p448_sqr
#define field_add p448_add
#define field_sub p448_sub
#define field_mulw p448_mulw
#define field_addw p448_addw
#define field_subw p448_subw
#define field_neg p448_neg
#define field_set_ui p448_set_ui
#define field_bias p448_bias
#define field_cond_neg p448_cond_neg
#define field_inverse p448_inverse
#define field_eq p448_eq
#define field_isr p448_isr
#define field_simultaneous_invert p448_simultaneous_invert
#define field_weak_reduce p448_weak_reduce
#define field_strong_reduce p448_strong_reduce
#define field_serialize p448_serialize
#define field_deserialize p448_deserialize
#define field_is_zero p448_is_zero

#endif /* __F_FIELD_H__ */

+ 35
- 0
src/p448/f_magic.h View File

@@ -0,0 +1,35 @@
/**
* @file f_magic.h
* @copyright
* Copyright (c) 2014 Cryptography Research, Inc. \n
* Released under the MIT License. See LICENSE.txt for license information.
* @author Mike Hamburg
* @brief Goldilocks magic numbers (group orders, coefficients, algo params etc).
*/

#ifndef __GOLDI_F_MAGIC_H__
#define __GOLDI_F_MAGIC_H__ 1

#include "field.h"
#include "ec_point.h"

/**
* @brief The Edwards "d" term for this curve.
*/
static const int64_t EDWARDS_D = -39081;

/** @brief The number of combs to use for signed comb algo */
#define COMB_N (USE_BIG_COMBS ? 5 : 8)

/** @brief The number of teeth of the combs for signed comb algo */
#define COMB_T (USE_BIG_COMBS ? 5 : 4)

/** @brief The spacing of the combs for signed comb algo */
#define COMB_S (USE_BIG_COMBS ? 18 : 14)

/**
* @brief crandom magic structure guard constant = "return 4", cf xkcd #221
*/
#define CRANDOM_MAGIC 0x72657475726e2034ull

#endif /* __GOLDI_F_MAGIC_H__ */

+ 123
- 0
src/p448/field.h View File

@@ -0,0 +1,123 @@
/**
* @file field.h
* @brief Field switch code.
* @copyright
* Copyright (c) 2014 Cryptography Research, Inc. \n
* Released under the MIT License. See LICENSE.txt for license information.
* @author Mike Hamburg
*/
#ifndef __FIELD_H__
#define __FIELD_H__

#include <string.h>
#include "constant_time.h"

#include "p448.h"
/* Size of the field in bits, followed by aliases that map each generic
 * field_* name onto the corresponding p448_* symbol. Because of these
 * aliases, every field_* declaration and definition later in this header
 * is really a p448_* one after preprocessing. */
#define FIELD_BITS 448
#define field_t p448_t
#define field_mul p448_mul
#define field_sqr p448_sqr
#define field_add p448_add
#define field_sub p448_sub
#define field_mulw p448_mulw
#define field_addw p448_addw
#define field_subw p448_subw
#define field_neg p448_neg
#define field_set_ui p448_set_ui
#define field_bias p448_bias
#define field_cond_neg p448_cond_neg
#define field_inverse p448_inverse
#define field_eq p448_eq
#define field_isr p448_isr
#define field_simultaneous_invert p448_simultaneous_invert
#define field_weak_reduce p448_weak_reduce
#define field_strong_reduce p448_strong_reduce
#define field_serialize p448_serialize
#define field_deserialize p448_deserialize
#define field_is_zero p448_is_zero

/** @brief Bytes in a field element */
#define FIELD_BYTES (1+(FIELD_BITS-1)/8)

/**
 * @brief Words in a field element
 *
 * NOTE(review): FIELD_BITS is a bit count but sizeof(word_t) is a byte count,
 * so this evaluates to ceil(FIELD_BITS / sizeof(word_t)) rather than
 * ceil(FIELD_BITS / bits-per-word). For FIELD_BITS = 448 and an 8-byte word_t
 * that is 56 (the same as FIELD_BYTES), not 7. Confirm against the users of
 * this macro whether (8*sizeof(word_t)) was intended before changing it.
 */
#define FIELD_WORDS (1+(FIELD_BITS-1)/sizeof(word_t))

/**
* @brief For GMP tests: little-endian representation of the field modulus.
*/
extern const uint8_t FIELD_MODULUS[FIELD_BYTES];

/**
 * Copy the field element *b into *a.
 *
 * Both pointers are restrict-qualified, so the two elements must not overlap.
 */
static inline void
__attribute__((unused,always_inline))
field_copy (
    struct field_t *__restrict__ a,
    const struct field_t *__restrict__ b
) {
    const size_t element_size = sizeof(struct field_t);
    memcpy(a, b, element_size);
}

/**
 * Conditionally negate a in place, controlled by the mask doNegate.
 *
 * The negation is always computed into a temporary (field_neg followed by
 * field_bias with amount 2); constant_time_select then chooses between that
 * temporary and the original value and writes the choice back into a.
 */
static inline void
__attribute__((unused,always_inline))
field_cond_neg(
    field_t *a,
    mask_t doNegate
) {
    struct field_t flipped;

    field_neg(&flipped, a);
    field_bias(&flipped, 2);

    constant_time_select(a, &flipped, a, sizeof(flipped), doNegate);
}

/**
* Returns 1/sqrt(+- x).
*
* The Legendre symbol of the result is the same as that of the
* input.
*
* If x=0, returns 0.
*/
void
field_isr (
struct field_t* a,
const struct field_t* x
);
/**
* Batch inverts out[i] = 1/in[i]
*
* If any input is zero, all the outputs will be zero.
*/
void
field_simultaneous_invert (
struct field_t *__restrict__ out,
const struct field_t *in,
unsigned int n
);

/**
* Returns 1/x.
*
* If x=0, returns 0.
*/
void
field_inverse (
struct field_t* a,
const struct field_t* x
);

/**
* Returns -1 if a==b, 0 otherwise.
*/
mask_t
field_eq (
const struct field_t *a,
const struct field_t *b
);

#endif /* __FIELD_H__ */

src/magic.c → src/p448/magic.c View File

@@ -39,10 +39,10 @@ const struct affine_t goldilocks_base_point = {
0x4c63d96,0x4609845,0xf3932d9,0x1b4faff, 0x6147eaa,0xa2692ff,0x9cecfa9,0x297ea0e
}},
#else
{{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7),
U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa),
U58LE(0x5f065c3c59d070), U58LE(0x3a6a26adf73324),
U58LE(0x1b4faff4609845), U58LE(0x297ea0ea2692ff)
{{ U56LE(0xf0de840aed939f), U56LE(0xc170033f4ba0c7),
U56LE(0xf3932d94c63d96), U56LE(0x9cecfa96147eaa),
U56LE(0x5f065c3c59d070), U56LE(0x3a6a26adf73324),
U56LE(0x1b4faff4609845), U56LE(0x297ea0ea2692ff)
}},
#endif
{{ 19 }}
@@ -69,13 +69,13 @@ sqrt_d_minus_1 = {{
0xbdeea38,0x748734a,0x5a189aa,0x49443b8,
0x6f14c06,0x0b25b7a,0x51e65ca,0x12fec0c
#else
U58LE(0xd2e21836749f46),
U58LE(0x888db42b4f0179),
U58LE(0x5a189aabdeea38),
U58LE(0x51e65ca6f14c06),
U58LE(0xa49f7b424d9770),
U58LE(0xdcac4628c5f656),
U58LE(0x49443b8748734a),
U58LE(0x12fec0c0b25b7a)
U56LE(0xd2e21836749f46),
U56LE(0x888db42b4f0179),
U56LE(0x5a189aabdeea38),
U56LE(0x51e65ca6f14c06),
U56LE(0xa49f7b424d9770),
U56LE(0xdcac4628c5f656),
U56LE(0x49443b8748734a),
U56LE(0x12fec0c0b25b7a)
#endif
}};

+ 1
- 0
src/p480/arch_x86_64/arch_config.h View File

@@ -0,0 +1 @@
#define WORD_BITS 64

+ 435
- 0
src/p480/arch_x86_64/p480.c View File

@@ -0,0 +1,435 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

#include "p480.h"
#include "x86-64-arith.h"

/*
 * Field multiplication: writes the product of *as and *bs to *cs.
 *
 * Elements are eight uint64_t limbs; every output limb is first stored masked
 * to 60 bits, and the last two carries are then added (unmasked) into c[4]
 * and c[0].
 *
 * The structure looks Karatsuba-like: aa = a_lo + a_hi, bb = b_lo + b_hi and
 * bbb = bb + b_hi are precomputed (lo = limbs 0..3, hi = limbs 4..7), and
 * products of the lo halves, the hi halves and the sums are combined by
 * addition and subtraction in three 128-bit accumulators.
 * NOTE(review): the exact term bookkeeping was not re-derived here; the order
 * of the mac/msb calls and shifts must be preserved.
 *
 * cs is restrict-qualified, so it must not alias as or bs.
 */
void
p480_mul (
p480_t *__restrict__ cs,
const p480_t *as,
const p480_t *bs
) {
const uint64_t *a = as->limb, *b = bs->limb;
uint64_t *c = cs->limb;

__uint128_t accum0 = 0, accum1 = 0, accum2;
/* Low 60 bits of a limb. */
uint64_t mask = (1ull<<60) - 1;

uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))), bbb[4] __attribute__((aligned(32)));

/* For some reason clang doesn't vectorize this without prompting? */
/* Vector form of: aa[i] = a[i]+a[i+4]; bb[i] = b[i]+b[i+4]; bbb[i] = bb[i]+b[i+4]. */
unsigned int i;
for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];
((uint64xn_t*)bbb)[i] = ((const uint64xn_t*)bb)[i] + ((const uint64xn_t*)(&b[4]))[i];
}
/*
for (int i=0; i<4; i++) {
aa[i] = a[i] + a[i+4];
bb[i] = b[i] + b[i+4];
}
*/

/* First pass over output limbs 3 and 7 (they are revisited at the end). */
accum2 = widemul(&a[0],&b[3]);
accum0 = widemul(&aa[0],&bb[3]);
accum1 = widemul(&a[4],&b[7]);

mac(&accum2, &a[1], &b[2]);
mac(&accum0, &aa[1], &bb[2]);
mac(&accum1, &a[5], &b[6]);

mac(&accum2, &a[2], &b[1]);
mac(&accum0, &aa[2], &bb[1]);
mac(&accum1, &a[6], &b[5]);

mac(&accum2, &a[3], &b[0]);
mac(&accum0, &aa[3], &bb[0]);
mac(&accum1, &a[7], &b[4]);

accum0 -= accum2;
accum1 += accum2;

c[3] = ((uint64_t)(accum1)) & mask;
c[7] = ((uint64_t)(accum0)) & mask;

/* Carries out of limbs 3/7 seed the accumulators for limbs 0 and 4. */
accum0 >>= 60;
accum1 >>= 60;
mac(&accum0, &aa[1],&bb[3]);
mac(&accum1, &a[5], &b[7]);
mac(&accum0, &aa[2], &bb[2]);
mac(&accum1, &a[6], &b[6]);
mac(&accum0, &aa[3], &bb[1]);
accum1 += accum0;

accum2 = widemul(&a[0],&b[0]);
accum1 -= accum2;
accum0 += accum2;
msb(&accum0, &a[1], &b[3]);
msb(&accum0, &a[2], &b[2]);
mac(&accum1, &a[7], &b[5]);
msb(&accum0, &a[3], &b[1]);
mac(&accum1, &aa[0], &bb[0]);
mac(&accum0, &a[4], &b[4]);

c[0] = ((uint64_t)(accum0)) & mask;
c[4] = ((uint64_t)(accum1)) & mask;

accum0 >>= 60;
accum1 >>= 60;

/* Output limbs 1 and 5. */
accum2 = widemul(&a[2],&b[7]);
mac(&accum0, &a[6], &bb[3]);
mac(&accum1, &aa[2], &bbb[3]);

mac(&accum2, &a[3], &b[6]);
mac(&accum0, &a[7], &bb[2]);
mac(&accum1, &aa[3], &bbb[2]);

mac(&accum2, &a[0],&b[1]);
mac(&accum1, &aa[0], &bb[1]);
mac(&accum0, &a[4], &b[5]);

mac(&accum2, &a[1], &b[0]);
mac(&accum1, &aa[1], &bb[0]);
mac(&accum0, &a[5], &b[4]);

accum1 -= accum2;
accum0 += accum2;

c[1] = ((uint64_t)(accum0)) & mask;
c[5] = ((uint64_t)(accum1)) & mask;

accum0 >>= 60;
accum1 >>= 60;

/* Output limbs 2 and 6. */
accum2 = widemul(&a[3],&b[7]);
mac(&accum0, &a[7], &bb[3]);
mac(&accum1, &aa[3], &bbb[3]);

mac(&accum2, &a[0],&b[2]);
mac(&accum1, &aa[0], &bb[2]);
mac(&accum0, &a[4], &b[6]);

mac(&accum2, &a[1], &b[1]);
mac(&accum1, &aa[1], &bb[1]);
mac(&accum0, &a[5], &b[5]);

mac(&accum2, &a[2], &b[0]);
mac(&accum1, &aa[2], &bb[0]);
mac(&accum0, &a[6], &b[4]);

accum1 -= accum2;
accum0 += accum2;

c[2] = ((uint64_t)(accum0)) & mask;
c[6] = ((uint64_t)(accum1)) & mask;

accum0 >>= 60;
accum1 >>= 60;

/* Add the carries from limbs 2/6 onto the limb 3/7 values stored earlier. */
accum0 += c[3];
accum1 += c[7];
c[3] = ((uint64_t)(accum0)) & mask;
c[7] = ((uint64_t)(accum1)) & mask;

/* we could almost stop here, but it wouldn't be stable, so... */

/* Final carries: the one out of limb 3 goes into limb 4; the one out of
 * limb 7 is added to both limb 4 and limb 0. These two limbs are not
 * re-masked afterwards. */
accum0 >>= 60;
accum1 >>= 60;
c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
c[0] += ((uint64_t)(accum1));
}

/*
 * Multiply the field element *as by the single word b and store the result
 * in *cs.
 *
 * The limbs are processed as two independent carry chains of four limbs each
 * (0..3 and 4..7), every limb being stored masked to 60 bits. Afterwards the
 * carry out of limb 3 and the carry out of limb 7 are both added at limb 4,
 * and the carry out of limb 7 is additionally added at limb 0; any overflow
 * of those two additions is pushed into limbs 5 and 1 respectively.
 */
void
p480_mulw (
    p480_t *__restrict__ cs,
    const p480_t *as,
    uint64_t b
) {
    const uint64_t limb_mask = (1ull<<60) - 1;
    const uint64_t *in = as->limb;
    uint64_t *out = cs->limb;
    __uint128_t lo_chain, hi_chain;
    unsigned int k;

    /* First limb of each chain starts the accumulators. */
    lo_chain = widemul_rm(b, &in[0]);
    hi_chain = widemul_rm(b, &in[4]);

    out[0] = lo_chain & limb_mask;
    lo_chain >>= 60;
    out[4] = hi_chain & limb_mask;
    hi_chain >>= 60;

    /* Remaining three limbs of each chain. */
    for (k = 1; k < 4; k++) {
        mac_rm(&lo_chain, b, &in[k]);
        mac_rm(&hi_chain, b, &in[k + 4]);

        out[k] = lo_chain & limb_mask;
        lo_chain >>= 60;
        out[k + 4] = hi_chain & limb_mask;
        hi_chain >>= 60;
    }

    /* Carry out of limb 3 plus carry out of limb 7 land on limb 4. */
    lo_chain += hi_chain + out[4];
    out[4] = lo_chain & limb_mask;
    out[5] += lo_chain >> 60;

    /* Carry out of limb 7 also lands on limb 0. */
    hi_chain += out[0];
    out[0] = hi_chain & limb_mask;
    out[1] += hi_chain >> 60;
}

/*
 * Field squaring: writes the square of *as to *cs.
 *
 * Same limb layout and output convention as the multiplication routine in
 * this file: eight limbs, each output limb stored masked to 60 bits, with the
 * last two carries added (unmasked) into c[4] and c[0].
 *
 * Symmetric cross products are computed once and doubled, either through the
 * doubling helpers (mac2/msb2/widemul2) or, for the first pass over limbs
 * 3/7, by shifting the accumulated value left by one.
 * NOTE(review): the exact term bookkeeping was not re-derived here; the order
 * of operations must be preserved.
 *
 * cs is restrict-qualified, so it must not alias as.
 */
void
p480_sqr (
p480_t *__restrict__ cs,
const p480_t *as
) {
const uint64_t *a = as->limb;
uint64_t *c = cs->limb;

__uint128_t accum0 = 0, accum1 = 0, accum2;
/* Low 60 bits of a limb. */
uint64_t mask = (1ull<<60) - 1;

uint64_t aa[4] __attribute__((aligned(32)));

/* For some reason clang doesn't vectorize this without prompting? */
/* Vector form of: aa[i] = a[i] + a[i+4] for i in 0..3. */
unsigned int i;
for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
}

/* First pass over output limbs 3 and 7: only half of the symmetric cross
 * terms are accumulated here. */
accum2 = widemul(&a[0],&a[3]);
accum0 = widemul(&aa[0],&aa[3]);
accum1 = widemul(&a[4],&a[7]);

mac(&accum2, &a[1], &a[2]);
mac(&accum0, &aa[1], &aa[2]);
mac(&accum1, &a[5], &a[6]);

accum0 -= accum2;
accum1 += accum2;

/* The <<1 doubles the half-sum before masking ... */
c[3] = ((uint64_t)(accum1))<<1 & mask;
c[7] = ((uint64_t)(accum0))<<1 & mask;

/* ... and shifting by 59 instead of 60 applies the same doubling to the carry. */
accum0 >>= 59;
accum1 >>= 59;

/* Output limbs 0 and 4. */
mac2(&accum0, &aa[1],&aa[3]);
mac2(&accum1, &a[5], &a[7]);
mac(&accum0, &aa[2], &aa[2]);
accum1 += accum0;

msb2(&accum0, &a[1], &a[3]);
mac(&accum1, &a[6], &a[6]);
accum2 = widemul(&a[0],&a[0]);
accum1 -= accum2;
accum0 += accum2;

msb(&accum0, &a[2], &a[2]);
mac(&accum1, &aa[0], &aa[0]);
mac(&accum0, &a[4], &a[4]);

c[0] = ((uint64_t)(accum0)) & mask;
c[4] = ((uint64_t)(accum1)) & mask;

accum0 >>= 60;
accum1 >>= 60;

/* Output limbs 1 and 5. */
accum2 = widemul2(&aa[2],&aa[3]);
msb2(&accum0, &a[2], &a[3]);
mac2(&accum1, &a[6], &a[7]);

accum1 += accum2;
accum0 += accum2;

accum2 = widemul2(&a[0],&a[1]);
mac2(&accum1, &aa[0], &aa[1]);
mac2(&accum0, &a[4], &a[5]);

accum1 -= accum2;
accum0 += accum2;

c[1] = ((uint64_t)(accum0)) & mask;
c[5] = ((uint64_t)(accum1)) & mask;

accum0 >>= 60;
accum1 >>= 60;

/* Output limbs 2 and 6. */
accum2 = widemul(&aa[3],&aa[3]);
msb(&accum0, &a[3], &a[3]);
mac(&accum1, &a[7], &a[7]);

accum1 += accum2;
accum0 += accum2;

accum2 = widemul2(&a[0],&a[2]);
mac2(&accum1, &aa[0], &aa[2]);
mac2(&accum0, &a[4], &a[6]);

mac(&accum2, &a[1], &a[1]);
mac(&accum1, &aa[1], &aa[1]);
mac(&accum0, &a[5], &a[5]);

accum1 -= accum2;
accum0 += accum2;

c[2] = ((uint64_t)(accum0)) & mask;
c[6] = ((uint64_t)(accum1)) & mask;

accum0 >>= 60;
accum1 >>= 60;

/* Add the carries from limbs 2/6 onto the limb 3/7 values stored earlier. */
accum0 += c[3];
accum1 += c[7];
c[3] = ((uint64_t)(accum0)) & mask;
c[7] = ((uint64_t)(accum1)) & mask;

/* we could almost stop here, but it wouldn't be stable, so... */

/* Final carries: the one out of limb 3 goes into limb 4; the one out of
 * limb 7 is added to both limb 4 and limb 0. */
accum0 >>= 60;
accum1 >>= 60;
c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
c[0] += ((uint64_t)(accum1));
}

/*
 * Fully reduce *a in place.
 *
 * Steps:
 *   1. Fold the bits of limb 7 above bit 60 into limbs 4 and 0 and mask
 *      limb 7.
 *   2. Subtract p limb by limb with a signed running borrow. In radix 2^60
 *      every limb of p is all ones except limb 4, whose lowest bit is clear.
 *   3. The final borrow is asserted to be 0 or -1. It is turned into a mask
 *      and (p & mask) is added back, which undoes the subtraction exactly
 *      when it went negative.
 *
 * There is no data-dependent branch; the add-back is selected by masking.
 */
void
p480_strong_reduce (
    p480_t *a
) {
    const uint64_t mask = (1ull<<60)-1;
    uint64_t *limb = a->limb;
    int k;

    /* Step 1: clear the excess bits of the top limb. */
    limb[4] += limb[7]>>60;
    limb[0] += limb[7]>>60;
    limb[7] &= mask;

    /* Step 2: compute value - p. */
    __int128_t borrow = 0;
    for (k=0; k<8; k++) {
        const uint64_t p_limb = (k==4) ? mask-1 : mask;
        borrow = borrow + limb[k] - p_limb;
        limb[k] = borrow & mask;
        borrow >>= 60;
    }

    /* borrow == 0: the value was >= p and the limbs now hold value - p.
     * borrow == -1: the value was < p, so p has to be added back. */
    assert(is_zero(borrow) | is_zero(borrow+1));

    /* Step 3: conditional add-back of p. */
    const uint64_t addback = borrow & mask;
    __uint128_t carry = 0;
    for (k=0; k<8; k++) {
        const uint64_t p_limb = (k==4) ? (addback&~1) : addback;
        carry = carry + limb[k] + p_limb;
        limb[k] = carry & mask;
        carry >>= 60;
    }

    /* The carry off the top must cancel the borrow. */
    assert(is_zero(carry + borrow));
}

mask_t
p480_is_zero (
const struct p480_t *a
) {
struct p480_t b;
p480_copy(&b,a);
p480_strong_reduce(&b);

uint64_t any = 0;
int i;
for (i=0; i<8; i++) {
any |= b.limb[i];
}
return is_zero(any);
}

/*
 * Write the strongly reduced value of *x to serial as 60 little-endian bytes.
 *
 * Limbs are handled in pairs: two 60-bit limbs make 120 bits = 15 bytes.
 * The first limb yields 7 whole bytes plus 4 leftover bits; those 4 bits
 * become the low nibble under the second limb, which is shifted up by 4 and
 * emitted as 8 more bytes.
 */
void
p480_serialize (
    uint8_t *serial,
    const struct p480_t *x
) {
    p480_t red;
    uint8_t *dst = serial;
    int pair, j;

    p480_copy(&red, x);
    p480_strong_reduce(&red);

    for (pair=0; pair<4; pair++) {
        word_t r = red.limb[2*pair];

        /* Low 56 bits of the even limb. */
        for (j=0; j<7; j++) {
            *dst++ = r;
            r >>= 8;
        }
        assert(r<16);

        /* Remaining 4 bits, with the odd limb stacked on top. */
        r += red.limb[2*pair+1]<<4;
        for (j=0; j<8; j++) {
            *dst++ = r;
            r >>= 8;
        }
        assert(r==0);
    }
}

/*
 * Read 60 little-endian bytes from serial into the eight 60-bit limbs of *x.
 *
 * Each group of 15 bytes fills two limbs: 8 bytes are gathered into a word,
 * whose low 60 bits become the even limb; the 4 bits left over are the bottom
 * of the odd limb, which is completed with the next 7 bytes.
 *
 * The return value is ~is_zero(ge ^ mask), where ge ends up equal to mask
 * exactly when the decoded limbs are >= p. No branch depends on the data.
 */
mask_t
p480_deserialize (
    p480_t *x,
    const uint8_t serial[60]
) {
    const uint8_t *src = serial;
    int pair, j, i;

    for (pair=0; pair<4; pair++) {
        word_t r = 0;

        for (j=0; j<8; j++) {
            r |= ((word_t)*src++)<<(8*j);
        }
        x->limb[2*pair] = r & ((1ull<<60)-1);
        r >>= 60;

        for (j=0; j<7; j++) {
            r |= ((word_t)*src++)<<(8*j+4);
        }
        x->limb[2*pair+1] = r;
    }

    /* Reduction check.
     *
     * In radix 2^60, p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111),
     * i.e. every limb is all ones except limb 4, whose lowest bit is clear.
     * ge tracks, from the bottom up, whether the limbs seen so far are >=
     * the corresponding limbs of p.
     */
    word_t ge = -1, mask = (1ull<<60)-1;

    /* ge stays all-ones (within 60 bits) iff limbs 0..3 are all ones. */
    for (i=0; i<4; i++) {
        ge &= x->limb[i];
    }

    /* Limb 4: keep ge if the limb is 1110, force it on if the limb is 1111. */
    ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);

    /* Limbs 5..7 must again be all ones. */
    for (i=5; i<8; i++) {
        ge &= x->limb[i];
    }

    return ~is_zero(ge ^ mask);
}


+ 257
- 0
src/p480/arch_x86_64/p480.h View File

@@ -0,0 +1,257 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#ifndef __p480_H__
#define __p480_H__ 1

#include <stdint.h>
#include <assert.h>

#include "word.h"

/* A field element stored as eight 64-bit limbs, 32-byte aligned.
 * The arithmetic for this type masks limbs with (1ull<<60)-1, i.e. the
 * element is kept in radix 2^60 with 4 spare bits per limb. */
typedef struct p480_t {
uint64_t limb[8];
} __attribute__((aligned(32))) p480_t;

#ifdef __cplusplus
extern "C" {
#endif

static __inline__ void
p480_set_ui (
p480_t *out,
uint64_t x
) __attribute__((unused,always_inline));

static __inline__ void
p480_add (
p480_t *out,
const p480_t *a,
const p480_t *b
) __attribute__((unused,always_inline));
static __inline__ void
p480_sub (
p480_t *out,
const p480_t *a,
const p480_t *b
) __attribute__((unused,always_inline));
static __inline__ void
p480_neg (
p480_t *out,
const p480_t *a
) __attribute__((unused,always_inline));

static __inline__ void
p480_addw (
p480_t *a,
uint64_t x
) __attribute__((unused,always_inline));
static __inline__ void
p480_subw (
p480_t *a,
uint64_t x
) __attribute__((unused,always_inline));
static __inline__ void
p480_copy (
p480_t *out,
const p480_t *a
) __attribute__((unused,always_inline));
static __inline__ void
p480_weak_reduce (
p480_t *inout
) __attribute__((unused,always_inline));
void
p480_strong_reduce (
p480_t *inout
);

mask_t
p480_is_zero (
const p480_t *in
);
static __inline__ void
p480_bias (
p480_t *inout,
int amount
) __attribute__((unused,always_inline));
void
p480_mul (
p480_t *__restrict__ out,
const p480_t *a,
const p480_t *b
);

void
p480_mulw (
p480_t *__restrict__ out,
const p480_t *a,
uint64_t b
);

void
p480_sqr (
p480_t *__restrict__ out,
const p480_t *a
);

void
p480_serialize (
uint8_t *serial,
const struct p480_t *x
);

mask_t
p480_deserialize (
p480_t *x,
const uint8_t serial[60]
);

/* -------------- Inline functions begin here -------------- */

/* Set *out to the small value x: limb 0 receives x, limbs 1..7 are zeroed. */
void
p480_set_ui (
    p480_t *out,
    uint64_t x
) {
    uint64_t *limb = out->limb;
    int k;

    limb[0] = x;
    for (k=7; k>=1; k--) {
        limb[k] = 0;
    }
}

/*
 * Limb-wise addition: out->limb[i] = a->limb[i] + b->limb[i] for every limb.
 *
 * No carry propagation or reduction is performed. The element is walked in
 * uint64xn_t-sized chunks so the additions are done a vector at a time.
 */
void
p480_add (
    p480_t *out,
    const p480_t *a,
    const p480_t *b
) {
    uint64xn_t *dst = (uint64xn_t*)out;
    const uint64xn_t *lhs = (const uint64xn_t*)a;
    const uint64xn_t *rhs = (const uint64xn_t*)b;
    const unsigned int chunks = sizeof(*out)/sizeof(uint64xn_t);
    unsigned int k;

    for (k=0; k<chunks; k++) {
        dst[k] = lhs[k] + rhs[k];
    }
}

/*
 * Limb-wise subtraction: out->limb[i] = a->limb[i] - b->limb[i] for every
 * limb.
 *
 * No borrow propagation or reduction is performed. The element is walked in
 * uint64xn_t-sized chunks so the subtractions are done a vector at a time.
 */
void
p480_sub (
    p480_t *out,
    const p480_t *a,
    const p480_t *b
) {
    uint64xn_t *dst = (uint64xn_t*)out;
    const uint64xn_t *lhs = (const uint64xn_t*)a;
    const uint64xn_t *rhs = (const uint64xn_t*)b;
    const unsigned int chunks = sizeof(*out)/sizeof(uint64xn_t);
    unsigned int k;

    for (k=0; k<chunks; k++) {
        dst[k] = lhs[k] - rhs[k];
    }
}

void
p480_neg (
struct p480_t *out,
const p480_t *a
) {
unsigned int i;
for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
((uint64xn_t*)out)[i] = -((const uint64xn_t*)a)[i];
}
/*
unsigned int i;
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
out->limb[i] = -a->limb[i];
}
*/
}

/* Add the word x to limb 0 of *a; no carry is propagated. */
void
p480_addw (
    p480_t *a,
    uint64_t x
) {
    uint64_t *lowest = &a->limb[0];
    *lowest = *lowest + x;
}
/* Subtract the word x from limb 0 of *a; no borrow is propagated. */
void
p480_subw (
    p480_t *a,
    uint64_t x
) {
    uint64_t *lowest = &a->limb[0];
    *lowest = *lowest - x;
}

/* Copy *a to *out, one big_register_t-sized chunk at a time. */
void
p480_copy (
    p480_t *out,
    const p480_t *a
) {
    big_register_t *dst = (big_register_t *)out;
    const big_register_t *src = (const big_register_t *)a;
    const unsigned int chunks = sizeof(*out)/sizeof(big_register_t);
    unsigned int k;

    for (k=0; k<chunks; k++) {
        dst[k] = src[k];
    }
}

/*
 * Add amt copies of a fixed per-limb pattern to *a.
 *
 * Every limb receives amt * (2^60 - 1), except limb 4, which receives amt
 * less than that, i.e. amt * (2^60 - 2). That pattern matches the limbs of
 * the modulus used by the strong-reduction routine for this field, so this
 * adds amt * p limb by limb without any carrying.
 *
 * Three equivalent code paths are selected at compile time: 4-lane vectors
 * (AVX2), 2-lane vectors (SSE2), or a scalar loop.
 */
void
p480_bias (
    p480_t *a,
    int amt
) {
    const uint64_t all_ones = (1ull<<60)-1;
    uint64_t plain = all_ones*amt, dented = plain-amt;
#if __AVX2__
    uint64x4_t *vec = (uint64x4_t*) a;
    uint64x4_t low_half = {plain,plain,plain,plain};
    uint64x4_t high_half = {dented,plain,plain,plain};   /* lane 0 is limb 4 */
    vec[0] += low_half;
    vec[1] += high_half;
#elif __SSE2__
    uint64x2_t *vec = (uint64x2_t*) a;
    uint64x2_t regular = {plain,plain};
    uint64x2_t special = {dented,plain};                 /* lane 0 is limb 4 */
    vec[0] += regular;
    vec[1] += regular;
    vec[2] += special;
    vec[3] += regular;
#else
    unsigned int k;
    for (k=0; k<sizeof(*a)/sizeof(uint64_t); k++) {
        if (k==4) {
            a->limb[k] += dented;
        } else {
            a->limb[k] += plain;
        }
    }
#endif
}

/*
 * One carry-propagation pass over *a.
 *
 * Each limb keeps its low 60 bits and receives the bits above bit 60 of the
 * limb below it. The excess of limb 7 wraps around and is added to both
 * limb 4 and limb 0. Limbs are visited from the top down so that every
 * limb's excess is read before that limb is masked.
 */
void
p480_weak_reduce (
    p480_t *a
) {
    /* PERF: use pshufb/palignr if anyone cares about speed of this */
    const uint64_t mask = (1ull<<60) - 1;
    uint64_t *limb = a->limb;
    const uint64_t wrap = limb[7] >> 60;
    int k = 7;

    /* Done first, so limb 5 also sees any overflow this addition causes. */
    limb[4] += wrap;

    while (k > 0) {
        limb[k] = (limb[k] & mask) + (limb[k-1]>>60);
        k--;
    }

    limb[0] = (limb[0] & mask) + wrap;
}

#ifdef __cplusplus
}; /* extern "C" */
#endif

#endif /* __p480_H__ */

+ 279
- 0
src/p480/arch_x86_64/x86-64-arith.h View File

@@ -0,0 +1,279 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

#ifndef __X86_64_ARITH_H__
#define __X86_64_ARITH_H__

#include <stdint.h>

/* TODO: non x86-64 versions of these.
* FUTURE: autogenerate
*/

/**
 * Full 64x64 -> 128-bit unsigned multiply of *a by *b.
 *
 * @param a Pointer to the first factor.
 * @param b Pointer to the second factor.
 * @return  The 128-bit product.
 *
 * Without BMI2 this uses the one-operand mulq (factor in rax, result in
 * rdx:rax); with BMI2 it uses mulx (factor in rdx, flags untouched).
 */
static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
    uint64_t c,d;
#ifndef __BMI2__
    /* rax is overwritten by the movq before %[b] has been read, so the rax
     * output must be earlyclobber ("=&a"); otherwise the compiler may keep
     * the address of *b in rax. rdx is only written by mulq itself, after
     * its memory operand has been read, so it needs no modifier. */
    __asm__ volatile
        ("movq %[a], %%rax;"
         "mulq %[b];"
         : [c]"=&a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rdx;"
         "mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx");
#endif
    return (((__uint128_t)(d))<<64) | c;
}

/**
 * Full 64x64 -> 128-bit unsigned multiply of a register value by *b.
 *
 * @param a First factor, passed by value.
 * @param b Pointer to the second factor.
 * @return  The 128-bit product.
 */
static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
    uint64_t c,d;
#ifndef __BMI2__
    /* rax is overwritten by the movq before %[b] has been read, so the rax
     * output must be earlyclobber ("=&a"); otherwise the compiler may keep
     * the address of *b in rax. */
    __asm__ volatile
        ("movq %[a], %%rax;"
         "mulq %[b];"
         : [c]"=&a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"r"(a)
         : "cc");
#else
    /* The "d" constraint places a in rdx, which mulx uses implicitly. */
    __asm__ volatile
        ("mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"d"(a));
#endif
    return (((__uint128_t)(d))<<64) | c;
}

/**
 * Computes (2 * *a) * *b as a 128-bit value.
 *
 * The doubling is done in a 64-bit register before the multiply, so *a must
 * be below 2^63 for the result to equal 2*a*b; a larger *a wraps.
 *
 * @param a Pointer to the factor that gets doubled.
 * @param b Pointer to the other factor.
 * @return  The 128-bit product of the doubled *a and *b.
 */
static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
    uint64_t c,d;
#ifndef __BMI2__
    /* rax is written by the movq/addq before %[b] has been read, so the rax
     * output must be earlyclobber ("=&a"); otherwise the compiler may keep
     * the address of *b in rax. */
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "
         "mulq %[b];"
         : [c]"=&a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "cc");
#else
    /* leaq doubles rdx without touching the flags. */
    __asm__ volatile
        ("movq %[a], %%rdx;"
         "leaq (,%%rdx,2), %%rdx;"
         "mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx");
#endif
    return (((__uint128_t)(d))<<64) | c;
}

/*
 * Multiply-accumulate: *acc += (*a) * (*b), as a 128-bit addition.
 *
 * The accumulator is split into 64-bit halves, updated with an add/adc pair,
 * and reassembled. A carry out of the high half is discarded.
 */
static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
/* mulx takes its implicit factor from rdx and leaves the flags alone. */
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
/* mulq leaves the product in rdx:rax; both are listed as clobbers. */
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}

/*
 * Double multiply-accumulate: the single product (*a) * (*b) is added to
 * both *acc and *acc2.
 *
 * Each accumulator is split into 64-bit halves, updated with its own add/adc
 * pair, and reassembled. Carries out of the high halves are discarded.
 */
static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
uint64_t lo2 = *acc2, hi2 = *acc2>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
"addq %[c], %[lo2]; "
"adcq %[d], %[hi2]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
"addq %%rax, %[lo2]; "
"adcq %%rdx, %[hi2]; "
: [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
*acc2 = (((__uint128_t)(hi2))<<64) | lo2;
}

/*
 * Multiply-accumulate with a by-value factor: *acc += a * (*b), as a 128-bit
 * addition. A carry out of the high half is discarded.
 */
static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
/* The "d" constraint places a in rdx, mulx's implicit factor. */
__asm__ volatile
("mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"d"(a)
: "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"r"(a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}

/*
 * Doubled multiply-accumulate: *acc += (2 * *a) * (*b).
 *
 * *a is doubled in a 64-bit register before the multiply, so it must be
 * below 2^63 for the doubling not to wrap. A carry out of the high half of
 * the accumulator is discarded.
 */
static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"addq %%rdx, %%rdx; "
"mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"addq %%rax, %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}

/*
 * Multiply-subtract: *acc -= (*a) * (*b), as a 128-bit subtraction.
 *
 * The accumulator is split into 64-bit halves, updated with a sub/sbb pair,
 * and reassembled, so the result wraps modulo 2^128.
 */
static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"subq %[c], %[lo]; "
"sbbq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"subq %%rax, %[lo]; "
"sbbq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}

/*
 * Doubled multiply-subtract: *acc -= (2 * *a) * (*b).
 *
 * *a is doubled in a 64-bit register before the multiply, so it must be
 * below 2^63 for the doubling not to wrap. The subtraction wraps modulo
 * 2^128.
 */
static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"addq %%rdx, %%rdx; "
"mulx %[b], %[c], %[d]; "
"subq %[c], %[lo]; "
"sbbq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"addq %%rax, %%rax; "
"mulq %[b]; "
"subq %%rax, %[lo]; "
"sbbq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}

/**
 * Multiply, reverse-subtract: *acc = (*a) * (*b) - *acc, modulo 2^128.
 *
 * @param acc In: the value to subtract. Out: product minus that value.
 * @param a   Pointer to the first factor.
 * @param b   Pointer to the second factor.
 *
 * Like the other helpers in this header, mulx is used only when the compiler
 * targets BMI2; otherwise the one-operand mulq form is used, so the function
 * does not execute a BMI2 instruction on a build that did not ask for it.
 */
static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t c,d, lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "subq %[lo], %[c]; "
         "sbbq %[hi], %[d]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    /* The product lands in rdx:rax and the old accumulator is subtracted
     * from it in place. Both outputs are earlyclobber because rax/rdx are
     * written before %[b], %[lo] and %[hi] have all been read. */
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "subq %[lo], %%rax; "
         "sbbq %[hi], %%rdx; "
         : [c]"=&a"(c), [d]"=&d"(d)
         : [b]"m"(*b), [a]"m"(*a), [lo]"r"(lo), [hi]"r"(hi)
         : "cc");
#endif
    *acc = (((__uint128_t)(d))<<64) | c;
}

/* Unsigned 64x64 -> 128-bit multiply of two by-value operands, in plain C. */
static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) {
    __uint128_t product = a;
    product *= b;
    return product;
}

/* Signed 64x64 -> 128-bit multiply of two by-value operands, in plain C. */
static __inline__ __int128_t widemuls(int64_t a, int64_t b) {
    __int128_t product = a;
    product *= b;
    return product;
}
/*
 * Return x unchanged, routed through an empty asm statement that declares
 * the value as read and written, so the compiler cannot see through it.
 */
static __inline__ uint64_t opacify(uint64_t x) {
    uint64_t hidden = x;
    __asm__ volatile("" : "+r"(hidden));
    return hidden;
}

/*
 * Branch-free zero test: the returned value is all ones when x == 0 and
 * zero otherwise.
 *
 * neg sets the carry flag exactly when its operand is non-zero; sbb of a
 * register with itself then yields 0 - carry, i.e. 0 for x == 0 and all ones
 * otherwise. The final ~ flips that into the returned mask.
 */
static __inline__ mask_t is_zero(uint64_t x) {
__asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
return ~x;
}

#endif /* __X86_64_ARITH_H__ */

+ 43
- 0
src/p480/f_arithmetic.c View File

@@ -0,0 +1,43 @@
/**
* @cond internal
* @file f_arithmetic.c
* @copyright
* Copyright (c) 2014 Cryptography Research, Inc. \n
* Released under the MIT License. See LICENSE.txt for license information.
* @author Mike Hamburg
* @brief Field-specific arithmetic.
*/

#include "ec_point.h"

/*
 * Addition chain that raises x to a fixed exponent and stores the result in a.
 *
 * Assuming field_sqrn(out, in, n) squares its input n times, the chain below
 * computes x^(2^478 - 2^238 - 1), i.e. x^((p-3)/4) for p = 2^480 - 2^240 - 1.
 * The running exponents are noted beside the steps that reach a new
 * "all ones" value x^(2^k - 1).
 *
 * a is written only by the final multiplication.
 */
void
field_isr (
struct field_t* a,
const struct field_t* x
) {
struct field_t L0, L1, L2, L3;
field_sqr ( &L2, x );
field_mul ( &L1, x, &L2 );
/* L1 = x^(2^2 - 1) */
field_sqrn ( &L0, &L1, 2 );
field_mul ( &L2, &L1, &L0 );
/* L2 = x^(2^4 - 1) */
field_sqrn ( &L0, &L2, 4 );
field_mul ( &L1, &L2, &L0 );
/* L1 = x^(2^8 - 1) */
field_sqr ( &L0, &L1 );
field_mul ( &L2, x, &L0 );
/* L2 = x^(2^9 - 1) */
field_sqrn ( &L0, &L2, 8 );
field_mul ( &L2, &L1, &L0 );
/* L2 = x^(2^17 - 1) */
field_sqrn ( &L0, &L2, 17 );
field_mul ( &L1, &L2, &L0 );
/* L1 = x^(2^34 - 1) */
field_sqrn ( &L0, &L1, 17 );
field_mul ( &L1, &L2, &L0 );
/* L1 = x^(2^51 - 1) */
field_sqrn ( &L3, &L1, 17 );
field_mul ( &L0, &L2, &L3 );
/* L0 = x^(2^68 - 1) */
field_sqrn ( &L2, &L0, 51 );
field_mul ( &L0, &L1, &L2 );
/* L0 = x^(2^119 - 1) */
field_sqrn ( &L1, &L0, 119 );
field_mul ( &L2, &L0, &L1 );
/* L2 = x^(2^238 - 1) */
field_sqr ( &L0, &L2 );
field_mul ( &L1, x, &L0 );
/* L1 = x^(2^239 - 1) */
field_sqrn ( &L0, &L1, 239 );
/* L0 = x^(2^478 - 2^239); multiplying by L2 gives x^(2^478 - 2^238 - 1) */
field_mul ( a, &L2, &L0 );
}

+ 39
- 0
src/p480/f_field.h View File

@@ -0,0 +1,39 @@
/**
* @file f_field.h
* @brief Field-specific code.
* @copyright
* Copyright (c) 2014 Cryptography Research, Inc. \n
* Released under the MIT License. See LICENSE.txt for license information.
* @author Mike Hamburg
*/
#ifndef __F_FIELD_H__
#define __F_FIELD_H__ 1

#include <string.h>
#include "constant_time.h"

#include "p480.h"
/* Size of the field in bits, followed by aliases that map each generic
 * field_* name onto the corresponding p480_* symbol. */
#define FIELD_BITS 480
#define field_t p480_t
#define field_mul p480_mul
#define field_sqr p480_sqr
#define field_add p480_add
#define field_sub p480_sub
#define field_mulw p480_mulw
#define field_addw p480_addw
#define field_subw p480_subw
#define field_neg p480_neg
#define field_set_ui p480_set_ui
#define field_bias p480_bias
#define field_cond_neg p480_cond_neg
#define field_inverse p480_inverse
#define field_eq p480_eq
#define field_isr p480_isr
#define field_simultaneous_invert p480_simultaneous_invert
#define field_weak_reduce p480_weak_reduce
#define field_strong_reduce p480_strong_reduce
#define field_serialize p480_serialize
#define field_deserialize p480_deserialize
#define field_is_zero p480_is_zero

#endif /* __F_FIELD_H__ */

+ 35
- 0
src/p480/f_magic.h View File

@@ -0,0 +1,35 @@
/**
* @file f_magic.h
* @copyright
* Copyright (c) 2014 Cryptography Research, Inc. \n
* Released under the MIT License. See LICENSE.txt for license information.
* @author Mike Hamburg
* @brief Goldilocks magic numbers (group orders, coefficients, algo params etc).
*/

#ifndef __GOLDI_F_MAGIC_H__
#define __GOLDI_F_MAGIC_H__ 1

#include "field.h"
#include "ec_point.h"

/**
* @brief The Edwards "d" term for this curve.
*/
static const int64_t EDWARDS_D = 53825;

/** @brief The number of combs to use for signed comb algo */
#define COMB_N (USE_BIG_COMBS ? 6 : 5)

/** @brief The number of teeth of the combs for signed comb algo */
#define COMB_T (USE_BIG_COMBS ? 5 : 4)

/** @brief The spacing of the combs for signed comb algo */
#define COMB_S (USE_BIG_COMBS ? 16 : 24)

/**
* @brief crandom magic structure guard constant = "return 4", cf xkcd #221
*/
#define CRANDOM_MAGIC 0x72657475726e2034ull

#endif /* __GOLDI_F_MAGIC_H__ */

+ 68
- 0
src/p480/magic.c View File

@@ -0,0 +1,68 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

#include "field.h"
#include "magic.h"
#include "barrett_field.h"

/* FUTURE: automatically generate this file? */

/* The field modulus as FIELD_BYTES little-endian bytes: sixty bytes of 0xFF
 * except byte 30, which is 0xfe. Clearing bit 240 of 2^480 - 1 in this way
 * gives 2^480 - 2^240 - 1. */
const uint8_t FIELD_MODULUS[FIELD_BYTES] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
/*!*/ 0xfe, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};

/* Precomputed constant table of 2*SCALAR_WORDS words. The sixteen
 * initializers form two groups of eight words each.
 * NOTE(review): presumably two scalar-sized adjustment values for the
 * fixed-window scalar multiplication, least significant word first -
 * confirm against the code that consumes this table.
 * NOTE(review): the last word of each group is written as a bare literal
 * rather than through U64LE like its neighbours - confirm this is
 * intentional. */
const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = {
U64LE(0x58b51bc56ea8f0c4),
U64LE(0xd361f6a2348b50c9),
U64LE(0x08089c139c0002ae),
U64LE(0x0001d2ac3d9503a0),
U64LE(0x0000000000000000),
U64LE(0x0000000000000000),
U64LE(0x0000000000000000),
0x40000000,
U64LE(0xcb9c25073e36965b),
U64LE(0x6f2d48d8460f1661),
U64LE(0x0ab6256f7aaaae3e),
U64LE(0x00026e3afcc6af80),
U64LE(0x0000000000000000),
U64LE(0x0000000000000000),
U64LE(0x0000000000000000),
0x00000000
};

/* Base point constant, given as two field elements. The first is spelled out
 * as eight U60LE limb initializers; the second has only its first limb set,
 * to 5.
 * NOTE(review): presumably these are the affine x and y coordinates in that
 * order - confirm against the definition of struct affine_t. */
const struct affine_t goldilocks_base_point = {
{{
U60LE(0x849ff7f845c30d3),
U60LE(0x7dda488553a4c5b),
U60LE(0x1d3a2d9844831ea),
U60LE(0xb33ecf6ade470a2),
U60LE(0x8b3cb95210bd3c3),
U60LE(0xfc955e59aeefa65),
U60LE(0x3ab247cd530013c),
U60LE(0x7ca42af3d564280)
}},
{{ 5 }}
};

/* Word array sized to hold 240 bits, referenced by curve_prime_order below.
 * NOTE(review): presumably the low-order part of the prime group order in a
 * form the Barrett code expects - confirm against struct barrett_prime_t. */
static const word_t curve_prime_order_lo[(240+WORD_BITS-1)/WORD_BITS] = {
U64LE(0x72e70941cf8da597),
U64LE(0x9bcb52361183c598),
U64LE(0x02ad895bdeaaab8f),
U64LE(0x9b8ebf31abe0)
};
/* Barrett descriptor for the group order. The four initializers are, in
 * order: a word count (GOLDI_FIELD_WORDS), a shift amount (30 % WORD_BITS),
 * the number of words in curve_prime_order_lo, and a pointer to that array.
 * NOTE(review): field meanings are inferred from the initializer
 * expressions; the struct definition is not part of this file. */
const struct barrett_prime_t curve_prime_order = {
GOLDI_FIELD_WORDS,
30 % WORD_BITS,
sizeof(curve_prime_order_lo)/sizeof(curve_prime_order_lo[0]),
curve_prime_order_lo
};

/* Field element with limb 0 set to 232 and all other limbs zero.
 * 232 * 232 = 53824 = 53825 - 1, so for the d = 53825 declared for this
 * field the square root of d - 1 is a small integer. */
const struct field_t
sqrt_d_minus_1 = {{
232 /* Whoa, it comes out even. */
}};

+ 43
- 0
src/p521/f_arithmetic.c View File

@@ -0,0 +1,43 @@
/**
* @cond internal
* @file f_arithmetic.c
* @copyright
* Copyright (c) 2014 Cryptography Research, Inc. \n
* Released under the MIT License. See LICENSE.txt for license information.
* @author Mike Hamburg
* @brief Field-specific arithmetic.
*/

#include "ec_point.h"

/*
 * Addition chain that raises x to a fixed exponent and stores the result in a.
 *
 * Assuming field_sqrn(out, in, n) squares its input n times, the chain below
 * computes x^(2^519 - 1), i.e. x^((p-3)/4) for p = 2^521 - 1.
 * The running exponents are noted beside the steps that reach a new
 * "all ones" value x^(2^k - 1).
 *
 * a is written only by the final multiplication.
 */
void
field_isr (
struct field_t* a,
const struct field_t* x
) {
struct field_t L0, L1, L2;
field_sqr ( &L1, x );
field_mul ( &L0, x, &L1 );
/* L0 = x^(2^2 - 1) */
field_sqrn ( &L2, &L0, 2 );
field_mul ( &L1, &L0, &L2 );
/* L1 = x^(2^4 - 1) */
field_sqrn ( &L2, &L1, 4 );
field_mul ( &L0, &L1, &L2 );
/* L0 = x^(2^8 - 1) */
field_sqrn ( &L2, &L0, 8 );
field_mul ( &L1, &L0, &L2 );
/* L1 = x^(2^16 - 1) */
field_sqrn ( &L2, &L1, 16 );
field_mul ( &L0, &L1, &L2 );
/* L0 = x^(2^32 - 1) */
field_sqrn ( &L2, &L0, 32 );
field_mul ( &L1, &L0, &L2 );
/* L1 = x^(2^64 - 1) */
field_sqr ( &L2, &L1 );
field_mul ( &L0, x, &L2 );
/* L0 = x^(2^65 - 1) */
field_sqrn ( &L2, &L0, 64 );
field_mul ( &L0, &L1, &L2 );
/* L0 = x^(2^129 - 1) */
field_sqrn ( &L2, &L0, 129 );
field_mul ( &L1, &L0, &L2 );
/* L1 = x^(2^258 - 1) */
field_sqr ( &L2, &L1 );
field_mul ( &L0, x, &L2 );
/* L0 = x^(2^259 - 1) */
field_sqrn ( &L2, &L0, 259 );
field_mul ( &L1, &L0, &L2 );
/* L1 = x^(2^518 - 1) */
field_sqr ( &L0, &L1 );
/* L0 = x^(2^519 - 2); multiplying by x gives x^(2^519 - 1) */
field_mul ( a, x, &L0 );
}

+ 39
- 0
src/p521/f_field.h View File

@@ -0,0 +1,39 @@
/**
* @file f_field.h
* @brief Field-specific code.
* @copyright
* Copyright (c) 2014 Cryptography Research, Inc. \n
* Released under the MIT License. See LICENSE.txt for license information.
* @author Mike Hamburg
*/
#ifndef __F_FIELD_H__
#define __F_FIELD_H__ 1

#include <string.h>
#include "constant_time.h"

#include "p521.h"
/* Size of the field in bits, followed by aliases that map each generic
 * field_* name onto the corresponding p521_* symbol. */
#define FIELD_BITS 521
#define field_t p521_t
#define field_mul p521_mul
#define field_sqr p521_sqr
#define field_add p521_add
#define field_sub p521_sub
#define field_mulw p521_mulw
#define field_addw p521_addw
#define field_subw p521_subw
#define field_neg p521_neg
#define field_set_ui p521_set_ui
#define field_bias p521_bias
#define field_cond_neg p521_cond_neg
#define field_inverse p521_inverse
#define field_eq p521_eq
#define field_isr p521_isr
#define field_simultaneous_invert p521_simultaneous_invert
#define field_weak_reduce p521_weak_reduce
#define field_strong_reduce p521_strong_reduce
#define field_serialize p521_serialize
#define field_deserialize p521_deserialize
#define field_is_zero p521_is_zero

#endif /* __F_FIELD_H__ */

+ 5
- 6
test/bench.c View File

@@ -39,13 +39,12 @@ static void q448_randomize( struct crandom_state_t *crand, word_t sk[SCALAR_WORD
}

static void field_print( const char *descr, const struct field_t *a ) {
field_t b;
field_copy(&b, a);
field_strong_reduce(&b);
int j;
unsigned char ser[FIELD_BYTES];
field_serialize(ser,a);
printf("%s = 0x", descr);
for (j=sizeof(*a)/sizeof(a->limb[0])-1; j>=0; j--) {
printf(PRIxWORD58, b.limb[j]);
for (j=FIELD_BYTES - 1; j>=0; j--) {
printf("%02x", ser[j]);
}
printf("\n");
}
@@ -58,7 +57,7 @@ field_print_full (
int j;
printf("%s = 0x", descr);
for (j=15; j>=0; j--) {
printf("%02" PRIxWORD "_" PRIxWORD58 " ",
printf("%02" PRIxWORD "_" PRIxWORD56 " ",
a->limb[j]>>28, a->limb[j]&((1<<28)-1));
}
printf("\n");


+ 4
- 5
test/test.c View File

@@ -84,13 +84,12 @@ void field_print (
const char *descr,
const struct field_t *a
) {
field_t b;
field_copy(&b, a);
field_strong_reduce(&b);
int j;
unsigned char ser[FIELD_BYTES];
field_serialize(ser,a);
printf("%s = 0x", descr);
for (j=FIELD_WORDS - 1; j>=0; j--) {
printf(PRIxWORD58, b.limb[LIMBPERM(j)]);
for (j=FIELD_BYTES - 1; j>=0; j--) {
printf("%02x", ser[j]);
}
printf("\n");
}


+ 51
- 12
test/test_arithmetic.c View File

@@ -22,6 +22,8 @@ static mask_t mpz_to_field (

static mask_t field_assert_eq_gmp(
const char *descr,
const struct field_t *a,
const struct field_t *b,
const struct field_t *x,
const mpz_t y,
float lowBound,
@@ -40,7 +42,7 @@ static mask_t field_assert_eq_gmp(
unsigned int i;
for (i=0; i<sizeof(*x)/sizeof(x->limb[0]); i++) {
int radix_bits = sizeof(x->limb[0]) * 448 / sizeof(*x);
int radix_bits = sizeof(x->limb[0]) * FIELD_BITS / sizeof(*x);
word_t yardstick = (i==sizeof(*x)/sizeof(x->limb[0])/2) ?
(1ull<<radix_bits) - 2 : (1ull<<radix_bits) - 1; // FIELD_MAGIC
if (x->limb[i] < yardstick * lowBound || x->limb[i] > yardstick * highBound) {
@@ -54,6 +56,8 @@ static mask_t field_assert_eq_gmp(
if (memcmp(xser,yser,FIELD_BYTES)) {
youfail();
printf(" Failed arithmetic test %s\n", descr);
field_print(" a", a);
field_print(" b", b);
field_print(" goldi", x);
printf(" gmp = 0x");
int j;
@@ -82,28 +86,30 @@ static mask_t test_add_sub (
field_add(&tt,&xx,&yy);
mpz_add(t,x,y);
succ &= field_assert_eq_gmp("add",&tt,t,0,2.1);
succ &= field_assert_eq_gmp("add",&xx,&yy,&tt,t,0,2.1);
field_sub(&tt,&xx,&yy);
field_bias(&tt,2);
mpz_sub(t,x,y);
succ &= field_assert_eq_gmp("sub",&tt,t,0,3.1);
succ &= field_assert_eq_gmp("sub",&xx,&yy,&tt,t,0,3.1);
field_copy(&tt,&xx);
field_addw(&tt,word);
mpz_add_ui(t,x,word);
succ &= field_assert_eq_gmp("addw",&tt,t,0,2.1);
succ &= field_assert_eq_gmp("addw",&xx,&yy,&tt,t,0,2.1);
field_copy(&tt,&xx);
field_subw(&tt,word);
field_bias(&tt,1);
mpz_sub_ui(t,x,word);
succ &= field_assert_eq_gmp("subw",&tt,t,0,2.1);
succ &= field_assert_eq_gmp("subw",&xx,&yy,&tt,t,0,2.1);

/*
if (!succ) {
field_print(" x", &xx);
field_print(" y", &yy);
}
*/
mpz_clear(t);
@@ -124,19 +130,19 @@ static mask_t test_mul_sqr (
field_mul(&tt,&xx,&yy);
mpz_mul(t,x,y);
succ &= field_assert_eq_gmp("mul",&tt,t,0,1.1);
succ &= field_assert_eq_gmp("mul",&xx,&yy,&tt,t,0,1.1);
field_mulw(&tt,&xx,word);
mpz_mul_ui(t,x,word);
succ &= field_assert_eq_gmp("mulw",&tt,t,0,1.1);
succ &= field_assert_eq_gmp("mulw",&xx,&yy,&tt,t,0,1.1);
field_sqr(&tt,&xx);
mpz_mul(t,x,x);
succ &= field_assert_eq_gmp("sqrx",&tt,t,0,1.1);
succ &= field_assert_eq_gmp("sqrx",&xx,&yy,&tt,t,0,1.1);
field_sqr(&tt,&yy);
mpz_mul(t,y,y);
succ &= field_assert_eq_gmp("sqy",&tt,t,0,1.1);
succ &= field_assert_eq_gmp("sqy",&xx,&yy,&tt,t,0,1.1);
if (!succ) {
field_print(" x", &xx);
@@ -148,6 +154,36 @@ static mask_t test_mul_sqr (
return succ;
}

/**
 * Sanity-check field_isr on one input.
 *
 * With s = field_isr(x), the product x * s^2 must be +1 or -1 for nonzero x
 * (and 0 when x itself is 0).  The previous version computed that product
 * into yy but then tested s^2 instead, and OR-ed the verdict into a mask that
 * already held the deserialization result, so the failure branch could not
 * fire whenever deserialization succeeded.
 *
 * NOTE(review): assumes mask_t uses all-ones for "true"/success, as the
 * `succ &= ...` / `if (~succ)` usage in this file suggests -- confirm.
 *
 * @param x  Test value as a GMP integer.
 * @return   All-ones mask when x converted successfully and the check held;
 *           otherwise a mask with some bit clear.
 */
static mask_t test_isr (
    const mpz_t x
) {
    struct field_t xx, yy, ss, tt;
    mask_t succ = mpz_to_field(&xx, x);

    field_isr(&ss, &xx);
    field_sqr(&tt, &ss);          /* tt = s^2, kept intact for diagnostics */
    field_mul(&yy, &xx, &tt);     /* yy = x * s^2 */

    /* Degenerate input: x == 0 yields x * s^2 == 0. */
    mask_t valid = field_is_zero(&xx) & field_is_zero(&yy);

    /* x * s^2 == -1 ? */
    field_addw(&yy, 1);
    valid |= field_is_zero(&yy);

    /* x * s^2 == +1 ?  Bias after the subtraction, as the other tests do,
     * so the limbs do not underflow. */
    field_subw(&yy, 2);
    field_bias(&yy, 1);
    valid |= field_is_zero(&yy);

    /* Undo the offsets so yy prints as x * s^2 again. */
    field_addw(&yy, 1);

    succ &= valid;
    if (~succ) {
        youfail();
        printf("ISR failure.\n");
        field_print(" x", &xx);
        field_print(" s", &ss);
        field_print(" t", &tt);
        field_print(" y", &yy);
    }

    return succ;
}

int test_arithmetic (void) {
int j, ntests = 100000;
@@ -168,8 +204,8 @@ int test_arithmetic (void) {
if (j<256) {
mpz_set_ui(x,0);
mpz_set_ui(y,0);
mpz_setbit(x,(j%16)*28); // FIELD_MAGIC
mpz_setbit(y,(j/16)*28); // FIELD_MAGIC
mpz_setbit(x,(j%16)*28);
mpz_setbit(y,(j/16)*28);
} else if (j&1) {
mpz_rrandomb(x, state, FIELD_BITS);
mpz_rrandomb(y, state, FIELD_BITS);
@@ -183,6 +219,9 @@ int test_arithmetic (void) {
succ &= test_add_sub(x,y,word);
succ &= test_mul_sqr(x,y,word);
if (j < 1000)
succ &= test_isr(x);
// TODO: test neg, cond_neg, set_ui, wrd, srd, inv, ...?
}


+ 10
- 0
test/test_pointops.c View File

@@ -3,6 +3,7 @@
#include <stdio.h>

#include "ec_point.h"
#include "magic.h"
#include "field.h"
#include "crandom.h"

@@ -256,6 +257,15 @@ int test_pointops (void) {
struct crandom_state_t crand;
crandom_init_from_buffer(&crand, "test_pointops random initializer");
struct extensible_t ext_base;
if (!validate_affine(&goldilocks_base_point)) {
youfail();
printf(" Base point isn't on the curve.\n");
return -1;
}
convert_affine_to_extensible(&ext_base, &goldilocks_base_point);
if (!validate_ext(&ext_base, 2, "base")) return -1;
int i, ret;
for (i=0; i<1000; i++) {
uint8_t ser[FIELD_BYTES];


+ 8
- 2
test/test_scalarmul.c View File

@@ -39,8 +39,14 @@ single_scalarmul_compatibility_test (
if (!succ) {
return 1;
}
struct { int n,t,s; } params[] = {{5,5,18},{3,5,30},{4,4,28},{1,2,224}}; // FIELD_MAGIC

#if FIELD_BITS == 448
struct { int n,t,s; } params[] = {{5,5,18},{3,5,30},{4,4,28},{1,2,224}};
#elif FIELD_BITS == 480
struct { int n,t,s; } params[] = {{5,6,16},{6,5,16},{4,5,24},{4,4,30},{1,2,240}};
#else
struct { int n,t,s; } params[] = {{5,5,(SCALAR_BITS+24)/25},{1,2,(SCALAR_BITS+1)/2}};
#endif
const int nparams = sizeof(params)/sizeof(params[0]);
struct fixed_base_table_t fbt;
const int nsizes = 6;


Loading…
Cancel
Save