Browse Source

Import the code

master
Michael Hamburg 11 years ago
parent
commit
25697caf5c
18 changed files with 4833 additions and 0 deletions
  1. +42
    -0
      Makefile
  2. +235
    -0
      barrett_field.c
  3. +116
    -0
      barrett_field.h
  4. +780
    -0
      bench.c
  5. +381
    -0
      crandom.c
  6. +66
    -0
      crandom.h
  7. +621
    -0
      ec_point.c
  8. +440
    -0
      ec_point.h
  9. +3
    -0
      exported.sym
  10. +168
    -0
      goldilocks.c
  11. +34
    -0
      goldilocks.h
  12. +177
    -0
      intrinsics.h
  13. +387
    -0
      p448.c
  14. +242
    -0
      p448.h
  15. +728
    -0
      scalarmul.c
  16. +112
    -0
      scalarmul.h
  17. +55
    -0
      word.h
  18. +246
    -0
      x86-64-arith.h

+ 42
- 0
Makefile View File

@@ -0,0 +1,42 @@
# Copyright (c) 2014 Cryptography Research, Inc.
# Released under the MIT License. See LICENSE.txt for license information.

CC = clang
CFLAGS = -O3 -std=c99 -pedantic -Wall -Wextra -Werror \
-mavx2 -DMUST_HAVE_SSSE3 -mbmi2 \
-ffunction-sections -fdata-sections -fomit-frame-pointer -fPIC

.PHONY: clean all runbench
.PRECIOUS: build/%.s
HEADERS= Makefile $(shell find . -name "*.h") build/timestamp

LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \
build/p448.o build/ec_point.o build/scalarmul.o

all: bench
bench: *.h *.c
$(CC) $(CFLAGS) -o $@ *.c
build/timestamp:
mkdir -p build
touch $@

build/%.o: build/%.s
$(CC) -c -o $@ $<

build/%.s: %.c $(HEADERS)
$(CC) $(CFLAGS) -S -c -o $@ $<

build/goldilocks.so: $(LIBCOMPONENTS)
rm -f $@
libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \
-exported_symbols_list exported.sym \
$(LIBCOMPONENTS)
runbench: bench
./$<

clean:
rm -fr build bench *.o *.s

+ 235
- 0
barrett_field.c View File

@@ -0,0 +1,235 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

#include "barrett_field.h"
#include <assert.h>

word_t
add_nr_ext_packed(
word_t *out,
const word_t *a,
int nwords_a,
const word_t *c,
int nwords_c,
word_t mask
) {
int i;
dword_t carry = 0;
for (i=0; i<nwords_c; i++) {
out[i] = carry = carry + a[i] + (c[i]&mask);
carry >>= WORD_BITS;
}
for (; i<nwords_a; i++) {
out[i] = carry = carry + a[i];
carry >>= WORD_BITS;
}
return carry;
}

static __inline__ word_t
add_nr_packed(
word_t *a,
const word_t *c,
int nwords
) {
int i;
dword_t carry = 0;
for (i=0; i<nwords; i++) {
a[i] = carry = carry + a[i] + c[i];
carry >>= WORD_BITS;
}
return carry;
}

static __inline__ word_t
sub_nr_packed(
word_t *a,
const word_t *c,
int nwords
) {
int i;
dsword_t carry = 0;
for (i=0; i<nwords; i++) {
a[i] = carry = carry + a[i] - c[i];
carry >>= WORD_BITS;
}
return carry;
}

word_t
sub_nr_ext_packed(
word_t *out,
const word_t *a,
int nwords_a,
const word_t *c,
int nwords_c,
word_t mask
) {
int i;
dsword_t carry = 0;
for (i=0; i<nwords_c; i++) {
out[i] = carry = carry + a[i] - (c[i]&mask);
carry >>= WORD_BITS;
}
for (; i<nwords_a; i++) {
out[i] = carry = carry + a[i];
carry >>= WORD_BITS;
}
return carry;
}

static word_t
widemac(
word_t *accum,
int nwords_accum,
const word_t *mier,
int nwords_mier,
word_t mand,
word_t carry
) {
int i;
assert(nwords_accum >= nwords_mier);
for (i=0; i<nwords_mier; i++) {
/* UMAAL chain for the wordy part of p */
dword_t product = ((dword_t)mand) * mier[i];
product += accum[i];
product += carry;
accum[i] = product;
carry = product >> WORD_BITS;
}
for (; i<nwords_accum; i++) {
dword_t sum = ((dword_t)carry) + accum[i];
accum[i] = sum;
carry = sum >> WORD_BITS;
}
return carry;
}

void
barrett_reduce(
word_t *a,
int nwords_a,
word_t a_carry,
const word_t *p_lo,
int nwords_p,
int nwords_lo,
int p_shift
) {
/* TODO: non 2^k-c primes. */
int repeat, nwords_left_in_a=nwords_a;
/* TODO: is there a point to this a_carry business? */
assert(a_carry < ((word_t)1)<<p_shift && nwords_a >= nwords_p);
for (; nwords_left_in_a >= nwords_p; nwords_left_in_a--) {
for (repeat=0; repeat<2; repeat++) {
/* PERF: surely a more careful implementation could
* avoid this double round
*/
word_t mand = a[nwords_left_in_a-1] >> p_shift;
a[nwords_left_in_a-1] &= (((word_t)1)<<p_shift)-1;
if (p_shift && !repeat) {
/* collect high bits when there are any */
if (nwords_left_in_a < nwords_a) {
mand |= a[nwords_left_in_a] << (WORD_BITS-p_shift);
a[nwords_left_in_a] = 0;
} else {
mand |= a_carry << (WORD_BITS-p_shift);
}
}
word_t carry = widemac(a+nwords_left_in_a-nwords_p, nwords_p, p_lo, nwords_lo, mand, 0);
assert(!carry);
(void)carry;
}
}
assert(nwords_left_in_a == nwords_p-1);
/* OK, but it still isn't reduced. Add and subtract p_lo. */
word_t cout = add_nr_ext_packed(a,a,nwords_p,p_lo,nwords_lo,-1);
if (p_shift) {
cout = (cout<<(WORD_BITS-p_shift)) + (a[nwords_p-1]>>p_shift);
a[nwords_p-1] &= (((word_t)1)<<p_shift)-1;
}
/* mask = carry-1: if no carry then do sub, otherwise don't */
sub_nr_ext_packed(a,a,nwords_p,p_lo,nwords_lo,cout-1);
}

/* PERF: This function is horribly slow. Enough to break 1%. */
void
barrett_mul_or_mac(
word_t *accum,
int nwords_accum,
const word_t *a,
int nwords_a,
const word_t *b,
int nwords_b,
const word_t *p_lo,
int nwords_p,
int nwords_lo,
int p_shift,
mask_t doMac
) {
assert(nwords_accum >= nwords_p);
/* nwords_tmp = max(nwords_a + 1, nwords_p + 1, nwords_accum if doMac); */
int nwords_tmp = (nwords_a > nwords_p) ? nwords_a : nwords_p;
nwords_tmp++;
if (nwords_tmp < nwords_accum && doMac)
nwords_tmp = nwords_accum;
word_t tmp[nwords_tmp];
int bpos, i;
for (i=0; i<nwords_tmp; i++) {
tmp[i] = 0;
}
if (doMac) {
for (i=0; i<nwords_accum; i++) {
tmp[i] = accum[i];
}

barrett_reduce(tmp, nwords_tmp, 0, p_lo, nwords_p, nwords_lo, p_shift);
}
for (bpos=nwords_b-1; bpos >= 0; bpos--) {
/* Invariant at the beginning of the loop: the high word is unused. */
assert(tmp[nwords_tmp-1] == 0);
/* shift up */
for (i=nwords_tmp-2; i>=0; i--) {
tmp[i+1] = tmp[i];
}

/* mac and reduce */
word_t carry = widemac(tmp, nwords_tmp, a, nwords_a, b[bpos], 0);
/* the mac can't carry, because nwords_tmp >= nwords_a+1 and its high word is clear */
assert(!carry);
barrett_reduce(tmp, nwords_tmp, carry, p_lo, nwords_p, nwords_lo, p_shift);
/* at this point, the number of words used is nwords_p <= nwords_tmp-1,
* so the high word is again clear */
}
for (i=0; i<nwords_tmp && i<nwords_accum; i++) {
accum[i] = tmp[i];
}
for (; i<nwords_tmp; i++) {
assert(tmp[i] == 0);
}
for (; i<nwords_accum; i++) {
accum[i] = 0;
}
}

+ 116
- 0
barrett_field.h View File

@@ -0,0 +1,116 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#ifndef __BARRETT_FIELD_H__
#define __BARRETT_FIELD_H__ 1

#include "word.h"

#ifdef __cplusplus
extern "C" {
#endif

void
barrett_reduce(
word_t *a,
int nwords_a,
word_t a_carry,
const word_t *p_lo,
int nwords_p,
int nwords_lo,
int p_shift
);
/*
* out = a+(c&mask), with carry returned.
* #out must equal #a (HACK?)
*/
word_t
add_nr_ext_packed(
word_t *out,
const word_t *a,
int nwords_a,
const word_t *c,
int nwords_c,
word_t mask
);
word_t
sub_nr_ext_packed(
word_t *out,
const word_t *a,
int nwords_a,
const word_t *c,
int nwords_c,
word_t mask
);

/*
* If doMac, accum = accum + a*b mod p.
* Otherwise, accum = a*b mod p.
*
* This function is not __restrict__; you may pass accum,
* a, b, etc all from the same location.
*/
void
barrett_mul_or_mac(
word_t *accum,
int nwords_accum,

const word_t *a,
int nwords_a,

const word_t *b,
int nwords_b,

const word_t *p_lo,
int nwords_p,
int nwords_lo,
int p_shift,
mask_t doMac
);
static inline void
barrett_mul(
word_t *out,
int nwords_out,

const word_t *a,
int nwords_a,

const word_t *b,
int nwords_b,

const word_t *p_lo,
int nwords_p,
int nwords_lo,
int p_shift
) {
barrett_mul_or_mac(out,nwords_out,a,nwords_a,b,nwords_b,p_lo,nwords_p,nwords_lo,p_shift,0);
}
static inline void
barrett_mac(
word_t *out,
int nwords_out,

const word_t *a,
int nwords_a,

const word_t *b,
int nwords_b,

const word_t *p_lo,
int nwords_p,
int nwords_lo,
int p_shift
) {
barrett_mul_or_mac(out,nwords_out,a,nwords_a,b,nwords_b,p_lo,nwords_p,nwords_lo,p_shift,-1);
}

#ifdef __cplusplus
}; /* extern "C" */
#endif

#endif /* __BARRETT_FIELD_H__ */

+ 780
- 0
bench.c View File

@@ -0,0 +1,780 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

#include <sys/time.h>
#include <sys/types.h>
#include <stdlib.h>
#include <stdio.h>
#include <memory.h>

#include "p448.h"
#include "ec_point.h"
#include "scalarmul.h"
#include "barrett_field.h"
#include "crandom.h"
#include "goldilocks.h"

word_t q448_lo[4] = {
0xdc873d6d54a7bb0dull,
0xde933d8d723a70aaull,
0x3bb124b65129c96full,
0x000000008335dc16ull
};

double now() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + tv.tv_usec/1000000.0;
}

void p448_randomize( struct crandom_state_t *crand, struct p448_t *a ) {
crandom_generate(crand, (unsigned char *)a, sizeof(*a));
p448_strong_reduce(a);
}

void q448_randomize( struct crandom_state_t *crand, uint64_t sk[7] ) {
crandom_generate(crand, (unsigned char *)sk, sizeof(uint64_t)*7);
}

void p448_print( const char *descr, const struct p448_t *a ) {
p448_t b;
p448_copy(&b, a);
p448_strong_reduce(&b);
int j;
printf("%s = 0x", descr);
for (j=7; j>=0; j--) {
printf("%014llx", (unsigned long long)b.limb[j]);
}
printf("\n");
}

void p448_print_full( const char *descr, const struct p448_t *a ) {
int j;
printf("%s = 0x", descr);
for (j=7; j>=0; j--) {
printf("%02llx_%014llx ", a->limb[j]>>56, (unsigned long long)a->limb[j]&(1ull<<56)-1);
}
printf("\n");
}

void q448_print( const char *descr, const uint64_t secret[7] ) {
int j;
printf("%s = 0x", descr);
for (j=6; j>=0; j--) {
printf("%016llx", (unsigned long long)secret[j]);
}
printf("\n");
}

int main(int argc, char **argv) {
(void)argc;
(void)argv;

struct tw_extensible_t ext;
struct extensible_t exta;
struct tw_niels_t niels;
struct tw_pniels_t pniels;
struct affine_t affine;
struct montgomery_t mb;
struct p448_t a,b,c,d;
double when;
int i,j;
/* Bad randomness so we can debug. */
char initial_seed[32];
for (i=0; i<32; i++) initial_seed[i] = i;
struct crandom_state_t crand;
crandom_init_from_buffer(&crand, initial_seed);
uint64_t sk[7],tk[7];
q448_randomize(&crand, sk);
when = now();
for (i=0; i<10000000; i++) {
p448_mul(&c, &b, &a);
}
when = now() - when;
printf("mul: %5.1fns\n", when * 1e9 / i);
when = now();
for (i=0; i<10000000; i++) {
p448_sqr(&c, &a);
}
when = now() - when;
printf("sqr: %5.1fns\n", when * 1e9 / i);
when = now();
for (i=0; i<5000000; i++) {
p448_mul(&c, &b, &a);
p448_mul(&a, &b, &c);
}
when = now() - when;
printf("mul dep: %5.1fns\n", when * 1e9 / i / 2);
when = now();
for (i=0; i<10000000; i++) {
p448_mulw(&c, &b, 1234562);
}
when = now() - when;
printf("mulw: %5.1fns\n", when * 1e9 / i);
when = now();
for (i=0; i<100000; i++) {
p448_randomize(&crand, &a);
}
when = now() - when;
printf("rand448: %5.1fns\n", when * 1e9 / i);
when = now();
for (i=0; i<10000; i++) {
p448_isr(&c, &a);
}
when = now() - when;
printf("isr auto: %5.1fµs\n", when * 1e6 / i);
for (i=0; i<100; i++) {
p448_randomize(&crand, &a);
p448_isr(&d,&a);
p448_sqr(&b,&d);
p448_mul(&c,&b,&a);
p448_sqr(&b,&c);
p448_subw(&b,1);
p448_bias(&b,1);
if (!p448_is_zero(&b)) {
printf("ISR validation failure!\n");
p448_print("a", &a);
p448_print("s", &d);
}
}
when = now();
for (i=0; i<10000; i++) {
elligator_2s_inject(&affine, &a);
}
when = now() - when;
printf("elligator: %5.1fµs\n", when * 1e6 / i);
for (i=0; i<100; i++) {
p448_randomize(&crand, &a);
elligator_2s_inject(&affine, &a);
if (!p448_affine_validate(&affine)) {
printf("Elligator validation failure!\n");
p448_print("a", &a);
p448_print("x", &affine.x);
p448_print("y", &affine.y);
}
}
when = now();
for (i=0; i<10000; i++) {
affine_deserialize(&affine, &a);
}
when = now() - when;
printf("decompress: %5.1fµs\n", when * 1e6 / i);
when = now();
for (i=0; i<10000; i++) {
extensible_serialize(&a, &exta);
}
when = now() - when;
printf("compress: %5.1fµs\n", when * 1e6 / i);
int goods = 0;
for (i=0; i<100; i++) {
p448_randomize(&crand, &a);
mask_t good = affine_deserialize(&affine, &a);
if (good & !p448_affine_validate(&affine)) {
printf("Deserialize validation failure!\n");
p448_print("a", &a);
p448_print("x", &affine.x);
p448_print("y", &affine.y);
} else if (good) {
goods++;
convert_affine_to_extensible(&exta,&affine);
extensible_serialize(&b, &exta);
p448_sub(&c,&b,&a);
p448_bias(&c,2);
if (!p448_is_zero(&c)) {
printf("Reserialize validation failure!\n");
p448_print("a", &a);
p448_print("x", &affine.x);
p448_print("y", &affine.y);
affine_deserialize(&affine, &b);
p448_print("b", &b);
p448_print("x", &affine.x);
p448_print("y", &affine.y);
printf("\n");
}
}
}
if (goods<i/3) {
printf("Deserialization validation failure! Deserialized %d/%d points\n", goods, i);
}
uint64_t lsk[12];
for (i=0;i<10; i++) {
for (j=11; j>=0; j--) {
lsk[j] = random();
lsk[j] = lsk[j]<<22 ^ random();
lsk[j] = lsk[j]<<22 ^ random();
}
}
when = now();
for (i=0; i<1000000; i++) {
barrett_reduce(lsk,12,0,q448_lo,7,4,62);
}
when = now() - when;
printf("barrett red: %5.1fns\n", when * 1e9 / i);
when = now();
for (i=0; i<100000; i++) {
barrett_mac(lsk,7,lsk,7,lsk,7,q448_lo,7,4,62);
}
when = now() - when;
printf("barrett mac: %5.1fns\n", when * 1e9 / i);
when = now();
for (i=0; i<1000000; i++) {
p448_tw_extensible_add_niels(&ext, &niels);
}
when = now() - when;
printf("exti+niels: %5.1fns\n", when * 1e9 / i);
when = now();
for (i=0; i<1000000; i++) {
p448_tw_extensible_add_pniels(&ext, &pniels);
}
when = now() - when;
printf("exti+pniels: %5.1fns\n", when * 1e9 / i);
when = now();
for (i=0; i<1000000; i++) {
p448_tw_extensible_double(&ext);
}
when = now() - when;
printf("exti dbl: %5.1fns\n", when * 1e9 / i);
when = now();
for (i=0; i<1000000; i++) {
p448_isogeny_tw_to_un(&exta, &ext);
}
when = now() - when;
printf("i->a isog: %5.1fns\n", when * 1e9 / i);
when = now();
for (i=0; i<1000000; i++) {
p448_isogeny_un_to_tw(&ext, &exta);
}
when = now() - when;
printf("a->i isog: %5.1fns\n", when * 1e9 / i);
when = now();
for (i=0; i<1000000; i++) {
p448_montgomery_step(&mb);
}
when = now() - when;
printf("monty step: %5.1fns\n", when * 1e9 / i);
when = now();
for (i=0; i<1000; i++) {
p448_montgomery_ladder(&a,&b,sk,448,0);
}
when = now() - when;
printf("full ladder: %5.1fµs\n", when * 1e6 / i);
when = now();
for (i=0; i<1000; i++) {
edwards_scalar_multiply(&ext,sk);
}
when = now() - when;
printf("edwards smz: %5.1fµs\n", when * 1e6 / i);
when = now();
int sum = 0;
for (i=0; i<1000; i++) {
q448_randomize(&crand, sk);
sum += edwards_scalar_multiply_vt(&ext,sk);
}
when = now() - when;
printf("edwards vtm: %5.1fµs (%0.2f avg bits = 1.5 + 448/%0.2f)\n",
when * 1e6 / i, 1.0*sum/i, 448.0*i/(sum-1.5*i));
struct tw_niels_t wnaft[1<<6];
when = now();
for (i=0; i<1000; i++) {
precompute_for_wnaf(wnaft,&ext,6);
}
when = now() - when;
printf("wnaf6 pre: %5.1fµs\n", when * 1e6 / i);
when = now();
for (i=0; i<1000; i++) {
q448_randomize(&crand, sk);
edwards_scalar_multiply_vt_pre(&ext,sk,wnaft,6);
}
when = now() - when;
printf("edwards vt6: %5.1fµs\n", when * 1e6 / i);
when = now();
for (i=0; i<1000; i++) {
precompute_for_wnaf(wnaft,&ext,4);
}
when = now() - when;
printf("wnaf4 pre: %5.1fµs\n", when * 1e6 / i);
when = now();
for (i=0; i<1000; i++) {
q448_randomize(&crand, sk);
edwards_scalar_multiply_vt_pre(&ext,sk,wnaft,4);
}
when = now() - when;
printf("edwards vt4: %5.1fµs\n", when * 1e6 / i);

when = now();
for (i=0; i<1000; i++) {
precompute_for_wnaf(wnaft,&ext,5);
}
when = now() - when;
printf("wnaf5 pre: %5.1fµs\n", when * 1e6 / i);
when = now();
for (i=0; i<1000; i++) {
q448_randomize(&crand, sk);
edwards_scalar_multiply_vt_pre(&ext,sk,wnaft,5);
}
when = now() - when;
printf("edwards vt5: %5.1fµs\n", when * 1e6 / i);
when = now();
sum = 0;
for (i=0; i<1000; i++) {
q448_randomize(&crand, sk);
q448_randomize(&crand, tk);
sum += edwards_combo_var_fixed_vt(&ext,sk,tk,wnaft,5);
}
when = now() - when;
printf("vt vf combo: %5.1fµs (avg = %0.3f)\n", when * 1e6 / i, 1.0*sum/i);
when = now();
for (i=0; i<1000; i++) {
affine_deserialize(&affine, &a);
convert_affine_to_extensible(&exta,&affine);
p448_isogeny_un_to_tw(&ext,&exta);
edwards_scalar_multiply(&ext,sk);
p448_isogeny_tw_to_un(&exta,&ext);
extensible_serialize(&b, &exta);
}
when = now() - when;
printf("edwards sm: %5.1fµs\n", when * 1e6 / i);
struct tw_niels_t table[80] __attribute__((aligned(32)));

while (1) {
p448_randomize(&crand, &a);
if (affine_deserialize(&affine, &a)) break;
}
convert_affine_to_extensible(&exta,&affine);
p448_isogeny_un_to_tw(&ext,&exta);
when = now();
for (i=0; i<1000; i++) {
precompute_for_combs(table, &ext, 5, 5, 18);
}
when = now() - when;
printf("pre(5,5,18): %5.1fµs\n", when * 1e6 / i);
when = now();
for (i=0; i<10000; i++) {
edwards_comb(&ext, sk, table, 5, 5, 18);
}
when = now() - when;
printf("com(5,5,18): %5.1fµs\n", when * 1e6 / i);
when = now();
for (i=0; i<10000; i++) {
edwards_comb(&ext, sk, table, 3, 5, 30);
}
when = now() - when;
printf("com(3,5,30): %5.1fµs\n", when * 1e6 / i);
when = now();
for (i=0; i<10000; i++) {
edwards_comb(&ext, sk, table, 2, 5, 45);
}
when = now() - when;
printf("com(2,5,45): %5.1fµs\n", when * 1e6 / i);

when = now();
for (i=0; i<10000; i++) {
edwards_comb(&ext, sk, table, 8, 4, 14);
}
when = now() - when;
printf("com(4,4,28): %5.1fµs\n", when * 1e6 / i);
when = now();
for (i=0; i<10000; i++) {
q448_randomize(&crand, sk);
edwards_comb(&ext, sk, table, 5, 5, 18);
p448_isogeny_tw_to_un(&exta,&ext);
extensible_serialize(&b, &exta);
}
when = now() - when;
printf("keygen: %5.1fµs\n", when * 1e6 / i);
printf("\nGoldilocks:\n");
int res = goldilocks_init();
assert(!res);
uint8_t gpk[56],gsk[56],hsk[56],hpk[56];
when = now();
for (i=0; i<10000; i++) {
if (i&1) {
res = goldilocks_keygen(gsk,gpk);
} else {
res = goldilocks_keygen(hsk,hpk);
}
assert(!res);
}
when = now() - when;
printf("keygen: %5.1fµs\n", when * 1e6 / i);
uint8_t ss1[64],ss2[64];
int gres1,gres2;
when = now();
for (i=0; i<10000; i++) {
if (i&1) {
gres1 = goldilocks_shared_secret(ss1,gsk,hpk);
} else {
gres2 = goldilocks_shared_secret(ss2,hsk,gpk);
}
}
when = now() - when;
printf("ecdh: %5.1fµs\n", when * 1e6 / i);
if (gres1 || gres2 || memcmp(ss1,ss2,56)) {
printf("[FAIL] %d %d\n",gres1,gres2);
printf("ss1 = ");
for (i=0; i<56; i++) {
printf("%02x", ss1[i]);
}
printf("\nss2 = ");
for (i=0; i<56; i++) {
printf("%02x", ss2[i]);
}
printf("\n");
}
printf("\nTesting...\n");
int failures=0, successes = 0;
for (i=0; i<1000; i++) {
p448_randomize(&crand, &a);
uint64_t two = 2;
mask_t good = p448_montgomery_ladder(&b,&a,&two,2,0);
if (!good) continue;
uint64_t x = rand(), y=rand(), z=x*y;
p448_montgomery_ladder(&b,&a,&x,64,0);
p448_montgomery_ladder(&c,&b,&y,64,0);
p448_montgomery_ladder(&b,&a,&z,64,0);
p448_sub(&d,&b,&c);
p448_bias(&d,2);
if (!p448_is_zero(&d)) {
printf("Odd ladder validation failure %d!\n", ++failures);
p448_print("a", &a);
printf("x=%llx, y=%llx, z=%llx\n", x,y,z);
p448_print("c", &c);
p448_print("b", &b);
printf("\n");
}
}
failures = 0;
for (i=0; i<1000; i++) {
mask_t good;
do {
p448_randomize(&crand, &a);
good = affine_deserialize(&affine, &a);
} while (!good);
convert_affine_to_extensible(&exta,&affine);
p448_isogeny_un_to_tw(&ext,&exta);
p448_isogeny_tw_to_un(&exta,&ext);
extensible_serialize(&b, &exta);
isogeny_and_serialize(&c, &ext);
p448_sub(&d,&b,&c);
p448_bias(&d,2);
if (good && !p448_is_zero(&d)){
printf("Iso+serial validation failure %d!\n", ++failures);
p448_print("a", &a);
p448_print("b", &b);
p448_print("c", &c);
printf("\n");
} else if (good) {
successes ++;
}
}
if (successes < i/3) {
printf("Iso+serial variation: only %d/%d successful.\n", successes, i);
}
failures = 0;
uint64_t four = 4;
for (i=0; i<1000; i++) {
p448_randomize(&crand, &a);
q448_randomize(&crand, sk);
mask_t good = p448_montgomery_ladder(&b,&a,&four,3,0);
good &= p448_montgomery_ladder(&c,&b,sk,448,0);
mask_t goodb = affine_deserialize(&affine, &a);
convert_affine_to_extensible(&exta,&affine);
p448_isogeny_un_to_tw(&ext,&exta);
edwards_scalar_multiply(&ext,sk);
p448_isogeny_tw_to_un(&exta,&ext);
extensible_serialize(&b, &exta);
p448_sub(&d,&b,&c);
p448_bias(&d,2);
if (good != goodb) {
printf("Compatibility validation failure %d: good: %d != %d\n", ++failures, (int)(-good), (int)(-goodb));
} else if (good && !p448_is_zero(&d)){
printf("Compatibility validation failure %d!\n", ++failures);
p448_print("a", &a);
q448_print("s", sk);
p448_print("c", &c);
p448_print("b", &b);
printf("\n");
} else if (good) {
successes ++;
}
}
if (successes < i/3) {
printf("Compatibility variation: only %d/%d successful.\n", successes, i);
}
successes = failures = 0;
for (i=0; i<1000; i++) {
p448_randomize(&crand, &a);
q448_randomize(&crand, sk);
if (!i) bzero(&sk, sizeof(sk));
mask_t good = p448_montgomery_ladder(&b,&a,&four,3,0);
good &= p448_montgomery_ladder(&c,&b,sk,448,0);
if (!good) continue;
affine_deserialize(&affine, &a);
convert_affine_to_extensible(&exta,&affine);
p448_isogeny_un_to_tw(&ext,&exta);
precompute_for_combs(table, &ext, 5, 5, 18);
edwards_comb(&ext, sk, table, 5, 5, 18);
p448_isogeny_tw_to_un(&exta,&ext);
extensible_serialize(&b, &exta);
p448_sub(&d,&b,&c);
p448_bias(&d,2);
if (!p448_is_zero(&d)){
printf("Comb validation failure %d!\n", ++failures);
p448_print("a", &a);
q448_print("s", sk);
p448_print("c", &c);
p448_print("b", &b);
printf("\n");
} else if (good) {
successes ++;
}
}
if (successes < i/3) {
printf("Comb variation: only %d/%d successful.\n", successes, i);
}
successes = failures = 0;
for (i=0; i<1000; i++) {
p448_randomize(&crand, &a);
q448_randomize(&crand, sk);
if (!i) bzero(&sk, sizeof(sk));
mask_t good = affine_deserialize(&affine, &a);
if (!good) continue;
convert_affine_to_extensible(&exta,&affine);
p448_isogeny_un_to_tw(&ext,&exta);
struct tw_extensible_t exu;
copy_tw_extensible(&exu, &ext);
edwards_scalar_multiply(&ext,sk);
p448_isogeny_tw_to_un(&exta,&ext);
extensible_serialize(&b, &exta);
edwards_scalar_multiply_vt(&exu,sk);
p448_isogeny_tw_to_un(&exta,&exu);
extensible_serialize(&c, &exta);
p448_sub(&d,&b,&c);
p448_bias(&d,2);
if (!p448_is_zero(&d)){
printf("WNAF validation failure %d!\n", ++failures);
p448_print("a", &a);
q448_print("s", sk);
p448_print("c", &c);
p448_print("b", &b);
printf("\n");
} else if (good) {
successes ++;
}
}
if (successes < i/3) {
printf("WNAF variation: only %d/%d successful.\n", successes, i);
}
successes = failures = 0;
for (i=0; i<1000; i++) {
p448_randomize(&crand, &a);
q448_randomize(&crand, sk);
if (!i) bzero(&sk, sizeof(sk));
mask_t good = affine_deserialize(&affine, &a);
if (!good) continue;
convert_affine_to_extensible(&exta,&affine);
p448_isogeny_un_to_tw(&ext,&exta);
struct tw_extensible_t exu;
copy_tw_extensible(&exu, &ext);
edwards_scalar_multiply(&ext,sk);
p448_isogeny_tw_to_un(&exta,&ext);
extensible_serialize(&b, &exta);

precompute_for_wnaf(wnaft,&exu,5);
edwards_scalar_multiply_vt_pre(&exu,sk,wnaft,5);
p448_isogeny_tw_to_un(&exta,&exu);
extensible_serialize(&c, &exta);
p448_sub(&d,&b,&c);
p448_bias(&d,2);
if (!p448_is_zero(&d)){
printf("PreWNAF validation failure %d!\n", ++failures);
p448_print("a", &a);
q448_print("s", sk);
p448_print("c", &c);
p448_print("b", &b);
for (j=0; j<1<<5; j++) {
printf("WNAFT %d\n", j);
p448_print(" a",&wnaft[j].a);
p448_print(" b",&wnaft[j].b);
p448_print(" c",&wnaft[j].c);
}
printf("\n\n");
} else if (good) {
successes ++;
}
}
if (successes < i/3) {
printf("PreWNAF variation: only %d/%d successful.\n", successes, i);
}
successes = failures = 0;
for (i=0; i<1000; i++) {
struct p448_t aa;
struct tw_extensible_t exu,exv,exw;
mask_t good;
do {
p448_randomize(&crand, &a);
good = affine_deserialize(&affine, &a);
convert_affine_to_extensible(&exta,&affine);
p448_isogeny_un_to_tw(&ext,&exta);
} while (!good);
do {
p448_randomize(&crand, &aa);
good = affine_deserialize(&affine, &aa);
convert_affine_to_extensible(&exta,&affine);
p448_isogeny_un_to_tw(&exu,&exta);
} while (!good);
p448_randomize(&crand, &aa);
q448_randomize(&crand, sk);
if (i==0 || i==2) bzero(&sk, sizeof(sk));
q448_randomize(&crand, tk);
if (i==0 || i==1) bzero(&tk, sizeof(tk));
copy_tw_extensible(&exv, &ext);
copy_tw_extensible(&exw, &exu);
edwards_scalar_multiply(&exv,sk);
edwards_scalar_multiply(&exw,tk);
convert_tw_extensible_to_tw_pniels(&pniels, &exw);
p448_tw_extensible_add_pniels(&exv,&pniels);
p448_isogeny_tw_to_un(&exta,&exv);
extensible_serialize(&b, &exta);

precompute_for_wnaf(wnaft,&exu,5);
edwards_combo_var_fixed_vt(&ext,sk,tk,wnaft,5);
p448_isogeny_tw_to_un(&exta,&exv);
extensible_serialize(&c, &exta);
p448_sub(&d,&b,&c);
p448_bias(&d,2);
if (!p448_is_zero(&d)){
printf("PreWNAF combo validation failure %d!\n", ++failures);
p448_print("a", &a);
p448_print("A", &aa);
q448_print("s", sk);
q448_print("t", tk);
p448_print("c", &c);
p448_print("b", &b);
printf("\n\n");
} else if (good) {
successes ++;
}
}
if (successes < i) {
printf("PreWNAF combo variation: only %d/%d successful.\n", successes, i);
}
successes = failures = 0;
for (i=0; i<1000; i++) {
p448_randomize(&crand, &a);
q448_randomize(&crand, sk);
q448_randomize(&crand, tk);
uint64_t two = 2;
mask_t good = p448_montgomery_ladder(&b,&a,&two,2,0);
p448_montgomery_ladder(&b,&a,sk,448,0);
p448_montgomery_ladder(&d,&b,tk,448,0);
p448_montgomery_ladder(&b,&a,tk,448,0);
p448_montgomery_ladder(&c,&b,sk,448,0);
p448_sub(&b,&c,&d);
p448_bias(&b,2);
mask_t success = p448_is_zero(&b) | ~good;
if (!success) {
printf("Ladder validation failure %d!\n", ++failures);
p448_print("a", &a);
q448_print("s", sk);
q448_print("t", tk);
p448_print("c", &c);
p448_print("d", &d);
printf("\n");
}
}
return 0;
}

+ 381
- 0
crandom.c View File

@@ -0,0 +1,381 @@
/* Copyright (c) 2011 Stanford University.
* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

/* Chacha random number generator code copied from crandom */

#include "intrinsics.h"
#include "crandom.h"

volatile unsigned int crandom_features = 0;

unsigned int crandom_detect_features() {
unsigned int out = GEN;
# if (defined(__i386__) || defined(__x86_64__))
u_int32_t a,b,c,d;
a=1; __asm__("cpuid" : "+a"(a), "=b"(b), "=c"(c), "=d"(d));
out |= GEN;
if (d & 1<<26) out |= SSE2;
if (d & 1<< 9) out |= SSSE3;
if (c & 1<<25) out |= AESNI;
if (c & 1<<28) out |= AVX;
if (b & 1<<5) out |= AVX2;
a=0x80000001; __asm__("cpuid" : "+a"(a), "=b"(b), "=c"(c), "=d"(d));
if (c & 1<<11) out |= XOP;
# endif
return out;
}

/* ------------------------------- Vectorized code ------------------------------- */
#define shuffle(x,i) _mm_shuffle_epi32(x, \
i + ((i+1)&3)*4 + ((i+2)&3)*16 + ((i+3)&3)*64)

#define add _mm_add_epi32
#define add64 _mm_add_epi64

#define NEED_XOP (MIGHT_HAVE(XOP))
#define NEED_SSSE3 (MIGHT_HAVE(SSSE3) && !MUST_HAVE(XOP))
#define NEED_SSE2 (MIGHT_HAVE(SSE2) && !MUST_HAVE(SSSE3))
#define NEED_CONV (!MUST_HAVE(SSE2))

#if NEED_XOP
static __inline__ void
quarter_round_xop(
ssereg *a,
ssereg *b,
ssereg *c,
ssereg *d
) {
*a = add(*a,*b); *d = xop_rotate(16, *d ^ *a);
*c = add(*c,*d); *b = xop_rotate(12, *b ^ *c);
*a = add(*a,*b); *d = xop_rotate(8, *d ^ *a);
*c = add(*c,*d); *b = xop_rotate(7, *b ^ *c);
}
#endif

#if NEED_SSSE3
static const ssereg shuffle8 = { 0x0605040702010003ull, 0x0E0D0C0F0A09080Bull };
static const ssereg shuffle16 = { 0x0504070601000302ull, 0x0D0C0F0E09080B0Aull };
INTRINSIC ssereg ssse3_rotate_8(ssereg a) {
return _mm_shuffle_epi8(a, shuffle8);
}
INTRINSIC ssereg ssse3_rotate_16(ssereg a) {
return _mm_shuffle_epi8(a, shuffle16);
}
static __inline__ void
quarter_round_ssse3(
ssereg *a,
ssereg *b,
ssereg *c,
ssereg *d
) {
*a = add(*a,*b); *d = ssse3_rotate_16(*d ^ *a);
*c = add(*c,*d); *b = sse2_rotate(12, *b ^ *c);
*a = add(*a,*b); *d = ssse3_rotate_8( *d ^ *a);
*c = add(*c,*d); *b = sse2_rotate(7, *b ^ *c);
}
#endif /* MIGHT_HAVE(SSSE3) && !MUST_HAVE(XOP) */

#if NEED_SSE2
static __inline__ void
quarter_round_sse2(
ssereg *a,
ssereg *b,
ssereg *c,
ssereg *d
) {
*a = add(*a,*b); *d = sse2_rotate(16, *d ^ *a);
*c = add(*c,*d); *b = sse2_rotate(12, *b ^ *c);
*a = add(*a,*b); *d = sse2_rotate(8, *d ^ *a);
*c = add(*c,*d); *b = sse2_rotate(7, *b ^ *c);
}
#endif

#define DOUBLE_ROUND(qrf) { \
qrf(&a1,&b1,&c1,&d1); \
qrf(&a2,&b2,&c2,&d2); \
b1 = shuffle(b1,1); \
c1 = shuffle(c1,2); \
d1 = shuffle(d1,3); \
b2 = shuffle(b2,1); \
c2 = shuffle(c2,2); \
d2 = shuffle(d2,3); \
\
qrf(&a1,&b1,&c1,&d1); \
qrf(&a2,&b2,&c2,&d2); \
b1 = shuffle(b1,3); \
c1 = shuffle(c1,2); \
d1 = shuffle(d1,1); \
b2 = shuffle(b2,3); \
c2 = shuffle(c2,2); \
d2 = shuffle(d2,1); \
}
#define OUTPUT_FUNCTION { \
output[0] = add(a1,aa); \
output[1] = add(b1,bb); \
output[2] = add(c1,cc); \
output[3] = add(d1,dd); \
output[4] = add(a2,aa); \
output[5] = add(b2,bb); \
output[6] = add(c2,add(cc,p)); \
output[7] = add(d2,dd); \
\
output += 8; \
\
cc = add64(add64(cc,p), p); \
a1 = a2 = aa; \
b1 = b2 = bb; \
c1 = cc; c2 = add64(cc,p);\
d1 = d2 = dd; \
}
/* ------------------------------------------------------------------------------- */

INTRINSIC u_int32_t rotate(int r, u_int32_t a) {
return a<<r ^ a>>(32-r);
}

static __inline__ void
quarter_round(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d) {
*a = *a + *b; *d = rotate(16, *d^*a);
*c = *c + *d; *b = rotate(12, *b^*c);
*a = *a + *b; *d = rotate(8, *d^*a);
*c = *c + *d; *b = rotate(7, *b^*c);
}

static void
crandom_chacha_expand(u_int64_t iv,
u_int64_t ctr,
int nr,
int output_size,
const unsigned char *key_,
unsigned char *output_) {
# if MIGHT_HAVE_SSE2
if (HAVE(SSE2)) {
ssereg *key = (ssereg *)key_;
ssereg *output = (ssereg *)output_;
ssereg a1 = key[0], a2 = a1, aa = a1,
b1 = key[1], b2 = b1, bb = b1,
c1 = {iv, ctr}, c2 = {iv, ctr+1}, cc = c1,
d1 = {0x3320646e61707865ull, 0x6b20657479622d32ull},
d2 = d1, dd = d1,
p = {0, 1};
int i,r;
# if (NEED_XOP)
if (HAVE(XOP)) {
for (i=0; i<output_size; i+=128) {
for (r=nr; r>0; r-=2)
DOUBLE_ROUND(quarter_round_xop);
OUTPUT_FUNCTION;
}
return;
}
# endif
# if (NEED_SSSE3)
if (HAVE(SSSE3)) {
for (i=0; i<output_size; i+=128) {
for (r=nr; r>0; r-=2)
DOUBLE_ROUND(quarter_round_ssse3);
OUTPUT_FUNCTION;
}
return;
}
# endif
# if (NEED_SSE2)
if (HAVE(SSE2)) {
for (i=0; i<output_size; i+=128) {
for (r=nr; r>0; r-=2)
DOUBLE_ROUND(quarter_round_sse2);
OUTPUT_FUNCTION;
}
return;
}
# endif
}
# endif

# if NEED_CONV
{
const u_int32_t *key = (const u_int32_t *)key_;
u_int32_t
x[16],
input[16] = {
key[0], key[1], key[2], key[3],
key[4], key[5], key[6], key[7],
iv, iv>>32, ctr, ctr>>32,
0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
},
*output = (u_int32_t *)output_;
int i, r;

for (i=0; i<output_size; i+= 64) {
for (r=0; r<16; r++) {
x[r] = input[r];
}
for (r=nr; r>0; r-=2) {
quarter_round(&x[0], &x[4], &x[8], &x[12]);
quarter_round(&x[1], &x[5], &x[9], &x[13]);
quarter_round(&x[2], &x[6], &x[10], &x[14]);
quarter_round(&x[3], &x[7], &x[11], &x[15]);

quarter_round(&x[0], &x[5], &x[10], &x[15]);
quarter_round(&x[1], &x[6], &x[11], &x[12]);
quarter_round(&x[2], &x[7], &x[8], &x[13]);
quarter_round(&x[3], &x[4], &x[9], &x[14]);
}
for (r=0; r<16; r++) {
output[r] = x[r] + input[r];
}

output += 16;
input[11] ++;
if (!input[11]) input[12]++;
}
}
#endif /* NEED_CONV */
}

/* "return 4", cf xkcd #221 */
#define CRANDOM_MAGIC 0x72657475726e2034ull

int
crandom_init_from_file(
struct crandom_state_t *state,
const char *filename,
int reseed_interval,
int reseeds_mandatory
) {
state->fill = 0;
state->reseed_countdown = reseed_interval;
state->reseed_interval = reseed_interval;
state->ctr = 0;

state->randomfd = open(filename, O_RDONLY);
if (state->randomfd == -1) {
int err = errno;
return err ? err : -1;
}

ssize_t offset = 0, red;
do {
red = read(state->randomfd, state->seed + offset, 32 - offset);
if (red > 0) offset += red;
} while (red > 0 && offset < 32);

if (offset < 32) {
int err = errno;
return err ? err : -1;
}

bzero(state->buffer, 96);

state->magic = CRANDOM_MAGIC;
state->reseeds_mandatory = reseeds_mandatory;

return 0;
}

void
crandom_init_from_buffer(
struct crandom_state_t *state,
const char initial_seed[32]
) {
memcpy(state->seed, initial_seed, 32);
bzero(state->buffer, 96);
state->reseed_countdown = state->reseed_interval = state->fill = state->ctr = state->reseeds_mandatory = 0;
state->randomfd = -1;
state->magic = CRANDOM_MAGIC;
}

int
crandom_generate(
struct crandom_state_t *state,
unsigned char *output,
unsigned long long length
) {
/* the generator isn't seeded; maybe they ignored the return value of init_from_file */
if (unlikely(state->magic != CRANDOM_MAGIC)) abort();

int ret = 0;

while (length) {
if (unlikely(state->fill <= 0)) {
uint64_t iv = 0;
if (state->reseed_interval) {
/* it's nondeterministic, stir in some rdtsc() */
iv = rdtsc();

state->reseed_countdown--;
if (unlikely(state->reseed_countdown <= 0)) {
/* reseed by xoring in random state */
state->reseed_countdown = state->reseed_interval;
ssize_t offset = 0, red;
do {
red = read(state->randomfd, state->buffer + offset, 32 - offset);
if (red > 0) offset += red;
} while (red > 0 && offset < 32);

if (offset < 32) {
/* The read failed. Signal an error with the return code.
*
* If reseeds are mandatory, crash.
*
* If not, the generator is still probably safe to use, because reseeding
* is basically over-engineering for caution. Also, the user might ignore
* the return code, so we still need to fill the request.
*
* Set reseed_countdown = 1 so we'll try again later. If the user's perf
* sucks as a result of ignoring the error code while calling us in a loop,
* well, he gets what he deserves.
*/
if (state->reseeds_mandatory) abort();

ret = errno;
if (ret == 0) ret = -1;
state->reseed_countdown = 1;
}

int i;
for (i=0; i<32; i++) {
/* Stir in the buffer. If somehow the read failed, it'll be zeros. */
state->seed[i] ^= state->buffer[i];
}
}
}
crandom_chacha_expand(iv,state->ctr,20,128,state->seed,state->seed);
state->ctr++;
state->fill = sizeof(state->buffer);
}

unsigned long long copy = (length > state->fill) ? state->fill : length;
state->fill -= copy;
memcpy(output, state->buffer + state->fill, copy);
bzero(state->buffer + state->fill, copy);
output += copy; length -= copy;
}

return ret;
}

void
crandom_destroy(
struct crandom_state_t *state
) {
if (state->randomfd) close(state->randomfd);
/* Ignore the return value, because what would it mean?
* "Your random device, which you were reading over NFS, lost some data"?
*/

bzero(state, sizeof(*state));
}

+ 66
- 0
crandom.h View File

@@ -0,0 +1,66 @@
/* Copyright (c) 2011 Stanford University.
* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

/* A miniature version of the (as of yet incomplete) crandom project. */

#ifndef __GOLDI_CRANDOM_H__
#define __GOLDI_CRANDOM_H__ 1

#include <stdint.h> /* for uint64_t */
#include <fcntl.h> /* for open */
#include <errno.h> /* for returning errors after open */
#include <stdlib.h> /* for abort */
#include <string.h> /* for memcpy */
#include <strings.h> /* for bzero */
#include <unistd.h> /* for read */

struct crandom_state_t {
unsigned char seed[32];
unsigned char buffer[96];
uint64_t ctr;
uint64_t magic;
unsigned int fill;
int reseed_countdown;
int reseed_interval;
int reseeds_mandatory;
int randomfd;
} __attribute__((aligned(16))) ;

#ifdef __cplusplus
extern "C" {
#endif

int
crandom_init_from_file(
struct crandom_state_t *state,
const char *filename,
int reseed_interval,
int reseeds_mandatory
) __attribute__((warn_unused_result));

void
crandom_init_from_buffer(
struct crandom_state_t *state,
const char initial_seed[32]
);

/* TODO : attribute warn for not checking return type? */
int
crandom_generate(
struct crandom_state_t *state,
unsigned char *output,
unsigned long long length
);

void
crandom_destroy(
struct crandom_state_t *state
);

#ifdef __cplusplus
}; /* extern "C" */
#endif

#endif /* __GOLDI_CRANDOM_H__ */

+ 621
- 0
ec_point.c View File

@@ -0,0 +1,621 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

/* This file was generated with the assistance of a tool written in SAGE. */
#include "ec_point.h"


void
p448_isr (
struct p448_t* a,
const struct p448_t* x
) {
struct p448_t L0, L1, L2;
p448_sqr ( &L1, x );
p448_mul ( &L2, x, &L1 );
p448_sqr ( &L1, &L2 );
p448_mul ( &L2, x, &L1 );
p448_sqrn ( &L1, &L2, 3 );
p448_mul ( &L0, &L2, &L1 );
p448_sqrn ( &L1, &L0, 3 );
p448_mul ( &L0, &L2, &L1 );
p448_sqrn ( &L2, &L0, 9 );
p448_mul ( &L1, &L0, &L2 );
p448_sqr ( &L0, &L1 );
p448_mul ( &L2, x, &L0 );
p448_sqrn ( &L0, &L2, 18 );
p448_mul ( &L2, &L1, &L0 );
p448_sqrn ( &L0, &L2, 37 );
p448_mul ( &L1, &L2, &L0 );
p448_sqrn ( &L0, &L1, 37 );
p448_mul ( &L1, &L2, &L0 );
p448_sqrn ( &L0, &L1, 111 );
p448_mul ( &L2, &L1, &L0 );
p448_sqr ( &L0, &L2 );
p448_mul ( &L1, x, &L0 );
p448_sqrn ( &L0, &L1, 223 );
p448_mul ( a, &L2, &L0 );
}

void
p448_inverse (
struct p448_t* a,
const struct p448_t* x
) {
struct p448_t L0, L1;
p448_isr ( &L0, x );
p448_sqr ( &L1, &L0 );
p448_sqr ( &L0, &L1 );
p448_mul ( a, x, &L0 );
}

void
p448_tw_extensible_add_niels (
struct tw_extensible_t* d,
const struct tw_niels_t* e
) {
struct p448_t L0, L1;
p448_bias ( &d->y, 2 );
p448_bias ( &d->z, 2 );
p448_sub ( &L1, &d->y, &d->x );
p448_mul ( &L0, &e->a, &L1 );
p448_add ( &L1, &d->x, &d->y );
p448_mul ( &d->y, &e->b, &L1 );
p448_bias ( &d->y, 2 );
p448_mul ( &L1, &d->u, &d->t );
p448_mul ( &d->x, &e->c, &L1 );
p448_add ( &d->u, &L0, &d->y );
p448_sub ( &d->t, &d->y, &L0 );
p448_sub ( &d->y, &d->z, &d->x );
p448_add ( &L0, &d->x, &d->z );
p448_mul ( &d->z, &L0, &d->y );
p448_mul ( &d->x, &d->y, &d->t );
p448_mul ( &d->y, &L0, &d->u );
}

void
p448_tw_extensible_add_pniels (
struct tw_extensible_t* e,
const struct tw_pniels_t* a
) {
struct p448_t L0;
p448_mul ( &L0, &e->z, &a->z );
p448_copy ( &e->z, &L0 );
p448_tw_extensible_add_niels( e, &a->n );
}

void
p448_tw_extensible_double (
struct tw_extensible_t* a
) {
struct p448_t L0, L1, L2;
p448_sqr ( &L2, &a->x );
p448_sqr ( &L0, &a->y );
p448_add ( &a->u, &L2, &L0 );
p448_add ( &a->t, &a->y, &a->x );
p448_sqr ( &L1, &a->t );
p448_bias ( &L1, 3 );
p448_sub ( &a->t, &L1, &a->u );
p448_sub ( &L1, &L0, &L2 );
p448_bias ( &L1, 2 );
p448_sqr ( &a->x, &a->z );
p448_bias ( &a->x, 2 );
p448_add ( &a->z, &a->x, &a->x );
p448_sub ( &L0, &a->z, &L1 );
p448_mul ( &a->z, &L1, &L0 );
p448_mul ( &a->x, &L0, &a->t );
p448_mul ( &a->y, &L1, &a->u );
}

void
p448_extensible_double (
struct extensible_t* a
) {
struct p448_t L0, L1, L2;
p448_sqr ( &L2, &a->x );
p448_sqr ( &L0, &a->y );
p448_add ( &L1, &L2, &L0 );
p448_add ( &a->t, &a->y, &a->x );
p448_sqr ( &a->u, &a->t );
p448_bias ( &a->u, 3 );
p448_sub ( &a->t, &a->u, &L1 );
p448_sub ( &a->u, &L0, &L2 );
p448_bias ( &a->u, 2 );
p448_sqr ( &a->x, &a->z );
p448_bias ( &a->x, 2 );
p448_add ( &a->z, &a->x, &a->x );
p448_sub ( &L0, &a->z, &L1 );
p448_mul ( &a->z, &L1, &L0 );
p448_mul ( &a->x, &L0, &a->t );
p448_mul ( &a->y, &L1, &a->u );
}

void
p448_isogeny_un_to_tw (
struct tw_extensible_t* b,
const struct extensible_t* a
) {
struct p448_t L0;
p448_sqr ( &b->x, &a->x );
p448_sqr ( &b->z, &a->y );
p448_add ( &b->u, &b->x, &b->z );
p448_add ( &b->t, &a->y, &a->x );
p448_sqr ( &L0, &b->t );
p448_bias ( &L0, 3 );
p448_sub ( &b->t, &L0, &b->u );
p448_sub ( &L0, &b->z, &b->x );
p448_bias ( &L0, 2 );
p448_sqr ( &b->x, &a->z );
p448_bias ( &b->x, 2 );
p448_add ( &b->z, &b->x, &b->x );
p448_sub ( &b->y, &b->z, &b->u );
p448_mul ( &b->z, &L0, &b->y );
p448_mul ( &b->x, &b->y, &b->t );
p448_mul ( &b->y, &L0, &b->u );
}

void
p448_isogeny_tw_to_un (
struct extensible_t* b,
const struct tw_extensible_t* a
) {
struct p448_t L0;
p448_sqr ( &b->x, &a->x );
p448_sqr ( &b->z, &a->y );
p448_add ( &L0, &b->x, &b->z );
p448_add ( &b->t, &a->y, &a->x );
p448_sqr ( &b->u, &b->t );
p448_bias ( &b->u, 3 );
p448_sub ( &b->t, &b->u, &L0 );
p448_sub ( &b->u, &b->z, &b->x );
p448_bias ( &b->u, 2 );
p448_sqr ( &b->x, &a->z );
p448_bias ( &b->x, 2 );
p448_add ( &b->z, &b->x, &b->x );
p448_sub ( &b->y, &b->z, &b->u );
p448_mul ( &b->z, &L0, &b->y );
p448_mul ( &b->x, &b->y, &b->t );
p448_mul ( &b->y, &L0, &b->u );
}

void
convert_tw_affine_to_tw_pniels (
struct tw_pniels_t* b,
const struct tw_affine_t* a
) {
p448_sub ( &b->n.a, &a->y, &a->x );
p448_bias ( &b->n.a, 2 );
p448_weak_reduce( &b->n.a );
p448_add ( &b->n.b, &a->x, &a->y );
p448_weak_reduce( &b->n.b );
p448_mul ( &b->n.c, &a->y, &a->x );
p448_mulw ( &b->z, &b->n.c, 78164 );
p448_neg ( &b->n.c, &b->z );
p448_bias ( &b->n.c, 2 );
p448_weak_reduce( &b->n.c );
p448_set_ui( &b->z, 2 );
}

void
convert_tw_affine_to_tw_extensible (
struct tw_extensible_t* b,
const struct tw_affine_t* a
) {
p448_copy ( &b->x, &a->x );
p448_copy ( &b->y, &a->y );
p448_set_ui( &b->z, 1 );
p448_copy ( &b->t, &a->x );
p448_copy ( &b->u, &a->y );
}

void
convert_affine_to_extensible (
struct extensible_t* b,
const struct affine_t* a
) {
p448_copy ( &b->x, &a->x );
p448_copy ( &b->y, &a->y );
p448_set_ui( &b->z, 1 );
p448_copy ( &b->t, &a->x );
p448_copy ( &b->u, &a->y );
}

void
convert_tw_extensible_to_tw_pniels (
struct tw_pniels_t* b,
const struct tw_extensible_t* a
) {
p448_sub ( &b->n.a, &a->y, &a->x );
p448_bias ( &b->n.a, 2 );
p448_weak_reduce( &b->n.a );
p448_add ( &b->n.b, &a->x, &a->y );
p448_weak_reduce( &b->n.b );
p448_mul ( &b->n.c, &a->u, &a->t );
p448_mulw ( &b->z, &b->n.c, 78164 );
p448_neg ( &b->n.c, &b->z );
p448_bias ( &b->n.c, 2 );
p448_weak_reduce( &b->n.c );
p448_add ( &b->z, &a->z, &a->z );
p448_weak_reduce( &b->z );
}

void
convert_tw_pniels_to_tw_extensible (
struct tw_extensible_t* e,
const struct tw_pniels_t* d
) {
p448_add ( &e->u, &d->n.b, &d->n.a );
p448_sub ( &e->t, &d->n.b, &d->n.a );
p448_bias ( &e->t, 2 );
p448_mul ( &e->x, &d->z, &e->t );
p448_mul ( &e->y, &d->z, &e->u );
p448_sqr ( &e->z, &d->z );
}

void
convert_tw_niels_to_tw_extensible (
struct tw_extensible_t* e,
const struct tw_niels_t* d
) {
p448_add ( &e->y, &d->b, &d->a );
p448_weak_reduce( &e->y );
p448_sub ( &e->x, &d->b, &d->a );
p448_bias ( &e->x, 2 );
p448_weak_reduce( &e->x );
p448_set_ui( &e->z, 1 );
p448_copy ( &e->t, &e->x );
p448_copy ( &e->u, &e->y );
}

void
p448_montgomery_step (
struct montgomery_t* a
) {
struct p448_t L0, L1;
p448_bias ( &a->xd, 2 );
p448_bias ( &a->xa, 2 );
p448_add ( &L0, &a->zd, &a->xd );
p448_sub ( &L1, &a->xd, &a->zd );
p448_sub ( &a->zd, &a->xa, &a->za );
p448_mul ( &a->xd, &L0, &a->zd );
p448_bias ( &a->xd, 2 );
p448_add ( &a->zd, &a->za, &a->xa );
p448_mul ( &a->za, &L1, &a->zd );
p448_add ( &a->xa, &a->za, &a->xd );
p448_sqr ( &a->zd, &a->xa );
p448_mul ( &a->xa, &a->z0, &a->zd );
p448_sub ( &a->zd, &a->xd, &a->za );
p448_sqr ( &a->za, &a->zd );
p448_sqr ( &a->xd, &L0 );
p448_bias ( &a->xd, 2 );
p448_sqr ( &L0, &L1 );
p448_mulw ( &a->zd, &a->xd, 39082 );
p448_bias ( &a->zd, 4 );
p448_sub ( &L1, &a->xd, &L0 );
p448_mul ( &a->xd, &L0, &a->zd );
p448_sub ( &L0, &a->zd, &L1 );
p448_mul ( &a->zd, &L0, &L1 );
}

void
p448_montgomery_serialize (
struct p448_t* sign,
struct p448_t* ser,
const struct montgomery_t* a,
const struct p448_t* sbz
) {
struct p448_t L0, L1, L2, L3;
p448_mul ( &L2, &a->z0, &a->zd );
p448_bias ( &L2, 2 );
p448_sub ( &L0, &L2, &a->xd );
p448_mul ( &L2, &a->za, &L0 );
p448_bias ( &L2, 2 );
p448_mul ( &L1, &a->z0, &a->xd );
p448_bias ( &L1, 2 );
p448_sub ( &L0, &L1, &a->zd );
p448_mul ( &L3, &a->xa, &L0 );
p448_add ( &L1, &L3, &L2 );
p448_sub ( &L0, &L2, &L3 );
p448_mul ( &L2, &L0, &L1 );
p448_mul ( &L0, sbz, &L2 );
p448_mul ( &L2, &a->zd, &L0 );
p448_mul ( sign, &L2, &a->zd );
p448_mul ( ser, &L2, &a->xd );
p448_mul ( &L2, sign, ser );
p448_isr ( &L1, &L2 );
p448_mul ( ser, sign, &L1 );
p448_sqr ( &L0, &L1 );
p448_mul ( sign, &L2, &L0 );
}

void
extensible_serialize (
struct p448_t* b,
const struct extensible_t* a
) {
struct p448_t L0, L1, L2;
p448_sub ( &L0, &a->y, &a->z );
p448_bias ( &L0, 2 );
p448_add ( b, &a->z, &a->y );
p448_mul ( &L1, &a->z, &a->x );
p448_mul ( &L2, &L0, &L1 );
p448_mul ( &L1, &L2, &L0 );
p448_mul ( &L0, &L2, b );
p448_mul ( &L2, &L1, &L0 );
p448_isr ( &L0, &L2 );
p448_mul ( b, &L1, &L0 );
p448_sqr ( &L1, &L0 );
p448_mul ( &L0, &L2, &L1 );
}

void
isogeny_and_serialize (
struct p448_t* b,
const struct tw_extensible_t* a
) {
struct p448_t L0, L1, L2, L3;
p448_mul ( &L3, &a->y, &a->x );
p448_add ( &L1, &a->y, &a->x );
p448_sqr ( b, &L1 );
p448_add ( &L2, &L3, &L3 );
p448_sub ( &L1, b, &L2 );
p448_bias ( &L1, 3 );
p448_sqr ( &L2, &a->z );
p448_sqr ( b, &L2 );
p448_add ( &L2, &L1, &L1 );
p448_mulw ( &L1, &L2, 39082 );
p448_neg ( &L2, &L1 );
p448_bias ( &L2, 2 );
p448_mulw ( &L0, &L2, 39082 );
p448_neg ( &L1, &L0 );
p448_bias ( &L1, 2 );
p448_mul ( &L0, &L2, b );
p448_mul ( b, &L1, &L0 );
p448_isr ( &L0, b );
p448_mul ( &L2, &L1, &L0 );
p448_sqr ( &L1, &L0 );
p448_mul ( &L0, b, &L1 );
p448_mul ( b, &L2, &L3 );
}

mask_t
affine_deserialize (
struct affine_t* a,
const struct p448_t* sz
) {
struct p448_t L0, L1, L2, L3;
p448_sqr ( &L1, sz );
p448_copy ( &L3, &L1 );
p448_addw ( &L3, 1 );
p448_sqr ( &a->x, &L3 );
p448_mulw ( &L3, &a->x, 39082 );
p448_neg ( &a->x, &L3 );
p448_add ( &L3, &L1, &L1 );
p448_bias ( &L3, 1 );
p448_add ( &a->y, &L3, &L3 );
p448_add ( &L3, &a->y, &a->x );
p448_copy ( &a->y, &L1 );
p448_subw ( &a->y, 1 );
p448_neg ( &a->x, &a->y );
p448_bias ( &a->x, 2 );
p448_mul ( &a->y, &a->x, &L3 );
p448_sqr ( &L2, &a->x );
p448_mul ( &L0, &L2, &a->y );
p448_mul ( &a->y, &a->x, &L0 );
p448_isr ( &L3, &a->y );
p448_mul ( &a->y, &L2, &L3 );
p448_sqr ( &L2, &L3 );
p448_mul ( &L3, &L0, &L2 );
p448_mul ( &L0, &a->x, &L3 );
p448_bias ( &L0, 1 );
p448_add ( &L2, &a->y, &a->y );
p448_mul ( &a->x, sz, &L2 );
p448_addw ( &L1, 1 );
p448_mul ( &a->y, &L1, &L3 );
p448_subw ( &L0, 1 );
return p448_is_zero( &L0 );
}

void
set_identity_extensible (
struct extensible_t* a
) {
p448_set_ui( &a->x, 0 );
p448_set_ui( &a->y, 1 );
p448_set_ui( &a->z, 1 );
p448_set_ui( &a->t, 0 );
p448_set_ui( &a->u, 0 );
}

void
set_identity_tw_extensible (
struct tw_extensible_t* a
) {
p448_set_ui( &a->x, 0 );
p448_set_ui( &a->y, 1 );
p448_set_ui( &a->z, 1 );
p448_set_ui( &a->t, 0 );
p448_set_ui( &a->u, 0 );
}

void
set_identity_affine (
struct affine_t* a
) {
p448_set_ui( &a->x, 0 );
p448_set_ui( &a->y, 1 );
}

mask_t
eq_affine (
const struct affine_t* a,
const struct affine_t* b
) {
mask_t L0, L1;
struct p448_t L2;
p448_sub ( &L2, &a->x, &b->x );
p448_bias ( &L2, 2 );
L1 = p448_is_zero( &L2 );
p448_sub ( &L2, &a->y, &b->y );
p448_bias ( &L2, 2 );
L0 = p448_is_zero( &L2 );
return L1 & L0;
}

mask_t
eq_extensible (
const struct extensible_t* a,
const struct extensible_t* b
) {
mask_t L0, L1;
struct p448_t L2, L3, L4;
p448_mul ( &L4, &b->z, &a->x );
p448_mul ( &L3, &a->z, &b->x );
p448_sub ( &L2, &L4, &L3 );
p448_bias ( &L2, 2 );
L1 = p448_is_zero( &L2 );
p448_mul ( &L4, &b->z, &a->y );
p448_mul ( &L3, &a->z, &b->y );
p448_sub ( &L2, &L4, &L3 );
p448_bias ( &L2, 2 );
L0 = p448_is_zero( &L2 );
return L1 & L0;
}

mask_t
eq_tw_extensible (
const struct tw_extensible_t* a,
const struct tw_extensible_t* b
) {
mask_t L0, L1;
struct p448_t L2, L3, L4;
p448_mul ( &L4, &b->z, &a->x );
p448_mul ( &L3, &a->z, &b->x );
p448_sub ( &L2, &L4, &L3 );
p448_bias ( &L2, 2 );
L1 = p448_is_zero( &L2 );
p448_mul ( &L4, &b->z, &a->y );
p448_mul ( &L3, &a->z, &b->y );
p448_sub ( &L2, &L4, &L3 );
p448_bias ( &L2, 2 );
L0 = p448_is_zero( &L2 );
return L1 & L0;
}

void
elligator_2s_inject (
struct affine_t* a,
const struct p448_t* r
) {
mask_t L0, L1;
struct p448_t L2, L3, L4, L5, L6, L7, L8, L9;
p448_sqr ( &a->x, r );
p448_sqr ( &L3, &a->x );
p448_copy ( &a->y, &L3 );
p448_subw ( &a->y, 1 );
p448_neg ( &L9, &a->y );
p448_bias ( &L9, 2 );
p448_sqr ( &L2, &L9 );
p448_bias ( &L2, 1 );
p448_mulw ( &L7, &L2, 1527402724 );
p448_bias ( &L7, 2 );
p448_mulw ( &L8, &L3, 6108985600 );
p448_add ( &a->y, &L8, &L7 );
p448_mulw ( &L8, &L2, 6109454568 );
p448_sub ( &L7, &a->y, &L8 );
p448_mulw ( &L4, &a->y, 78160 );
p448_mul ( &L6, &L7, &L9 );
p448_mul ( &L8, &L6, &L4 );
p448_mul ( &L4, &L7, &L8 );
p448_isr ( &L5, &L4 );
p448_mul ( &L4, &L6, &L5 );
p448_sqr ( &L6, &L5 );
p448_mul ( &L5, &L8, &L6 );
p448_mul ( &L8, &L7, &L5 );
p448_mul ( &L7, &L8, &L5 );
p448_copy ( &L6, &a->x );
p448_subw ( &L6, 1 );
p448_addw ( &a->x, 1 );
p448_mul ( &L5, &a->x, &L8 );
p448_sub ( &a->x, &L6, &L5 );
p448_bias ( &a->x, 3 );
p448_mul ( &L5, &L4, &a->x );
p448_mulw ( &L4, &L5, 78160 );
p448_neg ( &a->x, &L4 );
p448_bias ( &a->x, 2 );
p448_weak_reduce( &a->x );
p448_add ( &L4, &L3, &L3 );
p448_add ( &L3, &L4, &L2 );
p448_subw ( &L3, 2 );
p448_mul ( &L2, &L3, &L8 );
p448_mulw ( &L3, &L2, 3054649120 );
p448_add ( &L2, &L3, &a->y );
p448_mul ( &a->y, &L7, &L2 );
L1 = p448_is_zero( &L9 );
L0 = - L1;
p448_addw ( &a->y, L0 );
p448_weak_reduce( &a->y );
}

mask_t
p448_affine_validate (
const struct affine_t* a
) {
struct p448_t L0, L1, L2, L3;
p448_sqr ( &L0, &a->y );
p448_sqr ( &L2, &a->x );
p448_add ( &L3, &L2, &L0 );
p448_subw ( &L3, 1 );
p448_mulw ( &L1, &L2, 39081 );
p448_neg ( &L2, &L1 );
p448_bias ( &L2, 2 );
p448_mul ( &L1, &L0, &L2 );
p448_sub ( &L0, &L3, &L1 );
p448_bias ( &L0, 3 );
return p448_is_zero( &L0 );
}

mask_t
p448_tw_extensible_validate (
const struct tw_extensible_t* ext
) {
mask_t L0, L1;
struct p448_t L2, L3, L4, L5;
/*
* Check invariant:
* 0 = -x*y + z*t*u
*/
p448_mul ( &L2, &ext->t, &ext->u );
p448_mul ( &L4, &ext->z, &L2 );
p448_addw ( &L4, 0 );
p448_mul ( &L3, &ext->x, &ext->y );
p448_neg ( &L2, &L3 );
p448_add ( &L3, &L2, &L4 );
p448_bias ( &L3, 2 );
L1 = p448_is_zero( &L3 );
/*
* Check invariant:
* 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
*/
p448_sqr ( &L4, &ext->y );
p448_neg ( &L2, &L4 );
p448_addw ( &L2, 0 );
p448_sqr ( &L3, &ext->x );
p448_bias ( &L3, 4 );
p448_add ( &L4, &L3, &L2 );
p448_sqr ( &L5, &ext->u );
p448_sqr ( &L3, &ext->t );
p448_mul ( &L2, &L3, &L5 );
p448_mulw ( &L3, &L2, 39081 );
p448_neg ( &L5, &L3 );
p448_add ( &L3, &L5, &L4 );
p448_neg ( &L5, &L2 );
p448_add ( &L4, &L5, &L3 );
p448_sqr ( &L3, &ext->z );
p448_add ( &L2, &L3, &L4 );
L0 = p448_is_zero( &L2 );
return L1 & L0;
}



+ 440
- 0
ec_point.h View File

@@ -0,0 +1,440 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

/* This file was generated with the assistance of a tool written in SAGE. */
#ifndef __CC_INCLUDED_P448_EDWARDS_H__
#define __CC_INCLUDED_P448_EDWARDS_H__

#include "p448.h"

#ifdef __cplusplus
extern "C" {
#endif

/*
* Affine point on an Edwards curve.
*/
struct affine_t {
struct p448_t x, y;
};

/*
* Affine point on a twisted Edwards curve.
*/
struct tw_affine_t {
struct p448_t x, y;
};

/*
* Montgomery buffer.
*/
struct montgomery_t {
struct p448_t z0, xd, zd, xa, za;
};

/*
* Extensible coordinates for Edwards curves, suitable for
* accumulators.
*
* Represents the point (x/z, y/z). The extra coordinates
* t,u satisfy xy = tuz, allowing for conversion to Extended
* form by multiplying t and u.
*
* The idea is that you don't have to do this multiplication
* when doubling the accumulator, because the t-coordinate
* isn't used there. At the same time, as long as you only
* have one point in extensible form, additions don't cost
* extra.
*
* This is essentially a lazier version of Hisil et al's
* lookahead trick. It might be worth considering that trick
* instead.
*/
struct extensible_t {
struct p448_t x, y, z, t, u;
};

/*
* Extensible coordinates for twisted Edwards curves,
* suitable for accumulators.
*/
struct tw_extensible_t {
struct p448_t x, y, z, t, u;
};

/*
* Niels coordinates for twisted Edwards curves. Good for
* mixed readdition; suitable for fixed tables.
*/
struct tw_niels_t {
struct p448_t a, b, c;
};

/*
* Projective niels coordinates for twisted Edwards curves.
* Good for readdition; suitable for temporary tables.
*/
struct tw_pniels_t {
struct tw_niels_t n;
struct p448_t z;
};


/*
* Auto-generated copy method.
*/
static __inline__ void
copy_affine (
struct affine_t* a,
const struct affine_t* ds
) __attribute__((unused,always_inline));

/*
* Auto-generated copy method.
*/
static __inline__ void
copy_tw_affine (
struct tw_affine_t* a,
const struct tw_affine_t* ds
) __attribute__((unused,always_inline));

/*
* Auto-generated copy method.
*/
static __inline__ void
copy_montgomery (
struct montgomery_t* a,
const struct montgomery_t* ds
) __attribute__((unused,always_inline));

/*
* Auto-generated copy method.
*/
static __inline__ void
copy_extensible (
struct extensible_t* a,
const struct extensible_t* ds
) __attribute__((unused,always_inline));

/*
* Auto-generated copy method.
*/
static __inline__ void
copy_tw_extensible (
struct tw_extensible_t* a,
const struct tw_extensible_t* ds
) __attribute__((unused,always_inline));

/*
* Auto-generated copy method.
*/
static __inline__ void
copy_tw_niels (
struct tw_niels_t* a,
const struct tw_niels_t* ds
) __attribute__((unused,always_inline));

/*
* Auto-generated copy method.
*/
static __inline__ void
copy_tw_pniels (
struct tw_pniels_t* a,
const struct tw_pniels_t* ds
) __attribute__((unused,always_inline));

/*
* Returns 1/sqrt(+- x).
*
* The Legendre symbol of the result is the same as that of the
* input.
*
* If x=0, returns 0.
*/
void
p448_isr (
struct p448_t* a,
const struct p448_t* x
);

/*
* Returns 1/x.
*
* If x=0, returns 0.
*/
void
p448_inverse (
struct p448_t* a,
const struct p448_t* x
);

/*
* Add two points on a twisted Edwards curve, one in Extensible form
* and the other in half-Niels form.
*/
void
p448_tw_extensible_add_niels (
struct tw_extensible_t* d,
const struct tw_niels_t* e
);

/*
* Add two points on a twisted Edwards curve, one in Extensible form
* and the other in projective Niels form.
*/
void
p448_tw_extensible_add_pniels (
struct tw_extensible_t* e,
const struct tw_pniels_t* a
);

/*
* Double a point on a twisted Edwards curve, in "extensible" coordinates.
*/
void
p448_tw_extensible_double (
struct tw_extensible_t* a
);

/*
* Double a point on an Edwards curve, in "extensible" coordinates.
*/
void
p448_extensible_double (
struct extensible_t* a
);

/*
* 4-isogeny from untwisted to twisted.
*/
void
p448_isogeny_un_to_tw (
struct tw_extensible_t* b,
const struct extensible_t* a
);

/*
* Dual 4-isogeny from twisted to untwisted.
*/
void
p448_isogeny_tw_to_un (
struct extensible_t* b,
const struct tw_extensible_t* a
);

void
convert_tw_affine_to_tw_pniels (
struct tw_pniels_t* b,
const struct tw_affine_t* a
);

void
convert_tw_affine_to_tw_extensible (
struct tw_extensible_t* b,
const struct tw_affine_t* a
);

void
convert_affine_to_extensible (
struct extensible_t* b,
const struct affine_t* a
);

void
convert_tw_extensible_to_tw_pniels (
struct tw_pniels_t* b,
const struct tw_extensible_t* a
);

void
convert_tw_pniels_to_tw_extensible (
struct tw_extensible_t* e,
const struct tw_pniels_t* d
);

void
convert_tw_niels_to_tw_extensible (
struct tw_extensible_t* e,
const struct tw_niels_t* d
);

void
p448_montgomery_step (
struct montgomery_t* a
);

void
p448_montgomery_serialize (
struct p448_t* sign,
struct p448_t* ser,
const struct montgomery_t* a,
const struct p448_t* sbz
);

/*
* Serialize a point on an Edwards curve
* The serialized form would be sqrt((z-y)/(z+y)) with sign of xz
* It would be on 4y^2/(1-d) = x^3 + 2(1+d)/(1-d) * x^2 + x.
* But 4/(1-d) isn't square, so we need to twist it:
* -x is on 4y^2/(d-1) = x^3 + 2(d+1)/(d-1) * x^2 + x
*/
void
extensible_serialize (
struct p448_t* b,
const struct extensible_t* a
);

/*
*
*/
void
isogeny_and_serialize (
struct p448_t* b,
const struct tw_extensible_t* a
);

/*
* Deserialize a point to an untwisted affine curve
*/
mask_t
affine_deserialize (
struct affine_t* a,
const struct p448_t* sz
);

void
set_identity_extensible (
struct extensible_t* a
);

void
set_identity_tw_extensible (
struct tw_extensible_t* a
);

void
set_identity_affine (
struct affine_t* a
);

mask_t
eq_affine (
const struct affine_t* a,
const struct affine_t* b
);

mask_t
eq_extensible (
const struct extensible_t* a,
const struct extensible_t* b
);

mask_t
eq_tw_extensible (
const struct tw_extensible_t* a,
const struct tw_extensible_t* b
);

void
elligator_2s_inject (
struct affine_t* a,
const struct p448_t* r
);

mask_t
p448_affine_validate (
const struct affine_t* a
);

/*
* Check the invariants for struct tw_extensible_t.
* PERF: This function was automatically generated
* with no regard for speed.
*/
mask_t
p448_tw_extensible_validate (
const struct tw_extensible_t* ext
);


void
copy_affine (
struct affine_t* a,
const struct affine_t* ds
) {
p448_copy ( &a->x, &ds->x );
p448_copy ( &a->y, &ds->y );
}

void
copy_tw_affine (
struct tw_affine_t* a,
const struct tw_affine_t* ds
) {
p448_copy ( &a->x, &ds->x );
p448_copy ( &a->y, &ds->y );
}

void
copy_montgomery (
struct montgomery_t* a,
const struct montgomery_t* ds
) {
p448_copy ( &a->z0, &ds->z0 );
p448_copy ( &a->xd, &ds->xd );
p448_copy ( &a->zd, &ds->zd );
p448_copy ( &a->xa, &ds->xa );
p448_copy ( &a->za, &ds->za );
}

void
copy_extensible (
struct extensible_t* a,
const struct extensible_t* ds
) {
p448_copy ( &a->x, &ds->x );
p448_copy ( &a->y, &ds->y );
p448_copy ( &a->z, &ds->z );
p448_copy ( &a->t, &ds->t );
p448_copy ( &a->u, &ds->u );
}

void
copy_tw_extensible (
struct tw_extensible_t* a,
const struct tw_extensible_t* ds
) {
p448_copy ( &a->x, &ds->x );
p448_copy ( &a->y, &ds->y );
p448_copy ( &a->z, &ds->z );
p448_copy ( &a->t, &ds->t );
p448_copy ( &a->u, &ds->u );
}

void
copy_tw_niels (
struct tw_niels_t* a,
const struct tw_niels_t* ds
) {
p448_copy ( &a->a, &ds->a );
p448_copy ( &a->b, &ds->b );
p448_copy ( &a->c, &ds->c );
}

void
copy_tw_pniels (
struct tw_pniels_t* a,
const struct tw_pniels_t* ds
) {
copy_tw_niels( &a->n, &ds->n );
p448_copy ( &a->z, &ds->z );
}



#ifdef __cplusplus
}; /* extern "C" */
#endif

#endif /* __CC_INCLUDED_P448_EDWARDS_H__ */

+ 3
- 0
exported.sym View File

@@ -0,0 +1,3 @@
_goldilocks_init
_goldilocks_keygen
_goldilocks_shared_secret

+ 168
- 0
goldilocks.c View File

@@ -0,0 +1,168 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#include "goldilocks.h"
#include "ec_point.h"
#include "scalarmul.h"
#include "barrett_field.h"
#include "crandom.h"

#ifndef GOLDILOCKS_RANDOM_INIT_FILE
#define GOLDILOCKS_RANDOM_INIT_FILE "/dev/urandom"
#endif

#ifndef GOLDILOCKS_RANDOM_RESEED_INTERVAL
#define GOLDILOCKS_RANDOM_RESEED_INTERVAL 10000
#endif

/* We'll check it ourselves */
#ifndef GOLDILOCKS_RANDOM_RESEEDS_MANDATORY
#define GOLDILOCKS_RANDOM_RESEEDS_MANDATORY 0
#endif

/* TODO: word size; precompute */
const struct affine_t goldilocks_base_point = {
{{ 0xf0de840aed939full, 0xc170033f4ba0c7ull, 0xf3932d94c63d96ull, 0x9cecfa96147eaaull,
0x5f065c3c59d070ull, 0x3a6a26adf73324ull, 0x1b4faff4609845ull, 0x297ea0ea2692ffull
}},
{{ 19, 0, 0, 0, 0, 0, 0, 0 }}
};

// FIXME: threading
// TODO: autogen instead of init
struct {
struct tw_niels_t combs[80];
struct tw_niels_t wnafs[32];
struct crandom_state_t rand;
} goldilocks_global;

int
goldilocks_init() {
struct extensible_t ext;
struct tw_extensible_t text;
/* Sanity check: the base point is on the curve. */
assert(p448_affine_validate(&goldilocks_base_point));
/* Convert it to twisted Edwards. */
convert_affine_to_extensible(&ext, &goldilocks_base_point);
p448_isogeny_un_to_tw(&text, &ext);
/* Precompute the tables. */
precompute_for_combs(goldilocks_global.combs, &text, 5, 5, 18);
precompute_for_wnaf(goldilocks_global.wnafs, &text, 5);
return crandom_init_from_file(&goldilocks_global.rand,
GOLDILOCKS_RANDOM_INIT_FILE,
GOLDILOCKS_RANDOM_RESEED_INTERVAL,
GOLDILOCKS_RANDOM_RESEEDS_MANDATORY);
}

// TODO: move to a better place
// TODO: word size
void
p448_serialize(uint8_t *serial, const struct p448_t *x) {
int i,j;
p448_t red;
p448_copy(&red, x);
p448_strong_reduce(&red);
for (i=0; i<8; i++) {
for (j=0; j<7; j++) {
serial[7*i+j] = red.limb[i];
red.limb[i] >>= 8;
}
assert(red.limb[i] == 0);
}
}

void
q448_serialize(uint8_t *serial, const word_t x[7]) {
int i,j;
for (i=0; i<7; i++) {
for (j=0; j<8; j++) {
serial[8*i+j] = x[i]>>(8*j);
}
}
}

mask_t
q448_deserialize(word_t x[7], const uint8_t serial[56]) {
int i,j;
for (i=0; i<7; i++) {
word_t out = 0;
for (j=0; j<8; j++) {
out |= ((word_t)serial[8*i+j])<<(8*j);
}
x[i] = out;
}
// TODO: check for reduction
return MASK_SUCCESS;
}

mask_t
p448_deserialize(p448_t *x, const uint8_t serial[56]) {
int i,j;
for (i=0; i<8; i++) {
word_t out = 0;
for (j=0; j<7; j++) {
out |= ((word_t)serial[7*i+j])<<(8*j);
}
x->limb[i] = out;
}
// TODO: check for reduction
return MASK_SUCCESS;
}

static word_t
q448_lo[4] = {
0xdc873d6d54a7bb0dull,
0xde933d8d723a70aaull,
0x3bb124b65129c96full,
0x000000008335dc16ull
};

int
goldilocks_keygen(
uint8_t private[56],
uint8_t public[56]
) {
// TODO: check for init. Also maybe take CRANDOM object? API...
word_t sk[448*2/WORD_BITS];
struct tw_extensible_t exta;
struct p448_t pk;
int ret = crandom_generate(&goldilocks_global.rand, (unsigned char *)sk, sizeof(sk));
barrett_reduce(sk,sizeof(sk)/sizeof(sk[0]),0,q448_lo,7,4,62); // TODO word size
q448_serialize(private, sk);
edwards_comb(&exta, sk, goldilocks_global.combs, 5, 5, 18);
isogeny_and_serialize(&pk, &exta);
p448_serialize(public, &pk);
return ret;
}

int
goldilocks_shared_secret(
uint8_t shared[56],
const uint8_t private[56],
const uint8_t public[56]
) {
// TODO: SHA
word_t sk[448/WORD_BITS];
struct p448_t pk;
mask_t succ = p448_deserialize(&pk,public);
succ &= q448_deserialize(sk,private);
succ &= p448_montgomery_ladder(&pk,&pk,sk,446,2);
p448_serialize(shared,&pk);
// TODO: hash
if (succ) {
return 0;
} else {
return -1;
}
}

+ 34
- 0
goldilocks.h View File

@@ -0,0 +1,34 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

#ifndef __GOLDILOCKS_H__
#define __GOLDILOCKS_H__ 1

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

int
goldilocks_init();

int
goldilocks_keygen(
uint8_t private[56],
uint8_t public[56]
);

int
goldilocks_shared_secret(
uint8_t shared[56],
const uint8_t private[56],
const uint8_t public[56]
);

#ifdef __cplusplus
}; /* extern "C" */
#endif

#endif /* __GOLDILOCKS_H__ */

+ 177
- 0
intrinsics.h View File

@@ -0,0 +1,177 @@
/* Copyright (c) 2011 Stanford University.
* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

/* cRandom intrinsics header. */

#ifndef __CRANDOM_INTRINSICS_H__
#define __CRANDOM_INTRINSICS_H__ 1

#include <sys/types.h>

#include <immintrin.h>

#define INTRINSIC \
static __inline__ __attribute__((__gnu_inline__, __always_inline__))

#define GEN 1
#define SSE2 2
#define SSSE3 4
#define AESNI 8
#define XOP 16
#define AVX 32
#define AVX2 64

INTRINSIC u_int64_t rdtsc() {
u_int64_t out = 0;
# if (defined(__i386__) || defined(__x86_64__))
__asm__ __volatile__ ("rdtsc" : "=A"(out));
# endif
return out;
}

INTRINSIC u_int64_t opacify(u_int64_t x) {
__asm__ volatile("mov %0, %0" : "+r"(x));
return x;
}

#ifdef __AVX2__
# define MIGHT_HAVE_AVX2 1
# ifndef MUST_HAVE_AVX2
# define MUST_HAVE_AVX2 0
# endif
#else
# define MIGHT_HAVE_AVX2 0
# define MUST_HAVE_AVX2 0
#endif

#ifdef __AVX__
# define MIGHT_HAVE_AVX 1
# ifndef MUST_HAVE_AVX
# define MUST_HAVE_AVX MUST_HAVE_AVX2
# endif
#else
# define MIGHT_HAVE_AVX 0
# define MUST_HAVE_AVX 0
#endif

#ifdef __SSSE3__
# define MIGHT_HAVE_SSSE3 1
# ifndef MUST_HAVE_SSSE3
# define MUST_HAVE_SSSE3 MUST_HAVE_AVX
# endif
#else
# define MIGHT_HAVE_SSSE3 0
# define MUST_HAVE_SSSE3 0
#endif

#ifdef __SSE2__
# define MIGHT_HAVE_SSE2 1
# ifndef MUST_HAVE_SSE2
# define MUST_HAVE_SSE2 MUST_HAVE_SSSE3
# endif
typedef __m128i ssereg;
# define pslldq _mm_slli_epi32
# define pshufd _mm_shuffle_epi32

INTRINSIC ssereg sse2_rotate(int r, ssereg a) {
return _mm_slli_epi32(a, r) ^ _mm_srli_epi32(a, 32-r);
}

#else
# define MIGHT_HAVE_SSE2 0
# define MUST_HAVE_SSE2 0
#endif

#ifdef __AES__
/* don't include intrinsics file, because not all platforms have it */
# define MIGHT_HAVE_AESNI 1
# ifndef MUST_HAVE_AESNI
# define MUST_HAVE_AESNI 0
# endif

INTRINSIC ssereg aeskeygenassist(int rc, ssereg x) {
ssereg out;
__asm__("aeskeygenassist %2, %1, %0" : "=x"(out) : "x"(x), "g"(rc));
return out;
}

INTRINSIC ssereg aesenc(ssereg subkey, ssereg block) {
ssereg out = block;
__asm__("aesenc %1, %0" : "+x"(out) : "x"(subkey));
return out;
}

INTRINSIC ssereg aesenclast(ssereg subkey, ssereg block) {
ssereg out = block;
__asm__("aesenclast %1, %0" : "+x"(out) : "x"(subkey));
return out;
}

#else
# define MIGHT_HAVE_AESNI 0
# define MUST_HAVE_AESNI 0
#endif

#ifdef __XOP__
/* don't include intrinsics file, because not all platforms have it */
# define MIGHT_HAVE_XOP 1
# ifndef MUST_HAVE_XOP
# define MUST_HAVE_XOP 0
# endif
INTRINSIC ssereg xop_rotate(int amount, ssereg x) {
ssereg out;
__asm__ ("vprotd %1, %2, %0" : "=x"(out) : "x"(x), "g"(amount));
return out;
}
#else
# define MIGHT_HAVE_XOP 0
# define MUST_HAVE_XOP 0
#endif

#define MIGHT_MASK \
( SSE2 * MIGHT_HAVE_SSE2 \
| SSSE3 * MIGHT_HAVE_SSSE3 \
| AESNI * MIGHT_HAVE_AESNI \
| XOP * MIGHT_HAVE_XOP \
| AVX * MIGHT_HAVE_AVX \
| AVX2 * MIGHT_HAVE_AVX2)

#define MUST_MASK \
( SSE2 * MUST_HAVE_SSE2 \
| SSSE3 * MUST_HAVE_SSSE3 \
| AESNI * MUST_HAVE_AESNI \
| XOP * MUST_HAVE_XOP \
| AVX * MUST_HAVE_AVX \
| AVX2 * MUST_HAVE_AVX2 )

#define MIGHT_HAVE(feature) ((MIGHT_MASK & feature) == feature)
#define MUST_HAVE(feature) ((MUST_MASK & feature) == feature)

#ifdef __cplusplus
# define extern_c extern "C"
#else
# define extern_c
#endif

extern_c
unsigned int crandom_detect_features();

#ifndef likely
# define likely(x) __builtin_expect((x),1)
# define unlikely(x) __builtin_expect((x),0)
#endif

extern volatile unsigned int crandom_features;
INTRINSIC int HAVE(unsigned int feature) {
unsigned int features;
if (!MIGHT_HAVE(feature)) return 0;
if (MUST_HAVE(feature)) return 1;
features = crandom_features;
if (unlikely(!features))
crandom_features = features = crandom_detect_features();
return likely((features & feature) == feature);
}

#endif /* __CRANDOM_INTRINSICS_H__ */

+ 387
- 0
p448.c View File

@@ -0,0 +1,387 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

#include "p448.h"
#include "x86-64-arith.h"

void p448_mul(p448_t *__restrict__ cs, const p448_t *as, const p448_t *bs) {
const uint64_t *a = as->limb, *b = bs->limb;
uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum1 = 0, accum2;
uint64_t mask = (1ull<<56) - 1;
uint64_t aa[4], bb[4];
/* For some reason clang doesn't vectorize this without prompting? */
unsigned int i;
for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];
}
/*
for (int i=0; i<4; i++) {
aa[i] = a[i] + a[i+4];
bb[i] = b[i] + b[i+4];
}
*/
accum2 = widemul(&a[0],&b[3]);
accum1 = widemul(&aa[0],&bb[3]);
accum0 = widemul(&a[4],&b[7]);
mac(&accum2, &a[1], &b[2]);
mac(&accum1, &aa[1], &bb[2]);
mac(&accum0, &a[5], &b[6]);
mac(&accum2, &a[2], &b[1]);
mac(&accum1, &aa[2], &bb[1]);
mac(&accum0, &a[6], &b[5]);
mac(&accum2, &a[3], &b[0]);
mac(&accum1, &aa[3], &bb[0]);
mac(&accum0, &a[7], &b[4]);
accum1 -= accum2;
accum0 += accum2;
c[3] = ((uint64_t)(accum0)) & mask;
c[7] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
{
accum2 = accum1;
accum1 += accum0;
accum0 = accum2;
}
accum2 = widemul(&a[0],&b[0]);
accum1 -= accum2;
accum0 += accum2;
accum2 = widemul(&aa[1],&bb[3]);
msb(&accum0, &a[1], &b[3]);
mac(&accum1, &a[5], &b[7]);
msb(&accum0, &a[2], &b[2]);
mac(&accum2, &aa[2], &bb[2]);
mac(&accum1, &a[6], &b[6]);
msb(&accum0, &a[3], &b[1]);
mac(&accum1, &a[7], &b[5]);
mac(&accum2, &aa[3], &bb[1]);
accum0 += accum2;
accum1 += accum2;
mac(&accum0, &a[4], &b[4]);
mac(&accum1, &aa[0], &bb[0]);
c[0] = ((uint64_t)(accum0)) & mask;
c[4] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(&aa[2],&bb[3]);
msb(&accum0, &a[2], &b[3]);
mac(&accum1, &a[6], &b[7]);
mac(&accum2, &aa[3], &bb[2]);
msb(&accum0, &a[3], &b[2]);
mac(&accum1, &a[7], &b[6]);
accum1 += accum2;
accum0 += accum2;
accum2 = widemul(&a[0],&b[1]);
mac(&accum1, &aa[0], &bb[1]);
mac(&accum0, &a[4], &b[5]);
mac(&accum2, &a[1], &b[0]);
mac(&accum1, &aa[1], &bb[0]);
mac(&accum0, &a[5], &b[4]);
accum1 -= accum2;
accum0 += accum2;
c[1] = ((uint64_t)(accum0)) & mask;
c[5] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(&aa[3],&bb[3]);
msb(&accum0, &a[3], &b[3]);
mac(&accum1, &a[7], &b[7]);
accum1 += accum2;
accum0 += accum2;
accum2 = widemul(&a[0],&b[2]);
mac(&accum1, &aa[0], &bb[2]);
mac(&accum0, &a[4], &b[6]);
mac(&accum2, &a[1], &b[1]);
mac(&accum1, &aa[1], &bb[1]);
mac(&accum0, &a[5], &b[5]);
mac(&accum2, &a[2], &b[0]);
mac(&accum1, &aa[2], &bb[0]);
mac(&accum0, &a[6], &b[4]);
accum1 -= accum2;
accum0 += accum2;
c[2] = ((uint64_t)(accum0)) & mask;
c[6] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum0 += c[3];
accum1 += c[7];
c[3] = ((uint64_t)(accum0)) & mask;
c[7] = ((uint64_t)(accum1)) & mask;
/* we could almost stop here, but it wouldn't be stable, so... */
accum0 >>= 56;
accum1 >>= 56;
c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
c[0] += ((uint64_t)(accum1));
}

void p448_mulw(p448_t *__restrict__ cs, const p448_t *as, uint64_t b) {
const uint64_t *a = as->limb;
uint64_t *c = cs->limb;
__uint128_t accum0, accum4;
uint64_t mask = (1ull<<56) - 1;
accum0 = widemul_rm(b, &a[0]);
accum4 = widemul_rm(b, &a[4]);
c[0] = accum0 & mask; accum0 >>= 56;
c[4] = accum4 & mask; accum4 >>= 56;
mac_rm(&accum0, b, &a[1]);
mac_rm(&accum4, b, &a[5]);
c[1] = accum0 & mask; accum0 >>= 56;
c[5] = accum4 & mask; accum4 >>= 56;
mac_rm(&accum0, b, &a[2]);
mac_rm(&accum4, b, &a[6]);
c[2] = accum0 & mask; accum0 >>= 56;
c[6] = accum4 & mask; accum4 >>= 56;
mac_rm(&accum0, b, &a[3]);
mac_rm(&accum4, b, &a[7]);
c[3] = accum0 & mask; accum0 >>= 56;
c[7] = accum4 & mask; accum4 >>= 56;
c[4] += accum0 + accum4;
c[0] += accum4;
}

void p448_sqr(p448_t *__restrict__ cs, const p448_t *as) {
const uint64_t *a = as->limb;
uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum1 = 0, accum2;
uint64_t mask = (1ull<<56) - 1;
uint64_t aa[4];
/* For some reason clang doesn't vectorize this without prompting? */
unsigned int i;
for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
}
accum2 = widemul(&a[0],&a[3]);
accum1 = widemul(&aa[0],&aa[3]);
accum0 = widemul(&a[4],&a[7]);
mac(&accum2, &a[1], &a[2]);
mac(&accum1, &aa[1], &aa[2]);
mac(&accum0, &a[5], &a[6]);
accum1 -= accum2;
accum0 += accum2;
c[3] = ((uint64_t)(accum0))<<1 & mask;
c[7] = ((uint64_t)(accum1))<<1 & mask;
accum0 >>= 55;
accum1 >>= 55;
{
accum2 = accum1;
accum1 += accum0;
accum0 = accum2;
}
accum2 = widemul(&a[0],&a[0]);
accum1 -= accum2;
accum0 += accum2;
accum2 = widemul2(&aa[1],&aa[3]);
msb2(&accum0, &a[1], &a[3]);
mac2(&accum1, &a[5], &a[7]);
msb(&accum0, &a[2], &a[2]);
mac(&accum2, &aa[2], &aa[2]);
mac(&accum1, &a[6], &a[6]);
accum0 += accum2;
accum1 += accum2;
mac(&accum0, &a[4], &a[4]);
mac(&accum1, &aa[0], &aa[0]);
c[0] = ((uint64_t)(accum0)) & mask;
c[4] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul2(&aa[2],&aa[3]);
msb2(&accum0, &a[2], &a[3]);
mac2(&accum1, &a[6], &a[7]);
accum1 += accum2;
accum0 += accum2;
accum2 = widemul2(&a[0],&a[1]);
mac2(&accum1, &aa[0], &aa[1]);
mac2(&accum0, &a[4], &a[5]);
accum1 -= accum2;
accum0 += accum2;
c[1] = ((uint64_t)(accum0)) & mask;
c[5] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(&aa[3],&aa[3]);
msb(&accum0, &a[3], &a[3]);
mac(&accum1, &a[7], &a[7]);
accum1 += accum2;
accum0 += accum2;
accum2 = widemul2(&a[0],&a[2]);
mac2(&accum1, &aa[0], &aa[2]);
mac2(&accum0, &a[4], &a[6]);
mac(&accum2, &a[1], &a[1]);
mac(&accum1, &aa[1], &aa[1]);
mac(&accum0, &a[5], &a[5]);
accum1 -= accum2;
accum0 += accum2;
c[2] = ((uint64_t)(accum0)) & mask;
c[6] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum0 += c[3];
accum1 += c[7];
c[3] = ((uint64_t)(accum0)) & mask;
c[7] = ((uint64_t)(accum1)) & mask;
/* we could almost stop here, but it wouldn't be stable, so... */
accum0 >>= 56;
accum1 >>= 56;
c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
c[0] += ((uint64_t)(accum1));
}

static __inline__ void p448_sqr_inplace(p448_t *x) {
p448_t y;
p448_sqr(&y,x);
*x = y;
}

static __inline__ void p448_mul_inplace(p448_t *x, const p448_t *z) {
p448_t y;
p448_mul(&y,x,z);
*x = y;
}

static __inline__ void p448_repunit(p448_t *x, int space, int teeth) {
int i,j;
p448_t working = *x;
for (i=0; i<teeth-1; i++) {
for (j=0; j<space-(i?0:1); j++)
p448_sqr_inplace(&working);
if (i==teeth-2)
p448_mul_inplace(x,&working);
else
p448_mul_inplace(&working,x);
}
}

void
p448_strong_reduce(p448_t *a) {
uint64_t mask = (1ull<<56)-1;
/* first, clear high */
a->limb[4] += a->limb[7]>>56;
a->limb[0] += a->limb[7]>>56;
a->limb[7] &= mask;
/* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */
/* compute total_value - p. No need to reduce mod p. */
__int128_t scarry = 0;
int i;
for (i=0; i<8; i++) {
scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask);
a->limb[i] = scarry & mask;
scarry >>= 56;
}
/* uncommon case: it was >= p, so now scarry = 0 and this = x
* common case: it was < p, so now scarry = -1 and this = x - p + 2^448
* so let's add back in p. will carry back off the top for 2^448.
*/
assert(is_zero(scarry) | is_zero(scarry+1));

uint64_t scarry_mask = scarry & mask;
__uint128_t carry = 0;
/* add it back */
for (i=0; i<8; i++) {
carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask);
a->limb[i] = carry & mask;
carry >>= 56;
}
assert(is_zero(carry + scarry));
}

mask_t p448_is_zero(const struct p448_t *a) {
struct p448_t b;
p448_copy(&b,a);
p448_strong_reduce(&b);

uint64_t any = 0;
int i;
for (i=0; i<8; i++) {
any |= b.limb[i];
}
return is_zero(any);
}

+ 242
- 0
p448.h View File

@@ -0,0 +1,242 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#ifndef __P448_H__
#define __P448_H__ 1

#include <stdint.h>
#include <assert.h>

#include "word.h"

typedef struct p448_t {
uint64_t limb[8];
} __attribute__((aligned(32))) p448_t;

#ifdef __cplusplus
extern "C" {
#endif

static __inline__ void
p448_set_ui(p448_t *out,
uint64_t x)
__attribute__((unused,always_inline));
static __inline__ void
p448_cond_swap(p448_t *a,
p448_t *b,
mask_t do_swap)
__attribute__((unused,always_inline));

static __inline__ void
p448_add(p448_t *out,
const p448_t *a,
const p448_t *b)
__attribute__((unused,always_inline));
static __inline__ void
p448_sub(p448_t *out,
const p448_t *a,
const p448_t *b)
__attribute__((unused,always_inline));
static __inline__ void
p448_neg(p448_t *out,
const p448_t *a)
__attribute__((unused,always_inline));
static __inline__ void
p448_cond_neg(p448_t *a,
mask_t doNegate)
__attribute__((unused,always_inline));

static __inline__ void
p448_addw(p448_t *a,
uint64_t x)
__attribute__((unused,always_inline));
static __inline__ void
p448_subw(p448_t *a,
uint64_t x)
__attribute__((unused,always_inline));
static __inline__ void
p448_copy(p448_t *out, const p448_t *a)
__attribute__((unused,always_inline));
static __inline__ void
p448_weak_reduce(p448_t *inout)
__attribute__((unused,always_inline));
void
p448_strong_reduce(p448_t *inout);

mask_t
p448_is_zero(const p448_t *in);
static __inline__ void
p448_bias(p448_t *inout, int amount)
__attribute__((unused,always_inline));
void
p448_mul(p448_t *__restrict__ out,
const p448_t *a,
const p448_t *b);

void
p448_mulw(p448_t *__restrict__ out,
const p448_t *a,
uint64_t b);

void
p448_sqr(p448_t *__restrict__ out,
const p448_t *a);
static __inline__ void
p448_sqrn(p448_t *__restrict__ y, const p448_t *x, int n)
__attribute__((unused,always_inline));

void
p448_set_ui(p448_t *out,
uint64_t x) {
int i;
out->limb[0] = x;
for (i=1; i<8; i++) {
out->limb[i] = 0;
}
}
void
p448_cond_swap(p448_t *a, p448_t *b, mask_t doswap) {
big_register_t *aa = (big_register_t*)a;
big_register_t *bb = (big_register_t*)b;
big_register_t m = doswap;
unsigned int i;
for (i=0; i<sizeof(*a)/sizeof(*aa); i++) {
big_register_t x = m & (aa[i]^bb[i]);
aa[i] ^= x;
bb[i] ^= x;
}
}

void
p448_add(p448_t *out, const p448_t *a, const p448_t *b) {
unsigned int i;
for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)b)[i];
}
/*
unsigned int i;
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
out->limb[i] = a->limb[i] + b->limb[i];
}
*/
}

void
p448_sub(p448_t *out, const p448_t *a, const p448_t *b) {
unsigned int i;
for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] - ((const uint64xn_t*)b)[i];
}
/*
unsigned int i;
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
out->limb[i] = a->limb[i] - b->limb[i];
}
*/
}

void
p448_neg(p448_t *out, const p448_t *a) {
unsigned int i;
for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
((uint64xn_t*)out)[i] = -((const uint64xn_t*)a)[i];
}
/*
unsigned int i;
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
out->limb[i] = -a->limb[i];
}
*/
}

void
p448_cond_neg(
p448_t *a,
mask_t doNegate
) {
unsigned int i;
struct p448_t negated;
big_register_t *aa = (big_register_t *)a;
big_register_t *nn = (big_register_t*)&negated;
big_register_t m = doNegate;
p448_neg(&negated, a);
p448_bias(&negated, 2);
for (i=0; i<sizeof(*a)/sizeof(*aa); i++) {
aa[i] = (aa[i] & ~m) | (nn[i] & m);
}
}

void
p448_addw(p448_t *a, uint64_t x) {
a->limb[0] += x;
}
void
p448_subw(p448_t *a, uint64_t x) {
a->limb[0] -= x;
}

void
p448_copy(p448_t *out, const p448_t *a) {
*out = *a;
}

void
p448_bias(p448_t *a, int amt) {
uint64_t co1 = ((1ull<<56)-1)*amt, co2 = co1-amt;
uint64x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1};
uint64x4_t *aa = (uint64x4_t*) a;
aa[0] += lo;
aa[1] += hi;
}

void
p448_weak_reduce(p448_t *a) {
/* TODO: use pshufb/palignr if anyone cares about speed of this */
uint64_t mask = (1ull<<56) - 1;
uint64_t tmp = a->limb[7] >> 56;
int i;
a->limb[4] += tmp;
for (i=7; i>0; i--) {
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>56);
}
a->limb[0] = (a->limb[0] & mask) + tmp;
}

void p448_sqrn(p448_t *__restrict__ y, const p448_t *x, int n) {
p448_t tmp;
assert(n>0);
if (n&1) {
p448_sqr(y,x);
n--;
} else {
p448_sqr(&tmp,x);
p448_sqr(y,&tmp);
n-=2;
}
for (; n; n-=2) {
p448_sqr(&tmp,y);
p448_sqr(y,&tmp);
}
}

#ifdef __cplusplus
}; /* extern "C" */
#endif

#endif /* __P448_H__ */

+ 728
- 0
scalarmul.c View File

@@ -0,0 +1,728 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#include <stdlib.h>

#include "scalarmul.h"
#include "string.h"
#include "barrett_field.h"

mask_t
p448_montgomery_ladder(
struct p448_t *out,
const struct p448_t *in,
const uint64_t *scalar,
int nbits,
int n_extra_doubles
) {
struct montgomery_t mont;
p448_sqr(&mont.z0,in);
p448_copy(&mont.za,&mont.z0);
p448_set_ui(&mont.xa,1);
p448_set_ui(&mont.zd,0);
p448_set_ui(&mont.xd,1);
int i,j,n=(nbits-1)&63;
mask_t pflip = 0;
for (j=(nbits+63)/64-1; j>=0; j--) {
uint64_t w = scalar[j];
for (i=n; i>=0; i--) {
mask_t flip = -((w>>i)&1);
p448_cond_swap(&mont.xa,&mont.xd,flip^pflip);
p448_cond_swap(&mont.za,&mont.zd,flip^pflip);
p448_montgomery_step(&mont);
pflip = flip;
}
n = 63;
}
p448_cond_swap(&mont.xa,&mont.xd,pflip);
p448_cond_swap(&mont.za,&mont.zd,pflip);
for (j=0; j<n_extra_doubles; j++) {
p448_montgomery_step(&mont);
}
struct p448_t sign;
p448_montgomery_serialize(&sign, out, &mont, in);
p448_addw(&sign,1);
return ~p448_is_zero(&sign);
}

static __inline__ void
niels_cond_negate(
struct tw_niels_t *n,
mask_t doNegate
) {
p448_cond_swap(&n->a, &n->b, doNegate);
p448_cond_neg(&n->c, doNegate); /* TODO: bias amt? */
}

static __inline__ void
pniels_cond_negate(
struct tw_pniels_t *n,
mask_t doNegate
) {
niels_cond_negate(&n->n, doNegate);
}

void
constant_time_lookup_pniels(
struct tw_pniels_t *out,
const struct tw_pniels_t *in,
int nin,
int idx
) {
big_register_t big_one = 1, big_i = idx;
big_register_t *o = (big_register_t *)out;
const big_register_t *i = (const big_register_t *)in;
int j;
unsigned int k;
memset(out, 0, sizeof(*out));
for (j=0; j<nin; j++, big_i-=big_one) {
big_register_t mask = br_is_zero(big_i);
for (k=0; k<sizeof(*out)/sizeof(*o); k++) {
o[k] |= mask & i[k+j*sizeof(*out)/sizeof(*o)];
}
}
}

static __inline__ void
constant_time_lookup_niels(
struct tw_niels_t *out,
const struct tw_niels_t *in,
int nin,
int idx
) {
big_register_t big_one = 1, big_i = idx;
big_register_t *o = (big_register_t *)out;
const big_register_t *i = (const big_register_t *)in;
int j;
unsigned int k;
memset(out, 0, sizeof(*out));
for (j=0; j<nin; j++, big_i-=big_one) {
big_register_t mask = br_is_zero(big_i);
for (k=0; k<sizeof(*out)/sizeof(*o); k++) {
o[k] |= mask & i[k+j*sizeof(*out)/sizeof(*o)];
}
}
}

static void
convert_to_signed_window_form(
word_t *out,
const word_t *scalar,
const word_t *prepared_data,
int nwords
) {
mask_t mask = -(scalar[0]&1);

word_t carry = add_nr_ext_packed(out, scalar, nwords, prepared_data, nwords, ~mask);
carry += add_nr_ext_packed(out, out, nwords, prepared_data+nwords, nwords, mask);
assert(!(out[0]&1));
int i;
for (i=0; i<nwords; i++) {
out[i] >>= 1;
if (i<nwords-1) {
out[i] |= out[i+1]<<(WORD_BITS-1);
} else {
out[i] |= carry<<(WORD_BITS-1);
}
}
}

void
edwards_scalar_multiply(
struct tw_extensible_t *working,
const uint64_t scalar[7]
) {

const int nbits=448; /* HACK? */
word_t prepared_data[14] = {
0x9595b847fdf73126ull,
0x9bb9b8a856af5200ull,
0xb3136e22f37d5c4full,
0x0000000189a19442ull,
0x0000000000000000ull,
0x0000000000000000ull,
0x4000000000000000ull,

0x721cf5b5529eec33ull,
0x7a4cf635c8e9c2abull,
0xeec492d944a725bfull,
0x000000020cd77058ull,
0x0000000000000000ull,
0x0000000000000000ull,
0x0000000000000000ull
}; /* TODO: split off */
uint64_t scalar2[7];
convert_to_signed_window_form(scalar2,scalar,prepared_data,7);

struct tw_extensible_t tabulator;
copy_tw_extensible(&tabulator, working);
p448_tw_extensible_double(&tabulator);

struct tw_pniels_t pn, multiples[8];
convert_tw_extensible_to_tw_pniels(&pn, &tabulator);
convert_tw_extensible_to_tw_pniels(&multiples[0], working);

int i;
for (i=1; i<8; i++) {
p448_tw_extensible_add_pniels(working, &pn);
convert_tw_extensible_to_tw_pniels(&multiples[i], working);
}

i = nbits - 4;
int bits = scalar2[i/64] >> (i%64) & 0xF,
inv = (bits>>3)-1;
bits ^= inv;
constant_time_lookup_pniels(&pn, multiples, 8, bits&7);
pniels_cond_negate(&pn, inv);
convert_tw_pniels_to_tw_extensible(working, &pn);

for (i-=4; i>=0; i-=4) {
p448_tw_extensible_double(working);
p448_tw_extensible_double(working);
p448_tw_extensible_double(working);
p448_tw_extensible_double(working);

bits = scalar2[i/64] >> (i%64) & 0xF;
inv = (bits>>3)-1;
bits ^= inv;
constant_time_lookup_pniels(&pn, multiples, 8, bits&7);
pniels_cond_negate(&pn, inv);
p448_tw_extensible_add_pniels(working, &pn);
}
}


void
edwards_comb(
struct tw_extensible_t *working,
const word_t scalar[7],
const struct tw_niels_t *table,
int n,
int t,
int s
) {
word_t prepared_data[14] = {
0xebec9967f5d3f5c2ull,
0x0aa09b49b16c9a02ull,
0x7f6126aec172cd8eull,
0x00000007b027e54dull,
0x0000000000000000ull,
0x0000000000000000ull,
0x4000000000000000ull,

0xc873d6d54a7bb0cfull,
0xe933d8d723a70aadull,
0xbb124b65129c96fdull,
0x00000008335dc163ull,
0x0000000000000000ull,
0x0000000000000000ull,
0x0000000000000000ull
}; /* TODO: split off. Above is for 450 bits */
word_t scalar2[7];
convert_to_signed_window_form(scalar2,scalar,prepared_data,7);
/* const int n=3, t=5, s=30; */
int i,j,k;
struct tw_niels_t ni;
for (i=0; i<s; i++) {
if (i) p448_tw_extensible_double(working);
for (j=0; j<n; j++) {
int tab = 0;
/*
* PERF: This computation takes about 1.5µs on SBR, i.e. 2-3% of the
* time of a keygen or sign op. Surely it is possible to speed it up.
*/
for (k=0; k<t; k++) {
int bit = (s-1-i) + k*s + j*(s*t);
if (bit < 7*WORD_BITS) {
tab |= (scalar2[bit/WORD_BITS] >> (bit%WORD_BITS) & 1) << k;
}
}
mask_t invert = (tab>>(t-1))-1;
tab ^= invert;
tab &= (1<<(t-1)) - 1;
constant_time_lookup_niels(&ni, table + (j<<(t-1)), 1<<(t-1), tab);
niels_cond_negate(&ni, invert);
if (i||j) {
p448_tw_extensible_add_niels(working, &ni);
} else {
convert_tw_niels_to_tw_extensible(working, &ni);
}
}
}
}

void
simultaneous_invert_p448(
struct p448_t *out,
const struct p448_t *in,
int n
) {
if (!n) return;
p448_copy(&out[1], &in[0]);
int i;
for (i=1; i<n-1; i++) {
p448_mul(&out[i+1], &out[i], &in[i]);
}
p448_mul(&out[0], &out[n-1], &in[n-1]);
struct p448_t tmp;
p448_inverse(&tmp, &out[0]);
p448_copy(&out[0], &tmp);
/* at this point, out[0] = product(in[i]) ^ -1
* out[i] = product(in[0]..in[i-1]) if i != 0
*/
for (i=n-1; i>0; i--) {
p448_mul(&tmp, &out[i], &out[0]);
p448_copy(&out[i], &tmp);
p448_mul(&tmp, &out[0], &in[i]);
p448_copy(&out[0], &tmp);
}
}

mask_t
precompute_for_combs(
struct tw_niels_t *out,
const struct tw_extensible_t *const_base,
int n,
int t,
int s
) {
if (s < 1) return 0;
struct tw_extensible_t working, start;
copy_tw_extensible(&working, const_base);
struct tw_pniels_t pn_tmp;
struct tw_pniels_t *doubles = (struct tw_pniels_t *) malloc(sizeof(*doubles) * (t-1));
struct p448_t *zs = (struct p448_t *) malloc(sizeof(*zs) * (n<<(t-1)));
struct p448_t *zis = (struct p448_t *) malloc(sizeof(*zis) * (n<<(t-1)));
if (!doubles || !zs || !zis) {
free(doubles);
free(zs);
free(zis);
return 0;
}
int i,j,k;
for (i=0; i<n; i++) {

/* doubling phase */
for (j=0; j<t; j++) {
if (j) {
convert_tw_extensible_to_tw_pniels(&pn_tmp, &working);
p448_tw_extensible_add_pniels(&start, &pn_tmp);
} else {
copy_tw_extensible(&start, &working);
}

if (j==t-1 && i==n-1) {
break;
}

p448_tw_extensible_double(&working);
if (j<t-1) {
convert_tw_extensible_to_tw_pniels(&doubles[j], &working);
}

for (k=0; k<s-1; k++) {
p448_tw_extensible_double(&working);
}
}

/* Gray-code phase */
for (j=0;; j++) {
int gray = j ^ (j>>1);
int idx = ((i+1)<<(t-1))-1 ^ gray;

convert_tw_extensible_to_tw_pniels(&pn_tmp, &start);
copy_tw_niels(&out[idx], &pn_tmp.n);
p448_copy(&zs[idx], &pn_tmp.z);
if (j >= (1<<(t-1)) - 1) break;
int delta = (j+1) ^ ((j+1)>>1) ^ gray;

for (k=0; delta>1; k++)
delta >>=1;
if (gray & (1<<k)) {
/* start += doubles[k] */
p448_tw_extensible_add_pniels(&start, &doubles[k]);
} else {
/* start -= doubles[k] */
/* PERF: uncond negate */
copy_tw_pniels(&pn_tmp, &doubles[k]);
pniels_cond_negate(&pn_tmp, -1);
p448_tw_extensible_add_pniels(&start, &pn_tmp);
}
}
}
simultaneous_invert_p448(zis, zs, n<<(t-1));

p448_t product;
for (i=0; i<n<<(t-1); i++) {
p448_mul(&product, &out[i].a, &zis[i]);
p448_strong_reduce(&product);
p448_copy(&out[i].a, &product);
p448_mul(&product, &out[i].b, &zis[i]);
p448_strong_reduce(&product);
p448_copy(&out[i].b, &product);
p448_mul(&product, &out[i].c, &zis[i]);
p448_strong_reduce(&product);
p448_copy(&out[i].c, &product);
}
mask_t ret = ~p448_is_zero(&zis[0]);

free(doubles);
free(zs);
free(zis);

return ret;
}

mask_t
precompute_for_wnaf(
struct tw_niels_t *out,
const struct tw_extensible_t *const_base,
int tbits
) {
int i;
struct p448_t *zs = (struct p448_t *) malloc(sizeof(*zs)<<tbits);
struct p448_t *zis = (struct p448_t *) malloc(sizeof(*zis)<<tbits);

if (!zs || !zis) {
free(zs);
free(zis);
return 0;
}

struct tw_extensible_t base;
copy_tw_extensible(&base,const_base);
struct tw_pniels_t twop, tmp;
convert_tw_extensible_to_tw_pniels(&tmp, &base);
p448_copy(&zs[0], &tmp.z);
copy_tw_niels(&out[0], &tmp.n);

if (tbits > 0) {
p448_tw_extensible_double(&base);
convert_tw_extensible_to_tw_pniels(&twop, &base);
p448_tw_extensible_add_pniels(&base, &tmp);
convert_tw_extensible_to_tw_pniels(&tmp, &base);
p448_copy(&zs[1], &tmp.z);
copy_tw_niels(&out[1], &tmp.n);

for (i=2; i < 1<<tbits; i++) {
p448_tw_extensible_add_pniels(&base, &twop);
convert_tw_extensible_to_tw_pniels(&tmp, &base);
p448_copy(&zs[i], &tmp.z);
copy_tw_niels(&out[i], &tmp.n);
}
}
simultaneous_invert_p448(zis, zs, 1<<tbits);

p448_t product;
for (i=0; i<1<<tbits; i++) {
p448_mul(&product, &out[i].a, &zis[i]);
p448_strong_reduce(&product);
p448_copy(&out[i].a, &product);
p448_mul(&product, &out[i].b, &zis[i]);
p448_strong_reduce(&product);
p448_copy(&out[i].b, &product);
p448_mul(&product, &out[i].c, &zis[i]);
p448_strong_reduce(&product);
p448_copy(&out[i].c, &product);
}

free(zs);
free(zis);

return -1;
}

struct smvt_control {
int power, addend;
};

static int
recode_wnaf(
struct smvt_control *control, /* [nbits/(tableBits+1) + 3] */
const word_t *scalar,
int nbits,
int tableBits)
{
int current = 0, position=0, i;

/* PERF: negate scalar if it's large
* PERF: this is a pretty simplistic algorithm. I'm sure there's a faster one...
*/
for (i=nbits-1; i >= -2 - tableBits; i--) {
int bit = (i >= 0)
? (scalar[i/WORD_BITS] >> (i%WORD_BITS)) & 1
: 0;

current = 2*current + bit;

/*
* Sizing: |current| >= 2^(tableBits+1) -> |current| = 2^0
* So current loses (tableBits+1) bits every time. It otherwise gains
* 1 bit per iteration. The number of iterations is
* (nbits + 2 + tableBits), and an additional control word is added at
* the end. So the total number of control words is at most
* ceil((nbits+1) / (tableBits+1)) + 2 = floor((nbits)/(tableBits+1)) + 2.
* There's also the stopper with power -1, for a total of +3.
*/
if (current >= (2<<tableBits) || current <= -1 - (2<<tableBits)) {
int delta = (current + 1) >> 1;
current = -(current & 1);

int j;
for (j=i; (delta & 1) == 0; j++) {
delta >>= 1;
}
control[position].power = j+1;
control[position].addend = delta;
position++;
assert(position <= nbits/(tableBits+1) + 2);
}
}
control[position].power = -1;
control[position].addend = 0;
return position;
}


static void
prepare_wnaf_table(
struct tw_pniels_t *output,
struct tw_extensible_t *working,
int tbits
) {
convert_tw_extensible_to_tw_pniels(&output[0], working);

if (tbits == 0) return;

p448_tw_extensible_double(working);
struct tw_pniels_t twop;
convert_tw_extensible_to_tw_pniels(&twop, working);

p448_tw_extensible_add_pniels(working, &output[0]);
convert_tw_extensible_to_tw_pniels(&output[1], working);

for (int i=2; i < 1<<tbits; i++) {
p448_tw_extensible_add_pniels(working, &twop);
convert_tw_extensible_to_tw_pniels(&output[i], working);
}
}

int
edwards_scalar_multiply_vt(
struct tw_extensible_t *working,
const uint64_t scalar[7]
) {
/* HACK: not 448? */
const int nbits=448, table_bits = 3;
struct smvt_control control[nbits/(table_bits+1)+3];
int control_bits = recode_wnaf(control, scalar, nbits, table_bits);
struct tw_pniels_t precmp[1<<table_bits];
prepare_wnaf_table(precmp, working, table_bits);
if (control_bits > 0) {
assert(control[0].addend > 0);
assert(control[0].power >= 0);
convert_tw_pniels_to_tw_extensible(working, &precmp[control[0].addend >> 1]);
} else {
set_identity_tw_extensible(working);
return control_bits;
}
int conti = 1, i;
struct tw_pniels_t neg;
for (i = control[0].power - 1; i >= 0; i--) {
p448_tw_extensible_double(working);

if (i == control[conti].power) {
assert(control[conti].addend);

if (control[conti].addend > 0) {
p448_tw_extensible_add_pniels(working, &precmp[control[conti].addend >> 1]);
} else {
/* PERF: uncond negate */
copy_tw_pniels(&neg, &precmp[(-control[conti].addend) >> 1]);
pniels_cond_negate(&neg, -1);
p448_tw_extensible_add_pniels(working, &neg);
}
conti++;
assert(conti <= control_bits);
}
}
return control_bits; /* TODO: don't return anything, this is just for testing */
}

void
edwards_scalar_multiply_vt_pre(
struct tw_extensible_t *working,
const uint64_t scalar[7],
const struct tw_niels_t *precmp,
int table_bits
) {
/* HACK: not 448? */
const int nbits=448;
struct smvt_control control[nbits/(table_bits+1)+3];
int control_bits = recode_wnaf(control, scalar, nbits, table_bits);
if (control_bits > 0) {
assert(control[0].addend > 0);
assert(control[0].power >= 0);
convert_tw_niels_to_tw_extensible(working, &precmp[control[0].addend >> 1]);
} else {
set_identity_tw_extensible(working);
return;
}
int conti = 1, i;
struct tw_niels_t neg;
for (i = control[0].power - 1; i >= 0; i--) {
p448_tw_extensible_double(working);

if (i == control[conti].power) {
assert(control[conti].addend);

if (control[conti].addend > 0) {
p448_tw_extensible_add_niels(working, &precmp[control[conti].addend >> 1]);
} else {
/* PERF: uncond negate */
copy_tw_niels(&neg, &precmp[(-control[conti].addend) >> 1]);
niels_cond_negate(&neg, -1);
p448_tw_extensible_add_niels(working, &neg);
}
conti++;
assert(conti <= control_bits);
}
}
}

int
edwards_combo_var_fixed_vt(
struct tw_extensible_t *working,
const uint64_t scalar_var[7],
const uint64_t scalar_pre[7],
const struct tw_niels_t *precmp,
int table_bits_pre
) {
/* HACK: not 448? */
const int nbits_var=448, nbits_pre=448, table_bits_var = 3;
struct smvt_control control_var[nbits_var/(table_bits_var+1)+3];
struct smvt_control control_pre[nbits_pre/(table_bits_pre+1)+3];
int ncb_var = recode_wnaf(control_var, scalar_var, nbits_var, table_bits_var);
int ncb_pre = recode_wnaf(control_pre, scalar_pre, nbits_pre, table_bits_pre);
(void)ncb_var;
(void)ncb_pre;
struct tw_pniels_t precmp_var[1<<table_bits_var];
prepare_wnaf_table(precmp_var, working, table_bits_var);
int contp=0, contv=0, i;
i = control_var[0].power;
if (i > control_pre[0].power) {
convert_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[0].addend >> 1]);
contv++;
} else if (i == control_pre[0].power && i >=0 ) {
convert_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[0].addend >> 1]);
p448_tw_extensible_add_niels(working, &precmp[control_pre[0].addend >> 1]);
contv++; contp++;
} else {
i = control_pre[0].power;
convert_tw_niels_to_tw_extensible(working, &precmp[control_pre[0].addend >> 1]);
contp++;
}
if (i < 0) {
set_identity_tw_extensible(working);
return ncb_pre;
}
struct tw_pniels_t pneg;
struct tw_niels_t neg;
for (i--; i >= 0; i--) {
p448_tw_extensible_double(working);

if (i == control_var[contv].power) {
assert(control_var[contv].addend);

if (control_var[contv].addend > 0) {
p448_tw_extensible_add_pniels(working, &precmp_var[control_var[contv].addend >> 1]);
} else {
/* PERF: uncond negate */
copy_tw_pniels(&pneg, &precmp_var[(-control_var[contv].addend) >> 1]);
pniels_cond_negate(&pneg, -1);
p448_tw_extensible_add_pniels(working, &pneg);
}
contv++;
}

if (i == control_pre[contp].power) {
assert(control_pre[contp].addend);

if (control_pre[contp].addend > 0) {
p448_tw_extensible_add_niels(working, &precmp[control_pre[contp].addend >> 1]);
} else {
/* PERF: uncond negate */
copy_tw_niels(&neg, &precmp[(-control_pre[contp].addend) >> 1]);
niels_cond_negate(&neg, -1);
p448_tw_extensible_add_niels(working, &neg);
}
contp++;
}
}
assert(contv == ncb_var);
assert(contp == ncb_pre);
return ncb_pre;
}




+ 112
- 0
scalarmul.h View File

@@ -0,0 +1,112 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#ifndef __P448_ALGO_H__
#define __P448_ALGO_H__ 1

#include "ec_point.h"

#ifdef __cplusplus
extern "C" {
#endif

/*
* Out = scalar * in, encoded in inverse square root
* format.
*
* nbits is the number of bits in scalar.
*
* The scalar is to be presented in little-endian form,
* meaning that scalar[0] contains the least significant
* word of the scalar.
*
* If the point "in" is on the curve, the return
* value will be set (to -1).
*
* If the point "in" is not on the curve, then the
* output will be incorrect. If the scalar is even,
* this condition will be detected by returning 0,
* unless the output is the identity point (0; TODO).
* If the scalar is odd, the value returned will be
* set (to -1; TODO).
*
* The input and output points are always even.
* Therefore on a cofactor-4 curve like Goldilocks,
* it is sufficient for security to make the scalar
* even. (TODO: detect when i/o has cofactor?)
*
* This function takes constant time, depending on
* nbits but not on in or scalar.
*/
mask_t
p448_montgomery_ladder(
struct p448_t *out,
const struct p448_t *in,
const uint64_t *scalar,
int nbits,
int n_extra_doubles
);

void
edwards_scalar_multiply(
struct tw_extensible_t *working,
const uint64_t scalar[7]
/* TODO? int nbits */
);
mask_t
precompute_for_combs(
struct tw_niels_t *out,
const struct tw_extensible_t *const_base,
int n,
int t,
int s
);
void
edwards_comb(
struct tw_extensible_t *working,
const word_t scalar[7],
const struct tw_niels_t *table,
int n,
int t,
int s
);

/* TODO: void. int is just for diagnostic purposes. */
int
edwards_scalar_multiply_vt(
struct tw_extensible_t *working,
const uint64_t scalar[7]
);
void
edwards_scalar_multiply_vt_pre(
struct tw_extensible_t *working,
const uint64_t scalar[7],
const struct tw_niels_t *precmp,
int table_bits
);

mask_t
precompute_for_wnaf(
struct tw_niels_t *out,
const struct tw_extensible_t *const_base,
int tbits
); /* TODO: attr don't ignore... */

/* TODO: void. int is just for diagnostic purposes. */
int
edwards_combo_var_fixed_vt(
struct tw_extensible_t *working,
const uint64_t scalar_var[7],
const uint64_t scalar_pre[7],
const struct tw_niels_t *precmp,
int table_bits_pre
);

#ifdef __cplusplus
};
#endif

#endif /* __P448_ALGO_H__ */

+ 55
- 0
word.h View File

@@ -0,0 +1,55 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

#ifndef __WORD_H__
#define __WORD_H__

#include <stdint.h>

typedef uint64_t word_t;
typedef __uint128_t dword_t;
typedef int64_t sword_t;
typedef __int128_t dsword_t;

static const int WORD_BITS = sizeof(word_t) * 8;

/* TODO: vector width for procs like ARM; gcc support */
typedef uint64_t mask_t, vecmask_t __attribute__((ext_vector_type(4)));

static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -1;

/* FIXME this only works on clang */
typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
typedef int64_t int64x2_t __attribute__((ext_vector_type(2)));
typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
typedef int64_t int64x4_t __attribute__((ext_vector_type(4)));
typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
typedef int32_t int32x8_t __attribute__((ext_vector_type(8)));

#if __AVX2__
typedef uint32x8_t big_register_t;
typedef uint64x4_t uint64xn_t;
#elif __SSE2__ || __ARM_NEON__
typedef uint32x4_t big_register_t;
typedef uint64x2_t uint64xn_t;
#elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__
typedef uint64_t big_register_t, uint64xn_t;
#else
typedef uint64_t uint64xn_t;
typedef uint32_t big_register_t;
#endif


#if __AVX2__ || __SSE2__ || __ARM_NEON__
static __inline__ big_register_t
br_is_zero(big_register_t x) {
return (big_register_t)(x == (big_register_t)0);
}
#else
#error "Todo: constant-time equality on vectorless platforms"
#endif

#endif /* __WORD_H__ */

+ 246
- 0
x86-64-arith.h View File

@@ -0,0 +1,246 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

#ifndef __X86_64_ARITH_H__
#define __X86_64_ARITH_H__

#include <stdint.h>

/* TODO: non x86-64 versions of these.
* TODO: autogenerate
*/

static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
#ifndef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rax;"
"mulq %[b];"
: [c]"=a"(c), [d]"=d"(d)
: [b]"m"(*b), [a]"m"(*a)
: "cc");
return (((__uint128_t)(d))<<64) | c;
#else
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx;"
"mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"m"(*b), [a]"m"(*a)
: "rdx");
return (((__uint128_t)(d))<<64) | c;
#endif
}

static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
#ifndef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rax;"
"mulq %[b];"
: [c]"=a"(c), [d]"=d"(d)
: [b]"m"(*b), [a]"r"(a)
: "cc");
return (((__uint128_t)(d))<<64) | c;
#else
uint64_t c,d;
__asm__ volatile
("mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"m"(*b), [a]"d"(a));
return (((__uint128_t)(d))<<64) | c;
#endif
}

static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
#ifndef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rax; "
"addq %%rax, %%rax; "
"mulq %[b];"
: [c]"=a"(c), [d]"=d"(d)
: [b]"m"(*b), [a]"m"(*a)
: "cc");
return (((__uint128_t)(d))<<64) | c;
#else
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx;"
"leaq (,%%rdx,2), %%rdx;"
"mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"m"(*b), [a]"m"(*a)
: "rdx");
return (((__uint128_t)(d))<<64) | c;
#endif
}

static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}

static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"d"(a)
: "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"r"(a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}

static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"addq %%rdx, %%rdx; "
"mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"addq %%rax, %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}

static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"subq %[c], %[lo]; "
"sbbq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"subq %%rax, %[lo]; "
"sbbq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}

static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"addq %%rdx, %%rdx; "
"mulx %[b], %[c], %[d]; "
"subq %[c], %[lo]; "
"sbbq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"addq %%rax, %%rax; "
"mulq %[b]; "
"subq %%rax, %[lo]; "
"sbbq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}

static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t c,d, lo = *acc, hi = *acc>>64;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"subq %[lo], %[c]; "
"sbbq %[hi], %[d]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
*acc = (((__uint128_t)(d))<<64) | c;
}

static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) {
return ((__uint128_t)(a)) * b;
}

static __inline__ __int128_t widemuls(int64_t a, int64_t b) {
return ((__int128_t)(a)) * b;
}
static __inline__ uint64_t opacify(uint64_t x) {
__asm__ volatile("" : "+r"(x));
return x;
}

static __inline__ mask_t is_zero(uint64_t x) {
__asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
return ~x;
}

#endif /* __X86_64_ARITH_H__ */

Loading…
Cancel
Save