Browse Source

some accel in for curve25519

master
Michael Hamburg 9 years ago
parent
commit
42a561d018
7 changed files with 491 additions and 4 deletions
  1. +1
    -1
      Makefile
  2. +3
    -3
      src/decaf_fast.c
  3. +1
    -0
      src/p25519/arch_x86_64/arch_config.h
  4. +275
    -0
      src/p25519/arch_x86_64/p25519.c
  5. +166
    -0
      src/p25519/arch_x86_64/p25519.h
  6. +1
    -0
      src/p25519/arch_x86_64/x86-64-arith.h
  7. +44
    -0
      src/p448/arch_x86_64/x86-64-arith.h

+ 1
- 1
Makefile View File

@@ -19,7 +19,7 @@ ASM ?= $(CC)
DECAF ?= decaf_fast

ifneq (,$(findstring x86_64,$(MACHINE)))
ARCH ?= arch_ref64
ARCH ?= arch_x86_64
else
# no i386 port yet
ARCH ?= arch_ref32


+ 3
- 3
src/decaf_fast.c View File

@@ -361,7 +361,7 @@ decaf_bool_t API_NS(scalar_invert) (
}
return ~API_NS(scalar_eq)(out,API_NS(scalar_zero));
#else
decaf_255_scalar_t b, ma;
scalar_t b, ma;
int i;
sc_montmul(b,API_NS(scalar_one),sc_r2);
sc_montmul(ma,a,sc_r2);
@@ -378,10 +378,10 @@ decaf_bool_t API_NS(scalar_invert) (
}
}

sc_montmul(out,b,decaf_255_scalar_one);
sc_montmul(out,b,API_NS(scalar_one));
API_NS(scalar_destroy)(b);
API_NS(scalar_destroy)(ma);
return ~API_NS(scalar_eq)(out,decaf_255_scalar_zero);
return ~API_NS(scalar_eq)(out,API_NS(scalar_zero));
#endif
}



+ 1
- 0
src/p25519/arch_x86_64/arch_config.h View File

@@ -0,0 +1 @@
#define WORD_BITS 64

+ 275
- 0
src/p25519/arch_x86_64/p25519.c View File

@@ -0,0 +1,275 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/

#include "p25519.h"
#include "x86-64-arith.h"

void
p255_mul (
p255_t *__restrict__ cs,
const p255_t *as,
const p255_t *bs
) {
const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
uint64_t *c = cs->limb;
uint64_t bh[4];
int i;
for (i=0; i<4; i++) bh[i] = b[i+1] * 19;
__uint128_t accum0, accum1, accum2;
uint64_t ai = a[0];
accum0 = widemul_rm(ai, &b[0]);
accum1 = widemul_rm(ai, &b[1]);
accum2 = widemul_rm(ai, &b[2]);
ai = a[1];
mac_rm(&accum0, ai, &bh[3]);
mac_rm(&accum1, ai, &b[0]);
mac_rm(&accum2, ai, &b[1]);
ai = a[2];
mac_rm(&accum0, ai, &bh[2]);
mac_rm(&accum1, ai, &bh[3]);
mac_rm(&accum2, ai, &b[0]);
ai = a[3];
mac_rm(&accum0, ai, &bh[1]);
mac_rm(&accum1, ai, &bh[2]);
mac_rm(&accum2, ai, &bh[3]);
ai = a[4];
mac_rm(&accum0, ai, &bh[0]);
mac_rm(&accum1, ai, &bh[1]);
mac_rm(&accum2, ai, &bh[2]);
uint64_t c0 = accum0 & mask;
accum1 += accum0 >> 51;
uint64_t c1 = accum1 & mask;
accum2 += accum1 >> 51;
c[2] = accum2 & mask;
accum0 = accum2 >> 51;
ai = a[0];
mac_rm(&accum0, ai, &b[3]);
accum1 = widemul_rm(ai, &b[4]);
ai = a[1];
mac_rm(&accum0, ai, &b[2]);
mac_rm(&accum1, ai, &b[3]);
ai = a[2];
mac_rm(&accum0, ai, &b[1]);
mac_rm(&accum1, ai, &b[2]);
ai = a[3];
mac_rm(&accum0, ai, &b[0]);
mac_rm(&accum1, ai, &b[1]);
ai = a[4];
mac_rm(&accum0, ai, &bh[3]);
mac_rm(&accum1, ai, &b[0]);
c[3] = accum0 & mask;
accum1 += accum0 >> 51;
c[4] = accum1 & mask;
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
* = 2^(-13 + <13)
* PERF: good enough to fit into uint64_t?
*/
uint64_t a1 = accum1>>51;
accum1 = (__uint128_t)a1 * 19 + c0;
c[0] = accum1 & mask;
c[1] = c1 + (accum1>>51);
}

void
p255_mulw (
p255_t *__restrict__ cs,
const p255_t *as,
uint64_t b
) {
const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
int i;
uint64_t *c = cs->limb;

__uint128_t accum = 0;
for (i=0; i<5; i++) {
mac_rm(&accum, b, &a[i]);
c[i] = accum & mask;
accum >>= 51;
}
/* PERF: parallelize? eh well this is reference */
accum *= 19;
accum += c[0];
c[0] = accum & mask;
accum >>= 51;
assert(accum < mask);
c[1] += accum;
}

void
p255_sqr (
p255_t *__restrict__ cs,
const p255_t *as
) {
const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
uint64_t *c = cs->limb;
uint64_t ah[4];
int i;
for (i=0; i<4; i++) ah[i] = a[i+1] * 19;
__uint128_t accum0, accum1, accum2;
uint64_t ai = a[0];
accum0 = widemul_rr(ai, ai);
ai *= 2;
accum1 = widemul_rm(ai, &a[1]);
accum2 = widemul_rm(ai, &a[2]);
ai = a[1];
mac_rr(&accum2, ai, ai);
ai *= 2;
mac_rm(&accum0, ai, &ah[3]);
ai = a[2] * 2;
mac_rm(&accum0, ai, &ah[2]);
mac_rm(&accum1, ai, &ah[3]);
ai = a[3];
mac_rm(&accum1, ai, &ah[2]);
ai *= 2;
mac_rm(&accum2, ai, &ah[3]);
uint64_t c0 = accum0 & mask;
accum1 += accum0 >> 51;
uint64_t c1 = accum1 & mask;
accum2 += accum1 >> 51;
c[2] = accum2 & mask;
accum0 = accum2 >> 51;
ai = a[0]*2;
mac_rm(&accum0, ai, &a[3]);
accum1 = widemul_rm(ai, &a[4]);
ai = a[1]*2;
mac_rm(&accum0, ai, &a[2]);
mac_rm(&accum1, ai, &a[3]);
mac_rm(&accum0, a[4], &ah[3]);
mac_rr(&accum1, a[2], a[2]);
c[3] = accum0 & mask;
accum1 += accum0 >> 51;
c[4] = accum1 & mask;
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
* = 2^(-13 + <13)
* PERF: good enough to fit into uint64_t?
*/
uint64_t a1 = accum1>>51;
accum1 = (__uint128_t)a1 * 19 + c0;
c[0] = accum1 & mask;
c[1] = c1 + (accum1>>51);
}

void
p255_strong_reduce (
p255_t *a
) {
uint64_t mask = (1ull<<51)-1;

/* first, clear high */
a->limb[0] += (a->limb[4]>>51)*19;
a->limb[4] &= mask;

/* now the total is less than 2p */

/* compute total_value - p. No need to reduce mod p. */
__int128_t scarry = 0;
int i;
for (i=0; i<5; i++) {
scarry = scarry + a->limb[i] - ((i==0)?mask-18:mask);
a->limb[i] = scarry & mask;
scarry >>= 51;
}

/* uncommon case: it was >= p, so now scarry = 0 and this = x
* common case: it was < p, so now scarry = -1 and this = x - p + 2^255
* so let's add back in p. will carry back off the top for 2^255.
*/

assert(is_zero(scarry) | is_zero(scarry+1));

uint64_t scarry_mask = scarry & mask;
__uint128_t carry = 0;

/* add it back */
for (i=0; i<5; i++) {
carry = carry + a->limb[i] + ((i==0)?(scarry_mask&~18):scarry_mask);
a->limb[i] = carry & mask;
carry >>= 51;
}

assert(is_zero(carry + scarry));
}

void
p255_serialize (
uint8_t serial[32],
const struct p255_t *x
) {
int i,j;
p255_t red;
p255_copy(&red, x);
p255_strong_reduce(&red);
uint64_t *r = red.limb;
uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
for (i=0; i<4; i++) {
for (j=0; j<8; j++) {
serial[8*i+j] = ser64[i];
ser64[i] >>= 8;
}
}
}

mask_t
p255_deserialize (
p255_t *x,
const uint8_t serial[32]
) {
int i,j;
uint64_t ser64[4], mask = ((1ull<<51)-1);
for (i=0; i<4; i++) {
uint64_t out = 0;
for (j=0; j<8; j++) {
out |= ((uint64_t)serial[8*i+j])<<(8*j);
}
ser64[i] = out;
}
/* Test for >= 2^255-19 */
uint64_t ge = -(((__uint128_t)ser64[0]+19)>>64);
ge &= ser64[1];
ge &= ser64[2];
ge &= (ser64[3]<<1) + 1;
ge |= -(((__uint128_t)ser64[3]+0x8000000000000000)>>64);
x->limb[0] = ser64[0] & mask;
x->limb[1] = (ser64[0]>>51 | ser64[1]<<13) & mask;
x->limb[2] = (ser64[1]>>38 | ser64[2]<<26) & mask;
x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask;
x->limb[4] = ser64[3]>>12;
return ~is_zero(~ge);
}

+ 166
- 0
src/p25519/arch_x86_64/p25519.h View File

@@ -0,0 +1,166 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#ifndef __P255_H__
#define __P255_H__ 1

#include <stdint.h>
#include <assert.h>
#include <string.h>

#include "word.h"

typedef struct p255_t {
uint64_t limb[5];
} p255_t;

#define LBITS 51
#define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}

/*
#define FIELD_LITERAL(a,b,c,d) {{ \
(a##ull) & LMASK, \
((a##ull)>>51 | (b##ull)<<13) & LMASK, \
((b##ull)>>38 | (c##ull)<<26) & LMASK, \
((c##ull)>>25 | (d##ull)<<39) & LMASK, \
(d##ull)>>12 \
}}
*/

#ifdef __cplusplus
extern "C" {
#endif

static __inline__ void
p255_add_RAW (
p255_t *out,
const p255_t *a,
const p255_t *b
) __attribute__((unused));
static __inline__ void
p255_sub_RAW (
p255_t *out,
const p255_t *a,
const p255_t *b
) __attribute__((unused));
static __inline__ void
p255_copy (
p255_t *out,
const p255_t *a
) __attribute__((unused));
static __inline__ void
p255_weak_reduce (
p255_t *inout
) __attribute__((unused));
void
p255_strong_reduce (
p255_t *inout
);

static __inline__ void
p255_bias (
p255_t *inout,
int amount
) __attribute__((unused));
void
p255_mul (
p255_t *__restrict__ out,
const p255_t *a,
const p255_t *b
);

void
p255_mulw (
p255_t *__restrict__ out,
const p255_t *a,
uint64_t b
);

void
p255_sqr (
p255_t *__restrict__ out,
const p255_t *a
);

void
p255_serialize (
uint8_t serial[32],
const struct p255_t *x
);

mask_t
p255_deserialize (
p255_t *x,
const uint8_t serial[32]
);

/* -------------- Inline functions begin here -------------- */

void
p255_add_RAW (
p255_t *out,
const p255_t *a,
const p255_t *b
) {
unsigned int i;
for (i=0; i<5; i++) {
out->limb[i] = a->limb[i] + b->limb[i];
}
}

void
p255_sub_RAW (
p255_t *out,
const p255_t *a,
const p255_t *b
) {
unsigned int i;
uint64_t co1 = ((1ull<<51)-1)*2, co2 = co1-36;
for (i=0; i<5; i++) {
out->limb[i] = a->limb[i] - b->limb[i] + ((i==0) ? co2 : co1);
}
}

void
p255_copy (
p255_t *out,
const p255_t *a
) {
memcpy(out,a,sizeof(*a));
}

void
p255_bias (
p255_t *a,
int amt
) {
a->limb[0] += ((uint64_t)(amt)<<52) - 38*amt;
int i;
for (i=1; i<5; i++) {
a->limb[i] += ((uint64_t)(amt)<<52)-2*amt;
}
}

void
p255_weak_reduce (
p255_t *a
) {
uint64_t mask = (1ull<<51) - 1;
uint64_t tmp = a->limb[4] >> 51;
int i;
for (i=4; i>0; i--) {
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>51);
}
a->limb[0] = (a->limb[0] & mask) + tmp*19;
}

#ifdef __cplusplus
}; /* extern "C" */
#endif

#endif /* __P255_H__ */

+ 1
- 0
src/p25519/arch_x86_64/x86-64-arith.h View File

@@ -0,0 +1 @@
../../p448/arch_x86_64/x86-64-arith.h

+ 44
- 0
src/p448/arch_x86_64/x86-64-arith.h View File

@@ -53,6 +53,25 @@ static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
#endif
}

static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) {
#ifndef __BMI2__
uint64_t c,d;
__asm__ volatile
("mulq %[b];"
: [c]"=a"(c), [d]"=d"(d)
: [b]"r"(b), "a"(a)
: "cc");
return (((__uint128_t)(d))<<64) | c;
#else
uint64_t c,d;
__asm__ volatile
("mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"r"(b), [a]"d"(a));
return (((__uint128_t)(d))<<64) | c;
#endif
}

static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
#ifndef __BMI2__
uint64_t c,d;
@@ -163,6 +182,31 @@ static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
*acc = (((__uint128_t)(hi))<<64) | lo;
}

static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"r"(b), [a]"d"(a)
: "cc");
#else
__asm__ volatile
("mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"r"(b), "a"(a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}

static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;


Loading…
Cancel
Save