Browse Source

remove ah = a*19

master
Michael Hamburg 9 years ago
parent
commit
8202c43eba
2 changed files with 53 additions and 58 deletions
  1. +1
    -1
      src/include/decaf_255_config.h
  2. +52
    -57
      src/p25519/arch_x86_64/p25519.c

+ 1
- 1
src/include/decaf_255_config.h View File

@@ -20,7 +20,7 @@
* kilobytes at the cost of perhaps 5-10% overhead in direct scalarmul
* time.
*/
#define DECAF_USE_MONTGOMERY_LADDER 1
#define DECAF_USE_MONTGOMERY_LADDER 0 /* FUTURE */

/** The number of comb tables for fixed base scalarmul. */
#define DECAF_COMBS_N 3


+ 52
- 57
src/p25519/arch_x86_64/p25519.c View File

@@ -14,10 +14,6 @@ p255_mul (
const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
uint64_t *c = cs->limb;
uint64_t bh[4];
int i;
for (i=0; i<4; i++) bh[i] = b[i+1] * 19;
__uint128_t accum0, accum1, accum2;
uint64_t ai = a[0];
@@ -26,24 +22,26 @@ p255_mul (
accum2 = widemul_rm(ai, &b[2]);
ai = a[1];
mac_rm(&accum0, ai, &bh[3]);
mac_rm(&accum1, ai, &b[0]);
mac_rm(&accum2, ai, &b[1]);
mac_rm(&accum1, ai, &b[0]);
ai *= 19;
mac_rm(&accum0, ai, &b[4]);
ai = a[2];
mac_rm(&accum0, ai, &bh[2]);
mac_rm(&accum1, ai, &bh[3]);
mac_rm(&accum2, ai, &b[0]);
ai *= 19;
mac_rm(&accum0, ai, &b[3]);
mac_rm(&accum1, ai, &b[4]);
ai = a[3];
mac_rm(&accum0, ai, &bh[1]);
mac_rm(&accum1, ai, &bh[2]);
mac_rm(&accum2, ai, &bh[3]);
ai = a[3] * 19;
mac_rm(&accum0, ai, &b[2]);
mac_rm(&accum1, ai, &b[3]);
mac_rm(&accum2, ai, &b[4]);
ai = a[4];
mac_rm(&accum0, ai, &bh[0]);
mac_rm(&accum1, ai, &bh[1]);
mac_rm(&accum2, ai, &bh[2]);
ai = a[4] * 19;
mac_rm(&accum0, ai, &b[1]);
mac_rm(&accum1, ai, &b[2]);
mac_rm(&accum2, ai, &b[3]);
uint64_t c0 = accum0 & mask;
accum1 += accum0 >> 51;
@@ -52,6 +50,8 @@ p255_mul (
c[2] = accum2 & mask;
accum0 = accum2 >> 51;

mac_rm(&accum0, ai, &b[4]);
ai = a[0];
mac_rm(&accum0, ai, &b[3]);
@@ -70,7 +70,6 @@ p255_mul (
mac_rm(&accum1, ai, &b[1]);
ai = a[4];
mac_rm(&accum0, ai, &bh[3]);
mac_rm(&accum1, ai, &b[0]);
c[3] = accum0 & mask;
@@ -88,33 +87,6 @@ p255_mul (
c[1] = c1 + (accum1>>51);
}

/**
 * Multiply the limbs of *as by the single 64-bit word b and write the
 * carried result into the limbs of *cs.  Each output limb keeps 51 bits
 * (mask is 2^51 - 1).
 *
 * NOTE(review): mac_rm is defined elsewhere; presumably it performs a
 * 128-bit multiply-accumulate, *acc += x * *y -- confirm against its
 * definition.
 */
void
p255_mulw (
p255_t *__restrict__ cs,
const p255_t *as,
uint64_t b
) {
const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
int i;
uint64_t *c = cs->limb;

__uint128_t accum = 0;
/* Walk the five limbs in order; whatever exceeds 51 bits in one limb
 * is carried into the next iteration through accum. */
for (i=0; i<5; i++) {
mac_rm(&accum, b, &a[i]);
c[i] = accum & mask; /* keep the low 51 bits in this limb */
accum >>= 51; /* the remainder becomes the carry */
}
/* PERF: parallelize? eh well this is reference */
/* Fold the carry out of the last limb back into limb 0, scaled by 19
 * (presumably because 2^255 is congruent to 19 modulo 2^255 - 19 --
 * confirm against the field definition). */
accum *= 19;
accum += c[0];
c[0] = accum & mask;
accum >>= 51;
/* The second carry is added to limb 1 with no further propagation;
 * the assert checks that it is below mask. */
assert(accum < mask);
c[1] += accum;
}

void
p255_sqr (
p255_t *__restrict__ cs,
@@ -123,10 +95,6 @@ p255_sqr (
const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
uint64_t *c = cs->limb;
uint64_t ah[4];
int i;
for (i=0; i<4; i++) ah[i] = a[i+1] * 19;
__uint128_t accum0, accum1, accum2;
uint64_t ai = a[0];
@@ -137,17 +105,17 @@ p255_sqr (
ai = a[1];
mac_rr(&accum2, ai, ai);
ai *= 2;
mac_rm(&accum0, ai, &ah[3]);
ai *= 38;
mac_rm(&accum0, ai, &a[4]);
ai = a[2] * 2;
mac_rm(&accum0, ai, &ah[2]);
mac_rm(&accum1, ai, &ah[3]);
ai = a[2] * 38;
mac_rm(&accum0, ai, &a[3]);
mac_rm(&accum1, ai, &a[4]);
ai = a[3];
mac_rm(&accum1, ai, &ah[2]);
ai = a[3] * 19;
mac_rm(&accum1, ai, &a[3]);
ai *= 2;
mac_rm(&accum2, ai, &ah[3]);
mac_rm(&accum2, ai, &a[4]);
uint64_t c0 = accum0 & mask;
accum1 += accum0 >> 51;
@@ -165,7 +133,7 @@ p255_sqr (
mac_rm(&accum0, ai, &a[2]);
mac_rm(&accum1, ai, &a[3]);
mac_rm(&accum0, a[4], &ah[3]);
mac_rr(&accum0, a[4]*19, a[4]);
mac_rr(&accum1, a[2], a[2]);
c[3] = accum0 & mask;
@@ -183,6 +151,33 @@ p255_sqr (
c[1] = c1 + (accum1>>51);
}

/**
 * Multiply the limbs of *as by the single 64-bit word b and write the
 * carried result into the limbs of *cs.  Each output limb keeps 51 bits
 * (mask is 2^51 - 1).
 *
 * NOTE(review): mac_rm is defined elsewhere; presumably it performs a
 * 128-bit multiply-accumulate, *acc += x * *y -- confirm against its
 * definition.
 */
void
p255_mulw (
p255_t *__restrict__ cs,
const p255_t *as,
uint64_t b
) {
const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
int i;
uint64_t *c = cs->limb;

__uint128_t accum = 0;
/* Walk the five limbs in order; whatever exceeds 51 bits in one limb
 * is carried into the next iteration through accum. */
for (i=0; i<5; i++) {
mac_rm(&accum, b, &a[i]);
c[i] = accum & mask; /* keep the low 51 bits in this limb */
accum >>= 51; /* the remainder becomes the carry */
}
/* PERF: parallelize? eh well this is reference */
/* Fold the carry out of the last limb back into limb 0, scaled by 19
 * (presumably because 2^255 is congruent to 19 modulo 2^255 - 19 --
 * confirm against the field definition). */
accum *= 19;
accum += c[0];
c[0] = accum & mask;
accum >>= 51;
/* The second carry is added to limb 1 with no further propagation;
 * the assert checks that it is below mask. */
assert(accum < mask);
c[1] += accum;
}

void
p255_strong_reduce (
p255_t *a


Loading…
Cancel
Save