Browse Source

switch to 64-bit shift; still slower than donna?

master
Michael Hamburg 9 years ago
parent
commit
e273155e35
1 changed files with 23 additions and 19 deletions
  1. +23
    -19
      src/p25519/arch_x86_64/p25519.c

+ 23
- 19
src/p25519/arch_x86_64/p25519.c View File

@@ -5,6 +5,10 @@
#include "p25519.h"
#include "x86-64-arith.h"

/**
 * Shift a 128-bit value right and return the low 64 bits of the result.
 *
 * @param x  128-bit unsigned value to shift.
 * @param n  Shift distance in bits; the caller must keep it in [0, 127],
 *           since shifting by a negative amount or by the full width of
 *           the type is undefined in C.
 * @return   (x >> n) truncated to 64 bits; any bits of the shifted value
 *           above bit 63 are discarded.
 */
static inline uint64_t shr(__uint128_t x, int n) {
    const __uint128_t shifted = x >> n;
    /* Explicit narrowing: only the low 64-bit word is kept. */
    return (uint64_t)shifted;
}

void
p255_mul (
p255_t *__restrict__ cs,
@@ -44,12 +48,12 @@ p255_mul (
mac_rm(&accum2, ai, &b[3]);
uint64_t c0 = accum0 & mask;
accum1 += accum0 >> 51;
accum1 += shr(accum0, 51);
uint64_t c1 = accum1 & mask;
accum2 += accum1 >> 51;
accum2 += shr(accum1, 51);
c[2] = accum2 & mask;
accum0 = accum2 >> 51;
accum0 = shr(accum2, 51);

mac_rm(&accum0, ai, &b[4]);
@@ -73,7 +77,7 @@ p255_mul (
mac_rm(&accum1, ai, &b[0]);
c[3] = accum0 & mask;
accum1 += accum0 >> 51;
accum1 += shr(accum0, 51);
c[4] = accum1 & mask;
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
@@ -81,10 +85,10 @@ p255_mul (
* PERF: good enough to fit into uint64_t?
*/
uint64_t a1 = accum1>>51;
uint64_t a1 = shr(accum1,51);
accum1 = (__uint128_t)a1 * 19 + c0;
c[0] = accum1 & mask;
c[1] = c1 + (accum1>>51);
c[1] = c1 + shr(accum1,51);
}

void
@@ -118,9 +122,9 @@ p255_sqr (
mac_rm(&accum2, ai, &a[4]);
uint64_t c0 = accum0 & mask;
accum1 += accum0 >> 51;
accum1 += shr(accum0, 51);
uint64_t c1 = accum1 & mask;
accum2 += accum1 >> 51;
accum2 += shr(accum1, 51);
c[2] = accum2 & mask;
accum0 = accum2 >> 51;
@@ -137,7 +141,7 @@ p255_sqr (
mac_rr(&accum1, a[2], a[2]);
c[3] = accum0 & mask;
accum1 += accum0 >> 51;
accum1 += shr(accum0, 51);
c[4] = accum1 & mask;
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
@@ -145,10 +149,10 @@ p255_sqr (
* PERF: good enough to fit into uint64_t?
*/
uint64_t a1 = accum1>>51;
uint64_t a1 = shr(accum1,51);
accum1 = (__uint128_t)a1 * 19 + c0;
c[0] = accum1 & mask;
c[1] = c1 + (accum1>>51);
c[1] = c1 + shr(accum1,51);
}

void
@@ -162,28 +166,28 @@ p255_mulw (

__uint128_t accum = widemul_rm(b, &a[0]);
uint64_t c0 = accum & mask;
accum >>= 51;
accum = shr(accum,51);
mac_rm(&accum, b, &a[1]);
uint64_t c1 = accum & mask;
accum >>= 51;
accum = shr(accum,51);
mac_rm(&accum, b, &a[2]);
c[2] = accum & mask;
accum >>= 51;
accum = shr(accum,51);
mac_rm(&accum, b, &a[3]);
c[3] = accum & mask;
accum >>= 51;
accum = shr(accum,51);
mac_rm(&accum, b, &a[4]);
c[4] = accum & mask;
uint64_t a1 = accum>>51;
accum = (__uint128_t)a1 * 19 + c0;
accum = shr(accum,51);
accum = accum * 19 + c0;
c[0] = accum & mask;
c[1] = c1 + (accum>>51);
c[1] = c1 + shr(accum,51);
}

void


Loading…
Cancel
Save