some accel in for curve25519

10 years ago · 42a561d018
--- a/+ 1
+++ b/+ 1
@@ -19,7 +19,7 @@ ASM ?= $(CC)
 DECAF ?= decaf_fast

 ifneq (,$(findstring x86_64,$(MACHINE)))
 ARCH ?= arch_ref64
 ARCH ?= arch_x86_64
 else
 # no i386 port yet
 ARCH ?= arch_ref32
--- a/src/decaf_fast.c
+++ b/src/decaf_fast.c
@@ -361,7 +361,7 @@ decaf_bool_t API_NS(scalar_invert) (
    }
    return ~API_NS(scalar_eq)(out,API_NS(scalar_zero));
 #else
    decaf_255_scalar_t b, ma;
    scalar_t b, ma;
    int i;
    sc_montmul(b,API_NS(scalar_one),sc_r2);
    sc_montmul(ma,a,sc_r2);
@@ -378,10 +378,10 @@ decaf_bool_t API_NS(scalar_invert) (
        }
    }

    sc_montmul(out,b,decaf_255_scalar_one);
    sc_montmul(out,b,API_NS(scalar_one));
    API_NS(scalar_destroy)(b);
    API_NS(scalar_destroy)(ma);
    return ~API_NS(scalar_eq)(out,decaf_255_scalar_zero);
    return ~API_NS(scalar_eq)(out,API_NS(scalar_zero));
 #endif
 }

--- a/src/p25519/arch_x86_64/arch_config.h
+++ b/src/p25519/arch_x86_64/arch_config.h
@@ -0,0 +1 @@
 #define WORD_BITS 64
--- a/src/p25519/arch_x86_64/p25519.c
+++ b/src/p25519/arch_x86_64/p25519.c
@@ -0,0 +1,275 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "p25519.h"
 #include "x86-64-arith.h"

 void
 p255_mul (
    p255_t *__restrict__ cs,
    const p255_t *as,
    const p255_t *bs
 ) {
    const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
    uint64_t *c = cs->limb;
    
    uint64_t bh[4];
    int i;
    for (i=0; i<4; i++) bh[i] = b[i+1] * 19;
    
    __uint128_t accum0, accum1, accum2;
    
    uint64_t ai = a[0];
    accum0 = widemul_rm(ai, &b[0]);
    accum1 = widemul_rm(ai, &b[1]);
    accum2 = widemul_rm(ai, &b[2]);
    
    ai = a[1];
    mac_rm(&accum0, ai, &bh[3]);
    mac_rm(&accum1, ai, &b[0]);
    mac_rm(&accum2, ai, &b[1]);
    
    ai = a[2];
    mac_rm(&accum0, ai, &bh[2]);
    mac_rm(&accum1, ai, &bh[3]);
    mac_rm(&accum2, ai, &b[0]);
    
    ai = a[3];
    mac_rm(&accum0, ai, &bh[1]);
    mac_rm(&accum1, ai, &bh[2]);
    mac_rm(&accum2, ai, &bh[3]);
    
    ai = a[4];
    mac_rm(&accum0, ai, &bh[0]);
    mac_rm(&accum1, ai, &bh[1]);
    mac_rm(&accum2, ai, &bh[2]);
    
    uint64_t c0 = accum0 & mask;
    accum1 += accum0 >> 51;
    uint64_t c1 = accum1 & mask;
    accum2 += accum1 >> 51;
    c[2] = accum2 & mask;
    
    accum0 = accum2 >> 51;
    
    ai = a[0];
    mac_rm(&accum0, ai, &b[3]);
    accum1 = widemul_rm(ai, &b[4]);
    
    ai = a[1];
    mac_rm(&accum0, ai, &b[2]);
    mac_rm(&accum1, ai, &b[3]);
    
    ai = a[2];
    mac_rm(&accum0, ai, &b[1]);
    mac_rm(&accum1, ai, &b[2]);
    
    ai = a[3];
    mac_rm(&accum0, ai, &b[0]);
    mac_rm(&accum1, ai, &b[1]);
    
    ai = a[4];
    mac_rm(&accum0, ai, &bh[3]);
    mac_rm(&accum1, ai, &b[0]);
    
    c[3] = accum0 & mask;
    accum1 += accum0 >> 51;
    c[4] = accum1 & mask;
    
    /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
     * = 2^(-13 + <13)
     * PERF: good enough to fit into uint64_t?
     */
    
    uint64_t a1 = accum1>>51;
    accum1 = (__uint128_t)a1 * 19 + c0;
    c[0] = accum1 & mask;
    c[1] = c1 + (accum1>>51);
 }

 void
 p255_mulw (
    p255_t *__restrict__ cs,
    const p255_t *as,
    uint64_t b
 ) {
    const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
    int i;
    
    uint64_t *c = cs->limb;

    __uint128_t accum = 0;
    for (i=0; i<5; i++) {
        mac_rm(&accum, b, &a[i]);
        c[i] = accum & mask;
        accum >>= 51;
    }
    /* PERF: parallelize? eh well this is reference */
    accum *= 19;
    accum += c[0];
    c[0] = accum & mask;
    accum >>= 51;
    
    assert(accum < mask);
    c[1] += accum;
 }

 void
 p255_sqr (
    p255_t *__restrict__ cs,
    const p255_t *as
 ) {
    const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
    uint64_t *c = cs->limb;
    
    uint64_t ah[4];
    int i;
    for (i=0; i<4; i++) ah[i] = a[i+1] * 19;
    
    __uint128_t accum0, accum1, accum2;
    
    uint64_t ai = a[0];
    accum0 = widemul_rr(ai, ai);
    ai *= 2;
    accum1 = widemul_rm(ai, &a[1]);
    accum2 = widemul_rm(ai, &a[2]);
    
    ai = a[1];
    mac_rr(&accum2, ai, ai);
    ai *= 2;
    mac_rm(&accum0, ai, &ah[3]);
    
    ai = a[2] * 2;
    mac_rm(&accum0, ai, &ah[2]);
    mac_rm(&accum1, ai, &ah[3]);
    
    ai = a[3];
    mac_rm(&accum1, ai, &ah[2]);
    ai *= 2;
    mac_rm(&accum2, ai, &ah[3]);
    
    uint64_t c0 = accum0 & mask;
    accum1 += accum0 >> 51;
    uint64_t c1 = accum1 & mask;
    accum2 += accum1 >> 51;
    c[2] = accum2 & mask;
    
    accum0 = accum2 >> 51;
    
    ai = a[0]*2;
    mac_rm(&accum0, ai, &a[3]);
    accum1 = widemul_rm(ai, &a[4]);
    
    ai = a[1]*2;
    mac_rm(&accum0, ai, &a[2]);
    mac_rm(&accum1, ai, &a[3]);
    
    mac_rm(&accum0, a[4], &ah[3]);
    mac_rr(&accum1, a[2], a[2]);
    
    c[3] = accum0 & mask;
    accum1 += accum0 >> 51;
    c[4] = accum1 & mask;
    
    /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
     * = 2^(-13 + <13)
     * PERF: good enough to fit into uint64_t?
     */
    
    uint64_t a1 = accum1>>51;
    accum1 = (__uint128_t)a1 * 19 + c0;
    c[0] = accum1 & mask;
    c[1] = c1 + (accum1>>51);
 }

 void
 p255_strong_reduce (
    p255_t *a
 ) {
    uint64_t mask = (1ull<<51)-1;

    /* first, clear high */
    a->limb[0] += (a->limb[4]>>51)*19;
    a->limb[4] &= mask;

    /* now the total is less than 2p */

    /* compute total_value - p.  No need to reduce mod p. */
    __int128_t scarry = 0;
    int i;
    for (i=0; i<5; i++) {
        scarry = scarry + a->limb[i] - ((i==0)?mask-18:mask);
        a->limb[i] = scarry & mask;
        scarry >>= 51;
    }

    /* uncommon case: it was >= p, so now scarry = 0 and this = x
    * common case: it was < p, so now scarry = -1 and this = x - p + 2^255
    * so let's add back in p.  will carry back off the top for 2^255.
    */

    assert(is_zero(scarry) | is_zero(scarry+1));

    uint64_t scarry_mask = scarry & mask;
    __uint128_t carry = 0;

    /* add it back */
    for (i=0; i<5; i++) {
        carry = carry + a->limb[i] + ((i==0)?(scarry_mask&~18):scarry_mask);
        a->limb[i] = carry & mask;
        carry >>= 51;
    }

    assert(is_zero(carry + scarry));
 }

 void
 p255_serialize (
    uint8_t serial[32],
    const struct p255_t *x
 ) {
    int i,j;
    p255_t red;
    p255_copy(&red, x);
    p255_strong_reduce(&red);
    uint64_t *r = red.limb;
    uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
    for (i=0; i<4; i++) {
        for (j=0; j<8; j++) {
            serial[8*i+j] = ser64[i];
            ser64[i] >>= 8;
        }
    }
 }

 mask_t
 p255_deserialize (
    p255_t *x,
    const uint8_t serial[32]
 ) {
    int i,j;
    uint64_t ser64[4], mask = ((1ull<<51)-1);
    for (i=0; i<4; i++) {
        uint64_t out = 0;
        for (j=0; j<8; j++) {
            out |= ((uint64_t)serial[8*i+j])<<(8*j);
        }
        ser64[i] = out;
    }
    
    /* Test for >= 2^255-19 */
    uint64_t ge = -(((__uint128_t)ser64[0]+19)>>64);
    ge &= ser64[1];
    ge &= ser64[2];
    ge &= (ser64[3]<<1) + 1;
    ge |= -(((__uint128_t)ser64[3]+0x8000000000000000)>>64);
    
    x->limb[0] = ser64[0] & mask;
    x->limb[1] = (ser64[0]>>51 | ser64[1]<<13) & mask;
    x->limb[2] = (ser64[1]>>38 | ser64[2]<<26) & mask;
    x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask;
    x->limb[4] = ser64[3]>>12;
    
    return ~is_zero(~ge);
 }
--- a/src/p25519/arch_x86_64/p25519.h
+++ b/src/p25519/arch_x86_64/p25519.h
@@ -0,0 +1,166 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __P255_H__
 #define __P255_H__ 1

 #include <stdint.h>
 #include <assert.h>
 #include <string.h>

 #include "word.h"

 typedef struct p255_t {
  uint64_t limb[5];
 } p255_t;

 #define LBITS 51
 #define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}

 /*
 #define FIELD_LITERAL(a,b,c,d) {{ \
    (a##ull) & LMASK, \
    ((a##ull)>>51 | (b##ull)<<13) & LMASK, \
    ((b##ull)>>38 | (c##ull)<<26) & LMASK, \
    ((c##ull)>>25 | (d##ull)<<39) & LMASK, \
    (d##ull)>>12 \
 }}
 */

 #ifdef __cplusplus
 extern "C" {
 #endif

 static __inline__ void
 p255_add_RAW (
    p255_t *out,
    const p255_t *a,
    const p255_t *b
 ) __attribute__((unused));
             
 static __inline__ void
 p255_sub_RAW (
    p255_t *out,
    const p255_t *a,
    const p255_t *b
 ) __attribute__((unused));
             
 static __inline__ void
 p255_copy (
    p255_t *out,
    const p255_t *a
 ) __attribute__((unused));
             
 static __inline__ void
 p255_weak_reduce (
    p255_t *inout
 ) __attribute__((unused));
             
 void
 p255_strong_reduce (
    p255_t *inout
 );

 static __inline__ void
 p255_bias (
    p255_t *inout,
    int amount
 ) __attribute__((unused));
         
 void
 p255_mul (
    p255_t *__restrict__ out,
    const p255_t *a,
    const p255_t *b
 );

 void
 p255_mulw (
    p255_t *__restrict__ out,
    const p255_t *a,
    uint64_t b
 );

 void
 p255_sqr (
    p255_t *__restrict__ out,
    const p255_t *a
 );

 void
 p255_serialize (
    uint8_t serial[32],
    const struct p255_t *x
 );

 mask_t
 p255_deserialize (
    p255_t *x,
    const uint8_t serial[32]
 );

 /* -------------- Inline functions begin here -------------- */

 void
 p255_add_RAW (
    p255_t *out,
    const p255_t *a,
    const p255_t *b
 ) {
    unsigned int i;
    for (i=0; i<5; i++) {
        out->limb[i] = a->limb[i] + b->limb[i];
    }
 }

 void
 p255_sub_RAW (
    p255_t *out,
    const p255_t *a,
    const p255_t *b
 ) {
    unsigned int i;
    uint64_t co1 = ((1ull<<51)-1)*2, co2 = co1-36;
    for (i=0; i<5; i++) {
        out->limb[i] = a->limb[i] - b->limb[i] + ((i==0) ? co2 : co1);
    }
 }

 void
 p255_copy (
    p255_t *out,
    const p255_t *a
 ) {
    memcpy(out,a,sizeof(*a));
 }

 void
 p255_bias (
    p255_t *a,
    int amt
 ) {
    a->limb[0] += ((uint64_t)(amt)<<52) - 38*amt;
    int i;
    for (i=1; i<5; i++) {
        a->limb[i] += ((uint64_t)(amt)<<52)-2*amt;
    }
 }

 void
 p255_weak_reduce (
    p255_t *a
 ) {
    uint64_t mask = (1ull<<51) - 1;
    uint64_t tmp = a->limb[4] >> 51;
    int i;
    for (i=4; i>0; i--) {
        a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>51);
    }
    a->limb[0] = (a->limb[0] & mask) + tmp*19;
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __P255_H__ */
--- a/src/p25519/arch_x86_64/x86-64-arith.h
+++ b/src/p25519/arch_x86_64/x86-64-arith.h
@@ -0,0 +1 @@
 ../../p448/arch_x86_64/x86-64-arith.h
--- a/src/p448/arch_x86_64/x86-64-arith.h
+++ b/src/p448/arch_x86_64/x86-64-arith.h
@@ -53,6 +53,25 @@ static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
  #endif
 }

 static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) {
  #ifndef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("mulq %[b];"
       : [c]"=a"(c), [d]"=d"(d)
       : [b]"r"(b), "a"(a)
       : "cc");
  return (((__uint128_t)(d))<<64) | c;
  #else
  uint64_t c,d;
  __asm__ volatile
      ("mulx %[b], %[c], %[d];"
       : [c]"=r"(c), [d]"=r"(d)
       : [b]"r"(b), [a]"d"(a));
  return (((__uint128_t)(d))<<64) | c;
  #endif
 }

 static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
  #ifndef __BMI2__
  uint64_t c,d;
@@ -163,6 +182,31 @@ static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) {
  uint64_t lo = *acc, hi = *acc>>64;
  
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("mulx %[b], %[c], %[d]; "
       "addq %[c], %[lo]; "
       "adcq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"r"(b), [a]"d"(a)
       : "cc");
  #else
  __asm__ volatile
      ("mulq %[b]; "
       "addq %%rax, %[lo]; "
       "adcq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"r"(b), "a"(a)
       : "rax", "rdx", "cc");
  #endif
  
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;