Browse Source

Set the gf_mulw scalar argument to <32 bits instead of <64 bits (in practice even less than that: it must fit in 1 limb instead of 2). Also, there is a bug if you compile ed448 for arch_32 on a 64-bit machine... still tracing it.

master
Michael Hamburg 9 years ago
parent
commit
790745e2b3
12 changed files with 67 additions and 94 deletions
  1. +16
    -23
      src/decaf.c
  2. +1
    -1
      src/gen_headers/f_field_h.py
  3. +1
    -1
      src/p25519/arch_32/f_impl.c
  4. +1
    -1
      src/p25519/arch_ref64/f_impl.c
  5. +15
    -11
      src/p25519/arch_x86_64/f_impl.c
  6. +6
    -11
      src/p448/arch_32/f_impl.c
  7. +2
    -2
      src/p448/arch_32/f_impl.h
  8. +18
    -34
      src/p448/arch_arm_32/f_impl.c
  9. +4
    -7
      src/p448/arch_neon/f_impl.c
  10. +1
    -1
      src/p448/arch_ref64/f_impl.c
  11. +1
    -1
      src/p448/arch_x86_64/f_impl.c
  12. +1
    -1
      src/per_field.c

+ 16
- 23
src/decaf.c View File

@@ -109,13 +109,6 @@ cond_neg(gf x, mask_t neg) {
static INLINE void
cond_swap(gf x, gf_s *__restrict__ y, mask_t swap) {
constant_time_cond_swap(x,y,sizeof(gf_s),swap);
/*
UNROLL for (unsigned int i=0; i<sizeof(x->limb)/sizeof(x->limb[0]); i++) {
decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap;
x->limb[i] ^= s;
y->limb[i] ^= s;
}
*/
}

/** Inverse square root using addition chain. */
@@ -133,7 +126,7 @@ static void
gf_invert(gf y, const gf x) {
gf t1, t2;
gf_sqr(t1, x); // o^2
decaf_bool_t ret = gf_isqrt_chk(t2, t1, 0); // +-1/sqrt(o^2) = +-1/o
mask_t ret = gf_isqrt_chk(t2, t1, 0); // +-1/sqrt(o^2) = +-1/o
(void)ret; assert(ret);
gf_sqr(t1, t2);
gf_mul(t2, t1, x); // not direct to y in case of alias.
@@ -142,7 +135,7 @@ gf_invert(gf y, const gf x) {

/** Mul by signed int. Not constant-time WRT the sign of that int. */
static INLINE void
gf_mulw_sgn(gf c, const gf a, int w) {
gf_mulw_sgn(gf c, const gf a, int32_t w) {
if (w>0) {
gf_mulw(c, a, w);
} else {
@@ -152,7 +145,7 @@ gf_mulw_sgn(gf c, const gf a, int w) {
}

/** Return high bit of x = low bit of 2x mod p */
static decaf_word_t hibit(const gf x) {
static mask_t hibit(const gf x) {
gf y;
gf_add(y,x,x);
gf_strong_reduce(y);
@@ -161,7 +154,7 @@ static decaf_word_t hibit(const gf x) {

#if COFACTOR==8
/** Return low bit of x mod p */
static decaf_word_t lobit(const gf x) {
static mask_t lobit(const gf x) {
gf y;
gf_copy(y,x);
gf_strong_reduce(y);
@@ -873,9 +866,9 @@ static INLINE void
constant_time_lookup_xx (
void *__restrict__ out_,
const void *table_,
decaf_word_t elem_bytes,
decaf_word_t n_table,
decaf_word_t idx
word_t elem_bytes,
word_t n_table,
word_t idx
) {
constant_time_lookup(out_,table_,elem_bytes,n_table,idx);
}
@@ -928,12 +921,12 @@ void API_NS(point_scalarmul) (

for (; i>=0; i-=WINDOW) {
/* Fetch another block of bits */
decaf_word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS);
word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS);
if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
bits ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
}
bits &= WINDOW_MASK;
decaf_word_t inv = (bits>>(WINDOW-1))-1;
mask_t inv = (bits>>(WINDOW-1))-1;
bits ^= inv;
/* Add in from table. Compute t only on last iteration. */
@@ -993,7 +986,7 @@ void API_NS(point_double_scalarmul) (

for (; i>=0; i-=WINDOW) {
/* Fetch another block of bits */
decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS);
if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
@@ -1001,8 +994,8 @@ void API_NS(point_double_scalarmul) (
}
bits1 &= WINDOW_MASK;
bits2 &= WINDOW_MASK;
decaf_word_t inv1 = (bits1>>(WINDOW-1))-1;
decaf_word_t inv2 = (bits2>>(WINDOW-1))-1;
mask_t inv1 = (bits1>>(WINDOW-1))-1;
mask_t inv2 = (bits2>>(WINDOW-1))-1;
bits1 ^= inv1;
bits2 ^= inv2;
@@ -1079,16 +1072,16 @@ void API_NS(point_dual_scalarmul) (
}
/* Fetch another block of bits */
decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS);
word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS);
if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
bits2 ^= scalar2x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
}
bits1 &= WINDOW_MASK;
bits2 &= WINDOW_MASK;
decaf_word_t inv1 = (bits1>>(WINDOW-1))-1;
decaf_word_t inv2 = (bits2>>(WINDOW-1))-1;
mask_t inv1 = (bits1>>(WINDOW-1))-1;
mask_t inv2 = (bits2>>(WINDOW-1))-1;
bits1 ^= inv1;
bits2 ^= inv2;


+ 1
- 1
src/gen_headers/f_field_h.py View File

@@ -66,7 +66,7 @@ void gf_strong_reduce (gf inout);
void gf_add (gf out, const gf a, const gf b);
void gf_sub (gf out, const gf a, const gf b);
void gf_mul (gf_s *__restrict__ out, const gf a, const gf b);
void gf_mulw (gf_s *__restrict__ out, const gf a, uint64_t b);
void gf_mulw (gf_s *__restrict__ out, const gf a, uint32_t b);
void gf_sqr (gf_s *__restrict__ out, const gf a);
void gf_serialize (uint8_t *serial, const gf x);
void gf_isr(gf a, const gf x); /** a^2 x = 1, QNR, or 0 if x=0 */


+ 1
- 1
src/p25519/arch_32/f_impl.c View File

@@ -51,7 +51,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
c[1] += accum;
}

void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
const uint32_t *a = as->limb, maske = ((1<<26)-1), masko = ((1<<25)-1);
uint32_t blo = b & maske, bhi = b>>26, bhi2 = 2*bhi;
uint32_t *c = cs->limb;


+ 1
- 1
src/p25519/arch_ref64/f_impl.c View File

@@ -34,7 +34,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
c[1] += accum;
}

void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
int i;


+ 15
- 11
src/p25519/arch_x86_64/f_impl.c View File

@@ -4,6 +4,7 @@

#include "f_field.h"

/** Requires: input limbs < 9*2^51 */
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
uint64_t *c = cs->limb;
@@ -65,6 +66,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
ai = a[4];
mac_rm(&accum1, ai, &b[0]);
/* Here accum1 < 5*(9*2^51)^2 */
c[3] = accum0 & mask;
accum1 += shrld(accum0, 51);
@@ -72,13 +74,16 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
* = 2^(-13 + <13)
* PERF: good enough to fit into uint64_t?
* PERF: good enough to fit into uint64_t.
*/
uint64_t a1 = shrld(accum1,51);
accum1 = (__uint128_t)a1 * 19 + c0;
/* Here a1 < (5*(9*2^51)^2 + small) >> 51 = 405 * 2^51 + small
* a1 * 19 + c0 < (405*19+1)*2^51 + small < 2^13 * 2^51.
*/
accum1 = a1 * 19 + c0;
c[0] = accum1 & mask;
c[1] = c1 + shrld(accum1,51);
c[1] = c1 + (accum1>>51);
}

void gf_sqr (gf_s *__restrict__ cs, const gf as) {
@@ -132,16 +137,15 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
* = 2^(-13 + <13)
* PERF: good enough to fit into uint64_t?
*/
uint64_t a1 = shrld(accum1,51);
accum1 = (__uint128_t)a1 * 19 + c0;
accum1 = a1 * 19 + c0;
c[0] = accum1 & mask;
c[1] = c1 + shrld(accum1,51);
c[1] = c1 + (accum1>>51);
}

void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
uint64_t *c = cs->limb;

@@ -164,9 +168,9 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
mac_rm(&accum, b, &a[4]);
c[4] = accum & mask;

accum = shrld(accum,51);
accum = accum * 19 + c0;
uint64_t a1 = shrld(accum,51);
a1 = a1*19+c0;
c[0] = accum & mask;
c[1] = c1 + shrld(accum,51);
c[0] = a1 & mask;
c[1] = c1 + (a1>>51);
}

+ 6
- 11
src/p448/arch_32/f_impl.c View File

@@ -60,8 +60,8 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
c[1] += ((uint32_t)(accum1));
}

void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
const uint32_t bhi = b>>28, blo = b & ((1<<28)-1);
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
assert(b<1<<28);
const uint32_t *a = as->limb;
uint32_t *c = cs->limb;
@@ -71,20 +71,15 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {

int i;

accum0 = widemul(blo, a[0]);
accum8 = widemul(blo, a[8]);
accum0 += widemul(bhi, a[15]);
accum8 += widemul(bhi, a[15] + a[7]);
accum0 = widemul(b, a[0]);
accum8 = widemul(b, a[8]);

c[0] = accum0 & mask; accum0 >>= 28;
c[8] = accum8 & mask; accum8 >>= 28;
for (i=1; i<8; i++) {
accum0 += widemul(blo, a[i]);
accum8 += widemul(blo, a[i+8]);
accum0 += widemul(bhi, a[i-1]);
accum8 += widemul(bhi, a[i+7]);
accum0 += widemul(b, a[i]);
accum8 += widemul(b, a[i+8]);

c[i] = accum0 & mask; accum0 >>= 28;
c[i+8] = accum8 & mask; accum8 >>= 28;


+ 2
- 2
src/p448/arch_32/f_impl.h View File

@@ -43,8 +43,8 @@ void gf_bias (gf a, int amt) {
}

void gf_weak_reduce (gf a) {
uint64_t mask = (1ull<<28) - 1;
uint64_t tmp = a->limb[15] >> 28;
uint32_t mask = (1ull<<28) - 1;
uint32_t tmp = a->limb[15] >> 28;
a->limb[8] += tmp;
for (unsigned int i=15; i>0; i--) {
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28);


+ 18
- 34
src/p448/arch_arm_32/f_impl.c View File

@@ -724,10 +724,10 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
void gf_mulw (
gf_s *__restrict__ cs,
const gf as,
uint64_t b
uint32_t b
) {
uint32_t mask = (1ull<<28)-1;
const uint32_t bhi = b>>28, blo = b & mask;
assert(b <= mask);
const uint32_t *a = as->limb;
uint32_t *c = cs->limb;
@@ -737,11 +737,9 @@ void gf_mulw (
int i;

uint32_t c0, c8, n0, n8;
accum0 = widemul(bhi, a[15]);
accum8 = widemul(bhi, a[15] + a[7]);
c0 = a[0]; c8 = a[8];
smlal(&accum0, blo, c0);
smlal(&accum8, blo, c8);
accum0 = widemul(b, c0);
accum8 = widemul(b, c8);

c[0] = accum0 & mask; accum0 >>= 28;
c[8] = accum8 & mask; accum8 >>= 28;
@@ -749,10 +747,8 @@ void gf_mulw (
i=1;
{
n0 = a[i]; n8 = a[i+8];
smlal(&accum0, bhi, c0);
smlal(&accum8, bhi, c8);
smlal(&accum0, blo, n0);
smlal(&accum8, blo, n8);
smlal(&accum0, b, n0);
smlal(&accum8, b, n8);
c[i] = accum0 & mask; accum0 >>= 28;
c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -760,10 +756,8 @@ void gf_mulw (
}
{
c0 = a[i]; c8 = a[i+8];
smlal(&accum0, bhi, n0);
smlal(&accum8, bhi, n8);
smlal(&accum0, blo, c0);
smlal(&accum8, blo, c8);
smlal(&accum0, b, c0);
smlal(&accum8, b, c8);

c[i] = accum0 & mask; accum0 >>= 28;
c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -771,10 +765,8 @@ void gf_mulw (
}
{
n0 = a[i]; n8 = a[i+8];
smlal(&accum0, bhi, c0);
smlal(&accum8, bhi, c8);
smlal(&accum0, blo, n0);
smlal(&accum8, blo, n8);
smlal(&accum0, b, n0);
smlal(&accum8, b, n8);

c[i] = accum0 & mask; accum0 >>= 28;
c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -782,10 +774,8 @@ void gf_mulw (
}
{
c0 = a[i]; c8 = a[i+8];
smlal(&accum0, bhi, n0);
smlal(&accum8, bhi, n8);
smlal(&accum0, blo, c0);
smlal(&accum8, blo, c8);
smlal(&accum0, b, c0);
smlal(&accum8, b, c8);

c[i] = accum0 & mask; accum0 >>= 28;
c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -793,10 +783,8 @@ void gf_mulw (
}
{
n0 = a[i]; n8 = a[i+8];
smlal(&accum0, bhi, c0);
smlal(&accum8, bhi, c8);
smlal(&accum0, blo, n0);
smlal(&accum8, blo, n8);
smlal(&accum0, b, n0);
smlal(&accum8, b, n8);

c[i] = accum0 & mask; accum0 >>= 28;
c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -804,10 +792,8 @@ void gf_mulw (
}
{
c0 = a[i]; c8 = a[i+8];
smlal(&accum0, bhi, n0);
smlal(&accum8, bhi, n8);
smlal(&accum0, blo, c0);
smlal(&accum8, blo, c8);
smlal(&accum0, b, c0);
smlal(&accum8, b, c8);
c[i] = accum0 & mask; accum0 >>= 28;
c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -815,10 +801,8 @@ void gf_mulw (
}
{
n0 = a[i]; n8 = a[i+8];
smlal(&accum0, bhi, c0);
smlal(&accum8, bhi, c8);
smlal(&accum0, blo, n0);
smlal(&accum8, blo, n8);
smlal(&accum0, b, n0);
smlal(&accum8, b, n8);

c[i] = accum0 & mask; accum0 >>= 28;
c[i+8] = accum8 & mask; accum8 >>= 28;


+ 4
- 7
src/p448/arch_neon/f_impl.c View File

@@ -549,20 +549,18 @@ void gf_sqr (gf_s *__restrict__ cs, const gf bs) {
);
}

void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1};
assert(b<(1<<28));
uint64x2_t accum;
const uint32x2_t *va = (const uint32x2_t *) as->limb;
uint32x2_t *vo = (uint32x2_t *) cs->limb;
uint32x2_t vc, vn;
uint32x2_t vb = {b & ((1<<28)-1), b>>28};
accum = vmull_lane_u32(va[7], vb, 1);
accum = xx_vaddup_u64(vrev128_u64(accum));
uint32x2_t vb = {b, 0};
vc = va[0];
accum = vmlal_lane_u32(accum, vc, vb, 0);
accum = vmull_lane_u32(accum, vc, vb, 0);
vo[0] = vmovn_u64(accum) & vmask;
accum = vshrq_n_u64(accum,28);
@@ -579,7 +577,6 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
int i;
for (i=1; i<8; i++) {
vn = va[i];
accum = vmlal_lane_u32(accum, vc, vb, 1);
accum = vmlal_lane_u32(accum, vn, vb, 0);
vo[i] = vmovn_u64(accum) & vmask;
accum = vshrq_n_u64(accum,28);


+ 1
- 1
src/p448/arch_ref64/f_impl.c View File

@@ -165,7 +165,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
c[1] += ((uint64_t)(accum1));
}

void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
const uint64_t *a = as->limb;
uint64_t *c = cs->limb;



+ 1
- 1
src/p448/arch_x86_64/f_impl.c View File

@@ -139,7 +139,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
c[0] += ((uint64_t)(accum1));
}

void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
const uint64_t *a = as->limb;
uint64_t *c = cs->limb;



+ 1
- 1
src/per_field.c View File

@@ -1,6 +1,6 @@
/**
* @cond internal
* @file decaf_crypto.c
* @file per_field.c
* @copyright
* Copyright (c) 2015-2016 Cryptography Research, Inc. \n
* Released under the MIT License. See LICENSE.txt for license information.


Loading…
Cancel
Save