@@ -109,13 +109,6 @@ cond_neg(gf x, mask_t neg) { | |||
/**
 * Conditionally exchange the field elements x and y.
 *
 * The work is handed to constant_time_cond_swap() over sizeof(gf_s) bytes,
 * with `swap` passed through unchanged as the selector.
 *
 * NOTE(review): the commented-out loop kept below XORs (x ^ y) & swap into
 * each pair of limbs, so an all-ones mask swaps and a zero mask is a no-op;
 * presumably the helper has the same semantics -- confirm against its
 * definition.
 */
static INLINE void | |||
cond_swap(gf x, gf_s *__restrict__ y, mask_t swap) { | |||
constant_time_cond_swap(x,y,sizeof(gf_s),swap); | |||
/* | |||
UNROLL for (unsigned int i=0; i<sizeof(x->limb)/sizeof(x->limb[0]); i++) { | |||
decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap; | |||
x->limb[i] ^= s; | |||
y->limb[i] ^= s; | |||
} | |||
*/ | |||
} | |||
/** Inverse square root using addition chain. */ | |||
@@ -133,7 +126,7 @@ static void | |||
gf_invert(gf y, const gf x) { | |||
gf t1, t2; | |||
gf_sqr(t1, x); // o^2 | |||
decaf_bool_t ret = gf_isqrt_chk(t2, t1, 0); // +-1/sqrt(o^2) = +-1/o | |||
mask_t ret = gf_isqrt_chk(t2, t1, 0); // +-1/sqrt(o^2) = +-1/o | |||
(void)ret; assert(ret); | |||
gf_sqr(t1, t2); | |||
gf_mul(t2, t1, x); // not direct to y in case of alias. | |||
@@ -142,7 +135,7 @@ gf_invert(gf y, const gf x) { | |||
/** Mul by signed int. Not constant-time WRT the sign of that int. */ | |||
static INLINE void | |||
gf_mulw_sgn(gf c, const gf a, int w) { | |||
gf_mulw_sgn(gf c, const gf a, int32_t w) { | |||
if (w>0) { | |||
gf_mulw(c, a, w); | |||
} else { | |||
@@ -152,7 +145,7 @@ gf_mulw_sgn(gf c, const gf a, int w) { | |||
} | |||
/** Return high bit of x = low bit of 2x mod p */ | |||
static decaf_word_t hibit(const gf x) { | |||
static mask_t hibit(const gf x) { | |||
gf y; | |||
gf_add(y,x,x); | |||
gf_strong_reduce(y); | |||
@@ -161,7 +154,7 @@ static decaf_word_t hibit(const gf x) { | |||
#if COFACTOR==8 | |||
/** Return low bit of x (x is copied and strongly reduced mod p first) */ | |||
static decaf_word_t lobit(const gf x) { | |||
static mask_t lobit(const gf x) { | |||
gf y; | |||
gf_copy(y,x); | |||
gf_strong_reduce(y); | |||
@@ -873,9 +866,9 @@ static INLINE void | |||
constant_time_lookup_xx ( | |||
void *__restrict__ out_, | |||
const void *table_, | |||
decaf_word_t elem_bytes, | |||
decaf_word_t n_table, | |||
decaf_word_t idx | |||
word_t elem_bytes, | |||
word_t n_table, | |||
word_t idx | |||
) { | |||
constant_time_lookup(out_,table_,elem_bytes,n_table,idx); | |||
} | |||
@@ -928,12 +921,12 @@ void API_NS(point_scalarmul) ( | |||
for (; i>=0; i-=WINDOW) { | |||
/* Fetch another block of bits */ | |||
decaf_word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS); | |||
word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS); | |||
if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { | |||
bits ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | |||
} | |||
bits &= WINDOW_MASK; | |||
decaf_word_t inv = (bits>>(WINDOW-1))-1; | |||
mask_t inv = (bits>>(WINDOW-1))-1; | |||
bits ^= inv; | |||
/* Add in from table. Compute t only on last iteration. */ | |||
@@ -993,7 +986,7 @@ void API_NS(point_double_scalarmul) ( | |||
for (; i>=0; i-=WINDOW) { | |||
/* Fetch another block of bits */ | |||
decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS), | |||
word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS), | |||
bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS); | |||
if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { | |||
bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | |||
@@ -1001,8 +994,8 @@ void API_NS(point_double_scalarmul) ( | |||
} | |||
bits1 &= WINDOW_MASK; | |||
bits2 &= WINDOW_MASK; | |||
decaf_word_t inv1 = (bits1>>(WINDOW-1))-1; | |||
decaf_word_t inv2 = (bits2>>(WINDOW-1))-1; | |||
mask_t inv1 = (bits1>>(WINDOW-1))-1; | |||
mask_t inv2 = (bits2>>(WINDOW-1))-1; | |||
bits1 ^= inv1; | |||
bits2 ^= inv2; | |||
@@ -1079,16 +1072,16 @@ void API_NS(point_dual_scalarmul) ( | |||
} | |||
/* Fetch another block of bits */ | |||
decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS), | |||
bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS); | |||
word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS), | |||
bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS); | |||
if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { | |||
bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | |||
bits2 ^= scalar2x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | |||
} | |||
bits1 &= WINDOW_MASK; | |||
bits2 &= WINDOW_MASK; | |||
decaf_word_t inv1 = (bits1>>(WINDOW-1))-1; | |||
decaf_word_t inv2 = (bits2>>(WINDOW-1))-1; | |||
mask_t inv1 = (bits1>>(WINDOW-1))-1; | |||
mask_t inv2 = (bits2>>(WINDOW-1))-1; | |||
bits1 ^= inv1; | |||
bits2 ^= inv2; | |||
@@ -66,7 +66,7 @@ void gf_strong_reduce (gf inout); | |||
void gf_add (gf out, const gf a, const gf b); | |||
void gf_sub (gf out, const gf a, const gf b); | |||
void gf_mul (gf_s *__restrict__ out, const gf a, const gf b); | |||
void gf_mulw (gf_s *__restrict__ out, const gf a, uint64_t b); | |||
void gf_mulw (gf_s *__restrict__ out, const gf a, uint32_t b); | |||
void gf_sqr (gf_s *__restrict__ out, const gf a); | |||
void gf_serialize (uint8_t *serial, const gf x); | |||
void gf_isr(gf a, const gf x); /** a^2 x = 1, QNR, or 0 if x=0 */ | |||
@@ -51,7 +51,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||
c[1] += accum; | |||
} | |||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||
const uint32_t *a = as->limb, maske = ((1<<26)-1), masko = ((1<<25)-1); | |||
uint32_t blo = b & maske, bhi = b>>26, bhi2 = 2*bhi; | |||
uint32_t *c = cs->limb; | |||
@@ -34,7 +34,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||
c[1] += accum; | |||
} | |||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||
const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | |||
int i; | |||
@@ -4,6 +4,7 @@ | |||
#include "f_field.h" | |||
/** Requires: input limbs < 9*2^51 */ | |||
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||
const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); | |||
uint64_t *c = cs->limb; | |||
@@ -65,6 +66,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||
ai = a[4]; | |||
mac_rm(&accum1, ai, &b[0]); | |||
/* Here accum1 < 5*(9*2^51)^2 */ | |||
c[3] = accum0 & mask; | |||
accum1 += shrld(accum0, 51); | |||
@@ -72,13 +74,16 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | |||
* = 2^(-13 + <13) | |||
* PERF: good enough to fit into uint64_t? | |||
* PERF: good enough to fit into uint64_t. | |||
*/ | |||
uint64_t a1 = shrld(accum1,51); | |||
accum1 = (__uint128_t)a1 * 19 + c0; | |||
/* Here a1 < (5*(9*2^51)^2 + small) >> 51 = 405 * 2^51 + small | |||
* a1 * 19 + c0 < (405*19+1)*2^51 + small < 2^13 * 2^51. | |||
*/ | |||
accum1 = a1 * 19 + c0; | |||
c[0] = accum1 & mask; | |||
c[1] = c1 + shrld(accum1,51); | |||
c[1] = c1 + (accum1>>51); | |||
} | |||
void gf_sqr (gf_s *__restrict__ cs, const gf as) { | |||
@@ -132,16 +137,15 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { | |||
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | |||
* = 2^(-13 + <13) | |||
* PERF: good enough to fit into uint64_t? | |||
*/ | |||
uint64_t a1 = shrld(accum1,51); | |||
accum1 = (__uint128_t)a1 * 19 + c0; | |||
accum1 = a1 * 19 + c0; | |||
c[0] = accum1 & mask; | |||
c[1] = c1 + shrld(accum1,51); | |||
c[1] = c1 + (accum1>>51); | |||
} | |||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||
const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | |||
uint64_t *c = cs->limb; | |||
@@ -164,9 +168,9 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||
mac_rm(&accum, b, &a[4]); | |||
c[4] = accum & mask; | |||
accum = shrld(accum,51); | |||
accum = accum * 19 + c0; | |||
uint64_t a1 = shrld(accum,51); | |||
a1 = a1*19+c0; | |||
c[0] = accum & mask; | |||
c[1] = c1 + shrld(accum,51); | |||
c[0] = a1 & mask; | |||
c[1] = c1 + (a1>>51); | |||
} |
@@ -60,8 +60,8 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||
c[1] += ((uint32_t)(accum1)); | |||
} | |||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||
const uint32_t bhi = b>>28, blo = b & ((1<<28)-1); | |||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||
assert(b<1<<28); | |||
const uint32_t *a = as->limb; | |||
uint32_t *c = cs->limb; | |||
@@ -71,20 +71,15 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||
int i; | |||
accum0 = widemul(blo, a[0]); | |||
accum8 = widemul(blo, a[8]); | |||
accum0 += widemul(bhi, a[15]); | |||
accum8 += widemul(bhi, a[15] + a[7]); | |||
accum0 = widemul(b, a[0]); | |||
accum8 = widemul(b, a[8]); | |||
c[0] = accum0 & mask; accum0 >>= 28; | |||
c[8] = accum8 & mask; accum8 >>= 28; | |||
for (i=1; i<8; i++) { | |||
accum0 += widemul(blo, a[i]); | |||
accum8 += widemul(blo, a[i+8]); | |||
accum0 += widemul(bhi, a[i-1]); | |||
accum8 += widemul(bhi, a[i+7]); | |||
accum0 += widemul(b, a[i]); | |||
accum8 += widemul(b, a[i+8]); | |||
c[i] = accum0 & mask; accum0 >>= 28; | |||
c[i+8] = accum8 & mask; accum8 >>= 28; | |||
@@ -43,8 +43,8 @@ void gf_bias (gf a, int amt) { | |||
} | |||
void gf_weak_reduce (gf a) { | |||
uint64_t mask = (1ull<<28) - 1; | |||
uint64_t tmp = a->limb[15] >> 28; | |||
uint32_t mask = (1ull<<28) - 1; | |||
uint32_t tmp = a->limb[15] >> 28; | |||
a->limb[8] += tmp; | |||
for (unsigned int i=15; i>0; i--) { | |||
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); | |||
@@ -724,10 +724,10 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { | |||
void gf_mulw ( | |||
gf_s *__restrict__ cs, | |||
const gf as, | |||
uint64_t b | |||
uint32_t b | |||
) { | |||
uint32_t mask = (1ull<<28)-1; | |||
const uint32_t bhi = b>>28, blo = b & mask; | |||
assert(b <= mask); | |||
const uint32_t *a = as->limb; | |||
uint32_t *c = cs->limb; | |||
@@ -737,11 +737,9 @@ void gf_mulw ( | |||
int i; | |||
uint32_t c0, c8, n0, n8; | |||
accum0 = widemul(bhi, a[15]); | |||
accum8 = widemul(bhi, a[15] + a[7]); | |||
c0 = a[0]; c8 = a[8]; | |||
smlal(&accum0, blo, c0); | |||
smlal(&accum8, blo, c8); | |||
accum0 = widemul(b, c0); | |||
accum8 = widemul(b, c8); | |||
c[0] = accum0 & mask; accum0 >>= 28; | |||
c[8] = accum8 & mask; accum8 >>= 28; | |||
@@ -749,10 +747,8 @@ void gf_mulw ( | |||
i=1; | |||
{ | |||
n0 = a[i]; n8 = a[i+8]; | |||
smlal(&accum0, bhi, c0); | |||
smlal(&accum8, bhi, c8); | |||
smlal(&accum0, blo, n0); | |||
smlal(&accum8, blo, n8); | |||
smlal(&accum0, b, n0); | |||
smlal(&accum8, b, n8); | |||
c[i] = accum0 & mask; accum0 >>= 28; | |||
c[i+8] = accum8 & mask; accum8 >>= 28; | |||
@@ -760,10 +756,8 @@ void gf_mulw ( | |||
} | |||
{ | |||
c0 = a[i]; c8 = a[i+8]; | |||
smlal(&accum0, bhi, n0); | |||
smlal(&accum8, bhi, n8); | |||
smlal(&accum0, blo, c0); | |||
smlal(&accum8, blo, c8); | |||
smlal(&accum0, b, c0); | |||
smlal(&accum8, b, c8); | |||
c[i] = accum0 & mask; accum0 >>= 28; | |||
c[i+8] = accum8 & mask; accum8 >>= 28; | |||
@@ -771,10 +765,8 @@ void gf_mulw ( | |||
} | |||
{ | |||
n0 = a[i]; n8 = a[i+8]; | |||
smlal(&accum0, bhi, c0); | |||
smlal(&accum8, bhi, c8); | |||
smlal(&accum0, blo, n0); | |||
smlal(&accum8, blo, n8); | |||
smlal(&accum0, b, n0); | |||
smlal(&accum8, b, n8); | |||
c[i] = accum0 & mask; accum0 >>= 28; | |||
c[i+8] = accum8 & mask; accum8 >>= 28; | |||
@@ -782,10 +774,8 @@ void gf_mulw ( | |||
} | |||
{ | |||
c0 = a[i]; c8 = a[i+8]; | |||
smlal(&accum0, bhi, n0); | |||
smlal(&accum8, bhi, n8); | |||
smlal(&accum0, blo, c0); | |||
smlal(&accum8, blo, c8); | |||
smlal(&accum0, b, c0); | |||
smlal(&accum8, b, c8); | |||
c[i] = accum0 & mask; accum0 >>= 28; | |||
c[i+8] = accum8 & mask; accum8 >>= 28; | |||
@@ -793,10 +783,8 @@ void gf_mulw ( | |||
} | |||
{ | |||
n0 = a[i]; n8 = a[i+8]; | |||
smlal(&accum0, bhi, c0); | |||
smlal(&accum8, bhi, c8); | |||
smlal(&accum0, blo, n0); | |||
smlal(&accum8, blo, n8); | |||
smlal(&accum0, b, n0); | |||
smlal(&accum8, b, n8); | |||
c[i] = accum0 & mask; accum0 >>= 28; | |||
c[i+8] = accum8 & mask; accum8 >>= 28; | |||
@@ -804,10 +792,8 @@ void gf_mulw ( | |||
} | |||
{ | |||
c0 = a[i]; c8 = a[i+8]; | |||
smlal(&accum0, bhi, n0); | |||
smlal(&accum8, bhi, n8); | |||
smlal(&accum0, blo, c0); | |||
smlal(&accum8, blo, c8); | |||
smlal(&accum0, b, c0); | |||
smlal(&accum8, b, c8); | |||
c[i] = accum0 & mask; accum0 >>= 28; | |||
c[i+8] = accum8 & mask; accum8 >>= 28; | |||
@@ -815,10 +801,8 @@ void gf_mulw ( | |||
} | |||
{ | |||
n0 = a[i]; n8 = a[i+8]; | |||
smlal(&accum0, bhi, c0); | |||
smlal(&accum8, bhi, c8); | |||
smlal(&accum0, blo, n0); | |||
smlal(&accum8, blo, n8); | |||
smlal(&accum0, b, n0); | |||
smlal(&accum8, b, n8); | |||
c[i] = accum0 & mask; accum0 >>= 28; | |||
c[i+8] = accum8 & mask; accum8 >>= 28; | |||
@@ -549,20 +549,18 @@ void gf_sqr (gf_s *__restrict__ cs, const gf bs) { | |||
); | |||
} | |||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||
uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1}; | |||
assert(b<(1<<28)); | |||
uint64x2_t accum; | |||
const uint32x2_t *va = (const uint32x2_t *) as->limb; | |||
uint32x2_t *vo = (uint32x2_t *) cs->limb; | |||
uint32x2_t vc, vn; | |||
uint32x2_t vb = {b & ((1<<28)-1), b>>28}; | |||
accum = vmull_lane_u32(va[7], vb, 1); | |||
accum = xx_vaddup_u64(vrev128_u64(accum)); | |||
uint32x2_t vb = {b, 0}; | |||
vc = va[0]; | |||
accum = vmlal_lane_u32(accum, vc, vb, 0); | |||
accum = vmull_lane_u32(accum, vc, vb, 0); | |||
vo[0] = vmovn_u64(accum) & vmask; | |||
accum = vshrq_n_u64(accum,28); | |||
@@ -579,7 +577,6 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||
int i; | |||
for (i=1; i<8; i++) { | |||
vn = va[i]; | |||
accum = vmlal_lane_u32(accum, vc, vb, 1); | |||
accum = vmlal_lane_u32(accum, vn, vb, 0); | |||
vo[i] = vmovn_u64(accum) & vmask; | |||
accum = vshrq_n_u64(accum,28); | |||
@@ -165,7 +165,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||
c[1] += ((uint64_t)(accum1)); | |||
} | |||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||
const uint64_t *a = as->limb; | |||
uint64_t *c = cs->limb; | |||
@@ -139,7 +139,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||
c[0] += ((uint64_t)(accum1)); | |||
} | |||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||
const uint64_t *a = as->limb; | |||
uint64_t *c = cs->limb; | |||
@@ -1,6 +1,6 @@ | |||
/** | |||
* @cond internal | |||
* @file decaf_crypto.c | |||
* @file per_field.c | |||
* @copyright | |||
* Copyright (c) 2015-2016 Cryptography Research, Inc. \n | |||
* Released under the MIT License. See LICENSE.txt for license information. | |||