@@ -109,13 +109,6 @@ cond_neg(gf x, mask_t neg) { | |||||
static INLINE void | static INLINE void | ||||
cond_swap(gf x, gf_s *__restrict__ y, mask_t swap) { | cond_swap(gf x, gf_s *__restrict__ y, mask_t swap) { | ||||
constant_time_cond_swap(x,y,sizeof(gf_s),swap); | constant_time_cond_swap(x,y,sizeof(gf_s),swap); | ||||
/* | |||||
UNROLL for (unsigned int i=0; i<sizeof(x->limb)/sizeof(x->limb[0]); i++) { | |||||
decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap; | |||||
x->limb[i] ^= s; | |||||
y->limb[i] ^= s; | |||||
} | |||||
*/ | |||||
} | } | ||||
/** Inverse square root using addition chain. */ | /** Inverse square root using addition chain. */ | ||||
@@ -133,7 +126,7 @@ static void | |||||
gf_invert(gf y, const gf x) { | gf_invert(gf y, const gf x) { | ||||
gf t1, t2; | gf t1, t2; | ||||
gf_sqr(t1, x); // o^2 | gf_sqr(t1, x); // o^2 | ||||
decaf_bool_t ret = gf_isqrt_chk(t2, t1, 0); // +-1/sqrt(o^2) = +-1/o | |||||
mask_t ret = gf_isqrt_chk(t2, t1, 0); // +-1/sqrt(o^2) = +-1/o | |||||
(void)ret; assert(ret); | (void)ret; assert(ret); | ||||
gf_sqr(t1, t2); | gf_sqr(t1, t2); | ||||
gf_mul(t2, t1, x); // not direct to y in case of alias. | gf_mul(t2, t1, x); // not direct to y in case of alias. | ||||
@@ -142,7 +135,7 @@ gf_invert(gf y, const gf x) { | |||||
/** Mul by signed int. Not constant-time WRT the sign of that int. */ | /** Mul by signed int. Not constant-time WRT the sign of that int. */ | ||||
static INLINE void | static INLINE void | ||||
gf_mulw_sgn(gf c, const gf a, int w) { | |||||
gf_mulw_sgn(gf c, const gf a, int32_t w) { | |||||
if (w>0) { | if (w>0) { | ||||
gf_mulw(c, a, w); | gf_mulw(c, a, w); | ||||
} else { | } else { | ||||
@@ -152,7 +145,7 @@ gf_mulw_sgn(gf c, const gf a, int w) { | |||||
} | } | ||||
/** Return high bit of x = low bit of 2x mod p */ | /** Return high bit of x = low bit of 2x mod p */ | ||||
static decaf_word_t hibit(const gf x) { | |||||
static mask_t hibit(const gf x) { | |||||
gf y; | gf y; | ||||
gf_add(y,x,x); | gf_add(y,x,x); | ||||
gf_strong_reduce(y); | gf_strong_reduce(y); | ||||
@@ -161,7 +154,7 @@ static decaf_word_t hibit(const gf x) { | |||||
#if COFACTOR==8 | #if COFACTOR==8 | ||||
/** Return low bit of x mod p */ | /** Return low bit of x mod p */ | ||||
static decaf_word_t lobit(const gf x) { | |||||
static mask_t lobit(const gf x) { | |||||
gf y; | gf y; | ||||
gf_copy(y,x); | gf_copy(y,x); | ||||
gf_strong_reduce(y); | gf_strong_reduce(y); | ||||
@@ -873,9 +866,9 @@ static INLINE void | |||||
constant_time_lookup_xx ( | constant_time_lookup_xx ( | ||||
void *__restrict__ out_, | void *__restrict__ out_, | ||||
const void *table_, | const void *table_, | ||||
decaf_word_t elem_bytes, | |||||
decaf_word_t n_table, | |||||
decaf_word_t idx | |||||
word_t elem_bytes, | |||||
word_t n_table, | |||||
word_t idx | |||||
) { | ) { | ||||
constant_time_lookup(out_,table_,elem_bytes,n_table,idx); | constant_time_lookup(out_,table_,elem_bytes,n_table,idx); | ||||
} | } | ||||
@@ -928,12 +921,12 @@ void API_NS(point_scalarmul) ( | |||||
for (; i>=0; i-=WINDOW) { | for (; i>=0; i-=WINDOW) { | ||||
/* Fetch another block of bits */ | /* Fetch another block of bits */ | ||||
decaf_word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS); | |||||
word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS); | |||||
if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { | if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { | ||||
bits ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | bits ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | ||||
} | } | ||||
bits &= WINDOW_MASK; | bits &= WINDOW_MASK; | ||||
decaf_word_t inv = (bits>>(WINDOW-1))-1; | |||||
mask_t inv = (bits>>(WINDOW-1))-1; | |||||
bits ^= inv; | bits ^= inv; | ||||
/* Add in from table. Compute t only on last iteration. */ | /* Add in from table. Compute t only on last iteration. */ | ||||
@@ -993,7 +986,7 @@ void API_NS(point_double_scalarmul) ( | |||||
for (; i>=0; i-=WINDOW) { | for (; i>=0; i-=WINDOW) { | ||||
/* Fetch another block of bits */ | /* Fetch another block of bits */ | ||||
decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS), | |||||
word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS), | |||||
bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS); | bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS); | ||||
if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { | if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { | ||||
bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | ||||
@@ -1001,8 +994,8 @@ void API_NS(point_double_scalarmul) ( | |||||
} | } | ||||
bits1 &= WINDOW_MASK; | bits1 &= WINDOW_MASK; | ||||
bits2 &= WINDOW_MASK; | bits2 &= WINDOW_MASK; | ||||
decaf_word_t inv1 = (bits1>>(WINDOW-1))-1; | |||||
decaf_word_t inv2 = (bits2>>(WINDOW-1))-1; | |||||
mask_t inv1 = (bits1>>(WINDOW-1))-1; | |||||
mask_t inv2 = (bits2>>(WINDOW-1))-1; | |||||
bits1 ^= inv1; | bits1 ^= inv1; | ||||
bits2 ^= inv2; | bits2 ^= inv2; | ||||
@@ -1079,16 +1072,16 @@ void API_NS(point_dual_scalarmul) ( | |||||
} | } | ||||
/* Fetch another block of bits */ | /* Fetch another block of bits */ | ||||
decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS), | |||||
bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS); | |||||
word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS), | |||||
bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS); | |||||
if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { | if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { | ||||
bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | ||||
bits2 ^= scalar2x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | bits2 ^= scalar2x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | ||||
} | } | ||||
bits1 &= WINDOW_MASK; | bits1 &= WINDOW_MASK; | ||||
bits2 &= WINDOW_MASK; | bits2 &= WINDOW_MASK; | ||||
decaf_word_t inv1 = (bits1>>(WINDOW-1))-1; | |||||
decaf_word_t inv2 = (bits2>>(WINDOW-1))-1; | |||||
mask_t inv1 = (bits1>>(WINDOW-1))-1; | |||||
mask_t inv2 = (bits2>>(WINDOW-1))-1; | |||||
bits1 ^= inv1; | bits1 ^= inv1; | ||||
bits2 ^= inv2; | bits2 ^= inv2; | ||||
@@ -66,7 +66,7 @@ void gf_strong_reduce (gf inout); | |||||
void gf_add (gf out, const gf a, const gf b); | void gf_add (gf out, const gf a, const gf b); | ||||
void gf_sub (gf out, const gf a, const gf b); | void gf_sub (gf out, const gf a, const gf b); | ||||
void gf_mul (gf_s *__restrict__ out, const gf a, const gf b); | void gf_mul (gf_s *__restrict__ out, const gf a, const gf b); | ||||
void gf_mulw (gf_s *__restrict__ out, const gf a, uint64_t b); | |||||
void gf_mulw (gf_s *__restrict__ out, const gf a, uint32_t b); | |||||
void gf_sqr (gf_s *__restrict__ out, const gf a); | void gf_sqr (gf_s *__restrict__ out, const gf a); | ||||
void gf_serialize (uint8_t *serial, const gf x); | void gf_serialize (uint8_t *serial, const gf x); | ||||
void gf_isr(gf a, const gf x); /** a^2 x = 1, QNR, or 0 if x=0 */ | void gf_isr(gf a, const gf x); /** a^2 x = 1, QNR, or 0 if x=0 */ | ||||
@@ -51,7 +51,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||||
c[1] += accum; | c[1] += accum; | ||||
} | } | ||||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||||
const uint32_t *a = as->limb, maske = ((1<<26)-1), masko = ((1<<25)-1); | const uint32_t *a = as->limb, maske = ((1<<26)-1), masko = ((1<<25)-1); | ||||
uint32_t blo = b & maske, bhi = b>>26, bhi2 = 2*bhi; | uint32_t blo = b & maske, bhi = b>>26, bhi2 = 2*bhi; | ||||
uint32_t *c = cs->limb; | uint32_t *c = cs->limb; | ||||
@@ -34,7 +34,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||||
c[1] += accum; | c[1] += accum; | ||||
} | } | ||||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||||
const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | ||||
int i; | int i; | ||||
@@ -4,6 +4,7 @@ | |||||
#include "f_field.h" | #include "f_field.h" | ||||
/** Requires: input limbs < 9*2^51 */ | |||||
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | ||||
const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); | const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); | ||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
@@ -65,6 +66,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||||
ai = a[4]; | ai = a[4]; | ||||
mac_rm(&accum1, ai, &b[0]); | mac_rm(&accum1, ai, &b[0]); | ||||
/* Here accum1 < 5*(9*2^51)^2 */ | |||||
c[3] = accum0 & mask; | c[3] = accum0 & mask; | ||||
accum1 += shrld(accum0, 51); | accum1 += shrld(accum0, 51); | ||||
@@ -72,13 +74,16 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||||
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | ||||
* = 2^(-13 + <13) | * = 2^(-13 + <13) | ||||
* PERF: good enough to fit into uint64_t? | |||||
* PERF: good enough to fit into uint64_t. | |||||
*/ | */ | ||||
uint64_t a1 = shrld(accum1,51); | uint64_t a1 = shrld(accum1,51); | ||||
accum1 = (__uint128_t)a1 * 19 + c0; | |||||
/* Here a1 < (5*(9*2^51)^2 + small) >> 51 = 405 * 2^51 + small | |||||
* a1 * 19 + c0 < (405*19+1)*2^51 + small < 2^13 * 2^51. | |||||
*/ | |||||
accum1 = a1 * 19 + c0; | |||||
c[0] = accum1 & mask; | c[0] = accum1 & mask; | ||||
c[1] = c1 + shrld(accum1,51); | |||||
c[1] = c1 + (accum1>>51); | |||||
} | } | ||||
void gf_sqr (gf_s *__restrict__ cs, const gf as) { | void gf_sqr (gf_s *__restrict__ cs, const gf as) { | ||||
@@ -132,16 +137,15 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { | |||||
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | ||||
* = 2^(-13 + <13) | * = 2^(-13 + <13) | ||||
* PERF: good enough to fit into uint64_t? | |||||
*/ | */ | ||||
uint64_t a1 = shrld(accum1,51); | uint64_t a1 = shrld(accum1,51); | ||||
accum1 = (__uint128_t)a1 * 19 + c0; | |||||
accum1 = a1 * 19 + c0; | |||||
c[0] = accum1 & mask; | c[0] = accum1 & mask; | ||||
c[1] = c1 + shrld(accum1,51); | |||||
c[1] = c1 + (accum1>>51); | |||||
} | } | ||||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||||
const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | ||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
@@ -164,9 +168,9 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
mac_rm(&accum, b, &a[4]); | mac_rm(&accum, b, &a[4]); | ||||
c[4] = accum & mask; | c[4] = accum & mask; | ||||
accum = shrld(accum,51); | |||||
accum = accum * 19 + c0; | |||||
uint64_t a1 = shrld(accum,51); | |||||
a1 = a1*19+c0; | |||||
c[0] = accum & mask; | |||||
c[1] = c1 + shrld(accum,51); | |||||
c[0] = a1 & mask; | |||||
c[1] = c1 + (a1>>51); | |||||
} | } |
@@ -60,8 +60,8 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||||
c[1] += ((uint32_t)(accum1)); | c[1] += ((uint32_t)(accum1)); | ||||
} | } | ||||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
const uint32_t bhi = b>>28, blo = b & ((1<<28)-1); | |||||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||||
assert(b<1<<28); | |||||
const uint32_t *a = as->limb; | const uint32_t *a = as->limb; | ||||
uint32_t *c = cs->limb; | uint32_t *c = cs->limb; | ||||
@@ -71,20 +71,15 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
int i; | int i; | ||||
accum0 = widemul(blo, a[0]); | |||||
accum8 = widemul(blo, a[8]); | |||||
accum0 += widemul(bhi, a[15]); | |||||
accum8 += widemul(bhi, a[15] + a[7]); | |||||
accum0 = widemul(b, a[0]); | |||||
accum8 = widemul(b, a[8]); | |||||
c[0] = accum0 & mask; accum0 >>= 28; | c[0] = accum0 & mask; accum0 >>= 28; | ||||
c[8] = accum8 & mask; accum8 >>= 28; | c[8] = accum8 & mask; accum8 >>= 28; | ||||
for (i=1; i<8; i++) { | for (i=1; i<8; i++) { | ||||
accum0 += widemul(blo, a[i]); | |||||
accum8 += widemul(blo, a[i+8]); | |||||
accum0 += widemul(bhi, a[i-1]); | |||||
accum8 += widemul(bhi, a[i+7]); | |||||
accum0 += widemul(b, a[i]); | |||||
accum8 += widemul(b, a[i+8]); | |||||
c[i] = accum0 & mask; accum0 >>= 28; | c[i] = accum0 & mask; accum0 >>= 28; | ||||
c[i+8] = accum8 & mask; accum8 >>= 28; | c[i+8] = accum8 & mask; accum8 >>= 28; | ||||
@@ -43,8 +43,8 @@ void gf_bias (gf a, int amt) { | |||||
} | } | ||||
void gf_weak_reduce (gf a) { | void gf_weak_reduce (gf a) { | ||||
uint64_t mask = (1ull<<28) - 1; | |||||
uint64_t tmp = a->limb[15] >> 28; | |||||
uint32_t mask = (1ull<<28) - 1; | |||||
uint32_t tmp = a->limb[15] >> 28; | |||||
a->limb[8] += tmp; | a->limb[8] += tmp; | ||||
for (unsigned int i=15; i>0; i--) { | for (unsigned int i=15; i>0; i--) { | ||||
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); | a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); | ||||
@@ -724,10 +724,10 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { | |||||
void gf_mulw ( | void gf_mulw ( | ||||
gf_s *__restrict__ cs, | gf_s *__restrict__ cs, | ||||
const gf as, | const gf as, | ||||
uint64_t b | |||||
uint32_t b | |||||
) { | ) { | ||||
uint32_t mask = (1ull<<28)-1; | uint32_t mask = (1ull<<28)-1; | ||||
const uint32_t bhi = b>>28, blo = b & mask; | |||||
assert(b <= mask); | |||||
const uint32_t *a = as->limb; | const uint32_t *a = as->limb; | ||||
uint32_t *c = cs->limb; | uint32_t *c = cs->limb; | ||||
@@ -737,11 +737,9 @@ void gf_mulw ( | |||||
int i; | int i; | ||||
uint32_t c0, c8, n0, n8; | uint32_t c0, c8, n0, n8; | ||||
accum0 = widemul(bhi, a[15]); | |||||
accum8 = widemul(bhi, a[15] + a[7]); | |||||
c0 = a[0]; c8 = a[8]; | c0 = a[0]; c8 = a[8]; | ||||
smlal(&accum0, blo, c0); | |||||
smlal(&accum8, blo, c8); | |||||
accum0 = widemul(b, c0); | |||||
accum8 = widemul(b, c8); | |||||
c[0] = accum0 & mask; accum0 >>= 28; | c[0] = accum0 & mask; accum0 >>= 28; | ||||
c[8] = accum8 & mask; accum8 >>= 28; | c[8] = accum8 & mask; accum8 >>= 28; | ||||
@@ -749,10 +747,8 @@ void gf_mulw ( | |||||
i=1; | i=1; | ||||
{ | { | ||||
n0 = a[i]; n8 = a[i+8]; | n0 = a[i]; n8 = a[i+8]; | ||||
smlal(&accum0, bhi, c0); | |||||
smlal(&accum8, bhi, c8); | |||||
smlal(&accum0, blo, n0); | |||||
smlal(&accum8, blo, n8); | |||||
smlal(&accum0, b, n0); | |||||
smlal(&accum8, b, n8); | |||||
c[i] = accum0 & mask; accum0 >>= 28; | c[i] = accum0 & mask; accum0 >>= 28; | ||||
c[i+8] = accum8 & mask; accum8 >>= 28; | c[i+8] = accum8 & mask; accum8 >>= 28; | ||||
@@ -760,10 +756,8 @@ void gf_mulw ( | |||||
} | } | ||||
{ | { | ||||
c0 = a[i]; c8 = a[i+8]; | c0 = a[i]; c8 = a[i+8]; | ||||
smlal(&accum0, bhi, n0); | |||||
smlal(&accum8, bhi, n8); | |||||
smlal(&accum0, blo, c0); | |||||
smlal(&accum8, blo, c8); | |||||
smlal(&accum0, b, c0); | |||||
smlal(&accum8, b, c8); | |||||
c[i] = accum0 & mask; accum0 >>= 28; | c[i] = accum0 & mask; accum0 >>= 28; | ||||
c[i+8] = accum8 & mask; accum8 >>= 28; | c[i+8] = accum8 & mask; accum8 >>= 28; | ||||
@@ -771,10 +765,8 @@ void gf_mulw ( | |||||
} | } | ||||
{ | { | ||||
n0 = a[i]; n8 = a[i+8]; | n0 = a[i]; n8 = a[i+8]; | ||||
smlal(&accum0, bhi, c0); | |||||
smlal(&accum8, bhi, c8); | |||||
smlal(&accum0, blo, n0); | |||||
smlal(&accum8, blo, n8); | |||||
smlal(&accum0, b, n0); | |||||
smlal(&accum8, b, n8); | |||||
c[i] = accum0 & mask; accum0 >>= 28; | c[i] = accum0 & mask; accum0 >>= 28; | ||||
c[i+8] = accum8 & mask; accum8 >>= 28; | c[i+8] = accum8 & mask; accum8 >>= 28; | ||||
@@ -782,10 +774,8 @@ void gf_mulw ( | |||||
} | } | ||||
{ | { | ||||
c0 = a[i]; c8 = a[i+8]; | c0 = a[i]; c8 = a[i+8]; | ||||
smlal(&accum0, bhi, n0); | |||||
smlal(&accum8, bhi, n8); | |||||
smlal(&accum0, blo, c0); | |||||
smlal(&accum8, blo, c8); | |||||
smlal(&accum0, b, c0); | |||||
smlal(&accum8, b, c8); | |||||
c[i] = accum0 & mask; accum0 >>= 28; | c[i] = accum0 & mask; accum0 >>= 28; | ||||
c[i+8] = accum8 & mask; accum8 >>= 28; | c[i+8] = accum8 & mask; accum8 >>= 28; | ||||
@@ -793,10 +783,8 @@ void gf_mulw ( | |||||
} | } | ||||
{ | { | ||||
n0 = a[i]; n8 = a[i+8]; | n0 = a[i]; n8 = a[i+8]; | ||||
smlal(&accum0, bhi, c0); | |||||
smlal(&accum8, bhi, c8); | |||||
smlal(&accum0, blo, n0); | |||||
smlal(&accum8, blo, n8); | |||||
smlal(&accum0, b, n0); | |||||
smlal(&accum8, b, n8); | |||||
c[i] = accum0 & mask; accum0 >>= 28; | c[i] = accum0 & mask; accum0 >>= 28; | ||||
c[i+8] = accum8 & mask; accum8 >>= 28; | c[i+8] = accum8 & mask; accum8 >>= 28; | ||||
@@ -804,10 +792,8 @@ void gf_mulw ( | |||||
} | } | ||||
{ | { | ||||
c0 = a[i]; c8 = a[i+8]; | c0 = a[i]; c8 = a[i+8]; | ||||
smlal(&accum0, bhi, n0); | |||||
smlal(&accum8, bhi, n8); | |||||
smlal(&accum0, blo, c0); | |||||
smlal(&accum8, blo, c8); | |||||
smlal(&accum0, b, c0); | |||||
smlal(&accum8, b, c8); | |||||
c[i] = accum0 & mask; accum0 >>= 28; | c[i] = accum0 & mask; accum0 >>= 28; | ||||
c[i+8] = accum8 & mask; accum8 >>= 28; | c[i+8] = accum8 & mask; accum8 >>= 28; | ||||
@@ -815,10 +801,8 @@ void gf_mulw ( | |||||
} | } | ||||
{ | { | ||||
n0 = a[i]; n8 = a[i+8]; | n0 = a[i]; n8 = a[i+8]; | ||||
smlal(&accum0, bhi, c0); | |||||
smlal(&accum8, bhi, c8); | |||||
smlal(&accum0, blo, n0); | |||||
smlal(&accum8, blo, n8); | |||||
smlal(&accum0, b, n0); | |||||
smlal(&accum8, b, n8); | |||||
c[i] = accum0 & mask; accum0 >>= 28; | c[i] = accum0 & mask; accum0 >>= 28; | ||||
c[i+8] = accum8 & mask; accum8 >>= 28; | c[i+8] = accum8 & mask; accum8 >>= 28; | ||||
@@ -549,20 +549,18 @@ void gf_sqr (gf_s *__restrict__ cs, const gf bs) { | |||||
); | ); | ||||
} | } | ||||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||||
uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1}; | uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1}; | ||||
assert(b<(1<<28)); | |||||
uint64x2_t accum; | uint64x2_t accum; | ||||
const uint32x2_t *va = (const uint32x2_t *) as->limb; | const uint32x2_t *va = (const uint32x2_t *) as->limb; | ||||
uint32x2_t *vo = (uint32x2_t *) cs->limb; | uint32x2_t *vo = (uint32x2_t *) cs->limb; | ||||
uint32x2_t vc, vn; | uint32x2_t vc, vn; | ||||
uint32x2_t vb = {b & ((1<<28)-1), b>>28}; | |||||
accum = vmull_lane_u32(va[7], vb, 1); | |||||
accum = xx_vaddup_u64(vrev128_u64(accum)); | |||||
uint32x2_t vb = {b, 0}; | |||||
vc = va[0]; | vc = va[0]; | ||||
accum = vmlal_lane_u32(accum, vc, vb, 0); | |||||
accum = vmull_lane_u32(accum, vc, vb, 0); | |||||
vo[0] = vmovn_u64(accum) & vmask; | vo[0] = vmovn_u64(accum) & vmask; | ||||
accum = vshrq_n_u64(accum,28); | accum = vshrq_n_u64(accum,28); | ||||
@@ -579,7 +577,6 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
int i; | int i; | ||||
for (i=1; i<8; i++) { | for (i=1; i<8; i++) { | ||||
vn = va[i]; | vn = va[i]; | ||||
accum = vmlal_lane_u32(accum, vc, vb, 1); | |||||
accum = vmlal_lane_u32(accum, vn, vb, 0); | accum = vmlal_lane_u32(accum, vn, vb, 0); | ||||
vo[i] = vmovn_u64(accum) & vmask; | vo[i] = vmovn_u64(accum) & vmask; | ||||
accum = vshrq_n_u64(accum,28); | accum = vshrq_n_u64(accum,28); | ||||
@@ -165,7 +165,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||||
c[1] += ((uint64_t)(accum1)); | c[1] += ((uint64_t)(accum1)); | ||||
} | } | ||||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||||
const uint64_t *a = as->limb; | const uint64_t *a = as->limb; | ||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
@@ -139,7 +139,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||||
c[0] += ((uint64_t)(accum1)); | c[0] += ((uint64_t)(accum1)); | ||||
} | } | ||||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||||
const uint64_t *a = as->limb; | const uint64_t *a = as->limb; | ||||
uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
@@ -1,6 +1,6 @@ | |||||
/** | /** | ||||
* @cond internal | * @cond internal | ||||
* @file decaf_crypto.c | |||||
* @file per_field.c | |||||
* @copyright | * @copyright | ||||
* Copyright (c) 2015-2016 Cryptography Research, Inc. \n | * Copyright (c) 2015-2016 Cryptography Research, Inc. \n | ||||
* Released under the MIT License. See LICENSE.txt for license information. | * Released under the MIT License. See LICENSE.txt for license information. | ||||