@@ -109,13 +109,6 @@ cond_neg(gf x, mask_t neg) {
 static INLINE void
 cond_swap(gf x, gf_s *__restrict__ y, mask_t swap) {
     constant_time_cond_swap(x,y,sizeof(gf_s),swap);
-    /*
-    UNROLL for (unsigned int i=0; i<sizeof(x->limb)/sizeof(x->limb[0]); i++) {
-        decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap;
-        x->limb[i] ^= s;
-        y->limb[i] ^= s;
-    }
-    */
 }

 /** Inverse square root using addition chain. */
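The removed comment was an unrolled form of the same mask-based swap that constant_time_cond_swap performs generically. For reference, a minimal standalone sketch of the technique (not the library's exact implementation; the word type here is a stand-in):

    #include <stddef.h>
    #include <stdint.h>

    typedef uint64_t word_t;   /* stand-in for the library's word type */

    /* Swap a and b if swap is all-ones, leave them if swap is all-zeros.
     * No secret-dependent branches or memory indices. */
    static void cond_swap_sketch(word_t *a, word_t *b, size_t nwords, word_t swap) {
        for (size_t i = 0; i < nwords; i++) {
            word_t s = (a[i] ^ b[i]) & swap;   /* 0, or a[i]^b[i] */
            a[i] ^= s;
            b[i] ^= s;
        }
    }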
@@ -133,7 +126,7 @@ static void
 gf_invert(gf y, const gf x) {
     gf t1, t2;
     gf_sqr(t1, x); // o^2
-    decaf_bool_t ret = gf_isqrt_chk(t2, t1, 0); // +-1/sqrt(o^2) = +-1/o
+    mask_t ret = gf_isqrt_chk(t2, t1, 0); // +-1/sqrt(o^2) = +-1/o
     (void)ret; assert(ret);
     gf_sqr(t1, t2);
     gf_mul(t2, t1, x); // not direct to y in case of alias.
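The sign ambiguity from gf_isqrt_chk cancels in the squaring; per the code's own comments, for nonzero x the chain computes:

    t2 = +-1/sqrt(x^2) = +-1/x
    t1 = t2^2          = 1/x^2
    y  = t1 * x        = 1/x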
@@ -142,7 +135,7 @@ gf_invert(gf y, const gf x) {
 /** Mul by signed int. Not constant-time WRT the sign of that int. */
 static INLINE void
-gf_mulw_sgn(gf c, const gf a, int w) {
+gf_mulw_sgn(gf c, const gf a, int32_t w) {
     if (w>0) {
         gf_mulw(c, a, w);
     } else {
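The else branch (just past this hunk) handles negative w by multiplying by -w and negating. A hedged sketch of the whole helper as it reads after the change, assuming the ZERO field constant used elsewhere in this file:

    /* Multiply by a signed 32-bit constant.  The branch on the sign of w
     * is acceptable only because w is a public curve constant, never a
     * secret -- hence the comment above. */
    static INLINE void
    gf_mulw_sgn_sketch(gf c, const gf a, int32_t w) {
        if (w>0) {
            gf_mulw(c, a, w);
        } else {
            gf_mulw(c, a, -w);   /* |w|; note -INT32_MIN would overflow */
            gf_sub(c, ZERO, c);  /* c = 0 - c */
        }
    }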
@@ -152,7 +145,7 @@ gf_mulw_sgn(gf c, const gf a, int w) {
 }

 /** Return high bit of x = low bit of 2x mod p */
-static decaf_word_t hibit(const gf x) {
+static mask_t hibit(const gf x) {
     gf y;
     gf_add(y,x,x);
     gf_strong_reduce(y);
@@ -161,7 +154,7 @@ static decaf_word_t hibit(const gf x) {
 #if COFACTOR==8
 /** Return high bit of x/2 = low bit of x mod p */
-static decaf_word_t lobit(const gf x) {
+static mask_t lobit(const gf x) {
     gf y;
     gf_copy(y,x);
     gf_strong_reduce(y);
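Both helpers return a full-width mask (all-zeros or all-ones) rather than a 0/1 value, which is what makes mask_t the honest return type. The conversion idiom, as a standalone sketch (typedefs stand in for the library's types):

    #include <stdint.h>

    typedef uint64_t word_t;
    typedef uint64_t mask_t;

    /* Turn the low bit of a canonically reduced word into an all-zeros /
     * all-ones mask -- the shape hibit/lobit return for constant-time selects. */
    static mask_t bit_to_mask_sketch(word_t w) {
        return -(mask_t)(w & 1);   /* 0 -> 0, 1 -> all-ones */
    }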
@@ -873,9 +866,9 @@ static INLINE void
 constant_time_lookup_xx (
     void *__restrict__ out_,
     const void *table_,
-    decaf_word_t elem_bytes,
-    decaf_word_t n_table,
-    decaf_word_t idx
+    word_t elem_bytes,
+    word_t n_table,
+    word_t idx
 ) {
     constant_time_lookup(out_,table_,elem_bytes,n_table,idx);
 }
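constant_time_lookup reads every table entry and keeps only the one whose index matches, so the memory access pattern is independent of idx. A minimal standalone sketch of that technique (byte-wise; not the library's exact code):

    #include <stddef.h>
    #include <stdint.h>

    static void ct_lookup_sketch(uint8_t *out, const uint8_t *table,
                                 size_t elem_bytes, size_t n_table, size_t idx) {
        for (size_t j = 0; j < elem_bytes; j++) out[j] = 0;
        for (size_t i = 0; i < n_table; i++) {
            size_t x = i ^ idx;   /* zero iff this is the wanted entry */
            /* branch-free: mask = 0xFF iff x == 0 */
            uint8_t mask = (uint8_t)-(uint8_t)((x - 1u) >> (sizeof(size_t)*8 - 1));
            for (size_t j = 0; j < elem_bytes; j++)
                out[j] |= table[i*elem_bytes + j] & mask;
        }
    }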
@@ -928,12 +921,12 @@ void API_NS(point_scalarmul) (
     for (; i>=0; i-=WINDOW) {
         /* Fetch another block of bits */
-        decaf_word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS);
+        word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS);
         if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
             bits ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
         }
         bits &= WINDOW_MASK;
-        decaf_word_t inv = (bits>>(WINDOW-1))-1;
+        mask_t inv = (bits>>(WINDOW-1))-1;
         bits ^= inv;

         /* Add in from table. Compute t only on last iteration. */
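The inv lines are a branch-free signed-digit recoding: the window's top bit picks the sign, XOR with the resulting all-zeros/all-ones mask takes the complement, and inv is reused afterwards to conditionally negate the looked-up table entry. A small standalone demonstration of the mask mechanics (WINDOW = 5 chosen for concreteness):

    #include <stdint.h>
    #include <stdio.h>

    #define WINDOW 5
    #define WINDOW_MASK ((1u << WINDOW) - 1)

    int main(void) {
        for (uint32_t d = 0; d < (1u << WINDOW); d += 5) {
            uint32_t bits = d & WINDOW_MASK;
            uint32_t inv = (bits >> (WINDOW-1)) - 1;  /* 0 if top bit set, else all-ones */
            bits ^= inv;                              /* complement iff "negative" digit */
            printf("digit %2u -> table index %2u, negate: %s\n",
                   d, bits & (WINDOW_MASK >> 1), inv ? "yes" : "no");
        }
        return 0;
    }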
@@ -993,7 +986,7 @@ void API_NS(point_double_scalarmul) (
     for (; i>=0; i-=WINDOW) {
         /* Fetch another block of bits */
-        decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
+        word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
                      bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS);
         if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
             bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
@@ -1001,8 +994,8 @@ void API_NS(point_double_scalarmul) (
         }
         bits1 &= WINDOW_MASK;
         bits2 &= WINDOW_MASK;
-        decaf_word_t inv1 = (bits1>>(WINDOW-1))-1;
-        decaf_word_t inv2 = (bits2>>(WINDOW-1))-1;
+        mask_t inv1 = (bits1>>(WINDOW-1))-1;
+        mask_t inv2 = (bits2>>(WINDOW-1))-1;
         bits1 ^= inv1;
         bits2 ^= inv2;
@@ -1079,16 +1072,16 @@ void API_NS(point_dual_scalarmul) (
         }

         /* Fetch another block of bits */
-        decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
-                     bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS);
+        word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
+               bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS);
         if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
             bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
             bits2 ^= scalar2x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
         }
         bits1 &= WINDOW_MASK;
         bits2 &= WINDOW_MASK;
-        decaf_word_t inv1 = (bits1>>(WINDOW-1))-1;
-        decaf_word_t inv2 = (bits2>>(WINDOW-1))-1;
+        mask_t inv1 = (bits1>>(WINDOW-1))-1;
+        mask_t inv2 = (bits2>>(WINDOW-1))-1;
         bits1 ^= inv1;
         bits2 ^= inv2;
@@ -66,7 +66,7 @@ void gf_strong_reduce (gf inout);
 void gf_add (gf out, const gf a, const gf b);
 void gf_sub (gf out, const gf a, const gf b);
 void gf_mul (gf_s *__restrict__ out, const gf a, const gf b);
-void gf_mulw (gf_s *__restrict__ out, const gf a, uint64_t b);
+void gf_mulw (gf_s *__restrict__ out, const gf a, uint32_t b);
 void gf_sqr (gf_s *__restrict__ out, const gf a);
 void gf_serialize (uint8_t *serial, const gf x);
 void gf_isr(gf a, const gf x); /** a^2 x = 1, QNR, or 0 if x=0 */
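Narrowing b to uint32_t reflects how gf_mulw is called: the callers pass small, public curve constants, and the 28-bit-limb backends below now assert b < 2^28. A caller-side guard for that bound might look like this (illustrative only; 39081 is a hypothetical example constant, not a claim about any particular call site):

    #include <stdint.h>

    _Static_assert(39081u < (1u << 28), "gf_mulw operand must fit in 28 bits");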
@@ -51,7 +51,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[1] += accum;
 }

-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint32_t *a = as->limb, maske = ((1<<26)-1), masko = ((1<<25)-1);
     uint32_t blo = b & maske, bhi = b>>26, bhi2 = 2*bhi;
     uint32_t *c = cs->limb;
@@ -34,7 +34,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[1] += accum;
 }

-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
     int i;
@@ -4,6 +4,7 @@
 #include "f_field.h"

+/** Requires: input limbs < 9*2^51 */
 void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
     uint64_t *c = cs->limb;
@@ -65,6 +66,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     ai = a[4];
     mac_rm(&accum1, ai, &b[0]);
+    /* Here accum1 < 5*(9*2^51)^2 */
     c[3] = accum0 & mask;
     accum1 += shrld(accum0, 51);
@@ -72,13 +74,16 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
      * = 2^(-13 + <13)
-     * PERF: good enough to fit into uint64_t?
+     * PERF: good enough to fit into uint64_t.
      */
     uint64_t a1 = shrld(accum1,51);
-    accum1 = (__uint128_t)a1 * 19 + c0;
+    /* Here a1 < (5*(9*2^51)^2 + small) >> 51 = 405 * 2^51 + small
+     * a1 * 19 + c0 < (405*19+1)*2^51 + small < 2^13 * 2^51.
+     */
+    accum1 = a1 * 19 + c0;
     c[0] = accum1 & mask;
-    c[1] = c1 + shrld(accum1,51);
+    c[1] = c1 + (accum1>>51);
 }

 void gf_sqr (gf_s *__restrict__ cs, const gf as) {
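The bound comment added above can be checked mechanically. A standalone sketch that mirrors the arithmetic (constants taken from the comment, not from the code):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        /* accum1 < 5*(9*2^51)^2 = 405*2^102, per the comment above */
        __uint128_t accum1_max = (__uint128_t)405 << 102;
        uint64_t a1_max = (uint64_t)(accum1_max >> 51);   /* = 405*2^51 */
        uint64_t c0_max = ((uint64_t)1 << 51) - 1;        /* a 51-bit limb */
        /* (405*19 + 1)*2^51 < 2^13 * 2^51 = 2^64, so plain 64-bit
         * arithmetic cannot overflow where __uint128_t was used before */
        assert(((__uint128_t)a1_max * 19 + c0_max) >> 64 == 0);
        return 0;
    }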
@@ -132,16 +137,15 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
     /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
      * = 2^(-13 + <13)
-     * PERF: good enough to fit into uint64_t?
      */
     uint64_t a1 = shrld(accum1,51);
-    accum1 = (__uint128_t)a1 * 19 + c0;
+    accum1 = a1 * 19 + c0;
     c[0] = accum1 & mask;
-    c[1] = c1 + shrld(accum1,51);
+    c[1] = c1 + (accum1>>51);
 }

-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
     uint64_t *c = cs->limb;
@@ -164,9 +168,9 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
     mac_rm(&accum, b, &a[4]);
     c[4] = accum & mask;
-    accum = shrld(accum,51);
-    accum = accum * 19 + c0;
+    uint64_t a1 = shrld(accum,51);
+    a1 = a1*19+c0;
-    c[0] = accum & mask;
-    c[1] = c1 + shrld(accum,51);
+    c[0] = a1 & mask;
+    c[1] = c1 + (a1>>51);
 }
@@ -60,8 +60,8 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[1] += ((uint32_t)(accum1));
 }

-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
-    const uint32_t bhi = b>>28, blo = b & ((1<<28)-1);
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
+    assert(b<1<<28);
     const uint32_t *a = as->limb;
     uint32_t *c = cs->limb;
@@ -71,20 +71,15 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
     int i;

-    accum0 = widemul(blo, a[0]);
-    accum8 = widemul(blo, a[8]);
-    accum0 += widemul(bhi, a[15]);
-    accum8 += widemul(bhi, a[15] + a[7]);
+    accum0 = widemul(b, a[0]);
+    accum8 = widemul(b, a[8]);
     c[0] = accum0 & mask; accum0 >>= 28;
     c[8] = accum8 & mask; accum8 >>= 28;

     for (i=1; i<8; i++) {
-        accum0 += widemul(blo, a[i]);
-        accum8 += widemul(blo, a[i+8]);
-        accum0 += widemul(bhi, a[i-1]);
-        accum8 += widemul(bhi, a[i+7]);
+        accum0 += widemul(b, a[i]);
+        accum8 += widemul(b, a[i+8]);
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
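With b < 2^28 the bhi cross-terms drop out entirely, and each widemul product fits in 56 bits, so the running 64-bit accumulators have ample headroom across the loop. For reference, a sketch of the widening-multiply primitive these lines assume:

    #include <stdint.h>

    /* 32x32 -> 64 widening multiply, as assumed by the accumulation above */
    static inline uint64_t widemul(uint32_t a, uint32_t b) {
        return (uint64_t)a * b;
    }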
@@ -43,8 +43,8 @@ void gf_bias (gf a, int amt) {
 }

 void gf_weak_reduce (gf a) {
-    uint64_t mask = (1ull<<28) - 1;
-    uint64_t tmp = a->limb[15] >> 28;
+    uint32_t mask = (1ull<<28) - 1;
+    uint32_t tmp = a->limb[15] >> 28;
     a->limb[8] += tmp;
     for (unsigned int i=15; i>0; i--) {
         a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28);
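The carry out of limb 15 feeds limb 8 as well as (in the function's elided last line) limb 0, because with sixteen 28-bit limbs the prime is p = 2^448 - 2^224 - 1, so:

    tmp = a->limb[15] >> 28              (carry of weight 2^448)
    2^448 * tmp == (2^224 + 1) * tmp     (mod p)

i.e. one copy of tmp lands at limb 8 (weight 2^224) and one at limb 0.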
@@ -724,10 +724,10 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
 void gf_mulw (
     gf_s *__restrict__ cs,
     const gf as,
-    uint64_t b
+    uint32_t b
 ) {
     uint32_t mask = (1ull<<28)-1;
-    const uint32_t bhi = b>>28, blo = b & mask;
+    assert(b <= mask);
     const uint32_t *a = as->limb;
     uint32_t *c = cs->limb;
@@ -737,11 +737,9 @@ void gf_mulw (
     int i;
     uint32_t c0, c8, n0, n8;

-    accum0 = widemul(bhi, a[15]);
-    accum8 = widemul(bhi, a[15] + a[7]);
     c0 = a[0]; c8 = a[8];
-    smlal(&accum0, blo, c0);
-    smlal(&accum8, blo, c8);
+    accum0 = widemul(b, c0);
+    accum8 = widemul(b, c8);
     c[0] = accum0 & mask; accum0 >>= 28;
     c[8] = accum8 & mask; accum8 >>= 28;
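smlal accumulates a 32x32 widening product into a 64-bit accumulator in place; on ARM it lowers to the SMLAL instruction, and for operands below 2^28 the signed and unsigned products coincide. A portable sketch of the primitive these blocks assume:

    #include <stdint.h>

    /* Multiply-accumulate-long: *acc += a * b with a 64-bit accumulator */
    static inline void smlal(uint64_t *acc, uint32_t a, uint32_t b) {
        *acc += (uint64_t)a * b;
    }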
@@ -749,10 +747,8 @@ void gf_mulw (
     i=1;
     {
         n0 = a[i]; n8 = a[i+8];
-        smlal(&accum0, bhi, c0);
-        smlal(&accum8, bhi, c8);
-        smlal(&accum0, blo, n0);
-        smlal(&accum8, blo, n8);
+        smlal(&accum0, b, n0);
+        smlal(&accum8, b, n8);
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -760,10 +756,8 @@ void gf_mulw (
     }
     {
         c0 = a[i]; c8 = a[i+8];
-        smlal(&accum0, bhi, n0);
-        smlal(&accum8, bhi, n8);
-        smlal(&accum0, blo, c0);
-        smlal(&accum8, blo, c8);
+        smlal(&accum0, b, c0);
+        smlal(&accum8, b, c8);
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -771,10 +765,8 @@ void gf_mulw (
     }
     {
         n0 = a[i]; n8 = a[i+8];
-        smlal(&accum0, bhi, c0);
-        smlal(&accum8, bhi, c8);
-        smlal(&accum0, blo, n0);
-        smlal(&accum8, blo, n8);
+        smlal(&accum0, b, n0);
+        smlal(&accum8, b, n8);
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -782,10 +774,8 @@ void gf_mulw (
     }
     {
         c0 = a[i]; c8 = a[i+8];
-        smlal(&accum0, bhi, n0);
-        smlal(&accum8, bhi, n8);
-        smlal(&accum0, blo, c0);
-        smlal(&accum8, blo, c8);
+        smlal(&accum0, b, c0);
+        smlal(&accum8, b, c8);
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -793,10 +783,8 @@ void gf_mulw (
     }
     {
         n0 = a[i]; n8 = a[i+8];
-        smlal(&accum0, bhi, c0);
-        smlal(&accum8, bhi, c8);
-        smlal(&accum0, blo, n0);
-        smlal(&accum8, blo, n8);
+        smlal(&accum0, b, n0);
+        smlal(&accum8, b, n8);
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -804,10 +792,8 @@ void gf_mulw (
     }
     {
         c0 = a[i]; c8 = a[i+8];
-        smlal(&accum0, bhi, n0);
-        smlal(&accum8, bhi, n8);
-        smlal(&accum0, blo, c0);
-        smlal(&accum8, blo, c8);
+        smlal(&accum0, b, c0);
+        smlal(&accum8, b, c8);
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -815,10 +801,8 @@ void gf_mulw (
     }
     {
         n0 = a[i]; n8 = a[i+8];
-        smlal(&accum0, bhi, c0);
-        smlal(&accum8, bhi, c8);
-        smlal(&accum0, blo, n0);
-        smlal(&accum8, blo, n8);
+        smlal(&accum0, b, n0);
+        smlal(&accum8, b, n8);
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -549,20 +549,18 @@ void gf_sqr (gf_s *__restrict__ cs, const gf bs) {
     );
 }

-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1};
+    assert(b<(1<<28));

     uint64x2_t accum;
     const uint32x2_t *va = (const uint32x2_t *) as->limb;
     uint32x2_t *vo = (uint32x2_t *) cs->limb;
     uint32x2_t vc, vn;
-    uint32x2_t vb = {b & ((1<<28)-1), b>>28};
-    accum = vmull_lane_u32(va[7], vb, 1);
-    accum = xx_vaddup_u64(vrev128_u64(accum));
+    uint32x2_t vb = {b, 0};
     vc = va[0];
-    accum = vmlal_lane_u32(accum, vc, vb, 0);
+    accum = vmull_lane_u32(vc, vb, 0);
     vo[0] = vmovn_u64(accum) & vmask;
     accum = vshrq_n_u64(accum,28);
@@ -579,7 +577,6 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
     int i;
     for (i=1; i<8; i++) {
         vn = va[i];
-        accum = vmlal_lane_u32(accum, vc, vb, 1);
         accum = vmlal_lane_u32(accum, vn, vb, 0);
         vo[i] = vmovn_u64(accum) & vmask;
         accum = vshrq_n_u64(accum,28);
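On the intrinsics: vmull_lane_u32(a, v, k) is the widening multiply a[i] * v[k] (two 64-bit lanes out, no accumulator argument), while vmlal_lane_u32(acc, a, v, k) is the same with accumulation — which is why the first multiply above becomes a vmull once the old lane-1 (bhi) terms are gone, and why the original four-argument vmull call was corrected here. A small sketch using the same intrinsics, with vb = {b, 0} so lane 0 broadcasts b:

    #include <arm_neon.h>

    /* One accumulation step of the multiply-by-word: acc[i] += a[i] * b */
    static uint64x2_t mulw_step(uint64x2_t acc, uint32x2_t a, uint32_t b) {
        uint32x2_t vb = vset_lane_u32(b, vdup_n_u32(0), 0);   /* vb = {b, 0} */
        return vmlal_lane_u32(acc, a, vb, 0);
    }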
@@ -165,7 +165,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[1] += ((uint64_t)(accum1));
 }

-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint64_t *a = as->limb;
     uint64_t *c = cs->limb;
@@ -139,7 +139,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[0] += ((uint64_t)(accum1));
 }

-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint64_t *a = as->limb;
     uint64_t *c = cs->limb;
@@ -1,6 +1,6 @@
 /**
  * @cond internal
- * @file decaf_crypto.c
+ * @file per_field.c
  * @copyright
  *   Copyright (c) 2015-2016 Cryptography Research, Inc. \n
  *   Released under the MIT License. See LICENSE.txt for license information.