From 790745e2b397a86af824b3ec1c9a4df68911f52d Mon Sep 17 00:00:00 2001 From: Michael Hamburg Date: Fri, 29 Jan 2016 12:57:27 -0800 Subject: [PATCH] set mulw to <32 bits instead of <64 bits (but actually less than that: 1 limb instead of 2). also there is a bug if you compile ed448 for arch_32 on a 64-bit machine... tracing --- src/decaf.c | 39 ++++++++++--------------- src/gen_headers/f_field_h.py | 2 +- src/p25519/arch_32/f_impl.c | 2 +- src/p25519/arch_ref64/f_impl.c | 2 +- src/p25519/arch_x86_64/f_impl.c | 26 ++++++++++------- src/p448/arch_32/f_impl.c | 17 ++++------- src/p448/arch_32/f_impl.h | 4 +-- src/p448/arch_arm_32/f_impl.c | 52 ++++++++++++--------------------- src/p448/arch_neon/f_impl.c | 11 +++---- src/p448/arch_ref64/f_impl.c | 2 +- src/p448/arch_x86_64/f_impl.c | 2 +- src/per_field.c | 2 +- 12 files changed, 67 insertions(+), 94 deletions(-) diff --git a/src/decaf.c b/src/decaf.c index ae201b3..f4bb0a9 100644 --- a/src/decaf.c +++ b/src/decaf.c @@ -109,13 +109,6 @@ cond_neg(gf x, mask_t neg) { static INLINE void cond_swap(gf x, gf_s *__restrict__ y, mask_t swap) { constant_time_cond_swap(x,y,sizeof(gf_s),swap); - /* - UNROLL for (unsigned int i=0; i<sizeof(x->limb)/sizeof(x->limb[0]); i++) { - decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap; - x->limb[i] ^= s; - y->limb[i] ^= s; - } - */ } /** Inverse square root using addition chain. */ @@ -133,7 +126,7 @@ static void gf_invert(gf y, const gf x) { gf t1, t2; gf_sqr(t1, x); // o^2 - decaf_bool_t ret = gf_isqrt_chk(t2, t1, 0); // +-1/sqrt(o^2) = +-1/o + mask_t ret = gf_isqrt_chk(t2, t1, 0); // +-1/sqrt(o^2) = +-1/o (void)ret; assert(ret); gf_sqr(t1, t2); gf_mul(t2, t1, x); // not direct to y in case of alias. @@ -142,7 +135,7 @@ gf_invert(gf y, const gf x) { /** Mul by signed int. Not constant-time WRT the sign of that int. 
*/ static INLINE void -gf_mulw_sgn(gf c, const gf a, int w) { +gf_mulw_sgn(gf c, const gf a, int32_t w) { if (w>0) { gf_mulw(c, a, w); } else { @@ -152,7 +145,7 @@ gf_mulw_sgn(gf c, const gf a, int w) { } /** Return high bit of x = low bit of 2x mod p */ -static decaf_word_t hibit(const gf x) { +static mask_t hibit(const gf x) { gf y; gf_add(y,x,x); gf_strong_reduce(y); @@ -161,7 +154,7 @@ static decaf_word_t hibit(const gf x) { #if COFACTOR==8 /** Return high bit of x = low bit of 2x mod p */ -static decaf_word_t lobit(const gf x) { +static mask_t lobit(const gf x) { gf y; gf_copy(y,x); gf_strong_reduce(y); @@ -873,9 +866,9 @@ static INLINE void constant_time_lookup_xx ( void *__restrict__ out_, const void *table_, - decaf_word_t elem_bytes, - decaf_word_t n_table, - decaf_word_t idx + word_t elem_bytes, + word_t n_table, + word_t idx ) { constant_time_lookup(out_,table_,elem_bytes,n_table,idx); } @@ -928,12 +921,12 @@ void API_NS(point_scalarmul) ( for (; i>=0; i-=WINDOW) { /* Fetch another block of bits */ - decaf_word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS); + word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS); if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { bits ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); } bits &= WINDOW_MASK; - decaf_word_t inv = (bits>>(WINDOW-1))-1; + mask_t inv = (bits>>(WINDOW-1))-1; bits ^= inv; /* Add in from table. Compute t only on last iteration. 
*/ @@ -993,7 +986,7 @@ void API_NS(point_double_scalarmul) ( for (; i>=0; i-=WINDOW) { /* Fetch another block of bits */ - decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS), + word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS), bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS); if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); @@ -1001,8 +994,8 @@ void API_NS(point_double_scalarmul) ( } bits1 &= WINDOW_MASK; bits2 &= WINDOW_MASK; - decaf_word_t inv1 = (bits1>>(WINDOW-1))-1; - decaf_word_t inv2 = (bits2>>(WINDOW-1))-1; + mask_t inv1 = (bits1>>(WINDOW-1))-1; + mask_t inv2 = (bits2>>(WINDOW-1))-1; bits1 ^= inv1; bits2 ^= inv2; @@ -1079,16 +1072,16 @@ void API_NS(point_dual_scalarmul) ( } /* Fetch another block of bits */ - decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS), - bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS); + word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS), + bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS); if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); bits2 ^= scalar2x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); } bits1 &= WINDOW_MASK; bits2 &= WINDOW_MASK; - decaf_word_t inv1 = (bits1>>(WINDOW-1))-1; - decaf_word_t inv2 = (bits2>>(WINDOW-1))-1; + mask_t inv1 = (bits1>>(WINDOW-1))-1; + mask_t inv2 = (bits2>>(WINDOW-1))-1; bits1 ^= inv1; bits2 ^= inv2; diff --git a/src/gen_headers/f_field_h.py b/src/gen_headers/f_field_h.py index abc29d6..fa3fa5b 100644 --- a/src/gen_headers/f_field_h.py +++ b/src/gen_headers/f_field_h.py @@ -66,7 +66,7 @@ void gf_strong_reduce (gf inout); void gf_add (gf out, const gf a, const gf b); void gf_sub (gf out, const gf a, const gf b); void gf_mul (gf_s *__restrict__ out, const gf a, const gf b); -void gf_mulw (gf_s *__restrict__ out, const gf a, uint64_t b); +void gf_mulw (gf_s *__restrict__ out, const gf a, uint32_t b); void gf_sqr (gf_s *__restrict__ out, const gf a); void gf_serialize (uint8_t *serial, const gf x); void gf_isr(gf a, const gf x); /** a^2 x 
= 1, QNR, or 0 if x=0 */ diff --git a/src/p25519/arch_32/f_impl.c b/src/p25519/arch_32/f_impl.c index 7c9ab84..656d9f7 100644 --- a/src/p25519/arch_32/f_impl.c +++ b/src/p25519/arch_32/f_impl.c @@ -51,7 +51,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { c[1] += accum; } -void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { +void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { const uint32_t *a = as->limb, maske = ((1<<26)-1), masko = ((1<<25)-1); uint32_t blo = b & maske, bhi = b>>26, bhi2 = 2*bhi; uint32_t *c = cs->limb; diff --git a/src/p25519/arch_ref64/f_impl.c b/src/p25519/arch_ref64/f_impl.c index e8cd206..1f0e22d 100644 --- a/src/p25519/arch_ref64/f_impl.c +++ b/src/p25519/arch_ref64/f_impl.c @@ -34,7 +34,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { c[1] += accum; } -void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { +void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { const uint64_t *a = as->limb, mask = ((1ull<<51)-1); int i; diff --git a/src/p25519/arch_x86_64/f_impl.c b/src/p25519/arch_x86_64/f_impl.c index 2f94164..1ae69ef 100644 --- a/src/p25519/arch_x86_64/f_impl.c +++ b/src/p25519/arch_x86_64/f_impl.c @@ -4,6 +4,7 @@ #include "f_field.h" +/** Requires: input limbs < 9*2^51 */ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); uint64_t *c = cs->limb; @@ -65,6 +66,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { ai = a[4]; mac_rm(&accum1, ai, &b[0]); + /* Here accum1 < 5*(9*2^51)^2 */ c[3] = accum0 & mask; accum1 += shrld(accum0, 51); @@ -72,13 +74,16 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 * = 2^(-13 + <13) - * PERF: good enough to fit into uint64_t? + * PERF: good enough to fit into uint64_t. 
*/ uint64_t a1 = shrld(accum1,51); - accum1 = (__uint128_t)a1 * 19 + c0; + /* Here a1 < (5*(9*2^51)^2 + small) >> 51 = 405 * 2^51 + small + * a1 * 19 + c0 < (405*19+1)*2^51 + small < 2^13 * 2^51. + */ + accum1 = a1 * 19 + c0; c[0] = accum1 & mask; - c[1] = c1 + shrld(accum1,51); + c[1] = c1 + (accum1>>51); } void gf_sqr (gf_s *__restrict__ cs, const gf as) { @@ -132,16 +137,15 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 * = 2^(-13 + <13) - * PERF: good enough to fit into uint64_t? */ uint64_t a1 = shrld(accum1,51); - accum1 = (__uint128_t)a1 * 19 + c0; + accum1 = a1 * 19 + c0; c[0] = accum1 & mask; - c[1] = c1 + shrld(accum1,51); + c[1] = c1 + (accum1>>51); } -void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { +void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { const uint64_t *a = as->limb, mask = ((1ull<<51)-1); uint64_t *c = cs->limb; @@ -164,9 +168,9 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { mac_rm(&accum, b, &a[4]); c[4] = accum & mask; - accum = shrld(accum,51); - accum = accum * 19 + c0; + uint64_t a1 = shrld(accum,51); + a1 = a1*19+c0; - c[0] = accum & mask; - c[1] = c1 + shrld(accum,51); + c[0] = a1 & mask; + c[1] = c1 + (a1>>51); } diff --git a/src/p448/arch_32/f_impl.c b/src/p448/arch_32/f_impl.c index f70b236..a07aae5 100644 --- a/src/p448/arch_32/f_impl.c +++ b/src/p448/arch_32/f_impl.c @@ -60,8 +60,8 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { c[1] += ((uint32_t)(accum1)); } -void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { - const uint32_t bhi = b>>28, blo = b & ((1<<28)-1); +void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { + assert(b<1<<28); const uint32_t *a = as->limb; uint32_t *c = cs->limb; @@ -71,20 +71,15 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { int i; - accum0 = widemul(blo, a[0]); - accum8 = widemul(blo, a[8]); - accum0 += widemul(bhi, a[15]); - accum8 += widemul(bhi, 
a[15] + a[7]); + accum0 = widemul(b, a[0]); + accum8 = widemul(b, a[8]); c[0] = accum0 & mask; accum0 >>= 28; c[8] = accum8 & mask; accum8 >>= 28; for (i=1; i<8; i++) { - accum0 += widemul(blo, a[i]); - accum8 += widemul(blo, a[i+8]); - - accum0 += widemul(bhi, a[i-1]); - accum8 += widemul(bhi, a[i+7]); + accum0 += widemul(b, a[i]); + accum8 += widemul(b, a[i+8]); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; diff --git a/src/p448/arch_32/f_impl.h b/src/p448/arch_32/f_impl.h index 330a29c..7eae599 100644 --- a/src/p448/arch_32/f_impl.h +++ b/src/p448/arch_32/f_impl.h @@ -43,8 +43,8 @@ void gf_bias (gf a, int amt) { } void gf_weak_reduce (gf a) { - uint64_t mask = (1ull<<28) - 1; - uint64_t tmp = a->limb[15] >> 28; + uint32_t mask = (1ull<<28) - 1; + uint32_t tmp = a->limb[15] >> 28; a->limb[8] += tmp; for (unsigned int i=15; i>0; i--) { a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); diff --git a/src/p448/arch_arm_32/f_impl.c b/src/p448/arch_arm_32/f_impl.c index ddb0494..887c083 100644 --- a/src/p448/arch_arm_32/f_impl.c +++ b/src/p448/arch_arm_32/f_impl.c @@ -724,10 +724,10 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { void gf_mulw ( gf_s *__restrict__ cs, const gf as, - uint64_t b + uint32_t b ) { uint32_t mask = (1ull<<28)-1; - const uint32_t bhi = b>>28, blo = b & mask; + assert(b <= mask); const uint32_t *a = as->limb; uint32_t *c = cs->limb; @@ -737,11 +737,9 @@ void gf_mulw ( int i; uint32_t c0, c8, n0, n8; - accum0 = widemul(bhi, a[15]); - accum8 = widemul(bhi, a[15] + a[7]); c0 = a[0]; c8 = a[8]; - smlal(&accum0, blo, c0); - smlal(&accum8, blo, c8); + accum0 = widemul(b, c0); + accum8 = widemul(b, c8); c[0] = accum0 & mask; accum0 >>= 28; c[8] = accum8 & mask; accum8 >>= 28; @@ -749,10 +747,8 @@ void gf_mulw ( i=1; { n0 = a[i]; n8 = a[i+8]; - smlal(&accum0, bhi, c0); - smlal(&accum8, bhi, c8); - smlal(&accum0, blo, n0); - smlal(&accum8, blo, n8); + smlal(&accum0, b, n0); + smlal(&accum8, b, n8); c[i] = accum0 & 
mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; @@ -760,10 +756,8 @@ void gf_mulw ( } { c0 = a[i]; c8 = a[i+8]; - smlal(&accum0, bhi, n0); - smlal(&accum8, bhi, n8); - smlal(&accum0, blo, c0); - smlal(&accum8, blo, c8); + smlal(&accum0, b, c0); + smlal(&accum8, b, c8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; @@ -771,10 +765,8 @@ void gf_mulw ( } { n0 = a[i]; n8 = a[i+8]; - smlal(&accum0, bhi, c0); - smlal(&accum8, bhi, c8); - smlal(&accum0, blo, n0); - smlal(&accum8, blo, n8); + smlal(&accum0, b, n0); + smlal(&accum8, b, n8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; @@ -782,10 +774,8 @@ void gf_mulw ( } { c0 = a[i]; c8 = a[i+8]; - smlal(&accum0, bhi, n0); - smlal(&accum8, bhi, n8); - smlal(&accum0, blo, c0); - smlal(&accum8, blo, c8); + smlal(&accum0, b, c0); + smlal(&accum8, b, c8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; @@ -793,10 +783,8 @@ void gf_mulw ( } { n0 = a[i]; n8 = a[i+8]; - smlal(&accum0, bhi, c0); - smlal(&accum8, bhi, c8); - smlal(&accum0, blo, n0); - smlal(&accum8, blo, n8); + smlal(&accum0, b, n0); + smlal(&accum8, b, n8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; @@ -804,10 +792,8 @@ void gf_mulw ( } { c0 = a[i]; c8 = a[i+8]; - smlal(&accum0, bhi, n0); - smlal(&accum8, bhi, n8); - smlal(&accum0, blo, c0); - smlal(&accum8, blo, c8); + smlal(&accum0, b, c0); + smlal(&accum8, b, c8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; @@ -815,10 +801,8 @@ void gf_mulw ( } { n0 = a[i]; n8 = a[i+8]; - smlal(&accum0, bhi, c0); - smlal(&accum8, bhi, c8); - smlal(&accum0, blo, n0); - smlal(&accum8, blo, n8); + smlal(&accum0, b, n0); + smlal(&accum8, b, n8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; diff --git a/src/p448/arch_neon/f_impl.c b/src/p448/arch_neon/f_impl.c index 2319c7b..ba0e303 100644 --- a/src/p448/arch_neon/f_impl.c +++ 
b/src/p448/arch_neon/f_impl.c @@ -549,20 +549,18 @@ void gf_sqr (gf_s *__restrict__ cs, const gf bs) { ); } -void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { +void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1}; + assert(b<(1<<28)); uint64x2_t accum; const uint32x2_t *va = (const uint32x2_t *) as->limb; uint32x2_t *vo = (uint32x2_t *) cs->limb; uint32x2_t vc, vn; - uint32x2_t vb = {b & ((1<<28)-1), b>>28}; - - accum = vmull_lane_u32(va[7], vb, 1); - accum = xx_vaddup_u64(vrev128_u64(accum)); + uint32x2_t vb = {b, 0}; vc = va[0]; - accum = vmlal_lane_u32(accum, vc, vb, 0); + accum = vmull_lane_u32(vc, vb, 0); vo[0] = vmovn_u64(accum) & vmask; accum = vshrq_n_u64(accum,28); @@ -579,7 +577,6 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { int i; for (i=1; i<8; i++) { vn = va[i]; - accum = vmlal_lane_u32(accum, vc, vb, 1); accum = vmlal_lane_u32(accum, vn, vb, 0); vo[i] = vmovn_u64(accum) & vmask; accum = vshrq_n_u64(accum,28); diff --git a/src/p448/arch_ref64/f_impl.c b/src/p448/arch_ref64/f_impl.c index 22162aa..4273d3d 100644 --- a/src/p448/arch_ref64/f_impl.c +++ b/src/p448/arch_ref64/f_impl.c @@ -165,7 +165,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { c[1] += ((uint64_t)(accum1)); } -void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { +void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { const uint64_t *a = as->limb; uint64_t *c = cs->limb; diff --git a/src/p448/arch_x86_64/f_impl.c b/src/p448/arch_x86_64/f_impl.c index 943e80b..4989cb5 100644 --- a/src/p448/arch_x86_64/f_impl.c +++ b/src/p448/arch_x86_64/f_impl.c @@ -139,7 +139,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { c[0] += ((uint64_t)(accum1)); } -void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { +void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { const uint64_t *a = as->limb; uint64_t *c = cs->limb; diff --git 
a/src/per_field.c b/src/per_field.c index c76be14..c60b17f 100644 --- a/src/per_field.c +++ b/src/per_field.c @@ -1,6 +1,6 @@ /** * @cond internal - * @file decaf_crypto.c + * @file per_field.c * @copyright * Copyright (c) 2015-2016 Cryptography Research, Inc. \n * Released under the MIT License. See LICENSE.txt for license information.