/* Copyright (c) 2014 Cryptography Research, Inc. * Released under the MIT License. See LICENSE.txt for license information. */ #include "barrett_field.h" #include #include word_t add_nr_ext_packed( word_t *out, const word_t *a, uint32_t nwords_a, const word_t *c, uint32_t nwords_c, word_t mask ) { uint32_t i; dword_t carry = 0; for (i=0; i>= WORD_BITS; } for (; i>= WORD_BITS; } return carry; } static __inline__ word_t add_nr_packed( word_t *a, const word_t *c, uint32_t nwords ) { uint32_t i; dword_t carry = 0; for (i=0; i>= WORD_BITS; } return carry; } word_t sub_nr_ext_packed( word_t *out, const word_t *a, uint32_t nwords_a, const word_t *c, uint32_t nwords_c, word_t mask ) { uint32_t i; dsword_t carry = 0; for (i=0; i>= WORD_BITS; } for (; i>= WORD_BITS; } return carry; } static word_t widemac( word_t *accum, uint32_t nwords_accum, const word_t *mier, uint32_t nwords_mier, word_t mand, word_t carry ) { uint32_t i; assert(nwords_mier <= nwords_accum); for (i=0; i> WORD_BITS; } for (; i> WORD_BITS; } return carry; } void barrett_negate ( word_t *a, uint32_t nwords_a, const struct barrett_prime_t *prime ) { uint32_t i; dsword_t carry = 0; barrett_reduce(a,nwords_a,0,prime); /* Have p = 2^big - p_lo. Want p - a = 2^big - p_lo - a */ for (i=0; inwords_lo; i++) { a[i] = carry = carry - prime->p_lo[i] - a[i]; carry >>= WORD_BITS; } for (; inwords_p; i++) { a[i] = carry = carry - a[i]; if (inwords_p-1) { carry >>= WORD_BITS; } } a[prime->nwords_p-1] = carry = carry + (((word_t)1) << prime->p_shift); for (; i>WORD_BITS)); } void barrett_reduce( word_t *a, uint32_t nwords_a, word_t a_carry, const struct barrett_prime_t *prime ) { uint32_t repeat, nwords_left_in_a=nwords_a; /* Is there a point to this a_carry business? */ assert(a_carry < ((word_t)1) << prime->p_shift); assert(nwords_a >= prime->nwords_p); assert(prime->nwords_p > 0); /* scan-build: prevent underflow */ for (; nwords_left_in_a >= prime->nwords_p; nwords_left_in_a--) { for (repeat=0; repeat<2; repeat++) { /* PERF: surely a more careful implementation could * avoid this double round */ word_t mand = a[nwords_left_in_a-1] >> prime->p_shift; a[nwords_left_in_a-1] &= (((word_t)1)<p_shift)-1; if (prime->p_shift && !repeat) { /* collect high bits when there are any */ if (nwords_left_in_a < nwords_a) { mand |= a[nwords_left_in_a] << (WORD_BITS-prime->p_shift); a[nwords_left_in_a] = 0; } else { mand |= a_carry << (WORD_BITS-prime->p_shift); } } word_t carry = widemac( a+nwords_left_in_a-prime->nwords_p, prime->nwords_p, prime->p_lo, prime->nwords_lo, mand, 0 ); assert(!carry); (void)carry; } } assert(nwords_left_in_a == prime->nwords_p-1); /* OK, but it still isn't reduced. Add and subtract p_lo. */ word_t cout = add_nr_ext_packed(a,a,prime->nwords_p,prime->p_lo,prime->nwords_lo,-1); if (prime->p_shift) { cout = (cout<<(WORD_BITS-prime->p_shift)) + (a[prime->nwords_p-1]>>prime->p_shift); a[prime->nwords_p-1] &= (((word_t)1)<p_shift)-1; } /* mask = carry-1: if no carry then do sub, otherwise don't */ sub_nr_ext_packed(a,a,prime->nwords_p,prime->p_lo,prime->nwords_lo,cout-1); } /* PERF: This function is horribly slow. Enough to break 1%. */ void barrett_mul_or_mac( word_t *accum, uint32_t nwords_accum, const word_t *a, uint32_t nwords_a, const word_t *b, uint32_t nwords_b, const struct barrett_prime_t *prime, mask_t doMac ) { assert(nwords_accum >= prime->nwords_p); /* nwords_tmp = max(nwords_a + 1, nwords_p + 1, nwords_accum if doMac); */ uint32_t nwords_tmp = (nwords_a > prime->nwords_p) ? nwords_a : prime->nwords_p; nwords_tmp++; assert(nwords_tmp > 0); /* scan-build: prevent underflow. */ if (nwords_tmp < nwords_accum && doMac) nwords_tmp = nwords_accum; word_t tmp[nwords_tmp]; int bpos, idown; uint32_t i; for (i=0; i= 0; bpos--) { /* Invariant at the beginning of the loop: the high word is unused. */ assert(tmp[nwords_tmp-1] == 0); /* shift up */ for (idown=nwords_tmp-2; idown>=0; idown--) { tmp[idown+1] = tmp[idown]; } tmp[0] = 0; /* mac and reduce */ word_t carry = widemac(tmp, nwords_tmp, a, nwords_a, b[bpos], 0); /* the mac can't carry, because nwords_tmp >= nwords_a+1 and its high word is clear */ assert(!carry); barrett_reduce(tmp, nwords_tmp, carry, prime); /* at this point, the number of words used is nwords_p <= nwords_tmp-1, * so the high word is again clear */ } if (doMac) { word_t cout = add_nr_packed(tmp, accum, nwords_accum); barrett_reduce(tmp, nwords_tmp, cout, prime); } for (i=0; inwords_p * sizeof(word_t); if (prime->p_shift) { nserial -= (WORD_BITS - prime->p_shift) / 8; } /* Track x < p, p = 2^k - p_lo <==> x + p_lo < 2^k */ dword_t carry = 0; for (i=0; i*sizeof(word_t)>= WORD_BITS; word_t the = 0; for (j=0; jnwords_lo) carry += prime->p_lo[i]; } /* check for reduction */ if (prime->p_shift) { carry >>= prime->p_shift; } else { carry >>= WORD_BITS; } /* at this point, carry > 0 indicates failure */ dsword_t scarry = carry; scarry = -scarry; scarry >>= WORD_BITS; scarry >>= WORD_BITS; return (mask_t) ~scarry; } void barrett_deserialize_and_reduce ( word_t *x, const uint8_t *serial, uint32_t nserial, const struct barrett_prime_t *prime ) { unsigned int size = (nserial + sizeof(word_t) - 1)/sizeof(word_t); if (size < prime->nwords_p) { size = prime->nwords_p; } word_t tmp[size]; memset(tmp,0,sizeof(tmp)); unsigned int i,j; for (i=0; i*sizeof(word_t)nwords_p; i++) { x[i] = tmp[i]; } for (; i>(8*j); } } }