You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

350 lines
8.8 KiB

  1. /* Copyright (c) 2014 Cryptography Research, Inc.
  2. * Released under the MIT License. See LICENSE.txt for license information.
  3. */
  4. #include "barrett_field.h"
  5. #include <string.h>
  6. #include <assert.h>
  7. word_t
  8. add_nr_ext_packed(
  9. word_t *out,
  10. const word_t *a,
  11. uint32_t nwords_a,
  12. const word_t *c,
  13. uint32_t nwords_c,
  14. word_t mask
  15. ) {
  16. uint32_t i;
  17. dword_t carry = 0;
  18. for (i=0; i<nwords_c; i++) {
  19. out[i] = carry = carry + a[i] + (c[i]&mask);
  20. carry >>= WORD_BITS;
  21. }
  22. for (; i<nwords_a; i++) {
  23. out[i] = carry = carry + a[i];
  24. carry >>= WORD_BITS;
  25. }
  26. return carry;
  27. }
  28. static __inline__ word_t
  29. add_nr_packed(
  30. word_t *a,
  31. const word_t *c,
  32. uint32_t nwords
  33. ) {
  34. uint32_t i;
  35. dword_t carry = 0;
  36. for (i=0; i<nwords; i++) {
  37. a[i] = carry = carry + a[i] + c[i];
  38. carry >>= WORD_BITS;
  39. }
  40. return carry;
  41. }
  42. word_t
  43. sub_nr_ext_packed(
  44. word_t *out,
  45. const word_t *a,
  46. uint32_t nwords_a,
  47. const word_t *c,
  48. uint32_t nwords_c,
  49. word_t mask
  50. ) {
  51. uint32_t i;
  52. dsword_t carry = 0;
  53. for (i=0; i<nwords_c; i++) {
  54. out[i] = carry = carry + a[i] - (c[i]&mask);
  55. carry >>= WORD_BITS;
  56. }
  57. for (; i<nwords_a; i++) {
  58. out[i] = carry = carry + a[i];
  59. carry >>= WORD_BITS;
  60. }
  61. return carry;
  62. }
  63. static word_t
  64. widemac(
  65. word_t *accum,
  66. uint32_t nwords_accum,
  67. const word_t *mier,
  68. uint32_t nwords_mier,
  69. word_t mand,
  70. word_t carry
  71. ) {
  72. uint32_t i;
  73. assert(nwords_mier <= nwords_accum);
  74. for (i=0; i<nwords_mier; i++) {
  75. #ifdef __clang_analyzer__
  76. /* always true, but this satisfies scan-build (bug in scan-build?) */
  77. assert(i<nwords_accum);
  78. #endif
  79. /* UMAAL chain for the wordy part of p */
  80. dword_t product = ((dword_t)mand) * mier[i];
  81. product += accum[i];
  82. product += carry;
  83. accum[i] = product;
  84. carry = product >> WORD_BITS;
  85. }
  86. for (; i<nwords_accum; i++) {
  87. dword_t sum = ((dword_t)carry) + accum[i];
  88. accum[i] = sum;
  89. carry = sum >> WORD_BITS;
  90. }
  91. return carry;
  92. }
  93. void
  94. barrett_negate (
  95. word_t *a,
  96. uint32_t nwords_a,
  97. const struct barrett_prime_t *prime
  98. ) {
  99. uint32_t i;
  100. dsword_t carry = 0;
  101. barrett_reduce(a,nwords_a,0,prime);
  102. /* Have p = 2^big - p_lo. Want p - a = 2^big - p_lo - a */
  103. for (i=0; i<prime->nwords_lo; i++) {
  104. a[i] = carry = carry - prime->p_lo[i] - a[i];
  105. carry >>= WORD_BITS;
  106. }
  107. for (; i<prime->nwords_p; i++) {
  108. a[i] = carry = carry - a[i];
  109. if (i<prime->nwords_p-1) {
  110. carry >>= WORD_BITS;
  111. }
  112. }
  113. a[prime->nwords_p-1] = carry = carry + (((word_t)1) << prime->p_shift);
  114. for (; i<nwords_a; i++) {
  115. assert(!a[i]);
  116. }
  117. assert(!(carry>>WORD_BITS));
  118. }
  119. void
  120. barrett_reduce(
  121. word_t *a,
  122. uint32_t nwords_a,
  123. word_t a_carry,
  124. const struct barrett_prime_t *prime
  125. ) {
  126. uint32_t repeat, nwords_left_in_a=nwords_a;
  127. /* Is there a point to this a_carry business? */
  128. assert(a_carry < ((word_t)1) << prime->p_shift);
  129. assert(nwords_a >= prime->nwords_p);
  130. assert(prime->nwords_p > 0); /* scan-build: prevent underflow */
  131. for (; nwords_left_in_a >= prime->nwords_p; nwords_left_in_a--) {
  132. for (repeat=0; repeat<2; repeat++) {
  133. /* PERF: surely a more careful implementation could
  134. * avoid this double round
  135. */
  136. word_t mand = a[nwords_left_in_a-1] >> prime->p_shift;
  137. a[nwords_left_in_a-1] &= (((word_t)1)<<prime->p_shift)-1;
  138. if (prime->p_shift && !repeat) {
  139. /* collect high bits when there are any */
  140. if (nwords_left_in_a < nwords_a) {
  141. mand |= a[nwords_left_in_a] << (WORD_BITS-prime->p_shift);
  142. a[nwords_left_in_a] = 0;
  143. } else {
  144. mand |= a_carry << (WORD_BITS-prime->p_shift);
  145. }
  146. }
  147. word_t carry = widemac(
  148. a+nwords_left_in_a-prime->nwords_p,
  149. prime->nwords_p,
  150. prime->p_lo,
  151. prime->nwords_lo,
  152. mand,
  153. 0
  154. );
  155. assert(!carry);
  156. (void)carry;
  157. }
  158. }
  159. assert(nwords_left_in_a == prime->nwords_p-1);
  160. /* OK, but it still isn't reduced. Add and subtract p_lo. */
  161. word_t cout = add_nr_ext_packed(a,a,prime->nwords_p,prime->p_lo,prime->nwords_lo,-1);
  162. if (prime->p_shift) {
  163. cout = (cout<<(WORD_BITS-prime->p_shift)) + (a[prime->nwords_p-1]>>prime->p_shift);
  164. a[prime->nwords_p-1] &= (((word_t)1)<<prime->p_shift)-1;
  165. }
  166. /* mask = carry-1: if no carry then do sub, otherwise don't */
  167. sub_nr_ext_packed(a,a,prime->nwords_p,prime->p_lo,prime->nwords_lo,cout-1);
  168. }
  169. /* PERF: This function is horribly slow. Enough to break 1%. */
  170. void
  171. barrett_mul_or_mac(
  172. word_t *accum,
  173. uint32_t nwords_accum,
  174. const word_t *a,
  175. uint32_t nwords_a,
  176. const word_t *b,
  177. uint32_t nwords_b,
  178. const struct barrett_prime_t *prime,
  179. mask_t doMac
  180. ) {
  181. assert(nwords_accum >= prime->nwords_p);
  182. /* nwords_tmp = max(nwords_a + 1, nwords_p + 1, nwords_accum if doMac); */
  183. uint32_t nwords_tmp = (nwords_a > prime->nwords_p) ? nwords_a : prime->nwords_p;
  184. nwords_tmp++;
  185. assert(nwords_tmp > 0); /* scan-build: prevent underflow. */
  186. if (nwords_tmp < nwords_accum && doMac)
  187. nwords_tmp = nwords_accum;
  188. word_t tmp[nwords_tmp];
  189. int bpos, idown;
  190. uint32_t i;
  191. for (i=0; i<nwords_tmp; i++) {
  192. tmp[i] = 0;
  193. }
  194. for (bpos=nwords_b-1; bpos >= 0; bpos--) {
  195. /* Invariant at the beginning of the loop: the high word is unused. */
  196. assert(tmp[nwords_tmp-1] == 0);
  197. /* shift up */
  198. for (idown=nwords_tmp-2; idown>=0; idown--) {
  199. tmp[idown+1] = tmp[idown];
  200. }
  201. tmp[0] = 0;
  202. /* mac and reduce */
  203. word_t carry = widemac(tmp, nwords_tmp, a, nwords_a, b[bpos], 0);
  204. /* the mac can't carry, because nwords_tmp >= nwords_a+1 and its high word is clear */
  205. assert(!carry);
  206. barrett_reduce(tmp, nwords_tmp, carry, prime);
  207. /* at this point, the number of words used is nwords_p <= nwords_tmp-1,
  208. * so the high word is again clear */
  209. }
  210. if (doMac) {
  211. word_t cout = add_nr_packed(tmp, accum, nwords_accum);
  212. barrett_reduce(tmp, nwords_tmp, cout, prime);
  213. }
  214. for (i=0; i<nwords_tmp && i<nwords_accum; i++) {
  215. accum[i] = tmp[i];
  216. }
  217. for (; i<nwords_tmp; i++) {
  218. assert(tmp[i] == 0);
  219. }
  220. for (; i<nwords_accum; i++) {
  221. accum[i] = 0;
  222. }
  223. }
  224. mask_t
  225. barrett_deserialize (
  226. word_t *x,
  227. const uint8_t *serial,
  228. const struct barrett_prime_t *prime
  229. ) {
  230. unsigned int i,j,nserial = prime->nwords_p * sizeof(word_t);
  231. if (prime->p_shift) {
  232. nserial -= (WORD_BITS - prime->p_shift) / 8;
  233. }
  234. /* Track x < p, p = 2^k - p_lo <==> x + p_lo < 2^k */
  235. dword_t carry = 0;
  236. for (i=0; i*sizeof(word_t)<nserial; i++) {
  237. carry >>= WORD_BITS;
  238. word_t the = 0;
  239. for (j=0; j<sizeof(word_t) && sizeof(word_t)*i+j < nserial; j++) {
  240. the |= ((word_t)serial[sizeof(word_t)*i+j]) << (8*j);
  241. }
  242. x[i] = the;
  243. carry += the;
  244. if (i < prime->nwords_lo) carry += prime->p_lo[i];
  245. }
  246. /* check for reduction */
  247. if (prime->p_shift) {
  248. carry >>= prime->p_shift;
  249. } else {
  250. carry >>= WORD_BITS;
  251. }
  252. /* at this point, carry > 0 indicates failure */
  253. dsword_t scarry = carry;
  254. scarry = -scarry;
  255. scarry >>= WORD_BITS;
  256. scarry >>= WORD_BITS;
  257. return (mask_t) ~scarry;
  258. }
  259. void
  260. barrett_deserialize_and_reduce (
  261. word_t *x,
  262. const uint8_t *serial,
  263. uint32_t nserial,
  264. const struct barrett_prime_t *prime
  265. ) {
  266. unsigned int size = (nserial + sizeof(word_t) - 1)/sizeof(word_t);
  267. if (size < prime->nwords_p) {
  268. size = prime->nwords_p;
  269. }
  270. word_t tmp[size];
  271. memset(tmp,0,sizeof(tmp));
  272. unsigned int i,j;
  273. for (i=0; i*sizeof(word_t)<nserial; i++) {
  274. word_t the = 0;
  275. for (j=0; j<sizeof(word_t) && sizeof(word_t)*i+j < nserial; j++) {
  276. the |= ((word_t)serial[sizeof(word_t)*i+j]) << (8*j);
  277. }
  278. tmp[i] = the;
  279. }
  280. barrett_reduce(tmp,size,0,prime);
  281. for (i=0; i<prime->nwords_p; i++) {
  282. x[i] = tmp[i];
  283. }
  284. for (; i<size; i++) {
  285. assert(!tmp[i]);
  286. }
  287. }
  288. void
  289. barrett_serialize (
  290. uint8_t *serial,
  291. const word_t *x,
  292. uint32_t nserial
  293. ) {
  294. unsigned int i,j;
  295. for (i=0; i*sizeof(word_t)<nserial; i++) {
  296. for (j=0; j<sizeof(word_t); j++) {
  297. serial[sizeof(word_t)*i+j] = x[i]>>(8*j);
  298. }
  299. }
  300. }