You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

460 lines
10 KiB

  1. /* Copyright (c) 2014 Cryptography Research, Inc.
  2. * Released under the MIT License. See LICENSE.txt for license information.
  3. */
  4. #include "p448.h"
  5. #include "x86-64-arith.h"
  6. void
  7. p448_mul (
  8. p448_t *__restrict__ cs,
  9. const p448_t *as,
  10. const p448_t *bs
  11. ) {
  12. const uint64_t *a = as->limb, *b = bs->limb;
  13. uint64_t *c = cs->limb;
  14. __uint128_t accum0 = 0, accum1 = 0, accum2;
  15. uint64_t mask = (1ull<<56) - 1;
  16. uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))), bbb[4] __attribute__((aligned(32)));
  17. /* For some reason clang doesn't vectorize this without prompting? */
  18. unsigned int i;
  19. for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
  20. ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
  21. ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];
  22. ((uint64xn_t*)bbb)[i] = ((const uint64xn_t*)bb)[i] + ((const uint64xn_t*)(&b[4]))[i];
  23. }
  24. /*
  25. for (int i=0; i<4; i++) {
  26. aa[i] = a[i] + a[i+4];
  27. bb[i] = b[i] + b[i+4];
  28. }
  29. */
  30. accum2 = widemul(&a[0],&b[3]);
  31. accum0 = widemul(&aa[0],&bb[3]);
  32. accum1 = widemul(&a[4],&b[7]);
  33. mac(&accum2, &a[1], &b[2]);
  34. mac(&accum0, &aa[1], &bb[2]);
  35. mac(&accum1, &a[5], &b[6]);
  36. mac(&accum2, &a[2], &b[1]);
  37. mac(&accum0, &aa[2], &bb[1]);
  38. mac(&accum1, &a[6], &b[5]);
  39. mac(&accum2, &a[3], &b[0]);
  40. mac(&accum0, &aa[3], &bb[0]);
  41. mac(&accum1, &a[7], &b[4]);
  42. accum0 -= accum2;
  43. accum1 += accum2;
  44. c[3] = ((uint64_t)(accum1)) & mask;
  45. c[7] = ((uint64_t)(accum0)) & mask;
  46. accum0 >>= 56;
  47. accum1 >>= 56;
  48. mac(&accum0, &aa[1],&bb[3]);
  49. mac(&accum1, &a[5], &b[7]);
  50. mac(&accum0, &aa[2], &bb[2]);
  51. mac(&accum1, &a[6], &b[6]);
  52. mac(&accum0, &aa[3], &bb[1]);
  53. accum1 += accum0;
  54. accum2 = widemul(&a[0],&b[0]);
  55. accum1 -= accum2;
  56. accum0 += accum2;
  57. msb(&accum0, &a[1], &b[3]);
  58. msb(&accum0, &a[2], &b[2]);
  59. mac(&accum1, &a[7], &b[5]);
  60. msb(&accum0, &a[3], &b[1]);
  61. mac(&accum1, &aa[0], &bb[0]);
  62. mac(&accum0, &a[4], &b[4]);
  63. c[0] = ((uint64_t)(accum0)) & mask;
  64. c[4] = ((uint64_t)(accum1)) & mask;
  65. accum0 >>= 56;
  66. accum1 >>= 56;
  67. accum2 = widemul(&a[2],&b[7]);
  68. mac(&accum0, &a[6], &bb[3]);
  69. mac(&accum1, &aa[2], &bbb[3]);
  70. mac(&accum2, &a[3], &b[6]);
  71. mac(&accum0, &a[7], &bb[2]);
  72. mac(&accum1, &aa[3], &bbb[2]);
  73. mac(&accum2, &a[0],&b[1]);
  74. mac(&accum1, &aa[0], &bb[1]);
  75. mac(&accum0, &a[4], &b[5]);
  76. mac(&accum2, &a[1], &b[0]);
  77. mac(&accum1, &aa[1], &bb[0]);
  78. mac(&accum0, &a[5], &b[4]);
  79. accum1 -= accum2;
  80. accum0 += accum2;
  81. c[1] = ((uint64_t)(accum0)) & mask;
  82. c[5] = ((uint64_t)(accum1)) & mask;
  83. accum0 >>= 56;
  84. accum1 >>= 56;
  85. accum2 = widemul(&a[3],&b[7]);
  86. mac(&accum0, &a[7], &bb[3]);
  87. mac(&accum1, &aa[3], &bbb[3]);
  88. mac(&accum2, &a[0],&b[2]);
  89. mac(&accum1, &aa[0], &bb[2]);
  90. mac(&accum0, &a[4], &b[6]);
  91. mac(&accum2, &a[1], &b[1]);
  92. mac(&accum1, &aa[1], &bb[1]);
  93. mac(&accum0, &a[5], &b[5]);
  94. mac(&accum2, &a[2], &b[0]);
  95. mac(&accum1, &aa[2], &bb[0]);
  96. mac(&accum0, &a[6], &b[4]);
  97. accum1 -= accum2;
  98. accum0 += accum2;
  99. c[2] = ((uint64_t)(accum0)) & mask;
  100. c[6] = ((uint64_t)(accum1)) & mask;
  101. accum0 >>= 56;
  102. accum1 >>= 56;
  103. accum0 += c[3];
  104. accum1 += c[7];
  105. c[3] = ((uint64_t)(accum0)) & mask;
  106. c[7] = ((uint64_t)(accum1)) & mask;
  107. /* we could almost stop here, but it wouldn't be stable, so... */
  108. accum0 >>= 56;
  109. accum1 >>= 56;
  110. c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
  111. c[0] += ((uint64_t)(accum1));
  112. }
  113. void
  114. p448_mulw (
  115. p448_t *__restrict__ cs,
  116. const p448_t *as,
  117. uint64_t b
  118. ) {
  119. const uint64_t *a = as->limb;
  120. uint64_t *c = cs->limb;
  121. __uint128_t accum0, accum4;
  122. uint64_t mask = (1ull<<56) - 1;
  123. accum0 = widemul_rm(b, &a[0]);
  124. accum4 = widemul_rm(b, &a[4]);
  125. c[0] = accum0 & mask; accum0 >>= 56;
  126. c[4] = accum4 & mask; accum4 >>= 56;
  127. mac_rm(&accum0, b, &a[1]);
  128. mac_rm(&accum4, b, &a[5]);
  129. c[1] = accum0 & mask; accum0 >>= 56;
  130. c[5] = accum4 & mask; accum4 >>= 56;
  131. mac_rm(&accum0, b, &a[2]);
  132. mac_rm(&accum4, b, &a[6]);
  133. c[2] = accum0 & mask; accum0 >>= 56;
  134. c[6] = accum4 & mask; accum4 >>= 56;
  135. mac_rm(&accum0, b, &a[3]);
  136. mac_rm(&accum4, b, &a[7]);
  137. c[3] = accum0 & mask; accum0 >>= 56;
  138. c[7] = accum4 & mask; accum4 >>= 56;
  139. // c[4] += accum0 + accum4;
  140. // c[0] += accum4;
  141. accum0 += accum4 + c[4];
  142. c[4] = accum0 & mask;
  143. c[5] += accum0 >> 56;
  144. accum4 += c[0];
  145. c[0] = accum4 & mask;
  146. c[1] += accum4 >> 56;
  147. }
  148. void
  149. p448_sqr (
  150. p448_t *__restrict__ cs,
  151. const p448_t *as
  152. ) {
  153. const uint64_t *a = as->limb;
  154. uint64_t *c = cs->limb;
  155. __uint128_t accum0 = 0, accum1 = 0, accum2;
  156. uint64_t mask = (1ull<<56) - 1;
  157. uint64_t aa[4] __attribute__((aligned(32)));
  158. /* For some reason clang doesn't vectorize this without prompting? */
  159. unsigned int i;
  160. for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
  161. ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
  162. }
  163. accum2 = widemul(&a[0],&a[3]);
  164. accum0 = widemul(&aa[0],&aa[3]);
  165. accum1 = widemul(&a[4],&a[7]);
  166. mac(&accum2, &a[1], &a[2]);
  167. mac(&accum0, &aa[1], &aa[2]);
  168. mac(&accum1, &a[5], &a[6]);
  169. accum0 -= accum2;
  170. accum1 += accum2;
  171. c[3] = ((uint64_t)(accum1))<<1 & mask;
  172. c[7] = ((uint64_t)(accum0))<<1 & mask;
  173. accum0 >>= 55;
  174. accum1 >>= 55;
  175. mac2(&accum0, &aa[1],&aa[3]);
  176. mac2(&accum1, &a[5], &a[7]);
  177. mac(&accum0, &aa[2], &aa[2]);
  178. accum1 += accum0;
  179. msb2(&accum0, &a[1], &a[3]);
  180. mac(&accum1, &a[6], &a[6]);
  181. accum2 = widemul(&a[0],&a[0]);
  182. accum1 -= accum2;
  183. accum0 += accum2;
  184. msb(&accum0, &a[2], &a[2]);
  185. mac(&accum1, &aa[0], &aa[0]);
  186. mac(&accum0, &a[4], &a[4]);
  187. c[0] = ((uint64_t)(accum0)) & mask;
  188. c[4] = ((uint64_t)(accum1)) & mask;
  189. accum0 >>= 56;
  190. accum1 >>= 56;
  191. accum2 = widemul2(&aa[2],&aa[3]);
  192. msb2(&accum0, &a[2], &a[3]);
  193. mac2(&accum1, &a[6], &a[7]);
  194. accum1 += accum2;
  195. accum0 += accum2;
  196. accum2 = widemul2(&a[0],&a[1]);
  197. mac2(&accum1, &aa[0], &aa[1]);
  198. mac2(&accum0, &a[4], &a[5]);
  199. accum1 -= accum2;
  200. accum0 += accum2;
  201. c[1] = ((uint64_t)(accum0)) & mask;
  202. c[5] = ((uint64_t)(accum1)) & mask;
  203. accum0 >>= 56;
  204. accum1 >>= 56;
  205. accum2 = widemul(&aa[3],&aa[3]);
  206. msb(&accum0, &a[3], &a[3]);
  207. mac(&accum1, &a[7], &a[7]);
  208. accum1 += accum2;
  209. accum0 += accum2;
  210. accum2 = widemul2(&a[0],&a[2]);
  211. mac2(&accum1, &aa[0], &aa[2]);
  212. mac2(&accum0, &a[4], &a[6]);
  213. mac(&accum2, &a[1], &a[1]);
  214. mac(&accum1, &aa[1], &aa[1]);
  215. mac(&accum0, &a[5], &a[5]);
  216. accum1 -= accum2;
  217. accum0 += accum2;
  218. c[2] = ((uint64_t)(accum0)) & mask;
  219. c[6] = ((uint64_t)(accum1)) & mask;
  220. accum0 >>= 56;
  221. accum1 >>= 56;
  222. accum0 += c[3];
  223. accum1 += c[7];
  224. c[3] = ((uint64_t)(accum0)) & mask;
  225. c[7] = ((uint64_t)(accum1)) & mask;
  226. /* we could almost stop here, but it wouldn't be stable, so... */
  227. accum0 >>= 56;
  228. accum1 >>= 56;
  229. c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
  230. c[0] += ((uint64_t)(accum1));
  231. }
  232. void
  233. p448_strong_reduce (
  234. p448_t *a
  235. ) {
  236. uint64_t mask = (1ull<<56)-1;
  237. /* first, clear high */
  238. a->limb[4] += a->limb[7]>>56;
  239. a->limb[0] += a->limb[7]>>56;
  240. a->limb[7] &= mask;
  241. /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */
  242. /* compute total_value - p. No need to reduce mod p. */
  243. __int128_t scarry = 0;
  244. int i;
  245. for (i=0; i<8; i++) {
  246. scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask);
  247. a->limb[i] = scarry & mask;
  248. scarry >>= 56;
  249. }
  250. /* uncommon case: it was >= p, so now scarry = 0 and this = x
  251. * common case: it was < p, so now scarry = -1 and this = x - p + 2^448
  252. * so let's add back in p. will carry back off the top for 2^448.
  253. */
  254. assert(is_zero(scarry) | is_zero(scarry+1));
  255. uint64_t scarry_mask = scarry & mask;
  256. __uint128_t carry = 0;
  257. /* add it back */
  258. for (i=0; i<8; i++) {
  259. carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask);
  260. a->limb[i] = carry & mask;
  261. carry >>= 56;
  262. }
  263. assert(is_zero(carry + scarry));
  264. }
  265. mask_t
  266. p448_is_zero (
  267. const struct p448_t *a
  268. ) {
  269. struct p448_t b;
  270. p448_copy(&b,a);
  271. p448_strong_reduce(&b);
  272. uint64_t any = 0;
  273. int i;
  274. for (i=0; i<8; i++) {
  275. any |= b.limb[i];
  276. }
  277. return is_zero(any);
  278. }
  279. void
  280. p448_serialize (
  281. uint8_t *serial,
  282. const struct p448_t *x
  283. ) {
  284. int i,j;
  285. p448_t red;
  286. p448_copy(&red, x);
  287. p448_strong_reduce(&red);
  288. for (i=0; i<8; i++) {
  289. for (j=0; j<7; j++) {
  290. serial[7*i+j] = red.limb[i];
  291. red.limb[i] >>= 8;
  292. }
  293. assert(red.limb[i] == 0);
  294. }
  295. }
  296. mask_t
  297. p448_deserialize (
  298. p448_t *x,
  299. const uint8_t serial[56]
  300. ) {
  301. int i,j;
  302. for (i=0; i<8; i++) {
  303. word_t out = 0;
  304. for (j=0; j<7; j++) {
  305. out |= ((word_t)serial[7*i+j])<<(8*j);
  306. }
  307. x->limb[i] = out;
  308. }
  309. /* Check for reduction.
  310. *
  311. * The idea is to create a variable ge which is all ones (rather, 56 ones)
  312. * if and only if the low $i$ words of $x$ are >= those of p.
  313. *
  314. * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
  315. */
  316. word_t ge = -1, mask = (1ull<<56)-1;
  317. for (i=0; i<4; i++) {
  318. ge &= x->limb[i];
  319. }
  320. /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */
  321. ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
  322. /* Propagate the rest */
  323. for (i=5; i<8; i++) {
  324. ge &= x->limb[i];
  325. }
  326. return ~is_zero(ge ^ mask);
  327. }
  328. void
  329. simultaneous_invert_p448(
  330. struct p448_t *__restrict__ out,
  331. const struct p448_t *in,
  332. unsigned int n
  333. ) {
  334. if (n==0) {
  335. return;
  336. } else if (n==1) {
  337. p448_inverse(out,in);
  338. return;
  339. }
  340. p448_copy(&out[1], &in[0]);
  341. int i;
  342. for (i=1; i<(int) (n-1); i++) {
  343. p448_mul(&out[i+1], &out[i], &in[i]);
  344. }
  345. p448_mul(&out[0], &out[n-1], &in[n-1]);
  346. struct p448_t tmp;
  347. p448_inverse(&tmp, &out[0]);
  348. p448_copy(&out[0], &tmp);
  349. /* at this point, out[0] = product(in[i]) ^ -1
  350. * out[i] = product(in[0]..in[i-1]) if i != 0
  351. */
  352. for (i=n-1; i>0; i--) {
  353. p448_mul(&tmp, &out[i], &out[0]);
  354. p448_copy(&out[i], &tmp);
  355. p448_mul(&tmp, &out[0], &in[i]);
  356. p448_copy(&out[0], &tmp);
  357. }
  358. }