You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

447 lines
9.6 KiB

  1. /* Copyright (c) 2014 Cryptography Research, Inc.
  2. * Released under the MIT License. See LICENSE.txt for license information.
  3. */
  4. #include "p448.h"
  5. #include "x86-64-arith.h"
  6. void
  7. p448_mul (
  8. p448_t *__restrict__ cs,
  9. const p448_t *as,
  10. const p448_t *bs
  11. ) {
  12. const uint64_t *a = as->limb, *b = bs->limb;
  13. uint64_t *c = cs->limb;
  14. __uint128_t accum0 = 0, accum1 = 0, accum2;
  15. uint64_t mask = (1ull<<56) - 1;
  16. uint64_t aa[4], bb[4];
  17. /* For some reason clang doesn't vectorize this without prompting? */
  18. unsigned int i;
  19. for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
  20. ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
  21. ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];
  22. }
  23. /*
  24. for (int i=0; i<4; i++) {
  25. aa[i] = a[i] + a[i+4];
  26. bb[i] = b[i] + b[i+4];
  27. }
  28. */
  29. accum2 = widemul(&a[0],&b[3]);
  30. accum1 = widemul(&aa[0],&bb[3]);
  31. accum0 = widemul(&a[4],&b[7]);
  32. mac(&accum2, &a[1], &b[2]);
  33. mac(&accum1, &aa[1], &bb[2]);
  34. mac(&accum0, &a[5], &b[6]);
  35. mac(&accum2, &a[2], &b[1]);
  36. mac(&accum1, &aa[2], &bb[1]);
  37. mac(&accum0, &a[6], &b[5]);
  38. mac(&accum2, &a[3], &b[0]);
  39. mac(&accum1, &aa[3], &bb[0]);
  40. mac(&accum0, &a[7], &b[4]);
  41. accum1 -= accum2;
  42. accum0 += accum2;
  43. c[3] = ((uint64_t)(accum0)) & mask;
  44. c[7] = ((uint64_t)(accum1)) & mask;
  45. accum0 >>= 56;
  46. accum1 >>= 56;
  47. {
  48. accum2 = accum1;
  49. accum1 += accum0;
  50. accum0 = accum2;
  51. }
  52. accum2 = widemul(&a[0],&b[0]);
  53. accum1 -= accum2;
  54. accum0 += accum2;
  55. accum2 = widemul(&aa[1],&bb[3]);
  56. msb(&accum0, &a[1], &b[3]);
  57. mac(&accum1, &a[5], &b[7]);
  58. msb(&accum0, &a[2], &b[2]);
  59. mac(&accum2, &aa[2], &bb[2]);
  60. mac(&accum1, &a[6], &b[6]);
  61. msb(&accum0, &a[3], &b[1]);
  62. mac(&accum1, &a[7], &b[5]);
  63. mac(&accum2, &aa[3], &bb[1]);
  64. accum0 += accum2;
  65. accum1 += accum2;
  66. mac(&accum0, &a[4], &b[4]);
  67. mac(&accum1, &aa[0], &bb[0]);
  68. c[0] = ((uint64_t)(accum0)) & mask;
  69. c[4] = ((uint64_t)(accum1)) & mask;
  70. accum0 >>= 56;
  71. accum1 >>= 56;
  72. accum2 = widemul(&aa[2],&bb[3]);
  73. msb(&accum0, &a[2], &b[3]);
  74. mac(&accum1, &a[6], &b[7]);
  75. mac(&accum2, &aa[3], &bb[2]);
  76. msb(&accum0, &a[3], &b[2]);
  77. mac(&accum1, &a[7], &b[6]);
  78. accum1 += accum2;
  79. accum0 += accum2;
  80. accum2 = widemul(&a[0],&b[1]);
  81. mac(&accum1, &aa[0], &bb[1]);
  82. mac(&accum0, &a[4], &b[5]);
  83. mac(&accum2, &a[1], &b[0]);
  84. mac(&accum1, &aa[1], &bb[0]);
  85. mac(&accum0, &a[5], &b[4]);
  86. accum1 -= accum2;
  87. accum0 += accum2;
  88. c[1] = ((uint64_t)(accum0)) & mask;
  89. c[5] = ((uint64_t)(accum1)) & mask;
  90. accum0 >>= 56;
  91. accum1 >>= 56;
  92. accum2 = widemul(&aa[3],&bb[3]);
  93. msb(&accum0, &a[3], &b[3]);
  94. mac(&accum1, &a[7], &b[7]);
  95. accum1 += accum2;
  96. accum0 += accum2;
  97. accum2 = widemul(&a[0],&b[2]);
  98. mac(&accum1, &aa[0], &bb[2]);
  99. mac(&accum0, &a[4], &b[6]);
  100. mac(&accum2, &a[1], &b[1]);
  101. mac(&accum1, &aa[1], &bb[1]);
  102. mac(&accum0, &a[5], &b[5]);
  103. mac(&accum2, &a[2], &b[0]);
  104. mac(&accum1, &aa[2], &bb[0]);
  105. mac(&accum0, &a[6], &b[4]);
  106. accum1 -= accum2;
  107. accum0 += accum2;
  108. c[2] = ((uint64_t)(accum0)) & mask;
  109. c[6] = ((uint64_t)(accum1)) & mask;
  110. accum0 >>= 56;
  111. accum1 >>= 56;
  112. accum0 += c[3];
  113. accum1 += c[7];
  114. c[3] = ((uint64_t)(accum0)) & mask;
  115. c[7] = ((uint64_t)(accum1)) & mask;
  116. /* we could almost stop here, but it wouldn't be stable, so... */
  117. accum0 >>= 56;
  118. accum1 >>= 56;
  119. c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
  120. c[0] += ((uint64_t)(accum1));
  121. }
  122. void
  123. p448_mulw (
  124. p448_t *__restrict__ cs,
  125. const p448_t *as,
  126. uint64_t b
  127. ) {
  128. const uint64_t *a = as->limb;
  129. uint64_t *c = cs->limb;
  130. __uint128_t accum0, accum4;
  131. uint64_t mask = (1ull<<56) - 1;
  132. accum0 = widemul_rm(b, &a[0]);
  133. accum4 = widemul_rm(b, &a[4]);
  134. c[0] = accum0 & mask; accum0 >>= 56;
  135. c[4] = accum4 & mask; accum4 >>= 56;
  136. mac_rm(&accum0, b, &a[1]);
  137. mac_rm(&accum4, b, &a[5]);
  138. c[1] = accum0 & mask; accum0 >>= 56;
  139. c[5] = accum4 & mask; accum4 >>= 56;
  140. mac_rm(&accum0, b, &a[2]);
  141. mac_rm(&accum4, b, &a[6]);
  142. c[2] = accum0 & mask; accum0 >>= 56;
  143. c[6] = accum4 & mask; accum4 >>= 56;
  144. mac_rm(&accum0, b, &a[3]);
  145. mac_rm(&accum4, b, &a[7]);
  146. c[3] = accum0 & mask; accum0 >>= 56;
  147. c[7] = accum4 & mask; accum4 >>= 56;
  148. c[4] += accum0 + accum4;
  149. c[0] += accum4;
  150. }
  151. void
  152. p448_sqr (
  153. p448_t *__restrict__ cs,
  154. const p448_t *as
  155. ) {
  156. const uint64_t *a = as->limb;
  157. uint64_t *c = cs->limb;
  158. __uint128_t accum0 = 0, accum1 = 0, accum2;
  159. uint64_t mask = (1ull<<56) - 1;
  160. uint64_t aa[4];
  161. /* For some reason clang doesn't vectorize this without prompting? */
  162. unsigned int i;
  163. for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
  164. ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
  165. }
  166. accum2 = widemul(&a[0],&a[3]);
  167. accum1 = widemul(&aa[0],&aa[3]);
  168. accum0 = widemul(&a[4],&a[7]);
  169. mac(&accum2, &a[1], &a[2]);
  170. mac(&accum1, &aa[1], &aa[2]);
  171. mac(&accum0, &a[5], &a[6]);
  172. accum1 -= accum2;
  173. accum0 += accum2;
  174. c[3] = ((uint64_t)(accum0))<<1 & mask;
  175. c[7] = ((uint64_t)(accum1))<<1 & mask;
  176. accum0 >>= 55;
  177. accum1 >>= 55;
  178. {
  179. accum2 = accum1;
  180. accum1 += accum0;
  181. accum0 = accum2;
  182. }
  183. accum2 = widemul(&a[0],&a[0]);
  184. accum1 -= accum2;
  185. accum0 += accum2;
  186. accum2 = widemul2(&aa[1],&aa[3]);
  187. msb2(&accum0, &a[1], &a[3]);
  188. mac2(&accum1, &a[5], &a[7]);
  189. msb(&accum0, &a[2], &a[2]);
  190. mac(&accum2, &aa[2], &aa[2]);
  191. mac(&accum1, &a[6], &a[6]);
  192. accum0 += accum2;
  193. accum1 += accum2;
  194. mac(&accum0, &a[4], &a[4]);
  195. mac(&accum1, &aa[0], &aa[0]);
  196. c[0] = ((uint64_t)(accum0)) & mask;
  197. c[4] = ((uint64_t)(accum1)) & mask;
  198. accum0 >>= 56;
  199. accum1 >>= 56;
  200. accum2 = widemul2(&aa[2],&aa[3]);
  201. msb2(&accum0, &a[2], &a[3]);
  202. mac2(&accum1, &a[6], &a[7]);
  203. accum1 += accum2;
  204. accum0 += accum2;
  205. accum2 = widemul2(&a[0],&a[1]);
  206. mac2(&accum1, &aa[0], &aa[1]);
  207. mac2(&accum0, &a[4], &a[5]);
  208. accum1 -= accum2;
  209. accum0 += accum2;
  210. c[1] = ((uint64_t)(accum0)) & mask;
  211. c[5] = ((uint64_t)(accum1)) & mask;
  212. accum0 >>= 56;
  213. accum1 >>= 56;
  214. accum2 = widemul(&aa[3],&aa[3]);
  215. msb(&accum0, &a[3], &a[3]);
  216. mac(&accum1, &a[7], &a[7]);
  217. accum1 += accum2;
  218. accum0 += accum2;
  219. accum2 = widemul2(&a[0],&a[2]);
  220. mac2(&accum1, &aa[0], &aa[2]);
  221. mac2(&accum0, &a[4], &a[6]);
  222. mac(&accum2, &a[1], &a[1]);
  223. mac(&accum1, &aa[1], &aa[1]);
  224. mac(&accum0, &a[5], &a[5]);
  225. accum1 -= accum2;
  226. accum0 += accum2;
  227. c[2] = ((uint64_t)(accum0)) & mask;
  228. c[6] = ((uint64_t)(accum1)) & mask;
  229. accum0 >>= 56;
  230. accum1 >>= 56;
  231. accum0 += c[3];
  232. accum1 += c[7];
  233. c[3] = ((uint64_t)(accum0)) & mask;
  234. c[7] = ((uint64_t)(accum1)) & mask;
  235. /* we could almost stop here, but it wouldn't be stable, so... */
  236. accum0 >>= 56;
  237. accum1 >>= 56;
  238. c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
  239. c[0] += ((uint64_t)(accum1));
  240. }
  241. void
  242. p448_strong_reduce (
  243. p448_t *a
  244. ) {
  245. uint64_t mask = (1ull<<56)-1;
  246. /* first, clear high */
  247. a->limb[4] += a->limb[7]>>56;
  248. a->limb[0] += a->limb[7]>>56;
  249. a->limb[7] &= mask;
  250. /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */
  251. /* compute total_value - p. No need to reduce mod p. */
  252. __int128_t scarry = 0;
  253. int i;
  254. for (i=0; i<8; i++) {
  255. scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask);
  256. a->limb[i] = scarry & mask;
  257. scarry >>= 56;
  258. }
  259. /* uncommon case: it was >= p, so now scarry = 0 and this = x
  260. * common case: it was < p, so now scarry = -1 and this = x - p + 2^448
  261. * so let's add back in p. will carry back off the top for 2^448.
  262. */
  263. assert(is_zero(scarry) | is_zero(scarry+1));
  264. uint64_t scarry_mask = scarry & mask;
  265. __uint128_t carry = 0;
  266. /* add it back */
  267. for (i=0; i<8; i++) {
  268. carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask);
  269. a->limb[i] = carry & mask;
  270. carry >>= 56;
  271. }
  272. assert(is_zero(carry + scarry));
  273. }
  274. mask_t
  275. p448_is_zero (
  276. const struct p448_t *a
  277. ) {
  278. struct p448_t b;
  279. p448_copy(&b,a);
  280. p448_strong_reduce(&b);
  281. uint64_t any = 0;
  282. int i;
  283. for (i=0; i<8; i++) {
  284. any |= b.limb[i];
  285. }
  286. return is_zero(any);
  287. }
  288. void
  289. p448_serialize (
  290. uint8_t *serial,
  291. const struct p448_t *x
  292. ) {
  293. int i,j;
  294. p448_t red;
  295. p448_copy(&red, x);
  296. p448_strong_reduce(&red);
  297. for (i=0; i<8; i++) {
  298. for (j=0; j<7; j++) {
  299. serial[7*i+j] = red.limb[i];
  300. red.limb[i] >>= 8;
  301. }
  302. assert(red.limb[i] == 0);
  303. }
  304. }
  305. void
  306. q448_serialize (
  307. uint8_t *serial,
  308. const word_t x[7]
  309. ) {
  310. int i,j;
  311. for (i=0; i<7; i++) {
  312. for (j=0; j<8; j++) {
  313. serial[8*i+j] = x[i]>>(8*j);
  314. }
  315. }
  316. }
  317. mask_t
  318. q448_deserialize (
  319. word_t x[7],
  320. const uint8_t serial[56]
  321. ) {
  322. int i,j;
  323. for (i=0; i<7; i++) {
  324. word_t out = 0;
  325. for (j=0; j<8; j++) {
  326. out |= ((word_t)serial[8*i+j])<<(8*j);
  327. }
  328. x[i] = out;
  329. }
  330. /* TODO: check for reduction */
  331. return MASK_SUCCESS;
  332. }
  333. mask_t
  334. p448_deserialize (
  335. p448_t *x,
  336. const uint8_t serial[56]
  337. ) {
  338. int i,j;
  339. for (i=0; i<8; i++) {
  340. word_t out = 0;
  341. for (j=0; j<7; j++) {
  342. out |= ((word_t)serial[7*i+j])<<(8*j);
  343. }
  344. x->limb[i] = out;
  345. }
  346. /* TODO: check for reduction */
  347. return MASK_SUCCESS;
  348. }