You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

388 lines
8.6 KiB

  1. /* Copyright (c) 2014 Cryptography Research, Inc.
  2. * Released under the MIT License. See LICENSE.txt for license information.
  3. */
  4. #include "p448.h"
  5. #include "x86-64-arith.h"
  6. void p448_mul(p448_t *__restrict__ cs, const p448_t *as, const p448_t *bs) {
  7. const uint64_t *a = as->limb, *b = bs->limb;
  8. uint64_t *c = cs->limb;
  9. __uint128_t accum0 = 0, accum1 = 0, accum2;
  10. uint64_t mask = (1ull<<56) - 1;
  11. uint64_t aa[4], bb[4];
  12. /* For some reason clang doesn't vectorize this without prompting? */
  13. unsigned int i;
  14. for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
  15. ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
  16. ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];
  17. }
  18. /*
  19. for (int i=0; i<4; i++) {
  20. aa[i] = a[i] + a[i+4];
  21. bb[i] = b[i] + b[i+4];
  22. }
  23. */
  24. accum2 = widemul(&a[0],&b[3]);
  25. accum1 = widemul(&aa[0],&bb[3]);
  26. accum0 = widemul(&a[4],&b[7]);
  27. mac(&accum2, &a[1], &b[2]);
  28. mac(&accum1, &aa[1], &bb[2]);
  29. mac(&accum0, &a[5], &b[6]);
  30. mac(&accum2, &a[2], &b[1]);
  31. mac(&accum1, &aa[2], &bb[1]);
  32. mac(&accum0, &a[6], &b[5]);
  33. mac(&accum2, &a[3], &b[0]);
  34. mac(&accum1, &aa[3], &bb[0]);
  35. mac(&accum0, &a[7], &b[4]);
  36. accum1 -= accum2;
  37. accum0 += accum2;
  38. c[3] = ((uint64_t)(accum0)) & mask;
  39. c[7] = ((uint64_t)(accum1)) & mask;
  40. accum0 >>= 56;
  41. accum1 >>= 56;
  42. {
  43. accum2 = accum1;
  44. accum1 += accum0;
  45. accum0 = accum2;
  46. }
  47. accum2 = widemul(&a[0],&b[0]);
  48. accum1 -= accum2;
  49. accum0 += accum2;
  50. accum2 = widemul(&aa[1],&bb[3]);
  51. msb(&accum0, &a[1], &b[3]);
  52. mac(&accum1, &a[5], &b[7]);
  53. msb(&accum0, &a[2], &b[2]);
  54. mac(&accum2, &aa[2], &bb[2]);
  55. mac(&accum1, &a[6], &b[6]);
  56. msb(&accum0, &a[3], &b[1]);
  57. mac(&accum1, &a[7], &b[5]);
  58. mac(&accum2, &aa[3], &bb[1]);
  59. accum0 += accum2;
  60. accum1 += accum2;
  61. mac(&accum0, &a[4], &b[4]);
  62. mac(&accum1, &aa[0], &bb[0]);
  63. c[0] = ((uint64_t)(accum0)) & mask;
  64. c[4] = ((uint64_t)(accum1)) & mask;
  65. accum0 >>= 56;
  66. accum1 >>= 56;
  67. accum2 = widemul(&aa[2],&bb[3]);
  68. msb(&accum0, &a[2], &b[3]);
  69. mac(&accum1, &a[6], &b[7]);
  70. mac(&accum2, &aa[3], &bb[2]);
  71. msb(&accum0, &a[3], &b[2]);
  72. mac(&accum1, &a[7], &b[6]);
  73. accum1 += accum2;
  74. accum0 += accum2;
  75. accum2 = widemul(&a[0],&b[1]);
  76. mac(&accum1, &aa[0], &bb[1]);
  77. mac(&accum0, &a[4], &b[5]);
  78. mac(&accum2, &a[1], &b[0]);
  79. mac(&accum1, &aa[1], &bb[0]);
  80. mac(&accum0, &a[5], &b[4]);
  81. accum1 -= accum2;
  82. accum0 += accum2;
  83. c[1] = ((uint64_t)(accum0)) & mask;
  84. c[5] = ((uint64_t)(accum1)) & mask;
  85. accum0 >>= 56;
  86. accum1 >>= 56;
  87. accum2 = widemul(&aa[3],&bb[3]);
  88. msb(&accum0, &a[3], &b[3]);
  89. mac(&accum1, &a[7], &b[7]);
  90. accum1 += accum2;
  91. accum0 += accum2;
  92. accum2 = widemul(&a[0],&b[2]);
  93. mac(&accum1, &aa[0], &bb[2]);
  94. mac(&accum0, &a[4], &b[6]);
  95. mac(&accum2, &a[1], &b[1]);
  96. mac(&accum1, &aa[1], &bb[1]);
  97. mac(&accum0, &a[5], &b[5]);
  98. mac(&accum2, &a[2], &b[0]);
  99. mac(&accum1, &aa[2], &bb[0]);
  100. mac(&accum0, &a[6], &b[4]);
  101. accum1 -= accum2;
  102. accum0 += accum2;
  103. c[2] = ((uint64_t)(accum0)) & mask;
  104. c[6] = ((uint64_t)(accum1)) & mask;
  105. accum0 >>= 56;
  106. accum1 >>= 56;
  107. accum0 += c[3];
  108. accum1 += c[7];
  109. c[3] = ((uint64_t)(accum0)) & mask;
  110. c[7] = ((uint64_t)(accum1)) & mask;
  111. /* we could almost stop here, but it wouldn't be stable, so... */
  112. accum0 >>= 56;
  113. accum1 >>= 56;
  114. c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
  115. c[0] += ((uint64_t)(accum1));
  116. }
  117. void p448_mulw(p448_t *__restrict__ cs, const p448_t *as, uint64_t b) {
  118. const uint64_t *a = as->limb;
  119. uint64_t *c = cs->limb;
  120. __uint128_t accum0, accum4;
  121. uint64_t mask = (1ull<<56) - 1;
  122. accum0 = widemul_rm(b, &a[0]);
  123. accum4 = widemul_rm(b, &a[4]);
  124. c[0] = accum0 & mask; accum0 >>= 56;
  125. c[4] = accum4 & mask; accum4 >>= 56;
  126. mac_rm(&accum0, b, &a[1]);
  127. mac_rm(&accum4, b, &a[5]);
  128. c[1] = accum0 & mask; accum0 >>= 56;
  129. c[5] = accum4 & mask; accum4 >>= 56;
  130. mac_rm(&accum0, b, &a[2]);
  131. mac_rm(&accum4, b, &a[6]);
  132. c[2] = accum0 & mask; accum0 >>= 56;
  133. c[6] = accum4 & mask; accum4 >>= 56;
  134. mac_rm(&accum0, b, &a[3]);
  135. mac_rm(&accum4, b, &a[7]);
  136. c[3] = accum0 & mask; accum0 >>= 56;
  137. c[7] = accum4 & mask; accum4 >>= 56;
  138. c[4] += accum0 + accum4;
  139. c[0] += accum4;
  140. }
  141. void p448_sqr(p448_t *__restrict__ cs, const p448_t *as) {
  142. const uint64_t *a = as->limb;
  143. uint64_t *c = cs->limb;
  144. __uint128_t accum0 = 0, accum1 = 0, accum2;
  145. uint64_t mask = (1ull<<56) - 1;
  146. uint64_t aa[4];
  147. /* For some reason clang doesn't vectorize this without prompting? */
  148. unsigned int i;
  149. for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
  150. ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
  151. }
  152. accum2 = widemul(&a[0],&a[3]);
  153. accum1 = widemul(&aa[0],&aa[3]);
  154. accum0 = widemul(&a[4],&a[7]);
  155. mac(&accum2, &a[1], &a[2]);
  156. mac(&accum1, &aa[1], &aa[2]);
  157. mac(&accum0, &a[5], &a[6]);
  158. accum1 -= accum2;
  159. accum0 += accum2;
  160. c[3] = ((uint64_t)(accum0))<<1 & mask;
  161. c[7] = ((uint64_t)(accum1))<<1 & mask;
  162. accum0 >>= 55;
  163. accum1 >>= 55;
  164. {
  165. accum2 = accum1;
  166. accum1 += accum0;
  167. accum0 = accum2;
  168. }
  169. accum2 = widemul(&a[0],&a[0]);
  170. accum1 -= accum2;
  171. accum0 += accum2;
  172. accum2 = widemul2(&aa[1],&aa[3]);
  173. msb2(&accum0, &a[1], &a[3]);
  174. mac2(&accum1, &a[5], &a[7]);
  175. msb(&accum0, &a[2], &a[2]);
  176. mac(&accum2, &aa[2], &aa[2]);
  177. mac(&accum1, &a[6], &a[6]);
  178. accum0 += accum2;
  179. accum1 += accum2;
  180. mac(&accum0, &a[4], &a[4]);
  181. mac(&accum1, &aa[0], &aa[0]);
  182. c[0] = ((uint64_t)(accum0)) & mask;
  183. c[4] = ((uint64_t)(accum1)) & mask;
  184. accum0 >>= 56;
  185. accum1 >>= 56;
  186. accum2 = widemul2(&aa[2],&aa[3]);
  187. msb2(&accum0, &a[2], &a[3]);
  188. mac2(&accum1, &a[6], &a[7]);
  189. accum1 += accum2;
  190. accum0 += accum2;
  191. accum2 = widemul2(&a[0],&a[1]);
  192. mac2(&accum1, &aa[0], &aa[1]);
  193. mac2(&accum0, &a[4], &a[5]);
  194. accum1 -= accum2;
  195. accum0 += accum2;
  196. c[1] = ((uint64_t)(accum0)) & mask;
  197. c[5] = ((uint64_t)(accum1)) & mask;
  198. accum0 >>= 56;
  199. accum1 >>= 56;
  200. accum2 = widemul(&aa[3],&aa[3]);
  201. msb(&accum0, &a[3], &a[3]);
  202. mac(&accum1, &a[7], &a[7]);
  203. accum1 += accum2;
  204. accum0 += accum2;
  205. accum2 = widemul2(&a[0],&a[2]);
  206. mac2(&accum1, &aa[0], &aa[2]);
  207. mac2(&accum0, &a[4], &a[6]);
  208. mac(&accum2, &a[1], &a[1]);
  209. mac(&accum1, &aa[1], &aa[1]);
  210. mac(&accum0, &a[5], &a[5]);
  211. accum1 -= accum2;
  212. accum0 += accum2;
  213. c[2] = ((uint64_t)(accum0)) & mask;
  214. c[6] = ((uint64_t)(accum1)) & mask;
  215. accum0 >>= 56;
  216. accum1 >>= 56;
  217. accum0 += c[3];
  218. accum1 += c[7];
  219. c[3] = ((uint64_t)(accum0)) & mask;
  220. c[7] = ((uint64_t)(accum1)) & mask;
  221. /* we could almost stop here, but it wouldn't be stable, so... */
  222. accum0 >>= 56;
  223. accum1 >>= 56;
  224. c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
  225. c[0] += ((uint64_t)(accum1));
  226. }
  227. static __inline__ void p448_sqr_inplace(p448_t *x) {
  228. p448_t y;
  229. p448_sqr(&y,x);
  230. *x = y;
  231. }
  232. static __inline__ void p448_mul_inplace(p448_t *x, const p448_t *z) {
  233. p448_t y;
  234. p448_mul(&y,x,z);
  235. *x = y;
  236. }
  237. static __inline__ void p448_repunit(p448_t *x, int space, int teeth) {
  238. int i,j;
  239. p448_t working = *x;
  240. for (i=0; i<teeth-1; i++) {
  241. for (j=0; j<space-(i?0:1); j++)
  242. p448_sqr_inplace(&working);
  243. if (i==teeth-2)
  244. p448_mul_inplace(x,&working);
  245. else
  246. p448_mul_inplace(&working,x);
  247. }
  248. }
  249. void
  250. p448_strong_reduce(p448_t *a) {
  251. uint64_t mask = (1ull<<56)-1;
  252. /* first, clear high */
  253. a->limb[4] += a->limb[7]>>56;
  254. a->limb[0] += a->limb[7]>>56;
  255. a->limb[7] &= mask;
  256. /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */
  257. /* compute total_value - p. No need to reduce mod p. */
  258. __int128_t scarry = 0;
  259. int i;
  260. for (i=0; i<8; i++) {
  261. scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask);
  262. a->limb[i] = scarry & mask;
  263. scarry >>= 56;
  264. }
  265. /* uncommon case: it was >= p, so now scarry = 0 and this = x
  266. * common case: it was < p, so now scarry = -1 and this = x - p + 2^448
  267. * so let's add back in p. will carry back off the top for 2^448.
  268. */
  269. assert(is_zero(scarry) | is_zero(scarry+1));
  270. uint64_t scarry_mask = scarry & mask;
  271. __uint128_t carry = 0;
  272. /* add it back */
  273. for (i=0; i<8; i++) {
  274. carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask);
  275. a->limb[i] = carry & mask;
  276. carry >>= 56;
  277. }
  278. assert(is_zero(carry + scarry));
  279. }
  280. mask_t p448_is_zero(const struct p448_t *a) {
  281. struct p448_t b;
  282. p448_copy(&b,a);
  283. p448_strong_reduce(&b);
  284. uint64_t any = 0;
  285. int i;
  286. for (i=0; i<8; i++) {
  287. any |= b.limb[i];
  288. }
  289. return is_zero(any);
  290. }