You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

402 lines
10 KiB

  1. /* Copyright (c) 2014 Cryptography Research, Inc.
  2. * Released under the MIT License. See LICENSE.txt for license information.
  3. */
  4. #include "f_impl.h"
  /* Full-width product of two 64-bit operands.
   * The first operand is widened to 128 bits before multiplying, so no
   * bits of the result are lost.
   */
  static __inline__ __uint128_t widemul(
      const uint64_t a,
      const uint64_t b
  ) {
      __uint128_t product = a;
      product *= b;
      return product;
  }
  /* Branch-free zero test: returns all-ones (2^64 - 1) when a == 0 and
   * 0 otherwise.
   *
   * The input is widened to 128 bits and decremented.  Only a == 0 wraps
   * around to 2^128 - 1, which leaves the upper 64 bits all set; for any
   * non-zero input the upper 64 bits stay clear.
   */
  static __inline__ uint64_t is_zero(uint64_t a) {
      /* as in the original: this relies on the compiler not turning the
       * arithmetic into a comparison/branch. */
      __uint128_t widened = a;
      widened -= 1;
      return (uint64_t)(widened >> 64);
  }
  15. void
  16. p521_mul (
  17. p521_t *__restrict__ cs,
  18. const p521_t *as,
  19. const p521_t *bs
  20. ) {
  21. uint64_t *c = cs->limb;
  22. const uint64_t *a = as->limb, *b = bs->limb;
  23. __uint128_t accum0, accum1;
  24. accum0 = widemul(2*a[8], b[8]);
  25. accum1 = widemul(a[0], b[7]);
  26. accum0 += widemul(a[1], b[6]);
  27. accum1 += widemul(a[2], b[5]);
  28. accum0 += widemul(a[3], b[4]);
  29. accum1 += widemul(a[4], b[3]);
  30. accum0 += widemul(a[5], b[2]);
  31. accum1 += widemul(a[6], b[1]);
  32. accum0 += widemul(a[7], b[0]);
  33. accum1 += accum0;
  34. c[7] = accum1 & ((1ull<<58)-1);
  35. accum1 >>= 58;
  36. accum0 = 0;
  37. accum1 += widemul(a[0], b[8-0]);
  38. accum0 += widemul(a[1], b[8-1]);
  39. accum1 += widemul(a[2], b[8-2]);
  40. accum0 += widemul(a[3], b[8-3]);
  41. accum1 += widemul(a[4], b[8-4]);
  42. accum0 += widemul(a[5], b[8-5]);
  43. accum1 += widemul(a[6], b[8-6]);
  44. accum0 += widemul(a[7], b[8-7]);
  45. accum1 += widemul(a[8], b[8-8]);
  46. accum1 += accum0;
  47. c[8] = accum1 & ((1ull<<57)-1);
  48. accum1 >>= 57;
  49. accum0 = 0;
  50. accum0 += widemul(a[1], b[0+9-1]);
  51. accum0 += widemul(a[2], b[0+9-2]);
  52. accum0 += widemul(a[3], b[0+9-3]);
  53. accum0 += widemul(a[4], b[0+9-4]);
  54. accum1 += widemul(a[0], b[0-0]);
  55. accum0 += widemul(a[5], b[0+9-5]);
  56. accum0 += widemul(a[6], b[0+9-6]);
  57. accum0 += widemul(a[7], b[0+9-7]);
  58. accum0 += widemul(a[8], b[0+9-8]);
  59. accum1 += accum0 << 1;
  60. c[0] = accum1 & ((1ull<<58)-1);
  61. accum1 >>= 58;
  62. accum0 = 0;
  63. accum0 += widemul(a[2], b[1+9-2]);
  64. accum0 += widemul(a[3], b[1+9-3]);
  65. accum1 += widemul(a[0], b[1-0]);
  66. accum0 += widemul(a[4], b[1+9-4]);
  67. accum0 += widemul(a[5], b[1+9-5]);
  68. accum1 += widemul(a[1], b[1-1]);
  69. accum0 += widemul(a[6], b[1+9-6]);
  70. accum0 += widemul(a[7], b[1+9-7]);
  71. accum0 += widemul(a[8], b[1+9-8]);
  72. accum1 += accum0 << 1;
  73. c[1] = accum1 & ((1ull<<58)-1);
  74. accum1 >>= 58;
  75. accum0 = 0;
  76. accum0 += widemul(a[3], b[2+9-3]);
  77. accum1 += widemul(a[0], b[2-0]);
  78. accum0 += widemul(a[4], b[2+9-4]);
  79. accum0 += widemul(a[5], b[2+9-5]);
  80. accum1 += widemul(a[1], b[2-1]);
  81. accum0 += widemul(a[6], b[2+9-6]);
  82. accum0 += widemul(a[7], b[2+9-7]);
  83. accum1 += widemul(a[2], b[2-2]);
  84. accum0 += widemul(a[8], b[2+9-8]);
  85. accum1 += accum0 << 1;
  86. c[2] = accum1 & ((1ull<<58)-1);
  87. accum1 >>= 58;
  88. accum0 = 0;
  89. accum0 += widemul(a[4], b[3+9-4]);
  90. accum1 += widemul(a[0], b[3-0]);
  91. accum0 += widemul(a[5], b[3+9-5]);
  92. accum1 += widemul(a[1], b[3-1]);
  93. accum0 += widemul(a[6], b[3+9-6]);
  94. accum1 += widemul(a[2], b[3-2]);
  95. accum0 += widemul(a[7], b[3+9-7]);
  96. accum1 += widemul(a[3], b[3-3]);
  97. accum0 += widemul(a[8], b[3+9-8]);
  98. accum1 += accum0 << 1;
  99. c[3] = accum1 & ((1ull<<58)-1);
  100. accum1 >>= 58;
  101. accum0 = 0;
  102. accum1 += widemul(a[0], b[4-0]);
  103. accum0 += widemul(a[5], b[4+9-5]);
  104. accum1 += widemul(a[1], b[4-1]);
  105. accum0 += widemul(a[6], b[4+9-6]);
  106. accum1 += widemul(a[2], b[4-2]);
  107. accum0 += widemul(a[7], b[4+9-7]);
  108. accum1 += widemul(a[3], b[4-3]);
  109. accum0 += widemul(a[8], b[4+9-8]);
  110. accum1 += widemul(a[4], b[4-4]);
  111. accum1 += accum0 << 1;
  112. c[4] = accum1 & ((1ull<<58)-1);
  113. accum1 >>= 58;
  114. accum0 = 0;
  115. accum1 += widemul(a[0], b[5-0]);
  116. accum0 += widemul(a[6], b[5+9-6]);
  117. accum1 += widemul(a[1], b[5-1]);
  118. accum1 += widemul(a[2], b[5-2]);
  119. accum0 += widemul(a[7], b[5+9-7]);
  120. accum1 += widemul(a[3], b[5-3]);
  121. accum1 += widemul(a[4], b[5-4]);
  122. accum0 += widemul(a[8], b[5+9-8]);
  123. accum1 += widemul(a[5], b[5-5]);
  124. accum1 += accum0 << 1;
  125. c[5] = accum1 & ((1ull<<58)-1);
  126. accum1 >>= 58;
  127. accum0 = 0;
  128. accum1 += widemul(a[0], b[6-0]);
  129. accum1 += widemul(a[1], b[6-1]);
  130. accum0 += widemul(a[7], b[6+9-7]);
  131. accum1 += widemul(a[2], b[6-2]);
  132. accum1 += widemul(a[3], b[6-3]);
  133. accum1 += widemul(a[4], b[6-4]);
  134. accum0 += widemul(a[8], b[6+9-8]);
  135. accum1 += widemul(a[5], b[6-5]);
  136. accum1 += widemul(a[6], b[6-6]);
  137. accum1 += accum0 << 1;
  138. c[6] = accum1 & ((1ull<<58)-1);
  139. accum1 >>= 58;
  140. accum1 += c[7];
  141. c[7] = accum1 & ((1ull<<58)-1);
  142. c[8] += accum1 >> 58;
  143. }
  144. void
  145. p521_mulw (
  146. p521_t *__restrict__ cs,
  147. const p521_t *as,
  148. uint64_t b
  149. ) {
  150. const uint64_t *a = as->limb;
  151. uint64_t *c = cs->limb;
  152. __uint128_t accum0 = 0, accum3 = 0, accum6 = 0;
  153. uint64_t mask = (1ull<<58) - 1;
  154. int i;
  155. for (i=0; i<3; i++) {
  156. accum0 += widemul(b, a[i]);
  157. accum3 += widemul(b, a[i+3]);
  158. accum6 += widemul(b, a[i+6]);
  159. c[i] = accum0 & mask; accum0 >>= 58;
  160. c[i+3] = accum3 & mask; accum3 >>= 58;
  161. if (i==2) {
  162. c[i+6] = accum6 & (mask>>1); accum6 >>= 57;
  163. } else {
  164. c[i+6] = accum6 & mask; accum6 >>= 58;
  165. }
  166. }
  167. accum0 += c[3];
  168. c[3] = accum0 & mask;
  169. c[4] += accum0 >> 58;
  170. accum3 += c[6];
  171. c[6] = accum3 & mask;
  172. c[7] += accum3 >> 58;
  173. accum6 += c[0];
  174. c[0] = accum6 & mask;
  175. c[1] += accum6 >> 58;
  176. }
  177. void
  178. p521_sqr (
  179. p521_t *__restrict__ cs,
  180. const p521_t *as
  181. ) {
  182. uint64_t *c = cs->limb;
  183. const uint64_t *a = as->limb;
  184. __uint128_t accum0, accum1;
  185. accum0 = widemul(a[8], a[8]);
  186. accum1 = widemul(a[0], a[7]);
  187. accum0 += widemul(a[1], a[6]);
  188. accum1 += widemul(a[2], a[5]);
  189. accum0 += widemul(a[3], a[4]);
  190. accum1 += accum0;
  191. c[7] = 2 * (accum1 & ((1ull<<57)-1));
  192. accum1 >>= 57;
  193. accum0 = 0;
  194. accum0 = 0;
  195. accum1 += widemul(a[4], a[4]);
  196. accum0 += widemul(a[1], a[7]);
  197. accum1 += widemul(2*a[2], a[6]);
  198. accum0 += widemul(a[3], a[5]);
  199. accum1 += widemul(2*a[0], a[8]);
  200. accum1 += 2*accum0;
  201. c[8] = accum1 & ((1ull<<57)-1);
  202. accum1 >>= 57;
  203. accum0 = 0;
  204. accum1 += widemul(a[0], a[0]);
  205. accum0 += widemul(a[1], a[8]);
  206. accum0 += widemul(a[2], a[7]);
  207. accum0 += widemul(a[3], a[6]);
  208. accum0 += widemul(a[4], a[5]);
  209. accum1 += accum0 << 2;
  210. c[0] = accum1 & ((1ull<<58)-1);
  211. accum1 >>= 58;
  212. accum0 = 0;
  213. accum0 += widemul(a[2], a[8]);
  214. accum0 += widemul(a[3], a[7]);
  215. accum0 += widemul(a[4], a[6]);
  216. accum0 <<= 1;
  217. accum0 += widemul(a[5], a[5]);
  218. accum0 += widemul(a[0], a[1]);
  219. accum1 += accum0 << 1;
  220. c[1] = accum1 & ((1ull<<58)-1);
  221. accum1 >>= 58;
  222. accum0 = 0;
  223. accum1 += widemul(a[1], a[1]);
  224. accum0 += widemul(a[3], a[8]);
  225. accum0 += widemul(a[4], a[7]);
  226. accum0 += widemul(a[5], a[6]);
  227. accum0 <<= 1;
  228. accum0 += widemul(a[0], a[2]);
  229. accum1 += accum0 << 1;
  230. c[2] = accum1 & ((1ull<<58)-1);
  231. accum1 >>= 58;
  232. accum0 = 0;
  233. accum0 += widemul(a[6], a[6]);
  234. accum0 += widemul(2*a[5], a[7]);
  235. accum0 += widemul(2*a[4], a[8]);
  236. accum0 += widemul(a[0], a[3]);
  237. accum0 += widemul(a[1], a[2]);
  238. accum1 += accum0 << 1;
  239. c[3] = accum1 & ((1ull<<58)-1);
  240. accum1 >>= 58;
  241. accum0 = 0;
  242. accum0 += widemul(a[6], a[7]);
  243. accum0 += widemul(a[5], a[8]);
  244. accum0 <<= 1;
  245. accum1 += widemul(a[2], a[2]);
  246. accum0 += widemul(a[0], a[4]);
  247. accum0 += widemul(a[1], a[3]);
  248. accum1 += accum0 << 1;
  249. c[4] = accum1 & ((1ull<<58)-1);
  250. accum1 >>= 58;
  251. accum0 = 0;
  252. accum0 += widemul(2*a[6], a[8]);
  253. accum0 += widemul(a[7], a[7]);
  254. accum0 += widemul(a[0], a[5]);
  255. accum0 += widemul(a[1], a[4]);
  256. accum0 += widemul(a[2], a[3]);
  257. accum1 += accum0 << 1;
  258. c[5] = accum1 & ((1ull<<58)-1);
  259. accum1 >>= 58;
  260. accum0 = 0;
  261. accum1 += widemul(a[3], a[3]);
  262. accum0 += widemul(a[0], a[6]);
  263. accum0 += widemul(a[1], a[5]);
  264. accum0 += widemul(2*a[7], a[8]);
  265. accum0 += widemul(a[2], a[4]);
  266. accum1 += accum0 << 1;
  267. c[6] = accum1 & ((1ull<<58)-1);
  268. accum1 >>= 58;
  269. accum1 += c[7];
  270. c[7] = accum1 & ((1ull<<58)-1);
  271. c[8] += accum1 >> 58;
  272. }
  273. void
  274. p521_strong_reduce (
  275. p521_t *a
  276. ) {
  277. uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1;
  278. /* first, clear high */
  279. __int128_t scarry = a->limb[8]>>57;
  280. a->limb[8] &= mask2;
  281. /* now the total is less than 2p */
  282. /* compute total_value - p. No need to reduce mod p. */
  283. int i;
  284. for (i=0; i<9; i++) {
  285. scarry = scarry + a->limb[i] - ((i==8) ? mask2 : mask);
  286. a->limb[i] = scarry & ((i==8) ? mask2 : mask);
  287. scarry >>= (i==8) ? 57 : 58;
  288. }
  289. /* uncommon case: it was >= p, so now scarry = 0 and this = x
  290. * common case: it was < p, so now scarry = -1 and this = x - p + 2^521
  291. * so let's add back in p. will carry back off the top for 2^521.
  292. */
  293. assert(is_zero(scarry) | is_zero(scarry+1));
  294. uint64_t scarry_mask = scarry & mask;
  295. __uint128_t carry = 0;
  296. /* add it back */
  297. for (i=0; i<9; i++) {
  298. carry = carry + a->limb[i] + ((i==8)?(scarry_mask>>1):scarry_mask);
  299. a->limb[i] = carry & ((i==8) ? mask>>1 : mask);
  300. carry >>= (i==8) ? 57 : 58;
  301. }
  302. assert(is_zero(carry + scarry));
  303. }
  304. void
  305. p521_serialize (
  306. uint8_t *serial,
  307. const struct p521_t *x
  308. ) {
  309. int i,k=0;
  310. p521_t red;
  311. p521_copy(&red, x);
  312. p521_strong_reduce(&red);
  313. uint64_t r=0;
  314. int bits = 0;
  315. for (i=0; i<9; i++) {
  316. r |= red.limb[i] << bits;
  317. for (bits += 58; bits >= 8; bits -= 8) {
  318. serial[k++] = r;
  319. r >>= 8;
  320. }
  321. assert(bits <= 6);
  322. }
  323. assert(bits);
  324. serial[k++] = r;
  325. }
  326. mask_t
  327. p521_deserialize (
  328. p521_t *x,
  329. const uint8_t serial[66]
  330. ) {
  331. int i,k=0,bits=0;
  332. __uint128_t out = 0;
  333. uint64_t mask = (1ull<<58)-1;
  334. for (i=0; i<9; i++) {
  335. out >>= 58;
  336. for (; bits<58; bits+=8) {
  337. out |= ((__uint128_t)serial[k++])<<bits;
  338. }
  339. x->limb[i] = out & mask;
  340. bits -= 58;
  341. }
  342. /* Check for reduction. First, high has to be < 2^57 */
  343. mask_t good = is_zero(out>>57);
  344. uint64_t and = -1ull;
  345. for (i=0; i<8; i++) {
  346. and &= x->limb[i];
  347. }
  348. and &= (2*out+1);
  349. good &= is_zero((and+1)>>58);
  350. return good;
  351. }