You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

947 lines
26 KiB

  1. /* Copyright (c) 2014 Cryptography Research, Inc.
  2. * Released under the MIT License. See LICENSE.txt for license information.
  3. */
  4. #include "f_field.h"
  5. static inline mask_t is_zero (word_t x) {
  6. dword_t xx = x;
  7. xx--;
  8. return xx >> WORD_BITS;
  9. }
  10. static uint64_t widemul (
  11. const uint32_t a,
  12. const uint32_t b
  13. ) {
  14. return ((uint64_t)a)* b;
  15. }
  16. static inline void __attribute__((gnu_inline,always_inline))
  17. smlal (
  18. uint64_t *acc,
  19. const uint32_t a,
  20. const uint32_t b
  21. ) {
  22. #ifdef __ARMEL__
  23. uint32_t lo = *acc, hi = (*acc)>>32;
  24. __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
  25. : [lo]"+&r"(lo), [hi]"+&r"(hi)
  26. : [a]"r"(a), [b]"r"(b));
  27. *acc = lo + (((uint64_t)hi)<<32);
  28. #else
  29. *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
  30. #endif
  31. }
  32. static inline void __attribute__((gnu_inline,always_inline))
  33. smlal2 (
  34. uint64_t *acc,
  35. const uint32_t a,
  36. const uint32_t b
  37. ) {
  38. #ifdef __ARMEL__
  39. uint32_t lo = *acc, hi = (*acc)>>32;
  40. __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
  41. : [lo]"+&r"(lo), [hi]"+&r"(hi)
  42. : [a]"r"(a), [b]"r"(2*b));
  43. *acc = lo + (((uint64_t)hi)<<32);
  44. #else
  45. *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2);
  46. #endif
  47. }
  48. static inline void __attribute__((gnu_inline,always_inline))
  49. smull (
  50. uint64_t *acc,
  51. const uint32_t a,
  52. const uint32_t b
  53. ) {
  54. #ifdef __ARMEL__
  55. uint32_t lo, hi;
  56. __asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]"
  57. : [lo]"=&r"(lo), [hi]"=&r"(hi)
  58. : [a]"r"(a), [b]"r"(b));
  59. *acc = lo + (((uint64_t)hi)<<32);
  60. #else
  61. *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
  62. #endif
  63. }
  64. static inline void __attribute__((gnu_inline,always_inline))
  65. smull2 (
  66. uint64_t *acc,
  67. const uint32_t a,
  68. const uint32_t b
  69. ) {
  70. #ifdef __ARMEL__
  71. uint32_t lo, hi;
  72. __asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]"
  73. : [lo]"=&r"(lo), [hi]"=&r"(hi)
  74. : [a]"r"(a), [b]"r"(2*b));
  75. *acc = lo + (((uint64_t)hi)<<32);
  76. #else
  77. *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2);
  78. #endif
  79. }
  80. void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
  81. const uint32_t *a = as->limb, *b = bs->limb;
  82. uint32_t *c = cs->limb;
  83. uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1;
  84. uint32_t mask = (1<<28) - 1;
  85. uint32_t aa[8], bm[8];
  86. int i;
  87. for (i=0; i<8; i++) {
  88. aa[i] = a[i] + a[i+8];
  89. bm[i] = b[i] - b[i+8];
  90. }
  91. uint32_t ax,bx;
  92. {
  93. /* t^3 terms */
  94. smull(&accum1, ax = aa[1], bx = b[15]);
  95. smull(&accum3, ax = aa[2], bx);
  96. smlal(&accum1, ax, bx = b[14]);
  97. smlal(&accum3, ax = aa[3], bx);
  98. smlal(&accum1, ax, bx = b[13]);
  99. smlal(&accum3, ax = aa[4], bx);
  100. smlal(&accum1, ax, bx = b[12]);
  101. smlal(&accum3, ax = aa[5], bx);
  102. smlal(&accum1, ax, bx = b[11]);
  103. smlal(&accum3, ax = aa[6], bx);
  104. smlal(&accum1, ax, bx = b[10]);
  105. smlal(&accum3, ax = aa[7], bx);
  106. smlal(&accum1, ax, bx = b[9]);
  107. accum0 = accum1;
  108. accum2 = accum3;
  109. /* t^2 terms */
  110. smlal(&accum2, ax = aa[0], bx);
  111. smlal(&accum0, ax, bx = b[8]);
  112. smlal(&accum2, ax = aa[1], bx);
  113. smlal(&accum0, ax = a[9], bx = b[7]);
  114. smlal(&accum2, ax = a[10], bx);
  115. smlal(&accum0, ax, bx = b[6]);
  116. smlal(&accum2, ax = a[11], bx);
  117. smlal(&accum0, ax, bx = b[5]);
  118. smlal(&accum2, ax = a[12], bx);
  119. smlal(&accum0, ax, bx = b[4]);
  120. smlal(&accum2, ax = a[13], bx);
  121. smlal(&accum0, ax, bx = b[3]);
  122. smlal(&accum2, ax = a[14], bx);
  123. smlal(&accum0, ax, bx = b[2]);
  124. smlal(&accum2, ax = a[15], bx);
  125. smlal(&accum0, ax, bx = b[1]);
  126. /* t terms */
  127. accum1 += accum0;
  128. accum3 += accum2;
  129. smlal(&accum3, ax = a[8], bx);
  130. smlal(&accum1, ax, bx = b[0]);
  131. smlal(&accum3, ax = a[9], bx);
  132. smlal(&accum1, ax = a[1], bx = bm[7]);
  133. smlal(&accum3, ax = a[2], bx);
  134. smlal(&accum1, ax, bx = bm[6]);
  135. smlal(&accum3, ax = a[3], bx);
  136. smlal(&accum1, ax, bx = bm[5]);
  137. smlal(&accum3, ax = a[4], bx);
  138. smlal(&accum1, ax, bx = bm[4]);
  139. smlal(&accum3, ax = a[5], bx);
  140. smlal(&accum1, ax, bx = bm[3]);
  141. smlal(&accum3, ax = a[6], bx);
  142. smlal(&accum1, ax, bx = bm[2]);
  143. smlal(&accum3, ax = a[7], bx);
  144. smlal(&accum1, ax, bx = bm[1]);
  145. /* 1 terms */
  146. smlal(&accum2, ax = a[0], bx);
  147. smlal(&accum0, ax, bx = bm[0]);
  148. smlal(&accum2, ax = a[1], bx);
  149. accum2 += accum0 >> 28;
  150. accum3 += accum1 >> 28;
  151. c[0] = ((uint32_t)(accum0)) & mask;
  152. c[1] = ((uint32_t)(accum2)) & mask;
  153. c[8] = ((uint32_t)(accum1)) & mask;
  154. c[9] = ((uint32_t)(accum3)) & mask;
  155. accumC0 = accum2 >> 28;
  156. accumC1 = accum3 >> 28;
  157. }
  158. {
  159. /* t^3 terms */
  160. smull(&accum1, ax = aa[3], bx = b[15]);
  161. smull(&accum3, ax = aa[4], bx);
  162. smlal(&accum1, ax, bx = b[14]);
  163. smlal(&accum3, ax = aa[5], bx);
  164. smlal(&accum1, ax, bx = b[13]);
  165. smlal(&accum3, ax = aa[6], bx);
  166. smlal(&accum1, ax, bx = b[12]);
  167. smlal(&accum3, ax = aa[7], bx);
  168. smlal(&accum1, ax, bx = b[11]);
  169. accum0 = accum1;
  170. accum2 = accum3;
  171. /* t^2 terms */
  172. smlal(&accum2, ax = aa[0], bx);
  173. smlal(&accum0, ax, bx = b[10]);
  174. smlal(&accum2, ax = aa[1], bx);
  175. smlal(&accum0, ax, bx = b[9]);
  176. smlal(&accum2, ax = aa[2], bx);
  177. smlal(&accum0, ax, bx = b[8]);
  178. smlal(&accum2, ax = aa[3], bx);
  179. smlal(&accum0, ax = a[11], bx = b[7]);
  180. smlal(&accum2, ax = a[12], bx);
  181. smlal(&accum0, ax, bx = b[6]);
  182. smlal(&accum2, ax = a[13], bx);
  183. smlal(&accum0, ax, bx = b[5]);
  184. smlal(&accum2, ax = a[14], bx);
  185. smlal(&accum0, ax, bx = b[4]);
  186. smlal(&accum2, ax = a[15], bx);
  187. smlal(&accum0, ax, bx = b[3]);
  188. /* t terms */
  189. accum1 += accum0;
  190. accum3 += accum2;
  191. smlal(&accum3, ax = a[8], bx);
  192. smlal(&accum1, ax, bx = b[2]);
  193. smlal(&accum3, ax = a[9], bx);
  194. smlal(&accum1, ax, bx = b[1]);
  195. smlal(&accum3, ax = a[10], bx);
  196. smlal(&accum1, ax, bx = b[0]);
  197. smlal(&accum3, ax = a[11], bx);
  198. smlal(&accum1, ax = a[3], bx = bm[7]);
  199. smlal(&accum3, ax = a[4], bx);
  200. smlal(&accum1, ax, bx = bm[6]);
  201. smlal(&accum3, ax = a[5], bx);
  202. smlal(&accum1, ax, bx = bm[5]);
  203. smlal(&accum3, ax = a[6], bx);
  204. smlal(&accum1, ax, bx = bm[4]);
  205. smlal(&accum3, ax = a[7], bx);
  206. smlal(&accum1, ax, bx = bm[3]);
  207. /* 1 terms */
  208. smlal(&accum2, ax = a[0], bx);
  209. smlal(&accum0, ax, bx = bm[2]);
  210. smlal(&accum2, ax = a[1], bx);
  211. smlal(&accum0, ax, bx = bm[1]);
  212. smlal(&accum2, ax = a[2], bx);
  213. smlal(&accum0, ax, bx = bm[0]);
  214. smlal(&accum2, ax = a[3], bx);
  215. accum0 += accumC0;
  216. accum1 += accumC1;
  217. accum2 += accum0 >> 28;
  218. accum3 += accum1 >> 28;
  219. c[2] = ((uint32_t)(accum0)) & mask;
  220. c[3] = ((uint32_t)(accum2)) & mask;
  221. c[10] = ((uint32_t)(accum1)) & mask;
  222. c[11] = ((uint32_t)(accum3)) & mask;
  223. accumC0 = accum2 >> 28;
  224. accumC1 = accum3 >> 28;
  225. }
  226. {
  227. /* t^3 terms */
  228. smull(&accum1, ax = aa[5], bx = b[15]);
  229. smull(&accum3, ax = aa[6], bx);
  230. smlal(&accum1, ax, bx = b[14]);
  231. smlal(&accum3, ax = aa[7], bx);
  232. smlal(&accum1, ax, bx = b[13]);
  233. accum0 = accum1;
  234. accum2 = accum3;
  235. /* t^2 terms */
  236. smlal(&accum2, ax = aa[0], bx);
  237. smlal(&accum0, ax, bx = b[12]);
  238. smlal(&accum2, ax = aa[1], bx);
  239. smlal(&accum0, ax, bx = b[11]);
  240. smlal(&accum2, ax = aa[2], bx);
  241. smlal(&accum0, ax, bx = b[10]);
  242. smlal(&accum2, ax = aa[3], bx);
  243. smlal(&accum0, ax, bx = b[9]);
  244. smlal(&accum2, ax = aa[4], bx);
  245. smlal(&accum0, ax, bx = b[8]);
  246. smlal(&accum2, ax = aa[5], bx);
  247. smlal(&accum0, ax = a[13], bx = b[7]);
  248. smlal(&accum2, ax = a[14], bx);
  249. smlal(&accum0, ax, bx = b[6]);
  250. smlal(&accum2, ax = a[15], bx);
  251. smlal(&accum0, ax, bx = b[5]);
  252. /* t terms */
  253. accum1 += accum0;
  254. accum3 += accum2;
  255. smlal(&accum3, ax = a[8], bx);
  256. smlal(&accum1, ax, bx = b[4]);
  257. smlal(&accum3, ax = a[9], bx);
  258. smlal(&accum1, ax, bx = b[3]);
  259. smlal(&accum3, ax = a[10], bx);
  260. smlal(&accum1, ax, bx = b[2]);
  261. smlal(&accum3, ax = a[11], bx);
  262. smlal(&accum1, ax, bx = b[1]);
  263. smlal(&accum3, ax = a[12], bx);
  264. smlal(&accum1, ax, bx = b[0]);
  265. smlal(&accum3, ax = a[13], bx);
  266. smlal(&accum1, ax = a[5], bx = bm[7]);
  267. smlal(&accum3, ax = a[6], bx);
  268. smlal(&accum1, ax, bx = bm[6]);
  269. smlal(&accum3, ax = a[7], bx);
  270. smlal(&accum1, ax, bx = bm[5]);
  271. /* 1 terms */
  272. smlal(&accum2, ax = a[0], bx);
  273. smlal(&accum0, ax, bx = bm[4]);
  274. smlal(&accum2, ax = a[1], bx);
  275. smlal(&accum0, ax, bx = bm[3]);
  276. smlal(&accum2, ax = a[2], bx);
  277. smlal(&accum0, ax, bx = bm[2]);
  278. smlal(&accum2, ax = a[3], bx);
  279. smlal(&accum0, ax, bx = bm[1]);
  280. smlal(&accum2, ax = a[4], bx);
  281. smlal(&accum0, ax, bx = bm[0]);
  282. smlal(&accum2, ax = a[5], bx);
  283. accum0 += accumC0;
  284. accum1 += accumC1;
  285. accum2 += accum0 >> 28;
  286. accum3 += accum1 >> 28;
  287. c[4] = ((uint32_t)(accum0)) & mask;
  288. c[5] = ((uint32_t)(accum2)) & mask;
  289. c[12] = ((uint32_t)(accum1)) & mask;
  290. c[13] = ((uint32_t)(accum3)) & mask;
  291. accumC0 = accum2 >> 28;
  292. accumC1 = accum3 >> 28;
  293. }
  294. {
  295. /* t^3 terms */
  296. smull(&accum1, ax = aa[7], bx = b[15]);
  297. accum0 = accum1;
  298. /* t^2 terms */
  299. smull(&accum2, ax = aa[0], bx);
  300. smlal(&accum0, ax, bx = b[14]);
  301. smlal(&accum2, ax = aa[1], bx);
  302. smlal(&accum0, ax, bx = b[13]);
  303. smlal(&accum2, ax = aa[2], bx);
  304. smlal(&accum0, ax, bx = b[12]);
  305. smlal(&accum2, ax = aa[3], bx);
  306. smlal(&accum0, ax, bx = b[11]);
  307. smlal(&accum2, ax = aa[4], bx);
  308. smlal(&accum0, ax, bx = b[10]);
  309. smlal(&accum2, ax = aa[5], bx);
  310. smlal(&accum0, ax, bx = b[9]);
  311. smlal(&accum2, ax = aa[6], bx);
  312. smlal(&accum0, ax, bx = b[8]);
  313. smlal(&accum2, ax = aa[7], bx);
  314. smlal(&accum0, ax = a[15], bx = b[7]);
  315. /* t terms */
  316. accum1 += accum0;
  317. accum3 = accum2;
  318. smlal(&accum3, ax = a[8], bx);
  319. smlal(&accum1, ax, bx = b[6]);
  320. smlal(&accum3, ax = a[9], bx);
  321. smlal(&accum1, ax, bx = b[5]);
  322. smlal(&accum3, ax = a[10], bx);
  323. smlal(&accum1, ax, bx = b[4]);
  324. smlal(&accum3, ax = a[11], bx);
  325. smlal(&accum1, ax, bx = b[3]);
  326. smlal(&accum3, ax = a[12], bx);
  327. smlal(&accum1, ax, bx = b[2]);
  328. smlal(&accum3, ax = a[13], bx);
  329. smlal(&accum1, ax, bx = b[1]);
  330. smlal(&accum3, ax = a[14], bx);
  331. smlal(&accum1, ax, bx = b[0]);
  332. smlal(&accum3, ax = a[15], bx);
  333. smlal(&accum1, ax = a[7], bx = bm[7]);
  334. /* 1 terms */
  335. smlal(&accum2, ax = a[0], bx);
  336. smlal(&accum0, ax, bx = bm[6]);
  337. smlal(&accum2, ax = a[1], bx);
  338. smlal(&accum0, ax, bx = bm[5]);
  339. smlal(&accum2, ax = a[2], bx);
  340. smlal(&accum0, ax, bx = bm[4]);
  341. smlal(&accum2, ax = a[3], bx);
  342. smlal(&accum0, ax, bx = bm[3]);
  343. smlal(&accum2, ax = a[4], bx);
  344. smlal(&accum0, ax, bx = bm[2]);
  345. smlal(&accum2, ax = a[5], bx);
  346. smlal(&accum0, ax, bx = bm[1]);
  347. smlal(&accum2, ax = a[6], bx);
  348. smlal(&accum0, ax, bx = bm[0]);
  349. smlal(&accum2, ax = a[7], bx);
  350. accum0 += accumC0;
  351. accum1 += accumC1;
  352. accum2 += accum0 >> 28;
  353. accum3 += accum1 >> 28;
  354. c[6] = ((uint32_t)(accum0)) & mask;
  355. c[7] = ((uint32_t)(accum2)) & mask;
  356. c[14] = ((uint32_t)(accum1)) & mask;
  357. c[15] = ((uint32_t)(accum3)) & mask;
  358. accum0 = accum2 >> 28;
  359. accum1 = accum3 >> 28;
  360. }
  361. accum0 += accum1;
  362. accum0 += c[8];
  363. accum1 += c[0];
  364. c[8] = ((uint32_t)(accum0)) & mask;
  365. c[0] = ((uint32_t)(accum1)) & mask;
  366. accum0 >>= 28;
  367. accum1 >>= 28;
  368. c[9] += ((uint32_t)(accum0));
  369. c[1] += ((uint32_t)(accum1));
  370. }
  371. void gf_sqr (gf_s *__restrict__ cs, const gf as) {
  372. const uint32_t *a = as->limb;
  373. uint32_t *c = cs->limb;
  374. uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1, tmp;
  375. uint32_t mask = (1<<28) - 1;
  376. uint32_t bm[8];
  377. int i;
  378. for (i=0; i<8; i++) {
  379. bm[i] = a[i] - a[i+8];
  380. }
  381. uint32_t ax,bx;
  382. {
  383. /* t^3 terms */
  384. smull2(&accum1, ax = a[9], bx = a[15]);
  385. smull2(&accum3, ax = a[10], bx);
  386. smlal2(&accum1, ax, bx = a[14]);
  387. smlal2(&accum3, ax = a[11], bx);
  388. smlal2(&accum1, ax, bx = a[13]);
  389. smlal2(&accum3, ax = a[12], bx);
  390. smlal(&accum1, ax, ax);
  391. accum0 = accum1;
  392. accum2 = accum3;
  393. /* t^2 terms */
  394. smlal2(&accum2, ax = a[8], a[9]);
  395. smlal(&accum0, ax, ax);
  396. smlal2(&accum0, ax = a[1], bx = a[7]);
  397. smlal2(&accum2, ax = a[2], bx);
  398. smlal2(&accum0, ax, bx = a[6]);
  399. smlal2(&accum2, ax = a[3], bx);
  400. smlal2(&accum0, ax, bx = a[5]);
  401. smlal2(&accum2, ax = a[4], bx);
  402. smlal(&accum0, ax, ax);
  403. /* t terms */
  404. accum1 += accum0;
  405. accum3 += accum2;
  406. smlal2(&accum3, ax = a[0], bx = a[1]);
  407. smlal(&accum1, ax, ax);
  408. accum1 = -accum1;
  409. accum3 = -accum3;
  410. accum2 = -accum2;
  411. accum0 = -accum0;
  412. smlal2(&accum1, ax = bm[1], bx = bm[7]);
  413. smlal2(&accum3, ax = bm[2], bx);
  414. smlal2(&accum1, ax, bx = bm[6]);
  415. smlal2(&accum3, ax = bm[3], bx);
  416. smlal2(&accum1, ax, bx = bm[5]);
  417. smlal2(&accum3, ax = bm[4], bx);
  418. smlal(&accum1, ax, ax);
  419. /* 1 terms */
  420. smlal2(&accum2, ax = bm[0], bx = bm[1]);
  421. smlal(&accum0, ax, ax);
  422. tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp;
  423. tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp;
  424. accum2 += accum0 >> 28;
  425. accum3 += accum1 >> 28;
  426. c[0] = ((uint32_t)(accum0)) & mask;
  427. c[1] = ((uint32_t)(accum2)) & mask;
  428. c[8] = ((uint32_t)(accum1)) & mask;
  429. c[9] = ((uint32_t)(accum3)) & mask;
  430. accumC0 = accum2 >> 28;
  431. accumC1 = accum3 >> 28;
  432. }
  433. {
  434. /* t^3 terms */
  435. smull2(&accum1, ax = a[11], bx = a[15]);
  436. smull2(&accum3, ax = a[12], bx);
  437. smlal2(&accum1, ax, bx = a[14]);
  438. smlal2(&accum3, ax = a[13], bx);
  439. smlal(&accum1, ax, ax);
  440. accum0 = accum1;
  441. accum2 = accum3;
  442. /* t^2 terms */
  443. smlal2(&accum2, ax = a[8], bx = a[11]);
  444. smlal2(&accum0, ax, bx = a[10]);
  445. smlal2(&accum2, ax = a[9], bx);
  446. smlal(&accum0, ax, ax);
  447. smlal2(&accum0, ax = a[3], bx = a[7]);
  448. smlal2(&accum2, ax = a[4], bx);
  449. smlal2(&accum0, ax, bx = a[6]);
  450. smlal2(&accum2, ax = a[5], bx);
  451. smlal(&accum0, ax, ax);
  452. /* t terms */
  453. accum1 += accum0;
  454. accum3 += accum2;
  455. smlal2(&accum3, ax = a[0], bx = a[3]);
  456. smlal2(&accum1, ax, bx = a[2]);
  457. smlal2(&accum3, ax = a[1], bx);
  458. smlal(&accum1, ax, ax);
  459. accum1 = -accum1;
  460. accum3 = -accum3;
  461. accum2 = -accum2;
  462. accum0 = -accum0;
  463. smlal2(&accum1, ax = bm[3], bx = bm[7]);
  464. smlal2(&accum3, ax = bm[4], bx);
  465. smlal2(&accum1, ax, bx = bm[6]);
  466. smlal2(&accum3, ax = bm[5], bx);
  467. smlal(&accum1, ax, ax);
  468. /* 1 terms */
  469. smlal2(&accum2, ax = bm[0], bx = bm[3]);
  470. smlal2(&accum0, ax, bx = bm[2]);
  471. smlal2(&accum2, ax = bm[1], bx);
  472. smlal(&accum0, ax, ax);
  473. tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp;
  474. tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp;
  475. accum0 += accumC0;
  476. accum1 += accumC1;
  477. accum2 += accum0 >> 28;
  478. accum3 += accum1 >> 28;
  479. c[2] = ((uint32_t)(accum0)) & mask;
  480. c[3] = ((uint32_t)(accum2)) & mask;
  481. c[10] = ((uint32_t)(accum1)) & mask;
  482. c[11] = ((uint32_t)(accum3)) & mask;
  483. accumC0 = accum2 >> 28;
  484. accumC1 = accum3 >> 28;
  485. }
  486. {
  487. /* t^3 terms */
  488. smull2(&accum1, ax = a[13], bx = a[15]);
  489. smull2(&accum3, ax = a[14], bx);
  490. smlal(&accum1, ax, ax);
  491. accum0 = accum1;
  492. accum2 = accum3;
  493. /* t^2 terms */
  494. smlal2(&accum2, ax = a[8], bx = a[13]);
  495. smlal2(&accum0, ax, bx = a[12]);
  496. smlal2(&accum2, ax = a[9], bx);
  497. smlal2(&accum0, ax, bx = a[11]);
  498. smlal2(&accum2, ax = a[10], bx);
  499. smlal(&accum0, ax, ax);
  500. smlal2(&accum0, ax = a[5], bx = a[7]);
  501. smlal2(&accum2, ax = a[6], bx);
  502. smlal(&accum0, ax, ax);
  503. /* t terms */
  504. accum1 += accum0;
  505. accum3 += accum2;
  506. smlal2(&accum3, ax = a[0], bx = a[5]);
  507. smlal2(&accum1, ax, bx = a[4]);
  508. smlal2(&accum3, ax = a[1], bx);
  509. smlal2(&accum1, ax, bx = a[3]);
  510. smlal2(&accum3, ax = a[2], bx);
  511. smlal(&accum1, ax, ax);
  512. accum1 = -accum1;
  513. accum3 = -accum3;
  514. accum2 = -accum2;
  515. accum0 = -accum0;
  516. smlal2(&accum1, ax = bm[5], bx = bm[7]);
  517. smlal2(&accum3, ax = bm[6], bx);
  518. smlal(&accum1, ax, ax);
  519. /* 1 terms */
  520. smlal2(&accum2, ax = bm[0], bx = bm[5]);
  521. smlal2(&accum0, ax, bx = bm[4]);
  522. smlal2(&accum2, ax = bm[1], bx);
  523. smlal2(&accum0, ax, bx = bm[3]);
  524. smlal2(&accum2, ax = bm[2], bx);
  525. smlal(&accum0, ax, ax);
  526. tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp;
  527. tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp;
  528. accum0 += accumC0;
  529. accum1 += accumC1;
  530. accum2 += accum0 >> 28;
  531. accum3 += accum1 >> 28;
  532. c[4] = ((uint32_t)(accum0)) & mask;
  533. c[5] = ((uint32_t)(accum2)) & mask;
  534. c[12] = ((uint32_t)(accum1)) & mask;
  535. c[13] = ((uint32_t)(accum3)) & mask;
  536. accumC0 = accum2 >> 28;
  537. accumC1 = accum3 >> 28;
  538. }
  539. {
  540. /* t^3 terms */
  541. smull(&accum1, ax = a[15], bx = a[15]);
  542. accum0 = accum1;
  543. /* t^2 terms */
  544. smull2(&accum2, ax = a[8], bx);
  545. smlal2(&accum0, ax, bx = a[14]);
  546. smlal2(&accum2, ax = a[9], bx);
  547. smlal2(&accum0, ax, bx = a[13]);
  548. smlal2(&accum2, ax = a[10], bx);
  549. smlal2(&accum0, ax, bx = a[12]);
  550. smlal2(&accum2, ax = a[11], bx);
  551. smlal(&accum0, ax, ax);
  552. smlal(&accum0, ax = a[7], bx = a[7]);
  553. /* t terms */
  554. accum1 += accum0;
  555. accum3 = accum2;
  556. smlal2(&accum3, ax = a[0], bx);
  557. smlal2(&accum1, ax, bx = a[6]);
  558. smlal2(&accum3, ax = a[1], bx);
  559. smlal2(&accum1, ax, bx = a[5]);
  560. smlal2(&accum3, ax = a[2], bx);
  561. smlal2(&accum1, ax, bx = a[4]);
  562. smlal2(&accum3, ax = a[3], bx);
  563. smlal(&accum1, ax, ax);
  564. accum1 = -accum1;
  565. accum3 = -accum3;
  566. accum2 = -accum2;
  567. accum0 = -accum0;
  568. bx = bm[7];
  569. smlal(&accum1, bx, bx);
  570. /* 1 terms */
  571. smlal2(&accum2, ax = bm[0], bx);
  572. smlal2(&accum0, ax, bx = bm[6]);
  573. smlal2(&accum2, ax = bm[1], bx);
  574. smlal2(&accum0, ax, bx = bm[5]);
  575. smlal2(&accum2, ax = bm[2], bx);
  576. smlal2(&accum0, ax, bx = bm[4]);
  577. smlal2(&accum2, ax = bm[3], bx);
  578. smlal(&accum0, ax, ax);
  579. tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp;
  580. tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp;
  581. accum0 += accumC0;
  582. accum1 += accumC1;
  583. accum2 += accum0 >> 28;
  584. accum3 += accum1 >> 28;
  585. c[6] = ((uint32_t)(accum0)) & mask;
  586. c[7] = ((uint32_t)(accum2)) & mask;
  587. c[14] = ((uint32_t)(accum1)) & mask;
  588. c[15] = ((uint32_t)(accum3)) & mask;
  589. accum0 = accum2 >> 28;
  590. accum1 = accum3 >> 28;
  591. }
  592. accum0 += accum1;
  593. accum0 += c[8];
  594. accum1 += c[0];
  595. c[8] = ((uint32_t)(accum0)) & mask;
  596. c[0] = ((uint32_t)(accum1)) & mask;
  597. accum0 >>= 28;
  598. accum1 >>= 28;
  599. c[9] += ((uint32_t)(accum0));
  600. c[1] += ((uint32_t)(accum1));
  601. }
  602. void gf_mulw (
  603. gf_s *__restrict__ cs,
  604. const gf as,
  605. uint64_t b
  606. ) {
  607. uint32_t mask = (1ull<<28)-1;
  608. const uint32_t bhi = b>>28, blo = b & mask;
  609. const uint32_t *a = as->limb;
  610. uint32_t *c = cs->limb;
  611. uint64_t accum0, accum8;
  612. int i;
  613. uint32_t c0, c8, n0, n8;
  614. accum0 = widemul(bhi, a[15]);
  615. accum8 = widemul(bhi, a[15] + a[7]);
  616. c0 = a[0]; c8 = a[8];
  617. smlal(&accum0, blo, c0);
  618. smlal(&accum8, blo, c8);
  619. c[0] = accum0 & mask; accum0 >>= 28;
  620. c[8] = accum8 & mask; accum8 >>= 28;
  621. i=1;
  622. {
  623. n0 = a[i]; n8 = a[i+8];
  624. smlal(&accum0, bhi, c0);
  625. smlal(&accum8, bhi, c8);
  626. smlal(&accum0, blo, n0);
  627. smlal(&accum8, blo, n8);
  628. c[i] = accum0 & mask; accum0 >>= 28;
  629. c[i+8] = accum8 & mask; accum8 >>= 28;
  630. i++;
  631. }
  632. {
  633. c0 = a[i]; c8 = a[i+8];
  634. smlal(&accum0, bhi, n0);
  635. smlal(&accum8, bhi, n8);
  636. smlal(&accum0, blo, c0);
  637. smlal(&accum8, blo, c8);
  638. c[i] = accum0 & mask; accum0 >>= 28;
  639. c[i+8] = accum8 & mask; accum8 >>= 28;
  640. i++;
  641. }
  642. {
  643. n0 = a[i]; n8 = a[i+8];
  644. smlal(&accum0, bhi, c0);
  645. smlal(&accum8, bhi, c8);
  646. smlal(&accum0, blo, n0);
  647. smlal(&accum8, blo, n8);
  648. c[i] = accum0 & mask; accum0 >>= 28;
  649. c[i+8] = accum8 & mask; accum8 >>= 28;
  650. i++;
  651. }
  652. {
  653. c0 = a[i]; c8 = a[i+8];
  654. smlal(&accum0, bhi, n0);
  655. smlal(&accum8, bhi, n8);
  656. smlal(&accum0, blo, c0);
  657. smlal(&accum8, blo, c8);
  658. c[i] = accum0 & mask; accum0 >>= 28;
  659. c[i+8] = accum8 & mask; accum8 >>= 28;
  660. i++;
  661. }
  662. {
  663. n0 = a[i]; n8 = a[i+8];
  664. smlal(&accum0, bhi, c0);
  665. smlal(&accum8, bhi, c8);
  666. smlal(&accum0, blo, n0);
  667. smlal(&accum8, blo, n8);
  668. c[i] = accum0 & mask; accum0 >>= 28;
  669. c[i+8] = accum8 & mask; accum8 >>= 28;
  670. i++;
  671. }
  672. {
  673. c0 = a[i]; c8 = a[i+8];
  674. smlal(&accum0, bhi, n0);
  675. smlal(&accum8, bhi, n8);
  676. smlal(&accum0, blo, c0);
  677. smlal(&accum8, blo, c8);
  678. c[i] = accum0 & mask; accum0 >>= 28;
  679. c[i+8] = accum8 & mask; accum8 >>= 28;
  680. i++;
  681. }
  682. {
  683. n0 = a[i]; n8 = a[i+8];
  684. smlal(&accum0, bhi, c0);
  685. smlal(&accum8, bhi, c8);
  686. smlal(&accum0, blo, n0);
  687. smlal(&accum8, blo, n8);
  688. c[i] = accum0 & mask; accum0 >>= 28;
  689. c[i+8] = accum8 & mask; accum8 >>= 28;
  690. i++;
  691. }
  692. accum0 += accum8 + c[8];
  693. c[8] = accum0 & mask;
  694. c[9] += accum0 >> 28;
  695. accum8 += c[0];
  696. c[0] = accum8 & mask;
  697. c[1] += accum8 >> 28;
  698. }
  699. void gf_strong_reduce (
  700. gf a
  701. ) {
  702. word_t mask = (1ull<<28)-1;
  703. /* first, clear high */
  704. a->limb[8] += a->limb[15]>>28;
  705. a->limb[0] += a->limb[15]>>28;
  706. a->limb[15] &= mask;
  707. /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */
  708. /* compute total_value - p. No need to reduce mod p. */
  709. dsword_t scarry = 0;
  710. int i;
  711. for (i=0; i<16; i++) {
  712. scarry = scarry + a->limb[i] - ((i==8)?mask-1:mask);
  713. a->limb[i] = scarry & mask;
  714. scarry >>= 28;
  715. }
  716. /* uncommon case: it was >= p, so now scarry = 0 and this = x
  717. * common case: it was < p, so now scarry = -1 and this = x - p + 2^448
  718. * so let's add back in p. will carry back off the top for 2^448.
  719. */
  720. assert(is_zero(scarry) | is_zero(scarry+1));
  721. word_t scarry_mask = scarry & mask;
  722. dword_t carry = 0;
  723. /* add it back */
  724. for (i=0; i<16; i++) {
  725. carry = carry + a->limb[i] + ((i==8)?(scarry_mask&~1):scarry_mask);
  726. a->limb[i] = carry & mask;
  727. carry >>= 28;
  728. }
  729. assert(is_zero(carry + scarry));
  730. }
  731. void gf_serialize (
  732. uint8_t *serial,
  733. const gf x
  734. ) {
  735. int i,j;
  736. gf red;
  737. gf_copy(red, x);
  738. gf_strong_reduce(red);
  739. for (i=0; i<8; i++) {
  740. uint64_t limb = red->limb[2*i] + (((uint64_t)red->limb[2*i+1])<<28);
  741. for (j=0; j<7; j++) {
  742. serial[7*i+j] = limb;
  743. limb >>= 8;
  744. }
  745. assert(limb == 0);
  746. }
  747. }
  748. mask_t
  749. gf_deserialize (
  750. gf x,
  751. const uint8_t serial[56]
  752. ) {
  753. int i,j;
  754. for (i=0; i<8; i++) {
  755. uint64_t out = 0;
  756. for (j=0; j<7; j++) {
  757. out |= ((uint64_t)serial[7*i+j])<<(8*j);
  758. }
  759. x->limb[2*i] = out & ((1ull<<28)-1);
  760. x->limb[2*i+1] = out >> 28;
  761. }
  762. /* Check for reduction.
  763. *
  764. * The idea is to create a variable ge which is all ones (rather, 56 ones)
  765. * if and only if the low $i$ words of $x$ are >= those of p.
  766. *
  767. * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
  768. */
  769. uint32_t ge = -1, mask = (1ull<<28)-1;
  770. for (i=0; i<8; i++) {
  771. ge &= x->limb[i];
  772. }
  773. /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */
  774. ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask);
  775. /* Propagate the rest */
  776. for (i=9; i<16; i++) {
  777. ge &= x->limb[i];
  778. }
  779. return ~is_zero(ge ^ mask);
  780. }