/* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
#include <assert.h>
#include "word.h"
#include "p448.h"
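
/* This file implements arithmetic in GF(p) for the Goldilocks prime
 * p = 2^448 - 2^224 - 1.  Field elements are stored as 16 limbs of 28 bits
 * each (radix 2^28), split into a low half (limbs 0..7, the digits below
 * 2^224) and a high half (limbs 8..15).
 */
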
/* Constant-time zero test: returns an all-ones mask iff x == 0. */
static inline mask_t __attribute__((always_inline))
is_zero (
    word_t x
) {
    dword_t xx = x;
    xx--;
    return xx >> WORD_BITS;
}

/* Full 32x32 -> 64-bit unsigned multiply. */
static uint64_t widemul_32 (
    const uint32_t a,
    const uint32_t b
) {
    return ((uint64_t)a) * b;
}
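
/* NEON lacks 64-bit lane versions of vtrn and of a pairwise add, so they
 * are supplied as inline asm here (or emulated when NEON is unavailable):
 * xx_vtrnq_s64 swaps the high lane of *x with the low lane of *y, the
 * 64-bit analogue of vtrnq, and xx_vaddup_s64 adds the low lane of x into
 * its high lane.
 */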
#ifdef __ARM_NEON__
static __inline__ void __attribute__((gnu_inline,always_inline))
xx_vtrnq_s64 (
    int64x2_t *x,
    int64x2_t *y
) {
    __asm__ __volatile__ ("vswp %f0, %e1" : "+w"(*x), "+w"(*y));
}

static __inline__ int64x2_t __attribute__((gnu_inline,always_inline))
xx_vaddup_s64 (int64x2_t x) {
    __asm__ ("vadd.s64 %f0, %e0" : "+w"(x));
    return x;
}
#else
#include "neon_emulation.h"
#endif /* ARM_NEON */
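
/* Scalar multiply-accumulate helpers patterned on the ARM SMLAL/SMULL
 * instructions: the 32-bit inputs are treated as signed, and the full
 * 64-bit product (doubled in the *2 variants) is accumulated into *acc,
 * or, for smull/smull2, stored to it.
 */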
static inline void __attribute__((gnu_inline,always_inline,unused))
smlal (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smlal2 (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smull (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smull2 (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}
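
/* Multiplication modulo p = 2^448 - 2^224 - 1.  Writing phi = 2^224 and
 * splitting a = a_lo + phi*a_hi, b = b_lo + phi*b_hi, the reduction
 * phi^2 = phi + 1 (mod p) gives
 *
 *     a*b = (a_lo*b_lo + a_hi*b_hi)
 *         + phi*(a_hi*b_hi + a_lo*b_hi + a_hi*b_lo)    (mod p).
 *
 * The code below evaluates this with a Karatsuba-style rearrangement,
 * building the columns from the partial products (a_lo + a_hi)*b_hi,
 * a_hi*b_lo and a_lo*(b_lo - b_hi) so that each 2x2 block of limbs needs
 * fewer lane multiplies.  Limbs are processed two at a time in NEON
 * registers: val/vbl/vcl hold limbs 0..7 and vah/vbh/vch hold limbs 8..15.
 */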
void
p448_mul (
    p448_t *__restrict__ cs,
    const p448_t *as,
    const p448_t *bs
) {
    const uint32_t *a = as->limb, *b = bs->limb;
    uint32_t *c = cs->limb;

    const int32x2_t
        *val = (const int32x2_t *)a,
        *vbl = (const int32x2_t *)b,
        *vah = (const int32x2_t *)(&a[8]),
        *vbh = (const int32x2_t *)(&b[8]);
    int32x2_t
        *vcl = (int32x2_t *)c,
        *vch = (int32x2_t *)(&c[8]),
        vmask = {(1<<28) - 1, (1<<28) - 1};

    int64x2_t accumx0a, accumx0b;
    int64x2_t accumx1a, accumx1b;
    int64x2_t accumx2a, accumx2b;
    int64x2_t accumx3a, accumx3b;
    int64x2_t accumx4a, accumx4b;
    int64x2_t accumx5a, accumx5b;
    int64x2_t accumx6a, accumx6b;
    int64x2_t accumx7a, accumx7b;
    int64x2_t carry;
    int32x2x2_t trn_res;
    int32x2_t delta;

    accumx0a = vmull_lane_s32( delta = val[1] + vah[1], vbh[3], 0);
    accumx1a = vmull_lane_s32( delta, vbh[3], 1);
    accumx0a = vmlal_lane_s32(accumx0a, delta = val[2] + vah[2], vbh[2], 0);
    accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[2], 1);
    accumx0a = vmlal_lane_s32(accumx0a, delta = val[3] + vah[3], vbh[1], 0);
    accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[1], 1);
    accumx0b = vmull_lane_s32( delta = val[0] + vah[0], vbh[0], 0);
    accumx1b = vmull_lane_s32( delta, vbh[0], 1);
    accumx0b = vmlal_lane_s32(accumx0b, vah[1], vbl[3], 0);
    accumx1b = vmlal_lane_s32(accumx1b, vah[1], vbl[3], 1);
    accumx0b = vmlal_lane_s32(accumx0b, vah[2], vbl[2], 0);
    accumx1b = vmlal_lane_s32(accumx1b, vah[2], vbl[2], 1);
    accumx0b = vmlal_lane_s32(accumx0b, vah[3], vbl[1], 0);
    accumx1b = vmlal_lane_s32(accumx1b, vah[3], vbl[1], 1);
    accumx0b += accumx0a;
    accumx1b += accumx1a;
    accumx0a = vmlal_lane_s32(accumx0a, vah[0], vbl[0], 0);
    accumx1a = vmlal_lane_s32(accumx1a, vah[0], vbl[0], 1);
    accumx0a = vmlal_lane_s32(accumx0a, val[1], delta = vbl[3] - vbh[3], 0);
    accumx1a = vmlal_lane_s32(accumx1a, val[1], delta, 1);
    accumx0a = vmlal_lane_s32(accumx0a, val[2], delta = vbl[2] - vbh[2], 0);
    accumx1a = vmlal_lane_s32(accumx1a, val[2], delta, 1);
    accumx0a = vmlal_lane_s32(accumx0a, val[3], delta = vbl[1] - vbh[1], 0);
    accumx1a = vmlal_lane_s32(accumx1a, val[3], delta, 1);
    accumx0a += accumx0b;
    accumx1a += accumx1b;
    accumx0b = vmlal_lane_s32(accumx0b, val[0], delta = vbl[0] - vbh[0], 0);
    accumx1b = vmlal_lane_s32(accumx1b, val[0], delta, 1);

    xx_vtrnq_s64(&accumx0a, &accumx0b);
    xx_vtrnq_s64(&accumx1a, &accumx1b);
    accumx0b += accumx1a;
    accumx0b = vsraq_n_s64(accumx0b, accumx0a, 28);
    accumx1b = vsraq_n_s64(accumx1b, accumx0b, 28);
    trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b));
    vcl[0] = trn_res.val[1] & vmask;
    vch[0] = trn_res.val[0] & vmask;
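
    /* The multiply proceeds in four blocks like the one above, each
     * producing two 28-bit limb pairs.  Every block ends the same way:
     * transpose the 64-bit accumulators (xx_vtrnq_s64), fold carries with
     * 28-bit shift-and-accumulate (vsraq_n_s64), then narrow, transpose
     * and mask to extract one limb pair for the low half (vcl) and one
     * for the high half (vch).  The leftover accumulator carries into the
     * next block.
     */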
    accumx2a = vmull_lane_s32( delta = val[2] + vah[2], vbh[3], 0);
    accumx3a = vmull_lane_s32( delta, vbh[3], 1);
    accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0);
    accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1);
    accumx2b = vmull_lane_s32( delta = val[0] + vah[0], vbh[1], 0);
    accumx3b = vmull_lane_s32( delta, vbh[1], 1);
    accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0);
    accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1);
    accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0);
    accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1);
    accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0);
    accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1);
    accumx2b += accumx2a;
    accumx3b += accumx3a;
    accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0);
    accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1);
    accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0);
    accumx3a = vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1);
    accumx2a = vmlal_lane_s32(accumx2a, val[2], delta = vbl[3] - vbh[3], 0);
    accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1);
    accumx2a = vmlal_lane_s32(accumx2a, val[3], delta = vbl[2] - vbh[2], 0);
    accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1);
    accumx2a += accumx2b;
    accumx3a += accumx3b;
    accumx2b = vmlal_lane_s32(accumx2b, val[0], delta = vbl[1] - vbh[1], 0);
    accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1);
    accumx2b = vmlal_lane_s32(accumx2b, val[1], delta = vbl[0] - vbh[0], 0);
    accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1);

    xx_vtrnq_s64(&accumx2a, &accumx2b);
    xx_vtrnq_s64(&accumx3a, &accumx3b);
    accumx2a += accumx1b;
    accumx2b += accumx3a;
    accumx2b = vsraq_n_s64(accumx2b, accumx2a, 28);
    accumx3b = vsraq_n_s64(accumx3b, accumx2b, 28);
    trn_res = vtrn_s32(vmovn_s64(accumx2a), vmovn_s64(accumx2b));
    vcl[1] = trn_res.val[1] & vmask;
    vch[1] = trn_res.val[0] & vmask;
    carry = accumx3b;

    accumx4a = vmull_lane_s32( delta = val[3] + vah[3], vbh[3], 0);
    accumx5a = vmull_lane_s32( delta, vbh[3], 1);
    accumx4b = accumx4a;
    accumx5b = accumx5a;
    accumx4b = vmlal_lane_s32(accumx4b, delta = val[0] + vah[0], vbh[2], 0);
    accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[2], 1);
    accumx4b = vmlal_lane_s32(accumx4b, delta = val[1] + vah[1], vbh[1], 0);
    accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[1], 1);
    accumx4b = vmlal_lane_s32(accumx4b, delta = val[2] + vah[2], vbh[0], 0);
    accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[0], 1);
    accumx4b = vmlal_lane_s32(accumx4b, vah[3], vbl[3], 0);
    accumx5b = vmlal_lane_s32(accumx5b, vah[3], vbl[3], 1);
    accumx4a += accumx4b;
    accumx5a += accumx5b;
    accumx4a = vmlal_lane_s32(accumx4a, vah[0], vbl[2], 0);
    accumx5a = vmlal_lane_s32(accumx5a, vah[0], vbl[2], 1);
    accumx4a = vmlal_lane_s32(accumx4a, vah[1], vbl[1], 0);
    accumx5a = vmlal_lane_s32(accumx5a, vah[1], vbl[1], 1);
    accumx4a = vmlal_lane_s32(accumx4a, vah[2], vbl[0], 0);
    accumx5a = vmlal_lane_s32(accumx5a, vah[2], vbl[0], 1);
    accumx4a = vmlal_lane_s32(accumx4a, val[3], delta = vbl[3] - vbh[3], 0);
    accumx5a = vmlal_lane_s32(accumx5a, val[3], delta, 1);
    /**/
    accumx4b = vmlal_lane_s32(accumx4b, val[0], delta = vbl[2] - vbh[2], 0);
    accumx5b = vmlal_lane_s32(accumx5b, val[0], delta, 1);
    accumx4b = vmlal_lane_s32(accumx4b, val[1], delta = vbl[1] - vbh[1], 0);
    accumx5b = vmlal_lane_s32(accumx5b, val[1], delta, 1);
    accumx4b = vmlal_lane_s32(accumx4b, val[2], delta = vbl[0] - vbh[0], 0);
    accumx5b = vmlal_lane_s32(accumx5b, val[2], delta, 1);

    xx_vtrnq_s64(&accumx4a, &accumx4b);
    xx_vtrnq_s64(&accumx5a, &accumx5b);
    accumx4a += carry;
    accumx4b += accumx5a;
    accumx4b = vsraq_n_s64(accumx4b, accumx4a, 28);
    accumx5b = vsraq_n_s64(accumx5b, accumx4b, 28);
    trn_res = vtrn_s32(vmovn_s64(accumx4a), vmovn_s64(accumx4b));
    vcl[2] = trn_res.val[1] & vmask;
    vch[2] = trn_res.val[0] & vmask;

    accumx6b = vmull_lane_s32( delta = val[0] + vah[0], vbh[3], 0);
    accumx7b = vmull_lane_s32( delta, vbh[3], 1);
    accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0);
    accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1);
    accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0);
    accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1);
    accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0);
    accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[0], 1);
    accumx6a = accumx6b;
    accumx7a = accumx7b;
    accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0);
    accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1);
    accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0);
    accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1);
    accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0);
    accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1);
    accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0);
    accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1);
    /**/
    accumx6b = vmlal_lane_s32(accumx6b, val[0], delta = vbl[3] - vbh[3], 0);
    accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1);
    accumx6b = vmlal_lane_s32(accumx6b, val[1], delta = vbl[2] - vbh[2], 0);
    accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1);
    accumx6b = vmlal_lane_s32(accumx6b, val[2], delta = vbl[1] - vbh[1], 0);
    accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1);
    accumx6b = vmlal_lane_s32(accumx6b, val[3], delta = vbl[0] - vbh[0], 0);
    accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1);

    xx_vtrnq_s64(&accumx6a, &accumx6b);
    xx_vtrnq_s64(&accumx7a, &accumx7b);
    accumx6a += accumx5b;
    accumx6b += accumx7a;
    accumx6b = vsraq_n_s64(accumx6b, accumx6a, 28);
    accumx7b = vsraq_n_s64(accumx7b, accumx6b, 28);
    trn_res = vtrn_s32(vmovn_s64(accumx6a), vmovn_s64(accumx6b));
    vcl[3] = trn_res.val[1] & vmask;
    vch[3] = trn_res.val[0] & vmask;
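
    /* Final carry propagation.  A carry out of the top limb represents a
     * multiple of 2^448 = 2^224 + 1 (mod p), so it wraps into limbs 0 and
     * 8.  xx_vaddup_s64 folds the low row's carry into the high row; the
     * result is then rippled through limbs {0,8} and {1,9}, and whatever
     * remains is added into limbs 2 and 10.
     */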
    accumx7b = xx_vaddup_s64(accumx7b);

    int32x2_t t0 = vcl[0], t1 = vch[0];
    trn_res = vtrn_s32(t0, t1);
    t0 = trn_res.val[0]; t1 = trn_res.val[1];

    accumx7b = vaddw_s32(accumx7b, t0);
    t0 = vmovn_s64(accumx7b) & vmask;
    accumx7b = vshrq_n_s64(accumx7b, 28);
    accumx7b = vaddw_s32(accumx7b, t1);
    t1 = vmovn_s64(accumx7b) & vmask;
    trn_res = vtrn_s32(t0, t1);
    vcl[0] = trn_res.val[0];
    vch[0] = trn_res.val[1];
    accumx7b = vshrq_n_s64(accumx7b, 28);

    t0 = vmovn_s64(accumx7b);
    uint32_t
        c0 = vget_lane_s32(t0, 0),
        c1 = vget_lane_s32(t0, 1);
    c[2] += c0;
    c[10] += c1;
}
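
/* Squaring, with the same column structure as p448_mul but exploiting
 * symmetry: each off-diagonal product appears twice, so those terms use
 * the doubling multiplies (vqdmull/vqdmlal/vqdmlsl) while the diagonal
 * terms use plain vmull/vmlal/vmlsl.  The differences b_lo - b_hi are
 * precomputed once into vbm.
 */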
void
p448_sqr (
    p448_t *__restrict__ cs,
    const p448_t *as
) {
    /* FUTURE possible improvements:
     * don't use nega-phi algorithm, so as to avoid extra phi-twiddle at end
     *   or use phi/nega-phi for everything, montgomery style
     *   or find some sort of phi algorithm which doesn't have this problem
     * break up lanemuls so that only diags get 1mul'd instead of diag 2x2 blocks
     *
     * These improvements are all pretty minor, but I guess together they might matter?
     */
    const uint32_t *b = as->limb;
    uint32_t *c = cs->limb;

    int32x2_t vbm[4];

    const int32x2_t
        *vbl = (const int32x2_t *)b,
        *vbh = (const int32x2_t *)(&b[8]);

    int i;
    for (i=0; i<4; i++) {
        vbm[i] = vbl[i] - vbh[i];
    }

    int32x2_t
        *vcl = (int32x2_t *)c,
        *vch = (int32x2_t *)(&c[8]),
        vmask = {(1<<28) - 1, (1<<28) - 1};

    int64x2_t accumx0a, accumx0b;
    int64x2_t accumx1a, accumx1b;
    int64x2_t accumx2a, accumx2b;
    int64x2_t accumx3a, accumx3b;
    int64x2_t accumx4a, accumx4b;
    int64x2_t accumx5a, accumx5b;
    int64x2_t accumx6a, accumx6b;
    int64x2_t accumx7a, accumx7b;
    int64x2_t carry;
    int32x2x2_t trn_res;

    accumx0a = vqdmull_lane_s32( vbh[1], vbh[3], 0);
    accumx1a = vqdmull_lane_s32( vbh[1], vbh[3], 1);
    accumx2a = vqdmull_lane_s32( vbh[2], vbh[3], 0);
    accumx3a = vqdmull_lane_s32( vbh[2], vbh[3], 1);
    accumx0a = vmlal_lane_s32(accumx0a, vbh[2], vbh[2], 0);
    accumx1a = vmlal_lane_s32(accumx1a, vbh[2], vbh[2], 1);
    accumx2b = accumx2a;
    accumx3b = accumx3a;
    accumx2b = vqdmlal_lane_s32(accumx2b, vbh[0], vbh[1], 0);
    accumx3b = vqdmlal_lane_s32(accumx3b, vbh[0], vbh[1], 1);
    accumx0b = accumx0a;
    accumx1b = accumx1a;
    accumx0b = vmlal_lane_s32(accumx0b, vbh[0], vbh[0], 0);
    accumx1b = vmlal_lane_s32(accumx1b, vbh[0], vbh[0], 1);
    accumx0b = vqdmlal_lane_s32(accumx0b, vbl[1], vbl[3], 0);
    accumx1b = vqdmlal_lane_s32(accumx1b, vbl[1], vbl[3], 1);
    accumx2b = vqdmlal_lane_s32(accumx2b, vbl[2], vbl[3], 0);
    accumx3b = vqdmlal_lane_s32(accumx3b, vbl[2], vbl[3], 1);
    accumx0b = vmlal_lane_s32(accumx0b, vbl[2], vbl[2], 0);
    accumx1b = vmlal_lane_s32(accumx1b, vbl[2], vbl[2], 1);
    accumx2a += accumx2b;
    accumx3a += accumx3b;
    accumx2a = vqdmlal_lane_s32(accumx2a, vbl[0], vbl[1], 0);
    accumx3a = vqdmlal_lane_s32(accumx3a, vbl[0], vbl[1], 1);
    accumx0a += accumx0b;
    accumx1a += accumx1b;
    accumx0a = vmlal_lane_s32(accumx0a, vbl[0], vbl[0], 0);
    accumx1a = vmlal_lane_s32(accumx1a, vbl[0], vbl[0], 1);
    accumx0a = vqdmlsl_lane_s32(accumx0a, vbm[1], vbm[3], 0);
    accumx1a = vqdmlsl_lane_s32(accumx1a, vbm[1], vbm[3], 1);
    accumx0a = vmlsl_lane_s32(accumx0a, vbm[2], vbm[2], 0);
    accumx1a = vmlsl_lane_s32(accumx1a, vbm[2], vbm[2], 1);
    accumx2a = vqdmlsl_lane_s32(accumx2a, vbm[2], vbm[3], 0);
    accumx3a = vqdmlsl_lane_s32(accumx3a, vbm[2], vbm[3], 1);
    accumx0b += accumx0a;
    accumx1b += accumx1a;
    accumx0b = vmlsl_lane_s32(accumx0b, vbm[0], vbm[0], 0);
    accumx1b = vmlsl_lane_s32(accumx1b, vbm[0], vbm[0], 1);
    accumx2b += accumx2a;
    accumx3b += accumx3a;
    accumx2b = vqdmlsl_lane_s32(accumx2b, vbm[0], vbm[1], 0);
    accumx3b = vqdmlsl_lane_s32(accumx3b, vbm[0], vbm[1], 1);

    xx_vtrnq_s64(&accumx0b, &accumx0a);
    xx_vtrnq_s64(&accumx1b, &accumx1a);
    xx_vtrnq_s64(&accumx2b, &accumx2a);
    xx_vtrnq_s64(&accumx3b, &accumx3a);
    accumx0a += accumx1b;
    accumx0a = vsraq_n_s64(accumx0a, accumx0b, 28);
    accumx1a = vsraq_n_s64(accumx1a, accumx0a, 28);
    accumx2b += accumx1a;
    accumx2a += accumx3b;
    accumx2a = vsraq_n_s64(accumx2a, accumx2b, 28);
    accumx3a = vsraq_n_s64(accumx3a, accumx2a, 28);
    trn_res = vtrn_s32(vmovn_s64(accumx0b), vmovn_s64(accumx0a));
    vcl[0] = trn_res.val[1] & vmask;
    vch[0] = trn_res.val[0] & vmask;
    trn_res = vtrn_s32(vmovn_s64(accumx2b), vmovn_s64(accumx2a));
    vcl[1] = trn_res.val[1] & vmask;
    vch[1] = trn_res.val[0] & vmask;
    carry = accumx3a;

    accumx4a = vmull_lane_s32( vbh[3], vbh[3], 0);
    accumx5a = vmull_lane_s32( vbh[3], vbh[3], 1);
    accumx6b = vqdmull_lane_s32( vbh[0], vbh[3], 0);
    accumx7b = vqdmull_lane_s32( vbh[0], vbh[3], 1);
    accumx4b = accumx4a;
    accumx5b = accumx5a;
    accumx4b = vqdmlal_lane_s32(accumx4b, vbh[0], vbh[2], 0);
    accumx5b = vqdmlal_lane_s32(accumx5b, vbh[0], vbh[2], 1);
    accumx6b = vqdmlal_lane_s32(accumx6b, vbh[1], vbh[2], 0);
    accumx7b = vqdmlal_lane_s32(accumx7b, vbh[1], vbh[2], 1);
    accumx4b = vmlal_lane_s32(accumx4b, vbh[1], vbh[1], 0);
    accumx5b = vmlal_lane_s32(accumx5b, vbh[1], vbh[1], 1);
    accumx4b = vmlal_lane_s32(accumx4b, vbl[3], vbl[3], 0);
    accumx5b = vmlal_lane_s32(accumx5b, vbl[3], vbl[3], 1);
    accumx6a = accumx6b;
    accumx7a = accumx7b;
    accumx6a = vqdmlal_lane_s32(accumx6a, vbl[0], vbl[3], 0);
    accumx7a = vqdmlal_lane_s32(accumx7a, vbl[0], vbl[3], 1);
    accumx4a += accumx4b;
    accumx5a += accumx5b;
    accumx4a = vqdmlal_lane_s32(accumx4a, vbl[0], vbl[2], 0);
    accumx5a = vqdmlal_lane_s32(accumx5a, vbl[0], vbl[2], 1);
    accumx6a = vqdmlal_lane_s32(accumx6a, vbl[1], vbl[2], 0);
    accumx7a = vqdmlal_lane_s32(accumx7a, vbl[1], vbl[2], 1);
    accumx4a = vmlal_lane_s32(accumx4a, vbl[1], vbl[1], 0);
    accumx5a = vmlal_lane_s32(accumx5a, vbl[1], vbl[1], 1);
    accumx4a = vmlsl_lane_s32(accumx4a, vbm[3], vbm[3], 0);
    accumx5a = vmlsl_lane_s32(accumx5a, vbm[3], vbm[3], 1);
    accumx6b += accumx6a;
    accumx7b += accumx7a;
    accumx6b = vqdmlsl_lane_s32(accumx6b, vbm[0], vbm[3], 0);
    accumx7b = vqdmlsl_lane_s32(accumx7b, vbm[0], vbm[3], 1);
    accumx4b += accumx4a;
    accumx5b += accumx5a;
    accumx4b = vqdmlsl_lane_s32(accumx4b, vbm[0], vbm[2], 0);
    accumx5b = vqdmlsl_lane_s32(accumx5b, vbm[0], vbm[2], 1);
    accumx4b = vmlsl_lane_s32(accumx4b, vbm[1], vbm[1], 0);
    accumx5b = vmlsl_lane_s32(accumx5b, vbm[1], vbm[1], 1);
    accumx6b = vqdmlsl_lane_s32(accumx6b, vbm[1], vbm[2], 0);
    accumx7b = vqdmlsl_lane_s32(accumx7b, vbm[1], vbm[2], 1);

    xx_vtrnq_s64(&accumx4b, &accumx4a);
    xx_vtrnq_s64(&accumx5b, &accumx5a);
    xx_vtrnq_s64(&accumx6b, &accumx6a);
    xx_vtrnq_s64(&accumx7b, &accumx7a);
    accumx4b += carry;
    accumx4a += accumx5b;
    accumx4a = vsraq_n_s64(accumx4a, accumx4b, 28);
    accumx5a = vsraq_n_s64(accumx5a, accumx4a, 28);
    accumx6b += accumx5a;
    accumx6a += accumx7b;
    trn_res = vtrn_s32(vmovn_s64(accumx4b), vmovn_s64(accumx4a));
    vcl[2] = trn_res.val[1] & vmask;
    vch[2] = trn_res.val[0] & vmask;
    accumx6a = vsraq_n_s64(accumx6a, accumx6b, 28);
    accumx7a = vsraq_n_s64(accumx7a, accumx6a, 28);
    trn_res = vtrn_s32(vmovn_s64(accumx6b), vmovn_s64(accumx6a));
    vcl[3] = trn_res.val[1] & vmask;
    vch[3] = trn_res.val[0] & vmask;
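
    /* Final carry propagation, exactly as at the end of p448_mul: wrap the
     * top carries around via 2^448 = 2^224 + 1 (mod p), ripple them through
     * limbs {0,8} and {1,9}, and add the remainder into limbs 2 and 10.
     */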
    accumx7a = xx_vaddup_s64(accumx7a);

    int32x2_t t0 = vcl[0], t1 = vch[0];
    trn_res = vtrn_s32(t0, t1);
    t0 = trn_res.val[0]; t1 = trn_res.val[1];

    accumx7a = vaddw_s32(accumx7a, t0);
    t0 = vmovn_s64(accumx7a) & vmask;
    accumx7a = vshrq_n_s64(accumx7a, 28);
    accumx7a = vaddw_s32(accumx7a, t1);
    t1 = vmovn_s64(accumx7a) & vmask;
    trn_res = vtrn_s32(t0, t1);
    vcl[0] = trn_res.val[0];
    vch[0] = trn_res.val[1];
    accumx7a = vshrq_n_s64(accumx7a, 28);

    t0 = vmovn_s64(accumx7a);
    uint32_t
        c0 = vget_lane_s32(t0, 0),
        c1 = vget_lane_s32(t0, 1);
    c[2] += c0;
    c[10] += c1;
}
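
/* Multiply by a scalar word.  b is split into 28-bit halves blo and bhi
 * (so b appears to be assumed small enough, roughly b < 2^56, for bhi to
 * fit a limb).  Since bhi is offset by one limb, each step multiplies a[i]
 * by blo and a[i-1] by bhi, working the two 8-limb rows (c[0..7] and
 * c[8..15]) in parallel.  The top limb wraps around per
 * 2^448 = 2^224 + 1 (mod p): bhi*a[15] seeds row 0 and bhi*(a[15] + a[7])
 * seeds row 8.  The loop body is manually unrolled; the alternating blocks
 * just ping-pong between the (c0,c8) and (n0,n8) temporaries.
 */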
void
p448_mulw (
    p448_t *__restrict__ cs,
    const p448_t *as,
    uint64_t b
) {
    const uint32_t bhi = b>>28, blo = b & ((1<<28)-1);

    const uint32_t *a = as->limb;
    uint32_t *c = cs->limb;

    uint64_t accum0, accum8;
    uint32_t mask = (1ull<<28) - 1;

    int i;
    uint32_t c0, c8, n0, n8;

    accum0 = widemul_32(bhi, a[15]);
    accum8 = widemul_32(bhi, a[15] + a[7]);

    c0 = a[0]; c8 = a[8];
    smlal(&accum0, blo, c0);
    smlal(&accum8, blo, c8);

    c[0] = accum0 & mask; accum0 >>= 28;
    c[8] = accum8 & mask; accum8 >>= 28;

    i = 1;
    {
        n0 = a[i]; n8 = a[i+8];
        smlal(&accum0, bhi, c0);
        smlal(&accum8, bhi, c8);
        smlal(&accum0, blo, n0);
        smlal(&accum8, blo, n8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        i++;
    }
    {
        c0 = a[i]; c8 = a[i+8];
        smlal(&accum0, bhi, n0);
        smlal(&accum8, bhi, n8);
        smlal(&accum0, blo, c0);
        smlal(&accum8, blo, c8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        i++;
    }
    {
        n0 = a[i]; n8 = a[i+8];
        smlal(&accum0, bhi, c0);
        smlal(&accum8, bhi, c8);
        smlal(&accum0, blo, n0);
        smlal(&accum8, blo, n8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        i++;
    }
    {
        c0 = a[i]; c8 = a[i+8];
        smlal(&accum0, bhi, n0);
        smlal(&accum8, bhi, n8);
        smlal(&accum0, blo, c0);
        smlal(&accum8, blo, c8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        i++;
    }
    {
        n0 = a[i]; n8 = a[i+8];
        smlal(&accum0, bhi, c0);
        smlal(&accum8, bhi, c8);
        smlal(&accum0, blo, n0);
        smlal(&accum8, blo, n8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        i++;
    }
    {
        c0 = a[i]; c8 = a[i+8];
        smlal(&accum0, bhi, n0);
        smlal(&accum8, bhi, n8);
        smlal(&accum0, blo, c0);
        smlal(&accum8, blo, c8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        i++;
    }
    {
        n0 = a[i]; n8 = a[i+8];
        smlal(&accum0, bhi, c0);
        smlal(&accum8, bhi, c8);
        smlal(&accum0, blo, n0);
        smlal(&accum8, blo, n8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        i++;
    }

    accum0 += accum8 + c[8];
    c[8] = accum0 & mask;
    c[9] += accum0 >> 28;

    accum8 += c[0];
    c[0] = accum8 & mask;
    c[1] += accum8 >> 28;
}
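
/* Reduce to the canonical representative in [0, p), in constant time.
 * In this radix, p has every limb equal to 2^28 - 1 except limb 8, which
 * is 2^28 - 2 (that limb carries the -2^224 term).  The strategy: subtract
 * p, then add it back masked by the borrow, so that exactly one of the two
 * passes takes effect.
 */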
void
p448_strong_reduce (
    p448_t *a
) {
    word_t mask = (1ull<<28) - 1;

    /* first, clear high */
    a->limb[8] += a->limb[15] >> 28;
    a->limb[0] += a->limb[15] >> 28;
    a->limb[15] &= mask;

    /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */

    /* compute total_value - p.  No need to reduce mod p. */
    dsword_t scarry = 0;
    int i;
    for (i=0; i<16; i++) {
        scarry = scarry + a->limb[i] - ((i==8) ? mask-1 : mask);
        a->limb[i] = scarry & mask;
        scarry >>= 28;
    }

    /* uncommon case: it was >= p, so now scarry = 0 and this = x
     * common case: it was < p, so now scarry = -1 and this = x - p + 2^448
     * so let's add back in p.  will carry back off the top for 2^448.
     */
    assert(is_zero(scarry) | is_zero(scarry+1));

    word_t scarry_mask = scarry & mask;
    dword_t carry = 0;

    /* add it back */
    for (i=0; i<16; i++) {
        carry = carry + a->limb[i] + ((i==8) ? (scarry_mask & ~1) : scarry_mask);
        a->limb[i] = carry & mask;
        carry >>= 28;
    }

    assert(is_zero(carry + scarry));
}
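
/* Zero test: strongly reduce a copy so that zero has a unique
 * representation, then OR all limbs together and mask-test the result.
 */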
mask_t
p448_is_zero (
    const struct p448_t *a
) {
    struct p448_t b;
    p448_copy(&b, a);
    p448_strong_reduce(&b);

    uint32_t any = 0;
    int i;
    for (i=0; i<16; i++) {
        any |= b.limb[i];
    }
    return is_zero(any);
}
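
/* Serialize to 56 little-endian bytes: after strong reduction, each pair
 * of 28-bit limbs packs exactly into 7 bytes.
 */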
void
p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
) {
    int i,j;
    p448_t red;
    p448_copy(&red, x);
    p448_strong_reduce(&red);

    for (i=0; i<8; i++) {
        uint64_t limb = red.limb[2*i] + (((uint64_t)red.limb[2*i+1])<<28);
        for (j=0; j<7; j++) {
            serial[7*i+j] = limb;
            limb >>= 8;
        }
        assert(limb == 0);
    }
}
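
/* Deserialize 56 little-endian bytes into limbs, then check in constant
 * time that the value was already reduced mod p: returns an all-ones mask
 * on success and zero if the input was >= p.
 */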
mask_t
p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
) {
    int i,j;
    for (i=0; i<8; i++) {
        uint64_t out = 0;
        for (j=0; j<7; j++) {
            out |= ((uint64_t)serial[7*i+j])<<(8*j);
        }
        x->limb[2*i] = out & ((1ull<<28)-1);
        x->limb[2*i+1] = out >> 28;
    }

    /* Check for reduction.
     *
     * The idea is to create a variable ge which is all ones (rather, 28 ones)
     * if and only if the low $i$ words of $x$ are >= those of p.
     *
     * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
     */
    uint32_t ge = -1, mask = (1ull<<28)-1;
    for (i=0; i<8; i++) {
        ge &= x->limb[i];
    }

    /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
    ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask);

    /* Propagate the rest */
    for (i=9; i<16; i++) {
        ge &= x->limb[i];
    }

    return ~is_zero(ge ^ mask);
}