/* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License. See LICENSE.txt for license information.
 */
#include <assert.h>    /* for the reduction-invariant asserts below */
#include <arm_neon.h>  /* NEON vector types and intrinsics */

#include "word.h"
#include "p448.h"

/* Branchless zero test: decrementing a double-width copy of x borrows into
 * the high word iff x == 0, so the shift yields an all-ones mask for zero
 * input and 0 otherwise. */
static inline mask_t __attribute__((always_inline))
is_zero (
    word_t x
) {
    dword_t xx = x;
    xx--;
    return xx >> WORD_BITS;
}
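
/* Mask convention illustrated (an example added for exposition, not part of
 * the original file): the mask is all ones for zero input, zero otherwise. */
static inline void __attribute__((unused))
example_is_zero (void) {
    assert(is_zero(0) == (mask_t)-1);  /* all-ones mask for zero      */
    assert(is_zero(7) == 0);           /* zero mask for nonzero input */
}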
/* Sum the two 64-bit lanes of x into its high lane (vadd.s64 Dhi, Dlo). */
static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused))
xx_vaddup_u64(uint64x2_t x) {
    __asm__ ("vadd.s64 %f0, %e0" : "+w"(x));
    return x;
}

/* Swap the two 64-bit lanes of x. */
static __inline__ int64x2_t __attribute__((gnu_inline,always_inline,unused))
vrev128_s64(int64x2_t x) {
    __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
    return x;
}

static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline))
vrev128_u64(uint64x2_t x) {
    __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
    return x;
}
/* Portable models of ARM's signed multiply(-accumulate): a and b are
 * reinterpreted as signed 32-bit values, and their 64-bit product (doubled
 * in the *2 variants) is added to *acc (smlal*) or stored to it (smull*). */
static inline void __attribute__((gnu_inline,always_inline,unused))
smlal (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smlal2 (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smull (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smull2 (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}
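
/* A minimal check of the semantics above (an example added for exposition,
 * not part of the original file): the helpers mirror ARM's SMULL/SMLAL,
 * multiplying signed 32-bit values into a 64-bit accumulator, with ordinary
 * unsigned wraparound on the uint64_t carrier. */
static inline void __attribute__((unused))
example_smlal (void) {
    uint64_t acc;
    smull(&acc, (uint32_t)-3, 5);   /* acc = -15 in two's complement   */
    smlal(&acc, 2, 7);              /* acc += 14  ->  (uint64_t)-1     */
    smlal2(&acc, 1, 1);             /* acc += 2*1*1, wraps around to 1 */
    assert(acc == 1);
}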
/* Multiply mod p = 2^448 - 2^224 - 1. The vld2 loads split each operand
 * into interleaved low/high halves (al/ah, bl/bh); their sums and
 * differences (as*, bs*) feed a Karatsuba-style schedule that the shape
 * of the Goldilocks prime makes possible. */
void
p448_mul (
    p448_t *__restrict__ cs,
    const p448_t *as,
    const p448_t *bs
) {
#define _bl0 "q0"
#define _bl0_0 "d0"
#define _bl0_1 "d1"
#define _bh0 "q1"
#define _bh0_0 "d2"
#define _bh0_1 "d3"
#define _bs0 "q2"
#define _bs0_0 "d4"
#define _bs0_1 "d5"
#define _bl2 "q3"
#define _bl2_0 "d6"
#define _bl2_1 "d7"
#define _bh2 "q4"
#define _bh2_0 "d8"
#define _bh2_1 "d9"
#define _bs2 "q5"
#define _bs2_0 "d10"
#define _bs2_1 "d11"
#define _as0 "q6"
#define _as0_0 "d12"
#define _as0_1 "d13"
#define _as2 "q7"
#define _as2_0 "d14"
#define _as2_1 "d15"
#define _al0 "q8"
#define _al0_0 "d16"
#define _al0_1 "d17"
#define _ah0 "q9"
#define _ah0_0 "d18"
#define _ah0_1 "d19"
#define _al2 "q10"
#define _al2_0 "d20"
#define _al2_1 "d21"
#define _ah2 "q11"
#define _ah2_0 "d22"
#define _ah2_1 "d23"
#define _a0a "q12"
#define _a0a_0 "d24"
#define _a0a_1 "d25"
#define _a0b "q13"
#define _a0b_0 "d26"
#define _a0b_1 "d27"
#define _a1a "q14"
#define _a1a_0 "d28"
#define _a1a_1 "d29"
#define _a1b "q15"
#define _a1b_0 "d30"
#define _a1b_1 "d31"

#define VMAC(op,result,a,b,n) #op" "result", "a", "b"[" #n "]\n\t"
#define VOP3(op,result,a,b) #op" "result", "a", "b"\n\t"
#define VOP2(op,result,a) #op" "result", "a"\n\t"

    int32x2_t *vc = (int32x2_t*) cs->limb;

    __asm__ __volatile__(
        "vld2.32 {"_al0_0","_al0_1","_ah0_0","_ah0_1"}, [%[a],:128]!" "\n\t"
        VOP3(vadd.i32,_as0,_al0,_ah0)
        "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
        VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1)
        VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0)
        "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
        VOP3(vadd.i32,_bs2,_bl2,_bh2)
        "vld2.32 {"_al2_0","_al2_1","_ah2_0","_ah2_1"}, [%[a],:128]!" "\n\t"
        VOP3(vadd.i32,_as2,_al2,_ah2)

        VMAC(vmull.s32,_a0b,_as0_1,_bs2_1,0)
        VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_as2_1,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0)

        VMAC(vmull.s32,_a1b,_as0_1,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_as2_1,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1)

        VOP2(vmov,_a0a,_a0b)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bh2_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_0,0)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bh0_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_0,0)

        VMAC(vmlsl.s32,_a0b,_al0_1,_bl2_1,0)
        VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_0,0)
        VMAC(vmlsl.s32,_a0b,_al2_1,_bl0_1,0)
        VMAC(vmlal.s32,_a0b,_al0_0,_bs0_0,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bh2_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_0,1)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bh0_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_0,1)

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_al0_1,_bl2_1,1)
        VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_0,1)
        VMAC(vmlsl.s32,_a1b,_al2_1,_bl0_1,1)
        VMAC(vmlal.s32,_a1b,_al0_0,_bs0_0,1)

        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
        VOP2(vmovn.i64,_a0b_0,_a0b)
        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as2_0,_bs2_1,0)
        VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vmlal.s32,_a0a,_as2_1,_bs2_0,0)
        VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as0_0,_bh0_1,0)
        VOP2(vbic.i32,_a0b,"#0xf0000000")
        VMAC(vmlal.s32,_a0a,_as0_1,_bh0_0,0)
        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as2_0,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_as2_1,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_as0_0,_bh0_1,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_0,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)

        VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_0,0)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_0,0)

        VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_1,0)
        VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_0,0)
        VMAC(vmlal.s32,_a0b,_al0_0,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_al0_1,_bs0_0,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_0,1)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_0,1)

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_1,1)
        VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_0,1)
        VMAC(vmlal.s32,_a1b,_al0_0,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_al0_1,_bs0_0,1)

        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
        VOP2(vmovn.i64,_a0b_0,_a0b)
        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
        VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vmlal.s32,_a0a,_as0_0,_bh2_0,0)
        VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
        VOP2(vbic.i32,_a0b,"#0xf0000000")
        VMAC(vmlal.s32,_a0a,_as2_0,_bh0_0,0)
        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_as0_0,_bh2_0,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)
        VMAC(vmlal.s32,_a1b,_as2_0,_bh0_0,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)

        VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_0,0)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_0,0)

        VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_1,0)
        VMAC(vmlal.s32,_a0b,_al0_0,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_al0_1,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_al2_0,_bs0_0,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_0,1)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_0,1)

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_1,1)
        VMAC(vmlal.s32,_a1b,_al0_0,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_al0_1,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_al2_0,_bs0_0,1)

        VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP2(vmovn.i64,_a0b_0,_a0b)
        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as0_0,_bh2_1,0)
        VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vmlal.s32,_a0a,_as0_1,_bh2_0,0)
        VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as2_0,_bh0_1,0)
        VOP2(vbic.i32,_a0b,"#0xf0000000")
        VMAC(vmlal.s32,_a0a,_as2_1,_bh0_0,0)
        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as0_0,_bh2_1,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh2_0,1)
        VMAC(vmlal.s32,_a1b,_as2_0,_bh0_1,1)
        VMAC(vmlal.s32,_a1b,_as2_1,_bh0_0,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)

        VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_1,0)
        VMAC(vmlal.s32,_a0a,_ah0_1,_bl2_0,0)
        VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_1,0)
        VMAC(vmlal.s32,_a0a,_ah2_1,_bl0_0,0)

        VMAC(vmlal.s32,_a0b,_al0_0,_bs2_1,0)
        VMAC(vmlal.s32,_a0b,_al0_1,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_al2_0,_bs0_1,0)
        VMAC(vmlal.s32,_a0b,_al2_1,_bs0_0,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_1,1)
        VMAC(vmlal.s32,_a1a,_ah0_1,_bl2_0,1)
        VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_1,1)
        VMAC(vmlal.s32,_a1a,_ah2_1,_bl0_0,1)

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlal.s32,_a1b,_al0_0,_bs2_1,1)
        VMAC(vmlal.s32,_a1b,_al0_1,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_al2_0,_bs0_1,1)
        VMAC(vmlal.s32,_a1b,_al2_1,_bs0_0,1)

        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP2(vmovn.i64,_a0b_0,_a0b)
        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a0a,_a0a,_a1b)

        VOP2(vmovn.i64,_a0b_1,_a0a)
        VOP3(vsra.u64,_a1a,_a0a,"#28")

        VOP2(vbic.i32,_a0b,"#0xf0000000")
        VOP2(vswp,_a1a_0,_a1a_1)

        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
        "sub %[c], #64" "\n\t"

        VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)

        "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
        VOP2(vaddw.s32,_a1a,_a0a_0)
        VOP2(vmovn.i64,_a0a_0,_a1a)
        VOP2(vshr.s64,_a1a,"#28")

        VOP2(vaddw.s32,_a1a,_a0a_1)
        VOP2(vmovn.i64,_a0a_1,_a1a)
        VOP2(vshr.s64,_a1a,"#28")

        VOP2(vbic.i32,_a0a,"#0xf0000000")

        VOP2(vaddw.s32,_a1a,_a0b_0)
        VOP2(vmovn.i64,_a0b_0,_a1a)

        "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"

        : [a]"+r"(as)
        , [b]"+r"(bs)
        , [c]"+r"(vc)
        :: "q0","q1","q2","q3",
           "q4","q5","q6","q7",
           "q8","q9","q10","q11",
           "q12","q13","q14","q15",
           "memory"
    );
}
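
/* Usage sketch (added for exposition; the function name is hypothetical and
 * not part of the original file): multiply two field elements arriving as
 * 56-byte little-endian strings. Limbs are 28 bits wide and stored in the
 * NEON-interleaved LIMBPERM order, so callers go through p448_deserialize
 * and p448_serialize (both declared in p448.h) rather than touching the
 * limb array directly. */
static void __attribute__((unused))
example_mul (
    uint8_t out[56],
    const uint8_t a_ser[56],
    const uint8_t b_ser[56]
) {
    p448_t a, b, c;
    mask_t ok = p448_deserialize(&a, a_ser)
              & p448_deserialize(&b, b_ser);
    (void)ok;                 /* all-ones iff both inputs were canonical (< p) */
    p448_mul(&c, &a, &b);     /* c = a*b mod p = 2^448 - 2^224 - 1             */
    p448_serialize(out, &c);  /* strong-reduces before packing                 */
}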
void
p448_sqr (
    p448_t *__restrict__ cs,
    const p448_t *bs
) {
    int32x2_t *vc = (int32x2_t*) cs->limb;

    __asm__ __volatile__ (
        "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
        VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1) /* 0 .. 2^30 */
        VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0) /* +- 2^29 */
        VOP3(vadd.i32,_as0,_bl0,_bh0)       /* 0 .. 2^30 */
        "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
        VOP3(vadd.i32,_bs2,_bl2,_bh2)       /* 0 .. 2^30 */
        VOP2(vmov,_as2,_bs2)

        VMAC(vqdmull.s32,_a0b,_as0_1,_bs2_1,0) /* 0 .. 8 * 2^58. danger for vqdmlal is 32 */
        VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0)   /* 0 .. 12 */
        VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0)   /* 0 .. 14 */

        VMAC(vqdmull.s32,_a1b,_as0_1,_bs2_1,1) /* 0 .. 8 */
        VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1)   /* 0 .. 14 */
        VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1)   /* 0 .. 16 */

        VOP2(vmov,_a0a,_a0b)                   /* 0 .. 14 */
        VMAC(vqdmlal.s32,_a0a,_bh0_1,_bh2_1,0) /* 0 .. 16 */
        VMAC(vmlal.s32,_a0a,_bh2_0,_bh2_0,0)   /* 0 .. 17 */
        VMAC(vmlal.s32,_a0a,_bh0_0,_bl0_0,0)   /* 0 .. 18 */

        VMAC(vqdmlsl.s32,_a0b,_bl0_1,_bl2_1,0) /*-2 .. 14 */
        VMAC(vmlsl.s32,_a0b,_bl2_0,_bl2_0,0)   /*-3 .. 14 */
        VMAC(vmlal.s32,_a0b,_bl0_0,_bs0_0,0)   /*-4 .. 15 */

        VOP2(vmov,_a1a,_a1b)
        VMAC(vqdmlal.s32,_a1a,_bh0_1,_bh2_1,1) /* 0 .. 18 */
        VMAC(vmlal.s32,_a1a,_bh2_0,_bh2_0,1)   /* 0 .. 19 */
        VMAC(vmlal.s32,_a1a,_bh0_0,_bl0_0,1)   /* 0 .. 20 */

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vqdmlsl.s32,_a1b,_bl0_1,_bl2_1,1) /*-2 .. 16 */
        VMAC(vmlsl.s32,_a1b,_bl2_0,_bl2_0,1)   /*-3 .. 16 */
        VMAC(vmlal.s32,_a1b,_bl0_0,_bs0_0,1)   /*-4 .. 17 */

        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
        VOP2(vmovn.i64,_a0b_0,_a0b)
        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vqdmull.s32,_a0a,_as2_0,_bs2_1,0) /* 0 .. 8 */
        VOP2(vmovn.i64,_a0b_1,_a1b)
        VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vqdmlal.s32,_a0a,_as0_0,_bh0_1,0) /* 0 .. 12 */
        VOP2(vbic.i32,_a0b,"#0xf0000000")
        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vqdmull.s32,_a1b,_as2_0,_bs2_1,1) /* 0 .. 8 */
        VMAC(vqdmlal.s32,_a1b,_as0_0,_bh0_1,1) /* 0 .. 12 */

        VOP2(vmov,_a0b,_a0a)                   /* 0 .. 12 */
        VMAC(vqdmlal.s32,_a0a,_bh2_0,_bh2_1,0) /* 0 .. 14 */
        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl0_1,0) /* 0 .. 16 */

        VMAC(vqdmlsl.s32,_a0b,_bl2_0,_bl2_1,0) /*-2 .. 12 */
        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs0_1,0) /*-4 .. 14 */

        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
        VOP3(vadd.i64,_a0b_0,_a0b_0,_a1a_0)

        VOP2(vmov,_a1a,_a1b)                   /* 0 .. 12 */
        VMAC(vqdmlal.s32,_a1a,_bh2_0,_bh2_1,1) /* 0 .. 14 */
        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl0_1,1) /* 0 .. 16 */

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vqdmlsl.s32,_a1b,_bl2_0,_bl2_1,1) /*-2 .. 12 */
        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs0_1,1) /*-4 .. 14 */

        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
        VOP2(vmovn.i64,_a0b_0,_a0b)
        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
        VOP2(vmovn.i64,_a0b_1,_a1b)
        VMAC(vqdmlal.s32,_a0a,_as0_0,_bh2_0,0)
        VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
        VOP2(vbic.i32,_a0b,"#0xf0000000")
        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
        VMAC(vqdmlal.s32,_a1b,_as0_0,_bh2_0,1)
        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)

        VMAC(vmlal.s32,_a0a,_bh2_1,_bh2_1,0)
        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_0,0)
        VMAC(vmlal.s32,_a0a,_bh0_1,_bl0_1,0)

        VMAC(vmlsl.s32,_a0b,_bl2_1,_bl2_1,0)
        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_0,0)
        VMAC(vmlal.s32,_a0b,_bl0_1,_bs0_1,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vmlal.s32,_a1a,_bh2_1,_bh2_1,1)
        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_0,1)
        VMAC(vmlal.s32,_a1a,_bh0_1,_bl0_1,1)

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vmlsl.s32,_a1b,_bl2_1,_bl2_1,1)
        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_0,1)
        VMAC(vmlal.s32,_a1b,_bl0_1,_bs0_1,1)

        VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP2(vmovn.i64,_a0b_0,_a0b)
        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a1b,_a0a,_a1b)

        VMAC(vqdmull.s32,_a0a,_as0_0,_bh2_1,0)
        VOP2(vmovn.i64,_a0b_1,_a1b)
        VOP3(vsra.u64,_a1a,_a1b,"#28")
        VMAC(vqdmlal.s32,_a0a,_as2_0,_bh0_1,0)
        VOP2(vbic.i32,_a0b,"#0xf0000000")
        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"

        VMAC(vqdmull.s32,_a1b,_as0_0,_bh2_1,1)
        VMAC(vqdmlal.s32,_a1b,_as2_0,_bh0_1,1)

        VOP2(vmov,_a0b_1,_a0a_1)
        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)

        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_1,0)
        VMAC(vqdmlal.s32,_a0a,_bh2_0,_bl0_1,0)

        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_1,0)
        VMAC(vqdmlal.s32,_a0b,_bl2_0,_bs0_1,0)

        VOP2(vmov,_a1a,_a1b)
        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_1,1)
        VMAC(vqdmlal.s32,_a1a,_bh2_0,_bl0_1,1)

        VOP2(vswp,_a0b_1,_a0a_0)

        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_1,1)
        VMAC(vqdmlal.s32,_a1b,_bl2_0,_bs0_1,1)

        VOP3(vsra.u64,_a0a,_a0b,"#28")
        VOP2(vmovn.i64,_a0b_0,_a0b)
        VOP2(vswp,_a1b_1,_a1a_0)
        VOP3(vadd.i64,_a0a,_a0a,_a1b)

        VOP2(vmovn.i64,_a0b_1,_a0a)
        VOP3(vsra.u64,_a1a,_a0a,"#28")

        VOP2(vbic.i32,_a0b,"#0xf0000000")
        VOP2(vswp,_a1a_0,_a1a_1)

        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
        "sub %[c], #64" "\n\t"

        VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)

        "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
        VOP2(vaddw.s32,_a1a,_a0a_0)
        VOP2(vmovn.i64,_a0a_0,_a1a)
        VOP2(vshr.s64,_a1a,"#28")

        VOP2(vaddw.s32,_a1a,_a0a_1)
        VOP2(vmovn.i64,_a0a_1,_a1a)
        VOP2(vshr.s64,_a1a,"#28")

        VOP2(vbic.i32,_a0a,"#0xf0000000")

        VOP2(vaddw.s32,_a1a,_a0b_0)
        VOP2(vmovn.i64,_a0b_0,_a1a)

        "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"

        : [b]"+r"(bs)
        , [c]"+r"(vc)
        :: "q0","q1","q2","q3",
           "q4","q5","q6","q7",
           "q12","q13","q14","q15",
           "memory"
    );
}
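
/* Why the squaring path above uses vqdmull/vqdmlal (a toy sketch added for
 * exposition, not part of the original file): in a square, each cross
 * product appears twice, so it can be computed once and doubled. The same
 * idea on a 32-bit value split into two 16-bit limbs: */
static uint64_t __attribute__((unused))
example_sqr_toy (uint32_t x) {
    uint64_t lo = x & 0xffff, hi = x >> 16;
    /* (lo + hi*2^16)^2 = lo^2 + 2*lo*hi*2^16 + hi^2*2^32 */
    return lo*lo + ((2*lo*hi) << 16) + ((hi*hi) << 32);
}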
void
p448_mulw (
    p448_t *__restrict__ cs,
    const p448_t *as,
    uint64_t b
) {
    uint32x2_t vmask = {(1<<28)-1, (1<<28)-1};
    uint64x2_t accum;
    const uint32x2_t *va = (const uint32x2_t *) as->limb;
    uint32x2_t *vo = (uint32x2_t *) cs->limb;
    uint32x2_t vc, vn;
    uint32x2_t vb = {b & ((1<<28)-1), b>>28};

    accum = vmull_lane_u32(va[7], vb, 1);
    accum = xx_vaddup_u64(vrev128_u64(accum));

    vc = va[0];
    accum = vmlal_lane_u32(accum, vc, vb, 0);
    vo[0] = vmovn_u64(accum) & vmask;
    accum = vshrq_n_u64(accum,28);

    /* PERF: the right way to do this is to reduce behind, i.e.
     * vmull + vmlal round 0
     * vmull + vmlal round 1
     * vmull + vmlal round 2
     * vsraq round 0, 1
     * vmull + vmlal round 3
     * vsraq round 1, 2
     * ...
     */

    int i;
    for (i=1; i<8; i++) {
        vn = va[i];
        accum = vmlal_lane_u32(accum, vc, vb, 1);
        accum = vmlal_lane_u32(accum, vn, vb, 0);
        vo[i] = vmovn_u64(accum) & vmask;
        accum = vshrq_n_u64(accum,28);
        vc = vn;
    }

    accum = xx_vaddup_u64(vrev128_u64(accum));
    accum = vaddw_u32(accum, vo[0]);
    vo[0] = vmovn_u64(accum) & vmask;
    accum = vshrq_n_u64(accum,28);
    vo[1] += vmovn_u64(accum);
}
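
/* Usage sketch (added for exposition; the function name and the constant
 * are only illustrative, not from the original file): multiply a field
 * element by a small word, e.g. a curve constant. The scalar b is split
 * into two 28-bit-aligned lanes above, so it is assumed to stay well below
 * 2^56. */
static void __attribute__((unused))
example_mulw (p448_t *t, const p448_t *x) {
    p448_mulw(t, x, 39081ull);   /* t = 39081 * x mod p */
}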
/* TODO: vectorize? */
void
p448_strong_reduce (
    p448_t *a
) {
    word_t mask = (1ull<<28)-1;

    /* first, clear high */
    a->limb[1] += a->limb[15]>>28;
    a->limb[0] += a->limb[15]>>28;
    a->limb[15] &= mask;

    /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */

    /* compute total_value - p. No need to reduce mod p. */
    dsword_t scarry = 0;
    int i;
    for (i=0; i<16; i++) {
        scarry = scarry + a->limb[LIMBPERM(i)] - ((i==8)?mask-1:mask);
        a->limb[LIMBPERM(i)] = scarry & mask;
        scarry >>= 28;
    }

    /* uncommon case: it was >= p, so now scarry = 0 and this = x
     * common case: it was < p, so now scarry = -1 and this = x - p + 2^448
     * so let's add back in p. will carry back off the top for 2^448.
     */
    assert(is_zero(scarry) | is_zero(scarry+1));

    word_t scarry_mask = scarry & mask;
    dword_t carry = 0;

    /* add it back */
    for (i=0; i<16; i++) {
        carry = carry + a->limb[LIMBPERM(i)] + ((i==8)?(scarry_mask&~1):scarry_mask);
        a->limb[LIMBPERM(i)] = carry & mask;
        carry >>= 28;
    }

    assert(is_zero(carry + scarry));
}
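
/* The subtract-then-masked-add-back idiom above, shown on a toy single-word
 * "field" (a sketch added for exposition, not part of the library): the
 * final borrow becomes an all-ones/all-zero mask, so the reduction has no
 * data-dependent branch. Valid for x < 2p, like the library's invariant. */
static uint32_t __attribute__((unused))
example_mod_toy (uint32_t x, uint32_t p) {
    uint32_t d = x - p;                      /* may wrap              */
    uint32_t borrow = (uint32_t)0 - (d > x); /* all-ones iff x < p    */
    return d + (p & borrow);                 /* add p back on borrow  */
}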
void
p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
) {
    int i,j;
    p448_t red;
    p448_copy(&red, x);
    p448_strong_reduce(&red);

    for (i=0; i<8; i++) {
        uint64_t limb = red.limb[LIMBPERM(2*i)] + (((uint64_t)red.limb[LIMBPERM(2*i+1)])<<28);
        for (j=0; j<7; j++) {
            serial[7*i+j] = limb;
            limb >>= 8;
        }
        assert(limb == 0);
    }
}
mask_t
p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
) {
    int i,j;
    for (i=0; i<8; i++) {
        uint64_t out = 0;
        for (j=0; j<7; j++) {
            out |= ((uint64_t)serial[7*i+j])<<(8*j);
        }
        x->limb[LIMBPERM(2*i)] = out & ((1ull<<28)-1);
        x->limb[LIMBPERM(2*i+1)] = out >> 28;
    }

    /* Check for reduction.
     *
     * The idea is to create a variable ge which is all ones (here, 28 ones,
     * since each limb holds 28 bits) if and only if the limbs of x processed
     * so far are all >= the corresponding limbs of p.
     *
     * Remember p = 2^448 - 2^224 - 1: in little-endian 28-bit limbs, every
     * digit of p is mask (all ones), except limb 8, the 2^224 digit, which
     * is mask-1 (all ones minus the lowest bit).
     */
    uint32_t ge = -1, mask = (1ull<<28)-1;
    for (i=0; i<8; i++) {
        ge &= x->limb[LIMBPERM(i)];
    }

    /* At this point ge == mask iff limbs 0..7 are all ones. For limb 8,
     * propagate ge if it equals p's digit mask-1, or force ge on if it
     * exceeds that digit (i.e. equals mask). */
    ge = (ge & (x->limb[LIMBPERM(8)] + 1)) | is_zero(x->limb[LIMBPERM(8)] ^ mask);

    /* Propagate the rest */
    for (i=9; i<16; i++) {
        ge &= x->limb[LIMBPERM(i)];
    }

    /* All ones iff x < p, i.e. the input was canonical. */
    return ~is_zero(ge ^ mask);
}
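
/* Round-trip sketch (added for exposition; the function name is
 * hypothetical, not from the original file): p448_deserialize returns an
 * all-ones mask iff the 56-byte input encodes a value < p, in which case
 * serializing the result reproduces the canonical input bytes. */
static mask_t __attribute__((unused))
example_round_trip (uint8_t out[56], const uint8_t in[56]) {
    p448_t x;
    mask_t ok = p448_deserialize(&x, in);
    p448_serialize(out, &x);   /* out == in whenever ok is all-ones */
    return ok;
}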