/* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License. See LICENSE.txt for license information.
 */
#include "f_field.h"
#include <assert.h>   /* gf_mulw_unsigned range-checks its scalar argument */
#include <arm_neon.h> /* NEON types/intrinsics; harmless if already pulled in via f_field.h */
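
/*
 * ARM NEON arithmetic for a 448-bit field element stored as sixteen 28-bit
 * limbs in eight 32-bit pairs -- an inference from the 0xf0000000 masks and
 * the "#28" carry shifts used throughout this file.  The vld2.32 loads
 * deinterleave each pair so that the low and high 224-bit halves of an
 * element travel in separate registers.
 */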
/* Small NEON helpers.  xx_vaddup_u64 adds the low 64-bit half of x into the
 * high half (x = { lo, hi + lo }); the vrev128 variants swap the two 64-bit
 * halves of a Q register.
 */
static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused))
xx_vaddup_u64(uint64x2_t x) {
    __asm__ ("vadd.s64 %f0, %e0" : "+w"(x));
    return x;
}

static __inline__ int64x2_t __attribute__((gnu_inline,always_inline,unused))
vrev128_s64(int64x2_t x) {
    __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
    return x;
}

static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline))
vrev128_u64(uint64x2_t x) {
    __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
    return x;
}
/* Scalar analogues of the NEON widening multiply(-accumulate) instructions:
 * signed 32x32 -> 64 products, optionally doubled (the "2" suffix), added to
 * or stored into a 64-bit accumulator.  They are marked `unused`, so they
 * presumably exist as reference/fallback helpers.
 */
static inline void __attribute__((gnu_inline,always_inline,unused))
smlal (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smlal2 (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smull (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

static inline void __attribute__((gnu_inline,always_inline,unused))
smull2 (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}
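
/*
 * gf_mul: field multiplication c = a * b.  A sketch of the apparent strategy,
 * inferred from the code (this comment is not in the original source): write
 * a = al + ah*phi and b = bl + bh*phi with phi = 2^224; then, modulo a
 * golden-ratio prime p = phi^2 - phi - 1 (so phi^2 = phi + 1),
 *
 *     a*b = (al*bl + ah*bh) + (al*bh + ah*bl + ah*bh) * phi,
 *
 * where the cross terms are formed Karatsuba-style from sums/differences of
 * the halves (the "s" registers below) rather than four full half-products.
 */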
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
/* Symbolic register names.  Reading the code: l/h = low/high half of an
 * operand, s = Karatsuba sum or difference of the halves, 0/2 = first/second
 * group of limb pairs; the _0/_1 suffixes are the two D halves of a Q register.
 */
#define _bl0 "q0"
#define _bl0_0 "d0"
#define _bl0_1 "d1"
#define _bh0 "q1"
#define _bh0_0 "d2"
#define _bh0_1 "d3"
#define _bs0 "q2"
#define _bs0_0 "d4"
#define _bs0_1 "d5"
#define _bl2 "q3"
#define _bl2_0 "d6"
#define _bl2_1 "d7"
#define _bh2 "q4"
#define _bh2_0 "d8"
#define _bh2_1 "d9"
#define _bs2 "q5"
#define _bs2_0 "d10"
#define _bs2_1 "d11"
#define _as0 "q6"
#define _as0_0 "d12"
#define _as0_1 "d13"
#define _as2 "q7"
#define _as2_0 "d14"
#define _as2_1 "d15"
#define _al0 "q8"
#define _al0_0 "d16"
#define _al0_1 "d17"
#define _ah0 "q9"
#define _ah0_0 "d18"
#define _ah0_1 "d19"
#define _al2 "q10"
#define _al2_0 "d20"
#define _al2_1 "d21"
#define _ah2 "q11"
#define _ah2_0 "d22"
#define _ah2_1 "d23"
#define _a0a "q12"
#define _a0a_0 "d24"
#define _a0a_1 "d25"
#define _a0b "q13"
#define _a0b_0 "d26"
#define _a0b_1 "d27"
#define _a1a "q14"
#define _a1a_0 "d28"
#define _a1a_1 "d29"
#define _a1b "q15"
#define _a1b_0 "d30"
#define _a1b_1 "d31"

/* Emit one asm line: VMAC is a widening multiply(-accumulate) against lane n
 * of its last vector operand; VOP3/VOP2 are plain three- and two-operand ops.
 */
#define VMAC(op,result,a,b,n) #op" "result", "a", "b"[" #n "]\n\t"
#define VOP3(op,result,a,b) #op" "result", "a", "b"\n\t"
#define VOP2(op,result,a) #op" "result", "a"\n\t"

    int32x2_t *vc = (int32x2_t*) cs->limb;

    __asm__ __volatile__(
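        /* The product is computed and reduced in four column groups, each
         * ending with the same carry pattern; in scalar terms (a sketch,
         * not part of the original source):
         *     acc_next += acc >> 28;                  // vsra.u64 ... #28
         *     limb      = (uint32_t)acc & 0x0fffffff; // vmovn.i64 + vbic.i32
         * followed by a vstmia that stores the finished limbs.
         */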
  106. "vld2.32 {"_al0_0","_al0_1","_ah0_0","_ah0_1"}, [%[a],:128]!" "\n\t"
  107. VOP3(vadd.i32,_as0,_al0,_ah0)
  108. "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
  109. VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1)
  110. VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0)
  111. "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
  112. VOP3(vadd.i32,_bs2,_bl2,_bh2)
  113. "vld2.32 {"_al2_0","_al2_1","_ah2_0","_ah2_1"}, [%[a],:128]!" "\n\t"
  114. VOP3(vadd.i32,_as2,_al2,_ah2)
  115. VMAC(vmull.s32,_a0b,_as0_1,_bs2_1,0)
  116. VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0)
  117. VMAC(vmlal.s32,_a0b,_as2_1,_bs0_1,0)
  118. VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0)
  119. VMAC(vmull.s32,_a1b,_as0_1,_bs2_1,1)
  120. VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1)
  121. VMAC(vmlal.s32,_a1b,_as2_1,_bs0_1,1)
  122. VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1)
  123. VOP2(vmov,_a0a,_a0b)
  124. VMAC(vmlal.s32,_a0a,_ah0_1,_bh2_1,0)
  125. VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_0,0)
  126. VMAC(vmlal.s32,_a0a,_ah2_1,_bh0_1,0)
  127. VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_0,0)
  128. VMAC(vmlsl.s32,_a0b,_al0_1,_bl2_1,0)
  129. VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_0,0)
  130. VMAC(vmlsl.s32,_a0b,_al2_1,_bl0_1,0)
  131. VMAC(vmlal.s32,_a0b,_al0_0,_bs0_0,0)
  132. VOP2(vmov,_a1a,_a1b)
  133. VMAC(vmlal.s32,_a1a,_ah0_1,_bh2_1,1)
  134. VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_0,1)
  135. VMAC(vmlal.s32,_a1a,_ah2_1,_bh0_1,1)
  136. VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_0,1)
  137. VOP2(vswp,_a0b_1,_a0a_0)
  138. VMAC(vmlsl.s32,_a1b,_al0_1,_bl2_1,1)
  139. VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_0,1)
  140. VMAC(vmlsl.s32,_a1b,_al2_1,_bl0_1,1)
  141. VMAC(vmlal.s32,_a1b,_al0_0,_bs0_0,1)
  142. VOP3(vsra.u64,_a0a,_a0b,"#28")
  143. VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
  144. VOP2(vmovn.i64,_a0b_0,_a0b)
  145. VOP2(vswp,_a1b_1,_a1a_0)
  146. VOP3(vadd.i64,_a1b,_a0a,_a1b)
  147. VMAC(vmull.s32,_a0a,_as2_0,_bs2_1,0)
  148. VOP2(vmovn.i64,_a0b_1,_a1b)
  149. VMAC(vmlal.s32,_a0a,_as2_1,_bs2_0,0)
  150. VOP3(vsra.u64,_a1a,_a1b,"#28")
  151. VMAC(vmlal.s32,_a0a,_as0_0,_bh0_1,0)
  152. VOP2(vbic.i32,_a0b,"#0xf0000000")
  153. VMAC(vmlal.s32,_a0a,_as0_1,_bh0_0,0)
  154. "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
  155. VMAC(vmull.s32,_a1b,_as2_0,_bs2_1,1)
  156. VMAC(vmlal.s32,_a1b,_as2_1,_bs2_0,1)
  157. VMAC(vmlal.s32,_a1b,_as0_0,_bh0_1,1)
  158. VMAC(vmlal.s32,_a1b,_as0_1,_bh0_0,1)
  159. VOP2(vmov,_a0b_1,_a0a_1)
  160. VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
  161. VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
  162. VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_1,0)
  163. VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_0,0)
  164. VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_1,0)
  165. VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_0,0)
  166. VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_1,0)
  167. VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_0,0)
  168. VMAC(vmlal.s32,_a0b,_al0_0,_bs0_1,0)
  169. VMAC(vmlal.s32,_a0b,_al0_1,_bs0_0,0)
  170. VOP2(vmov,_a1a,_a1b)
  171. VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_1,1)
  172. VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_0,1)
  173. VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_1,1)
  174. VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_0,1)
  175. VOP2(vswp,_a0b_1,_a0a_0)
  176. VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_1,1)
  177. VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_0,1)
  178. VMAC(vmlal.s32,_a1b,_al0_0,_bs0_1,1)
  179. VMAC(vmlal.s32,_a1b,_al0_1,_bs0_0,1)
  180. VOP3(vsra.u64,_a0a,_a0b,"#28")
  181. VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
  182. VOP2(vmovn.i64,_a0b_0,_a0b)
  183. VOP2(vswp,_a1b_1,_a1a_0)
  184. VOP3(vadd.i64,_a1b,_a0a,_a1b)
  185. VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
  186. VOP2(vmovn.i64,_a0b_1,_a1b)
  187. VMAC(vmlal.s32,_a0a,_as0_0,_bh2_0,0)
  188. VOP3(vsra.u64,_a1a,_a1b,"#28")
  189. VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
  190. VOP2(vbic.i32,_a0b,"#0xf0000000")
  191. VMAC(vmlal.s32,_a0a,_as2_0,_bh0_0,0)
  192. "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
  193. VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
  194. VMAC(vmlal.s32,_a1b,_as0_0,_bh2_0,1)
  195. VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)
  196. VMAC(vmlal.s32,_a1b,_as2_0,_bh0_0,1)
  197. VOP2(vmov,_a0b_1,_a0a_1)
  198. VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
  199. VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
  200. VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_1,0)
  201. VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_0,0)
  202. VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_1,0)
  203. VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_0,0)
  204. VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_1,0)
  205. VMAC(vmlal.s32,_a0b,_al0_0,_bs2_0,0)
  206. VMAC(vmlal.s32,_a0b,_al0_1,_bs0_1,0)
  207. VMAC(vmlal.s32,_a0b,_al2_0,_bs0_0,0)
  208. VOP2(vmov,_a1a,_a1b)
  209. VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_1,1)
  210. VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_0,1)
  211. VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_1,1)
  212. VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_0,1)
  213. VOP2(vswp,_a0b_1,_a0a_0)
  214. VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_1,1)
  215. VMAC(vmlal.s32,_a1b,_al0_0,_bs2_0,1)
  216. VMAC(vmlal.s32,_a1b,_al0_1,_bs0_1,1)
  217. VMAC(vmlal.s32,_a1b,_al2_0,_bs0_0,1)
  218. VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
  219. VOP3(vsra.u64,_a0a,_a0b,"#28")
  220. VOP2(vmovn.i64,_a0b_0,_a0b)
  221. VOP2(vswp,_a1b_1,_a1a_0)
  222. VOP3(vadd.i64,_a1b,_a0a,_a1b)
  223. VMAC(vmull.s32,_a0a,_as0_0,_bh2_1,0)
  224. VOP2(vmovn.i64,_a0b_1,_a1b)
  225. VMAC(vmlal.s32,_a0a,_as0_1,_bh2_0,0)
  226. VOP3(vsra.u64,_a1a,_a1b,"#28")
  227. VMAC(vmlal.s32,_a0a,_as2_0,_bh0_1,0)
  228. VOP2(vbic.i32,_a0b,"#0xf0000000")
  229. VMAC(vmlal.s32,_a0a,_as2_1,_bh0_0,0)
  230. "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
  231. VMAC(vmull.s32,_a1b,_as0_0,_bh2_1,1)
  232. VMAC(vmlal.s32,_a1b,_as0_1,_bh2_0,1)
  233. VMAC(vmlal.s32,_a1b,_as2_0,_bh0_1,1)
  234. VMAC(vmlal.s32,_a1b,_as2_1,_bh0_0,1)
  235. VOP2(vmov,_a0b_1,_a0a_1)
  236. VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
  237. VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
  238. VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_1,0)
  239. VMAC(vmlal.s32,_a0a,_ah0_1,_bl2_0,0)
  240. VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_1,0)
  241. VMAC(vmlal.s32,_a0a,_ah2_1,_bl0_0,0)
  242. VMAC(vmlal.s32,_a0b,_al0_0,_bs2_1,0)
  243. VMAC(vmlal.s32,_a0b,_al0_1,_bs2_0,0)
  244. VMAC(vmlal.s32,_a0b,_al2_0,_bs0_1,0)
  245. VMAC(vmlal.s32,_a0b,_al2_1,_bs0_0,0)
  246. VOP2(vmov,_a1a,_a1b)
  247. VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_1,1)
  248. VMAC(vmlal.s32,_a1a,_ah0_1,_bl2_0,1)
  249. VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_1,1)
  250. VMAC(vmlal.s32,_a1a,_ah2_1,_bl0_0,1)
  251. VOP2(vswp,_a0b_1,_a0a_0)
  252. VMAC(vmlal.s32,_a1b,_al0_0,_bs2_1,1)
  253. VMAC(vmlal.s32,_a1b,_al0_1,_bs2_0,1)
  254. VMAC(vmlal.s32,_a1b,_al2_0,_bs0_1,1)
  255. VMAC(vmlal.s32,_a1b,_al2_1,_bs0_0,1)
  256. VOP3(vsra.u64,_a0a,_a0b,"#28")
  257. VOP2(vmovn.i64,_a0b_0,_a0b)
  258. VOP2(vswp,_a1b_1,_a1a_0)
  259. VOP3(vadd.i64,_a0a,_a0a,_a1b)
  260. VOP2(vmovn.i64,_a0b_1,_a0a)
  261. VOP3(vsra.u64,_a1a,_a0a,"#28")
  262. VOP2(vbic.i32,_a0b,"#0xf0000000")
  263. VOP2(vswp,_a1a_0,_a1a_1)
  264. "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
  265. "sub %[c], #64" "\n\t"
  266. VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)
  267. "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
  268. VOP2(vaddw.s32,_a1a,_a0a_0)
  269. VOP2(vmovn.i64,_a0a_0,_a1a)
  270. VOP2(vshr.s64,_a1a,"#28")
  271. VOP2(vaddw.s32,_a1a,_a0a_1)
  272. VOP2(vmovn.i64,_a0a_1,_a1a)
  273. VOP2(vshr.s64,_a1a,"#28")
  274. VOP2(vbic.i32,_a0a,"#0xf0000000")
  275. VOP2(vaddw.s32,_a1a,_a0b_0)
  276. VOP2(vmovn.i64,_a0b_0,_a1a)
  277. "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
  278. : [a]"+r"(as)
  279. , [b]"+r"(bs)
  280. , [c]"+r"(vc)
  281. :: "q0","q1","q2","q3",
  282. "q4","q5","q6","q7",
  283. "q8","q9","q10","q11",
  284. "q12","q13","q14","q15",
  285. "memory"
  286. );
  287. }
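
/*
 * gf_sqr: field squaring c = b^2.  It follows the same column schedule as
 * gf_mul but exploits symmetry: each cross product appears once via the
 * doubling forms vqdmull/vqdmlal/vqdmlsl instead of twice.  The inline range
 * comments appear to bound each accumulator in units of 2^58 (per the
 * "danger for vqdmlal is 32" note, i.e. saturation at 32 * 2^58 = 2^63).
 * Note the clobber list omits q8-q11: the _a* input registers of gf_mul are
 * not needed here.
 */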
void gf_sqr (gf_s *__restrict__ cs, const gf bs) {
    int32x2_t *vc = (int32x2_t*) cs->limb;

    __asm__ __volatile__ (
  291. "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
  292. VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1) /* 0 .. 2^30 */
  293. VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0) /* +- 2^29 */
  294. VOP3(vadd.i32,_as0,_bl0,_bh0) /* 0 .. 2^30 */
  295. "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
  296. VOP3(vadd.i32,_bs2,_bl2,_bh2) /* 0 .. 2^30 */
  297. VOP2(vmov,_as2,_bs2)
  298. VMAC(vqdmull.s32,_a0b,_as0_1,_bs2_1,0) /* 0 .. 8 * 2^58. danger for vqdmlal is 32 */
  299. VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0) /* 0 .. 12 */
  300. VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0) /* 0 .. 14 */
  301. VMAC(vqdmull.s32,_a1b,_as0_1,_bs2_1,1) /* 0 .. 8 */
  302. VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1) /* 0 .. 14 */
  303. VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1) /* 0 .. 16 */
  304. VOP2(vmov,_a0a,_a0b) /* 0 .. 14 */
  305. VMAC(vqdmlal.s32,_a0a,_bh0_1,_bh2_1,0) /* 0 .. 16 */
  306. VMAC(vmlal.s32,_a0a,_bh2_0,_bh2_0,0) /* 0 .. 17 */
  307. VMAC(vmlal.s32,_a0a,_bh0_0,_bl0_0,0) /* 0 .. 18 */
  308. VMAC(vqdmlsl.s32,_a0b,_bl0_1,_bl2_1,0) /*-2 .. 14 */
  309. VMAC(vmlsl.s32,_a0b,_bl2_0,_bl2_0,0) /*-3 .. 14 */
  310. VMAC(vmlal.s32,_a0b,_bl0_0,_bs0_0,0) /*-4 .. 15 */
  311. VOP2(vmov,_a1a,_a1b)
  312. VMAC(vqdmlal.s32,_a1a,_bh0_1,_bh2_1,1) /* 0 .. 18 */
  313. VMAC(vmlal.s32,_a1a,_bh2_0,_bh2_0,1) /* 0 .. 19 */
  314. VMAC(vmlal.s32,_a1a,_bh0_0,_bl0_0,1) /* 0 .. 20 */
  315. VOP2(vswp,_a0b_1,_a0a_0)
  316. VMAC(vqdmlsl.s32,_a1b,_bl0_1,_bl2_1,1) /*-2 .. 16 */
  317. VMAC(vmlsl.s32,_a1b,_bl2_0,_bl2_0,1) /*-3 .. 16 */
  318. VMAC(vmlal.s32,_a1b,_bl0_0,_bs0_0,1) /*-4 .. 17 */
  319. VOP3(vsra.u64,_a0a,_a0b,"#28")
  320. VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
  321. VOP2(vmovn.i64,_a0b_0,_a0b)
  322. VOP2(vswp,_a1b_1,_a1a_0)
  323. VOP3(vadd.i64,_a1b,_a0a,_a1b)
  324. VMAC(vqdmull.s32,_a0a,_as2_0,_bs2_1,0) /* 0 .. 8 */
  325. VOP2(vmovn.i64,_a0b_1,_a1b)
  326. VOP3(vsra.u64,_a1a,_a1b,"#28")
  327. VMAC(vqdmlal.s32,_a0a,_as0_0,_bh0_1,0) /* 0 .. 12 */
  328. VOP2(vbic.i32,_a0b,"#0xf0000000")
  329. "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
  330. VMAC(vqdmull.s32,_a1b,_as2_0,_bs2_1,1) /* 0 .. 8 */
  331. VMAC(vqdmlal.s32,_a1b,_as0_0,_bh0_1,1) /* 0 .. 12 */
  332. VOP2(vmov,_a0b,_a0a) /* 0 .. 12 */
  333. VMAC(vqdmlal.s32,_a0a,_bh2_0,_bh2_1,0) /* 0 .. 14 */
  334. VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl0_1,0) /* 0 .. 16 */
  335. VMAC(vqdmlsl.s32,_a0b,_bl2_0,_bl2_1,0) /*-2 .. 12 */
  336. VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs0_1,0) /*-4 .. 14 */
  337. VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
  338. VOP3(vadd.i64,_a0b_0,_a0b_0,_a1a_0)
  339. VOP2(vmov,_a1a,_a1b) /* 0 .. 12 */
  340. VMAC(vqdmlal.s32,_a1a,_bh2_0,_bh2_1,1) /* 0 .. 14 */
  341. VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl0_1,1) /* 0 .. 16 */
  342. VOP2(vswp,_a0b_1,_a0a_0)
  343. VMAC(vqdmlsl.s32,_a1b,_bl2_0,_bl2_1,1) /*-2 .. 12 */
  344. VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs0_1,1) /*-4 .. 14 */
  345. VOP3(vsra.u64,_a0a,_a0b,"#28")
  346. VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
  347. VOP2(vmovn.i64,_a0b_0,_a0b)
  348. VOP2(vswp,_a1b_1,_a1a_0)
  349. VOP3(vadd.i64,_a1b,_a0a,_a1b)
  350. VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
  351. VOP2(vmovn.i64,_a0b_1,_a1b)
  352. VMAC(vqdmlal.s32,_a0a,_as0_0,_bh2_0,0)
  353. VOP3(vsra.u64,_a1a,_a1b,"#28")
  354. VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
  355. VOP2(vbic.i32,_a0b,"#0xf0000000")
  356. "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
  357. VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
  358. VMAC(vqdmlal.s32,_a1b,_as0_0,_bh2_0,1)
  359. VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)
  360. VOP2(vmov,_a0b_1,_a0a_1)
  361. VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
  362. VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
  363. VMAC(vmlal.s32,_a0a,_bh2_1,_bh2_1,0)
  364. VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_0,0)
  365. VMAC(vmlal.s32,_a0a,_bh0_1,_bl0_1,0)
  366. VMAC(vmlsl.s32,_a0b,_bl2_1,_bl2_1,0)
  367. VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_0,0)
  368. VMAC(vmlal.s32,_a0b,_bl0_1,_bs0_1,0)
  369. VOP2(vmov,_a1a,_a1b)
  370. VMAC(vmlal.s32,_a1a,_bh2_1,_bh2_1,1)
  371. VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_0,1)
  372. VMAC(vmlal.s32,_a1a,_bh0_1,_bl0_1,1)
  373. VOP2(vswp,_a0b_1,_a0a_0)
  374. VMAC(vmlsl.s32,_a1b,_bl2_1,_bl2_1,1)
  375. VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_0,1)
  376. VMAC(vmlal.s32,_a1b,_bl0_1,_bs0_1,1)
  377. VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
  378. VOP3(vsra.u64,_a0a,_a0b,"#28")
  379. VOP2(vmovn.i64,_a0b_0,_a0b)
  380. VOP2(vswp,_a1b_1,_a1a_0)
  381. VOP3(vadd.i64,_a1b,_a0a,_a1b)
  382. VMAC(vqdmull.s32,_a0a,_as0_0,_bh2_1,0)
  383. VOP2(vmovn.i64,_a0b_1,_a1b)
  384. VOP3(vsra.u64,_a1a,_a1b,"#28")
  385. VMAC(vqdmlal.s32,_a0a,_as2_0,_bh0_1,0)
  386. VOP2(vbic.i32,_a0b,"#0xf0000000")
  387. "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
  388. VMAC(vqdmull.s32,_a1b,_as0_0,_bh2_1,1)
  389. VMAC(vqdmlal.s32,_a1b,_as2_0,_bh0_1,1)
  390. VOP2(vmov,_a0b_1,_a0a_1)
  391. VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
  392. VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
  393. VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_1,0)
  394. VMAC(vqdmlal.s32,_a0a,_bh2_0,_bl0_1,0)
  395. VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_1,0)
  396. VMAC(vqdmlal.s32,_a0b,_bl2_0,_bs0_1,0)
  397. VOP2(vmov,_a1a,_a1b)
  398. VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_1,1)
  399. VMAC(vqdmlal.s32,_a1a,_bh2_0,_bl0_1,1)
  400. VOP2(vswp,_a0b_1,_a0a_0)
  401. VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_1,1)
  402. VMAC(vqdmlal.s32,_a1b,_bl2_0,_bs0_1,1)
  403. VOP3(vsra.u64,_a0a,_a0b,"#28")
  404. VOP2(vmovn.i64,_a0b_0,_a0b)
  405. VOP2(vswp,_a1b_1,_a1a_0)
  406. VOP3(vadd.i64,_a0a,_a0a,_a1b)
  407. VOP2(vmovn.i64,_a0b_1,_a0a)
  408. VOP3(vsra.u64,_a1a,_a0a,"#28")
  409. VOP2(vbic.i32,_a0b,"#0xf0000000")
  410. VOP2(vswp,_a1a_0,_a1a_1)
  411. "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
  412. "sub %[c], #64" "\n\t"
  413. VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)
  414. "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
  415. VOP2(vaddw.s32,_a1a,_a0a_0)
  416. VOP2(vmovn.i64,_a0a_0,_a1a)
  417. VOP2(vshr.s64,_a1a,"#28")
  418. VOP2(vaddw.s32,_a1a,_a0a_1)
  419. VOP2(vmovn.i64,_a0a_1,_a1a)
  420. VOP2(vshr.s64,_a1a,"#28")
  421. VOP2(vbic.i32,_a0a,"#0xf0000000")
  422. VOP2(vaddw.s32,_a1a,_a0b_0)
  423. VOP2(vmovn.i64,_a0b_0,_a1a)
  424. "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
  425. : [b]"+r"(bs)
  426. , [c]"+r"(vc)
  427. :: "q0","q1","q2","q3",
  428. "q4","q5","q6","q7",
  429. "q12","q13","q14","q15",
  430. "memory"
  431. );
  432. }
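
/*
 * gf_mulw_unsigned: multiply a field element by a small scalar b < 2^28,
 * reducing as it goes.  Each va[i] holds one pair of 28-bit limbs (one from
 * each half of the element, per the interleaved layout inferred above), so a
 * single 64x2-bit accumulator carries both lanes' chains in parallel.
 */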
void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
    uint32x2_t vmask = {(1<<28) - 1, (1<<28) - 1};
    assert(b<(1<<28));

    uint64x2_t accum;
    const uint32x2_t *va = (const uint32x2_t *) as->limb;
    uint32x2_t *vo = (uint32x2_t *) cs->limb;
    uint32x2_t vc, vn;
    uint32x2_t vb = {b, 0};

    vc = va[0];
    /* vmull_lane_u32 takes no accumulator argument; it initializes accum here. */
    accum = vmull_lane_u32(vc, vb, 0);
    vo[0] = vmovn_u64(accum) & vmask;
    accum = vshrq_n_u64(accum,28);

    /* PERF: the right way to do this is to reduce behind, i.e.
     * vmull + vmlal round 0
     * vmull + vmlal round 1
     * vmull + vmlal round 2
     * vsraq round 0, 1
     * vmull + vmlal round 3
     * vsraq round 1, 2
     * ...
     */
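
    /* The loop below instead reduces eagerly: multiply-accumulate one limb
     * pair, narrow and mask off 28 bits, then shift the carry down --
     * simpler, at the cost of a serial dependency through accum.
     */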
    int i;
    for (i=1; i<8; i++) {
        vn = va[i];
        accum = vmlal_lane_u32(accum, vn, vb, 0);
        vo[i] = vmovn_u64(accum) & vmask;
        accum = vshrq_n_u64(accum,28);
        vc = vn;
    }
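
    /* Fold the two lane carries: swap halves so each lane's carry lands on
     * the other chain, add them, and push the result back through vo[0] and
     * vo[1].  The wrap-around again matches reduction modulo
     * 2^448 - 2^224 - 1 (stated as an inference from the code, as above).
     */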
    accum = xx_vaddup_u64(vrev128_u64(accum));
    accum = vaddw_u32(accum, vo[0]);
    vo[0] = vmovn_u64(accum) & vmask;
    accum = vshrq_n_u64(accum,28);
    vo[1] += vmovn_u64(accum);
}