/* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License. See LICENSE.txt for license information.
 */
#ifndef __X86_64_ARITH_H__
#define __X86_64_ARITH_H__

#include <stdint.h>
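
/* Note: mask_t (used by is_zero() below) is not defined in this header or in
 * <stdint.h>; it is assumed to be supplied by the including code as a 64-bit
 * all-zeros/all-ones mask type (e.g. typedef uint64_t mask_t). */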

/* TODO: non-x86-64 versions of these.
 * FUTURE: autogenerate
 */
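
/* Full 64x64 -> 128-bit unsigned multiply of two words loaded from memory.
 * Uses the flag-preserving BMI2 mulx when available, plain mulq otherwise. */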
static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
#ifndef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rax;"
         "mulq %[b];"
         : [c]"=a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "cc");
    return (((__uint128_t)(d)) << 64) | c;
#else
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rdx;"
         "mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx");
    return (((__uint128_t)(d)) << 64) | c;
#endif
}
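
/* As widemul(), but with a already in a register ("rm" presumably names the
 * register/memory operand forms). */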
static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
#ifndef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rax;"
         "mulq %[b];"
         : [c]"=a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"r"(a)
         : "cc");
    return (((__uint128_t)(d)) << 64) | c;
#else
    uint64_t c, d;
    __asm__ volatile
        ("mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"d"(a));
    return (((__uint128_t)(d)) << 64) | c;
#endif
}
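
/* Doubled widening multiply: returns 2*(*a)*(*b); a is doubled first. */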
static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
#ifndef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "
         "mulq %[b];"
         : [c]"=a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "cc");
    return (((__uint128_t)(d)) << 64) | c;
#else
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rdx;"
         "leaq (,%%rdx,2), %%rdx;"
         "mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx");
    return (((__uint128_t)(d)) << 64) | c;
#endif
}
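
/* Multiply-accumulate: *acc += (*a)*(*b). The 128-bit accumulator is split
 * into 64-bit halves so the sum runs as an addq/adcq carry chain. */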
static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi)) << 64) | lo;
}
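
/* Multiply once, accumulate twice: adds (*a)*(*b) into both *acc and *acc2. */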
static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc >> 64;
    uint64_t lo2 = *acc2, hi2 = *acc2 >> 64;
#ifdef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         "addq %[c], %[lo2]; "
         "adcq %[d], %[hi2]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         "addq %%rax, %[lo2]; "
         "adcq %%rdx, %[hi2]; "
         : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi)) << 64) | lo;
    *acc2 = (((__uint128_t)(hi2)) << 64) | lo2;
}
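
/* As mac(), but with a passed in a register. */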
static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"d"(a)
         : "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"r"(a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi)) << 64) | lo;
}
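
/* Multiply-accumulate with doubling: *acc += 2*(*a)*(*b). */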
static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "addq %%rdx, %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi)) << 64) | lo;
}
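
/* Multiply-subtract: *acc -= (*a)*(*b), using a subq/sbbq borrow chain. */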
static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "subq %[c], %[lo]; "
         "sbbq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "subq %%rax, %[lo]; "
         "sbbq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi)) << 64) | lo;
}
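
/* Multiply-subtract with doubling: *acc -= 2*(*a)*(*b). */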
static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "addq %%rdx, %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "subq %[c], %[lo]; "
         "sbbq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "
         "mulq %[b]; "
         "subq %%rax, %[lo]; "
         "sbbq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi)) << 64) | lo;
}
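
/* Reverse multiply-subtract: *acc = (*a)*(*b) - *acc. */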
static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t c, d, lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "subq %[lo], %[c]; "
         "sbbq %[hi], %[d]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    /* Fallback added here: the original issued mulx unconditionally, which
       faults at run time on CPUs without BMI2. Mirrors msb() above. */
    __asm__ volatile
        ("movq %[a], %%rax; mulq %[b]; "
         "subq %[lo], %%rax; sbbq %[hi], %%rdx; "
         : [c]"=a"(c), [d]"=d"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "cc");
#endif
    *acc = (((__uint128_t)(d)) << 64) | c;
}
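
/* Portable unsigned widening multiply; the compiler lowers this to mulq/mulx
 * on its own. */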
static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) {
    return ((__uint128_t)(a)) * b;
}
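
/* Signed counterpart of widemulu(). */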
static __inline__ __int128_t widemuls(int64_t a, int64_t b) {
    return ((__int128_t)(a)) * b;
}
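
/* Launders x through an empty asm so the optimizer must treat its value as
 * unknown, which keeps constant-time code from being specialized away. */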
static __inline__ uint64_t opacify(uint64_t x) {
    __asm__ volatile("" : "+r"(x));
    return x;
}
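
/* Branchless zero test. neg sets the carry flag iff x != 0; sbb then leaves
 * all-ones iff x was nonzero; the final complement returns an all-ones mask
 * iff x == 0 and all-zeros otherwise. */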
static __inline__ mask_t is_zero(uint64_t x) {
    __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x) : : "cc");
    return ~x;
}
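
/* ---- Illustrative sketches; not part of the original header. ---- */

/* A minimal 2x2-limb schoolbook multiply built from the primitives above.
 * It assumes reduced-radix limbs of at most 56 bits so that several partial
 * products fit in one 128-bit accumulator (with full 64-bit limbs,
 * a0*b1 + a1*b0 alone can exceed 128 bits). The function name and the
 * little-endian limb layout are hypothetical. */
static __inline__ void example_mul_2x2(uint64_t out[4], const uint64_t a[2], const uint64_t b[2]) {
    const uint64_t radix_mask = ((uint64_t)1 << 56) - 1;
    __uint128_t accum;

    accum = widemul(&a[0], &b[0]);          /* column 0: a0*b0 */
    out[0] = (uint64_t)accum & radix_mask;
    accum >>= 56;

    mac(&accum, &a[0], &b[1]);              /* column 1: a0*b1 + a1*b0 + carry */
    mac(&accum, &a[1], &b[0]);
    out[1] = (uint64_t)accum & radix_mask;
    accum >>= 56;

    mac(&accum, &a[1], &b[1]);              /* column 2: a1*b1 + carry */
    out[2] = (uint64_t)accum & radix_mask;
    out[3] = (uint64_t)(accum >> 56);       /* remaining high bits */
}

/* A constant-time select sketch using opacify() and is_zero(): returns x when
 * flag == 0 and y otherwise, with no data-dependent branch. */
static __inline__ uint64_t example_select(uint64_t flag, uint64_t x, uint64_t y) {
    mask_t m = is_zero(opacify(flag));      /* all-ones iff flag == 0 */
    return (x & m) | (y & ~m);
}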

#endif /* __X86_64_ARITH_H__ */