/* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License. See LICENSE.txt for license information.
 */
#ifndef __X86_64_ARITH_H__
#define __X86_64_ARITH_H__

#include <stdint.h>
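
/* Full 64x64 -> 128-bit multiply of *a and *b.  The BMI2 path uses mulx,
 * which leaves the flags untouched; the fallback uses mulq. */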
static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
#ifndef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rax;"
         "mulq %[b];"
         : [c]"=a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "cc");
    return (((__uint128_t)(d))<<64) | c;
#else
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rdx;"
         "mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx");
    return (((__uint128_t)(d))<<64) | c;
#endif
}
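
/* As widemul, but the first operand is passed by value in a register. */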
static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
#ifndef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rax;"
         "mulq %[b];"
         : [c]"=a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"r"(a)
         : "cc");
    return (((__uint128_t)(d))<<64) | c;
#else
    uint64_t c, d;
    __asm__ volatile
        ("mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"d"(a));
    return (((__uint128_t)(d))<<64) | c;
#endif
}
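
/* 128-bit product 2*(*a)*(*b).  The BMI2 path doubles with leaq so the
 * flags are left untouched. */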
static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
#ifndef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "
         "mulq %[b];"
         : [c]"=a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "cc");
    return (((__uint128_t)(d))<<64) | c;
#else
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rdx;"
         "leaq (,%%rdx,2), %%rdx;"
         "mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx");
    return (((__uint128_t)(d))<<64) | c;
#endif
}
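
/* Multiply-accumulate: *acc += (*a) * (*b). */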
static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
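
/* Multiply-accumulate into two accumulators: adds (*a)*(*b) to both
 * *acc and *acc2. */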
static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
    uint64_t lo2 = *acc2, hi2 = *acc2>>64;
#ifdef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         "addq %[c], %[lo2]; "
         "adcq %[d], %[hi2]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         "addq %%rax, %[lo2]; "
         "adcq %%rdx, %[hi2]; "
         : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
    *acc2 = (((__uint128_t)(hi2))<<64) | lo2;
}
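
/* As mac, with the first operand passed by value: *acc += a * (*b). */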
static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"d"(a)
         : "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"r"(a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
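
/* Doubled multiply-accumulate: *acc += 2 * (*a) * (*b). */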
static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "addq %%rdx, %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
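
/* Multiply-subtract: *acc -= (*a) * (*b). */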
static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "subq %[c], %[lo]; "
         "sbbq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "subq %%rax, %[lo]; "
         "sbbq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
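
/* Doubled multiply-subtract: *acc -= 2 * (*a) * (*b). */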
static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c, d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "addq %%rdx, %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "subq %[c], %[lo]; "
         "sbbq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "
         "mulq %[b]; "
         "subq %%rax, %[lo]; "
         "sbbq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
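
/* Multiply and reverse subtract: *acc = (*a)*(*b) - *acc. */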
static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t c, d, lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "subq %[lo], %[c]; "
         "sbbq %[hi], %[d]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    /* mulq fallback for CPUs without BMI2, following the pattern of the
     * other helpers in this file. */
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "subq %[lo], %%rax; "
         "sbbq %[hi], %%rdx; "
         : [c]"=a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"m"(*a), [lo]"r"(lo), [hi]"r"(hi)
         : "cc");
#endif
    *acc = (((__uint128_t)(d))<<64) | c;
}
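
/* Portable unsigned 64x64 -> 128-bit multiply via __uint128_t. */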
static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) {
    return ((__uint128_t)(a)) * b;
}
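
/* Portable signed 64x64 -> 128-bit multiply via __int128_t. */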
static __inline__ __int128_t widemuls(int64_t a, int64_t b) {
    return ((__int128_t)(a)) * b;
}
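
/* Launder x through an empty asm so the compiler can no longer reason
 * about its value (blocks constant folding and similar rewrites). */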
static __inline__ uint64_t opacify(uint64_t x) {
    __asm__ volatile("" : "+r"(x));
    return x;
}
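
/* Returns an all-ones mask if x == 0 and zero otherwise: neg sets the
 * carry flag iff x != 0, sbb then yields -1 or 0, and the result is
 * inverted.  mask_t is expected to be defined by the including code. */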
static __inline__ mask_t is_zero(uint64_t x) {
    __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x) : : "cc");
    return ~x;
}

#endif /* __X86_64_ARITH_H__ */