/* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License. See LICENSE.txt for license information.
 */
#ifndef __X86_64_ARITH_H__
#define __X86_64_ARITH_H__

#include <stdint.h>

/* TODO: non x86-64 versions of these.
 * TODO: autogenerate
 */
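
/* Full 64x64 -> 128-bit unsigned multiply of two in-memory operands: returns (*a) * (*b).
 * The non-BMI2 path uses mulq; the BMI2 path uses mulx. */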
static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
#ifndef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rax;"
         "mulq %[b];"
         : [c]"=a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "cc");
    return (((__uint128_t)(d))<<64) | c;
#else
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rdx;"
         "mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx");
    return (((__uint128_t)(d))<<64) | c;
#endif
}
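
/* As widemul, but with the first operand passed in a register: returns a * (*b). */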
static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
#ifndef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rax;"
         "mulq %[b];"
         : [c]"=a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"r"(a)
         : "cc");
    return (((__uint128_t)(d))<<64) | c;
#else
    uint64_t c,d;
    __asm__ volatile
        ("mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"d"(a));
    return (((__uint128_t)(d))<<64) | c;
#endif
}
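
/* Widening multiply of a doubled operand: returns 2 * (*a) * (*b). */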
static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
#ifndef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "
         "mulq %[b];"
         : [c]"=a"(c), [d]"=d"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "cc");
    return (((__uint128_t)(d))<<64) | c;
#else
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rdx;"
         "leaq (,%%rdx,2), %%rdx;"
         "mulx %[b], %[c], %[d];"
         : [c]"=r"(c), [d]"=r"(d)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx");
    return (((__uint128_t)(d))<<64) | c;
#endif
}
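
/* Multiply-accumulate: *acc += (*a) * (*b), carried across the full 128 bits. */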
static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
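
/* As mac, but with the first operand passed in a register: *acc += a * (*b). */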
static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"d"(a)
         : "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"r"(a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
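
/* Multiply-accumulate with a doubled operand: *acc += 2 * (*a) * (*b). */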
static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "addq %%rdx, %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "
         "mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
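
/* Multiply-subtract: *acc -= (*a) * (*b), with borrow across the full 128 bits. */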
static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "subq %[c], %[lo]; "
         "sbbq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; "
         "subq %%rax, %[lo]; "
         "sbbq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
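
/* Multiply-subtract with a doubled operand: *acc -= 2 * (*a) * (*b). */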
static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "addq %%rdx, %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "subq %[c], %[lo]; "
         "sbbq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
#else
    __asm__ volatile
        ("movq %[a], %%rax; "
         "addq %%rax, %%rax; "
         "mulq %[b]; "
         "subq %%rax, %[lo]; "
         "sbbq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rax", "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
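
/* Multiply and reverse subtract: *acc = (*a) * (*b) - *acc.
 * Note: this one uses mulx unconditionally, so it requires BMI2. */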
static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t c,d, lo = *acc, hi = *acc>>64;
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; "
         "subq %[lo], %[c]; "
         "sbbq %[hi], %[d]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"m"(*b), [a]"m"(*a)
         : "rdx", "cc");
    *acc = (((__uint128_t)(d))<<64) | c;
}
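
/* Plain C widening unsigned multiply: 64x64 -> 128 bits. */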
static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) {
    return ((__uint128_t)(a)) * b;
}
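
/* Plain C widening signed multiply: 64x64 -> 128 bits. */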
static __inline__ __int128_t widemuls(int64_t a, int64_t b) {
    return ((__int128_t)(a)) * b;
}
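
/* Optimization barrier: returns x unchanged, but the empty asm keeps the
 * compiler from tracking its value, e.g. to prevent unwanted constant folding. */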
static __inline__ uint64_t opacify(uint64_t x) {
    __asm__ volatile("" : "+r"(x));
    return x;
}
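
/* Constant-time zero test: neg sets the carry flag iff x != 0, and sbb smears that
 * carry into a full word, so this returns an all-ones mask when x == 0 and 0 otherwise.
 * mask_t is assumed to be a 64-bit mask type defined by the including code. */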
static __inline__ mask_t is_zero(uint64_t x) {
    __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
    return ~x;
}

#endif /* __X86_64_ARITH_H__ */