You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

279 lines
7.5 KiB

  1. /* Copyright (c) 2014 Cryptography Research, Inc.
  2. * Released under the MIT License. See LICENSE.txt for license information.
  3. */
  4. #ifndef __WORD_H__
  5. #define __WORD_H__
  6. /* for posix_memalign */
  7. #define _XOPEN_SOURCE 600
  8. #include "arch_config.h"
  9. #ifndef __APPLE__
  10. #ifndef _BSD_SOURCE
  11. #define _BSD_SOURCE 1
  12. #endif
  13. #include <endian.h>
  14. #endif
  15. #include <stdint.h>
  16. #include <stdlib.h>
  17. #include <sys/types.h>
  18. #include <inttypes.h>
  19. #if defined(__ARM_NEON__)
  20. #include <arm_neon.h>
  21. #elif defined(__SSE2__)
  22. #include <immintrin.h>
  23. #endif
  24. #if (WORD_BITS == 64)
  25. typedef uint32_t hword_t;
  26. typedef uint64_t word_t;
  27. typedef __uint128_t dword_t;
  28. typedef int32_t hsword_t;
  29. typedef int64_t sword_t;
  30. typedef __int128_t dsword_t;
  31. #define PRIxWORD PRIx64
  32. #define PRIxWORDfull "%016" PRIx64
  33. #define PRIxWORD56 "%014" PRIx64
  34. #define PRIxWORD60 "%015" PRIx60
  35. #define U64LE(x) x##ull
  36. #define U58LE(x) x##ull
  37. #define U56LE(x) x##ull
  38. #define U60LE(x) x##ull
  39. #define letohWORD letoh64
  40. #define GOLDI_BITS 64
  41. #define SC_LIMB(x) (x##ull)
  42. #elif (WORD_BITS == 32)
  43. typedef uint16_t hword_t;
  44. typedef uint32_t word_t;
  45. typedef uint64_t dword_t;
  46. typedef int16_t hsword_t;
  47. typedef int32_t sword_t;
  48. typedef int64_t dsword_t;
  49. #define PRIxWORD PRIx32
  50. #define PRIxWORDfull "%08" PRIx32
  51. #define PRIxWORD56 "%07" PRIx32
  52. #define U64LE(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
  53. #define U58LE(x) (x##ull)&((1ull<<29)-1), (x##ull)>>29
  54. #define U56LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
  55. #define U60LE(x) (x##ull)&((1ull<<30)-1), (x##ull)>>30
  56. #define letohWORD letoh32
  57. #define GOLDI_BITS 32
  58. #define SC_LIMB(x) (x##ull)
  59. #else
  60. #error "For now, libdecaf only supports 32- and 64-bit architectures."
  61. #endif
  /*
   * Integer rounding helpers.  Arguments are evaluated more than once, so
   * pass side-effect-free expressions only.
   *
   *   DIV_CEIL(n, d)       : n divided by d, rounded up
   *   ROUND_UP(n, d)       : n rounded up to a multiple of d
   *   WORDS_FOR_BITS(bits) : number of WORD_BITS-sized words covering `bits`
   */
  #define DIV_CEIL(num_, den_) (((num_) + (den_) - 1)/(den_))
  #define ROUND_UP(num_, den_) (DIV_CEIL((num_),(den_))*(den_))
  #define WORDS_FOR_BITS(bits_) (DIV_CEIL((bits_),WORD_BITS))
  65. typedef word_t mask_t;
  66. static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -(mask_t)1;
  67. #ifdef __ARM_NEON__
  68. typedef uint32x4_t vecmask_t;
  69. #elif __clang__
  70. typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
  71. typedef int64_t int64x2_t __attribute__((ext_vector_type(2)));
  72. typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
  73. typedef int64_t int64x4_t __attribute__((ext_vector_type(4)));
  74. typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
  75. typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
  76. typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
  77. typedef int32_t int32x2_t __attribute__((ext_vector_type(2)));
  78. typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
  79. typedef int32_t int32x8_t __attribute__((ext_vector_type(8)));
  80. typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
  81. #else /* GCC-cleanliness */
  82. typedef uint64_t uint64x2_t __attribute__((vector_size(16)));
  83. typedef int64_t int64x2_t __attribute__((vector_size(16)));
  84. typedef uint64_t uint64x4_t __attribute__((vector_size(32)));
  85. typedef int64_t int64x4_t __attribute__((vector_size(32)));
  86. typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
  87. typedef int32_t int32x4_t __attribute__((vector_size(16)));
  88. typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
  89. typedef int32_t int32x2_t __attribute__((vector_size(8)));
  90. typedef uint32_t uint32x8_t __attribute__((vector_size(32)));
  91. typedef int32_t int32x8_t __attribute__((vector_size(32)));
  92. typedef word_t vecmask_t __attribute__((vector_size(32)));
  93. #endif
  94. #if __AVX2__
  95. #define VECTOR_ALIGNED __attribute__((aligned(32)))
  96. typedef uint32x8_t big_register_t;
  97. typedef uint64x4_t uint64xn_t;
  98. typedef uint32x8_t uint32xn_t;
  99. static __inline__ big_register_t
  100. br_set_to_mask(mask_t x) {
  101. uint32_t y = (uint32_t)x;
  102. big_register_t ret = {y,y,y,y,y,y,y,y};
  103. return ret;
  104. }
  105. #elif __SSE2__
  106. #define VECTOR_ALIGNED __attribute__((aligned(16)))
  107. typedef uint32x4_t big_register_t;
  108. typedef uint64x2_t uint64xn_t;
  109. typedef uint32x4_t uint32xn_t;
  110. static __inline__ big_register_t
  111. br_set_to_mask(mask_t x) {
  112. uint32_t y = x;
  113. big_register_t ret = {y,y,y,y};
  114. return ret;
  115. }
  116. #elif __ARM_NEON__
  117. #define VECTOR_ALIGNED __attribute__((aligned(16)))
  118. typedef uint32x4_t big_register_t;
  119. typedef uint64x2_t uint64xn_t;
  120. typedef uint32x4_t uint32xn_t;
  121. static __inline__ big_register_t
  122. br_set_to_mask(mask_t x) {
  123. return vdupq_n_u32(x);
  124. }
  125. #elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__
  126. #define VECTOR_ALIGNED __attribute__((aligned(8)))
  127. typedef uint64_t big_register_t, uint64xn_t;
  128. typedef uint32_t uint32xn_t;
  129. static __inline__ big_register_t
  130. br_set_to_mask(mask_t x) {
  131. return (big_register_t)x;
  132. }
  133. #else
  134. #define VECTOR_ALIGNED __attribute__((aligned(4)))
  135. typedef uint64_t uint64xn_t;
  136. typedef uint32_t uint32xn_t;
  137. typedef uint32_t big_register_t;
  138. static __inline__ big_register_t
  139. br_set_to_mask(mask_t x) {
  140. return (big_register_t)x;
  141. }
  142. #endif
  143. typedef struct {
  144. uint64xn_t unaligned;
  145. } __attribute__((packed)) unaligned_uint64xn_t;
  146. typedef struct {
  147. uint32xn_t unaligned;
  148. } __attribute__((packed)) unaligned_uint32xn_t;
  149. /**
  150. * Return -1 if x==0, and 0 otherwise.
  151. */
  152. static __inline__ mask_t
  153. __attribute__((always_inline,unused))
  154. word_is_zero(word_t x) {
  155. return (mask_t)((((dword_t)(x)) - 1)>>WORD_BITS);
  156. }
  157. #if __AVX2__
  158. static __inline__ big_register_t
  159. br_is_zero(big_register_t x) {
  160. return (big_register_t)(x == br_set_to_mask(0));
  161. }
  162. #elif __SSE2__
  163. static __inline__ big_register_t
  164. br_is_zero(big_register_t x) {
  165. return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128());
  166. //return (big_register_t)(x == br_set_to_mask(0));
  167. }
  168. #elif __ARM_NEON__
  169. static __inline__ big_register_t
  170. br_is_zero(big_register_t x) {
  171. return vceqq_u32(x,x^x);
  172. }
  173. #else
  174. static __inline__ mask_t
  175. br_is_zero(word_t x) {
  176. return (((dword_t)x) - 1)>>WORD_BITS;
  177. }
  178. #endif
  #ifdef __APPLE__
  /*
   * Byte-order helpers for Apple targets, where <endian.h> is not included
   * above.
   *
   * NOTE(review): all three assume a little-endian host -- htobe64 always
   * swaps, and htole64 / letoh64 are identity functions.
   */

  /* Convert a host-order 64-bit value to big-endian by reversing its bytes.
   * Uses the compiler builtin rather than x86-64 "bswapq" inline assembly,
   * so that it also builds on arm64. */
  static inline uint64_t
  htobe64 (uint64_t x) {
      return __builtin_bswap64(x);
  }

  /* Convert a host-order 64-bit value to little-endian (identity here). */
  static inline uint64_t
  htole64 (uint64_t x) { return x; }

  /* Convert a little-endian 64-bit value to host order (identity here). */
  static inline uint64_t
  letoh64 (uint64_t x) { return x; }
  #endif
  /**
   * Fill an object with a byte value in a way the compiler may not elide,
   * e.g. for wiping secrets just before they go out of scope.
   *
   * Uses memset_s() when the platform advertises it; otherwise falls back
   * to a byte-by-byte store through a volatile pointer.
   *
   * @param p The object to overwrite.
   * @param c The byte value to store (normally zero).
   * @param s The size of the object in bytes.
   */
  #if defined(__DARWIN_C_LEVEL) || defined(__STDC_LIB_EXT1__)
  #define HAS_MEMSET_S
  #endif

  /* Unless the includer asked for Annex K, memset_s has no visible prototype. */
  #if !defined(__STDC_WANT_LIB_EXT1__) || __STDC_WANT_LIB_EXT1__ != 1
  #define NEED_MEMSET_S_EXTERN
  #endif

  #if defined(HAS_MEMSET_S)

  #if defined(NEED_MEMSET_S_EXTERN)
  extern int memset_s(void *, size_t, int, size_t);
  #endif

  static __inline__ void
  really_memset(void *p, char c, size_t s) {
      memset_s(p, s, c, s);
  }

  #else

  static __inline__ void __attribute__((always_inline,unused))
  really_memset(void *p, char c, size_t s) {
      /* Volatile stores are observable side effects, so each one is kept. */
      volatile char *cursor = (volatile char *)p;
      while (s > 0) {
          *cursor = c;
          cursor++;
          s--;
      }
  }

  #endif
  218. /**
  219. * Allocate memory which is sufficiently aligned to be used for the
  220. * largest vector on the system (for now that's a big_register_t).
  221. *
  222. * Man malloc says that it does this, but at least for AVX2 on MacOS X,
  223. * it's lying.
  224. *
  225. * @param size The size of the region to allocate.
  226. * @return A suitable pointer, which can be free'd with free(),
  227. * or NULL if no memory can be allocated.
  228. */
  229. static __inline__ void *
  230. malloc_vector (
  231. size_t size
  232. ) __attribute__((always_inline, unused));
  233. void *
  234. malloc_vector(size_t size) {
  235. void *out = NULL;
  236. int ret = posix_memalign(&out, sizeof(big_register_t), size);
  237. if (ret) {
  238. return NULL;
  239. } else {
  240. return out;
  241. }
  242. }
  243. #endif /* __WORD_H__ */