jmg
/
ed448goldilocks


			
							/* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

#ifndef __WORD_H__
#define __WORD_H__

/* for posix_memalign */
#define _XOPEN_SOURCE 600

#include "arch_config.h"


#ifndef __APPLE__
#ifndef _BSD_SOURCE
#define _BSD_SOURCE 1
#endif
#include <endian.h>
#endif

#include <stdint.h>
#include <stdlib.h>
#include <sys/types.h>
#include <inttypes.h>

#if defined(__ARM_NEON__)
#include <arm_neon.h>
#elif defined(__SSE2__)
#include <immintrin.h>
#endif

#if (WORD_BITS == 64)
typedef uint32_t hword_t;
typedef uint64_t word_t;
typedef __uint128_t dword_t;
typedef int32_t hsword_t;
typedef int64_t sword_t;
typedef __int128_t dsword_t;
#define PRIxWORD PRIx64
#define PRIxWORDfull "%016" PRIx64
#define PRIxWORD56   "%014" PRIx64
#define PRIxWORD60   "%015" PRIx60
#define U64LE(x) x##ull
#define U58LE(x) x##ull
#define U56LE(x) x##ull
#define U60LE(x) x##ull
#define letohWORD letoh64
#define GOLDI_BITS 64
#define SC_LIMB(x) (x##ull)
#elif (WORD_BITS == 32)
typedef uint16_t hword_t;
typedef uint32_t word_t;
typedef uint64_t dword_t;
typedef int16_t hsword_t;
typedef int32_t sword_t;
typedef int64_t dsword_t;
#define PRIxWORD PRIx32
#define PRIxWORDfull "%08" PRIx32
#define PRIxWORD56   "%07" PRIx32
#define U64LE(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
#define U58LE(x) (x##ull)&((1ull<<29)-1), (x##ull)>>29
#define U56LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
#define U60LE(x) (x##ull)&((1ull<<30)-1), (x##ull)>>30
#define letohWORD letoh32
#define GOLDI_BITS 32
#define SC_LIMB(x) (x##ull)
#else
#error "For now, libdecaf only supports 32- and 64-bit architectures."
#endif

#define DIV_CEIL(_x,_y) (((_x) + (_y) - 1)/(_y))
#define ROUND_UP(_x,_y) (DIV_CEIL((_x),(_y))*(_y))
#define WORDS_FOR_BITS(_x) (DIV_CEIL((_x),WORD_BITS))

typedef word_t mask_t;
static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -(mask_t)1;


#ifdef __ARM_NEON__
typedef uint32x4_t vecmask_t;
#elif __clang__
typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
typedef int64_t  int64x2_t __attribute__((ext_vector_type(2)));
typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
typedef int64_t  int64x4_t __attribute__((ext_vector_type(4)));
typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
typedef int32_t  int32x4_t __attribute__((ext_vector_type(4)));
typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
typedef int32_t  int32x2_t __attribute__((ext_vector_type(2)));
typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
typedef int32_t  int32x8_t __attribute__((ext_vector_type(8)));
typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
#else /* GCC-cleanliness */
typedef uint64_t uint64x2_t __attribute__((vector_size(16)));
typedef int64_t  int64x2_t __attribute__((vector_size(16)));
typedef uint64_t uint64x4_t __attribute__((vector_size(32)));
typedef int64_t  int64x4_t __attribute__((vector_size(32)));
typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
typedef int32_t  int32x4_t __attribute__((vector_size(16)));
typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
typedef int32_t  int32x2_t __attribute__((vector_size(8)));
typedef uint32_t uint32x8_t __attribute__((vector_size(32)));
typedef int32_t  int32x8_t __attribute__((vector_size(32)));
typedef word_t vecmask_t __attribute__((vector_size(32)));
#endif

#if __AVX2__
    #define VECTOR_ALIGNED __attribute__((aligned(32)))
    typedef uint32x8_t big_register_t;
    typedef uint64x4_t uint64xn_t;
    typedef uint32x8_t uint32xn_t;

    static __inline__ big_register_t
    br_set_to_mask(mask_t x) {
        uint32_t y = (uint32_t)x;
        big_register_t ret = {y,y,y,y,y,y,y,y};
        return ret;
    }
#elif __SSE2__
    #define VECTOR_ALIGNED __attribute__((aligned(16)))
    typedef uint32x4_t big_register_t;
    typedef uint64x2_t uint64xn_t;
    typedef uint32x4_t uint32xn_t;

    static __inline__ big_register_t
    br_set_to_mask(mask_t x) {
        uint32_t y = x;
        big_register_t ret = {y,y,y,y};
        return ret;
    }
#elif __ARM_NEON__
    #define VECTOR_ALIGNED __attribute__((aligned(16)))
    typedef uint32x4_t big_register_t;
    typedef uint64x2_t uint64xn_t;
    typedef uint32x4_t uint32xn_t;
    static __inline__ big_register_t
    br_set_to_mask(mask_t x) {
        return vdupq_n_u32(x);
    }
#elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__
    #define VECTOR_ALIGNED __attribute__((aligned(8)))
    typedef uint64_t big_register_t, uint64xn_t;

    typedef uint32_t uint32xn_t;
    static __inline__ big_register_t
    br_set_to_mask(mask_t x) {
        return (big_register_t)x;
    }
#else
    #define VECTOR_ALIGNED __attribute__((aligned(4)))
    typedef uint64_t uint64xn_t;
    typedef uint32_t uint32xn_t;
    typedef uint32_t big_register_t;

    static __inline__ big_register_t
    br_set_to_mask(mask_t x) {
        return (big_register_t)x;
    }
#endif

typedef struct {
    uint64xn_t unaligned;
} __attribute__((packed)) unaligned_uint64xn_t;

typedef struct {
    uint32xn_t unaligned;
} __attribute__((packed)) unaligned_uint32xn_t;
    
/**
 * Return -1 if x==0, and 0 otherwise.
 */
static __inline__ mask_t
__attribute__((always_inline,unused))
word_is_zero(word_t x) {
    return (mask_t)((((dword_t)(x)) - 1)>>WORD_BITS);
}

#if __AVX2__
static __inline__ big_register_t
br_is_zero(big_register_t x) {
    return (big_register_t)(x == br_set_to_mask(0));
}
#elif __SSE2__
static __inline__ big_register_t
br_is_zero(big_register_t x) {
    return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128());
    //return (big_register_t)(x == br_set_to_mask(0));
}
#elif __ARM_NEON__
static __inline__ big_register_t
br_is_zero(big_register_t x) {
    return vceqq_u32(x,x^x);
}
#else
static __inline__ mask_t
br_is_zero(word_t x) {
    return (((dword_t)x) - 1)>>WORD_BITS;
}
#endif


#ifdef __APPLE__
static inline uint64_t
htobe64 (uint64_t x) {
    __asm__ ("bswapq %0" : "+r"(x));
    return x;
}
static inline uint64_t
htole64 (uint64_t x) { return x; }

static inline uint64_t
letoh64 (uint64_t x) { return x; }
#endif

/**
 * Really call memset, in a way that prevents the compiler from optimizing it out.
 * @param p The object to zeroize.
 * @param c The char to set it to (probably zero).
 * @param s The size of the object.
 */
#if defined(__DARWIN_C_LEVEL) || defined(__STDC_LIB_EXT1__)
#define HAS_MEMSET_S
#endif

#if !defined(__STDC_WANT_LIB_EXT1__) || __STDC_WANT_LIB_EXT1__ != 1
#define NEED_MEMSET_S_EXTERN
#endif

#ifdef HAS_MEMSET_S
#ifdef NEED_MEMSET_S_EXTERN
extern int memset_s(void *, size_t, int, size_t);
#endif
static __inline__ void
really_memset(void *p, char c, size_t s) {
    memset_s(p, s, c, s);
}
#else
static __inline__ void __attribute__((always_inline,unused))
really_memset(void *p, char c, size_t s) {
    volatile char *pv = (volatile char *)p;
    size_t i;
    for (i=0; i<s; i++) pv[i] = c;
}
#endif

/**
 * Allocate memory which is sufficiently aligned to be used for the
 * largest vector on the system (for now that's a big_register_t).
 *
 * Man malloc says that it does this, but at least for AVX2 on MacOS X,
 * it's lying.
 *
 * @param size The size of the region to allocate.
 * @return A suitable pointer, which can be free'd with free(),
 * or NULL if no memory can be allocated.
 */
static __inline__ void *
malloc_vector (
    size_t size
) __attribute__((always_inline, unused));

void *
malloc_vector(size_t size) {
    void *out = NULL;
    
    int ret = posix_memalign(&out, sizeof(big_register_t), size);
    
    if (ret) {
        return NULL;
    } else {
        return out;
    }
}

#endif /* __WORD_H__ */