/**
 * @cond internal
 * @file x25519.c
 * @copyright
 *   Copyright (c) 2015-2016 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @brief Key exchange and signatures based on X25519.
 */
#include <stdint.h>
#include "x25519.h"
#include "strobe.h"
#include "strobe_config.h"

/* Word-size configuration for the multi-precision arithmetic below.
 *
 *   limb_t     one unsigned machine word of X25519_WBITS bits
 *   dlimb_t    unsigned double-width word; wide enough for limb*limb + 2 limbs
 *   sdlimb_t   signed double-width word, used for borrow propagation
 *   eswap_limb per-limb byte-order conversion applied by swapin/swapout
 *              (defined outside this file; presumably little-endian <-> host,
 *              going by its name)
 *   LIMB(x)    spells a 64-bit constant as limb initializer(s); with 32-bit
 *              limbs it expands to two initializers, low half first
 */
#if X25519_WBITS == 64
    typedef uint64_t limb_t;
    typedef __uint128_t dlimb_t;
    typedef __int128_t sdlimb_t;
    #define eswap_limb eswap_letoh_64
    #define LIMB(x) x##ull
#elif X25519_WBITS == 32
    typedef uint32_t limb_t;
    typedef uint64_t dlimb_t;
    typedef int64_t sdlimb_t;
    #define eswap_limb eswap_letoh_32
    #define LIMB(x) (uint32_t)(x##ull),(uint32_t)((x##ull)>>32)
#else
    #error "Need to know X25519_WBITS"
#endif

/* Number of limbs in a 256-bit value. */
#define NLIMBS (256/X25519_WBITS)
/* Field element: NLIMBS limbs, least significant limb first. */
typedef limb_t fe[NLIMBS];

#if X25519_SUPPORT_SIGN
/* Scalar for the signature arithmetic: NLIMBS limbs, least significant first. */
typedef limb_t scalar_t[NLIMBS];
/* Per-limb reduction multiplier used by sc_montmul.  It is multiplied by the
 * low accumulator limb to pick the multiple of sc_p that cancels that limb;
 * the cast keeps only the low X25519_WBITS bits when limbs are 32 bits wide.
 */
static const limb_t MONTGOMERY_FACTOR = (limb_t)0xd2b51da312547e1bull;
/* sc_p  : the scalar modulus, 2^252 + 0x14def9dea2f79cd65812631a5cf5d3ed.
 * sc_r2 : the "r^2 mod p" constant referred to in sc_montmul; multiplying by
 *         it in a second sc_montmul undoes the Montgomery scaling of the first.
 */
static const scalar_t sc_p = {
    LIMB(0x5812631a5cf5d3ed), LIMB(0x14def9dea2f79cd6),
    LIMB(0x0000000000000000), LIMB(0x1000000000000000)
}, sc_r2 = {
    LIMB(0xa40611e3449c0f01), LIMB(0xd00e1ba768859347),
    LIMB(0xceec73d217f5be65), LIMB(0x0399411b7c309a3d)
};
#endif

/**
 * Multiply-accumulate with carry.
 *
 * Forms mand*mier + acc + *carry in double-limb precision (this sum always
 * fits in a dlimb_t), stores the high limb back into *carry and returns the
 * low limb.
 */
static inline limb_t umaal(
    limb_t *carry, limb_t acc, limb_t mand, limb_t mier
) {
    dlimb_t wide = (dlimb_t)mand * mier;
    wide += acc;
    wide += *carry;
    *carry = (limb_t)(wide >> X25519_WBITS);
    return (limb_t)wide;
}

/* The add-with-carry helpers below are implemented in terms of umaal on ARM. */

/**
 * Add with carry: returns the low limb of acc + mand + *carry and leaves the
 * carry-out in *carry.
 */
static inline limb_t adc(limb_t *carry, limb_t acc, limb_t mand) {
    dlimb_t sum = (dlimb_t)*carry;
    sum += acc;
    sum += mand;
    *carry = (limb_t)(sum >> X25519_WBITS);
    return (limb_t)sum;
}

/**
 * Add carry only: returns the low limb of acc + *carry and leaves the
 * carry-out in *carry.
 */
static inline limb_t adc0(limb_t *carry, limb_t acc) {
    dlimb_t sum = (dlimb_t)*carry;
    sum += acc;
    *carry = (limb_t)(sum >> X25519_WBITS);
    return (limb_t)sum;
}

/* Weak reduction of x plus an overflow word sitting above the top limb.
 *
 * Precondition: over is small.
 * Postcondition: the result is < 2^255 + 1 word, and in particular < 2p.
 * The output is also >= min(input, 19).
 */
static void propagate(fe x, limb_t over) {
    const limb_t top_bit = (limb_t)1 << (X25519_WBITS-1);
    limb_t *hi = &x[NLIMBS-1];

    /* Everything at or above bit 255: the top bit of x plus the overflow word. */
    limb_t excess = (*hi >> (X25519_WBITS-1)) | (over << 1);
    *hi &= ~top_bit;

    /* 2^255 is congruent to 19, so feed 19*excess back in at the bottom. */
    limb_t carry = excess * 19;
    unsigned k = 0;
    while (k < NLIMBS) {
        x[k] = adc0(&carry, x[k]);
        k++;
    }
}

/**
 * Field addition: out = a + b, weakly reduced by propagate().
 * out may alias a and/or b.
 */
static void add(fe out, const fe a, const fe b) {
    limb_t carry = 0;
    unsigned k = 0;
    while (k < NLIMBS) {
        out[k] = adc(&carry, a[k], b[k]);
        k++;
    }
    /* The carry out of the top limb is folded back in by propagate(). */
    propagate(out, carry);
}

/**
 * Field subtraction: out = a - b, weakly reduced by propagate().
 * out may alias a and/or b.
 *
 * The running value starts at -38 and propagate() is handed an extra 1 in the
 * overflow word (worth 2^256, i.e. 38 mod p), so the two cancel while keeping
 * the overflow passed to propagate() from going negative.
 */
static void sub(fe out, const fe a, const fe b) {
    sdlimb_t borrow = -38;
    for (unsigned k = 0; k < NLIMBS; k++) {
        borrow += a[k];
        borrow -= b[k];
        out[k] = (limb_t)borrow;
        /* Signed shift: carries the borrow (sign) into the next limb. */
        borrow >>= X25519_WBITS;
    }
    propagate(out, (limb_t)(1 + borrow));
}

/**
 * Load sizeof(fe) bytes from in into the limb array x, then pass every limb
 * through eswap_limb.
 */
static void __attribute__((unused))
swapin(limb_t *x, const uint8_t *in) {
    memcpy(x, in, sizeof(fe));
    for (limb_t *p = x; p != x + NLIMBS; p++) {
        *p = eswap_limb(*p);
    }
}

/**
 * Pass every limb of x through eswap_limb, then store sizeof(fe) bytes to out.
 * Note that x itself is converted in place.
 */
static void __attribute__((unused))
swapout(uint8_t *out, limb_t *x) {
    for (limb_t *p = x; p != x + NLIMBS; p++) {
        *p = eswap_limb(*p);
    }
    memcpy(out, x, sizeof(fe));
}

/**
 * Field multiplication: out = a * b, weakly reduced.
 *
 * @param out Result; may alias a or b, since it is only written after the
 *            full product has been accumulated.
 * @param a   Full NLIMBS-limb operand.
 * @param b   Operand of which only the first nb limbs are read.
 * @param nb  Number of limbs in b (at most NLIMBS).
 */
static void mul(fe out, const fe a, const fe b, unsigned nb) {
    /* GCC at least generates good code for this, so no dedicated asm. */
    limb_t wide[2*NLIMBS] = {0};
    limb_t carry;
    unsigned row, col;

    /* Schoolbook product, one row per limb of b. */
    for (row = 0; row < nb; row++) {
        const limb_t digit = b[row];
        limb_t *dst = &wide[row];
        carry = 0;
        for (col = 0; col < NLIMBS; col++) {
            dst[col] = umaal(&carry, dst[col], digit, a[col]);
        }
        dst[NLIMBS] = carry;
    }

    /* Fold the high half into the low half: 2^256 is congruent to 38. */
    carry = 0;
    for (col = 0; col < NLIMBS; col++) {
        out[col] = umaal(&carry, wide[col], 38, wide[col+NLIMBS]);
    }
    propagate(out, carry);
}

/* out = a * a */
static void sqr(fe out, const fe a) {
    mul(out, a, a, NLIMBS);
}
/* In-place multiply: out = a * out */
static void mul1(fe out, const fe a) {
    mul(out, a, out, NLIMBS);
}
/* In-place square: a = a * a */
static void sqr1(fe a) {
    mul(a, a, a, NLIMBS);
}

/**
 * Branch-free conditional swap of two 2*NLIMBS-limb arrays.
 *
 * @param doswap All-ones to exchange a and b, zero to leave them unchanged.
 */
static void condswap(limb_t a[2*NLIMBS], limb_t b[2*NLIMBS], limb_t doswap) {
    for (unsigned k = 0; k < 2*NLIMBS; k++) {
        const limb_t diff = doswap & (a[k] ^ b[k]);
        a[k] ^= diff;
        b[k] ^= diff;
    }
}

/**
 * Canonicalize x in place: reduce it to the least residue congruent to it
 * mod 2^255-19.
 *
 * Precondition: x < 2^255 + 1 word.
 *
 * @return All-ones if the canonical value is zero, otherwise 0.
 */
static limb_t canon(fe x) {
    unsigned k;

    /* Step 1: add 19 and weakly reduce. */
    limb_t up = 19;
    for (k = 0; k < NLIMBS; k++) {
        x[k] = adc0(&up, x[k]);
    }
    propagate(x, up);

    /* Now 19 <= x < 2^255.
     *
     * Lower bound: 19 was added, so the value going into propagate is at
     * least 19, and propagate only ever adds 19 on top of that.
     *
     * Upper bound: either the input was about 2^255 + one word + 19, which
     * propagates down to at most two words, or it was already < 2^255.  In
     * both cases the high bit is clear.
     *
     * Step 2: subtracting 19 therefore lands in [0, 2^255-19).
     */
    sdlimb_t down = -19;
    limb_t any = 0;
    for (k = 0; k < NLIMBS; k++) {
        down += x[k];
        x[k] = (limb_t)down;
        any |= x[k];
        down >>= X25519_WBITS;
    }

    /* any - 1 borrows into the upper half exactly when any == 0. */
    return (limb_t)(((dlimb_t)any - 1) >> X25519_WBITS);
}

/* Ladder constant a24 = 121665 (the value RFC 7748 gives for curve25519),
 * kept as a one-limb array so it can be passed to mul() with nb = 1.
 */
static const limb_t a24[1]={121665};

/* First half of one Montgomery ladder step, operating in place on
 * xs = { x2, z2, x3, z3, t1 }.
 *
 * On return:
 *   x2 = E = AA - BB
 *   z2 = E*a24 + AA
 *   x3 = DA + CB
 *   z3 = DA - CB
 *   t1 = AA
 * where A = x2+z2, B = x2-z2, C = x3+z3, D = x3-z3 are taken from the inputs.
 * The statement order matters: most operands are overwritten as soon as they
 * are no longer needed.
 */
static void ladder_part1(fe xs[5]) {
    limb_t *x2 = xs[0], *z2=xs[1],*x3=xs[2],*z3=xs[3],*t1=xs[4];
    add(t1,x2,z2);  // t1 = A = x2 + z2
    sub(z2,x2,z2);  // z2 = B = x2 - z2
    add(x2,x3,z3);  // x2 = C = x3 + z3
    sub(z3,x3,z3);  // z3 = D = x3 - z3
    mul1(z3,t1);    // z3 = DA
    mul1(x2,z2);    // x2 = CB
    add(x3,z3,x2);  // x3 = DA+CB
    sub(z3,z3,x2);  // z3 = DA-CB
    sqr1(t1);       // t1 = AA
    sqr1(z2);       // z2 = BB
    sub(x2,t1,z2);  // x2 = E = AA-BB
    mul(z2,x2,a24,sizeof(a24)/sizeof(a24[0])); // z2 = E*a24 (a24 is a single limb)
    add(z2,z2,t1);  // z2 = E*a24 + AA
}
/* Second half of one Montgomery ladder step; expects xs as left by
 * ladder_part1 and x1 as the u-coordinate of the ladder's base point.
 *
 * On return:
 *   x2 = AA*BB
 *   z2 = E*(E*a24 + AA)
 *   x3 = (DA+CB)^2
 *   z3 = x1*(DA-CB)^2
 */
static void ladder_part2(fe xs[5], const fe x1) {
    limb_t *x2 = xs[0], *z2=xs[1],*x3=xs[2],*z3=xs[3],*t1=xs[4];
    sqr1(z3);       // z3 = (DA-CB)^2
    mul1(z3,x1);    // z3 = x1 * (DA-CB)^2
    sqr1(x3);       // x3 = (DA+CB)^2
    mul1(z2,x2);    // z2 = E*(E*a24+AA)
    sub(x2,t1,x2);  // x2 = AA - E = BB again
    mul1(x2,t1);    // x2 = AA*BB
}

/**
 * Montgomery-ladder core: multiplies the point with u-coordinate x1 by
 * scalar and leaves the result in projective form in xs.
 *
 * @param xs     Working storage.  On return xs[0]/xs[1] hold (x2 : z2), the
 *               projective result, and xs[2]/xs[3] hold (x3 : z3).  xs[4] is
 *               scratch.
 * @param scalar Scalar bytes, least significant byte first; bits are consumed
 *               from bit 255 down to bit 0.
 * @param x1     Encoded u-coordinate of the input point, sizeof(fe) bytes.
 * @param clamp  If nonzero the scalar is clamped on the fly: the low three
 *               bits are cleared, bit 255 is cleared and bit 254 is set.
 */
static void x25519_core(fe xs[5], const uint8_t scalar[X25519_BYTES], const uint8_t *x1, int clamp) {
    int i;
#if X25519_MEMCPY_PARAMS
    /* Convert the input into a limb array in host order and use that copy
     * from here on.  x1 has to be redirected to x1i: both the memcpy into x3
     * and ladder_part2 below read x1 as limbs.
     */
    fe x1i;
    swapin(x1i,x1);
    x1 = (const uint8_t *)x1i;
#endif
    limb_t swap = 0;
    limb_t *x2 = xs[0],*x3=xs[2],*z3=xs[3];

    /* Start from (x2 : z2) = (1 : 0) and (x3 : z3) = (x1 : 1). */
    memset(xs,0,4*sizeof(fe));
    x2[0] = z3[0] = 1;
    memcpy(x3,x1,sizeof(fe));

    for (i=255; i>=0; i--) {
        uint8_t bytei = scalar[i/8];
        if (clamp) {
            if (i/8 == 0) {
                bytei &= ~7;
            } else if (i/8 == X25519_BYTES-1) {
                bytei &= 0x7F;
                bytei |= 0x40;
            }
        }

        /* All-ones mask if scalar bit i is set.  The swap is applied lazily:
         * only the difference from the previous bit is swapped here.
         */
        limb_t doswap = -(limb_t)((bytei>>(i%8)) & 1);
        condswap(x2,x3,swap^doswap);
        swap = doswap;

        ladder_part1(xs);
        ladder_part2(xs,(const limb_t *)x1);
    }

    /* Undo the swap left pending by the last bit. */
    condswap(x2,x3,swap);
}

/**
 * X25519 scalar multiplication: out = scalar * x1 (u-coordinates).
 *
 * @param out    Receives the X25519_BYTES-byte result.
 * @param scalar The scalar.
 * @param x1     u-coordinate of the input point.
 * @param clamp  Nonzero to clamp the scalar (see x25519_core).
 * @return 0 when clamp is zero.  When clamp is nonzero, the value of canon()
 *         on the result converted to int: nonzero if the output is zero,
 *         0 otherwise.
 */
int x25519(uint8_t out[X25519_BYTES], const uint8_t scalar[X25519_BYTES], const uint8_t x1[X25519_BYTES], int clamp) {
    fe xs[5];
    x25519_core(xs,scalar,x1,clamp);

    /* Invert z2 by exponentiation; the inverse ends up in z3. */
    limb_t *x2 = xs[0];
    limb_t *z2 = xs[1];
    limb_t *z3 = xs[3];
    limb_t *prev = z2;
    int i;

#if X25519_USE_POWER_CHAIN
    /* Precomputed chain: square n times into xs[a], then multiply by xs[c]. */
    static const struct { uint8_t a,c,n; } steps[13] = {
        {2,1,1  },
        {2,1,1  },
        {4,2,3  },
        {2,4,6  },
        {3,1,1  },
        {3,2,12 },
        {4,3,25 },
        {2,3,25 },
        {2,4,50 },
        {3,2,125},
        {3,1,2  },
        {3,1,2  },
        {3,1,1  }
    };
    for (i=0; i<13; i++) {
        limb_t *dst = xs[steps[i].a];
        int reps = steps[i].n;
        while (reps-- > 0) {
            sqr(dst, prev);
            prev = dst;
        }
        mul1(dst, xs[steps[i].c]);
    }
#else
    /* Plain square-and-multiply by p-2 = 0x7f..ffeb. */
    for (i=253; i>=0; i--) {
        sqr(z3,prev);
        prev = z3;
        /* Every exponent bit above the low byte is set; the low byte is 0xeb. */
        int multiply = (i >= 8) || ((0xeb >> i) & 1);
        if (multiply) {
            mul1(z3,z2);
        }
    }
#endif

    /* At this point prev == z3.  Finish with x2 / z2 = x2 * z3. */
#if X25519_MEMCPY_PARAMS
    mul1(x2,z3);
    int ret = canon(x2);
    swapout(out,x2);
#else
    mul((limb_t *)out, x2, z3, NLIMBS);
    int ret = canon((limb_t*)out);
#endif
    return clamp ? ret : 0;
}

/* Encoded base point: first byte 9, remaining bytes zero (u = 9, the X25519
 * base point of RFC 7748).
 */
const uint8_t X25519_BASE_POINT[X25519_BYTES] = {9};

#if X25519_SUPPORT_VERIFY
/**
 * Final check of signature verification.
 *
 * @param xs     Ladder state whose xs[0]/xs[1] hold one projective point;
 *               xs[2..4] are overwritten.
 * @param other1 A second projective point, stored as 2*NLIMBS consecutive
 *               limbs (x followed by z).
 * @param other2 Encoded u-coordinate the two points are checked against.
 * @return 0 if the check passes, nonzero otherwise.
 */
static limb_t x25519_verify_core(
    fe xs[5],
    const limb_t *other1,
    const uint8_t other2[X25519_BYTES]
) {
    limb_t *z2 = xs[1];
    limb_t *x3 = xs[2];
    limb_t *z3 = xs[3];
#if X25519_MEMCPY_PARAMS
    fe xo2;
    swapin(xo2,other2);
#else
    const limb_t *xo2 = (const limb_t *)other2;
#endif
    const limb_t *other1_x = other1;
    const limb_t *other1_z = other1 + NLIMBS;

    /* Load other1 into (x3 : z3); x3 and z3 are adjacent in xs. */
    memcpy(x3, other1, 2*sizeof(fe));

    ladder_part1(xs);

    /* Here z2 = t2^2.  Scale it by 16 * other1.x * other1.z * other2. */
    mul1(z2, other1_x);
    mul1(z2, other1_z);
    mul1(z2, xo2);
    const limb_t factor = 16;
    mul(z2, z2, &factor, 1);

    /* z3 = (z3*other2 - x3)^2 */
    mul1(z3, xo2);
    sub(z3, z3, x3);
    sqr1(z3);

    /* Equality check: z3 becomes the difference of the two sides. */
    sub(z3, z3, z2);

    /* canon() yields all-ones for a zero value.  z2 == 0 means both sides are
     * zero; z3 == 0 means the sides are equal.
     *
     * Signatures where both sides are zero are rejected, because that can
     * happen if an input makes the ladder return 0/0.
     */
    const limb_t both_zero = canon(z2);
    const limb_t sides_equal = canon(z3);
    return both_zero | ~sides_equal;
}

/**
 * Verify a signature response.
 *
 * Runs two unclamped ladders, challenge*pub and response*base point, and
 * hands both projective results together with eph to x25519_verify_core.
 *
 * @return The result of x25519_verify_core converted to int: 0 if the
 *         check passes, nonzero otherwise.
 */
int x25519_verify_p2 (
    const uint8_t response[X25519_BYTES],
    const uint8_t challenge[X25519_BYTES],
    const uint8_t eph[X25519_BYTES],
    const uint8_t pub[X25519_BYTES]
) {
    /* The two ladder states overlap: the second run clears work[2..5] only,
     * so the first result in work[0..1] survives it.
     */
    fe work[7];
    fe *first = &work[0];
    fe *second = &work[2];

    x25519_core(first, challenge, pub, 0);
    x25519_core(second, response, X25519_BASE_POINT, 0);
    return x25519_verify_core(second, work[0], eph);
}
#endif // X25519_SUPPORT_VERIFY

#if X25519_SUPPORT_SIGN
/* Montgomery multiply-accumulate on scalars modulo sc_p.
 *
 * out is read as well as written: its incoming contents are used as the
 * initial accumulator.  The result is congruent to
 *     (out + a*b) / 2^(X25519_WBITS*NLIMBS)   (mod sc_p)
 * Callers wanting a pure product must zero out first.
 */
static void sc_montmul (
    scalar_t out,
    const scalar_t a,
    const scalar_t b
) {
   /**
    * OK, so carry bounding.  We're using a high carry, so that the
    * inputs don't have to be reduced.
    *
    * First montmul: output < (M^2 + Mp)/M = M+p, subtract p, < M.  This gets rid of high carry.
    * Second montmul, by r^2 mod p < p: output < (Mp + Mp)/M = 2p, subtract p, < p, done.
    */
    unsigned i,j;
    limb_t hic = 0; /* high carry: overflow out of the top limb */
    for (i=0; i<NLIMBS; i++) {
        /* Two interleaved carry chains: one for a[i]*b, one for mand2*sc_p. */
        limb_t carry=0, carry2=0, mand = a[i], mand2 = MONTGOMERY_FACTOR;

        for (j=0; j<NLIMBS; j++) {
            limb_t acc = out[j];
            acc = umaal(&carry, acc, mand, b[j]);
            /* Choose the multiple of sc_p from the low limb of this row. */
            if (j==0) mand2 *= acc;
            acc = umaal(&carry2, acc, mand2, sc_p[j]);
            /* The j==0 result is dropped; the rest shift down one limb. */
            if (j>0) out[j-1] = acc;
        }

        /* Add two carry registers and high carry */
        out[NLIMBS-1] = adc(&hic, carry, carry2);
    }

    /* Reduce */
    /* Subtract sc_p unconditionally; scarry ends as 0 or -1 (borrow). */
    sdlimb_t scarry = 0;
    for (i=0; i<NLIMBS; i++) {
        out[i] = scarry = scarry + out[i] - sc_p[i];
        scarry >>= X25519_WBITS;
    }
    /* 1 if the subtraction borrowed with no high carry to absorb it. */
    limb_t need_add = -(scarry + hic);

    /* Add sc_p back need_add (0 or 1) times. */
    limb_t carry = 0;
    for (i=0; i<NLIMBS; i++) {
        out[i] = umaal(&carry, out[i], need_add, sc_p[i]);
    }
}

/**
 * Compute a signature response from the ephemeral secret, the long-term
 * secret and the challenge.
 *
 * Two sc_montmul passes are used.  The first accumulates secret*challenge
 * onto eph_secret; the second multiplies by sc_r2 into a zeroed accumulator
 * to remove the Montgomery scaling, leaving the response reduced by sc_p.
 */
void x25519_sign_p2 (
    uint8_t response[X25519_BYTES],
    const uint8_t challenge[X25519_BYTES],
    const uint8_t eph_secret[X25519_BYTES],
    const uint8_t secret[X25519_BYTES]
) {
    /* FUTURE memory/code size: just make eph_secret non-const? */
    scalar_t acc;
    swapin(acc, eph_secret);

#if X25519_MEMCPY_PARAMS
    scalar_t sec, chal;
    swapin(sec, secret);
    swapin(chal, challenge);

    sc_montmul(acc, sec, chal);

    /* Reuse sec as the zeroed accumulator for the second pass. */
    memset(sec, 0, sizeof(sec));
    sc_montmul(sec, acc, sc_r2);
    swapout(response, sec);
#else
    sc_montmul(acc, (const limb_t *)secret, (const limb_t *)challenge);

    /* response doubles as the zeroed accumulator for the second pass. */
    memset(response, 0, X25519_BYTES);
    sc_montmul((limb_t *)response, acc, sc_r2);
#endif
}
#endif // X25519_SUPPORT_SIGN