/* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

#include "barrett_field.h"
#include <assert.h>

/**
 * Multi-word addition with a masked second operand:
 *     out = a + (c & mask)
 * Words are processed from index 0 upward, so index 0 is the least
 * significant word.
 *
 * The first nwords_c words combine a[i] with (c[i] & mask); the words
 * from nwords_c up to nwords_a only absorb the running carry.  Each index
 * is read before it is written, so out may be the same array as a.
 *
 * @return the carry out of the last word processed.
 */
word_t
add_nr_ext_packed(
    word_t *out,
    const word_t *a,
    int nwords_a,
    const word_t *c,
    int nwords_c,
    word_t mask
) {
    dword_t acc = 0;
    int idx = 0;

    /* Words in which both operands contribute. */
    while (idx < nwords_c) {
        acc += a[idx];
        acc += c[idx] & mask;
        out[idx] = (word_t)acc;
        acc >>= WORD_BITS;
        idx++;
    }

    /* Remaining words of a: only the carry is added in. */
    while (idx < nwords_a) {
        acc += a[idx];
        out[idx] = (word_t)acc;
        acc >>= WORD_BITS;
        idx++;
    }

    return (word_t)acc;
}

/**
 * In-place multi-word addition: a += c over nwords words, least
 * significant word first.
 *
 * @return the carry out of the most significant word.
 */
static __inline__ word_t
add_nr_packed(
    word_t *a,
    const word_t *c,
    int nwords
) {
    dword_t acc = 0;
    int remaining;

    for (remaining = nwords; remaining > 0; remaining--) {
        acc += *a;
        acc += *c++;
        *a++ = (word_t)acc;      /* keep the low word ... */
        acc >>= WORD_BITS;       /* ... and carry the rest forward */
    }

    return (word_t)acc;
}

/**
 * In-place multi-word subtraction: a -= c over nwords words, least
 * significant word first.
 *
 * The running borrow is kept in a signed double-width accumulator and
 * moved to the next word with a right shift; this relies on >> of a
 * negative value being an arithmetic shift (implementation-defined in C).
 *
 * @return the final borrow converted to word_t (zero when no borrow
 *         occurred; non-zero otherwise).
 */
static __inline__ word_t
sub_nr_packed(
    word_t *a,
    const word_t *c,
    int nwords
) {
    dsword_t acc = 0;
    int remaining;

    for (remaining = nwords; remaining > 0; remaining--) {
        acc += *a;
        acc -= *c++;
        *a++ = (word_t)acc;      /* low word of the signed difference */
        acc >>= WORD_BITS;       /* borrow for the next word */
    }

    return (word_t)acc;
}

/**
 * Multi-word subtraction with a masked second operand:
 *     out = a - (c & mask)
 * Words are processed from index 0 upward, so index 0 is the least
 * significant word.
 *
 * The first nwords_c words subtract (c[i] & mask) from a[i]; the words
 * from nwords_c up to nwords_a only absorb the running borrow.  Each
 * index is read before it is written, so out may be the same array as a.
 *
 * The borrow lives in a signed double-width accumulator and is moved to
 * the next word with a right shift; this relies on >> of a negative value
 * being an arithmetic shift (implementation-defined in C).
 *
 * @return the final borrow converted to word_t (zero when no borrow
 *         occurred; non-zero otherwise).
 */
word_t
sub_nr_ext_packed(
    word_t *out,
    const word_t *a,
    int nwords_a,
    const word_t *c,
    int nwords_c,
    word_t mask
) {
    dsword_t acc = 0;
    int idx = 0;

    /* Words in which both operands contribute. */
    while (idx < nwords_c) {
        acc += a[idx];
        acc -= c[idx] & mask;
        out[idx] = (word_t)acc;
        acc >>= WORD_BITS;
        idx++;
    }

    /* Remaining words of a: only the borrow is applied. */
    while (idx < nwords_a) {
        acc += a[idx];
        out[idx] = (word_t)acc;
        acc >>= WORD_BITS;
        idx++;
    }

    return (word_t)acc;
}

/**
 * Wide multiply-accumulate:
 *     accum += mier * mand + carry
 * where mier is an nwords_mier-word number, mand is a single word and
 * accum is an nwords_accum-word number (least significant word first).
 *
 * Requires nwords_accum >= nwords_mier (asserted).
 *
 * @return the carry out of the top word of accum.
 */
static word_t
widemac(
    word_t *accum,
    int nwords_accum,
    const word_t *mier,
    int nwords_mier,
    word_t mand,
    word_t carry
) {
    int k = 0;

    assert(nwords_accum >= nwords_mier);

    /* Multiply-accumulate over the words of the multiplier.  The sum
     * mand*mier[k] + accum[k] + carry always fits in a double word. */
    while (k < nwords_mier) {
        dword_t t = ((dword_t)mand) * mier[k] + accum[k] + carry;
        accum[k] = (word_t)t;
        carry = (word_t)(t >> WORD_BITS);
        k++;
    }

    /* Ripple the remaining carry through the rest of the accumulator. */
    while (k < nwords_accum) {
        dword_t t = ((dword_t)carry) + accum[k];
        accum[k] = (word_t)t;
        carry = (word_t)(t >> WORD_BITS);
        k++;
    }

    return carry;
}

/**
 * Negate a modulo p, in place.
 *
 * The modulus is described as p = 2^big - p_lo, where p_lo has nwords_lo
 * words and 2^big is bit p_shift of word nwords_p-1 (that is the bit added
 * back into the top word below).
 *
 * a is first reduced with barrett_reduce() and then replaced by
 * p - a = 2^big - p_lo - a.  Note that a reduced value of zero therefore
 * becomes p itself rather than zero.
 *
 * The top-word handling below assumes nwords_lo < nwords_p: the word
 * nwords_p-1 must be produced by the second loop so that its carry is
 * left unshifted for the final 2^big correction.
 *
 * @param a          value to negate; nwords_a words, overwritten.
 * @param nwords_a   number of words in a.
 * @param p_lo       low part of the modulus (p = 2^big - p_lo).
 * @param nwords_p   number of words in p.
 * @param nwords_lo  number of words in p_lo.
 * @param p_shift    bit position of 2^big inside word nwords_p-1.
 */
void
barrett_negate (
    word_t *a,
    int nwords_a,
    const word_t *p_lo,
    int nwords_p,
    int nwords_lo,
    int p_shift
) {
    int i;
    dsword_t carry = 0;
    
    barrett_reduce(a,nwords_a,0,p_lo,nwords_p,nwords_lo,p_shift);
    
    /* Have p = 2^big - p_lo.  Want p - a = 2^big - p_lo - a */
    
    /* Low words: compute -p_lo - a with a signed running borrow. */
    for (i=0; i<nwords_lo; i++) {
        a[i] = carry = carry - p_lo[i] - a[i];
        carry >>= WORD_BITS;
    }
    /* Remaining words of p: compute -a.  The borrow is NOT shifted out of
     * the top word, so carry still holds that word's full signed value. */
    for (; i<nwords_p; i++) {
        a[i] = carry = carry - a[i];
        if (i<nwords_p-1) {
            carry >>= WORD_BITS;
        }
    }
    
    /* Add 2^big into the top word, which cancels the outstanding borrow. */
    a[nwords_p-1] = carry = carry + (((word_t)1) << p_shift);
    
    /* Everything above p's words must already be zero after reduction. */
    for (; i<nwords_a; i++) {
        assert(!a[i]);
    }
    
    /* The corrected top word must fit in a single word.  Shift by
     * WORD_BITS rather than a literal width so the check stays well
     * defined for every word size. */
    assert(!(carry>>WORD_BITS));
}

/**
 * Reduce a modulo p in place, where p = 2^big - p_lo and 2^big is bit
 * p_shift of word nwords_p-1.
 *
 * Working from the most significant word of a downward, the bits at or
 * above 2^big in the current nwords_p-word window are stripped off and
 * multiplied by p_lo back into that window (since 2^big == p_lo mod p).
 * Each window is folded twice.  A final add-p_lo / conditionally
 * subtract-p_lo step finishes the reduction.
 *
 * @param a          value to reduce; nwords_a words, overwritten.
 * @param nwords_a   number of words in a; must be >= nwords_p (asserted).
 * @param a_carry    extra bits above a's top word; must be < 2^p_shift
 *                   (asserted).
 * @param p_lo       low part of the modulus.
 * @param nwords_p   number of words in p.
 * @param nwords_lo  number of words in p_lo.
 * @param p_shift    bit position of 2^big inside word nwords_p-1.
 */
void
barrett_reduce(
    word_t *a,
    int nwords_a,
    word_t a_carry,
    const word_t *p_lo,
    int nwords_p,
    int nwords_lo,
    int p_shift
) {
    /* TODO: non 2^k-c primes. */
    int top, pass;
    word_t low_mask, cout;

    /* TODO: is there a point to this a_carry business? */
    assert(a_carry < ((word_t)1)<<p_shift && nwords_a >= nwords_p);

    /* Selects the bits of a word that lie below 2^p_shift. */
    low_mask = (((word_t)1)<<p_shift)-1;

    for (top = nwords_a; top >= nwords_p; top--) {
        /* PERF: surely a more careful implementation could
         * avoid this double round
         */
        for (pass = 0; pass < 2; pass++) {
            word_t fold_carry;

            /* Strip the bits at or above p_shift from the window's top word. */
            word_t mand = a[top-1] >> p_shift;
            a[top-1] &= low_mask;

            if (pass == 0 && p_shift != 0) {
                /* collect high bits when there are any: they come from
                 * the word above the window, or from a_carry when the
                 * window already sits at the top of a */
                word_t above;
                if (top < nwords_a) {
                    above = a[top];
                    a[top] = 0;
                } else {
                    above = a_carry;
                }
                mand |= above << (WORD_BITS-p_shift);
            }

            /* Fold the stripped bits back in as mand * p_lo. */
            fold_carry = widemac(a+top-nwords_p, nwords_p, p_lo, nwords_lo, mand, 0);
            assert(!fold_carry);
            (void)fold_carry;
        }
    }

    assert(top == nwords_p-1);

    /* OK, but it still isn't reduced.  Add and subtract p_lo. */
    cout = add_nr_ext_packed(a, a, nwords_p, p_lo, nwords_lo, -1);
    if (p_shift != 0) {
        /* The overflow past 2^big is whatever landed at or above p_shift. */
        word_t spill = a[nwords_p-1] >> p_shift;
        cout = (cout << (WORD_BITS-p_shift)) + spill;
        a[nwords_p-1] &= low_mask;
    }

    /* mask = carry-1: if no carry then do sub, otherwise don't */
    sub_nr_ext_packed(a, a, nwords_p, p_lo, nwords_lo, cout-1);
}

/* PERF: This function is horribly slow.  Enough to break 1%. */
/**
 * Modular multiply, or multiply-accumulate, modulo p = 2^big - p_lo.
 *
 * With doMac clear:  accum = a * b mod p.
 * With doMac set:    accum = accum + a * b mod p.
 *
 * The product is built word by word from the most significant word of b
 * downward: shift the running value up one word, add a * b[bpos], then
 * reduce with barrett_reduce().
 *
 * @param accum         result (and addend when doMac is set);
 *                      nwords_accum words.
 * @param nwords_accum  number of words in accum; must be >= nwords_p
 *                      (asserted).
 * @param a, nwords_a   first factor and its word count.
 * @param b, nwords_b   second factor and its word count.
 * @param p_lo          low part of the modulus.
 * @param nwords_p      number of words in p.
 * @param nwords_lo     number of words in p_lo.
 * @param p_shift       bit position of 2^big inside word nwords_p-1.
 * @param doMac         non-zero to accumulate into accum instead of
 *                      overwriting it.
 */
void
barrett_mul_or_mac(
    word_t *accum,
    int nwords_accum,
    
    const word_t *a,
    int nwords_a,
    
    const word_t *b,
    int nwords_b,
    
    const word_t *p_lo,
    int nwords_p,
    int nwords_lo,
    int p_shift,
    
    mask_t doMac
) {
    assert(nwords_accum >= nwords_p);
    
    /* nwords_tmp = max(nwords_a + 1, nwords_p + 1, nwords_accum if doMac); */
    int nwords_tmp = (nwords_a > nwords_p) ? nwords_a : nwords_p;
    nwords_tmp++;
    if (nwords_tmp < nwords_accum && doMac)
        nwords_tmp = nwords_accum;
    
    word_t tmp[nwords_tmp];
    int bpos, i;
    
    for (i=0; i<nwords_tmp; i++) {
        tmp[i] = 0;
    }
    
    for (bpos=nwords_b-1; bpos >= 0; bpos--) {
        /* Invariant at the beginning of the loop: the high word is unused. */
        assert(tmp[nwords_tmp-1] == 0);
        
        /* shift up */
        for (i=nwords_tmp-2; i>=0; i--) {
            tmp[i+1] = tmp[i];
        }
        tmp[0] = 0;

        /* mac and reduce */
        word_t carry = widemac(tmp, nwords_tmp, a, nwords_a, b[bpos], 0);
        
        /* the mac can't carry, because nwords_tmp >= nwords_a+1 and its high word is clear */
        assert(!carry);
        barrett_reduce(tmp, nwords_tmp, carry, p_lo, nwords_p, nwords_lo, p_shift);
        
        /* at this point, the number of words used is nwords_p <= nwords_tmp-1,
         * so the high word is again clear */
    }
    
    if (doMac) {
        /* Add accum into the full width of tmp.  tmp may be wider than
         * accum (nwords_tmp >= nwords_accum on this path), so the carry
         * out of accum's top word has to ripple through tmp's remaining
         * words; only a carry out of tmp's own top word is handed to
         * barrett_reduce as a_carry. */
        word_t cout = add_nr_ext_packed(tmp, tmp, nwords_tmp, accum, nwords_accum, (word_t)-1);
        barrett_reduce(tmp, nwords_tmp, cout, p_lo, nwords_p, nwords_lo, p_shift);
    }
    
    /* Copy the reduced result out; any extra words on either side must be
     * (tmp) or are set to (accum) zero. */
    for (i=0; i<nwords_tmp && i<nwords_accum; i++) {
        accum[i] = tmp[i];
    }
    for (; i<nwords_tmp; i++) {
        assert(tmp[i] == 0);
    }
    for (; i<nwords_accum; i++) {
        accum[i] = 0;
    }
}