delete the text files, for real this time?

11 years ago · d7f64fd885
--- a/barrett_field.c
+++ b/barrett_field.c
@@ -1,269 +0,0 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "barrett_field.h"
 #include <assert.h>

 word_t
 add_nr_ext_packed(
    word_t *out,
    const word_t *a,
    int nwords_a,
    const word_t *c,
    int nwords_c,
    word_t mask
 ) {
    int i;
    dword_t carry = 0;
    for (i=0; i<nwords_c; i++) {
        out[i] = carry = carry + a[i] + (c[i]&mask);
        carry >>= WORD_BITS;
    }
    for (; i<nwords_a; i++) {
        out[i] = carry = carry + a[i];
        carry >>= WORD_BITS;
    }
    return carry;
 }

 static __inline__ word_t
 add_nr_packed(
    word_t *a,
    const word_t *c,
    int nwords
 ) {
    int i;
    dword_t carry = 0;
    for (i=0; i<nwords; i++) {
        a[i] = carry = carry + a[i] + c[i];
        carry >>= WORD_BITS;
    }
    return carry;
 }

 static __inline__ word_t
 sub_nr_packed(
    word_t *a,
    const word_t *c,
    int nwords
 ) {
    int i;
    dsword_t carry = 0;
    for (i=0; i<nwords; i++) {
        a[i] = carry = carry + a[i] - c[i];
        carry >>= WORD_BITS;
    }
    return carry;
 }

 word_t
 sub_nr_ext_packed(
    word_t *out,
    const word_t *a,
    int nwords_a,
    const word_t *c,
    int nwords_c,
    word_t mask
 ) {
    int i;
    dsword_t carry = 0;
    for (i=0; i<nwords_c; i++) {
        out[i] = carry = carry + a[i] - (c[i]&mask);
        carry >>= WORD_BITS;
    }
    for (; i<nwords_a; i++) {
        out[i] = carry = carry + a[i];
        carry >>= WORD_BITS;
    }
    return carry;
 }

 static word_t
 widemac(
    word_t *accum,
    int nwords_accum,
    const word_t *mier,
    int nwords_mier,
    word_t mand,
    word_t carry
 ) {
    int i;
    assert(nwords_accum >= nwords_mier);
    
    for (i=0; i<nwords_mier; i++) {
        /* UMAAL chain for the wordy part of p */
        dword_t product = ((dword_t)mand) * mier[i];
        product += accum[i];
        product += carry;
        accum[i] = product;
        carry = product >> WORD_BITS;
    }
    
    for (; i<nwords_accum; i++) {
        dword_t sum = ((dword_t)carry) + accum[i];
        accum[i] = sum;
        carry = sum >> WORD_BITS;
    }
    
    return carry;
 }

 void
 barrett_negate (
    word_t *a,
    int nwords_a,
    const word_t *p_lo,
    int nwords_p,
    int nwords_lo,
    int p_shift
 ) {
    int i;
    dsword_t carry = 0;
    
    barrett_reduce(a,nwords_a,0,p_lo,nwords_p,nwords_lo,p_shift);
    
    /* Have p = 2^big - p_lo.  Want p - a = 2^big - p_lo - a */
    
    for (i=0; i<nwords_lo; i++) {
        a[i] = carry = carry - p_lo[i] - a[i];
        carry >>= WORD_BITS;
    }
    for (; i<nwords_p; i++) {
        a[i] = carry = carry - a[i];
        if (i<nwords_p-1) {
            carry >>= WORD_BITS;
        }
    }
    
    a[nwords_p-1] = carry = carry + (((word_t)1) << p_shift);
    
    for (; i<nwords_a; i++) {
        assert(!a[i]);
    }
    
    assert(!(carry>>64));
 }

 void
 barrett_reduce(
    word_t *a,
    int nwords_a,
    word_t a_carry,
    const word_t *p_lo,
    int nwords_p,
    int nwords_lo,
    int p_shift
 ) {
    /* TODO: non 2^k-c primes. */
    int repeat, nwords_left_in_a=nwords_a;
    
    /* TODO: is there a point to this a_carry business? */
    assert(a_carry < ((word_t)1)<<p_shift && nwords_a >= nwords_p);
    
    for (; nwords_left_in_a >= nwords_p; nwords_left_in_a--) {
        for (repeat=0; repeat<2; repeat++) {
            /* PERF: surely a more careful implementation could
             * avoid this double round
             */
            word_t mand = a[nwords_left_in_a-1] >> p_shift;
            a[nwords_left_in_a-1] &= (((word_t)1)<<p_shift)-1;
            if (p_shift && !repeat) {
                /* collect high bits when there are any */
                if (nwords_left_in_a < nwords_a) {
                    mand |= a[nwords_left_in_a] << (WORD_BITS-p_shift);
                    a[nwords_left_in_a] = 0;
                } else {
                    mand |= a_carry << (WORD_BITS-p_shift);
                }
            }
            
            word_t carry = widemac(a+nwords_left_in_a-nwords_p, nwords_p, p_lo, nwords_lo, mand, 0);
            assert(!carry);
            (void)carry;
        }
    }
    
    assert(nwords_left_in_a == nwords_p-1);
    
    /* OK, but it still isn't reduced.  Add and subtract p_lo. */
    word_t cout = add_nr_ext_packed(a,a,nwords_p,p_lo,nwords_lo,-1);
    if (p_shift) {
        cout = (cout<<(WORD_BITS-p_shift)) + (a[nwords_p-1]>>p_shift);
        a[nwords_p-1] &= (((word_t)1)<<p_shift)-1;
    }
    
    /* mask = carry-1: if no carry then do sub, otherwise don't */
    sub_nr_ext_packed(a,a,nwords_p,p_lo,nwords_lo,cout-1);
 }

 /* PERF: This function is horribly slow.  Enough to break 1%. */
 void
 barrett_mul_or_mac(
    word_t *accum,
    int nwords_accum,
    
    const word_t *a,
    int nwords_a,
    
    const word_t *b,
    int nwords_b,
    
    const word_t *p_lo,
    int nwords_p,
    int nwords_lo,
    int p_shift,
    
    mask_t doMac
 ) {
    assert(nwords_accum >= nwords_p);
    
    /* nwords_tmp = max(nwords_a + 1, nwords_p + 1, nwords_accum if doMac); */
    int nwords_tmp = (nwords_a > nwords_p) ? nwords_a : nwords_p;
    nwords_tmp++;
    if (nwords_tmp < nwords_accum && doMac)
        nwords_tmp = nwords_accum;
    
    word_t tmp[nwords_tmp];
    int bpos, i;
    
    for (i=0; i<nwords_tmp; i++) {
        tmp[i] = 0;
    }
    
    for (bpos=nwords_b-1; bpos >= 0; bpos--) {
        /* Invariant at the beginning of the loop: the high word is unused. */
        assert(tmp[nwords_tmp-1] == 0);
        
        /* shift up */
        for (i=nwords_tmp-2; i>=0; i--) {
            tmp[i+1] = tmp[i];
        }
        tmp[0] = 0;

        /* mac and reduce */
        word_t carry = widemac(tmp, nwords_tmp, a, nwords_a, b[bpos], 0);
        
        /* the mac can't carry, because nwords_tmp >= nwords_a+1 and its high word is clear */
        assert(!carry);
        barrett_reduce(tmp, nwords_tmp, carry, p_lo, nwords_p, nwords_lo, p_shift);
        
        /* at this point, the number of words used is nwords_p <= nwords_tmp-1,
         * so the high word is again clear */
    }
    
    if (doMac) {
        word_t cout = add_nr_packed(tmp, accum, nwords_accum);
        barrett_reduce(tmp, nwords_tmp, cout, p_lo, nwords_p, nwords_lo, p_shift);
    }
    
    for (i=0; i<nwords_tmp && i<nwords_accum; i++) {
        accum[i] = tmp[i];
    }
    for (; i<nwords_tmp; i++) {
        assert(tmp[i] == 0);
    }
    for (; i<nwords_accum; i++) {
        accum[i] = 0;
    }
 }
--- a/barrett_field.h
+++ b/barrett_field.h
@@ -1,126 +0,0 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __BARRETT_FIELD_H__
 #define __BARRETT_FIELD_H__ 1

 #include "word.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 void
 barrett_reduce(
    word_t *a,
    int nwords_a,
    word_t a_carry,
    const word_t *p_lo,
    int nwords_p,
    int nwords_lo,
    int p_shift
 );
    
 /*
 * out = a+(c&mask), with carry returned.
 * #out must equal #a (HACK?)
 */
 word_t
 add_nr_ext_packed(
    word_t *out,
    const word_t *a,
    int nwords_a,
    const word_t *c,
    int nwords_c,
    word_t mask
 );
    
 word_t
 sub_nr_ext_packed(
    word_t *out,
    const word_t *a,
    int nwords_a,
    const word_t *c,
    int nwords_c,
    word_t mask
 );
    
 void
 barrett_negate (
    word_t *a,
    int nwords_a,
    const word_t *p_lo,
    int nwords_p,
    int nwords_lo,
    int p_shift
 );

 /*
 * If doMac, accum = accum + a*b mod p.
 * Otherwise, accum = a*b mod p.
 *
 * This function is not __restrict__; you may pass accum,
 * a, b, etc all from the same location.
 */
 void
 barrett_mul_or_mac(
    word_t *accum,
    int nwords_accum,

    const word_t *a,
    int nwords_a,

    const word_t *b,
    int nwords_b,

    const word_t *p_lo,
    int nwords_p,
    int nwords_lo,
    int p_shift,
    
    mask_t doMac
 );
    
 static inline void
 barrett_mul(
    word_t *out,
    int nwords_out,

    const word_t *a,
    int nwords_a,

    const word_t *b,
    int nwords_b,

    const word_t *p_lo,
    int nwords_p,
    int nwords_lo,
    int p_shift
 ) {
    barrett_mul_or_mac(out,nwords_out,a,nwords_a,b,nwords_b,p_lo,nwords_p,nwords_lo,p_shift,0);
 }
    
 static inline void
 barrett_mac(
    word_t *out,
    int nwords_out,

    const word_t *a,
    int nwords_a,

    const word_t *b,
    int nwords_b,

    const word_t *p_lo,
    int nwords_p,
    int nwords_lo,
    int p_shift
 ) {
    barrett_mul_or_mac(out,nwords_out,a,nwords_a,b,nwords_b,p_lo,nwords_p,nwords_lo,p_shift,-1);
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __BARRETT_FIELD_H__ */
--- a/bench.c
+++ b/bench.c
@@ -1,827 +0,0 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include <sys/time.h>
 #include <sys/types.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <memory.h>

 #include "p448.h"
 #include "ec_point.h"
 #include "scalarmul.h"
 #include "barrett_field.h"
 #include "crandom.h"
 #include "goldilocks.h"
 #include "sha512.h"

 word_t q448_lo[4] = {
    0xdc873d6d54a7bb0dull,
    0xde933d8d723a70aaull,
    0x3bb124b65129c96full,
    0x000000008335dc16ull
 };

 double now() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  
  return tv.tv_sec + tv.tv_usec/1000000.0;
 }

 void p448_randomize( struct crandom_state_t *crand, struct p448_t *a ) {
    crandom_generate(crand, (unsigned char *)a, sizeof(*a));
    p448_strong_reduce(a);
 }

 void q448_randomize( struct crandom_state_t *crand, uint64_t sk[7] ) {
    crandom_generate(crand, (unsigned char *)sk, sizeof(uint64_t)*7);
 }

 void p448_print( const char *descr, const struct p448_t *a ) {
    p448_t b;
    p448_copy(&b, a);
    p448_strong_reduce(&b);
    int j;
    printf("%s = 0x", descr);
    for (j=7; j>=0; j--) {
        printf("%014llx", (unsigned long long)b.limb[j]);
    }
    printf("\n");
 }

 void p448_print_full( const char *descr, const struct p448_t *a ) {
    int j;
    printf("%s = 0x", descr);
    for (j=7; j>=0; j--) {
        printf("%02llx_%014llx ", a->limb[j]>>56, (unsigned long long)a->limb[j]&(1ull<<56)-1);
    }
    printf("\n");
 }

 void q448_print( const char *descr, const uint64_t secret[7] ) {
    int j;
    printf("%s = 0x", descr);
    for (j=6; j>=0; j--) {
        printf("%016llx", (unsigned long long)secret[j]);
    }
    printf("\n");
 }

 int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    struct tw_extensible_t ext;
    struct extensible_t exta;
    struct tw_niels_t niels;
    struct tw_pniels_t pniels;
    struct affine_t affine;
    struct montgomery_t mb;
    struct p448_t a,b,c,d;
    
    
    double when;
    int i,j;
    
    /* Bad randomness so we can debug. */
    char initial_seed[32];
    for (i=0; i<32; i++) initial_seed[i] = i;
    struct crandom_state_t crand;
    crandom_init_from_buffer(&crand, initial_seed);
    
    uint64_t sk[7],tk[7];
    q448_randomize(&crand, sk);
    
    when = now();
    for (i=0; i<10000000; i++) {
        p448_mul(&c, &b, &a);
    }
    when = now() - when;
    printf("mul:         %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<10000000; i++) {
        p448_sqr(&c, &a);
    }
    when = now() - when;
    printf("sqr:         %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<5000000; i++) {
        p448_mul(&c, &b, &a);
        p448_mul(&a, &b, &c);
    }
    when = now() - when;
    printf("mul dep:     %5.1fns\n", when * 1e9 / i / 2);
    
    when = now();
    for (i=0; i<10000000; i++) {
        p448_mulw(&c, &b, 1234562);
    }
    when = now() - when;
    printf("mulw:        %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<100000; i++) {
        p448_randomize(&crand, &a);
    }
    when = now() - when;
    printf("rand448:     %5.1fns\n", when * 1e9 / i);
    
    struct sha512_ctx_t sha;
    uint8_t hashout[128];
    when = now();
    for (i=0; i<10000; i++) {
        sha512_init(&sha);
        sha512_final(&sha, hashout);
    }
    when = now() - when;
    printf("sha512 1blk: %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<10000; i++) {
        sha512_update(&sha, hashout, 128);
    }
    when = now() - when;
    printf("sha512 blk:  %5.1fns (%0.2f MB/s)\n", when * 1e9 / i, 128*i/when/1e6);
    
    when = now();
    for (i=0; i<10000; i++) {
        p448_isr(&c, &a);
    }
    when = now() - when;
    printf("isr auto:    %5.1fµs\n", when * 1e6 / i);
    
    for (i=0; i<100; i++) {
        p448_randomize(&crand, &a);
        p448_isr(&d,&a);
        p448_sqr(&b,&d);
        p448_mul(&c,&b,&a);
        p448_sqr(&b,&c);
        p448_subw(&b,1);
        p448_bias(&b,1);
        if (!p448_is_zero(&b)) {
            printf("ISR validation failure!\n");
            p448_print("a", &a);
            p448_print("s", &d);
        }
    }
    
    when = now();
    for (i=0; i<10000; i++) {
        elligator_2s_inject(&affine, &a);
    }
    when = now() - when;
    printf("elligator:   %5.1fµs\n", when * 1e6 / i);
    
    for (i=0; i<100; i++) {
        p448_randomize(&crand, &a);
        elligator_2s_inject(&affine, &a);
        if (!validate_affine(&affine)) {
            printf("Elligator validation failure!\n");
            p448_print("a", &a);
            p448_print("x", &affine.x);
            p448_print("y", &affine.y);
        }
    }
    
    when = now();
    for (i=0; i<10000; i++) {
        deserialize_affine(&affine, &a);
    }
    when = now() - when;
    printf("decompress:  %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<10000; i++) {
        serialize_extensible(&a, &exta);
    }
    when = now() - when;
    printf("compress:    %5.1fµs\n", when * 1e6 / i);
    
    int goods = 0;
    for (i=0; i<100; i++) {
        p448_randomize(&crand, &a);
        mask_t good = deserialize_affine(&affine, &a);
        if (good & !validate_affine(&affine)) {
            printf("Deserialize validation failure!\n");
            p448_print("a", &a);
            p448_print("x", &affine.x);
            p448_print("y", &affine.y);
        } else if (good) {
            goods++;
            convert_affine_to_extensible(&exta,&affine);
            serialize_extensible(&b, &exta);
            p448_sub(&c,&b,&a);
            p448_bias(&c,2);
            if (!p448_is_zero(&c)) {
                printf("Reserialize validation failure!\n");
                p448_print("a", &a);
                p448_print("x", &affine.x);
                p448_print("y", &affine.y);
                deserialize_affine(&affine, &b);
                p448_print("b", &b);
                p448_print("x", &affine.x);
                p448_print("y", &affine.y);
                printf("\n");
            }
        }
    }
    if (goods<i/3) {
        printf("Deserialization validation failure! Deserialized %d/%d points\n", goods, i);
    }
    
    uint64_t lsk[12];
    for (i=0;i<10; i++) {
        for (j=11; j>=0; j--) {
            lsk[j] = random();
            lsk[j] = lsk[j]<<22 ^ random();
            lsk[j] = lsk[j]<<22 ^ random();
        }
    }
    
    when = now();
    for (i=0; i<1000000; i++) {
        barrett_reduce(lsk,12,0,q448_lo,7,4,62);
    }
    when = now() - when;
    printf("barrett red: %5.1fns\n", when * 1e9 / i);
    // 
    // when = now();
    // for (i=0; i<100000; i++) {
    //     barrett_mac(lsk,7,lsk,7,lsk,7,q448_lo,7,4,62);
    // }
    // when = now() - when;
    // printf("barrett mac: %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<1000000; i++) {
        add_tw_niels_to_tw_extensible(&ext, &niels);
    }
    when = now() - when;
    printf("exti+niels:  %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<1000000; i++) {
        add_tw_pniels_to_tw_extensible(&ext, &pniels);
    }
    when = now() - when;
    printf("exti+pniels: %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<1000000; i++) {
        double_tw_extensible(&ext);
    }
    when = now() - when;
    printf("exti dbl:    %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<1000000; i++) {
        untwist_and_double(&exta, &ext);
    }
    when = now() - when;
    printf("i->a isog:   %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<1000000; i++) {
        twist_and_double(&ext, &exta);
    }
    when = now() - when;
    printf("a->i isog:   %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<1000000; i++) {
        montgomery_step(&mb);
    }
    when = now() - when;
    printf("monty step:  %5.1fns\n", when * 1e9 / i);
 	
    when = now();
    for (i=0; i<1000; i++) {
        p448_montgomery_ladder(&a,&b,sk,448,0);
    }
    when = now() - when;
    printf("full ladder: %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<1000; i++) {
        edwards_scalar_multiply(&ext,sk);
    }
    when = now() - when;
    printf("edwards smz: %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<1000; i++) {
        edwards_scalar_multiply_vlook(&ext,sk);
        untwist_and_double_and_serialize(&a,&ext);
    }
    when = now() - when;
    printf("edwards svl: %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<1000; i++) {
        q448_randomize(&crand, sk);
        edwards_scalar_multiply_vt(&ext,sk);
    }
    when = now() - when;
    printf("edwards vtm: %5.1fµs\n", when * 1e6 / i);
    
    struct tw_niels_t wnaft[1<<6];
    when = now();
    for (i=0; i<1000; i++) {
        precompute_for_wnaf(wnaft,&ext,6);
    }
    when = now() - when;
    printf("wnaf6 pre:   %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<1000; i++) {
        q448_randomize(&crand, sk);
        edwards_scalar_multiply_vt_pre(&ext,sk,wnaft,6);
    }
    when = now() - when;
    printf("edwards vt6: %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<1000; i++) {
        precompute_for_wnaf(wnaft,&ext,4);
    }
    when = now() - when;
    printf("wnaf4 pre:   %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<1000; i++) {
        q448_randomize(&crand, sk);
        edwards_scalar_multiply_vt_pre(&ext,sk,wnaft,4);
    }
    when = now() - when;
    printf("edwards vt4: %5.1fµs\n", when * 1e6 / i);

    when = now();
    for (i=0; i<1000; i++) {
        precompute_for_wnaf(wnaft,&ext,5);
    }
    when = now() - when;
    printf("wnaf5 pre:   %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<1000; i++) {
        q448_randomize(&crand, sk);
        edwards_scalar_multiply_vt_pre(&ext,sk,wnaft,5);
    }
    when = now() - when;
    printf("edwards vt5: %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<1000; i++) {
        q448_randomize(&crand, sk);
        q448_randomize(&crand, tk);
        edwards_combo_var_fixed_vt(&ext,sk,tk,wnaft,5);
    }
    when = now() - when;
    printf("vt vf combo: %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<1000; i++) {
        deserialize_affine(&affine, &a);
        convert_affine_to_extensible(&exta,&affine);
        twist_and_double(&ext,&exta);
        edwards_scalar_multiply(&ext,sk);
        untwist_and_double(&exta,&ext);
        serialize_extensible(&b, &exta);
    }
    when = now() - when;
    printf("edwards sm:  %5.1fµs\n", when * 1e6 / i);
    
    struct tw_niels_t table[80] __attribute__((aligned(32)));

    while (1) {
        p448_randomize(&crand, &a);
        if (deserialize_affine(&affine, &a)) break;
    }
    convert_affine_to_extensible(&exta,&affine);
    twist_and_double(&ext,&exta);
    when = now();
    for (i=0; i<1000; i++) {
        precompute_for_combs(table, &ext, 5, 5, 18);
    }
    when = now() - when;
    printf("pre(5,5,18): %5.1fµs\n", when * 1e6 / i);
 	
    when = now();
    for (i=0; i<10000; i++) {
        edwards_comb(&ext, sk, table, 5, 5, 18);
    }
    when = now() - when;
    printf("com(5,5,18): %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<10000; i++) {
        edwards_comb(&ext, sk, table, 3, 5, 30);
    }
    when = now() - when;
    printf("com(3,5,30): %5.1fµs\n", when * 1e6 / i);

    when = now();
    for (i=0; i<10000; i++) {
        edwards_comb(&ext, sk, table, 8, 4, 14);
    }
    when = now() - when;
    printf("com(4,4,28): %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<10000; i++) {
        q448_randomize(&crand, sk);
        edwards_comb(&ext, sk, table, 5, 5, 18);
        untwist_and_double(&exta,&ext);
        serialize_extensible(&b, &exta);
    }
    when = now() - when;
    printf("keygen:      %5.1fµs\n", when * 1e6 / i);
    
    printf("\nGoldilocks:\n");
    
    int res = goldilocks_init();
    assert(!res);
    
    struct goldilocks_public_key_t gpk,hpk;
    struct goldilocks_private_key_t gsk,hsk;
    
    when = now();
    for (i=0; i<10000; i++) {
        if (i&1) {
            res = goldilocks_keygen(&gsk,&gpk);
        } else {
            res = goldilocks_keygen(&hsk,&hpk);
        }
        assert(!res);
    }
    when = now() - when;
    printf("keygen:      %5.1fµs\n", when * 1e6 / i);
    
    uint8_t ss1[64],ss2[64];
    int gres1,gres2;
    when = now();
    for (i=0; i<10000; i++) {
        if (i&1) {
            gres1 = goldilocks_shared_secret(ss1,&gsk,&hpk);
        } else {
            gres2 = goldilocks_shared_secret(ss2,&hsk,&gpk);
        }
    }
    when = now() - when;
    printf("ecdh:        %5.1fµs\n", when * 1e6 / i);
    if (gres1 || gres2 || memcmp(ss1,ss2,64)) {
        printf("[FAIL] %d %d\n",gres1,gres2);
        
        printf("ss1 = ");
        for (i=0; i<56; i++) {
            printf("%02x", ss1[i]);
        }
        printf("\nss2 = ");
        for (i=0; i<56; i++) {
            printf("%02x", ss2[i]);
        }
        printf("\n");
    }
    
    uint8_t sout[56*2];
    const char *message = "hello world";
    uint64_t message_len = strlen(message);
    when = now();
    for (i=0; i<10000; i++) {
        res = goldilocks_sign(sout,(const unsigned char *)message,message_len,&gsk);
        assert(!res);
    }
    when = now() - when;
    printf("sign:        %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<10000; i++) {
        res = goldilocks_verify(sout,(const unsigned char *)message,message_len,&gpk);
    }
    when = now() - when;
    printf("verify:      %5.1fµs\n", when * 1e6 / i);
    
    printf("\nTesting...\n");
    
    
    int failures=0, successes = 0;
    for (i=0; i<1000; i++) {
        (void)goldilocks_keygen(&gsk,&gpk);
        goldilocks_sign(sout,(const unsigned char *)message,message_len,&gsk);
        res = goldilocks_verify(sout,(const unsigned char *)message,message_len,&gpk);
        if (res) failures++;
    }
    if (failures) {
        printf("FAIL %d/%d signature checks!\n", failures, i);
    }
    
    failures=0; successes = 0;
    for (i=0; i<1000; i++) {
        p448_randomize(&crand, &a);
 		uint64_t two = 2;
        mask_t good = p448_montgomery_ladder(&b,&a,&two,2,0);
 		if (!good) continue;
 		
 		uint64_t x = rand(), y=rand(), z=x*y;
 		p448_montgomery_ladder(&b,&a,&x,64,0);
        p448_montgomery_ladder(&c,&b,&y,64,0);
        p448_montgomery_ladder(&b,&a,&z,64,0);
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
 		if (!p448_is_zero(&d)) {
            printf("Odd ladder validation failure %d!\n", ++failures);
            p448_print("a", &a);
            printf("x=%llx, y=%llx, z=%llx\n", x,y,z);
            p448_print("c", &c);
            p448_print("b", &b);
 			printf("\n");
 		}
 	}
    
    failures = 0;
    for (i=0; i<1000; i++) {
        mask_t good;
        do {
            p448_randomize(&crand, &a);
            good = deserialize_affine(&affine, &a);
        } while (!good);
        
        convert_affine_to_extensible(&exta,&affine);
        twist_and_double(&ext,&exta);
        untwist_and_double(&exta,&ext);
        serialize_extensible(&b, &exta);
        untwist_and_double_and_serialize(&c, &ext);
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
        
        if (good && !p448_is_zero(&d)){
            printf("Iso+serial validation failure %d!\n", ++failures);
            p448_print("a", &a);
            p448_print("b", &b);
            p448_print("c", &c);
            printf("\n");
        } else if (good) {
            successes ++;
        }
    }
    if (successes < i/3) {
        printf("Iso+serial variation: only %d/%d successful.\n", successes, i);
    }
        
    failures = 0;
    uint64_t four = 4;
    for (i=0; i<1000; i++) {
        p448_randomize(&crand, &a);
        q448_randomize(&crand, sk);
        
        mask_t good = p448_montgomery_ladder(&b,&a,&four,3,0);
        good &= p448_montgomery_ladder(&c,&b,sk,448,0);
        
        mask_t goodb = deserialize_affine(&affine, &a);
        convert_affine_to_extensible(&exta,&affine);
        twist_and_double(&ext,&exta);
        edwards_scalar_multiply(&ext,sk);
        untwist_and_double(&exta,&ext);
        serialize_extensible(&b, &exta);
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
        
        if (good != goodb) {
            printf("Compatibility validation failure %d: good: %d != %d\n", ++failures, (int)(-good), (int)(-goodb));
        } else if (good && !p448_is_zero(&d)){
            printf("Compatibility validation failure %d!\n", ++failures);
            p448_print("a", &a);
            q448_print("s", sk);
            p448_print("c", &c);
            p448_print("b", &b);
            printf("\n");
        } else if (good) {
            successes ++;
        }
    }
    if (successes < i/3) {
        printf("Compatibility variation: only %d/%d successful.\n", successes, i);
    }
        
    successes = failures = 0;
    for (i=0; i<1000; i++) {
        p448_randomize(&crand, &a);
        q448_randomize(&crand, sk);
 		if (!i) bzero(&sk, sizeof(sk));
        
        mask_t good = p448_montgomery_ladder(&b,&a,&four,3,0);
        good &= p448_montgomery_ladder(&c,&b,sk,448,0);
        if (!good) continue;
        
        deserialize_affine(&affine, &a);
        convert_affine_to_extensible(&exta,&affine);
        twist_and_double(&ext,&exta);
        
        precompute_for_combs(table, &ext, 5, 5, 18);
        edwards_comb(&ext, sk, table, 5, 5, 18);
        untwist_and_double(&exta,&ext);
        serialize_extensible(&b, &exta);
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
        
        if (!p448_is_zero(&d)){
            printf("Comb validation failure %d!\n", ++failures);
            p448_print("a", &a);
            q448_print("s", sk);
            p448_print("c", &c);
            p448_print("b", &b);
            printf("\n");
        } else if (good) {
            successes ++;
        }
    }
    if (successes < i/3) {
        printf("Comb variation: only %d/%d successful.\n", successes, i);
    }
        
    successes = failures = 0;
    for (i=0; i<1000; i++) {
        p448_randomize(&crand, &a);
        q448_randomize(&crand, sk);
 		if (!i) bzero(&sk, sizeof(sk));
        
        mask_t good = deserialize_affine(&affine, &a);
        if (!good) continue;
        
        convert_affine_to_extensible(&exta,&affine);
        twist_and_double(&ext,&exta);
        struct tw_extensible_t exu;
        copy_tw_extensible(&exu, &ext);
        
        edwards_scalar_multiply(&ext,sk);
        untwist_and_double(&exta,&ext);
        serialize_extensible(&b, &exta);
        
        edwards_scalar_multiply_vt(&exu,sk);
        untwist_and_double(&exta,&exu);
        serialize_extensible(&c, &exta);
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
        
        if (!p448_is_zero(&d)){
            printf("WNAF validation failure %d!\n", ++failures);
            p448_print("a", &a);
            q448_print("s", sk);
            p448_print("c", &c);
            p448_print("b", &b);
            printf("\n");
        } else if (good) {
            successes ++;
        }
    }
    if (successes < i/3) {
        printf("WNAF variation: only %d/%d successful.\n", successes, i);
    }
        
    successes = failures = 0;
    for (i=0; i<1000; i++) {
        p448_randomize(&crand, &a);
        q448_randomize(&crand, sk);
 		if (!i) bzero(&sk, sizeof(sk));
        
        mask_t good = deserialize_affine(&affine, &a);
        if (!good) continue;
        
        convert_affine_to_extensible(&exta,&affine);
        twist_and_double(&ext,&exta);
        struct tw_extensible_t exu;
        copy_tw_extensible(&exu, &ext);
        
        edwards_scalar_multiply(&ext,sk);
        untwist_and_double(&exta,&ext);
        serialize_extensible(&b, &exta);

        precompute_for_wnaf(wnaft,&exu,5);
        edwards_scalar_multiply_vt_pre(&exu,sk,wnaft,5);
        untwist_and_double(&exta,&exu);
        serialize_extensible(&c, &exta);
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
        
        if (!p448_is_zero(&d)){
            printf("PreWNAF validation failure %d!\n", ++failures);
            p448_print("a", &a);
            q448_print("s", sk);
            p448_print("c", &c);
            p448_print("b", &b);
            for (j=0; j<1<<5; j++) {
                printf("WNAFT %d\n", j);
                p448_print("  a",&wnaft[j].a);
                p448_print("  b",&wnaft[j].b);
                p448_print("  c",&wnaft[j].c);
            }
            printf("\n\n");
        } else if (good) {
            successes ++;
        }
    }
    if (successes < i/3) {
        printf("PreWNAF variation: only %d/%d successful.\n", successes, i);
    }
    
    successes = failures = 0;
    for (i=0; i<1000; i++) {
        struct p448_t aa;
        struct tw_extensible_t exu,exv,exw;
        
        mask_t good;
        do {
            p448_randomize(&crand, &a);
            good = deserialize_affine(&affine, &a);
            convert_affine_to_extensible(&exta,&affine);
            twist_and_double(&ext,&exta);
        } while (!good);
        do {
            p448_randomize(&crand, &aa);
            good = deserialize_affine(&affine, &aa);
            convert_affine_to_extensible(&exta,&affine);
            twist_and_double(&exu,&exta);
        } while (!good);
        p448_randomize(&crand, &aa);
        
        q448_randomize(&crand, sk);
 		if (i==0 || i==2) bzero(&sk, sizeof(sk));
        q448_randomize(&crand, tk);
 		if (i==0 || i==1) bzero(&tk, sizeof(tk));
        
        copy_tw_extensible(&exv, &ext);
        copy_tw_extensible(&exw, &exu);
        edwards_scalar_multiply(&exv,sk);
        edwards_scalar_multiply(&exw,tk);
        convert_tw_extensible_to_tw_pniels(&pniels, &exw);
        add_tw_pniels_to_tw_extensible(&exv,&pniels);
        untwist_and_double(&exta,&exv);
        serialize_extensible(&b, &exta);

        precompute_for_wnaf(wnaft,&exu,5);
        edwards_combo_var_fixed_vt(&ext,sk,tk,wnaft,5);
        untwist_and_double(&exta,&exv);
        serialize_extensible(&c, &exta);
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
        
        if (!p448_is_zero(&d)){
            printf("PreWNAF combo validation failure %d!\n", ++failures);
            p448_print("a", &a);
            p448_print("A", &aa);
            q448_print("s", sk);
            q448_print("t", tk);
            p448_print("c", &c);
            p448_print("b", &b);
            printf("\n\n");
        } else if (good) {
            successes ++;
        }
    }
    if (successes < i) {
        printf("PreWNAF combo variation: only %d/%d successful.\n", successes, i);
    }
    
    successes = failures = 0;
    for (i=0; i<1000; i++) {
        p448_randomize(&crand, &a);
        
        q448_randomize(&crand, sk);
        q448_randomize(&crand, tk);
        
 		uint64_t two = 2;
        mask_t good = p448_montgomery_ladder(&b,&a,&two,2,0);
 		p448_montgomery_ladder(&b,&a,sk,448,0);
        p448_montgomery_ladder(&d,&b,tk,448,0);
        p448_montgomery_ladder(&b,&a,tk,448,0);
        p448_montgomery_ladder(&c,&b,sk,448,0);
        
        p448_sub(&b,&c,&d);
        p448_bias(&b,2);
        
        mask_t success = p448_is_zero(&b) | ~good;
        
        if (!success) {
            printf("Ladder validation failure %d!\n", ++failures);
            p448_print("a", &a);
            q448_print("s", sk);
            q448_print("t", tk);
            p448_print("c", &c);
            p448_print("d", &d);
            printf("\n");
        }
    }
    
    return 0;
 }
--- a/crandom.c
+++ b/crandom.c
@@ -1,442 +0,0 @@
 /* Copyright (c) 2011 Stanford University.
 * Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 /* Chacha random number generator code copied from crandom */

 #include "intrinsics.h"
 #include "crandom.h"
 #include <stdio.h>

 volatile unsigned int crandom_features = 0;

 unsigned int crandom_detect_features() {
  unsigned int out = GEN;
  
 # if (defined(__i386__) || defined(__x86_64__))
    u_int32_t a,b,c,d;
    
    a=1; __asm__("cpuid" : "+a"(a), "=b"(b), "=c"(c), "=d"(d));
    out |= GEN;
    if (d & 1<<26) out |= SSE2;
    if (d & 1<< 9) out |= SSSE3;
    if (c & 1<<25) out |= AESNI;
    if (c & 1<<28) out |= AVX;
    if (b & 1<<5) out  |= AVX2;
    
    a=0x80000001; __asm__("cpuid" : "+a"(a), "=b"(b), "=c"(c), "=d"(d));
    if (c & 1<<11) out |= XOP;
    if (c & 1<<30) out |= RDRAND;
 # endif
  
  return out;
 }



 INTRINSIC u_int64_t rdrand(int abort_on_fail) {
    uint64_t out = 0;
    int tries = 1000;
    
    if (HAVE(RDRAND)) {
    # if defined(__x86_64__)
        u_int64_t out, a=0;
        for (; tries && !a; tries--) {
            __asm__ __volatile__ (
                "rdrand %0\n\tsetc %%al"
                    : "=r"(out), "+a"(a) :: "cc"
            );
        }
    # elif (defined(__i386__))
        u_int32_t reg, a=0;
        uint64_t out;
        for (; tries && !a; tries--) {
            __asm__ __volatile__ (
                "rdrand %0\n\tsetc %%al"
                    : "=r"(reg), "+a"(a) :: "cc"
            );
        }
        out = reg; a = 0;
        for (; tries && !a; tries--) {
            __asm__ __volatile__ (
                "rdrand %0\n\tsetc %%al"
                    : "=r"(reg), "+a"(a) :: "cc"
            );
        }
        out = out << 32 | reg;
        return out;
    # else
        abort(); // whut
    # endif
    } else {
        tries = 0;
    }
    
    if (abort_on_fail && !tries) {
        abort();
    }
    
    return out;
 }


 /* ------------------------------- Vectorized code ------------------------------- */
 #define shuffle(x,i) _mm_shuffle_epi32(x, \
  i + ((i+1)&3)*4 + ((i+2)&3)*16 + ((i+3)&3)*64)

 #define add _mm_add_epi32
 #define add64 _mm_add_epi64

 #define NEED_XOP   (MIGHT_HAVE(XOP))
 #define NEED_SSSE3 (MIGHT_HAVE(SSSE3) && !MUST_HAVE(XOP))
 #define NEED_SSE2  (MIGHT_HAVE(SSE2)  && !MUST_HAVE(SSSE3))
 #define NEED_CONV  (!MUST_HAVE(SSE2))

 #if NEED_XOP
 static __inline__ void
 quarter_round_xop(
    ssereg *a,
    ssereg *b,
    ssereg *c,
    ssereg *d
 ) {
    *a = add(*a,*b); *d = xop_rotate(16, *d ^ *a);
    *c = add(*c,*d); *b = xop_rotate(12, *b ^ *c);
    *a = add(*a,*b); *d = xop_rotate(8,  *d ^ *a);
    *c = add(*c,*d); *b = xop_rotate(7,  *b ^ *c);
 }
 #endif

 #if NEED_SSSE3
 static const ssereg shuffle8  = { 0x0605040702010003ull, 0x0E0D0C0F0A09080Bull };
 static const ssereg shuffle16 = { 0x0504070601000302ull, 0x0D0C0F0E09080B0Aull };
  
 INTRINSIC ssereg ssse3_rotate_8(ssereg a) {
    return _mm_shuffle_epi8(a, shuffle8);
 }
  
 INTRINSIC ssereg ssse3_rotate_16(ssereg a) {
    return _mm_shuffle_epi8(a, shuffle16);
 }
  
 static __inline__ void
 quarter_round_ssse3(
    ssereg *a,
    ssereg *b,
    ssereg *c,
    ssereg *d
 ) {
    *a = add(*a,*b); *d = ssse3_rotate_16(*d ^ *a);
    *c = add(*c,*d); *b = sse2_rotate(12, *b ^ *c);
    *a = add(*a,*b); *d = ssse3_rotate_8( *d ^ *a);
    *c = add(*c,*d); *b = sse2_rotate(7,  *b ^ *c);
 }
 #endif /* MIGHT_HAVE(SSSE3) && !MUST_HAVE(XOP) */

 #if NEED_SSE2
 static __inline__ void
 quarter_round_sse2(
    ssereg *a,
    ssereg *b,
    ssereg *c,
    ssereg *d
 ) {
    *a = add(*a,*b); *d = sse2_rotate(16, *d ^ *a);
    *c = add(*c,*d); *b = sse2_rotate(12, *b ^ *c);
    *a = add(*a,*b); *d = sse2_rotate(8,  *d ^ *a);
    *c = add(*c,*d); *b = sse2_rotate(7,  *b ^ *c);
 }
 #endif

 #define DOUBLE_ROUND(qrf) { \
  qrf(&a1,&b1,&c1,&d1);     \
  qrf(&a2,&b2,&c2,&d2);     \
  b1 = shuffle(b1,1);       \
  c1 = shuffle(c1,2);       \
  d1 = shuffle(d1,3);       \
  b2 = shuffle(b2,1);       \
  c2 = shuffle(c2,2);       \
  d2 = shuffle(d2,3);       \
                            \
  qrf(&a1,&b1,&c1,&d1);     \
  qrf(&a2,&b2,&c2,&d2);     \
  b1 = shuffle(b1,3);       \
  c1 = shuffle(c1,2);       \
  d1 = shuffle(d1,1);       \
  b2 = shuffle(b2,3);       \
  c2 = shuffle(c2,2);       \
  d2 = shuffle(d2,1);       \
                          }
                          
 #define OUTPUT_FUNCTION   { \
  output[0] = add(a1,aa);   \
  output[1] = add(b1,bb);   \
  output[2] = add(c1,cc);   \
  output[3] = add(d1,dd);   \
  output[4] = add(a2,aa);   \
  output[5] = add(b2,bb);   \
  output[6] = add(c2,add(cc,p)); \
  output[7] = add(d2,dd);   \
                            \
  output += 8;              \
                            \
  cc = add64(add64(cc,p), p); \
  a1 = a2 = aa;             \
  b1 = b2 = bb;             \
  c1 = cc; c2 = add64(cc,p);\
  d1 = d2 = dd;             \
                          }
 /* ------------------------------------------------------------------------------- */

 INTRINSIC u_int32_t rotate(int r, u_int32_t a) {
    return a<<r ^ a>>(32-r);
 }

 static __inline__ void
 quarter_round(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d) {
    *a = *a + *b; *d = rotate(16, *d^*a);
    *c = *c + *d; *b = rotate(12, *b^*c);
    *a = *a + *b; *d = rotate(8,  *d^*a);
    *c = *c + *d; *b = rotate(7,  *b^*c);
 }

 static void
 crandom_chacha_expand(u_int64_t iv,
                         u_int64_t ctr,
                         int nr,
                         int output_size,
                         const unsigned char *key_,
                         unsigned char *output_) {
 # if MIGHT_HAVE_SSE2
    if (HAVE(SSE2)) {
        ssereg *key = (ssereg *)key_;
        ssereg *output = (ssereg *)output_;
                 
        ssereg a1 = key[0], a2 = a1, aa = a1,
               b1 = key[1], b2 = b1, bb = b1,
               c1 = {iv, ctr}, c2 = {iv, ctr+1}, cc = c1,
               d1 = {0x3320646e61707865ull, 0x6b20657479622d32ull},
               d2 = d1, dd = d1,
               p = {0, 1};
 
        int i,r;
 #   if (NEED_XOP)
        if (HAVE(XOP)) {
            for (i=0; i<output_size; i+=128) {
                for (r=nr; r>0; r-=2)
                    DOUBLE_ROUND(quarter_round_xop);
                OUTPUT_FUNCTION;
            }
            return;
        }
 #   endif
 #   if (NEED_SSSE3)
        if (HAVE(SSSE3)) {
            for (i=0; i<output_size; i+=128) {
                for (r=nr; r>0; r-=2)
                    DOUBLE_ROUND(quarter_round_ssse3);
                OUTPUT_FUNCTION;
            }
            return;
        }
 #   endif
 #   if (NEED_SSE2)
        if (HAVE(SSE2)) {
            for (i=0; i<output_size; i+=128) {
                for (r=nr; r>0; r-=2)
                    DOUBLE_ROUND(quarter_round_sse2);
                OUTPUT_FUNCTION;
            }
            return;
        }
 #   endif
    }
 # endif

 # if NEED_CONV
    {
        const u_int32_t *key = (const u_int32_t *)key_;
        u_int32_t
        x[16],
        input[16] = {
            key[0], key[1], key[2], key[3],
            key[4], key[5], key[6], key[7],
            iv, iv>>32, ctr, ctr>>32,
            0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
        },
        *output = (u_int32_t *)output_;
        int i, r;

        for (i=0; i<output_size; i+= 64) {
            for (r=0; r<16; r++) {
                x[r] = input[r];
            }
                for (r=nr; r>0; r-=2) {
                quarter_round(&x[0], &x[4],  &x[8], &x[12]);
                quarter_round(&x[1], &x[5],  &x[9], &x[13]);
                quarter_round(&x[2], &x[6], &x[10], &x[14]);
                quarter_round(&x[3], &x[7], &x[11], &x[15]);

                quarter_round(&x[0], &x[5], &x[10], &x[15]);
                quarter_round(&x[1], &x[6], &x[11], &x[12]);
                quarter_round(&x[2], &x[7],  &x[8], &x[13]);
                quarter_round(&x[3], &x[4],  &x[9], &x[14]);
            }
            for (r=0; r<16; r++) {
                output[r] = x[r] + input[r];
            }

            output += 16;
            input[11] ++;
            if (!input[11]) input[12]++;
        }
    }
  
 #endif /* NEED_CONV */
 }

 /* "return 4", cf xkcd #221 */
 #define CRANDOM_MAGIC 0x72657475726e2034ull

 int
 crandom_init_from_file(
    struct crandom_state_t *state,
    const char *filename,
    int reseed_interval,
    int reseeds_mandatory
 ) {
    state->fill = 0;
    state->reseed_countdown = reseed_interval;
    state->reseed_interval = reseed_interval;
    state->ctr = 0;

    state->randomfd = open(filename, O_RDONLY);
    if (state->randomfd == -1) {
        int err = errno;
        return err ? err : -1;
    }

    ssize_t offset = 0, red;
    do {
        red = read(state->randomfd, state->seed + offset, 32 - offset);
        if (red > 0) offset += red;
    } while (red > 0 && offset < 32);

    if (offset < 32) {
        int err = errno;
        return err ? err : -1;
    }

    memset(state->buffer, 0, 96);

    state->magic = CRANDOM_MAGIC;
    state->reseeds_mandatory = reseeds_mandatory;

    return 0;
 }

 void
 crandom_init_from_buffer(
    struct crandom_state_t *state,
    const char initial_seed[32]
 ) {
    memcpy(state->seed, initial_seed, 32);
    memset(state->buffer, 0, 96);
    state->reseed_countdown = state->reseed_interval = state->fill = state->ctr = state->reseeds_mandatory = 0;
    state->randomfd = -1;
    state->magic = CRANDOM_MAGIC;
 }

 int
 crandom_generate(
    struct crandom_state_t *state,
    unsigned char *output,
    unsigned long long length
 ) {
    /* the generator isn't seeded; maybe they ignored the return value of init_from_file */
    if (unlikely(state->magic != CRANDOM_MAGIC)) {
        abort();
    }

    int ret = 0;

    while (length) {
        if (unlikely(state->fill <= 0)) {
            uint64_t iv = 0;
            if (state->reseed_interval) {
                /* it's nondeterministic, stir in some rdrand() or rdtsc() */
                if (HAVE(RDRAND)) {
                    iv = rdrand(0);
                    if (!iv) iv = rdtsc();
                } else {
                    iv = rdtsc();
                }

                state->reseed_countdown--;
                if (unlikely(state->reseed_countdown <= 0)) {
                    /* reseed by xoring in random state */
                    state->reseed_countdown = state->reseed_interval;
                    ssize_t offset = 0, red;
                    do {
                        red = read(state->randomfd, state->buffer + offset, 32 - offset);
                        if (red > 0) offset += red;
                    } while (red > 0 && offset < 32);

                    if (offset < 32) {
                        /* The read failed.  Signal an error with the return code.
                         *
                         * If reseeds are mandatory, crash.
                         *
                         * If not, the generator is still probably safe to use, because reseeding
                         * is basically over-engineering for caution.  Also, the user might ignore
                         * the return code, so we still need to fill the request.
                         *
                         * Set reseed_countdown = 1 so we'll try again later.  If the user's
                         * performance sucks as a result of ignoring the error code while calling
                         * us in a loop, well, that's life.
                         */
                        if (state->reseeds_mandatory) {
                            abort();
                        }

                        ret = errno;
                        if (ret == 0) ret = -1;
                        state->reseed_countdown = 1;
                    }

                    int i;
                    for (i=0; i<32; i++) {
                        /* Stir in the buffer.  If somehow the read failed, it'll be zeros. */
                        state->seed[i] ^= state->buffer[i];
                    }
                }
            }
            crandom_chacha_expand(iv,state->ctr,20,128,state->seed,state->seed);
            state->ctr++;
            state->fill = sizeof(state->buffer);
        }

        unsigned long long copy = (length > state->fill) ? state->fill : length;
        state->fill -= copy;
        memcpy(output, state->buffer + state->fill, copy);
        memset(state->buffer + state->fill, 0, copy);
        output += copy; length -= copy;
    }

    return ret;
 }

 void
 crandom_destroy(
    struct crandom_state_t *state
 ) { 
    if (state->magic == CRANDOM_MAGIC && state->randomfd) {
        (void) close(state->randomfd);
        /* Ignore the return value from close(), because what would it mean?
         * "Your random device, which you were reading over NFS, lost some data"?
         */
    }

    memset(state, 0, sizeof(*state));
 }
--- a/crandom.h
+++ b/crandom.h
@@ -1,140 +0,0 @@
 /* Copyright (c) 2011 Stanford University.
 * Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 /**
 * @file crandom.h
 * @author Mike Hamburg
 * @brief A miniature version of the (as of yet incomplete) crandom project.
 */

 #ifndef __GOLDI_CRANDOM_H__
 #define __GOLDI_CRANDOM_H__ 1

 #include <stdint.h>  /* for uint64_t */
 #include <fcntl.h>   /* for open */
 #include <errno.h>   /* for returning errors after open */
 #include <stdlib.h>  /* for abort */
 #include <string.h>  /* for memcpy */
 #include <strings.h> /* for bzero */
 #include <unistd.h>  /* for read */

 /**
 * @brief The state of a crandom generator.
 *
 * This object is opaque.  It is not protected by a lock, and so must
 * not be accessed by multiple threads at the same time.
 */
 struct crandom_state_t {
    /** @privatesection */
    unsigned char seed[32];
    unsigned char buffer[96];
    uint64_t ctr;
    uint64_t magic;
    unsigned int fill;
    int reseed_countdown;
    int reseed_interval;
    int reseeds_mandatory;
    int randomfd;
 } __attribute__((aligned(16))) ;

 #ifdef __cplusplus
 extern "C" {
 #endif

 /**
 * Initialize a crandom state from the chosen file.
 * 
 * This function initializes a state from a given state file, or
 * from a random device (eg. /dev/random or /dev/urandom).
 *
 * You must check the return value of this function.
 *
 * @param [out] state The crandom state variable to initalize.
 * @param [in] filename The name of the seed file or random device.
 * @param [in] reseed_interval The number of 96-byte blocks which can be
 *        generated without reseeding.  Suggest 10000.
 * @param [in] reseeds_mandatory If nonzero, call abort() if a reseed fails.
 *        Suggest 1.
 *
 * @retval 0 Success.
 * @retval Nonzero An error to be interpreted by strerror().
 */
 int
 crandom_init_from_file (
    struct crandom_state_t *state,
    const char *filename,
    int reseed_interval,
    int reseeds_mandatory
 ) __attribute__((warn_unused_result));


 /**
 * Initialize a crandom state from a buffer, for deterministic operation.
 * 
 * This function is used to initialize a crandom state deterministically,
 * mainly for testing purposes.  It can also be used to expand a secret
 * random value deterministically.
 *
 * @warning The crandom implementation is not guaranteed to be stable.
 * That is, a later release might produce a different random stream from
 * the same seed.
 *
 * @param [out] state The crandom state variable to initalize.
 * @param [in] initial_seed The seed value.
 */
 void
 crandom_init_from_buffer (
    struct crandom_state_t *state,
    const char initial_seed[32]
 );

 /**
 * Fill the output buffer with random data.
 *
 * This function uses the given crandom state to produce pseudorandom data
 * in the output buffer.
 *
 * This function may perform reads from the state's random device if it needs
 * to reseed.  This could block if that file is a blocking source, such as
 * a pipe or /dev/random on Linux.  If reseeding fails and the state has
 * reseeds_mandatory set, this function will call abort().  Otherwise, it will
 * return an error code, but it will still randomize the buffer.
 *
 * If called on a corrupted, uninitialized or destroyed state, this function
 * will abort().
 *
 * @warning This function is not thread-safe with respect to the state.  Don't
 * call it from multiple threads with the same state at the same time.
 *
 * @param [inout] state The crandom state to use for generation.
 * @param [out] output The buffer to fill with random data.
 * @param [in] length The length of the buffer.
 *
 * @retval 0 Success.
 * @retval Nonezero A non-mandatory reseed operation failed.
 */
 int
 crandom_generate (
    struct crandom_state_t *state,
    unsigned char *output,
    unsigned long long length
 );

 /**
 * Destroy the random state.  Further calls to crandom_generate() on that state
 * will abort().
 *
 * @param [inout] state The state to be destroyed.
 */
 void
 crandom_destroy (
    struct crandom_state_t *state
 );

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __GOLDI_CRANDOM_H__ */
--- a/ec_point.c
+++ b/ec_point.c
@@ -1,745 +0,0 @@
 /**
 * @cond internal
 * @file ec_point.c
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @warning This file was automatically generated.
 */

 #include "ec_point.h"


 void
 p448_isr (
    struct p448_t*       a,
    const struct p448_t* x
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L1,     x );
    p448_mul  (   &L2,     x,   &L1 );
    p448_sqr  (   &L1,   &L2 );
    p448_mul  (   &L2,     x,   &L1 );
    p448_sqrn (   &L1,   &L2,     3 );
    p448_mul  (   &L0,   &L2,   &L1 );
    p448_sqrn (   &L1,   &L0,     3 );
    p448_mul  (   &L0,   &L2,   &L1 );
    p448_sqrn (   &L2,   &L0,     9 );
    p448_mul  (   &L1,   &L0,   &L2 );
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (   &L2,     x,   &L0 );
    p448_sqrn (   &L0,   &L2,    18 );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_sqrn (   &L0,   &L2,    37 );
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_sqrn (   &L0,   &L1,    37 );
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_sqrn (   &L0,   &L1,   111 );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_sqr  (   &L0,   &L2 );
    p448_mul  (   &L1,     x,   &L0 );
    p448_sqrn (   &L0,   &L1,   223 );
    p448_mul  (     a,   &L2,   &L0 );
 }

 void
 p448_inverse (
    struct p448_t*       a,
    const struct p448_t* x
 ) {
    struct p448_t L0, L1;
    p448_isr  (   &L0,     x );
    p448_sqr  (   &L1,   &L0 );
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (     a,     x,   &L0 );
 }

 void
 add_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 ) {
    struct p448_t L0, L1;
    p448_bias ( &d->y,     2 );
    p448_bias ( &d->z,     2 );
    p448_sub  (   &L1, &d->y, &d->x );
    p448_mul  (   &L0, &e->a,   &L1 );
    p448_add  (   &L1, &d->x, &d->y );
    p448_mul  ( &d->y, &e->b,   &L1 );
    p448_bias ( &d->y,     2 );
    p448_mul  (   &L1, &d->u, &d->t );
    p448_mul  ( &d->x, &e->c,   &L1 );
    p448_add  ( &d->u,   &L0, &d->y );
    p448_sub  ( &d->t, &d->y,   &L0 );
    p448_sub  ( &d->y, &d->z, &d->x );
    p448_add  (   &L0, &d->x, &d->z );
    p448_mul  ( &d->z,   &L0, &d->y );
    p448_mul  ( &d->x, &d->y, &d->t );
    p448_mul  ( &d->y,   &L0, &d->u );
 }

 void
 sub_tw_niels_from_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 ) {
    struct p448_t L0, L1;
    p448_bias ( &d->y,     2 );
    p448_bias ( &d->z,     2 );
    p448_sub  (   &L1, &d->y, &d->x );
    p448_mul  (   &L0, &e->b,   &L1 );
    p448_add  (   &L1, &d->x, &d->y );
    p448_mul  ( &d->y, &e->a,   &L1 );
    p448_bias ( &d->y,     2 );
    p448_mul  (   &L1, &d->u, &d->t );
    p448_mul  ( &d->x, &e->c,   &L1 );
    p448_add  ( &d->u,   &L0, &d->y );
    p448_sub  ( &d->t, &d->y,   &L0 );
    p448_add  ( &d->y, &d->x, &d->z );
    p448_sub  (   &L0, &d->z, &d->x );
    p448_mul  ( &d->z,   &L0, &d->y );
    p448_mul  ( &d->x, &d->y, &d->t );
    p448_mul  ( &d->y,   &L0, &d->u );
 }

 void
 add_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 ) {
    struct p448_t L0;
    p448_mul  (   &L0, &e->z, &a->z );
    p448_copy ( &e->z,   &L0 );
    add_tw_niels_to_tw_extensible(     e, &a->n );
 }

 void
 sub_tw_pniels_from_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 ) {
    struct p448_t L0;
    p448_mul  (   &L0, &e->z, &a->z );
    p448_copy ( &e->z,   &L0 );
    sub_tw_niels_from_tw_extensible(     e, &a->n );
 }

 void
 double_tw_extensible (
    struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->x );
    p448_sqr  (   &L0, &a->y );
    p448_add  ( &a->u,   &L2,   &L0 );
    p448_add  ( &a->t, &a->y, &a->x );
    p448_sqr  (   &L1, &a->t );
    p448_bias (   &L1,     3 );
    p448_sub  ( &a->t,   &L1, &a->u );
    p448_sub  (   &L1,   &L0,   &L2 );
    p448_bias (   &L1,     2 );
    p448_sqr  ( &a->x, &a->z );
    p448_bias ( &a->x,     2 );
    p448_add  ( &a->z, &a->x, &a->x );
    p448_sub  (   &L0, &a->z,   &L1 );
    p448_mul  ( &a->z,   &L1,   &L0 );
    p448_mul  ( &a->x,   &L0, &a->t );
    p448_mul  ( &a->y,   &L1, &a->u );
 }

 void
 double_extensible (
    struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->x );
    p448_sqr  (   &L0, &a->y );
    p448_add  (   &L1,   &L2,   &L0 );
    p448_add  ( &a->t, &a->y, &a->x );
    p448_sqr  ( &a->u, &a->t );
    p448_bias ( &a->u,     3 );
    p448_sub  ( &a->t, &a->u,   &L1 );
    p448_sub  ( &a->u,   &L0,   &L2 );
    p448_bias ( &a->u,     2 );
    p448_sqr  ( &a->x, &a->z );
    p448_bias ( &a->x,     2 );
    p448_add  ( &a->z, &a->x, &a->x );
    p448_sub  (   &L0, &a->z,   &L1 );
    p448_mul  ( &a->z,   &L1,   &L0 );
    p448_mul  ( &a->x,   &L0, &a->t );
    p448_mul  ( &a->y,   &L1, &a->u );
 }

 void
 twist_and_double (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    struct p448_t L0;
    p448_sqr  ( &b->x, &a->x );
    p448_sqr  ( &b->z, &a->y );
    p448_add  ( &b->u, &b->x, &b->z );
    p448_add  ( &b->t, &a->y, &a->x );
    p448_sqr  (   &L0, &b->t );
    p448_bias (   &L0,     3 );
    p448_sub  ( &b->t,   &L0, &b->u );
    p448_sub  (   &L0, &b->z, &b->x );
    p448_bias (   &L0,     2 );
    p448_sqr  ( &b->x, &a->z );
    p448_bias ( &b->x,     2 );
    p448_add  ( &b->z, &b->x, &b->x );
    p448_sub  ( &b->y, &b->z, &b->u );
    p448_mul  ( &b->z,   &L0, &b->y );
    p448_mul  ( &b->x, &b->y, &b->t );
    p448_mul  ( &b->y,   &L0, &b->u );
 }

 void
 untwist_and_double (
    struct extensible_t*          b,
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0;
    p448_sqr  ( &b->x, &a->x );
    p448_sqr  ( &b->z, &a->y );
    p448_add  (   &L0, &b->x, &b->z );
    p448_add  ( &b->t, &a->y, &a->x );
    p448_sqr  ( &b->u, &b->t );
    p448_bias ( &b->u,     3 );
    p448_sub  ( &b->t, &b->u,   &L0 );
    p448_sub  ( &b->u, &b->z, &b->x );
    p448_bias ( &b->u,     2 );
    p448_sqr  ( &b->x, &a->z );
    p448_bias ( &b->x,     2 );
    p448_add  ( &b->z, &b->x, &b->x );
    p448_sub  ( &b->y, &b->z, &b->u );
    p448_mul  ( &b->z,   &L0, &b->y );
    p448_mul  ( &b->x, &b->y, &b->t );
    p448_mul  ( &b->y,   &L0, &b->u );
 }

 void
 convert_tw_affine_to_tw_pniels (
    struct tw_pniels_t*       b,
    const struct tw_affine_t* a
 ) {
    p448_sub  ( &b->n.a, &a->y, &a->x );
    p448_bias ( &b->n.a,     2 );
    p448_weak_reduce( &b->n.a );
    p448_add  ( &b->n.b, &a->x, &a->y );
    p448_weak_reduce( &b->n.b );
    p448_mul  ( &b->n.c, &a->y, &a->x );
    p448_mulw ( &b->z, &b->n.c, 78164 );
    p448_neg  ( &b->n.c, &b->z );
    p448_bias ( &b->n.c,     2 );
    p448_weak_reduce( &b->n.c );
    p448_set_ui( &b->z,     2 );
 }

 void
 convert_tw_affine_to_tw_extensible (
    struct tw_extensible_t*   b,
    const struct tw_affine_t* a
 ) {
    p448_copy ( &b->x, &a->x );
    p448_copy ( &b->y, &a->y );
    p448_set_ui( &b->z,     1 );
    p448_copy ( &b->t, &a->x );
    p448_copy ( &b->u, &a->y );
 }

 void
 convert_affine_to_extensible (
    struct extensible_t*   b,
    const struct affine_t* a
 ) {
    p448_copy ( &b->x, &a->x );
    p448_copy ( &b->y, &a->y );
    p448_set_ui( &b->z,     1 );
    p448_copy ( &b->t, &a->x );
    p448_copy ( &b->u, &a->y );
 }

 void
 convert_tw_extensible_to_tw_pniels (
    struct tw_pniels_t*           b,
    const struct tw_extensible_t* a
 ) {
    p448_sub  ( &b->n.a, &a->y, &a->x );
    p448_bias ( &b->n.a,     2 );
    p448_weak_reduce( &b->n.a );
    p448_add  ( &b->n.b, &a->x, &a->y );
    p448_weak_reduce( &b->n.b );
    p448_mul  ( &b->n.c, &a->u, &a->t );
    p448_mulw ( &b->z, &b->n.c, 78164 );
    p448_neg  ( &b->n.c, &b->z );
    p448_bias ( &b->n.c,     2 );
    p448_weak_reduce( &b->n.c );
    p448_add  ( &b->z, &a->z, &a->z );
    p448_weak_reduce( &b->z );
 }

 void
 convert_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* d
 ) {
    p448_add  ( &e->u, &d->n.b, &d->n.a );
    p448_sub  ( &e->t, &d->n.b, &d->n.a );
    p448_bias ( &e->t,     2 );
    p448_mul  ( &e->x, &d->z, &e->t );
    p448_mul  ( &e->y, &d->z, &e->u );
    p448_sqr  ( &e->z, &d->z );
 }

 void
 convert_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  e,
    const struct tw_niels_t* d
 ) {
    p448_add  ( &e->y, &d->b, &d->a );
    p448_weak_reduce( &e->y );
    p448_sub  ( &e->x, &d->b, &d->a );
    p448_bias ( &e->x,     2 );
    p448_weak_reduce( &e->x );
    p448_set_ui( &e->z,     1 );
    p448_copy ( &e->t, &e->x );
    p448_copy ( &e->u, &e->y );
 }

 void
 montgomery_step (
    struct montgomery_t* a
 ) {
    struct p448_t L0, L1;
    p448_bias ( &a->xd,     2 );
    p448_bias ( &a->xa,     2 );
    p448_add  (   &L0, &a->zd, &a->xd );
    p448_sub  (   &L1, &a->xd, &a->zd );
    p448_sub  ( &a->zd, &a->xa, &a->za );
    p448_mul  ( &a->xd,   &L0, &a->zd );
    p448_bias ( &a->xd,     2 );
    p448_add  ( &a->zd, &a->za, &a->xa );
    p448_mul  ( &a->za,   &L1, &a->zd );
    p448_add  ( &a->xa, &a->za, &a->xd );
    p448_sqr  ( &a->zd, &a->xa );
    p448_mul  ( &a->xa, &a->z0, &a->zd );
    p448_sub  ( &a->zd, &a->xd, &a->za );
    p448_sqr  ( &a->za, &a->zd );
    p448_sqr  ( &a->xd,   &L0 );
    p448_bias ( &a->xd,     2 );
    p448_sqr  (   &L0,   &L1 );
    p448_mulw ( &a->zd, &a->xd, 39082 );
    p448_bias ( &a->zd,     4 );
    p448_sub  (   &L1, &a->xd,   &L0 );
    p448_mul  ( &a->xd,   &L0, &a->zd );
    p448_sub  (   &L0, &a->zd,   &L1 );
    p448_mul  ( &a->zd,   &L0,   &L1 );
 }

 void
 serialize_montgomery (
    struct p448_t*             sign,
    struct p448_t*             ser,
    const struct montgomery_t* a,
    const struct p448_t*       sbz
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_mul  (   &L2, &a->z0, &a->zd );
    p448_bias (   &L2,     2 );
    p448_sub  (   &L0,   &L2, &a->xd );
    p448_mul  (   &L2, &a->za,   &L0 );
    p448_bias (   &L2,     2 );
    p448_mul  (   &L1, &a->z0, &a->xd );
    p448_bias (   &L1,     2 );
    p448_sub  (   &L0,   &L1, &a->zd );
    p448_mul  (   &L3, &a->xa,   &L0 );
    p448_add  (   &L1,   &L3,   &L2 );
    p448_sub  (   &L0,   &L2,   &L3 );
    p448_mul  (   &L2,   &L0,   &L1 );
    p448_mul  (   &L0,   sbz,   &L2 );
    p448_mul  (   &L2, &a->zd,   &L0 );
    p448_mul  (  sign,   &L2, &a->zd );
    p448_mul  (   ser,   &L2, &a->xd );
    p448_mul  (   &L2,  sign,   ser );
    p448_isr  (   &L1,   &L2 );
    p448_mul  (   ser,  sign,   &L1 );
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (  sign,   &L2,   &L0 );
 }

 void
 serialize_extensible (
    struct p448_t*             b,
    const struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sub  (   &L0, &a->y, &a->z );
    p448_bias (   &L0,     2 );
    p448_add  (     b, &a->z, &a->y );
    p448_mul  (   &L1, &a->z, &a->x );
    p448_mul  (   &L2,   &L0,   &L1 );
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_mul  (   &L0,   &L2,     b );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_isr  (   &L0,   &L2 );
    p448_mul  (     b,   &L1,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,   &L2,   &L1 );
 }

 void
 untwist_and_double_and_serialize (
    struct p448_t*                b,
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_mul  (   &L3, &a->y, &a->x );
    p448_add  (     b, &a->y, &a->x );
    p448_sqr  (   &L1,     b );
    p448_add  (   &L2,   &L3,   &L3 );
    p448_sub  (     b,   &L1,   &L2 );
    p448_bias (     b,     3 );
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1,   &L2 );
    p448_add  (   &L2,     b,     b );
    p448_mulw (     b,   &L2, 39082 );
    p448_neg  (   &L2,     b );
    p448_bias (   &L2,     2 );
    p448_mulw (   &L0,   &L2, 39082 );
    p448_neg  (     b,   &L0 );
    p448_bias (     b,     2 );
    p448_mul  (   &L0,   &L2,   &L1 );
    p448_mul  (   &L2,     b,   &L0 );
    p448_isr  (   &L0,   &L2 );
    p448_mul  (   &L1,     b,   &L0 );
    p448_sqr  (     b,   &L0 );
    p448_mul  (   &L0,   &L2,     b );
    p448_mul  (     b,   &L1,   &L3 );
 }

 void
 twist (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    mask_t L0, L1;
    p448_sqr  ( &b->y, &a->z );
    p448_sqr  ( &b->z, &a->x );
    p448_sub  ( &b->u, &b->y, &b->z );
    p448_bias ( &b->u,     2 );
    p448_sub  ( &b->z, &a->z, &a->x );
    p448_bias ( &b->z,     2 );
    p448_mul  ( &b->y, &b->z, &a->y );
    p448_sub  ( &b->z, &a->z, &a->y );
    p448_bias ( &b->z,     2 );
    p448_mul  ( &b->x, &b->z, &b->y );
    p448_mul  ( &b->t, &b->x, &b->u );
    p448_mul  ( &b->y, &b->x, &b->t );
    p448_isr  ( &b->t, &b->y );
    p448_mul  ( &b->u, &b->x, &b->t );
    p448_sqr  ( &b->x, &b->t );
    p448_mul  ( &b->t, &b->y, &b->x );
    p448_mul  ( &b->x, &a->x, &b->u );
    p448_mul  ( &b->y, &a->y, &b->u );
       L1 = p448_is_zero( &b->z );
       L0 = -   L1;
    p448_addw ( &b->y,    L0 );
    p448_weak_reduce( &b->y );
    p448_set_ui( &b->z,     1 );
    p448_copy ( &b->t, &b->x );
    p448_copy ( &b->u, &b->y );
 }

 mask_t
 deserialize_affine (
    struct affine_t*     a,
    const struct p448_t* sz
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_sqr  (   &L1,    sz );
    p448_copy (   &L3,   &L1 );
    p448_addw (   &L3,     1 );
    p448_sqr  ( &a->x,   &L3 );
    p448_mulw (   &L3, &a->x, 39082 );
    p448_neg  ( &a->x,   &L3 );
    p448_add  (   &L3,   &L1,   &L1 );
    p448_bias (   &L3,     1 );
    p448_add  ( &a->y,   &L3,   &L3 );
    p448_add  (   &L3, &a->y, &a->x );
    p448_copy ( &a->y,   &L1 );
    p448_subw ( &a->y,     1 );
    p448_neg  ( &a->x, &a->y );
    p448_bias ( &a->x,     2 );
    p448_mul  ( &a->y, &a->x,   &L3 );
    p448_sqr  (   &L2, &a->x );
    p448_mul  (   &L0,   &L2, &a->y );
    p448_mul  ( &a->y, &a->x,   &L0 );
    p448_isr  (   &L3, &a->y );
    p448_mul  ( &a->y,   &L2,   &L3 );
    p448_sqr  (   &L2,   &L3 );
    p448_mul  (   &L3,   &L0,   &L2 );
    p448_mul  (   &L0, &a->x,   &L3 );
    p448_bias (   &L0,     1 );
    p448_add  (   &L2, &a->y, &a->y );
    p448_mul  ( &a->x,    sz,   &L2 );
    p448_addw (   &L1,     1 );
    p448_mul  ( &a->y,   &L1,   &L3 );
    p448_subw (   &L0,     1 );
    return p448_is_zero(   &L0 );
 }

 mask_t
 deserialize_and_twist_approx (
    struct tw_extensible_t* a,
    const struct p448_t*    sdm1,
    const struct p448_t*    sz
 ) {
    struct p448_t L0, L1;
    p448_sqr  ( &a->z,    sz );
    p448_copy ( &a->y, &a->z );
    p448_addw ( &a->y,     1 );
    p448_sqr  ( &a->x, &a->y );
    p448_mulw ( &a->y, &a->x, 39082 );
    p448_neg  ( &a->x, &a->y );
    p448_add  ( &a->y, &a->z, &a->z );
    p448_bias ( &a->y,     1 );
    p448_add  ( &a->u, &a->y, &a->y );
    p448_add  ( &a->y, &a->u, &a->x );
    p448_sqr  ( &a->x, &a->z );
    p448_subw ( &a->x,     1 );
    p448_neg  ( &a->u, &a->x );
    p448_bias ( &a->u,     2 );
    p448_mul  ( &a->x,  sdm1, &a->u );
    p448_mul  (   &L0, &a->x, &a->y );
    p448_mul  ( &a->t,   &L0, &a->y );
    p448_mul  ( &a->u, &a->x, &a->t );
    p448_mul  ( &a->t, &a->u,   &L0 );
    p448_mul  ( &a->y, &a->x, &a->t );
    p448_isr  (   &L0, &a->y );
    p448_mul  ( &a->y, &a->u,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  ( &a->u, &a->t,   &L1 );
    p448_mul  ( &a->t, &a->x, &a->u );
    p448_bias ( &a->t,     1 );
    p448_add  ( &a->x,    sz,    sz );
    p448_mul  (   &L0, &a->u, &a->x );
    p448_copy ( &a->x, &a->z );
    p448_subw ( &a->x,     1 );
    p448_neg  (   &L1, &a->x );
    p448_bias (   &L1,     2 );
    p448_mul  ( &a->x,   &L1,   &L0 );
    p448_mul  (   &L0, &a->u, &a->y );
    p448_addw ( &a->z,     1 );
    p448_mul  ( &a->y, &a->z,   &L0 );
    p448_subw ( &a->t,     1 );
    mask_t ret = p448_is_zero( &a->t );
    p448_set_ui( &a->z,     1 );
    p448_copy ( &a->t, &a->x );
    p448_copy ( &a->u, &a->y );
    return ret;
 }

 void
 set_identity_extensible (
    struct extensible_t* a
 ) {
    p448_set_ui( &a->x,     0 );
    p448_set_ui( &a->y,     1 );
    p448_set_ui( &a->z,     1 );
    p448_set_ui( &a->t,     0 );
    p448_set_ui( &a->u,     0 );
 }

 void
 set_identity_tw_extensible (
    struct tw_extensible_t* a
 ) {
    p448_set_ui( &a->x,     0 );
    p448_set_ui( &a->y,     1 );
    p448_set_ui( &a->z,     1 );
    p448_set_ui( &a->t,     0 );
    p448_set_ui( &a->u,     0 );
 }

 void
 set_identity_affine (
    struct affine_t* a
 ) {
    p448_set_ui( &a->x,     0 );
    p448_set_ui( &a->y,     1 );
 }

 mask_t
 eq_affine (
    const struct affine_t* a,
    const struct affine_t* b
 ) {
    mask_t L1, L2;
    struct p448_t L0;
    p448_sub  (   &L0, &a->x, &b->x );
    p448_bias (   &L0,     2 );
       L2 = p448_is_zero(   &L0 );
    p448_sub  (   &L0, &a->y, &b->y );
    p448_bias (   &L0,     2 );
       L1 = p448_is_zero(   &L0 );
    return    L2 &    L1;
 }

 mask_t
 eq_extensible (
    const struct extensible_t* a,
    const struct extensible_t* b
 ) {
    mask_t L3, L4;
    struct p448_t L0, L1, L2;
    p448_mul  (   &L2, &b->z, &a->x );
    p448_mul  (   &L1, &a->z, &b->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    p448_mul  (   &L2, &b->z, &a->y );
    p448_mul  (   &L1, &a->z, &b->y );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L3 = p448_is_zero(   &L0 );
    return    L4 &    L3;
 }

 mask_t
 eq_tw_extensible (
    const struct tw_extensible_t* a,
    const struct tw_extensible_t* b
 ) {
    mask_t L3, L4;
    struct p448_t L0, L1, L2;
    p448_mul  (   &L2, &b->z, &a->x );
    p448_mul  (   &L1, &a->z, &b->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    p448_mul  (   &L2, &b->z, &a->y );
    p448_mul  (   &L1, &a->z, &b->y );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L3 = p448_is_zero(   &L0 );
    return    L4 &    L3;
 }

 void
 elligator_2s_inject (
    struct affine_t*     a,
    const struct p448_t* r
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5, L6, L7, L8, L9;
    p448_sqr  ( &a->x,     r );
    p448_sqr  (   &L3, &a->x );
    p448_copy ( &a->y,   &L3 );
    p448_subw ( &a->y,     1 );
    p448_neg  (   &L9, &a->y );
    p448_bias (   &L9,     2 );
    p448_sqr  (   &L2,   &L9 );
    p448_bias (   &L2,     1 );
    p448_mulw (   &L7,   &L2, 1527402724 );
    p448_bias (   &L7,     2 );
    p448_mulw (   &L8,   &L3, 6108985600 );
    p448_add  ( &a->y,   &L8,   &L7 );
    p448_mulw (   &L8,   &L2, 6109454568 );
    p448_sub  (   &L7, &a->y,   &L8 );
    p448_mulw (   &L4, &a->y, 78160 );
    p448_mul  (   &L6,   &L7,   &L9 );
    p448_mul  (   &L8,   &L6,   &L4 );
    p448_mul  (   &L4,   &L7,   &L8 );
    p448_isr  (   &L5,   &L4 );
    p448_mul  (   &L4,   &L6,   &L5 );
    p448_sqr  (   &L6,   &L5 );
    p448_mul  (   &L5,   &L8,   &L6 );
    p448_mul  (   &L8,   &L7,   &L5 );
    p448_mul  (   &L7,   &L8,   &L5 );
    p448_copy (   &L6, &a->x );
    p448_subw (   &L6,     1 );
    p448_addw ( &a->x,     1 );
    p448_mul  (   &L5, &a->x,   &L8 );
    p448_sub  ( &a->x,   &L6,   &L5 );
    p448_bias ( &a->x,     3 );
    p448_mul  (   &L5,   &L4, &a->x );
    p448_mulw (   &L4,   &L5, 78160 );
    p448_neg  ( &a->x,   &L4 );
    p448_bias ( &a->x,     2 );
    p448_weak_reduce( &a->x );
    p448_add  (   &L4,   &L3,   &L3 );
    p448_add  (   &L3,   &L4,   &L2 );
    p448_subw (   &L3,     2 );
    p448_mul  (   &L2,   &L3,   &L8 );
    p448_mulw (   &L3,   &L2, 3054649120 );
    p448_add  (   &L2,   &L3, &a->y );
    p448_mul  ( &a->y,   &L7,   &L2 );
       L1 = p448_is_zero(   &L9 );
       L0 = -   L1;
    p448_addw ( &a->y,    L0 );
    p448_weak_reduce( &a->y );
 }

 mask_t
 validate_affine (
    const struct affine_t* a
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_sqr  (   &L0, &a->y );
    p448_sqr  (   &L2, &a->x );
    p448_add  (   &L3,   &L2,   &L0 );
    p448_subw (   &L3,     1 );
    p448_mulw (   &L1,   &L2, 39081 );
    p448_neg  (   &L2,   &L1 );
    p448_bias (   &L2,     2 );
    p448_mul  (   &L1,   &L0,   &L2 );
    p448_sub  (   &L0,   &L3,   &L1 );
    p448_bias (   &L0,     3 );
    return p448_is_zero(   &L0 );
 }

 mask_t
 validate_tw_extensible (
    const struct tw_extensible_t* ext
 ) {
    mask_t L4, L5;
    struct p448_t L0, L1, L2, L3;
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L0, &ext->t, &ext->u );
    p448_mul  (   &L2, &ext->z,   &L0 );
    p448_addw (   &L2,     0 );
    p448_mul  (   &L1, &ext->x, &ext->y );
    p448_neg  (   &L0,   &L1 );
    p448_add  (   &L1,   &L0,   &L2 );
    p448_bias (   &L1,     2 );
       L5 = p448_is_zero(   &L1 );
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
     */
    p448_sqr  (   &L2, &ext->y );
    p448_neg  (   &L0,   &L2 );
    p448_addw (   &L0,     0 );
    p448_sqr  (   &L1, &ext->x );
    p448_bias (   &L1,     4 );
    p448_add  (   &L2,   &L1,   &L0 );
    p448_sqr  (   &L3, &ext->u );
    p448_sqr  (   &L1, &ext->t );
    p448_mul  (   &L0,   &L1,   &L3 );
    p448_mulw (   &L1,   &L0, 39081 );
    p448_neg  (   &L3,   &L1 );
    p448_add  (   &L1,   &L3,   &L2 );
    p448_neg  (   &L3,   &L0 );
    p448_add  (   &L2,   &L3,   &L1 );
    p448_sqr  (   &L1, &ext->z );
    p448_add  (   &L0,   &L1,   &L2 );
       L4 = p448_is_zero(   &L0 );
    return    L5 &    L4;
 }


--- a/ec_point.h
+++ b/ec_point.h
@@ -1,503 +0,0 @@
 /**
 * @file ec_point.h
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @warning This file was automatically generated.
 */

 #ifndef __CC_INCLUDED_EC_POINT_H__
 #define __CC_INCLUDED_EC_POINT_H__

 #include "p448.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 /**
 * Affine point on an Edwards curve.
 */
 struct affine_t {
    struct p448_t x, y;
 };

 /**
 * Affine point on a twisted Edwards curve.
 */
 struct tw_affine_t {
    struct p448_t x, y;
 };

 /**
 * Montgomery buffer.
 */
 struct montgomery_t {
    struct p448_t z0, xd, zd, xa, za;
 };

 /**
 * Extensible coordinates for Edwards curves, suitable for
 * accumulators.
 * 
 * Represents the point (x/z, y/z).  The extra coordinates
 * t,u satisfy xy = tuz, allowing for conversion to Extended
 * form by multiplying t and u.
 * 
 * The idea is that you don't have to do this multiplication
 * when doubling the accumulator, because the t-coordinate
 * isn't used there.  At the same time, as long as you only
 * have one point in extensible form, additions don't cost
 * extra.
 * 
 * This is essentially a lazier version of Hisil et al's
 * lookahead trick.  It might be worth considering that trick
 * instead.
 */
 struct extensible_t {
    struct p448_t x, y, z, t, u;
 };

 /**
 * Extensible coordinates for twisted Edwards curves,
 * suitable for accumulators.
 */
 struct tw_extensible_t {
    struct p448_t x, y, z, t, u;
 };

 /**
 * Niels coordinates for twisted Edwards curves.
 * 
 * Good for mixed readdition; suitable for fixed tables.
 */
 struct tw_niels_t {
    struct p448_t a, b, c;
 };

 /**
 * Projective niels coordinates for twisted Edwards curves.
 * 
 * Good for readdition; suitable for temporary tables.
 */
 struct tw_pniels_t {
    struct tw_niels_t n;
    struct p448_t z;
 };


 /**
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_affine (
    struct affine_t*       a,
    const struct affine_t* ds
 ) __attribute__((unused,always_inline));

 /**
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_tw_affine (
    struct tw_affine_t*       a,
    const struct tw_affine_t* ds
 ) __attribute__((unused,always_inline));

 /**
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_montgomery (
    struct montgomery_t*       a,
    const struct montgomery_t* ds
 ) __attribute__((unused,always_inline));

 /**
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_extensible (
    struct extensible_t*       a,
    const struct extensible_t* ds
 ) __attribute__((unused,always_inline));

 /**
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_tw_extensible (
    struct tw_extensible_t*       a,
    const struct tw_extensible_t* ds
 ) __attribute__((unused,always_inline));

 /**
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_tw_niels (
    struct tw_niels_t*       a,
    const struct tw_niels_t* ds
 ) __attribute__((unused,always_inline));

 /**
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_tw_pniels (
    struct tw_pniels_t*       a,
    const struct tw_pniels_t* ds
 ) __attribute__((unused,always_inline));

 /**
 * Returns 1/sqrt(+- x).
 * 
 * The Legendre symbol of the result is the same as that of the
 * input.
 * 
 * If x=0, returns 0.
 */
 void
 p448_isr (
    struct p448_t*       a,
    const struct p448_t* x
 );

 /**
 * Returns 1/x.
 * 
 * If x=0, returns 0.
 */
 void
 p448_inverse (
    struct p448_t*       a,
    const struct p448_t* x
 );

 /**
 * Add two points on a twisted Edwards curve, one in Extensible form
 * and the other in half-Niels form.
 */
 void
 add_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 );

 /**
 * Add two points on a twisted Edwards curve, one in Extensible form
 * and the other in half-Niels form.
 */
 void
 sub_tw_niels_from_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 );

 /**
 * Add two points on a twisted Edwards curve, one in Extensible form
 * and the other in projective Niels form.
 */
 void
 add_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 );

 /**
 * Add two points on a twisted Edwards curve, one in Extensible form
 * and the other in projective Niels form.
 */
 void
 sub_tw_pniels_from_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 );

 /**
 * Double a point on a twisted Edwards curve, in "extensible" coordinates.
 */
 void
 double_tw_extensible (
    struct tw_extensible_t* a
 );

 /**
 * Double a point on an Edwards curve, in "extensible" coordinates.
 */
 void
 double_extensible (
    struct extensible_t* a
 );

 /**
 * Double a point, and transfer it to the twisted curve.
 * 
 * That is, apply the 4-isogeny.
 */
 void
 twist_and_double (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 );

 /**
 * Double a point, and transfer it to the untwisted curve.
 * 
 * That is, apply the dual isogeny.
 */
 void
 untwist_and_double (
    struct extensible_t*          b,
    const struct tw_extensible_t* a
 );

 void
 convert_tw_affine_to_tw_pniels (
    struct tw_pniels_t*       b,
    const struct tw_affine_t* a
 );

 void
 convert_tw_affine_to_tw_extensible (
    struct tw_extensible_t*   b,
    const struct tw_affine_t* a
 );

 void
 convert_affine_to_extensible (
    struct extensible_t*   b,
    const struct affine_t* a
 );

 void
 convert_tw_extensible_to_tw_pniels (
    struct tw_pniels_t*           b,
    const struct tw_extensible_t* a
 );

 void
 convert_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* d
 );

 void
 convert_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  e,
    const struct tw_niels_t* d
 );

 void
 montgomery_step (
    struct montgomery_t* a
 );

 void
 serialize_montgomery (
    struct p448_t*             sign,
    struct p448_t*             ser,
    const struct montgomery_t* a,
    const struct p448_t*       sbz
 );

 /**
 * Serialize a point on an Edwards curve.
 * 
 * The serialized form would be sqrt((z-y)/(z+y)) with sign of xz.
 * 
 * It would be on 4y^2/(1-d) = x^3 + 2(1+d)/(1-d) * x^2 + x.
 * 
 * But 4/(1-d) isn't square, so we need to twist it:
 * 
 * -x is on 4y^2/(d-1) = x^3 + 2(d+1)/(d-1) * x^2 + x
 */
 void
 serialize_extensible (
    struct p448_t*             b,
    const struct extensible_t* a
 );

 /**
 * 
 */
 void
 untwist_and_double_and_serialize (
    struct p448_t*                b,
    const struct tw_extensible_t* a
 );

 /**
 * Expensive transfer from untwisted to twisted.  Roughly equivalent to halve and isogeny.
 * Correctly transfers point of order 2.
 * 
 * Can't have x=+1 (it's not even).  There is code to fix the exception that would otherwise
 * occur at (0,1).
 * 
 * Input point must be even.
 */
 void
 twist (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 );

 /**
 * Deserialize a point to an untwisted affine curve.
 */
 mask_t
 deserialize_affine (
    struct affine_t*     a,
    const struct p448_t* sz
 );

 /**
 * Deserialize a point and transfer it to the twist.
 * 
 * Not guaranteed to preserve the 4-torsion component.
 * 
 * Refuses to deserialize +-1, which are the points of order 2.
 */
 mask_t
 deserialize_and_twist_approx (
    struct tw_extensible_t* a,
    const struct p448_t*    sdm1,
    const struct p448_t*    sz
 );

 void
 set_identity_extensible (
    struct extensible_t* a
 );

 void
 set_identity_tw_extensible (
    struct tw_extensible_t* a
 );

 void
 set_identity_affine (
    struct affine_t* a
 );

 mask_t
 eq_affine (
    const struct affine_t* a,
    const struct affine_t* b
 );

 mask_t
 eq_extensible (
    const struct extensible_t* a,
    const struct extensible_t* b
 );

 mask_t
 eq_tw_extensible (
    const struct tw_extensible_t* a,
    const struct tw_extensible_t* b
 );

 void
 elligator_2s_inject (
    struct affine_t*     a,
    const struct p448_t* r
 );

 mask_t
 validate_affine (
    const struct affine_t* a
 );

 /**
 * Check the invariants for struct tw_extensible_t.
 * PERF: This function was automatically generated
 * with no regard for speed.
 */
 mask_t
 validate_tw_extensible (
    const struct tw_extensible_t* ext
 );


 void
 copy_affine (
    struct affine_t*       a,
    const struct affine_t* ds
 ) {
    p448_copy ( &a->x, &ds->x );
    p448_copy ( &a->y, &ds->y );
 }

 void
 copy_tw_affine (
    struct tw_affine_t*       a,
    const struct tw_affine_t* ds
 ) {
    p448_copy ( &a->x, &ds->x );
    p448_copy ( &a->y, &ds->y );
 }

 void
 copy_montgomery (
    struct montgomery_t*       a,
    const struct montgomery_t* ds
 ) {
    p448_copy ( &a->z0, &ds->z0 );
    p448_copy ( &a->xd, &ds->xd );
    p448_copy ( &a->zd, &ds->zd );
    p448_copy ( &a->xa, &ds->xa );
    p448_copy ( &a->za, &ds->za );
 }

 void
 copy_extensible (
    struct extensible_t*       a,
    const struct extensible_t* ds
 ) {
    p448_copy ( &a->x, &ds->x );
    p448_copy ( &a->y, &ds->y );
    p448_copy ( &a->z, &ds->z );
    p448_copy ( &a->t, &ds->t );
    p448_copy ( &a->u, &ds->u );
 }

 void
 copy_tw_extensible (
    struct tw_extensible_t*       a,
    const struct tw_extensible_t* ds
 ) {
    p448_copy ( &a->x, &ds->x );
    p448_copy ( &a->y, &ds->y );
    p448_copy ( &a->z, &ds->z );
    p448_copy ( &a->t, &ds->t );
    p448_copy ( &a->u, &ds->u );
 }

 void
 copy_tw_niels (
    struct tw_niels_t*       a,
    const struct tw_niels_t* ds
 ) {
    p448_copy ( &a->a, &ds->a );
    p448_copy ( &a->b, &ds->b );
    p448_copy ( &a->c, &ds->c );
 }

 void
 copy_tw_pniels (
    struct tw_pniels_t*       a,
    const struct tw_pniels_t* ds
 ) {
    copy_tw_niels( &a->n, &ds->n );
    p448_copy ( &a->z, &ds->z );
 }



 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __CC_INCLUDED_EC_POINT_H__ */
--- a/exported.sym
+++ b/exported.sym
@@ -1,5 +0,0 @@
 _goldilocks_init
 _goldilocks_keygen
 _goldilocks_shared_secret
 _goldilocks_sign
 _goldilocks_verify
--- a/goldilocks.c
+++ b/goldilocks.c
@@ -1,299 +0,0 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #include <errno.h>

 #include "goldilocks.h"
 #include "ec_point.h"
 #include "scalarmul.h"
 #include "barrett_field.h"
 #include "crandom.h"
 #include "sha512.h"

 #ifndef GOLDILOCKS_RANDOM_INIT_FILE
 #define GOLDILOCKS_RANDOM_INIT_FILE "/dev/urandom"
 #endif

 #ifndef GOLDILOCKS_RANDOM_RESEED_INTERVAL
 #define GOLDILOCKS_RANDOM_RESEED_INTERVAL 10000
 #endif

 /* We'll check it ourselves */
 #ifndef GOLDILOCKS_RANDOM_RESEEDS_MANDATORY
 #define GOLDILOCKS_RANDOM_RESEEDS_MANDATORY 0
 #endif

 /* TODO: word size; precompute */
 const struct affine_t goldilocks_base_point = {
    {{ 0xf0de840aed939full, 0xc170033f4ba0c7ull, 0xf3932d94c63d96ull, 0x9cecfa96147eaaull,
       0x5f065c3c59d070ull, 0x3a6a26adf73324ull, 0x1b4faff4609845ull, 0x297ea0ea2692ffull
    }},
    {{ 19, 0, 0, 0, 0, 0, 0, 0 }}
 };

 // /* TODO: direct */
 // void
 // transfer_and_serialize(struct p448_t *out, const struct tw_extensible_t *twext) {
 //     struct extensible_t ext;
 //     transfer_tw_to_un(&ext, twext);
 //     serialize_extensible(out, &ext);
 // }

 // FIXME: threading
 // TODO: autogen instead of init
 struct {
    struct tw_niels_t combs[80];
    struct tw_niels_t wnafs[32];
    struct crandom_state_t rand;
 } goldilocks_global;

 int
 goldilocks_init () {
    struct extensible_t ext;
    struct tw_extensible_t text;
    
    /* Sanity check: the base point is on the curve. */
    assert(validate_affine(&goldilocks_base_point));
    
    /* Convert it to twisted Edwards. */
    convert_affine_to_extensible(&ext, &goldilocks_base_point);
    twist(&text, &ext);
    //p448_transfer_un_to_tw(&text, &ext);
    
    /* Precompute the tables. */
    precompute_for_combs(goldilocks_global.combs, &text, 5, 5, 18);
    precompute_for_wnaf(goldilocks_global.wnafs, &text, 5);
    
    return crandom_init_from_file(&goldilocks_global.rand,
        GOLDILOCKS_RANDOM_INIT_FILE,
        GOLDILOCKS_RANDOM_RESEED_INTERVAL,
        GOLDILOCKS_RANDOM_RESEEDS_MANDATORY);
 }

 static word_t
 q448_lo[4] = {
    0xdc873d6d54a7bb0dull,
    0xde933d8d723a70aaull,
    0x3bb124b65129c96full,
    0x000000008335dc16ull
 };

 static const struct p448_t
 sqrt_d_minus_1 = {{
    0xd2e21836749f46ull,
    0x888db42b4f0179ull,
    0x5a189aabdeea38ull,
    0x51e65ca6f14c06ull,
    0xa49f7b424d9770ull,
    0xdcac4628c5f656ull,
    0x49443b8748734aull,
    0x12fec0c0b25b7aull
 }};

 int
 goldilocks_keygen (
    struct goldilocks_private_key_t *privkey,
    struct goldilocks_public_key_t *pubkey
 ) {
    // TODO: check for init.  Also maybe take CRANDOM object?  API...
    word_t sk[448*2/WORD_BITS];
    
    struct tw_extensible_t exta;
    struct p448_t pk;
    
    int ret = crandom_generate(&goldilocks_global.rand, (unsigned char *)sk, sizeof(sk));
    barrett_reduce(sk,sizeof(sk)/sizeof(sk[0]),0,q448_lo,7,4,62); // TODO word size
    q448_serialize(privkey->opaque, sk);
    
    edwards_comb(&exta, sk, goldilocks_global.combs, 5, 5, 18);
    //transfer_and_serialize_qtor(&pk, &sqrt_d_minus_1, &exta);
    untwist_and_double_and_serialize(&pk, &exta);
    
    p448_serialize(pubkey->opaque, &pk);
    memcpy(&privkey->opaque[56], pubkey->opaque, 56);
    
    int ret2 = crandom_generate(&goldilocks_global.rand, &privkey->opaque[112], 32);
    if (!ret) ret = ret2;
    
    return ret ? GOLDI_ENODICE : GOLDI_EOK;
 }

 int
 goldilocks_shared_secret (
    uint8_t shared[64],
    const struct goldilocks_private_key_t *my_privkey,
    const struct goldilocks_public_key_t *your_pubkey
 ) {
    word_t sk[448/WORD_BITS];
    struct p448_t pk;
    
    mask_t succ = p448_deserialize(&pk,your_pubkey->opaque), msucc = -1;
    
 #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS
    struct p448_t sum, prod;
    msucc &= p448_deserialize(&sum,&my_privkey->opaque[56]);
    p448_mul(&prod,&pk,&sum);
    p448_add(&sum,&pk,&sum);
 #endif
    
    msucc &= q448_deserialize(sk,my_privkey->opaque);
    succ &= p448_montgomery_ladder(&pk,&pk,sk,446,2);
    
    p448_serialize(shared,&pk);
    
    /* obliterate records of our failure by adjusting with obliteration key */
    struct sha512_ctx_t ctx;
    sha512_init(&ctx);

 #ifdef EXPERIMENT_ECDH_OBLITERATE_CT
    uint8_t oblit[40];
    unsigned i;
    for (i=0; i<8; i++) {
        oblit[i] = "noshared"[i] & ~(succ&msucc);
    }
    for (i=0; i<32; i++) {
        oblit[8+i] = my_privkey->opaque[112+i] & ~(succ&msucc);
    }
    sha512_update(&ctx, oblit, 40);
 #endif
    
 #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS
    /* stir in the sum and product of the pubkeys. */
    uint8_t a_pk[56];
    p448_serialize(a_pk, &sum);
    sha512_update(&ctx, a_pk, 56);
    p448_serialize(a_pk, &prod);
    sha512_update(&ctx, a_pk, 56);
 #endif
       
    /* stir in the shared key and finish */
    sha512_update(&ctx, shared, 56);
    sha512_final(&ctx, shared);
    
    return (GOLDI_ECORRUPT & ~msucc)
        | (GOLDI_EINVAL & msucc &~ succ)
        | (GOLDI_EOK & msucc & succ);
 }

 int
 goldilocks_sign (
    uint8_t signature_out[56*2],
    const uint8_t *message,
    uint64_t message_len,
    const struct goldilocks_private_key_t *privkey
 ) {
    
    /* challenge = H(pk, [nonceG], message).  FIXME: endian. */
    word_t skw[448/WORD_BITS];
    mask_t succ = q448_deserialize(skw,privkey->opaque);
    if (!succ) {
        memset(skw,0,sizeof(skw));
        return GOLDI_ECORRUPT;
    }
        
    /* Derive a nonce.  TODO: use HMAC. FIXME: endian.  FUTURE: factor. */
    word_t tk[512/WORD_BITS];
    struct sha512_ctx_t ctx;
    sha512_init(&ctx);
    sha512_update(&ctx, (const unsigned char *)"signonce", 8);
    sha512_update(&ctx, &privkey->opaque[112], 32);
    sha512_update(&ctx, message, message_len);
    sha512_update(&ctx, &privkey->opaque[112], 32);
    sha512_final(&ctx, (unsigned char *)tk);
    barrett_reduce(tk,512/WORD_BITS,0,q448_lo,7,4,62); // TODO word size
    
    /* 4[nonce]G */
    uint8_t signature_tmp[56];
    struct tw_extensible_t exta;
    struct p448_t gsk;
    edwards_comb(&exta, tk, goldilocks_global.combs, 5, 5, 18);
    double_tw_extensible(&exta);
    untwist_and_double_and_serialize(&gsk, &exta);
    p448_serialize(signature_tmp, &gsk);
    
    word_t challenge[512/WORD_BITS];
    sha512_update(&ctx, &privkey->opaque[56], 56);
    sha512_update(&ctx, signature_tmp, 56);
    sha512_update(&ctx, message, message_len);
    sha512_final(&ctx, (unsigned char *)challenge);
    
    // reduce challenge and sub.
    barrett_negate(challenge,512/WORD_BITS,q448_lo,7,4,62);

    barrett_mac(
        tk,512/WORD_BITS,
        challenge,512/WORD_BITS,
        skw,448/WORD_BITS,
        q448_lo,7,4,62
    );
        
    word_t carry = add_nr_ext_packed(tk,tk,512/WORD_BITS,tk,512/WORD_BITS,-1);
    barrett_reduce(tk,512/WORD_BITS,carry,q448_lo,7,4,62);
        
    memcpy(signature_out, signature_tmp, 56);
    q448_serialize(signature_out+56, tk);
    memset((unsigned char *)tk,0,sizeof(tk));
    memset((unsigned char *)skw,0,sizeof(skw));
    memset((unsigned char *)challenge,0,sizeof(challenge));
    
    /* response = 2(nonce_secret - sk*challenge)
     * Nonce = 8[nonce_secret]*G
     * PK = 2[sk]*G, except doubled (TODO)
     * so [2] ( [response]G + 2[challenge]PK ) = Nonce
     */
    
    return 0;
 }

 int
 goldilocks_verify (
    const uint8_t signature[56*2],
    const uint8_t *message,
    uint64_t message_len,
    const struct goldilocks_public_key_t *pubkey
 ) {
    struct p448_t pk;
    word_t s[448/WORD_BITS];
    
    mask_t succ = p448_deserialize(&pk,pubkey->opaque);
    if (!succ) return GOLDI_EINVAL;
    
    succ = q448_deserialize(s, &signature[56]);
    if (!succ) return GOLDI_EINVAL;
    
    /* challenge = H(pk, [nonceG], message).  FIXME: endian. */
    word_t challenge[512/WORD_BITS];
    struct sha512_ctx_t ctx;
    sha512_init(&ctx);
    sha512_update(&ctx, pubkey->opaque, 56);
    sha512_update(&ctx, signature, 56);
    sha512_update(&ctx, message, message_len);
    sha512_final(&ctx, (unsigned char *)challenge);
    barrett_reduce(challenge,512/WORD_BITS,0,q448_lo,7,4,62);
    
    struct p448_t eph;
    struct tw_extensible_t pk_text;
    
    /* deserialize [nonce]G */
    succ = p448_deserialize(&eph, signature);
    if (!succ) return GOLDI_EINVAL;
    
    
    // succ = affine_deserialize(&pk_aff,&pk);
    // if (!succ) return EINVAL;
    // 
    // convert_affine_to_extensible(&pk_ext,&pk_aff);
    // transfer_un_to_tw(&pk_text,&pk_ext);
    succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk);
    if (!succ) return GOLDI_EINVAL;
    
    edwards_combo_var_fixed_vt( &pk_text, challenge, s, goldilocks_global.wnafs, 5 );
    
    untwist_and_double_and_serialize( &pk, &pk_text );
    p448_sub(&eph, &eph, &pk);
    p448_bias(&eph, 2);
    
    succ = p448_is_zero(&eph);
    
    return succ ? 0 : GOLDI_EINVAL;
 }
--- a/goldilocks.h
+++ b/goldilocks.h
@@ -1,171 +0,0 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 /**
 * @file goldilocks.h
 * @author Mike Hamburg
 * @brief Goldilocks high-level functions.
 */
 #ifndef __GOLDILOCKS_H__
 #define __GOLDILOCKS_H__ 1

 #include <stdint.h>

 /**
 * @brief Serialized form of a Goldilocks public key.
 *
 * @warning This isn't even my final form!
 */
 struct goldilocks_public_key_t {
    uint8_t opaque[56]; /**< Serialized data. */
 };

 /**
 * @brief Serialized form of a Goldilocks private key.
 *
 * Contains 56 bytes of actual private key, 56 bytes of
 * public key, and 32 bytes of symmetric key for randomization.
 *
 * @warning This isn't even my final form!
 */
 struct goldilocks_private_key_t {
    uint8_t opaque[144]; /**< Serialized data. */
 };

 #ifdef __cplusplus
 extern "C" {
 #endif

 /** @brief No error. */
 static const int GOLDI_EOK      = 0;

 /** @brief Error: your key is corrupt. */
 static const int GOLDI_ECORRUPT = 44801;

 /** @brief Error: other party's key is corrupt. */
 static const int GOLDI_EINVAL   = 44802;

 /** @brief Error: not enough entropy. */
 static const int GOLDI_ENODICE  = 44804;

 /**
 * @brief Initialize Goldilocks' precomputed tables and
 * random number generator.
 * @retval GOLDI_EOK Success.
 * @retval Nonzero An error occurred.
 */
 int
 goldilocks_init();

 /**
 * @brief Generate a new random keypair.
 * @param [out] privkey The generated private key.
 * @param [out] pubkey The generated public key.
 *
 * @warning This isn't even my final form!
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_ENODICE Insufficient entropy.
 */
 int
 goldilocks_keygen (
    struct goldilocks_private_key_t *privkey,
    struct goldilocks_public_key_t *pubkey
 ) __attribute__((warn_unused_result));

 /**
 * @brief Generate a Diffie-Hellman shared secret in constant time.
 *
 * This function uses some compile-time flags whose merit remains to
 * be decided.
 *
 * If the flag EXPERIMENT_ECDH_OBLITERATE_CT is set, prepend 40 bytes
 * of zeros to the secret before hashing.  In the case that the other
 * party's key is detectably corrupt, instead the symmetric part
 * of the secret key is used to produce a pseudorandom value.
 *
 * If EXPERIMENT_ECDH_STIR_IN_PUBKEYS is set, the sum and product of
 * the two parties' public keys is prepended to the hash.
 *
 * @warning This isn't even my final form!
 *
 * @param [out] shared The shared secret established with the other party.
 * @param [in] my_privkey My private key.
 * @param [in] your_pubkey The other party's public key.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_ECORRUPT My key is corrupt.
 * @retval GOLDI_EINVAL   The other party's key is corrupt.
 */
 int
 goldilocks_shared_secret (
    uint8_t shared[64],
    const struct goldilocks_private_key_t *my_privkey,
    const struct goldilocks_public_key_t *your_pubkey
 ) __attribute__((warn_unused_result));
    
 /**
 * @brief Sign a message.
 *
 * The signature is deterministic, using the symmetric secret found in the
 * secret key to form a nonce.
 *
 * The technique used in signing is a modified Schnorr system, like EdDSA.
 *
 * @warning This isn't even my final form!
 * @warning This function contains endian bugs. (TODO)
 *
 * @param [out] signature_out Space for the output signature.
 * @param [in] message The message to be signed.
 * @param [in] message_len The length of the message to be signed.
 * @param [in] privkey My private key.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_ECORRUPT My key is corrupt.
 */
 int
 goldilocks_sign (
    uint8_t signature_out[56*2],
    const uint8_t *message,
    uint64_t message_len,
    const struct goldilocks_private_key_t *privkey
 );
    
 /**
 * @brief Verify a signature.
 *
 * This function is fairly strict.  It will correctly detect when
 * the signature has the wrong cofactor companent.  Once deserialization
 * of numbers is strictified (TODO) it will limit the response to being
 * less than q as well.
 * 
 * Currently this function does not detect when the public key is weird,
 * eg 0, has cofactor, etc.  As a result, a party with a bogus public
 * key could create signatures that succeed on some systems and fail on
 * others.
 *
 * @warning This isn't even my final form!
 * @warning This function contains endian bugs. (TODO)
 *
 * @param [out] signature_out The signature.
 * @param [in] message The message to be verified.
 * @param [in] message_len The length of the message to be verified.
 * @param [in] pubkey The signer's public key.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_EINVAL The public key or signature is corrupt.
 */
 int
 goldilocks_verify (
    const uint8_t signature[56*2],
    const uint8_t *message,
    uint64_t message_len,
    const struct goldilocks_public_key_t *pubkey
 ) __attribute__((warn_unused_result));

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __GOLDILOCKS_H__ */
--- a/intrinsics.h
+++ b/intrinsics.h
@@ -1,199 +0,0 @@
 /* Copyright (c) 2011 Stanford University.
 * Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 /** @file crandom.h
 * @brief cRandom intrinsics header.
 */

 #ifndef __CRANDOM_INTRINSICS_H__
 #define __CRANDOM_INTRINSICS_H__ 1

 #include <sys/types.h>

 #include <immintrin.h>

 #define INTRINSIC \
  static __inline__ __attribute__((__gnu_inline__, __always_inline__))

 #define GEN    1
 #define SSE2   2
 #define SSSE3  4
 #define AESNI  8
 #define XOP    16
 #define AVX    32
 #define AVX2   64
 #define RDRAND 128

 INTRINSIC u_int64_t rdtsc() {
  u_int64_t out = 0;
 # if (defined(__i386__) || defined(__x86_64__))
    __asm__ __volatile__ ("rdtsc" : "=A"(out));
 # endif
  return out;
 }

 /**
 * Return x unchanged, but confuse the compiler.
 *
 * This is mainly for use in test scripts, to prevent the value from
 * being constant-folded or removed by dead code elimination.
 *
 * @param x A 64-bit number.
 * @return The same number in a register.
 */
 INTRINSIC u_int64_t opacify(u_int64_t x) {
  __asm__ volatile("mov %0, %0" : "+r"(x));
  return x;
 }

 #ifdef __AVX2__
 #  define MIGHT_HAVE_AVX2 1
 #  ifndef MUST_HAVE_AVX2
 #    define MUST_HAVE_AVX2 0
 #  endif
 #else
 #  define MIGHT_HAVE_AVX2 0
 #  define MUST_HAVE_AVX2  0
 #endif

 #ifdef __AVX__
 #  define MIGHT_HAVE_AVX 1
 #  ifndef MUST_HAVE_AVX
 #    define MUST_HAVE_AVX MUST_HAVE_AVX2
 #  endif
 #else
 #  define MIGHT_HAVE_AVX 0
 #  define MUST_HAVE_AVX  0
 #endif

 #ifdef __SSSE3__
 #  define MIGHT_HAVE_SSSE3 1
 #  ifndef MUST_HAVE_SSSE3
 #    define MUST_HAVE_SSSE3 MUST_HAVE_AVX
 #  endif
 #else
 #  define MIGHT_HAVE_SSSE3 0
 #  define MUST_HAVE_SSSE3 0
 #endif

 #ifdef __SSE2__
 #  define MIGHT_HAVE_SSE2 1
 #  ifndef MUST_HAVE_SSE2
 #    define MUST_HAVE_SSE2 MUST_HAVE_SSSE3
 #  endif
   typedef __m128i ssereg;
 #  define pslldq _mm_slli_epi32
 #  define pshufd _mm_shuffle_epi32

 INTRINSIC ssereg sse2_rotate(int r, ssereg a) {
  return _mm_slli_epi32(a, r) ^ _mm_srli_epi32(a, 32-r);
 }

 #else
 #  define MIGHT_HAVE_SSE2 0
 #  define MUST_HAVE_SSE2  0
 #endif

 #ifdef __AES__
 /* don't include intrinsics file, because not all platforms have it */
 #  define MIGHT_HAVE_AESNI 1
 #  ifndef MIGHT_HAVE_RDRAND
 #    define MIGHT_HAVE_RDRAND 1
 #  endif
 #  ifndef MUST_HAVE_RDRAND
 #    define MUST_HAVE_RDRAND 0
 #  endif
 #  ifndef MUST_HAVE_AESNI
 #    define MUST_HAVE_AESNI 0
 #  endif

 INTRINSIC ssereg aeskeygenassist(int rc, ssereg x) {
  ssereg out;
  __asm__("aeskeygenassist %2, %1, %0" : "=x"(out) : "x"(x), "g"(rc));
  return out;
 }

 INTRINSIC ssereg aesenc(ssereg subkey, ssereg block) {
  ssereg out = block;
  __asm__("aesenc %1, %0" : "+x"(out) : "x"(subkey));
  return out;
 }

 INTRINSIC ssereg aesenclast(ssereg subkey, ssereg block) {
  ssereg out = block;
  __asm__("aesenclast %1, %0" : "+x"(out) : "x"(subkey));
  return out;
 }

 #else
 #  define MIGHT_HAVE_AESNI 0
 #  define MUST_HAVE_AESNI 0
 #  define MIGHT_HAVE_RDRAND 0
 #  define MUST_HAVE_RDRAND 0
 #endif

 #ifdef __XOP__
 /* don't include intrinsics file, because not all platforms have it */
 #  define MIGHT_HAVE_XOP 1
 #  ifndef MUST_HAVE_XOP
 #    define MUST_HAVE_XOP 0
 #  endif
 INTRINSIC ssereg xop_rotate(int amount, ssereg x) {
  ssereg out;
  __asm__ ("vprotd %1, %2, %0" : "=x"(out) : "x"(x), "g"(amount));
  return out;
 }
 #else
 #  define MIGHT_HAVE_XOP 0
 #  define MUST_HAVE_XOP 0
 #endif

 #define MIGHT_MASK \
  ( SSE2   * MIGHT_HAVE_SSE2   \
  | SSSE3  * MIGHT_HAVE_SSSE3  \
  | AESNI  * MIGHT_HAVE_AESNI  \
  | XOP    * MIGHT_HAVE_XOP    \
  | AVX    * MIGHT_HAVE_AVX    \
  | RDRAND * MIGHT_HAVE_RDRAND \
  | AVX2   * MIGHT_HAVE_AVX2)

 #define MUST_MASK \
  ( SSE2   * MUST_HAVE_SSE2   \
  | SSSE3  * MUST_HAVE_SSSE3  \
  | AESNI  * MUST_HAVE_AESNI  \
  | XOP    * MUST_HAVE_XOP    \
  | AVX    * MUST_HAVE_AVX    \
  | RDRAND * MUST_HAVE_RDRAND \
  | AVX2   * MUST_HAVE_AVX2 )

 #define MIGHT_HAVE(feature) ((MIGHT_MASK & feature) == feature)
 #define MUST_HAVE(feature) ((MUST_MASK & feature) == feature)

 #ifdef __cplusplus
 #  define extern_c extern "C"
 #else
 #  define extern_c
 #endif

 extern_c
 unsigned int crandom_detect_features();

 #ifndef likely
 #  define likely(x)       __builtin_expect((x),1)
 #  define unlikely(x)     __builtin_expect((x),0)
 #endif

 extern volatile unsigned int crandom_features;
 INTRINSIC int HAVE(unsigned int feature) {
  unsigned int features;
  if (!MIGHT_HAVE(feature)) return 0;
  if (MUST_HAVE(feature))   return 1;
  features = crandom_features;
  if (unlikely(!features))
    crandom_features = features = crandom_detect_features();
  return likely((features & feature) == feature);
 }

 #endif /* __CRANDOM_INTRINSICS_H__ */
--- a/p448.c
+++ b/p448.c
@@ -1,446 +0,0 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "p448.h"
 #include "x86-64-arith.h"

 void
 p448_mul (
    p448_t *__restrict__ cs,
    const p448_t *as,
    const p448_t *bs
 ) {
    const uint64_t *a = as->limb, *b = bs->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<56) - 1;  

    uint64_t aa[4], bb[4];

    /* For some reason clang doesn't vectorize this without prompting? */
    unsigned int i;
    for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
        ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];     
    }
    /*
    for (int i=0; i<4; i++) {
    aa[i] = a[i] + a[i+4];
    bb[i] = b[i] + b[i+4];
    }
    */

    accum2  = widemul(&a[0],&b[3]);
    accum1  = widemul(&aa[0],&bb[3]);
    accum0  = widemul(&a[4],&b[7]);

    mac(&accum2, &a[1], &b[2]);
    mac(&accum1, &aa[1], &bb[2]);
    mac(&accum0, &a[5], &b[6]);

    mac(&accum2, &a[2], &b[1]);
    mac(&accum1, &aa[2], &bb[1]);
    mac(&accum0, &a[6], &b[5]);

    mac(&accum2, &a[3], &b[0]);
    mac(&accum1, &aa[3], &bb[0]);
    mac(&accum0, &a[7], &b[4]);

    accum1 -= accum2;
    accum0 += accum2;

    c[3] = ((uint64_t)(accum0)) & mask;
    c[7] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    {
        accum2 = accum1;
        accum1 += accum0;
        accum0 = accum2;
    }

    accum2 = widemul(&a[0],&b[0]);
    accum1 -= accum2;
    accum0 += accum2;

    accum2  = widemul(&aa[1],&bb[3]);
    msb(&accum0, &a[1], &b[3]);
    mac(&accum1, &a[5], &b[7]);

    msb(&accum0, &a[2], &b[2]);
    mac(&accum2, &aa[2], &bb[2]);
    mac(&accum1, &a[6], &b[6]);

    msb(&accum0, &a[3], &b[1]);
    mac(&accum1, &a[7], &b[5]);
    mac(&accum2, &aa[3], &bb[1]);

    accum0 += accum2;
    accum1 += accum2;
    mac(&accum0, &a[4], &b[4]);
    mac(&accum1, &aa[0], &bb[0]);

    c[0] = ((uint64_t)(accum0)) & mask;
    c[4] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    accum2  = widemul(&aa[2],&bb[3]);
    msb(&accum0, &a[2], &b[3]);
    mac(&accum1, &a[6], &b[7]);

    mac(&accum2, &aa[3], &bb[2]);
    msb(&accum0, &a[3], &b[2]);
    mac(&accum1, &a[7], &b[6]);

    accum1 += accum2;
    accum0 += accum2;

    accum2  = widemul(&a[0],&b[1]);
    mac(&accum1, &aa[0], &bb[1]);
    mac(&accum0, &a[4], &b[5]);

    mac(&accum2, &a[1], &b[0]);
    mac(&accum1, &aa[1], &bb[0]);
    mac(&accum0, &a[5], &b[4]);

    accum1 -= accum2;
    accum0 += accum2;

    c[1] = ((uint64_t)(accum0)) & mask;
    c[5] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    accum2  = widemul(&aa[3],&bb[3]);
    msb(&accum0, &a[3], &b[3]);
    mac(&accum1, &a[7], &b[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2  = widemul(&a[0],&b[2]);
    mac(&accum1, &aa[0], &bb[2]);
    mac(&accum0, &a[4], &b[6]);

    mac(&accum2, &a[1], &b[1]);
    mac(&accum1, &aa[1], &bb[1]);
    mac(&accum0, &a[5], &b[5]);

    mac(&accum2, &a[2], &b[0]);
    mac(&accum1, &aa[2], &bb[0]);
    mac(&accum0, &a[6], &b[4]);

    accum1 -= accum2;
    accum0 += accum2;

    c[2] = ((uint64_t)(accum0)) & mask;
    c[6] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    accum0 += c[3];
    accum1 += c[7];
    c[3] = ((uint64_t)(accum0)) & mask;
    c[7] = ((uint64_t)(accum1)) & mask;

    /* we could almost stop here, but it wouldn't be stable, so... */

    accum0 >>= 56;
    accum1 >>= 56;
    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
    c[0] += ((uint64_t)(accum1));
 }

 void
 p448_mulw (
    p448_t *__restrict__ cs,
    const p448_t *as,
    uint64_t b
 ) {
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0, accum4;
    uint64_t mask = (1ull<<56) - 1;  

    accum0 = widemul_rm(b, &a[0]);
    accum4 = widemul_rm(b, &a[4]);

    c[0] = accum0 & mask; accum0 >>= 56;
    c[4] = accum4 & mask; accum4 >>= 56;

    mac_rm(&accum0, b, &a[1]);
    mac_rm(&accum4, b, &a[5]);

    c[1] = accum0 & mask; accum0 >>= 56;
    c[5] = accum4 & mask; accum4 >>= 56;

    mac_rm(&accum0, b, &a[2]);
    mac_rm(&accum4, b, &a[6]);

    c[2] = accum0 & mask; accum0 >>= 56;
    c[6] = accum4 & mask; accum4 >>= 56;

    mac_rm(&accum0, b, &a[3]);
    mac_rm(&accum4, b, &a[7]);

    c[3] = accum0 & mask; accum0 >>= 56;
    c[7] = accum4 & mask; accum4 >>= 56;

    c[4] += accum0 + accum4;
    c[0] += accum4;
 }

 void
 p448_sqr (
    p448_t *__restrict__ cs,
    const p448_t *as
 ) {
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<56) - 1;  

    uint64_t aa[4];

    /* For some reason clang doesn't vectorize this without prompting? */
    unsigned int i;
    for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
      ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
    }

    accum2  = widemul(&a[0],&a[3]);
    accum1  = widemul(&aa[0],&aa[3]);
    accum0  = widemul(&a[4],&a[7]);

    mac(&accum2, &a[1], &a[2]);
    mac(&accum1, &aa[1], &aa[2]);
    mac(&accum0, &a[5], &a[6]);

    accum1 -= accum2;
    accum0 += accum2;

    c[3] = ((uint64_t)(accum0))<<1 & mask;
    c[7] = ((uint64_t)(accum1))<<1 & mask;

    accum0 >>= 55;
    accum1 >>= 55;

    {
        accum2 = accum1;
        accum1 += accum0;
        accum0 = accum2;
    }

    accum2 = widemul(&a[0],&a[0]);
    accum1 -= accum2;
    accum0 += accum2;

    accum2  = widemul2(&aa[1],&aa[3]);
    msb2(&accum0, &a[1], &a[3]);
    mac2(&accum1, &a[5], &a[7]);

    msb(&accum0, &a[2], &a[2]);
    mac(&accum2, &aa[2], &aa[2]);
    mac(&accum1, &a[6], &a[6]);

    accum0 += accum2;
    accum1 += accum2;
    mac(&accum0, &a[4], &a[4]);
    mac(&accum1, &aa[0], &aa[0]);

    c[0] = ((uint64_t)(accum0)) & mask;
    c[4] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    accum2  = widemul2(&aa[2],&aa[3]);
    msb2(&accum0, &a[2], &a[3]);
    mac2(&accum1, &a[6], &a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2  = widemul2(&a[0],&a[1]);
    mac2(&accum1, &aa[0], &aa[1]);
    mac2(&accum0, &a[4], &a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[1] = ((uint64_t)(accum0)) & mask;
    c[5] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    accum2  = widemul(&aa[3],&aa[3]);
    msb(&accum0, &a[3], &a[3]);
    mac(&accum1, &a[7], &a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2  = widemul2(&a[0],&a[2]);
    mac2(&accum1, &aa[0], &aa[2]);
    mac2(&accum0, &a[4], &a[6]);

    mac(&accum2, &a[1], &a[1]);
    mac(&accum1, &aa[1], &aa[1]);
    mac(&accum0, &a[5], &a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[2] = ((uint64_t)(accum0)) & mask;
    c[6] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    accum0 += c[3];
    accum1 += c[7];
    c[3] = ((uint64_t)(accum0)) & mask;
    c[7] = ((uint64_t)(accum1)) & mask;

    /* we could almost stop here, but it wouldn't be stable, so... */

    accum0 >>= 56;
    accum1 >>= 56;
    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
    c[0] += ((uint64_t)(accum1));
 }

 void
 p448_strong_reduce (
    p448_t *a
 ) {
    uint64_t mask = (1ull<<56)-1;

    /* first, clear high */
    a->limb[4] += a->limb[7]>>56;
    a->limb[0] += a->limb[7]>>56;
    a->limb[7] &= mask;

    /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */

    /* compute total_value - p.  No need to reduce mod p. */

    __int128_t scarry = 0;
    int i;
    for (i=0; i<8; i++) {
        scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask);
        a->limb[i] = scarry & mask;
        scarry >>= 56;
    }

    /* uncommon case: it was >= p, so now scarry = 0 and this = x
    * common case: it was < p, so now scarry = -1 and this = x - p + 2^448
    * so let's add back in p.  will carry back off the top for 2^448.
    */

    assert(is_zero(scarry) | is_zero(scarry+1));

    uint64_t scarry_mask = scarry & mask;
    __uint128_t carry = 0;

    /* add it back */
    for (i=0; i<8; i++) {
        carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask);
        a->limb[i] = carry & mask;
        carry >>= 56;
    }

    assert(is_zero(carry + scarry));
 }

 mask_t
 p448_is_zero (
    const struct p448_t *a
 ) {
    struct p448_t b;
    p448_copy(&b,a);
    p448_strong_reduce(&b);

    uint64_t any = 0;
    int i;
    for (i=0; i<8; i++) {
        any |= b.limb[i];
    }
    return is_zero(any);
 }

 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
 ) {
    int i,j;
    p448_t red;
    p448_copy(&red, x);
    p448_strong_reduce(&red);
    for (i=0; i<8; i++) {
        for (j=0; j<7; j++) {
            serial[7*i+j] = red.limb[i];
            red.limb[i] >>= 8;
        }
        assert(red.limb[i] == 0);
    }
 }

 void
 q448_serialize (
    uint8_t *serial,
    const word_t x[7]
 ) {
    int i,j;
    for (i=0; i<7; i++) {
        for (j=0; j<8; j++) {
            serial[8*i+j] = x[i]>>(8*j);
        }
    }
 }

 mask_t
 q448_deserialize (
    word_t x[7],
    const uint8_t serial[56]
 ) {
    int i,j;
    for (i=0; i<7; i++) {
        word_t out = 0;
        for (j=0; j<8; j++) {
            out |= ((word_t)serial[8*i+j])<<(8*j);
        }
        x[i] = out;
    }
    /* TODO: check for reduction */
    return MASK_SUCCESS;
 }

 mask_t
 p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 ) {
    int i,j;
    for (i=0; i<8; i++) {
        word_t out = 0;
        for (j=0; j<7; j++) {
            out |= ((word_t)serial[7*i+j])<<(8*j);
        }
        x->limb[i] = out;
    }
    /* TODO: check for reduction */
    return MASK_SUCCESS;
 }
--- a/p448.h
+++ b/p448.h
@@ -1,330 +0,0 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __P448_H__
 #define __P448_H__ 1

 #include <stdint.h>
 #include <assert.h>

 #include "word.h"

 typedef struct p448_t {
  uint64_t limb[8];
 } __attribute__((aligned(32))) p448_t;

 #ifdef __cplusplus
 extern "C" {
 #endif

 static __inline__ void
 p448_set_ui (
    p448_t *out,
    uint64_t x
 ) __attribute__((unused,always_inline));
           
 static __inline__ void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t do_swap
 ) __attribute__((unused,always_inline));

 static __inline__ void
 p448_add (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_sub (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_neg (
    p448_t *out,
    const p448_t *a
 ) __attribute__((unused,always_inline));
            
 static __inline__ void
 p448_cond_neg (
    p448_t *a,
    mask_t doNegate
 ) __attribute__((unused,always_inline));

 static __inline__ void
 p448_addw (
    p448_t *a,
    uint64_t x
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_subw (
    p448_t *a,
    uint64_t x
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_copy (
    p448_t *out,
    const p448_t *a
 ) __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_weak_reduce (
    p448_t *inout
 ) __attribute__((unused,always_inline));
             
 void
 p448_strong_reduce (
    p448_t *inout
 );

 mask_t
 p448_is_zero (
    const p448_t *in
 );
             
 static __inline__ void
 p448_bias (
    p448_t *inout,
    int amount
 ) __attribute__((unused,always_inline));
         
 void
 p448_mul (
    p448_t *__restrict__ out,
    const p448_t *a,
    const p448_t *b
 );

 void
 p448_mulw (
    p448_t *__restrict__ out,
    const p448_t *a,
    uint64_t b
 );

 void
 p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
 );
         
 static __inline__ void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) __attribute__((unused,always_inline));

 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
 );

 void
 q448_serialize (
    uint8_t *serial,
    const word_t x[7]
 );

 mask_t
 q448_deserialize (
    word_t x[7],
    const uint8_t serial[56]
 );

 mask_t
 p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 );

 /* -------------- Inline functions begin here -------------- */

 void
 p448_set_ui (
    p448_t *out,
    uint64_t x
 ) {
    int i;
    out->limb[0] = x;
    for (i=1; i<8; i++) {
      out->limb[i] = 0;
    }
 }
            
 void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t doswap
 ) {
    big_register_t *aa = (big_register_t*)a;
    big_register_t *bb = (big_register_t*)b;
    big_register_t m = doswap;

    unsigned int i;
    for (i=0; i<sizeof(*a)/sizeof(*aa); i++) {
        big_register_t x = m & (aa[i]^bb[i]);
        aa[i] ^= x;
        bb[i] ^= x;
    }
 }

 void
 p448_add (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)b)[i];
    }
    /*
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
        out->limb[i] = a->limb[i] + b->limb[i];
    }
    */
 }

 void
 p448_sub (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] - ((const uint64xn_t*)b)[i];
    }
    /*
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
        out->limb[i] = a->limb[i] - b->limb[i];
    }
    */
 }

 void
 p448_neg (
    p448_t *out,
    const p448_t *a
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)out)[i] = -((const uint64xn_t*)a)[i];
    }
    /*
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
        out->limb[i] = -a->limb[i];
    }
    */
 }

 void
 p448_cond_neg(
    p448_t *a,
    mask_t doNegate
 ) {
    unsigned int i;
    struct p448_t negated;
    big_register_t *aa = (big_register_t *)a;
    big_register_t *nn = (big_register_t*)&negated;
    big_register_t m = doNegate;
    
    p448_neg(&negated, a);
    p448_bias(&negated, 2);
    
    for (i=0; i<sizeof(*a)/sizeof(*aa); i++) {
        aa[i] = (aa[i] & ~m) | (nn[i] & m);
    }
 }

 void
 p448_addw (
    p448_t *a,
    uint64_t x
 ) {
  a->limb[0] += x;
 }
             
 void
 p448_subw (
    p448_t *a,
    uint64_t x
 ) {
  a->limb[0] -= x;
 }

 void
 p448_copy (
    p448_t *out,
    const p448_t *a
 ) {
  *out = *a;
 }

 void
 p448_bias (
    p448_t *a,
    int amt
 ) {
    uint64_t co1 = ((1ull<<56)-1)*amt, co2 = co1-amt;
    uint64x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1};
    uint64x4_t *aa = (uint64x4_t*) a;
    aa[0] += lo;
    aa[1] += hi;
 }

 void
 p448_weak_reduce (
    p448_t *a
 ) {
    /* PERF: use pshufb/palignr if anyone cares about speed of this */
    uint64_t mask = (1ull<<56) - 1;
    uint64_t tmp = a->limb[7] >> 56;
    int i;
    a->limb[4] += tmp;
    for (i=7; i>0; i--) {
        a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>56);
    }
    a->limb[0] = (a->limb[0] & mask) + tmp;
 }

 void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) {
    p448_t tmp;
    assert(n>0);
    if (n&1) {
        p448_sqr(y,x);
        n--;
    } else {
        p448_sqr(&tmp,x);
        p448_sqr(y,&tmp);
        n-=2;
    }
    for (; n; n-=2) {
        p448_sqr(&tmp,y);
        p448_sqr(y,&tmp);
    }
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __P448_H__ */
--- a/scalarmul.c
+++ b/scalarmul.c
@@ -1,776 +0,0 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #include <stdlib.h>

 #include "scalarmul.h"
 #include "string.h"
 #include "barrett_field.h"

 mask_t
 p448_montgomery_ladder(
    struct p448_t *out,
    const struct p448_t *in,
    const uint64_t *scalar,
    int nbits,
    int n_extra_doubles
 ) {
    struct montgomery_t mont;
    p448_sqr(&mont.z0,in);
    p448_copy(&mont.za,&mont.z0);
    p448_set_ui(&mont.xa,1);
    p448_set_ui(&mont.zd,0);
    p448_set_ui(&mont.xd,1);
    
    int i,j,n=(nbits-1)&63;
    mask_t pflip = 0;
    for (j=(nbits+63)/64-1; j>=0; j--) {
        uint64_t w = scalar[j];
        for (i=n; i>=0; i--) {
            mask_t flip = -((w>>i)&1);
            p448_cond_swap(&mont.xa,&mont.xd,flip^pflip);
            p448_cond_swap(&mont.za,&mont.zd,flip^pflip);
            montgomery_step(&mont);
            pflip = flip;
        }
        n = 63;
    }
    p448_cond_swap(&mont.xa,&mont.xd,pflip);
    p448_cond_swap(&mont.za,&mont.zd,pflip);
    
    for (j=0; j<n_extra_doubles; j++) {
        montgomery_step(&mont);
    }
    
    struct p448_t sign;
    serialize_montgomery(&sign, out, &mont, in);
    
    p448_addw(&sign,1);
    return ~p448_is_zero(&sign);
 }

 static __inline__ void
 cond_negate_tw_niels(
    struct tw_niels_t *n,
    mask_t doNegate
 ) {
    p448_cond_swap(&n->a, &n->b, doNegate);
    p448_cond_neg(&n->c, doNegate);
 }

 static __inline__ void
 cond_negate_tw_pniels(
    struct tw_pniels_t *n,
    mask_t doNegate
 ) {
    cond_negate_tw_niels(&n->n, doNegate);
 }

 void    
 constant_time_lookup_tw_pniels(
    struct tw_pniels_t *out,
    const struct tw_pniels_t *in,
    int nin,
    int idx
 ) {
    big_register_t big_one = 1, big_i = idx;
    big_register_t *o = (big_register_t *)out;
    const big_register_t *i = (const big_register_t *)in;
    int j;
    unsigned int k;
    
    memset(out, 0, sizeof(*out));
    for (j=0; j<nin; j++, big_i-=big_one) {
        big_register_t mask = br_is_zero(big_i);
        for (k=0; k<sizeof(*out)/sizeof(*o); k++) {
            o[k] |= mask & i[k+j*sizeof(*out)/sizeof(*o)];
        }
    }
 }

 static __inline__ void    
 constant_time_lookup_tw_niels(
    struct tw_niels_t *out,
    const struct tw_niels_t *in,
    int nin,
    int idx
 ) {
    big_register_t big_one = 1, big_i = idx;
    big_register_t *o = (big_register_t *)out;
    const big_register_t *i = (const big_register_t *)in;
    int j;
    unsigned int k;
    
    memset(out, 0, sizeof(*out));
    for (j=0; j<nin; j++, big_i-=big_one) {
        big_register_t mask = br_is_zero(big_i);
        for (k=0; k<sizeof(*out)/sizeof(*o); k++) {
            o[k] |= mask & i[k+j*sizeof(*out)/sizeof(*o)];
        }
    }
 }

 static void
 convert_to_signed_window_form(
    word_t *out,
    const word_t *scalar,
    const word_t *prepared_data,
    int nwords
 ) {
    mask_t mask = -(scalar[0]&1);

    word_t carry = add_nr_ext_packed(out, scalar, nwords, prepared_data, nwords, ~mask);
    carry += add_nr_ext_packed(out, out, nwords, prepared_data+nwords, nwords, mask);
    
    assert(!(out[0]&1));
    
    int i;
    for (i=0; i<nwords; i++) {
        out[i] >>= 1;
        if (i<nwords-1) {
            out[i] |= out[i+1]<<(WORD_BITS-1);
        } else {
            out[i] |= carry<<(WORD_BITS-1);
        }
    }
 }

 void
 edwards_scalar_multiply(
    struct tw_extensible_t *working,
    const uint64_t scalar[7]
 ) {

    const int nbits=448; /* HACK? */
    word_t prepared_data[14] = {
        0x9595b847fdf73126ull,
        0x9bb9b8a856af5200ull,
        0xb3136e22f37d5c4full,
        0x0000000189a19442ull,
        0x0000000000000000ull,
        0x0000000000000000ull,
        0x4000000000000000ull,

        0x721cf5b5529eec33ull,
        0x7a4cf635c8e9c2abull,
        0xeec492d944a725bfull,
        0x000000020cd77058ull,
        0x0000000000000000ull,
        0x0000000000000000ull,
        0x0000000000000000ull
    }; /* TODO: split off */
    
    uint64_t scalar2[7];
    convert_to_signed_window_form(scalar2,scalar,prepared_data,7);

    struct tw_extensible_t tabulator;
    copy_tw_extensible(&tabulator, working);
    double_tw_extensible(&tabulator);

    struct tw_pniels_t pn, multiples[8];
    convert_tw_extensible_to_tw_pniels(&pn, &tabulator);
    convert_tw_extensible_to_tw_pniels(&multiples[0], working);

    int i;
    for (i=1; i<8; i++) {
        add_tw_pniels_to_tw_extensible(working, &pn);
        convert_tw_extensible_to_tw_pniels(&multiples[i], working);
    }

    i = nbits - 4;
    int bits = scalar2[i/64] >> (i%64) & 0xF,
        inv = (bits>>3)-1;
    bits ^= inv;
    
    constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7);
    cond_negate_tw_pniels(&pn, inv);
    convert_tw_pniels_to_tw_extensible(working, &pn);
 		

    for (i-=4; i>=0; i-=4) {
        double_tw_extensible(working);
        double_tw_extensible(working);
        double_tw_extensible(working);
        double_tw_extensible(working);

        bits = scalar2[i/64] >> (i%64) & 0xF;
        inv = (bits>>3)-1;
        bits ^= inv;
    
        constant_time_lookup_tw_pniels(&pn, multiples, 8, bits&7);
        cond_negate_tw_pniels(&pn, inv);
        add_tw_pniels_to_tw_extensible(working, &pn);
    }
 }

 void
 edwards_scalar_multiply_vlook(
    struct tw_extensible_t *working,
    const uint64_t scalar[7]
 ) {

    const int nbits=448; /* HACK? */
    word_t prepared_data[14] = {
        0x9595b847fdf73126ull,
        0x9bb9b8a856af5200ull,
        0xb3136e22f37d5c4full,
        0x0000000189a19442ull,
        0x0000000000000000ull,
        0x0000000000000000ull,
        0x4000000000000000ull,

        0x721cf5b5529eec33ull,
        0x7a4cf635c8e9c2abull,
        0xeec492d944a725bfull,
        0x000000020cd77058ull,
        0x0000000000000000ull,
        0x0000000000000000ull,
        0x0000000000000000ull
    }; /* TODO: split off */
    
    uint64_t scalar2[7];
    convert_to_signed_window_form(scalar2,scalar,prepared_data,7);

    struct tw_extensible_t tabulator;
    copy_tw_extensible(&tabulator, working);
    double_tw_extensible(&tabulator);

    struct tw_pniels_t pn, multiples[8];
    convert_tw_extensible_to_tw_pniels(&pn, &tabulator);
    convert_tw_extensible_to_tw_pniels(&multiples[0], working);

    int i;
    for (i=1; i<8; i++) {
        add_tw_pniels_to_tw_extensible(working, &pn);
        convert_tw_extensible_to_tw_pniels(&multiples[i], working);
    }

    i = nbits - 4;
    int bits = scalar2[i/64] >> (i%64) & 0xF,
        inv = (bits>>3)-1;
    bits ^= inv;

 	copy_tw_pniels(&pn, &multiples[bits&7]);
    cond_negate_tw_pniels(&pn, inv);
    convert_tw_pniels_to_tw_extensible(working, &pn);
 		

    for (i-=4; i>=0; i-=4) {
        double_tw_extensible(working);
        double_tw_extensible(working);
        double_tw_extensible(working);
        double_tw_extensible(working);

        bits = scalar2[i/64] >> (i%64) & 0xF;
        inv = (bits>>3)-1;
        bits ^= inv;
    
 		copy_tw_pniels(&pn, &multiples[bits&7]);
        cond_negate_tw_pniels(&pn, inv);
        add_tw_pniels_to_tw_extensible(working, &pn);
    }
 }


 void
 edwards_comb(
    struct tw_extensible_t *working,
    const word_t scalar[7],
    const struct tw_niels_t *table,
    int n,
    int t,
    int s
 ) {
    word_t prepared_data[14] = {
        0xebec9967f5d3f5c2ull,
        0x0aa09b49b16c9a02ull,
        0x7f6126aec172cd8eull,
        0x00000007b027e54dull,
        0x0000000000000000ull,
        0x0000000000000000ull,
        0x4000000000000000ull,

        0xc873d6d54a7bb0cfull,
        0xe933d8d723a70aadull,
        0xbb124b65129c96fdull,
        0x00000008335dc163ull,
        0x0000000000000000ull,
        0x0000000000000000ull,
        0x0000000000000000ull
    }; /* TODO: split off.  Above is for 450 bits */
    
    word_t scalar2[7];
    convert_to_signed_window_form(scalar2,scalar,prepared_data,7);
    
    /* const int n=3, t=5, s=30; */
    int i,j,k;
    
    struct tw_niels_t ni;
    
    for (i=0; i<s; i++) {
        if (i) double_tw_extensible(working);
        
        for (j=0; j<n; j++) {
            int tab = 0;
 			
 			/*
             * PERF: This computation takes about 1.5µs on SBR, i.e. 2-3% of the
 			 * time of a keygen or sign op.  Surely it is possible to speed it up.
             */
            for (k=0; k<t; k++) {
                int bit = (s-1-i) + k*s + j*(s*t);
                if (bit < 7*WORD_BITS) {
                    tab |= (scalar2[bit/WORD_BITS] >> (bit%WORD_BITS) & 1) << k;
                }
            }
            
            mask_t invert = (tab>>(t-1))-1;
            tab ^= invert;
            tab &= (1<<(t-1)) - 1;
            
            constant_time_lookup_tw_niels(&ni, table + (j<<(t-1)), 1<<(t-1), tab);
            cond_negate_tw_niels(&ni, invert);
            if (i||j) {
                add_tw_niels_to_tw_extensible(working, &ni);
            } else {
                convert_tw_niels_to_tw_extensible(working, &ni);
            }
        }
    }
 }

 void
 simultaneous_invert_p448(
    struct p448_t *out,
    const struct p448_t *in,
    int n
 ) {
  if (!n) return;
  
  p448_copy(&out[1], &in[0]);
  int i;
  for (i=1; i<n-1; i++) {
      p448_mul(&out[i+1], &out[i], &in[i]);
  }
  p448_mul(&out[0], &out[n-1], &in[n-1]);
  
  struct p448_t tmp;
  p448_inverse(&tmp, &out[0]);
  p448_copy(&out[0], &tmp);
  
  /* at this point, out[0] = product(in[i]) ^ -1
   * out[i] = product(in[0]..in[i-1]) if i != 0
   */
  for (i=n-1; i>0; i--) {
      p448_mul(&tmp, &out[i], &out[0]);
      p448_copy(&out[i], &tmp);
      
      p448_mul(&tmp, &out[0], &in[i]);
      p448_copy(&out[0], &tmp);
  }
 }

 mask_t
 precompute_for_combs(
  struct tw_niels_t *out,
  const struct tw_extensible_t *const_base,
  int n,
  int t,
  int s
 ) {
    if (s < 1) return 0;
  
    struct tw_extensible_t working, start;
    copy_tw_extensible(&working, const_base);
    struct tw_pniels_t pn_tmp;
  
    struct tw_pniels_t *doubles = (struct tw_pniels_t *) malloc(sizeof(*doubles) * (t-1));
    struct p448_t *zs  = (struct p448_t *) malloc(sizeof(*zs) * (n<<(t-1)));
    struct p448_t *zis = (struct p448_t *) malloc(sizeof(*zis) * (n<<(t-1)));
  
    if (!doubles || !zs || !zis) {
        free(doubles);
        free(zs);
        free(zis);
        return 0;
    }
  
    int i,j,k;
    for (i=0; i<n; i++) {

        /* doubling phase */
        for (j=0; j<t; j++) {
            if (j) {
                convert_tw_extensible_to_tw_pniels(&pn_tmp, &working);
                add_tw_pniels_to_tw_extensible(&start, &pn_tmp);
            } else {
                copy_tw_extensible(&start, &working);
            }

            if (j==t-1 && i==n-1) {
                break;
            }

            double_tw_extensible(&working);
            if (j<t-1) {
                convert_tw_extensible_to_tw_pniels(&doubles[j], &working);
            }

            for (k=0; k<s-1; k++) {
                double_tw_extensible(&working);
            }
        }

        /* Gray-code phase */
        for (j=0;; j++) {
            int gray = j ^ (j>>1);
            int idx = ((i+1)<<(t-1))-1 ^ gray;

            convert_tw_extensible_to_tw_pniels(&pn_tmp, &start);
            copy_tw_niels(&out[idx], &pn_tmp.n);
            p448_copy(&zs[idx], &pn_tmp.z);
 			
            if (j >= (1<<(t-1)) - 1) break;
            int delta = (j+1) ^ ((j+1)>>1) ^ gray;

            for (k=0; delta>1; k++)
                delta >>=1;
            
            if (gray & (1<<k)) {
                /* start += doubles[k] */
                add_tw_pniels_to_tw_extensible(&start, &doubles[k]);
            } else {
                /* start -= doubles[k] */
                sub_tw_pniels_from_tw_extensible(&start, &doubles[k]);
            }
            
            
        }
    }
 	
    simultaneous_invert_p448(zis, zs, n<<(t-1));

    p448_t product;
    for (i=0; i<n<<(t-1); i++) {
        p448_mul(&product, &out[i].a, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].a, &product);
        
        p448_mul(&product, &out[i].b, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].b, &product);
        
        p448_mul(&product, &out[i].c, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].c, &product);
    }
 	
 	mask_t ret = ~p448_is_zero(&zis[0]);

    free(doubles);
    free(zs);
    free(zis);

    return ret;
 }

 mask_t
 precompute_for_wnaf(
    struct tw_niels_t *out,
    const struct tw_extensible_t *const_base,
    int tbits
 ) {
    int i;
    struct p448_t *zs  = (struct p448_t *) malloc(sizeof(*zs)<<tbits);
    struct p448_t *zis = (struct p448_t *) malloc(sizeof(*zis)<<tbits);

    if (!zs || !zis) {
        free(zs);
        free(zis);
        return 0;
    }

    struct tw_extensible_t base;
    copy_tw_extensible(&base,const_base);
    
    struct tw_pniels_t twop, tmp;
    
    convert_tw_extensible_to_tw_pniels(&tmp, &base);
    p448_copy(&zs[0], &tmp.z);
    copy_tw_niels(&out[0], &tmp.n);

    if (tbits > 0) {
        double_tw_extensible(&base);
        convert_tw_extensible_to_tw_pniels(&twop, &base);
        add_tw_pniels_to_tw_extensible(&base, &tmp);
        
        convert_tw_extensible_to_tw_pniels(&tmp, &base);
        p448_copy(&zs[1], &tmp.z);
        copy_tw_niels(&out[1], &tmp.n);

        for (i=2; i < 1<<tbits; i++) {
            add_tw_pniels_to_tw_extensible(&base, &twop);
            convert_tw_extensible_to_tw_pniels(&tmp, &base);
            p448_copy(&zs[i], &tmp.z);
            copy_tw_niels(&out[i], &tmp.n);
        }
    }
    
    simultaneous_invert_p448(zis, zs, 1<<tbits);

    p448_t product;
    for (i=0; i<1<<tbits; i++) {
        p448_mul(&product, &out[i].a, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].a, &product);
        
        p448_mul(&product, &out[i].b, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].b, &product);
        
        p448_mul(&product, &out[i].c, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].c, &product);
    }

    free(zs);
    free(zis);

    return -1;
 }

 /**
 * @cond internal
 * Control for variable-time scalar multiply algorithms.
 */
 struct smvt_control {
  int power, addend;
 };

 static int
 recode_wnaf(
    struct smvt_control *control, /* [nbits/(tableBits+1) + 3] */
    const word_t *scalar,
    int nbits,
    int tableBits)
 {
    int current = 0, position=0, i;

    /* PERF: negate scalar if it's large
     * PERF: this is a pretty simplistic algorithm.  I'm sure there's a faster one...
     */
    for (i=nbits-1; i >= -2 - tableBits; i--) {
        int bit = (i >= 0)
            ? (scalar[i/WORD_BITS] >> (i%WORD_BITS)) & 1
            : 0;

        current = 2*current + bit;

        /*
         * Sizing: |current| >= 2^(tableBits+1) -> |current| = 2^0
         * So current loses (tableBits+1) bits every time.  It otherwise gains
         * 1 bit per iteration.  The number of iterations is
         * (nbits + 2 + tableBits), and an additional control word is added at
         * the end.  So the total number of control words is at most
         * ceil((nbits+1) / (tableBits+1)) + 2 = floor((nbits)/(tableBits+1)) + 2.
         * There's also the stopper with power -1, for a total of +3.
         */
        if (current >= (2<<tableBits) || current <= -1 - (2<<tableBits)) {
            int delta = (current + 1) >> 1;
            current = -(current & 1);

            int j;
            for (j=i; (delta & 1) == 0; j++) {
                delta >>= 1;
            }
            control[position].power = j+1;
            control[position].addend = delta;
            position++;
            assert(position <= nbits/(tableBits+1) + 2);
        }
    }
  
    control[position].power = -1;
    control[position].addend = 0;
    return position;
 }


 static void
 prepare_wnaf_table(
    struct tw_pniels_t *output,
    struct tw_extensible_t *working,
    int tbits
 ) {
    convert_tw_extensible_to_tw_pniels(&output[0], working);

    if (tbits == 0) return;

    double_tw_extensible(working);
    struct tw_pniels_t twop;
    convert_tw_extensible_to_tw_pniels(&twop, working);

    add_tw_pniels_to_tw_extensible(working, &output[0]);
    convert_tw_extensible_to_tw_pniels(&output[1], working);

    for (int i=2; i < 1<<tbits; i++) {
        add_tw_pniels_to_tw_extensible(working, &twop);
        convert_tw_extensible_to_tw_pniels(&output[i], working);
    }
 }

 void
 edwards_scalar_multiply_vt(
    struct tw_extensible_t *working,
    const uint64_t scalar[7]
 ) {
    /* HACK: not 448? */
    const int nbits=448, table_bits = 3;
    struct smvt_control control[nbits/(table_bits+1)+3];
    
    int control_bits = recode_wnaf(control, scalar, nbits, table_bits);
  
    struct tw_pniels_t precmp[1<<table_bits];
    prepare_wnaf_table(precmp, working, table_bits);
  
    if (control_bits > 0) {
        assert(control[0].addend > 0);
        assert(control[0].power >= 0);
        convert_tw_pniels_to_tw_extensible(working, &precmp[control[0].addend >> 1]);
    } else {
        set_identity_tw_extensible(working);
        return;
    }
  
    int conti = 1, i;
    for (i = control[0].power - 1; i >= 0; i--) {
        double_tw_extensible(working);

        if (i == control[conti].power) {
            assert(control[conti].addend);

            if (control[conti].addend > 0) {
                add_tw_pniels_to_tw_extensible(working, &precmp[control[conti].addend >> 1]);
            } else {
                sub_tw_pniels_from_tw_extensible(working, &precmp[(-control[conti].addend) >> 1]);
            }
            conti++;
            assert(conti <= control_bits);
        }
    }
 }

 void
 edwards_scalar_multiply_vt_pre(
    struct tw_extensible_t *working,
    const uint64_t scalar[7],
    const struct tw_niels_t *precmp,
    int table_bits
 ) {
    /* HACK: not 448? */
    const int nbits=448;
    struct smvt_control control[nbits/(table_bits+1)+3];
    
    int control_bits = recode_wnaf(control, scalar, nbits, table_bits);
  
    if (control_bits > 0) {
        assert(control[0].addend > 0);
        assert(control[0].power >= 0);
        convert_tw_niels_to_tw_extensible(working, &precmp[control[0].addend >> 1]);
    } else {
        set_identity_tw_extensible(working);
        return;
    }
  
    int conti = 1, i;
    for (i = control[0].power - 1; i >= 0; i--) {
        double_tw_extensible(working);

        if (i == control[conti].power) {
            assert(control[conti].addend);

            if (control[conti].addend > 0) {
                add_tw_niels_to_tw_extensible(working, &precmp[control[conti].addend >> 1]);
            } else {
                sub_tw_niels_from_tw_extensible(working, &precmp[(-control[conti].addend) >> 1]);
            }
            conti++;
            assert(conti <= control_bits);
        }
    }
 }

 void
 edwards_combo_var_fixed_vt(
    struct tw_extensible_t *working,
    const uint64_t scalar_var[7],
    const uint64_t scalar_pre[7],
    const struct tw_niels_t *precmp,
    int table_bits_pre
 ) {
    /* HACK: not 448? */
    const int nbits_var=448, nbits_pre=448, table_bits_var = 3;
    struct smvt_control control_var[nbits_var/(table_bits_var+1)+3];
    struct smvt_control control_pre[nbits_pre/(table_bits_pre+1)+3];
    
    int ncb_var = recode_wnaf(control_var, scalar_var, nbits_var, table_bits_var);
    int ncb_pre = recode_wnaf(control_pre, scalar_pre, nbits_pre, table_bits_pre);
    (void)ncb_var;
    (void)ncb_pre;
  
    struct tw_pniels_t precmp_var[1<<table_bits_var];
    prepare_wnaf_table(precmp_var, working, table_bits_var);
  
    int contp=0, contv=0, i;
  
    i = control_var[0].power;
    if (i > control_pre[0].power) {
        convert_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[0].addend >> 1]);
        contv++;
    } else if (i == control_pre[0].power && i >=0 ) {
        convert_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[0].addend >> 1]);
        add_tw_niels_to_tw_extensible(working, &precmp[control_pre[0].addend >> 1]);
        contv++; contp++;
    } else {
        i = control_pre[0].power;
        convert_tw_niels_to_tw_extensible(working, &precmp[control_pre[0].addend >> 1]);
        contp++;
    }
    
    if (i < 0) {
        set_identity_tw_extensible(working);
        return;
    }
    
    for (i--; i >= 0; i--) {
        double_tw_extensible(working);

        if (i == control_var[contv].power) {
            assert(control_var[contv].addend);

            if (control_var[contv].addend > 0) {
                add_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[contv].addend >> 1]);
            } else {
                sub_tw_pniels_from_tw_extensible(working, &precmp_var[(-control_var[contv].addend) >> 1]);
            }
            contv++;
        }

        if (i == control_pre[contp].power) {
            assert(control_pre[contp].addend);

            if (control_pre[contp].addend > 0) {
                add_tw_niels_to_tw_extensible(working, &precmp[control_pre[contp].addend >> 1]);
            } else {
                sub_tw_niels_from_tw_extensible(working, &precmp[(-control_pre[contp].addend) >> 1]);
            }
            contp++;
        }
    }
    
    assert(contv == ncb_var);
    assert(contp == ncb_pre);
 }



--- a/scalarmul.h
+++ b/scalarmul.h
@@ -1,117 +0,0 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __P448_ALGO_H__
 #define __P448_ALGO_H__ 1

 #include "ec_point.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 /*
 * Out = scalar * in, encoded in inverse square root
 * format.
 *
 * nbits is the number of bits in scalar.
 *
 * The scalar is to be presented in little-endian form,
 * meaning that scalar[0] contains the least significant
 * word of the scalar.
 * 
 * If the point "in" is on the curve, the return
 * value will be set (to -1).
 *
 * If the point "in" is not on the curve, then the
 * output will be incorrect.  If the scalar is even,
 * this condition will be detected by returning 0,
 * unless the output is the identity point (0; TODO).
 * If the scalar is odd, the value returned will be
 * set (to -1; TODO).
 *
 * The input and output points are always even.
 * Therefore on a cofactor-4 curve like Goldilocks,
 * it is sufficient for security to make the scalar
 * even.  (TODO: detect when i/o has cofactor?)
 *
 * This function takes constant time, depending on
 * nbits but not on in or scalar.
 */
 mask_t
 p448_montgomery_ladder(
    struct p448_t *out,
    const struct p448_t *in,
    const uint64_t *scalar,
    int nbits,
    int n_extra_doubles
 );

 void
 edwards_scalar_multiply(
    struct tw_extensible_t *working,
    const uint64_t scalar[7]
    /* TODO? int nbits */
 );

 void
 edwards_scalar_multiply_vlook(
    struct tw_extensible_t *working,
    const uint64_t scalar[7]
    /* TODO? int nbits */
 );
    
 mask_t
 precompute_for_combs(
  struct tw_niels_t *out,
  const struct tw_extensible_t *const_base,
  int n,
  int t,
  int s
 );
    
 void
 edwards_comb(
    struct tw_extensible_t *working,
    const word_t scalar[7],
    const struct tw_niels_t *table,
    int n,
    int t,
    int s
 );

 void
 edwards_scalar_multiply_vt(
    struct tw_extensible_t *working,
    const uint64_t scalar[7]
 );
    
 void
 edwards_scalar_multiply_vt_pre(
    struct tw_extensible_t *working,
    const uint64_t scalar[7],
    const struct tw_niels_t *precmp,
    int table_bits
 );

 mask_t
 precompute_for_wnaf(
    struct tw_niels_t *out,
    const struct tw_extensible_t *const_base,
    int tbits
 ); /* TODO: attr don't ignore... */

 void
 edwards_combo_var_fixed_vt(
    struct tw_extensible_t *working,
    const uint64_t scalar_var[7],
    const uint64_t scalar_pre[7],
    const struct tw_niels_t *precmp,
    int table_bits_pre
 );

 #ifdef __cplusplus
 };
 #endif

 #endif /* __P448_ALGO_H__ */
--- a/sha512.c
+++ b/sha512.c
@@ -1,182 +0,0 @@
 /* Copyright (c) 2011 Stanford University.
 * Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #include "sha512.h"

 #include <string.h>
 #include <assert.h>

 static inline uint64_t
 rotate_r (
    uint64_t x,
    int d
 ) {
  return (x >> d) | (x << (64-d));
 }

 /* TODO: get from headers */
 static inline uint64_t
 htobe64 (uint64_t x) {
    __asm__ ("bswapq %0" : "+r"(x));
    return x;
 }

 static const uint64_t
 sha512_init_state[8] = {
    0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, 
    0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
 };

 static const uint64_t
 sha512_k[80] = {
    0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
    0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
    0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
    0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694,
    0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 
    0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
    0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4,
    0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70,
    0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
    0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b, 
    0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30,
    0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8,
    0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
    0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
    0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec, 
    0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b,
    0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
    0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b,
    0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
    0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
 };

 static inline uint64_t S0 (uint64_t h1) {
    return rotate_r(h1, 28) ^ rotate_r(h1, 34) ^ rotate_r(h1, 39);
 }

 static inline uint64_t S1 (uint64_t h4) {
    return rotate_r(h4,14) ^ rotate_r(h4,18) ^ rotate_r(h4,41);
 }

 static inline uint64_t s0 (uint64_t a) {
    return rotate_r(a,1) ^ rotate_r(a,8) ^ a>>7;
 }

 static inline uint64_t s1 (uint64_t b) {
    return rotate_r(b,19) ^ rotate_r(b,61) ^ b>>6;
 }

 static inline uint64_t ch (uint64_t h4, uint64_t h5, uint64_t h6) {
    return h6^(h4 & (h6^h5));
 }

 static inline uint64_t maj(uint64_t h1, uint64_t h2, uint64_t h3) {
    return (h1&h2) ^ (h3&(h1^h2));
 }

 static void
 sha512_process_block (
    struct sha512_ctx_t *ctx
 ) {
    uint64_t i, tmp, a, b,
        *w = (uint64_t *) ctx->block,
        *state = ctx->chain,
        h0 = state[0], h1 = state[1], h2 = state[2], h3 = state[3],
        h4 = state[4], h5 = state[5], h6 = state[6], h7 = state[7];

    /* Clang doesn't unswitch this automatically */
    for (i=0; i<16; i++) {
        /* load up the input word for this round */
        tmp = w[i] = htobe64(w[i]);
        
        tmp = tmp + h7 + S1(h4) + ch(h4,h5,h6) + sha512_k[i];
  
        /* shift register */
        h7 = h6; h6 = h5; h5 = h4;
        h4 = h3 + tmp;
        h3 = h2; h2 = h1; h1 = h0;
        h0 = tmp + maj(h1,h2,h3) + S0(h1);
    }
  
    for (; i<80; i++) {
        /* load up the input word for this round */
        a   = w[(i+1 ) & 15];
        b   = w[(i+14) & 15];
        tmp = w[i&15] = s0(a) + s1(b) + w[i&15] + w[(i+9) & 15];
        tmp = tmp + h7 + S1(h4) + ch(h4,h5,h6) + sha512_k[i];
  
        /* shift register */
        h7 = h6; h6 = h5; h5 = h4;
        h4 = h3 + tmp;
        h3 = h2; h2 = h1; h1 = h0;
        h0 = tmp + maj(h1,h2,h3) + S0(h1);
    }
 
    state[0] += h0;
    state[1] += h1;
    state[2] += h2;
    state[3] += h3;
    state[4] += h4;
    state[5] += h5;
    state[6] += h6;
    state[7] += h7;
 }

 void
 sha512_init (
    struct sha512_ctx_t *ctx
 ) {
    ctx->nbytes = 0;
    memcpy(ctx->chain, sha512_init_state, sizeof(sha512_init_state));
    memset(ctx->block, 0, sizeof(ctx->block));
 }

 void
 sha512_update (
    struct sha512_ctx_t *ctx,
    const unsigned char *data,
    uint64_t bytes
 ) {
    assert(ctx->nbytes < 1ull<<56);
    assert(bytes < 1ull<<56);
    
    while (bytes) {
        uint64_t fill = ctx->nbytes % 128, accept = 128 - fill;
        if (accept > bytes) accept = bytes;
        ctx->nbytes += accept;
        memcpy(ctx->block + fill, data, accept);
        
        if (fill+accept == 128)
            sha512_process_block(ctx);

        bytes -= accept;
        data += accept;
    }
    
    assert(ctx->nbytes < 1ull<<56);
 }

 void
 sha512_final (
    struct sha512_ctx_t *ctx,
    uint8_t result[64]
 ) {
    uint64_t fill = ctx->nbytes % 128, i;
    ctx->block[fill++] = 0x80;
    if (fill > 112) {
        memset(ctx->block + fill, 0, 128-fill);
        sha512_process_block(ctx);
        fill = 0;
    }
    memset(ctx->block + fill, 0, 112-fill);
    *((uint64_t *)&ctx->block[112]) = 0;
    *((uint64_t *)&ctx->block[120]) = htobe64((ctx->nbytes * 8));
    sha512_process_block(ctx);
    for (i=0; i<8; i++) {
        ctx->chain[i] = htobe64(ctx->chain[i]);
    }
    memcpy(result, ctx->chain, sizeof(ctx->chain));
    sha512_init(ctx);
 }
--- a/sha512.h
+++ b/sha512.h
@@ -1,49 +0,0 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __GOLDI_SHA512_H__
 #define __GOLDI_SHA512_H__ 1

 #include <stdint.h>

 #ifdef __cplusplus
 extern "C" {
 #endif

 /* TODO: KAT */

 /**
 * SHA512 hashing context.
 *
 * This structure is opaque.
 */
 struct sha512_ctx_t {
    /** @privatesection */
    uint64_t chain[8];
    uint8_t block[128];
    uint64_t nbytes;
 };

 void
 sha512_init (
    struct sha512_ctx_t *ctx
 );

 void
 sha512_update (
    struct sha512_ctx_t *ctx,
    const unsigned char *data,
    uint64_t bytes
 );
    
 void
 sha512_final (
    struct sha512_ctx_t *ctx,
    uint8_t result[64]
 );
    
 #ifdef __cplusplus
 }; /* extern "C" */
 #endif
    
 #endif /* __GOLDI_SHA512_H__ */
--- a/word.h
+++ b/word.h
@@ -1,55 +0,0 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #ifndef __WORD_H__
 #define __WORD_H__

 #include <stdint.h>

 typedef uint64_t word_t;
 typedef __uint128_t dword_t;
 typedef int64_t sword_t;
 typedef __int128_t dsword_t;

 static const int WORD_BITS = sizeof(word_t) * 8;

 /* TODO: vector width for procs like ARM; gcc support */
 typedef uint64_t mask_t, vecmask_t __attribute__((ext_vector_type(4)));

 static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -1;

 /* FIXME this only works on clang */
 typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
 typedef int64_t  int64x2_t __attribute__((ext_vector_type(2)));
 typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
 typedef int64_t  int64x4_t __attribute__((ext_vector_type(4)));
 typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
 typedef int32_t  int32x4_t __attribute__((ext_vector_type(4)));
 typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
 typedef int32_t  int32x8_t __attribute__((ext_vector_type(8)));

 #if __AVX2__
 typedef uint32x8_t big_register_t;
 typedef uint64x4_t uint64xn_t;
 #elif __SSE2__ || __ARM_NEON__
 typedef uint32x4_t big_register_t;
 typedef uint64x2_t uint64xn_t;
 #elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__
 typedef uint64_t big_register_t, uint64xn_t;
 #else
 typedef uint64_t uint64xn_t;
 typedef uint32_t big_register_t;
 #endif


 #if __AVX2__ || __SSE2__ || __ARM_NEON__
 static __inline__ big_register_t
 br_is_zero(big_register_t x) {
    return (big_register_t)(x == (big_register_t)0);
 }
 #else
 #error "TODO: constant-time equality on vectorless platforms"
 #endif

 #endif /* __WORD_H__ */
--- a/x86-64-arith.h
+++ b/x86-64-arith.h
@@ -1,246 +0,0 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #ifndef __X86_64_ARITH_H__
 #define __X86_64_ARITH_H__

 #include <stdint.h>

 /* TODO: non x86-64 versions of these.
 * TODO: autogenerate
 */

 static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
  #ifndef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rax;"
       "mulq %[b];"
       : [c]"=a"(c), [d]"=d"(d)
       : [b]"m"(*b), [a]"m"(*a)
       : "cc");
  return (((__uint128_t)(d))<<64) | c;
  #else
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx;"
       "mulx %[b], %[c], %[d];"
       : [c]"=r"(c), [d]"=r"(d)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx");
  return (((__uint128_t)(d))<<64) | c;
  #endif
 }

 static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
  #ifndef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rax;"
       "mulq %[b];"
       : [c]"=a"(c), [d]"=d"(d)
       : [b]"m"(*b), [a]"r"(a)
       : "cc");
  return (((__uint128_t)(d))<<64) | c;
  #else
  uint64_t c,d;
  __asm__ volatile
      ("mulx %[b], %[c], %[d];"
       : [c]"=r"(c), [d]"=r"(d)
       : [b]"m"(*b), [a]"d"(a));
  return (((__uint128_t)(d))<<64) | c;
  #endif
 }

 static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
  #ifndef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rax; "
       "addq %%rax, %%rax; "
       "mulq %[b];"
       : [c]"=a"(c), [d]"=d"(d)
       : [b]"m"(*b), [a]"m"(*a)
       : "cc");
  return (((__uint128_t)(d))<<64) | c;
  #else
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx;"
       "leaq (,%%rdx,2), %%rdx;"
       "mulx %[b], %[c], %[d];"
       : [c]"=r"(c), [d]"=r"(d)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx");
  return (((__uint128_t)(d))<<64) | c;
  #endif
 }

 static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "addq %[c], %[lo]; "
       "adcq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "mulq %[b]; "
       "addq %%rax, %[lo]; "
       "adcq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("mulx %[b], %[c], %[d]; "
       "addq %[c], %[lo]; "
       "adcq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"d"(a)
       : "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "mulq %[b]; "
       "addq %%rax, %[lo]; "
       "adcq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"r"(a)
       : "rax", "rdx", "cc");
  #endif
  
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "addq %%rdx, %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "addq %[c], %[lo]; "
       "adcq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "addq %%rax, %%rax; "
       "mulq %[b]; "
       "addq %%rax, %[lo]; "
       "adcq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "subq %[c], %[lo]; "
       "sbbq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "mulq %[b]; "
       "subq %%rax, %[lo]; "
       "sbbq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "addq %%rdx, %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "subq %[c], %[lo]; "
       "sbbq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "addq %%rax, %%rax; "
       "mulq %[b]; "
       "subq %%rax, %[lo]; "
       "sbbq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  *acc = (((__uint128_t)(hi))<<64) | lo;
  
 }

 static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t c,d, lo = *acc, hi = *acc>>64;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "subq %[lo], %[c]; "
       "sbbq %[hi], %[d]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  *acc = (((__uint128_t)(d))<<64) | c;
 }

 static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) {
  return ((__uint128_t)(a)) * b;
 }

 static __inline__ __int128_t widemuls(int64_t a, int64_t b) {
  return ((__int128_t)(a)) * b;
 }
 
 static __inline__ uint64_t opacify(uint64_t x) {
  __asm__ volatile("" : "+r"(x));
  return x;
 }

 static __inline__ mask_t is_zero(uint64_t x) {
  __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
  return ~x;
 }

 #endif /* __X86_64_ARITH_H__ */