Import the code

12 years ago · 25697caf5c
--- a/+ 42
+++ b/+ 42
@@ -0,0 +1,42 @@
 # Copyright (c) 2014 Cryptography Research, Inc.
 # Released under the MIT License.  See LICENSE.txt for license information.

 CC = clang
 CFLAGS = -O3 -std=c99 -pedantic -Wall -Wextra -Werror  \
  -mavx2 -DMUST_HAVE_SSSE3 -mbmi2 \
  -ffunction-sections -fdata-sections -fomit-frame-pointer -fPIC

 .PHONY: clean all runbench
 .PRECIOUS: build/%.s
 	
 HEADERS= Makefile $(shell find . -name "*.h") build/timestamp

 LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \
  build/p448.o build/ec_point.o build/scalarmul.o

 all: bench
 	
 bench: *.h *.c
 	$(CC) $(CFLAGS) -o $@ *.c
 	
 build/timestamp:
 	mkdir -p build
 	touch $@

 build/%.o: build/%.s
 	$(CC) -c -o $@ $<

 build/%.s: %.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $<

 build/goldilocks.so: $(LIBCOMPONENTS)
 	rm -f $@
 	libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \
 		  -exported_symbols_list exported.sym \
 		  $(LIBCOMPONENTS)
 	
 runbench: bench
 	./$<

 clean:
 	rm -fr build bench *.o *.s
--- a/barrett_field.c
+++ b/barrett_field.c
@@ -0,0 +1,235 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "barrett_field.h"
 #include <assert.h>

 word_t
 add_nr_ext_packed(
    word_t *out,
    const word_t *a,
    int nwords_a,
    const word_t *c,
    int nwords_c,
    word_t mask
 ) {
    int i;
    dword_t carry = 0;
    for (i=0; i<nwords_c; i++) {
        out[i] = carry = carry + a[i] + (c[i]&mask);
        carry >>= WORD_BITS;
    }
    for (; i<nwords_a; i++) {
        out[i] = carry = carry + a[i];
        carry >>= WORD_BITS;
    }
    return carry;
 }

 static __inline__ word_t
 add_nr_packed(
    word_t *a,
    const word_t *c,
    int nwords
 ) {
    int i;
    dword_t carry = 0;
    for (i=0; i<nwords; i++) {
        a[i] = carry = carry + a[i] + c[i];
        carry >>= WORD_BITS;
    }
    return carry;
 }

 static __inline__ word_t
 sub_nr_packed(
    word_t *a,
    const word_t *c,
    int nwords
 ) {
    int i;
    dsword_t carry = 0;
    for (i=0; i<nwords; i++) {
        a[i] = carry = carry + a[i] - c[i];
        carry >>= WORD_BITS;
    }
    return carry;
 }

 word_t
 sub_nr_ext_packed(
    word_t *out,
    const word_t *a,
    int nwords_a,
    const word_t *c,
    int nwords_c,
    word_t mask
 ) {
    int i;
    dsword_t carry = 0;
    for (i=0; i<nwords_c; i++) {
        out[i] = carry = carry + a[i] - (c[i]&mask);
        carry >>= WORD_BITS;
    }
    for (; i<nwords_a; i++) {
        out[i] = carry = carry + a[i];
        carry >>= WORD_BITS;
    }
    return carry;
 }

 static word_t
 widemac(
    word_t *accum,
    int nwords_accum,
    const word_t *mier,
    int nwords_mier,
    word_t mand,
    word_t carry
 ) {
    int i;
    assert(nwords_accum >= nwords_mier);
    
    for (i=0; i<nwords_mier; i++) {
        /* UMAAL chain for the wordy part of p */
        dword_t product = ((dword_t)mand) * mier[i];
        product += accum[i];
        product += carry;
        accum[i] = product;
        carry = product >> WORD_BITS;
    }
    
    for (; i<nwords_accum; i++) {
        dword_t sum = ((dword_t)carry) + accum[i];
        accum[i] = sum;
        carry = sum >> WORD_BITS;
    }
    
    return carry;
 }

 void
 barrett_reduce(
    word_t *a,
    int nwords_a,
    word_t a_carry,
    const word_t *p_lo,
    int nwords_p,
    int nwords_lo,
    int p_shift
 ) {
    /* TODO: non 2^k-c primes. */
    int repeat, nwords_left_in_a=nwords_a;
    
    /* TODO: is there a point to this a_carry business? */
    assert(a_carry < ((word_t)1)<<p_shift && nwords_a >= nwords_p);
    
    for (; nwords_left_in_a >= nwords_p; nwords_left_in_a--) {
        for (repeat=0; repeat<2; repeat++) {
            /* PERF: surely a more careful implementation could
             * avoid this double round
             */
            word_t mand = a[nwords_left_in_a-1] >> p_shift;
            a[nwords_left_in_a-1] &= (((word_t)1)<<p_shift)-1;
            if (p_shift && !repeat) {
                /* collect high bits when there are any */
                if (nwords_left_in_a < nwords_a) {
                    mand |= a[nwords_left_in_a] << (WORD_BITS-p_shift);
                    a[nwords_left_in_a] = 0;
                } else {
                    mand |= a_carry << (WORD_BITS-p_shift);
                }
            }
            
            word_t carry = widemac(a+nwords_left_in_a-nwords_p, nwords_p, p_lo, nwords_lo, mand, 0);
            assert(!carry);
            (void)carry;
        }
    }
    
    assert(nwords_left_in_a == nwords_p-1);
    
    /* OK, but it still isn't reduced.  Add and subtract p_lo. */
    word_t cout = add_nr_ext_packed(a,a,nwords_p,p_lo,nwords_lo,-1);
    if (p_shift) {
        cout = (cout<<(WORD_BITS-p_shift)) + (a[nwords_p-1]>>p_shift);
        a[nwords_p-1] &= (((word_t)1)<<p_shift)-1;
    }
    
    /* mask = carry-1: if no carry then do sub, otherwise don't */
    sub_nr_ext_packed(a,a,nwords_p,p_lo,nwords_lo,cout-1);
 }

 /* PERF: This function is horribly slow.  Enough to break 1%. */
 void
 barrett_mul_or_mac(
    word_t *accum,
    int nwords_accum,
    
    const word_t *a,
    int nwords_a,
    
    const word_t *b,
    int nwords_b,
    
    const word_t *p_lo,
    int nwords_p,
    int nwords_lo,
    int p_shift,
    
    mask_t doMac
 ) {
    assert(nwords_accum >= nwords_p);
    
    /* nwords_tmp = max(nwords_a + 1, nwords_p + 1, nwords_accum if doMac); */
    int nwords_tmp = (nwords_a > nwords_p) ? nwords_a : nwords_p;
    nwords_tmp++;
    if (nwords_tmp < nwords_accum && doMac)
        nwords_tmp = nwords_accum;
    
    word_t tmp[nwords_tmp];
    int bpos, i;
    
    for (i=0; i<nwords_tmp; i++) {
        tmp[i] = 0;
    }
    
    if (doMac) {
        for (i=0; i<nwords_accum; i++) {
            tmp[i] = accum[i];
        }

        barrett_reduce(tmp, nwords_tmp, 0, p_lo, nwords_p, nwords_lo, p_shift);
    }
    
    for (bpos=nwords_b-1; bpos >= 0; bpos--) {
        /* Invariant at the beginning of the loop: the high word is unused. */
        assert(tmp[nwords_tmp-1] == 0);
        
        /* shift up */
        for (i=nwords_tmp-2; i>=0; i--) {
            tmp[i+1] = tmp[i];
        }

        /* mac and reduce */
        word_t carry = widemac(tmp, nwords_tmp, a, nwords_a, b[bpos], 0);
        
        /* the mac can't carry, because nwords_tmp >= nwords_a+1 and its high word is clear */
        assert(!carry);
        barrett_reduce(tmp, nwords_tmp, carry, p_lo, nwords_p, nwords_lo, p_shift);
        
        /* at this point, the number of words used is nwords_p <= nwords_tmp-1,
         * so the high word is again clear */
    }
    
    for (i=0; i<nwords_tmp && i<nwords_accum; i++) {
        accum[i] = tmp[i];
    }
    for (; i<nwords_tmp; i++) {
        assert(tmp[i] == 0);
    }
    for (; i<nwords_accum; i++) {
        accum[i] = 0;
    }
 }
--- a/barrett_field.h
+++ b/barrett_field.h
@@ -0,0 +1,116 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __BARRETT_FIELD_H__
 #define __BARRETT_FIELD_H__ 1

 #include "word.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 void
 barrett_reduce(
    word_t *a,
    int nwords_a,
    word_t a_carry,
    const word_t *p_lo,
    int nwords_p,
    int nwords_lo,
    int p_shift
 );
    
 /*
 * out = a+(c&mask), with carry returned.
 * #out must equal #a (HACK?)
 */
 word_t
 add_nr_ext_packed(
    word_t *out,
    const word_t *a,
    int nwords_a,
    const word_t *c,
    int nwords_c,
    word_t mask
 );
    
 word_t
 sub_nr_ext_packed(
    word_t *out,
    const word_t *a,
    int nwords_a,
    const word_t *c,
    int nwords_c,
    word_t mask
 );

 /*
 * If doMac, accum = accum + a*b mod p.
 * Otherwise, accum = a*b mod p.
 *
 * This function is not __restrict__; you may pass accum,
 * a, b, etc all from the same location.
 */
 void
 barrett_mul_or_mac(
    word_t *accum,
    int nwords_accum,

    const word_t *a,
    int nwords_a,

    const word_t *b,
    int nwords_b,

    const word_t *p_lo,
    int nwords_p,
    int nwords_lo,
    int p_shift,
    
    mask_t doMac
 );
    
 static inline void
 barrett_mul(
    word_t *out,
    int nwords_out,

    const word_t *a,
    int nwords_a,

    const word_t *b,
    int nwords_b,

    const word_t *p_lo,
    int nwords_p,
    int nwords_lo,
    int p_shift
 ) {
    barrett_mul_or_mac(out,nwords_out,a,nwords_a,b,nwords_b,p_lo,nwords_p,nwords_lo,p_shift,0);
 }
    
 static inline void
 barrett_mac(
    word_t *out,
    int nwords_out,

    const word_t *a,
    int nwords_a,

    const word_t *b,
    int nwords_b,

    const word_t *p_lo,
    int nwords_p,
    int nwords_lo,
    int p_shift
 ) {
    barrett_mul_or_mac(out,nwords_out,a,nwords_a,b,nwords_b,p_lo,nwords_p,nwords_lo,p_shift,-1);
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __BARRETT_FIELD_H__ */
--- a/bench.c
+++ b/bench.c
@@ -0,0 +1,780 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include <sys/time.h>
 #include <sys/types.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <memory.h>

 #include "p448.h"
 #include "ec_point.h"
 #include "scalarmul.h"
 #include "barrett_field.h"
 #include "crandom.h"
 #include "goldilocks.h"

 word_t q448_lo[4] = {
    0xdc873d6d54a7bb0dull,
    0xde933d8d723a70aaull,
    0x3bb124b65129c96full,
    0x000000008335dc16ull
 };

 double now() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  
  return tv.tv_sec + tv.tv_usec/1000000.0;
 }

 void p448_randomize( struct crandom_state_t *crand, struct p448_t *a ) {
    crandom_generate(crand, (unsigned char *)a, sizeof(*a));
    p448_strong_reduce(a);
 }

 void q448_randomize( struct crandom_state_t *crand, uint64_t sk[7] ) {
    crandom_generate(crand, (unsigned char *)sk, sizeof(uint64_t)*7);
 }

 void p448_print( const char *descr, const struct p448_t *a ) {
    p448_t b;
    p448_copy(&b, a);
    p448_strong_reduce(&b);
    int j;
    printf("%s = 0x", descr);
    for (j=7; j>=0; j--) {
        printf("%014llx", (unsigned long long)b.limb[j]);
    }
    printf("\n");
 }

 void p448_print_full( const char *descr, const struct p448_t *a ) {
    int j;
    printf("%s = 0x", descr);
    for (j=7; j>=0; j--) {
        printf("%02llx_%014llx ", a->limb[j]>>56, (unsigned long long)a->limb[j]&(1ull<<56)-1);
    }
    printf("\n");
 }

 void q448_print( const char *descr, const uint64_t secret[7] ) {
    int j;
    printf("%s = 0x", descr);
    for (j=6; j>=0; j--) {
        printf("%016llx", (unsigned long long)secret[j]);
    }
    printf("\n");
 }

 int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    struct tw_extensible_t ext;
    struct extensible_t exta;
    struct tw_niels_t niels;
    struct tw_pniels_t pniels;
    struct affine_t affine;
    struct montgomery_t mb;
    struct p448_t a,b,c,d;
    
    
    double when;
    int i,j;
    
    /* Bad randomness so we can debug. */
    char initial_seed[32];
    for (i=0; i<32; i++) initial_seed[i] = i;
    struct crandom_state_t crand;
    crandom_init_from_buffer(&crand, initial_seed);
    
    uint64_t sk[7],tk[7];
    q448_randomize(&crand, sk);
    
    when = now();
    for (i=0; i<10000000; i++) {
        p448_mul(&c, &b, &a);
    }
    when = now() - when;
    printf("mul:         %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<10000000; i++) {
        p448_sqr(&c, &a);
    }
    when = now() - when;
    printf("sqr:         %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<5000000; i++) {
        p448_mul(&c, &b, &a);
        p448_mul(&a, &b, &c);
    }
    when = now() - when;
    printf("mul dep:     %5.1fns\n", when * 1e9 / i / 2);
    
    when = now();
    for (i=0; i<10000000; i++) {
        p448_mulw(&c, &b, 1234562);
    }
    when = now() - when;
    printf("mulw:        %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<100000; i++) {
        p448_randomize(&crand, &a);
    }
    when = now() - when;
    printf("rand448:     %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<10000; i++) {
        p448_isr(&c, &a);
    }
    when = now() - when;
    printf("isr auto:    %5.1fµs\n", when * 1e6 / i);
    
    for (i=0; i<100; i++) {
        p448_randomize(&crand, &a);
        p448_isr(&d,&a);
        p448_sqr(&b,&d);
        p448_mul(&c,&b,&a);
        p448_sqr(&b,&c);
        p448_subw(&b,1);
        p448_bias(&b,1);
        if (!p448_is_zero(&b)) {
            printf("ISR validation failure!\n");
            p448_print("a", &a);
            p448_print("s", &d);
        }
    }
    
    when = now();
    for (i=0; i<10000; i++) {
        elligator_2s_inject(&affine, &a);
    }
    when = now() - when;
    printf("elligator:   %5.1fµs\n", when * 1e6 / i);
    
    for (i=0; i<100; i++) {
        p448_randomize(&crand, &a);
        elligator_2s_inject(&affine, &a);
        if (!p448_affine_validate(&affine)) {
            printf("Elligator validation failure!\n");
            p448_print("a", &a);
            p448_print("x", &affine.x);
            p448_print("y", &affine.y);
        }
    }
    
    when = now();
    for (i=0; i<10000; i++) {
        affine_deserialize(&affine, &a);
    }
    when = now() - when;
    printf("decompress:  %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<10000; i++) {
        extensible_serialize(&a, &exta);
    }
    when = now() - when;
    printf("compress:    %5.1fµs\n", when * 1e6 / i);
    
    int goods = 0;
    for (i=0; i<100; i++) {
        p448_randomize(&crand, &a);
        mask_t good = affine_deserialize(&affine, &a);
        if (good & !p448_affine_validate(&affine)) {
            printf("Deserialize validation failure!\n");
            p448_print("a", &a);
            p448_print("x", &affine.x);
            p448_print("y", &affine.y);
        } else if (good) {
            goods++;
            convert_affine_to_extensible(&exta,&affine);
            extensible_serialize(&b, &exta);
            p448_sub(&c,&b,&a);
            p448_bias(&c,2);
            if (!p448_is_zero(&c)) {
                printf("Reserialize validation failure!\n");
                p448_print("a", &a);
                p448_print("x", &affine.x);
                p448_print("y", &affine.y);
                affine_deserialize(&affine, &b);
                p448_print("b", &b);
                p448_print("x", &affine.x);
                p448_print("y", &affine.y);
                printf("\n");
            }
        }
    }
    if (goods<i/3) {
        printf("Deserialization validation failure! Deserialized %d/%d points\n", goods, i);
    }
    
    uint64_t lsk[12];
    for (i=0;i<10; i++) {
        for (j=11; j>=0; j--) {
            lsk[j] = random();
            lsk[j] = lsk[j]<<22 ^ random();
            lsk[j] = lsk[j]<<22 ^ random();
        }
    }
    
    when = now();
    for (i=0; i<1000000; i++) {
        barrett_reduce(lsk,12,0,q448_lo,7,4,62);
    }
    when = now() - when;
    printf("barrett red: %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<100000; i++) {
        barrett_mac(lsk,7,lsk,7,lsk,7,q448_lo,7,4,62);
    }
    when = now() - when;
    printf("barrett mac: %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<1000000; i++) {
        p448_tw_extensible_add_niels(&ext, &niels);
    }
    when = now() - when;
    printf("exti+niels:  %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<1000000; i++) {
        p448_tw_extensible_add_pniels(&ext, &pniels);
    }
    when = now() - when;
    printf("exti+pniels: %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<1000000; i++) {
        p448_tw_extensible_double(&ext);
    }
    when = now() - when;
    printf("exti dbl:    %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<1000000; i++) {
        p448_isogeny_tw_to_un(&exta, &ext);
    }
    when = now() - when;
    printf("i->a isog:   %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<1000000; i++) {
        p448_isogeny_un_to_tw(&ext, &exta);
    }
    when = now() - when;
    printf("a->i isog:   %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<1000000; i++) {
        p448_montgomery_step(&mb);
    }
    when = now() - when;
    printf("monty step:  %5.1fns\n", when * 1e9 / i);
 	
    when = now();
    for (i=0; i<1000; i++) {
        p448_montgomery_ladder(&a,&b,sk,448,0);
    }
    when = now() - when;
    printf("full ladder: %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<1000; i++) {
        edwards_scalar_multiply(&ext,sk);
    }
    when = now() - when;
    printf("edwards smz: %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    int sum = 0;
    for (i=0; i<1000; i++) {
        q448_randomize(&crand, sk);
        sum += edwards_scalar_multiply_vt(&ext,sk);
    }
    when = now() - when;
    printf("edwards vtm: %5.1fµs (%0.2f avg bits = 1.5 + 448/%0.2f)\n",
        when * 1e6 / i, 1.0*sum/i, 448.0*i/(sum-1.5*i));
    
    struct tw_niels_t wnaft[1<<6];
    when = now();
    for (i=0; i<1000; i++) {
        precompute_for_wnaf(wnaft,&ext,6);
    }
    when = now() - when;
    printf("wnaf6 pre:   %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<1000; i++) {
        q448_randomize(&crand, sk);
        edwards_scalar_multiply_vt_pre(&ext,sk,wnaft,6);
    }
    when = now() - when;
    printf("edwards vt6: %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<1000; i++) {
        precompute_for_wnaf(wnaft,&ext,4);
    }
    when = now() - when;
    printf("wnaf4 pre:   %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<1000; i++) {
        q448_randomize(&crand, sk);
        edwards_scalar_multiply_vt_pre(&ext,sk,wnaft,4);
    }
    when = now() - when;
    printf("edwards vt4: %5.1fµs\n", when * 1e6 / i);

    when = now();
    for (i=0; i<1000; i++) {
        precompute_for_wnaf(wnaft,&ext,5);
    }
    when = now() - when;
    printf("wnaf5 pre:   %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<1000; i++) {
        q448_randomize(&crand, sk);
        edwards_scalar_multiply_vt_pre(&ext,sk,wnaft,5);
    }
    when = now() - when;
    printf("edwards vt5: %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    sum = 0;
    for (i=0; i<1000; i++) {
        q448_randomize(&crand, sk);
        q448_randomize(&crand, tk);
        sum += edwards_combo_var_fixed_vt(&ext,sk,tk,wnaft,5);
    }
    when = now() - when;
    printf("vt vf combo: %5.1fµs (avg = %0.3f)\n", when * 1e6 / i, 1.0*sum/i);
    
    when = now();
    for (i=0; i<1000; i++) {
        affine_deserialize(&affine, &a);
        convert_affine_to_extensible(&exta,&affine);
        p448_isogeny_un_to_tw(&ext,&exta);
        edwards_scalar_multiply(&ext,sk);
        p448_isogeny_tw_to_un(&exta,&ext);
        extensible_serialize(&b, &exta);
    }
    when = now() - when;
    printf("edwards sm:  %5.1fµs\n", when * 1e6 / i);
    
    struct tw_niels_t table[80] __attribute__((aligned(32)));

    while (1) {
        p448_randomize(&crand, &a);
        if (affine_deserialize(&affine, &a)) break;
    }
    convert_affine_to_extensible(&exta,&affine);
    p448_isogeny_un_to_tw(&ext,&exta);
    when = now();
    for (i=0; i<1000; i++) {
        precompute_for_combs(table, &ext, 5, 5, 18);
    }
    when = now() - when;
    printf("pre(5,5,18): %5.1fµs\n", when * 1e6 / i);
 	
    when = now();
    for (i=0; i<10000; i++) {
        edwards_comb(&ext, sk, table, 5, 5, 18);
    }
    when = now() - when;
    printf("com(5,5,18): %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<10000; i++) {
        edwards_comb(&ext, sk, table, 3, 5, 30);
    }
    when = now() - when;
    printf("com(3,5,30): %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<10000; i++) {
        edwards_comb(&ext, sk, table, 2, 5, 45);
    }
    when = now() - when;
    printf("com(2,5,45): %5.1fµs\n", when * 1e6 / i);

    when = now();
    for (i=0; i<10000; i++) {
        edwards_comb(&ext, sk, table, 8, 4, 14);
    }
    when = now() - when;
    printf("com(4,4,28): %5.1fµs\n", when * 1e6 / i);
    
    when = now();
    for (i=0; i<10000; i++) {
        q448_randomize(&crand, sk);
        edwards_comb(&ext, sk, table, 5, 5, 18);
        p448_isogeny_tw_to_un(&exta,&ext);
        extensible_serialize(&b, &exta);
    }
    when = now() - when;
    printf("keygen:      %5.1fµs\n", when * 1e6 / i);
    
    printf("\nGoldilocks:\n");
    
    int res = goldilocks_init();
    assert(!res);
    
    uint8_t gpk[56],gsk[56],hsk[56],hpk[56];
    
    when = now();
    for (i=0; i<10000; i++) {
        if (i&1) {
            res = goldilocks_keygen(gsk,gpk);
        } else {
            res = goldilocks_keygen(hsk,hpk);
        }
        assert(!res);
    }
    when = now() - when;
    printf("keygen:      %5.1fµs\n", when * 1e6 / i);
    
    uint8_t ss1[64],ss2[64];
    int gres1,gres2;
    when = now();
    for (i=0; i<10000; i++) {
        if (i&1) {
            gres1 = goldilocks_shared_secret(ss1,gsk,hpk);
        } else {
            gres2 = goldilocks_shared_secret(ss2,hsk,gpk);
        }
    }
    when = now() - when;
    printf("ecdh:        %5.1fµs\n", when * 1e6 / i);
    if (gres1 || gres2 || memcmp(ss1,ss2,56)) {
        printf("[FAIL] %d %d\n",gres1,gres2);
        
        printf("ss1 = ");
        for (i=0; i<56; i++) {
            printf("%02x", ss1[i]);
        }
        printf("\nss2 = ");
        for (i=0; i<56; i++) {
            printf("%02x", ss2[i]);
        }
        printf("\n");
    }
    
    printf("\nTesting...\n");
    
    int failures=0, successes = 0;
    for (i=0; i<1000; i++) {
        p448_randomize(&crand, &a);
 		uint64_t two = 2;
        mask_t good = p448_montgomery_ladder(&b,&a,&two,2,0);
 		if (!good) continue;
 		
 		uint64_t x = rand(), y=rand(), z=x*y;
 		p448_montgomery_ladder(&b,&a,&x,64,0);
        p448_montgomery_ladder(&c,&b,&y,64,0);
        p448_montgomery_ladder(&b,&a,&z,64,0);
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
 		if (!p448_is_zero(&d)) {
            printf("Odd ladder validation failure %d!\n", ++failures);
            p448_print("a", &a);
            printf("x=%llx, y=%llx, z=%llx\n", x,y,z);
            p448_print("c", &c);
            p448_print("b", &b);
 			printf("\n");
 		}
 	}
    
    failures = 0;
    for (i=0; i<1000; i++) {
        mask_t good;
        do {
            p448_randomize(&crand, &a);
            good = affine_deserialize(&affine, &a);
        } while (!good);
        
        convert_affine_to_extensible(&exta,&affine);
        p448_isogeny_un_to_tw(&ext,&exta);
        p448_isogeny_tw_to_un(&exta,&ext);
        extensible_serialize(&b, &exta);
        isogeny_and_serialize(&c, &ext);
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
        
        if (good && !p448_is_zero(&d)){
            printf("Iso+serial validation failure %d!\n", ++failures);
            p448_print("a", &a);
            p448_print("b", &b);
            p448_print("c", &c);
            printf("\n");
        } else if (good) {
            successes ++;
        }
    }
    if (successes < i/3) {
        printf("Iso+serial variation: only %d/%d successful.\n", successes, i);
    }
        
    failures = 0;
    uint64_t four = 4;
    for (i=0; i<1000; i++) {
        p448_randomize(&crand, &a);
        q448_randomize(&crand, sk);
        
        mask_t good = p448_montgomery_ladder(&b,&a,&four,3,0);
        good &= p448_montgomery_ladder(&c,&b,sk,448,0);
        
        mask_t goodb = affine_deserialize(&affine, &a);
        convert_affine_to_extensible(&exta,&affine);
        p448_isogeny_un_to_tw(&ext,&exta);
        edwards_scalar_multiply(&ext,sk);
        p448_isogeny_tw_to_un(&exta,&ext);
        extensible_serialize(&b, &exta);
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
        
        if (good != goodb) {
            printf("Compatibility validation failure %d: good: %d != %d\n", ++failures, (int)(-good), (int)(-goodb));
        } else if (good && !p448_is_zero(&d)){
            printf("Compatibility validation failure %d!\n", ++failures);
            p448_print("a", &a);
            q448_print("s", sk);
            p448_print("c", &c);
            p448_print("b", &b);
            printf("\n");
        } else if (good) {
            successes ++;
        }
    }
    if (successes < i/3) {
        printf("Compatibility variation: only %d/%d successful.\n", successes, i);
    }
        
    successes = failures = 0;
    for (i=0; i<1000; i++) {
        p448_randomize(&crand, &a);
        q448_randomize(&crand, sk);
 		if (!i) bzero(&sk, sizeof(sk));
        
        mask_t good = p448_montgomery_ladder(&b,&a,&four,3,0);
        good &= p448_montgomery_ladder(&c,&b,sk,448,0);
        if (!good) continue;
        
        affine_deserialize(&affine, &a);
        convert_affine_to_extensible(&exta,&affine);
        p448_isogeny_un_to_tw(&ext,&exta);
        
        precompute_for_combs(table, &ext, 5, 5, 18);
        edwards_comb(&ext, sk, table, 5, 5, 18);
        p448_isogeny_tw_to_un(&exta,&ext);
        extensible_serialize(&b, &exta);
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
        
        if (!p448_is_zero(&d)){
            printf("Comb validation failure %d!\n", ++failures);
            p448_print("a", &a);
            q448_print("s", sk);
            p448_print("c", &c);
            p448_print("b", &b);
            printf("\n");
        } else if (good) {
            successes ++;
        }
    }
    if (successes < i/3) {
        printf("Comb variation: only %d/%d successful.\n", successes, i);
    }
        
    successes = failures = 0;
    for (i=0; i<1000; i++) {
        p448_randomize(&crand, &a);
        q448_randomize(&crand, sk);
 		if (!i) bzero(&sk, sizeof(sk));
        
        mask_t good = affine_deserialize(&affine, &a);
        if (!good) continue;
        
        convert_affine_to_extensible(&exta,&affine);
        p448_isogeny_un_to_tw(&ext,&exta);
        struct tw_extensible_t exu;
        copy_tw_extensible(&exu, &ext);
        
        edwards_scalar_multiply(&ext,sk);
        p448_isogeny_tw_to_un(&exta,&ext);
        extensible_serialize(&b, &exta);
        
        edwards_scalar_multiply_vt(&exu,sk);
        p448_isogeny_tw_to_un(&exta,&exu);
        extensible_serialize(&c, &exta);
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
        
        if (!p448_is_zero(&d)){
            printf("WNAF validation failure %d!\n", ++failures);
            p448_print("a", &a);
            q448_print("s", sk);
            p448_print("c", &c);
            p448_print("b", &b);
            printf("\n");
        } else if (good) {
            successes ++;
        }
    }
    if (successes < i/3) {
        printf("WNAF variation: only %d/%d successful.\n", successes, i);
    }
        
    successes = failures = 0;
    for (i=0; i<1000; i++) {
        p448_randomize(&crand, &a);
        q448_randomize(&crand, sk);
 		if (!i) bzero(&sk, sizeof(sk));
        
        mask_t good = affine_deserialize(&affine, &a);
        if (!good) continue;
        
        convert_affine_to_extensible(&exta,&affine);
        p448_isogeny_un_to_tw(&ext,&exta);
        struct tw_extensible_t exu;
        copy_tw_extensible(&exu, &ext);
        
        edwards_scalar_multiply(&ext,sk);
        p448_isogeny_tw_to_un(&exta,&ext);
        extensible_serialize(&b, &exta);

        precompute_for_wnaf(wnaft,&exu,5);
        edwards_scalar_multiply_vt_pre(&exu,sk,wnaft,5);
        p448_isogeny_tw_to_un(&exta,&exu);
        extensible_serialize(&c, &exta);
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
        
        if (!p448_is_zero(&d)){
            printf("PreWNAF validation failure %d!\n", ++failures);
            p448_print("a", &a);
            q448_print("s", sk);
            p448_print("c", &c);
            p448_print("b", &b);
            for (j=0; j<1<<5; j++) {
                printf("WNAFT %d\n", j);
                p448_print("  a",&wnaft[j].a);
                p448_print("  b",&wnaft[j].b);
                p448_print("  c",&wnaft[j].c);
            }
            printf("\n\n");
        } else if (good) {
            successes ++;
        }
    }
    if (successes < i/3) {
        printf("PreWNAF variation: only %d/%d successful.\n", successes, i);
    }
    
    successes = failures = 0;
    for (i=0; i<1000; i++) {
        struct p448_t aa;
        struct tw_extensible_t exu,exv,exw;
        
        mask_t good;
        do {
            p448_randomize(&crand, &a);
            good = affine_deserialize(&affine, &a);
            convert_affine_to_extensible(&exta,&affine);
            p448_isogeny_un_to_tw(&ext,&exta);
        } while (!good);
        do {
            p448_randomize(&crand, &aa);
            good = affine_deserialize(&affine, &aa);
            convert_affine_to_extensible(&exta,&affine);
            p448_isogeny_un_to_tw(&exu,&exta);
        } while (!good);
        p448_randomize(&crand, &aa);
        
        q448_randomize(&crand, sk);
 		if (i==0 || i==2) bzero(&sk, sizeof(sk));
        q448_randomize(&crand, tk);
 		if (i==0 || i==1) bzero(&tk, sizeof(tk));
        
        copy_tw_extensible(&exv, &ext);
        copy_tw_extensible(&exw, &exu);
        edwards_scalar_multiply(&exv,sk);
        edwards_scalar_multiply(&exw,tk);
        convert_tw_extensible_to_tw_pniels(&pniels, &exw);
        p448_tw_extensible_add_pniels(&exv,&pniels);
        p448_isogeny_tw_to_un(&exta,&exv);
        extensible_serialize(&b, &exta);

        precompute_for_wnaf(wnaft,&exu,5);
        edwards_combo_var_fixed_vt(&ext,sk,tk,wnaft,5);
        p448_isogeny_tw_to_un(&exta,&exv);
        extensible_serialize(&c, &exta);
        
        p448_sub(&d,&b,&c);
        p448_bias(&d,2);
        
        if (!p448_is_zero(&d)){
            printf("PreWNAF combo validation failure %d!\n", ++failures);
            p448_print("a", &a);
            p448_print("A", &aa);
            q448_print("s", sk);
            q448_print("t", tk);
            p448_print("c", &c);
            p448_print("b", &b);
            printf("\n\n");
        } else if (good) {
            successes ++;
        }
    }
    if (successes < i) {
        printf("PreWNAF combo variation: only %d/%d successful.\n", successes, i);
    }
    
    successes = failures = 0;
    for (i=0; i<1000; i++) {
        p448_randomize(&crand, &a);
        
        q448_randomize(&crand, sk);
        q448_randomize(&crand, tk);
        
 		uint64_t two = 2;
        mask_t good = p448_montgomery_ladder(&b,&a,&two,2,0);
 		p448_montgomery_ladder(&b,&a,sk,448,0);
        p448_montgomery_ladder(&d,&b,tk,448,0);
        p448_montgomery_ladder(&b,&a,tk,448,0);
        p448_montgomery_ladder(&c,&b,sk,448,0);
        
        p448_sub(&b,&c,&d);
        p448_bias(&b,2);
        
        mask_t success = p448_is_zero(&b) | ~good;
        
        if (!success) {
            printf("Ladder validation failure %d!\n", ++failures);
            p448_print("a", &a);
            q448_print("s", sk);
            q448_print("t", tk);
            p448_print("c", &c);
            p448_print("d", &d);
            printf("\n");
        }
    }
    
    return 0;
 }
--- a/crandom.c
+++ b/crandom.c
@@ -0,0 +1,381 @@
 /* Copyright (c) 2011 Stanford University.
 * Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 /* Chacha random number generator code copied from crandom */

 #include "intrinsics.h"
 #include "crandom.h"

 volatile unsigned int crandom_features = 0;

 unsigned int crandom_detect_features() {
  unsigned int out = GEN;
  
 # if (defined(__i386__) || defined(__x86_64__))
    u_int32_t a,b,c,d;
    
    a=1; __asm__("cpuid" : "+a"(a), "=b"(b), "=c"(c), "=d"(d));
    out |= GEN;
    if (d & 1<<26) out |= SSE2;
    if (d & 1<< 9) out |= SSSE3;
    if (c & 1<<25) out |= AESNI;
    if (c & 1<<28) out |= AVX;
    if (b & 1<<5) out  |= AVX2;
    
    a=0x80000001; __asm__("cpuid" : "+a"(a), "=b"(b), "=c"(c), "=d"(d));
    if (c & 1<<11) out |= XOP;
 # endif
  
  return out;
 }

 /* ------------------------------- Vectorized code ------------------------------- */
 #define shuffle(x,i) _mm_shuffle_epi32(x, \
  i + ((i+1)&3)*4 + ((i+2)&3)*16 + ((i+3)&3)*64)

 #define add _mm_add_epi32
 #define add64 _mm_add_epi64

 #define NEED_XOP   (MIGHT_HAVE(XOP))
 #define NEED_SSSE3 (MIGHT_HAVE(SSSE3) && !MUST_HAVE(XOP))
 #define NEED_SSE2  (MIGHT_HAVE(SSE2)  && !MUST_HAVE(SSSE3))
 #define NEED_CONV  (!MUST_HAVE(SSE2))

 #if NEED_XOP
 static __inline__ void
 quarter_round_xop(
    ssereg *a,
    ssereg *b,
    ssereg *c,
    ssereg *d
 ) {
    *a = add(*a,*b); *d = xop_rotate(16, *d ^ *a);
    *c = add(*c,*d); *b = xop_rotate(12, *b ^ *c);
    *a = add(*a,*b); *d = xop_rotate(8,  *d ^ *a);
    *c = add(*c,*d); *b = xop_rotate(7,  *b ^ *c);
 }
 #endif

 #if NEED_SSSE3
 static const ssereg shuffle8  = { 0x0605040702010003ull, 0x0E0D0C0F0A09080Bull };
 static const ssereg shuffle16 = { 0x0504070601000302ull, 0x0D0C0F0E09080B0Aull };
  
 INTRINSIC ssereg ssse3_rotate_8(ssereg a) {
    return _mm_shuffle_epi8(a, shuffle8);
 }
  
 INTRINSIC ssereg ssse3_rotate_16(ssereg a) {
    return _mm_shuffle_epi8(a, shuffle16);
 }
  
 static __inline__ void
 quarter_round_ssse3(
    ssereg *a,
    ssereg *b,
    ssereg *c,
    ssereg *d
 ) {
    *a = add(*a,*b); *d = ssse3_rotate_16(*d ^ *a);
    *c = add(*c,*d); *b = sse2_rotate(12, *b ^ *c);
    *a = add(*a,*b); *d = ssse3_rotate_8( *d ^ *a);
    *c = add(*c,*d); *b = sse2_rotate(7,  *b ^ *c);
 }
 #endif /* MIGHT_HAVE(SSSE3) && !MUST_HAVE(XOP) */

 #if NEED_SSE2
 static __inline__ void
 quarter_round_sse2(
    ssereg *a,
    ssereg *b,
    ssereg *c,
    ssereg *d
 ) {
    *a = add(*a,*b); *d = sse2_rotate(16, *d ^ *a);
    *c = add(*c,*d); *b = sse2_rotate(12, *b ^ *c);
    *a = add(*a,*b); *d = sse2_rotate(8,  *d ^ *a);
    *c = add(*c,*d); *b = sse2_rotate(7,  *b ^ *c);
 }
 #endif

 #define DOUBLE_ROUND(qrf) { \
  qrf(&a1,&b1,&c1,&d1);     \
  qrf(&a2,&b2,&c2,&d2);     \
  b1 = shuffle(b1,1);       \
  c1 = shuffle(c1,2);       \
  d1 = shuffle(d1,3);       \
  b2 = shuffle(b2,1);       \
  c2 = shuffle(c2,2);       \
  d2 = shuffle(d2,3);       \
                            \
  qrf(&a1,&b1,&c1,&d1);     \
  qrf(&a2,&b2,&c2,&d2);     \
  b1 = shuffle(b1,3);       \
  c1 = shuffle(c1,2);       \
  d1 = shuffle(d1,1);       \
  b2 = shuffle(b2,3);       \
  c2 = shuffle(c2,2);       \
  d2 = shuffle(d2,1);       \
                          }
                          
 #define OUTPUT_FUNCTION   { \
  output[0] = add(a1,aa);   \
  output[1] = add(b1,bb);   \
  output[2] = add(c1,cc);   \
  output[3] = add(d1,dd);   \
  output[4] = add(a2,aa);   \
  output[5] = add(b2,bb);   \
  output[6] = add(c2,add(cc,p)); \
  output[7] = add(d2,dd);   \
                            \
  output += 8;              \
                            \
  cc = add64(add64(cc,p), p); \
  a1 = a2 = aa;             \
  b1 = b2 = bb;             \
  c1 = cc; c2 = add64(cc,p);\
  d1 = d2 = dd;             \
                          }
 /* ------------------------------------------------------------------------------- */

 INTRINSIC u_int32_t rotate(int r, u_int32_t a) {
    return a<<r ^ a>>(32-r);
 }

 static __inline__ void
 quarter_round(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d) {
    *a = *a + *b; *d = rotate(16, *d^*a);
    *c = *c + *d; *b = rotate(12, *b^*c);
    *a = *a + *b; *d = rotate(8,  *d^*a);
    *c = *c + *d; *b = rotate(7,  *b^*c);
 }

 static void
 crandom_chacha_expand(u_int64_t iv,
                         u_int64_t ctr,
                         int nr,
                         int output_size,
                         const unsigned char *key_,
                         unsigned char *output_) {
 # if MIGHT_HAVE_SSE2
    if (HAVE(SSE2)) {
        ssereg *key = (ssereg *)key_;
        ssereg *output = (ssereg *)output_;
                 
        ssereg a1 = key[0], a2 = a1, aa = a1,
               b1 = key[1], b2 = b1, bb = b1,
               c1 = {iv, ctr}, c2 = {iv, ctr+1}, cc = c1,
               d1 = {0x3320646e61707865ull, 0x6b20657479622d32ull},
               d2 = d1, dd = d1,
               p = {0, 1};
 
        int i,r;
 #   if (NEED_XOP)
        if (HAVE(XOP)) {
            for (i=0; i<output_size; i+=128) {
                for (r=nr; r>0; r-=2)
                    DOUBLE_ROUND(quarter_round_xop);
                OUTPUT_FUNCTION;
            }
            return;
        }
 #   endif
 #   if (NEED_SSSE3)
        if (HAVE(SSSE3)) {
            for (i=0; i<output_size; i+=128) {
                for (r=nr; r>0; r-=2)
                    DOUBLE_ROUND(quarter_round_ssse3);
                OUTPUT_FUNCTION;
            }
            return;
        }
 #   endif
 #   if (NEED_SSE2)
        if (HAVE(SSE2)) {
            for (i=0; i<output_size; i+=128) {
                for (r=nr; r>0; r-=2)
                    DOUBLE_ROUND(quarter_round_sse2);
                OUTPUT_FUNCTION;
            }
            return;
        }
 #   endif
    }
 # endif

 # if NEED_CONV
    {
        const u_int32_t *key = (const u_int32_t *)key_;
        u_int32_t
        x[16],
        input[16] = {
            key[0], key[1], key[2], key[3],
            key[4], key[5], key[6], key[7],
            iv, iv>>32, ctr, ctr>>32,
            0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
        },
        *output = (u_int32_t *)output_;
        int i, r;

        for (i=0; i<output_size; i+= 64) {
            for (r=0; r<16; r++) {
                x[r] = input[r];
            }
                for (r=nr; r>0; r-=2) {
                quarter_round(&x[0], &x[4],  &x[8], &x[12]);
                quarter_round(&x[1], &x[5],  &x[9], &x[13]);
                quarter_round(&x[2], &x[6], &x[10], &x[14]);
                quarter_round(&x[3], &x[7], &x[11], &x[15]);

                quarter_round(&x[0], &x[5], &x[10], &x[15]);
                quarter_round(&x[1], &x[6], &x[11], &x[12]);
                quarter_round(&x[2], &x[7],  &x[8], &x[13]);
                quarter_round(&x[3], &x[4],  &x[9], &x[14]);
            }
            for (r=0; r<16; r++) {
                output[r] = x[r] + input[r];
            }

            output += 16;
            input[11] ++;
            if (!input[11]) input[12]++;
        }
    }
  
 #endif /* NEED_CONV */
 }

 /* "return 4", cf xkcd #221 */
 #define CRANDOM_MAGIC 0x72657475726e2034ull

 int
 crandom_init_from_file(
    struct crandom_state_t *state,
    const char *filename,
    int reseed_interval,
    int reseeds_mandatory
 ) {
    state->fill = 0;
    state->reseed_countdown = reseed_interval;
    state->reseed_interval = reseed_interval;
    state->ctr = 0;

    state->randomfd = open(filename, O_RDONLY);
    if (state->randomfd == -1) {
        int err = errno;
        return err ? err : -1;
    }

    ssize_t offset = 0, red;
    do {
        red = read(state->randomfd, state->seed + offset, 32 - offset);
        if (red > 0) offset += red;
    } while (red > 0 && offset < 32);

    if (offset < 32) {
        int err = errno;
        return err ? err : -1;
    }

    bzero(state->buffer, 96);

    state->magic = CRANDOM_MAGIC;
    state->reseeds_mandatory = reseeds_mandatory;

    return 0;
 }

 void
 crandom_init_from_buffer(
    struct crandom_state_t *state,
    const char initial_seed[32]
 ) {
    memcpy(state->seed, initial_seed, 32);
    bzero(state->buffer, 96);
    state->reseed_countdown = state->reseed_interval = state->fill = state->ctr = state->reseeds_mandatory = 0;
    state->randomfd = -1;
    state->magic = CRANDOM_MAGIC;
 }

 int
 crandom_generate(
    struct crandom_state_t *state,
    unsigned char *output,
    unsigned long long length
 ) {
    /* the generator isn't seeded; maybe they ignored the return value of init_from_file */
    if (unlikely(state->magic != CRANDOM_MAGIC)) abort();

    int ret = 0;

    while (length) {
        if (unlikely(state->fill <= 0)) {
            uint64_t iv = 0;
            if (state->reseed_interval) {
                /* it's nondeterministic, stir in some rdtsc() */
                iv = rdtsc();

                state->reseed_countdown--;
                if (unlikely(state->reseed_countdown <= 0)) {
                    /* reseed by xoring in random state */
                    state->reseed_countdown = state->reseed_interval;
                    ssize_t offset = 0, red;
                    do {
                        red = read(state->randomfd, state->buffer + offset, 32 - offset);
                        if (red > 0) offset += red;
                    } while (red > 0 && offset < 32);

                    if (offset < 32) {
                        /* The read failed.  Signal an error with the return code.
                         *
                         * If reseeds are mandatory, crash.
                         *
                         * If not, the generator is still probably safe to use, because reseeding
                         * is basically over-engineering for caution.  Also, the user might ignore
                         * the return code, so we still need to fill the request.
                         *
                         * Set reseed_countdown = 1 so we'll try again later.  If the user's perf
                         * sucks as a result of ignoring the error code while calling us in a loop,
                         * well, he gets what he deserves.
                         */
                        if (state->reseeds_mandatory) abort();

                        ret = errno;
                        if (ret == 0) ret = -1;
                        state->reseed_countdown = 1;
                    }

                    int i;
                    for (i=0; i<32; i++) {
                        /* Stir in the buffer.  If somehow the read failed, it'll be zeros. */
                        state->seed[i] ^= state->buffer[i];
                    }
                }
            }
            crandom_chacha_expand(iv,state->ctr,20,128,state->seed,state->seed);
            state->ctr++;
            state->fill = sizeof(state->buffer);
        }

        unsigned long long copy = (length > state->fill) ? state->fill : length;
        state->fill -= copy;
        memcpy(output, state->buffer + state->fill, copy);
        bzero(state->buffer + state->fill, copy);
        output += copy; length -= copy;
    }

    return ret;
 }

 void
 crandom_destroy(
    struct crandom_state_t *state
 ) {
    if (state->randomfd) close(state->randomfd);
    /* Ignore the return value, because what would it mean?
     * "Your random device, which you were reading over NFS, lost some data"?
     */

    bzero(state, sizeof(*state));
 }
--- a/crandom.h
+++ b/crandom.h
@@ -0,0 +1,66 @@
 /* Copyright (c) 2011 Stanford University.
 * Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 /* A miniature version of the (as of yet incomplete) crandom project. */

 #ifndef __GOLDI_CRANDOM_H__
 #define __GOLDI_CRANDOM_H__ 1

 #include <stdint.h>  /* for uint64_t */
 #include <fcntl.h>   /* for open */
 #include <errno.h>   /* for returning errors after open */
 #include <stdlib.h>  /* for abort */
 #include <string.h>  /* for memcpy */
 #include <strings.h> /* for bzero */
 #include <unistd.h>  /* for read */

 struct crandom_state_t {
    unsigned char seed[32];
    unsigned char buffer[96];
    uint64_t ctr;
    uint64_t magic;
    unsigned int fill;
    int reseed_countdown;
    int reseed_interval;
    int reseeds_mandatory;
    int randomfd;
 } __attribute__((aligned(16))) ;

 #ifdef __cplusplus
 extern "C" {
 #endif

 int
 crandom_init_from_file(
    struct crandom_state_t *state,
    const char *filename,
    int reseed_interval,
    int reseeds_mandatory
 ) __attribute__((warn_unused_result));

 void
 crandom_init_from_buffer(
    struct crandom_state_t *state,
    const char initial_seed[32]
 );

 /* TODO : attribute warn for not checking return type? */
 int
 crandom_generate(
    struct crandom_state_t *state,
    unsigned char *output,
    unsigned long long length
 );

 void
 crandom_destroy(
    struct crandom_state_t *state
 );

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __GOLDI_CRANDOM_H__ */
--- a/ec_point.c
+++ b/ec_point.c
@@ -0,0 +1,621 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 /* This file was generated with the assistance of a tool written in SAGE. */
 #include "ec_point.h"


 void
 p448_isr (
    struct p448_t*       a,
    const struct p448_t* x
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L1,     x );
    p448_mul  (   &L2,     x,   &L1 );
    p448_sqr  (   &L1,   &L2 );
    p448_mul  (   &L2,     x,   &L1 );
    p448_sqrn (   &L1,   &L2,     3 );
    p448_mul  (   &L0,   &L2,   &L1 );
    p448_sqrn (   &L1,   &L0,     3 );
    p448_mul  (   &L0,   &L2,   &L1 );
    p448_sqrn (   &L2,   &L0,     9 );
    p448_mul  (   &L1,   &L0,   &L2 );
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (   &L2,     x,   &L0 );
    p448_sqrn (   &L0,   &L2,    18 );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_sqrn (   &L0,   &L2,    37 );
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_sqrn (   &L0,   &L1,    37 );
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_sqrn (   &L0,   &L1,   111 );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_sqr  (   &L0,   &L2 );
    p448_mul  (   &L1,     x,   &L0 );
    p448_sqrn (   &L0,   &L1,   223 );
    p448_mul  (     a,   &L2,   &L0 );
 }

 void
 p448_inverse (
    struct p448_t*       a,
    const struct p448_t* x
 ) {
    struct p448_t L0, L1;
    p448_isr  (   &L0,     x );
    p448_sqr  (   &L1,   &L0 );
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (     a,     x,   &L0 );
 }

 void
 p448_tw_extensible_add_niels (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 ) {
    struct p448_t L0, L1;
    p448_bias ( &d->y,     2 );
    p448_bias ( &d->z,     2 );
    p448_sub  (   &L1, &d->y, &d->x );
    p448_mul  (   &L0, &e->a,   &L1 );
    p448_add  (   &L1, &d->x, &d->y );
    p448_mul  ( &d->y, &e->b,   &L1 );
    p448_bias ( &d->y,     2 );
    p448_mul  (   &L1, &d->u, &d->t );
    p448_mul  ( &d->x, &e->c,   &L1 );
    p448_add  ( &d->u,   &L0, &d->y );
    p448_sub  ( &d->t, &d->y,   &L0 );
    p448_sub  ( &d->y, &d->z, &d->x );
    p448_add  (   &L0, &d->x, &d->z );
    p448_mul  ( &d->z,   &L0, &d->y );
    p448_mul  ( &d->x, &d->y, &d->t );
    p448_mul  ( &d->y,   &L0, &d->u );
 }

 void
 p448_tw_extensible_add_pniels (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 ) {
    struct p448_t L0;
    p448_mul  (   &L0, &e->z, &a->z );
    p448_copy ( &e->z,   &L0 );
    p448_tw_extensible_add_niels(     e, &a->n );
 }

 void
 p448_tw_extensible_double (
    struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->x );
    p448_sqr  (   &L0, &a->y );
    p448_add  ( &a->u,   &L2,   &L0 );
    p448_add  ( &a->t, &a->y, &a->x );
    p448_sqr  (   &L1, &a->t );
    p448_bias (   &L1,     3 );
    p448_sub  ( &a->t,   &L1, &a->u );
    p448_sub  (   &L1,   &L0,   &L2 );
    p448_bias (   &L1,     2 );
    p448_sqr  ( &a->x, &a->z );
    p448_bias ( &a->x,     2 );
    p448_add  ( &a->z, &a->x, &a->x );
    p448_sub  (   &L0, &a->z,   &L1 );
    p448_mul  ( &a->z,   &L1,   &L0 );
    p448_mul  ( &a->x,   &L0, &a->t );
    p448_mul  ( &a->y,   &L1, &a->u );
 }

 void
 p448_extensible_double (
    struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->x );
    p448_sqr  (   &L0, &a->y );
    p448_add  (   &L1,   &L2,   &L0 );
    p448_add  ( &a->t, &a->y, &a->x );
    p448_sqr  ( &a->u, &a->t );
    p448_bias ( &a->u,     3 );
    p448_sub  ( &a->t, &a->u,   &L1 );
    p448_sub  ( &a->u,   &L0,   &L2 );
    p448_bias ( &a->u,     2 );
    p448_sqr  ( &a->x, &a->z );
    p448_bias ( &a->x,     2 );
    p448_add  ( &a->z, &a->x, &a->x );
    p448_sub  (   &L0, &a->z,   &L1 );
    p448_mul  ( &a->z,   &L1,   &L0 );
    p448_mul  ( &a->x,   &L0, &a->t );
    p448_mul  ( &a->y,   &L1, &a->u );
 }

 void
 p448_isogeny_un_to_tw (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    struct p448_t L0;
    p448_sqr  ( &b->x, &a->x );
    p448_sqr  ( &b->z, &a->y );
    p448_add  ( &b->u, &b->x, &b->z );
    p448_add  ( &b->t, &a->y, &a->x );
    p448_sqr  (   &L0, &b->t );
    p448_bias (   &L0,     3 );
    p448_sub  ( &b->t,   &L0, &b->u );
    p448_sub  (   &L0, &b->z, &b->x );
    p448_bias (   &L0,     2 );
    p448_sqr  ( &b->x, &a->z );
    p448_bias ( &b->x,     2 );
    p448_add  ( &b->z, &b->x, &b->x );
    p448_sub  ( &b->y, &b->z, &b->u );
    p448_mul  ( &b->z,   &L0, &b->y );
    p448_mul  ( &b->x, &b->y, &b->t );
    p448_mul  ( &b->y,   &L0, &b->u );
 }

 void
 p448_isogeny_tw_to_un (
    struct extensible_t*          b,
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0;
    p448_sqr  ( &b->x, &a->x );
    p448_sqr  ( &b->z, &a->y );
    p448_add  (   &L0, &b->x, &b->z );
    p448_add  ( &b->t, &a->y, &a->x );
    p448_sqr  ( &b->u, &b->t );
    p448_bias ( &b->u,     3 );
    p448_sub  ( &b->t, &b->u,   &L0 );
    p448_sub  ( &b->u, &b->z, &b->x );
    p448_bias ( &b->u,     2 );
    p448_sqr  ( &b->x, &a->z );
    p448_bias ( &b->x,     2 );
    p448_add  ( &b->z, &b->x, &b->x );
    p448_sub  ( &b->y, &b->z, &b->u );
    p448_mul  ( &b->z,   &L0, &b->y );
    p448_mul  ( &b->x, &b->y, &b->t );
    p448_mul  ( &b->y,   &L0, &b->u );
 }

 void
 convert_tw_affine_to_tw_pniels (
    struct tw_pniels_t*       b,
    const struct tw_affine_t* a
 ) {
    p448_sub  ( &b->n.a, &a->y, &a->x );
    p448_bias ( &b->n.a,     2 );
    p448_weak_reduce( &b->n.a );
    p448_add  ( &b->n.b, &a->x, &a->y );
    p448_weak_reduce( &b->n.b );
    p448_mul  ( &b->n.c, &a->y, &a->x );
    p448_mulw ( &b->z, &b->n.c, 78164 );
    p448_neg  ( &b->n.c, &b->z );
    p448_bias ( &b->n.c,     2 );
    p448_weak_reduce( &b->n.c );
    p448_set_ui( &b->z,     2 );
 }

 void
 convert_tw_affine_to_tw_extensible (
    struct tw_extensible_t*   b,
    const struct tw_affine_t* a
 ) {
    p448_copy ( &b->x, &a->x );
    p448_copy ( &b->y, &a->y );
    p448_set_ui( &b->z,     1 );
    p448_copy ( &b->t, &a->x );
    p448_copy ( &b->u, &a->y );
 }

 void
 convert_affine_to_extensible (
    struct extensible_t*   b,
    const struct affine_t* a
 ) {
    p448_copy ( &b->x, &a->x );
    p448_copy ( &b->y, &a->y );
    p448_set_ui( &b->z,     1 );
    p448_copy ( &b->t, &a->x );
    p448_copy ( &b->u, &a->y );
 }

 void
 convert_tw_extensible_to_tw_pniels (
    struct tw_pniels_t*           b,
    const struct tw_extensible_t* a
 ) {
    p448_sub  ( &b->n.a, &a->y, &a->x );
    p448_bias ( &b->n.a,     2 );
    p448_weak_reduce( &b->n.a );
    p448_add  ( &b->n.b, &a->x, &a->y );
    p448_weak_reduce( &b->n.b );
    p448_mul  ( &b->n.c, &a->u, &a->t );
    p448_mulw ( &b->z, &b->n.c, 78164 );
    p448_neg  ( &b->n.c, &b->z );
    p448_bias ( &b->n.c,     2 );
    p448_weak_reduce( &b->n.c );
    p448_add  ( &b->z, &a->z, &a->z );
    p448_weak_reduce( &b->z );
 }

 void
 convert_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* d
 ) {
    p448_add  ( &e->u, &d->n.b, &d->n.a );
    p448_sub  ( &e->t, &d->n.b, &d->n.a );
    p448_bias ( &e->t,     2 );
    p448_mul  ( &e->x, &d->z, &e->t );
    p448_mul  ( &e->y, &d->z, &e->u );
    p448_sqr  ( &e->z, &d->z );
 }

 void
 convert_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  e,
    const struct tw_niels_t* d
 ) {
    p448_add  ( &e->y, &d->b, &d->a );
    p448_weak_reduce( &e->y );
    p448_sub  ( &e->x, &d->b, &d->a );
    p448_bias ( &e->x,     2 );
    p448_weak_reduce( &e->x );
    p448_set_ui( &e->z,     1 );
    p448_copy ( &e->t, &e->x );
    p448_copy ( &e->u, &e->y );
 }

 void
 p448_montgomery_step (
    struct montgomery_t* a
 ) {
    struct p448_t L0, L1;
    p448_bias ( &a->xd,     2 );
    p448_bias ( &a->xa,     2 );
    p448_add  (   &L0, &a->zd, &a->xd );
    p448_sub  (   &L1, &a->xd, &a->zd );
    p448_sub  ( &a->zd, &a->xa, &a->za );
    p448_mul  ( &a->xd,   &L0, &a->zd );
    p448_bias ( &a->xd,     2 );
    p448_add  ( &a->zd, &a->za, &a->xa );
    p448_mul  ( &a->za,   &L1, &a->zd );
    p448_add  ( &a->xa, &a->za, &a->xd );
    p448_sqr  ( &a->zd, &a->xa );
    p448_mul  ( &a->xa, &a->z0, &a->zd );
    p448_sub  ( &a->zd, &a->xd, &a->za );
    p448_sqr  ( &a->za, &a->zd );
    p448_sqr  ( &a->xd,   &L0 );
    p448_bias ( &a->xd,     2 );
    p448_sqr  (   &L0,   &L1 );
    p448_mulw ( &a->zd, &a->xd, 39082 );
    p448_bias ( &a->zd,     4 );
    p448_sub  (   &L1, &a->xd,   &L0 );
    p448_mul  ( &a->xd,   &L0, &a->zd );
    p448_sub  (   &L0, &a->zd,   &L1 );
    p448_mul  ( &a->zd,   &L0,   &L1 );
 }

 void
 p448_montgomery_serialize (
    struct p448_t*             sign,
    struct p448_t*             ser,
    const struct montgomery_t* a,
    const struct p448_t*       sbz
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_mul  (   &L2, &a->z0, &a->zd );
    p448_bias (   &L2,     2 );
    p448_sub  (   &L0,   &L2, &a->xd );
    p448_mul  (   &L2, &a->za,   &L0 );
    p448_bias (   &L2,     2 );
    p448_mul  (   &L1, &a->z0, &a->xd );
    p448_bias (   &L1,     2 );
    p448_sub  (   &L0,   &L1, &a->zd );
    p448_mul  (   &L3, &a->xa,   &L0 );
    p448_add  (   &L1,   &L3,   &L2 );
    p448_sub  (   &L0,   &L2,   &L3 );
    p448_mul  (   &L2,   &L0,   &L1 );
    p448_mul  (   &L0,   sbz,   &L2 );
    p448_mul  (   &L2, &a->zd,   &L0 );
    p448_mul  (  sign,   &L2, &a->zd );
    p448_mul  (   ser,   &L2, &a->xd );
    p448_mul  (   &L2,  sign,   ser );
    p448_isr  (   &L1,   &L2 );
    p448_mul  (   ser,  sign,   &L1 );
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (  sign,   &L2,   &L0 );
 }

 void
 extensible_serialize (
    struct p448_t*             b,
    const struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sub  (   &L0, &a->y, &a->z );
    p448_bias (   &L0,     2 );
    p448_add  (     b, &a->z, &a->y );
    p448_mul  (   &L1, &a->z, &a->x );
    p448_mul  (   &L2,   &L0,   &L1 );
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_mul  (   &L0,   &L2,     b );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_isr  (   &L0,   &L2 );
    p448_mul  (     b,   &L1,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,   &L2,   &L1 );
 }

 void
 isogeny_and_serialize (
    struct p448_t*                b,
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_mul  (   &L3, &a->y, &a->x );
    p448_add  (   &L1, &a->y, &a->x );
    p448_sqr  (     b,   &L1 );
    p448_add  (   &L2,   &L3,   &L3 );
    p448_sub  (   &L1,     b,   &L2 );
    p448_bias (   &L1,     3 );
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (     b,   &L2 );
    p448_add  (   &L2,   &L1,   &L1 );
    p448_mulw (   &L1,   &L2, 39082 );
    p448_neg  (   &L2,   &L1 );
    p448_bias (   &L2,     2 );
    p448_mulw (   &L0,   &L2, 39082 );
    p448_neg  (   &L1,   &L0 );
    p448_bias (   &L1,     2 );
    p448_mul  (   &L0,   &L2,     b );
    p448_mul  (     b,   &L1,   &L0 );
    p448_isr  (   &L0,     b );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,     b,   &L1 );
    p448_mul  (     b,   &L2,   &L3 );
 }

 mask_t
 affine_deserialize (
    struct affine_t*     a,
    const struct p448_t* sz
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_sqr  (   &L1,    sz );
    p448_copy (   &L3,   &L1 );
    p448_addw (   &L3,     1 );
    p448_sqr  ( &a->x,   &L3 );
    p448_mulw (   &L3, &a->x, 39082 );
    p448_neg  ( &a->x,   &L3 );
    p448_add  (   &L3,   &L1,   &L1 );
    p448_bias (   &L3,     1 );
    p448_add  ( &a->y,   &L3,   &L3 );
    p448_add  (   &L3, &a->y, &a->x );
    p448_copy ( &a->y,   &L1 );
    p448_subw ( &a->y,     1 );
    p448_neg  ( &a->x, &a->y );
    p448_bias ( &a->x,     2 );
    p448_mul  ( &a->y, &a->x,   &L3 );
    p448_sqr  (   &L2, &a->x );
    p448_mul  (   &L0,   &L2, &a->y );
    p448_mul  ( &a->y, &a->x,   &L0 );
    p448_isr  (   &L3, &a->y );
    p448_mul  ( &a->y,   &L2,   &L3 );
    p448_sqr  (   &L2,   &L3 );
    p448_mul  (   &L3,   &L0,   &L2 );
    p448_mul  (   &L0, &a->x,   &L3 );
    p448_bias (   &L0,     1 );
    p448_add  (   &L2, &a->y, &a->y );
    p448_mul  ( &a->x,    sz,   &L2 );
    p448_addw (   &L1,     1 );
    p448_mul  ( &a->y,   &L1,   &L3 );
    p448_subw (   &L0,     1 );
    return p448_is_zero(   &L0 );
 }

 void
 set_identity_extensible (
    struct extensible_t* a
 ) {
    p448_set_ui( &a->x,     0 );
    p448_set_ui( &a->y,     1 );
    p448_set_ui( &a->z,     1 );
    p448_set_ui( &a->t,     0 );
    p448_set_ui( &a->u,     0 );
 }

 void
 set_identity_tw_extensible (
    struct tw_extensible_t* a
 ) {
    p448_set_ui( &a->x,     0 );
    p448_set_ui( &a->y,     1 );
    p448_set_ui( &a->z,     1 );
    p448_set_ui( &a->t,     0 );
    p448_set_ui( &a->u,     0 );
 }

 void
 set_identity_affine (
    struct affine_t* a
 ) {
    p448_set_ui( &a->x,     0 );
    p448_set_ui( &a->y,     1 );
 }

 mask_t
 eq_affine (
    const struct affine_t* a,
    const struct affine_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2;
    p448_sub  (   &L2, &a->x, &b->x );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_sub  (   &L2, &a->y, &b->y );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 mask_t
 eq_extensible (
    const struct extensible_t* a,
    const struct extensible_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 mask_t
 eq_tw_extensible (
    const struct tw_extensible_t* a,
    const struct tw_extensible_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
    p448_bias (   &L2,     2 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 void
 elligator_2s_inject (
    struct affine_t*     a,
    const struct p448_t* r
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5, L6, L7, L8, L9;
    p448_sqr  ( &a->x,     r );
    p448_sqr  (   &L3, &a->x );
    p448_copy ( &a->y,   &L3 );
    p448_subw ( &a->y,     1 );
    p448_neg  (   &L9, &a->y );
    p448_bias (   &L9,     2 );
    p448_sqr  (   &L2,   &L9 );
    p448_bias (   &L2,     1 );
    p448_mulw (   &L7,   &L2, 1527402724 );
    p448_bias (   &L7,     2 );
    p448_mulw (   &L8,   &L3, 6108985600 );
    p448_add  ( &a->y,   &L8,   &L7 );
    p448_mulw (   &L8,   &L2, 6109454568 );
    p448_sub  (   &L7, &a->y,   &L8 );
    p448_mulw (   &L4, &a->y, 78160 );
    p448_mul  (   &L6,   &L7,   &L9 );
    p448_mul  (   &L8,   &L6,   &L4 );
    p448_mul  (   &L4,   &L7,   &L8 );
    p448_isr  (   &L5,   &L4 );
    p448_mul  (   &L4,   &L6,   &L5 );
    p448_sqr  (   &L6,   &L5 );
    p448_mul  (   &L5,   &L8,   &L6 );
    p448_mul  (   &L8,   &L7,   &L5 );
    p448_mul  (   &L7,   &L8,   &L5 );
    p448_copy (   &L6, &a->x );
    p448_subw (   &L6,     1 );
    p448_addw ( &a->x,     1 );
    p448_mul  (   &L5, &a->x,   &L8 );
    p448_sub  ( &a->x,   &L6,   &L5 );
    p448_bias ( &a->x,     3 );
    p448_mul  (   &L5,   &L4, &a->x );
    p448_mulw (   &L4,   &L5, 78160 );
    p448_neg  ( &a->x,   &L4 );
    p448_bias ( &a->x,     2 );
    p448_weak_reduce( &a->x );
    p448_add  (   &L4,   &L3,   &L3 );
    p448_add  (   &L3,   &L4,   &L2 );
    p448_subw (   &L3,     2 );
    p448_mul  (   &L2,   &L3,   &L8 );
    p448_mulw (   &L3,   &L2, 3054649120 );
    p448_add  (   &L2,   &L3, &a->y );
    p448_mul  ( &a->y,   &L7,   &L2 );
       L1 = p448_is_zero(   &L9 );
       L0 = -   L1;
    p448_addw ( &a->y,    L0 );
    p448_weak_reduce( &a->y );
 }

 mask_t
 p448_affine_validate (
    const struct affine_t* a
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_sqr  (   &L0, &a->y );
    p448_sqr  (   &L2, &a->x );
    p448_add  (   &L3,   &L2,   &L0 );
    p448_subw (   &L3,     1 );
    p448_mulw (   &L1,   &L2, 39081 );
    p448_neg  (   &L2,   &L1 );
    p448_bias (   &L2,     2 );
    p448_mul  (   &L1,   &L0,   &L2 );
    p448_sub  (   &L0,   &L3,   &L1 );
    p448_bias (   &L0,     3 );
    return p448_is_zero(   &L0 );
 }

 mask_t
 p448_tw_extensible_validate (
    const struct tw_extensible_t* ext
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5;
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L2, &ext->t, &ext->u );
    p448_mul  (   &L4, &ext->z,   &L2 );
    p448_addw (   &L4,     0 );
    p448_mul  (   &L3, &ext->x, &ext->y );
    p448_neg  (   &L2,   &L3 );
    p448_add  (   &L3,   &L2,   &L4 );
    p448_bias (   &L3,     2 );
       L1 = p448_is_zero(   &L3 );
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
     */
    p448_sqr  (   &L4, &ext->y );
    p448_neg  (   &L2,   &L4 );
    p448_addw (   &L2,     0 );
    p448_sqr  (   &L3, &ext->x );
    p448_bias (   &L3,     4 );
    p448_add  (   &L4,   &L3,   &L2 );
    p448_sqr  (   &L5, &ext->u );
    p448_sqr  (   &L3, &ext->t );
    p448_mul  (   &L2,   &L3,   &L5 );
    p448_mulw (   &L3,   &L2, 39081 );
    p448_neg  (   &L5,   &L3 );
    p448_add  (   &L3,   &L5,   &L4 );
    p448_neg  (   &L5,   &L2 );
    p448_add  (   &L4,   &L5,   &L3 );
    p448_sqr  (   &L3, &ext->z );
    p448_add  (   &L2,   &L3,   &L4 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }


--- a/ec_point.h
+++ b/ec_point.h
@@ -0,0 +1,440 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 /* This file was generated with the assistance of a tool written in SAGE. */
 #ifndef __CC_INCLUDED_P448_EDWARDS_H__
 #define __CC_INCLUDED_P448_EDWARDS_H__

 #include "p448.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 /*
 * Affine point on an Edwards curve.
 */
 struct affine_t {
    struct p448_t x, y;
 };

 /*
 * Affine point on a twisted Edwards curve.
 */
 struct tw_affine_t {
    struct p448_t x, y;
 };

 /*
 * Montgomery buffer.
 */
 struct montgomery_t {
    struct p448_t z0, xd, zd, xa, za;
 };

 /*
 * Extensible coordinates for Edwards curves, suitable for
 * accumulators.
 * 
 * Represents the point (x/z, y/z).  The extra coordinates
 * t,u satisfy xy = tuz, allowing for conversion to Extended
 * form by multiplying t and u.
 * 
 * The idea is that you don't have to do this multiplication
 * when doubling the accumulator, because the t-coordinate
 * isn't used there.  At the same time, as long as you only
 * have one point in extensible form, additions don't cost
 * extra.
 * 
 * This is essentially a lazier version of Hisil et al's
 * lookahead trick.  It might be worth considering that trick
 * instead.
 */
 struct extensible_t {
    struct p448_t x, y, z, t, u;
 };

 /*
 * Extensible coordinates for twisted Edwards curves,
 * suitable for accumulators.
 */
 struct tw_extensible_t {
    struct p448_t x, y, z, t, u;
 };

 /*
 * Niels coordinates for twisted Edwards curves.  Good for
 * mixed readdition; suitable for fixed tables.
 */
 struct tw_niels_t {
    struct p448_t a, b, c;
 };

 /*
 * Projective niels coordinates for twisted Edwards curves.
 * Good for readdition; suitable for temporary tables.
 */
 struct tw_pniels_t {
    struct tw_niels_t n;
    struct p448_t z;
 };


 /*
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_affine (
    struct affine_t*       a,
    const struct affine_t* ds
 ) __attribute__((unused,always_inline));

 /*
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_tw_affine (
    struct tw_affine_t*       a,
    const struct tw_affine_t* ds
 ) __attribute__((unused,always_inline));

 /*
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_montgomery (
    struct montgomery_t*       a,
    const struct montgomery_t* ds
 ) __attribute__((unused,always_inline));

 /*
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_extensible (
    struct extensible_t*       a,
    const struct extensible_t* ds
 ) __attribute__((unused,always_inline));

 /*
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_tw_extensible (
    struct tw_extensible_t*       a,
    const struct tw_extensible_t* ds
 ) __attribute__((unused,always_inline));

 /*
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_tw_niels (
    struct tw_niels_t*       a,
    const struct tw_niels_t* ds
 ) __attribute__((unused,always_inline));

 /*
 * Auto-generated copy method.
 */
 static __inline__ void
 copy_tw_pniels (
    struct tw_pniels_t*       a,
    const struct tw_pniels_t* ds
 ) __attribute__((unused,always_inline));

 /*
 * Returns 1/sqrt(+- x).
 * 
 * The Legendre symbol of the result is the same as that of the
 * input.
 * 
 * If x=0, returns 0.
 */
 void
 p448_isr (
    struct p448_t*       a,
    const struct p448_t* x
 );

 /*
 * Returns 1/x.
 * 
 * If x=0, returns 0.
 */
 void
 p448_inverse (
    struct p448_t*       a,
    const struct p448_t* x
 );

 /*
 * Add two points on a twisted Edwards curve, one in Extensible form
 * and the other in half-Niels form.
 */
 void
 p448_tw_extensible_add_niels (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 );

 /*
 * Add two points on a twisted Edwards curve, one in Extensible form
 * and the other in projective Niels form.
 */
 void
 p448_tw_extensible_add_pniels (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 );

 /*
 * Double a point on a twisted Edwards curve, in "extensible" coordinates.
 */
 void
 p448_tw_extensible_double (
    struct tw_extensible_t* a
 );

 /*
 * Double a point on an Edwards curve, in "extensible" coordinates.
 */
 void
 p448_extensible_double (
    struct extensible_t* a
 );

 /*
 * 4-isogeny from untwisted to twisted.
 */
 void
 p448_isogeny_un_to_tw (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 );

 /*
 * Dual 4-isogeny from twisted to untwisted.
 */
 void
 p448_isogeny_tw_to_un (
    struct extensible_t*          b,
    const struct tw_extensible_t* a
 );

 void
 convert_tw_affine_to_tw_pniels (
    struct tw_pniels_t*       b,
    const struct tw_affine_t* a
 );

 void
 convert_tw_affine_to_tw_extensible (
    struct tw_extensible_t*   b,
    const struct tw_affine_t* a
 );

 void
 convert_affine_to_extensible (
    struct extensible_t*   b,
    const struct affine_t* a
 );

 void
 convert_tw_extensible_to_tw_pniels (
    struct tw_pniels_t*           b,
    const struct tw_extensible_t* a
 );

 void
 convert_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* d
 );

 void
 convert_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  e,
    const struct tw_niels_t* d
 );

 void
 p448_montgomery_step (
    struct montgomery_t* a
 );

 void
 p448_montgomery_serialize (
    struct p448_t*             sign,
    struct p448_t*             ser,
    const struct montgomery_t* a,
    const struct p448_t*       sbz
 );

 /*
 * Serialize a point on an Edwards curve
 * The serialized form would be sqrt((z-y)/(z+y)) with sign of xz
 * It would be on 4y^2/(1-d) = x^3 + 2(1+d)/(1-d) * x^2 + x.
 * But 4/(1-d) isn't square, so we need to twist it:
 * -x is on 4y^2/(d-1) = x^3 + 2(d+1)/(d-1) * x^2 + x
 */
 void
 extensible_serialize (
    struct p448_t*             b,
    const struct extensible_t* a
 );

 /*
 * 
 */
 void
 isogeny_and_serialize (
    struct p448_t*                b,
    const struct tw_extensible_t* a
 );

 /*
 * Deserialize a point to an untwisted affine curve
 */
 mask_t
 affine_deserialize (
    struct affine_t*     a,
    const struct p448_t* sz
 );

 void
 set_identity_extensible (
    struct extensible_t* a
 );

 void
 set_identity_tw_extensible (
    struct tw_extensible_t* a
 );

 void
 set_identity_affine (
    struct affine_t* a
 );

 mask_t
 eq_affine (
    const struct affine_t* a,
    const struct affine_t* b
 );

 mask_t
 eq_extensible (
    const struct extensible_t* a,
    const struct extensible_t* b
 );

 mask_t
 eq_tw_extensible (
    const struct tw_extensible_t* a,
    const struct tw_extensible_t* b
 );

 void
 elligator_2s_inject (
    struct affine_t*     a,
    const struct p448_t* r
 );

 mask_t
 p448_affine_validate (
    const struct affine_t* a
 );

 /*
 * Check the invariants for struct tw_extensible_t.
 * PERF: This function was automatically generated
 * with no regard for speed.
 */
 mask_t
 p448_tw_extensible_validate (
    const struct tw_extensible_t* ext
 );


 void
 copy_affine (
    struct affine_t*       a,
    const struct affine_t* ds
 ) {
    p448_copy ( &a->x, &ds->x );
    p448_copy ( &a->y, &ds->y );
 }

 void
 copy_tw_affine (
    struct tw_affine_t*       a,
    const struct tw_affine_t* ds
 ) {
    p448_copy ( &a->x, &ds->x );
    p448_copy ( &a->y, &ds->y );
 }

 void
 copy_montgomery (
    struct montgomery_t*       a,
    const struct montgomery_t* ds
 ) {
    p448_copy ( &a->z0, &ds->z0 );
    p448_copy ( &a->xd, &ds->xd );
    p448_copy ( &a->zd, &ds->zd );
    p448_copy ( &a->xa, &ds->xa );
    p448_copy ( &a->za, &ds->za );
 }

 void
 copy_extensible (
    struct extensible_t*       a,
    const struct extensible_t* ds
 ) {
    p448_copy ( &a->x, &ds->x );
    p448_copy ( &a->y, &ds->y );
    p448_copy ( &a->z, &ds->z );
    p448_copy ( &a->t, &ds->t );
    p448_copy ( &a->u, &ds->u );
 }

 void
 copy_tw_extensible (
    struct tw_extensible_t*       a,
    const struct tw_extensible_t* ds
 ) {
    p448_copy ( &a->x, &ds->x );
    p448_copy ( &a->y, &ds->y );
    p448_copy ( &a->z, &ds->z );
    p448_copy ( &a->t, &ds->t );
    p448_copy ( &a->u, &ds->u );
 }

 void
 copy_tw_niels (
    struct tw_niels_t*       a,
    const struct tw_niels_t* ds
 ) {
    p448_copy ( &a->a, &ds->a );
    p448_copy ( &a->b, &ds->b );
    p448_copy ( &a->c, &ds->c );
 }

 void
 copy_tw_pniels (
    struct tw_pniels_t*       a,
    const struct tw_pniels_t* ds
 ) {
    copy_tw_niels( &a->n, &ds->n );
    p448_copy ( &a->z, &ds->z );
 }



 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __CC_INCLUDED_P448_EDWARDS_H__ */
--- a/exported.sym
+++ b/exported.sym
@@ -0,0 +1,3 @@
 _goldilocks_init
 _goldilocks_keygen
 _goldilocks_shared_secret
--- a/goldilocks.c
+++ b/goldilocks.c
@@ -0,0 +1,168 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #include "goldilocks.h"
 #include "ec_point.h"
 #include "scalarmul.h"
 #include "barrett_field.h"
 #include "crandom.h"

 #ifndef GOLDILOCKS_RANDOM_INIT_FILE
 #define GOLDILOCKS_RANDOM_INIT_FILE "/dev/urandom"
 #endif

 #ifndef GOLDILOCKS_RANDOM_RESEED_INTERVAL
 #define GOLDILOCKS_RANDOM_RESEED_INTERVAL 10000
 #endif

 /* We'll check it ourselves */
 #ifndef GOLDILOCKS_RANDOM_RESEEDS_MANDATORY
 #define GOLDILOCKS_RANDOM_RESEEDS_MANDATORY 0
 #endif

 /* TODO: word size; precompute */
 const struct affine_t goldilocks_base_point = {
    {{ 0xf0de840aed939full, 0xc170033f4ba0c7ull, 0xf3932d94c63d96ull, 0x9cecfa96147eaaull,
       0x5f065c3c59d070ull, 0x3a6a26adf73324ull, 0x1b4faff4609845ull, 0x297ea0ea2692ffull
    }},
    {{ 19, 0, 0, 0, 0, 0, 0, 0 }}
 };

 // FIXME: threading
 // TODO: autogen instead of init
 struct {
    struct tw_niels_t combs[80];
    struct tw_niels_t wnafs[32];
    struct crandom_state_t rand;
 } goldilocks_global;

 int
 goldilocks_init() {
    struct extensible_t ext;
    struct tw_extensible_t text;
    
    /* Sanity check: the base point is on the curve. */
    assert(p448_affine_validate(&goldilocks_base_point));
    
    /* Convert it to twisted Edwards. */
    convert_affine_to_extensible(&ext, &goldilocks_base_point);
    p448_isogeny_un_to_tw(&text, &ext);
    
    /* Precompute the tables. */
    precompute_for_combs(goldilocks_global.combs, &text, 5, 5, 18);
    precompute_for_wnaf(goldilocks_global.wnafs, &text, 5);
    
    return crandom_init_from_file(&goldilocks_global.rand,
        GOLDILOCKS_RANDOM_INIT_FILE,
        GOLDILOCKS_RANDOM_RESEED_INTERVAL,
        GOLDILOCKS_RANDOM_RESEEDS_MANDATORY);
 }

 // TODO: move to a better place
 // TODO: word size
 void
 p448_serialize(uint8_t *serial, const struct p448_t *x) {
    int i,j;
    p448_t red;
    p448_copy(&red, x);
    p448_strong_reduce(&red);
    for (i=0; i<8; i++) {
        for (j=0; j<7; j++) {
            serial[7*i+j] = red.limb[i];
            red.limb[i] >>= 8;
        }
        assert(red.limb[i] == 0);
    }
 }

 void
 q448_serialize(uint8_t *serial, const word_t x[7]) {
    int i,j;
    for (i=0; i<7; i++) {
        for (j=0; j<8; j++) {
            serial[8*i+j] = x[i]>>(8*j);
        }
    }
 }

 mask_t
 q448_deserialize(word_t x[7], const uint8_t serial[56]) {
    int i,j;
    for (i=0; i<7; i++) {
        word_t out = 0;
        for (j=0; j<8; j++) {
            out |= ((word_t)serial[8*i+j])<<(8*j);
        }
        x[i] = out;
    }
    // TODO: check for reduction
    return MASK_SUCCESS;
 }

 mask_t
 p448_deserialize(p448_t *x, const uint8_t serial[56]) {
    int i,j;
    for (i=0; i<8; i++) {
        word_t out = 0;
        for (j=0; j<7; j++) {
            out |= ((word_t)serial[7*i+j])<<(8*j);
        }
        x->limb[i] = out;
    }
    // TODO: check for reduction
    return MASK_SUCCESS;
 }

 static word_t
 q448_lo[4] = {
    0xdc873d6d54a7bb0dull,
    0xde933d8d723a70aaull,
    0x3bb124b65129c96full,
    0x000000008335dc16ull
 };

 int
 goldilocks_keygen(
    uint8_t private[56],
    uint8_t public[56]
 ) {
    // TODO: check for init.  Also maybe take CRANDOM object?  API...
    word_t sk[448*2/WORD_BITS];
    
    struct tw_extensible_t exta;
    struct p448_t pk;
    
    int ret = crandom_generate(&goldilocks_global.rand, (unsigned char *)sk, sizeof(sk));
    barrett_reduce(sk,sizeof(sk)/sizeof(sk[0]),0,q448_lo,7,4,62); // TODO word size
    q448_serialize(private, sk);
    
    edwards_comb(&exta, sk, goldilocks_global.combs, 5, 5, 18);
    isogeny_and_serialize(&pk, &exta);
    p448_serialize(public, &pk);
    
    return ret;
 }

 int
 goldilocks_shared_secret(
    uint8_t shared[56],
    const uint8_t private[56],
    const uint8_t public[56]
 ) {
    // TODO: SHA
    word_t sk[448/WORD_BITS];
    struct p448_t pk;
    
    mask_t succ = p448_deserialize(&pk,public);
    succ &= q448_deserialize(sk,private);
    succ &= p448_montgomery_ladder(&pk,&pk,sk,446,2);
    
    p448_serialize(shared,&pk);
    // TODO: hash
    
    if (succ) {
        return 0;
    } else {
        return -1;
    }
 }
--- a/goldilocks.h
+++ b/goldilocks.h
@@ -0,0 +1,34 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #ifndef __GOLDILOCKS_H__
 #define __GOLDILOCKS_H__ 1

 #include <stdint.h>

 #ifdef __cplusplus
 extern "C" {
 #endif

 int
 goldilocks_init();

 int
 goldilocks_keygen(
    uint8_t private[56],
    uint8_t public[56]
 );

 int
 goldilocks_shared_secret(
    uint8_t shared[56],
    const uint8_t private[56],
    const uint8_t public[56]
 );

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __GOLDILOCKS_H__ */
--- a/intrinsics.h
+++ b/intrinsics.h
@@ -0,0 +1,177 @@
 /* Copyright (c) 2011 Stanford University.
 * Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 /* cRandom intrinsics header. */

 #ifndef __CRANDOM_INTRINSICS_H__
 #define __CRANDOM_INTRINSICS_H__ 1

 #include <sys/types.h>

 #include <immintrin.h>

 #define INTRINSIC \
  static __inline__ __attribute__((__gnu_inline__, __always_inline__))

 #define GEN    1
 #define SSE2   2
 #define SSSE3  4
 #define AESNI  8
 #define XOP    16
 #define AVX    32
 #define AVX2   64

 INTRINSIC u_int64_t rdtsc() {
  u_int64_t out = 0;
 # if (defined(__i386__) || defined(__x86_64__))
    __asm__ __volatile__ ("rdtsc" : "=A"(out));
 # endif
  return out;
 }

 INTRINSIC u_int64_t opacify(u_int64_t x) {
  __asm__ volatile("mov %0, %0" : "+r"(x));
  return x;
 }

 #ifdef __AVX2__
 #  define MIGHT_HAVE_AVX2 1
 #  ifndef MUST_HAVE_AVX2
 #    define MUST_HAVE_AVX2 0
 #  endif
 #else
 #  define MIGHT_HAVE_AVX2 0
 #  define MUST_HAVE_AVX2  0
 #endif

 #ifdef __AVX__
 #  define MIGHT_HAVE_AVX 1
 #  ifndef MUST_HAVE_AVX
 #    define MUST_HAVE_AVX MUST_HAVE_AVX2
 #  endif
 #else
 #  define MIGHT_HAVE_AVX 0
 #  define MUST_HAVE_AVX  0
 #endif

 #ifdef __SSSE3__
 #  define MIGHT_HAVE_SSSE3 1
 #  ifndef MUST_HAVE_SSSE3
 #    define MUST_HAVE_SSSE3 MUST_HAVE_AVX
 #  endif
 #else
 #  define MIGHT_HAVE_SSSE3 0
 #  define MUST_HAVE_SSSE3 0
 #endif

 #ifdef __SSE2__
 #  define MIGHT_HAVE_SSE2 1
 #  ifndef MUST_HAVE_SSE2
 #    define MUST_HAVE_SSE2 MUST_HAVE_SSSE3
 #  endif
   typedef __m128i ssereg;
 #  define pslldq _mm_slli_epi32
 #  define pshufd _mm_shuffle_epi32

 INTRINSIC ssereg sse2_rotate(int r, ssereg a) {
  return _mm_slli_epi32(a, r) ^ _mm_srli_epi32(a, 32-r);
 }

 #else
 #  define MIGHT_HAVE_SSE2 0
 #  define MUST_HAVE_SSE2  0
 #endif

 #ifdef __AES__
 /* don't include intrinsics file, because not all platforms have it */
 #  define MIGHT_HAVE_AESNI 1
 #  ifndef MUST_HAVE_AESNI
 #    define MUST_HAVE_AESNI 0
 #  endif

 INTRINSIC ssereg aeskeygenassist(int rc, ssereg x) {
  ssereg out;
  __asm__("aeskeygenassist %2, %1, %0" : "=x"(out) : "x"(x), "g"(rc));
  return out;
 }

 INTRINSIC ssereg aesenc(ssereg subkey, ssereg block) {
  ssereg out = block;
  __asm__("aesenc %1, %0" : "+x"(out) : "x"(subkey));
  return out;
 }

 INTRINSIC ssereg aesenclast(ssereg subkey, ssereg block) {
  ssereg out = block;
  __asm__("aesenclast %1, %0" : "+x"(out) : "x"(subkey));
  return out;
 }

 #else
 #  define MIGHT_HAVE_AESNI 0
 #  define MUST_HAVE_AESNI 0
 #endif

 #ifdef __XOP__
 /* don't include intrinsics file, because not all platforms have it */
 #  define MIGHT_HAVE_XOP 1
 #  ifndef MUST_HAVE_XOP
 #    define MUST_HAVE_XOP 0
 #  endif
 INTRINSIC ssereg xop_rotate(int amount, ssereg x) {
  ssereg out;
  __asm__ ("vprotd %1, %2, %0" : "=x"(out) : "x"(x), "g"(amount));
  return out;
 }
 #else
 #  define MIGHT_HAVE_XOP 0
 #  define MUST_HAVE_XOP 0
 #endif

 #define MIGHT_MASK \
  ( SSE2  * MIGHT_HAVE_SSE2   \
  | SSSE3 * MIGHT_HAVE_SSSE3  \
  | AESNI * MIGHT_HAVE_AESNI  \
  | XOP   * MIGHT_HAVE_XOP    \
  | AVX   * MIGHT_HAVE_AVX    \
  | AVX2  * MIGHT_HAVE_AVX2)

 #define MUST_MASK \
  ( SSE2  * MUST_HAVE_SSE2   \
  | SSSE3 * MUST_HAVE_SSSE3  \
  | AESNI * MUST_HAVE_AESNI  \
  | XOP   * MUST_HAVE_XOP    \
  | AVX   * MUST_HAVE_AVX    \
  | AVX2  * MUST_HAVE_AVX2 )

 #define MIGHT_HAVE(feature) ((MIGHT_MASK & feature) == feature)
 #define MUST_HAVE(feature) ((MUST_MASK & feature) == feature)

 #ifdef __cplusplus
 #  define extern_c extern "C"
 #else
 #  define extern_c
 #endif

 extern_c
 unsigned int crandom_detect_features();

 #ifndef likely
 #  define likely(x)       __builtin_expect((x),1)
 #  define unlikely(x)     __builtin_expect((x),0)
 #endif

 extern volatile unsigned int crandom_features;
 INTRINSIC int HAVE(unsigned int feature) {
  unsigned int features;
  if (!MIGHT_HAVE(feature)) return 0;
  if (MUST_HAVE(feature))   return 1;
  features = crandom_features;
  if (unlikely(!features))
    crandom_features = features = crandom_detect_features();
  return likely((features & feature) == feature);
 }

 #endif /* __CRANDOM_INTRINSICS_H__ */
--- a/p448.c
+++ b/p448.c
@@ -0,0 +1,387 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "p448.h"
 #include "x86-64-arith.h"

 void p448_mul(p448_t *__restrict__ cs, const p448_t *as, const p448_t *bs) {
  const uint64_t *a = as->limb, *b = bs->limb;
  uint64_t *c = cs->limb;
  
    __uint128_t accum0 = 0, accum1 = 0, accum2;
  uint64_t mask = (1ull<<56) - 1;  
    
  uint64_t aa[4], bb[4];
  
  /* For some reason clang doesn't vectorize this without prompting? */
  unsigned int i;
  for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
      ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
      ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];     
  }
  /*
  for (int i=0; i<4; i++) {
    aa[i] = a[i] + a[i+4];
    bb[i] = b[i] + b[i+4];
  }
  */
  
  accum2  = widemul(&a[0],&b[3]);
  accum1  = widemul(&aa[0],&bb[3]);
  accum0  = widemul(&a[4],&b[7]);
  
  mac(&accum2, &a[1], &b[2]);
  mac(&accum1, &aa[1], &bb[2]);
  mac(&accum0, &a[5], &b[6]);
  
  mac(&accum2, &a[2], &b[1]);
  mac(&accum1, &aa[2], &bb[1]);
  mac(&accum0, &a[6], &b[5]);
  
  mac(&accum2, &a[3], &b[0]);
  mac(&accum1, &aa[3], &bb[0]);
  mac(&accum0, &a[7], &b[4]);
  
  accum1 -= accum2;
  accum0 += accum2;
  
  c[3] = ((uint64_t)(accum0)) & mask;
  c[7] = ((uint64_t)(accum1)) & mask;
  
  accum0 >>= 56;
  accum1 >>= 56;
  
  {
    accum2 = accum1;
    accum1 += accum0;
    accum0 = accum2;
  }
  
  accum2 = widemul(&a[0],&b[0]);
  accum1 -= accum2;
  accum0 += accum2;
    
  accum2  = widemul(&aa[1],&bb[3]);
  msb(&accum0, &a[1], &b[3]);
  mac(&accum1, &a[5], &b[7]);
  
  msb(&accum0, &a[2], &b[2]);
  mac(&accum2, &aa[2], &bb[2]);
  mac(&accum1, &a[6], &b[6]);
  
  msb(&accum0, &a[3], &b[1]);
  mac(&accum1, &a[7], &b[5]);
  mac(&accum2, &aa[3], &bb[1]);
  
  accum0 += accum2;
  accum1 += accum2;
  mac(&accum0, &a[4], &b[4]);
  mac(&accum1, &aa[0], &bb[0]);
  
  c[0] = ((uint64_t)(accum0)) & mask;
  c[4] = ((uint64_t)(accum1)) & mask;
  
  accum0 >>= 56;
  accum1 >>= 56;
  
  accum2  = widemul(&aa[2],&bb[3]);
  msb(&accum0, &a[2], &b[3]);
  mac(&accum1, &a[6], &b[7]);
  
  mac(&accum2, &aa[3], &bb[2]);
  msb(&accum0, &a[3], &b[2]);
  mac(&accum1, &a[7], &b[6]);
  
  accum1 += accum2;
  accum0 += accum2;
  
  accum2  = widemul(&a[0],&b[1]);
  mac(&accum1, &aa[0], &bb[1]);
  mac(&accum0, &a[4], &b[5]);
  
  mac(&accum2, &a[1], &b[0]);
  mac(&accum1, &aa[1], &bb[0]);
  mac(&accum0, &a[5], &b[4]);
  
  accum1 -= accum2;
  accum0 += accum2;
  
  c[1] = ((uint64_t)(accum0)) & mask;
  c[5] = ((uint64_t)(accum1)) & mask;
  
  accum0 >>= 56;
  accum1 >>= 56;
  
  accum2  = widemul(&aa[3],&bb[3]);
  msb(&accum0, &a[3], &b[3]);
  mac(&accum1, &a[7], &b[7]);
  
  accum1 += accum2;
  accum0 += accum2;
  
  accum2  = widemul(&a[0],&b[2]);
  mac(&accum1, &aa[0], &bb[2]);
  mac(&accum0, &a[4], &b[6]);
  
  mac(&accum2, &a[1], &b[1]);
  mac(&accum1, &aa[1], &bb[1]);
  mac(&accum0, &a[5], &b[5]);
  
  mac(&accum2, &a[2], &b[0]);
  mac(&accum1, &aa[2], &bb[0]);
  mac(&accum0, &a[6], &b[4]);
  
  accum1 -= accum2;
  accum0 += accum2;
  
  c[2] = ((uint64_t)(accum0)) & mask;
  c[6] = ((uint64_t)(accum1)) & mask;
  
  accum0 >>= 56;
  accum1 >>= 56;
  
  accum0 += c[3];
  accum1 += c[7];
  c[3] = ((uint64_t)(accum0)) & mask;
  c[7] = ((uint64_t)(accum1)) & mask;
  
  /* we could almost stop here, but it wouldn't be stable, so... */
  
  accum0 >>= 56;
  accum1 >>= 56;
  c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
  c[0] += ((uint64_t)(accum1));
 }

 void p448_mulw(p448_t *__restrict__ cs, const p448_t *as, uint64_t b) {
  const uint64_t *a = as->limb;
  uint64_t *c = cs->limb;
  
  __uint128_t accum0, accum4;
  uint64_t mask = (1ull<<56) - 1;  
  
  accum0 = widemul_rm(b, &a[0]);
  accum4 = widemul_rm(b, &a[4]);
  
  c[0] = accum0 & mask; accum0 >>= 56;
  c[4] = accum4 & mask; accum4 >>= 56;
  
  mac_rm(&accum0, b, &a[1]);
  mac_rm(&accum4, b, &a[5]);
  
  c[1] = accum0 & mask; accum0 >>= 56;
  c[5] = accum4 & mask; accum4 >>= 56;
  
  mac_rm(&accum0, b, &a[2]);
  mac_rm(&accum4, b, &a[6]);
  
  c[2] = accum0 & mask; accum0 >>= 56;
  c[6] = accum4 & mask; accum4 >>= 56;
  
  mac_rm(&accum0, b, &a[3]);
  mac_rm(&accum4, b, &a[7]);
  
  c[3] = accum0 & mask; accum0 >>= 56;
  c[7] = accum4 & mask; accum4 >>= 56;
  
  c[4] += accum0 + accum4;
  c[0] += accum4;
 }

 void p448_sqr(p448_t *__restrict__ cs, const p448_t *as) {
  const uint64_t *a = as->limb;
  uint64_t *c = cs->limb;
  
  __uint128_t accum0 = 0, accum1 = 0, accum2;
  uint64_t mask = (1ull<<56) - 1;  
    
  uint64_t aa[4];
  
  /* For some reason clang doesn't vectorize this without prompting? */
  unsigned int i;
  for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
      ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
  }
  
  accum2  = widemul(&a[0],&a[3]);
  accum1  = widemul(&aa[0],&aa[3]);
  accum0  = widemul(&a[4],&a[7]);
  
  mac(&accum2, &a[1], &a[2]);
  mac(&accum1, &aa[1], &aa[2]);
  mac(&accum0, &a[5], &a[6]);
  
  accum1 -= accum2;
  accum0 += accum2;
  
  c[3] = ((uint64_t)(accum0))<<1 & mask;
  c[7] = ((uint64_t)(accum1))<<1 & mask;
  
  accum0 >>= 55;
  accum1 >>= 55;
  
  {
    accum2 = accum1;
    accum1 += accum0;
    accum0 = accum2;
  }
  
  accum2 = widemul(&a[0],&a[0]);
  accum1 -= accum2;
  accum0 += accum2;
    
  accum2  = widemul2(&aa[1],&aa[3]);
  msb2(&accum0, &a[1], &a[3]);
  mac2(&accum1, &a[5], &a[7]);
  
  msb(&accum0, &a[2], &a[2]);
  mac(&accum2, &aa[2], &aa[2]);
  mac(&accum1, &a[6], &a[6]);
  
  accum0 += accum2;
  accum1 += accum2;
  mac(&accum0, &a[4], &a[4]);
  mac(&accum1, &aa[0], &aa[0]);
  
  c[0] = ((uint64_t)(accum0)) & mask;
  c[4] = ((uint64_t)(accum1)) & mask;
  
  accum0 >>= 56;
  accum1 >>= 56;
  
  accum2  = widemul2(&aa[2],&aa[3]);
  msb2(&accum0, &a[2], &a[3]);
  mac2(&accum1, &a[6], &a[7]);
  
  accum1 += accum2;
  accum0 += accum2;
  
  accum2  = widemul2(&a[0],&a[1]);
  mac2(&accum1, &aa[0], &aa[1]);
  mac2(&accum0, &a[4], &a[5]);
  
  accum1 -= accum2;
  accum0 += accum2;
  
  c[1] = ((uint64_t)(accum0)) & mask;
  c[5] = ((uint64_t)(accum1)) & mask;
  
  accum0 >>= 56;
  accum1 >>= 56;
  
  accum2  = widemul(&aa[3],&aa[3]);
  msb(&accum0, &a[3], &a[3]);
  mac(&accum1, &a[7], &a[7]);
  
  accum1 += accum2;
  accum0 += accum2;
  
  accum2  = widemul2(&a[0],&a[2]);
  mac2(&accum1, &aa[0], &aa[2]);
  mac2(&accum0, &a[4], &a[6]);
  
  mac(&accum2, &a[1], &a[1]);
  mac(&accum1, &aa[1], &aa[1]);
  mac(&accum0, &a[5], &a[5]);
  
  accum1 -= accum2;
  accum0 += accum2;
  
  c[2] = ((uint64_t)(accum0)) & mask;
  c[6] = ((uint64_t)(accum1)) & mask;
  
  accum0 >>= 56;
  accum1 >>= 56;
  
  accum0 += c[3];
  accum1 += c[7];
  c[3] = ((uint64_t)(accum0)) & mask;
  c[7] = ((uint64_t)(accum1)) & mask;
  
  /* we could almost stop here, but it wouldn't be stable, so... */
  
  accum0 >>= 56;
  accum1 >>= 56;
  c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
  c[0] += ((uint64_t)(accum1));
 }

 static __inline__ void p448_sqr_inplace(p448_t *x) {
    p448_t y;
    p448_sqr(&y,x);
    *x = y;
 }

 static __inline__ void p448_mul_inplace(p448_t *x, const p448_t *z) {
    p448_t y;
    p448_mul(&y,x,z);
    *x = y;
 }

 static __inline__ void p448_repunit(p448_t *x, int space, int teeth) {
  int i,j;
  p448_t working = *x;
  for (i=0; i<teeth-1; i++) {
    for (j=0; j<space-(i?0:1); j++)
      p448_sqr_inplace(&working);
    if (i==teeth-2)
      p448_mul_inplace(x,&working);
    else
      p448_mul_inplace(&working,x);
  }
 }

 void
 p448_strong_reduce(p448_t *a) {
  uint64_t mask = (1ull<<56)-1;
  
  /* first, clear high */
  a->limb[4] += a->limb[7]>>56;
  a->limb[0] += a->limb[7]>>56;
  a->limb[7] &= mask;
  
  /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */
  
  /* compute total_value - p.  No need to reduce mod p. */
  
  __int128_t scarry = 0;
  int i;
  for (i=0; i<8; i++) {
    scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask);
    a->limb[i] = scarry & mask;
    scarry >>= 56;
  }
  
  /* uncommon case: it was >= p, so now scarry = 0 and this = x
   * common case: it was < p, so now scarry = -1 and this = x - p + 2^448
   * so let's add back in p.  will carry back off the top for 2^448.
   */
  
  assert(is_zero(scarry) | is_zero(scarry+1));

  uint64_t scarry_mask = scarry & mask;
  __uint128_t carry = 0;
  
  /* add it back */
  for (i=0; i<8; i++) {
    carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask);
    a->limb[i] = carry & mask;
    carry >>= 56;
  }
  
  assert(is_zero(carry + scarry));
 }

 mask_t p448_is_zero(const struct p448_t *a) {
  struct p448_t b;
  p448_copy(&b,a);
  p448_strong_reduce(&b);

  uint64_t any = 0;
  int i;
  for (i=0; i<8; i++) {
    any |= b.limb[i];
  }
  return is_zero(any);
 }
--- a/p448.h
+++ b/p448.h
@@ -0,0 +1,242 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __P448_H__
 #define __P448_H__ 1

 #include <stdint.h>
 #include <assert.h>

 #include "word.h"

 typedef struct p448_t {
  uint64_t limb[8];
 } __attribute__((aligned(32))) p448_t;

 #ifdef __cplusplus
 extern "C" {
 #endif

 static __inline__ void
 p448_set_ui(p448_t *out,
            uint64_t x)
    __attribute__((unused,always_inline));
           
 static __inline__ void
 p448_cond_swap(p448_t *a,
               p448_t *b,
               mask_t do_swap)
    __attribute__((unused,always_inline));

 static __inline__ void
 p448_add(p448_t *out,
         const p448_t *a,
         const p448_t *b)
    __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_sub(p448_t *out,
         const p448_t *a,
         const p448_t *b)
    __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_neg(p448_t *out,
         const p448_t *a)
    __attribute__((unused,always_inline));
            
 static __inline__ void
 p448_cond_neg(p448_t *a,
              mask_t doNegate)
   __attribute__((unused,always_inline));

 static __inline__ void
 p448_addw(p448_t *a,
          uint64_t x)
    __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_subw(p448_t *a,
          uint64_t x)
    __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_copy(p448_t *out, const p448_t *a)
    __attribute__((unused,always_inline));
             
 static __inline__ void
 p448_weak_reduce(p448_t *inout)
    __attribute__((unused,always_inline));
             
 void
 p448_strong_reduce(p448_t *inout);

 mask_t
 p448_is_zero(const p448_t *in);
             
 static __inline__ void
 p448_bias(p448_t *inout, int amount)
    __attribute__((unused,always_inline));
         
 void
 p448_mul(p448_t *__restrict__ out,
         const p448_t *a,
         const p448_t *b);

 void
 p448_mulw(p448_t *__restrict__ out,
          const p448_t *a,
          uint64_t b);

 void
 p448_sqr(p448_t *__restrict__ out,
         const p448_t *a);
         
 static __inline__ void
 p448_sqrn(p448_t *__restrict__ y, const p448_t *x, int n)
    __attribute__((unused,always_inline));

 void
 p448_set_ui(p448_t *out,
            uint64_t x) {
    int i;
    out->limb[0] = x;
    for (i=1; i<8; i++) {
      out->limb[i] = 0;
    }
 }
            
 void
 p448_cond_swap(p448_t *a, p448_t *b, mask_t doswap) {
  big_register_t *aa = (big_register_t*)a;
  big_register_t *bb = (big_register_t*)b;
  big_register_t m = doswap;
  
  unsigned int i;
  for (i=0; i<sizeof(*a)/sizeof(*aa); i++) {
      big_register_t x = m & (aa[i]^bb[i]);
      aa[i] ^= x;
      bb[i] ^= x;
  }
 }

 void
 p448_add(p448_t *out, const p448_t *a, const p448_t *b) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)b)[i];
    }
    /*
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
        out->limb[i] = a->limb[i] + b->limb[i];
    }
    */
 }

 void
 p448_sub(p448_t *out, const p448_t *a, const p448_t *b) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] - ((const uint64xn_t*)b)[i];
    }
    /*
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
        out->limb[i] = a->limb[i] - b->limb[i];
    }
    */
 }

 void
 p448_neg(p448_t *out, const p448_t *a) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)out)[i] = -((const uint64xn_t*)a)[i];
    }
    /*
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
        out->limb[i] = -a->limb[i];
    }
    */
 }

 void
 p448_cond_neg(
    p448_t *a,
    mask_t doNegate
 ) {
    unsigned int i;
    struct p448_t negated;
    big_register_t *aa = (big_register_t *)a;
    big_register_t *nn = (big_register_t*)&negated;
    big_register_t m = doNegate;
    
    p448_neg(&negated, a);
    p448_bias(&negated, 2);
    
    for (i=0; i<sizeof(*a)/sizeof(*aa); i++) {
        aa[i] = (aa[i] & ~m) | (nn[i] & m);
    }
 }

 void
 p448_addw(p448_t *a, uint64_t x) {
  a->limb[0] += x;
 }
             
 void
 p448_subw(p448_t *a, uint64_t x) {
  a->limb[0] -= x;
 }

 void
 p448_copy(p448_t *out, const p448_t *a) {
  *out = *a;
 }

 void
 p448_bias(p448_t *a, int amt) {
    uint64_t co1 = ((1ull<<56)-1)*amt, co2 = co1-amt;
    uint64x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1};
    uint64x4_t *aa = (uint64x4_t*) a;
    aa[0] += lo;
    aa[1] += hi;
 }

 void
 p448_weak_reduce(p448_t *a) {
    /* TODO: use pshufb/palignr if anyone cares about speed of this */
    uint64_t mask = (1ull<<56) - 1;
    uint64_t tmp = a->limb[7] >> 56;
    int i;
    a->limb[4] += tmp;
    for (i=7; i>0; i--) {
        a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>56);
    }
    a->limb[0] = (a->limb[0] & mask) + tmp;
 }

 void p448_sqrn(p448_t *__restrict__ y, const p448_t *x, int n) {
    p448_t tmp;
    assert(n>0);
    if (n&1) {
        p448_sqr(y,x);
        n--;
    } else {
        p448_sqr(&tmp,x);
        p448_sqr(y,&tmp);
        n-=2;
    }
    for (; n; n-=2) {
        p448_sqr(&tmp,y);
        p448_sqr(y,&tmp);
    }
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __P448_H__ */
--- a/scalarmul.c
+++ b/scalarmul.c
@@ -0,0 +1,728 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #include <stdlib.h>

 #include "scalarmul.h"
 #include "string.h"
 #include "barrett_field.h"

 mask_t
 p448_montgomery_ladder(
    struct p448_t *out,
    const struct p448_t *in,
    const uint64_t *scalar,
    int nbits,
    int n_extra_doubles
 ) {
    struct montgomery_t mont;
    p448_sqr(&mont.z0,in);
    p448_copy(&mont.za,&mont.z0);
    p448_set_ui(&mont.xa,1);
    p448_set_ui(&mont.zd,0);
    p448_set_ui(&mont.xd,1);
    
    int i,j,n=(nbits-1)&63;
    mask_t pflip = 0;
    for (j=(nbits+63)/64-1; j>=0; j--) {
        uint64_t w = scalar[j];
        for (i=n; i>=0; i--) {
            mask_t flip = -((w>>i)&1);
            p448_cond_swap(&mont.xa,&mont.xd,flip^pflip);
            p448_cond_swap(&mont.za,&mont.zd,flip^pflip);
            p448_montgomery_step(&mont);
            pflip = flip;
        }
        n = 63;
    }
    p448_cond_swap(&mont.xa,&mont.xd,pflip);
    p448_cond_swap(&mont.za,&mont.zd,pflip);
    
    for (j=0; j<n_extra_doubles; j++) {
        p448_montgomery_step(&mont);
    }
    
    struct p448_t sign;
    p448_montgomery_serialize(&sign, out, &mont, in);
    
    p448_addw(&sign,1);
    return ~p448_is_zero(&sign);
 }

 static __inline__ void
 niels_cond_negate(
    struct tw_niels_t *n,
    mask_t doNegate
 ) {
    p448_cond_swap(&n->a, &n->b, doNegate);
    p448_cond_neg(&n->c, doNegate); /* TODO: bias amt? */
 }

 static __inline__ void
 pniels_cond_negate(
    struct tw_pniels_t *n,
    mask_t doNegate
 ) {
    niels_cond_negate(&n->n, doNegate);
 }

 void    
 constant_time_lookup_pniels(
    struct tw_pniels_t *out,
    const struct tw_pniels_t *in,
    int nin,
    int idx
 ) {
    big_register_t big_one = 1, big_i = idx;
    big_register_t *o = (big_register_t *)out;
    const big_register_t *i = (const big_register_t *)in;
    int j;
    unsigned int k;
    
    memset(out, 0, sizeof(*out));
    for (j=0; j<nin; j++, big_i-=big_one) {
        big_register_t mask = br_is_zero(big_i);
        for (k=0; k<sizeof(*out)/sizeof(*o); k++) {
            o[k] |= mask & i[k+j*sizeof(*out)/sizeof(*o)];
        }
    }
 }

 static __inline__ void    
 constant_time_lookup_niels(
    struct tw_niels_t *out,
    const struct tw_niels_t *in,
    int nin,
    int idx
 ) {
    big_register_t big_one = 1, big_i = idx;
    big_register_t *o = (big_register_t *)out;
    const big_register_t *i = (const big_register_t *)in;
    int j;
    unsigned int k;
    
    memset(out, 0, sizeof(*out));
    for (j=0; j<nin; j++, big_i-=big_one) {
        big_register_t mask = br_is_zero(big_i);
        for (k=0; k<sizeof(*out)/sizeof(*o); k++) {
            o[k] |= mask & i[k+j*sizeof(*out)/sizeof(*o)];
        }
    }
 }

 static void
 convert_to_signed_window_form(
    word_t *out,
    const word_t *scalar,
    const word_t *prepared_data,
    int nwords
 ) {
    mask_t mask = -(scalar[0]&1);

    word_t carry = add_nr_ext_packed(out, scalar, nwords, prepared_data, nwords, ~mask);
    carry += add_nr_ext_packed(out, out, nwords, prepared_data+nwords, nwords, mask);
    
    assert(!(out[0]&1));
    
    int i;
    for (i=0; i<nwords; i++) {
        out[i] >>= 1;
        if (i<nwords-1) {
            out[i] |= out[i+1]<<(WORD_BITS-1);
        } else {
            out[i] |= carry<<(WORD_BITS-1);
        }
    }
 }

 void
 edwards_scalar_multiply(
    struct tw_extensible_t *working,
    const uint64_t scalar[7]
 ) {

    const int nbits=448; /* HACK? */
    word_t prepared_data[14] = {
        0x9595b847fdf73126ull,
        0x9bb9b8a856af5200ull,
        0xb3136e22f37d5c4full,
        0x0000000189a19442ull,
        0x0000000000000000ull,
        0x0000000000000000ull,
        0x4000000000000000ull,

        0x721cf5b5529eec33ull,
        0x7a4cf635c8e9c2abull,
        0xeec492d944a725bfull,
        0x000000020cd77058ull,
        0x0000000000000000ull,
        0x0000000000000000ull,
        0x0000000000000000ull
    }; /* TODO: split off */
    
    uint64_t scalar2[7];
    convert_to_signed_window_form(scalar2,scalar,prepared_data,7);

    struct tw_extensible_t tabulator;
    copy_tw_extensible(&tabulator, working);
    p448_tw_extensible_double(&tabulator);

    struct tw_pniels_t pn, multiples[8];
    convert_tw_extensible_to_tw_pniels(&pn, &tabulator);
    convert_tw_extensible_to_tw_pniels(&multiples[0], working);

    int i;
    for (i=1; i<8; i++) {
        p448_tw_extensible_add_pniels(working, &pn);
        convert_tw_extensible_to_tw_pniels(&multiples[i], working);
    }

    i = nbits - 4;
    int bits = scalar2[i/64] >> (i%64) & 0xF,
        inv = (bits>>3)-1;
    bits ^= inv;
    
    constant_time_lookup_pniels(&pn, multiples, 8, bits&7);
    pniels_cond_negate(&pn, inv);
    convert_tw_pniels_to_tw_extensible(working, &pn);
 		

    for (i-=4; i>=0; i-=4) {
        p448_tw_extensible_double(working);
        p448_tw_extensible_double(working);
        p448_tw_extensible_double(working);
        p448_tw_extensible_double(working);

        bits = scalar2[i/64] >> (i%64) & 0xF;
        inv = (bits>>3)-1;
        bits ^= inv;
    
        constant_time_lookup_pniels(&pn, multiples, 8, bits&7);
        pniels_cond_negate(&pn, inv);
        p448_tw_extensible_add_pniels(working, &pn);
    }
 }


 void
 edwards_comb(
    struct tw_extensible_t *working,
    const word_t scalar[7],
    const struct tw_niels_t *table,
    int n,
    int t,
    int s
 ) {
    word_t prepared_data[14] = {
        0xebec9967f5d3f5c2ull,
        0x0aa09b49b16c9a02ull,
        0x7f6126aec172cd8eull,
        0x00000007b027e54dull,
        0x0000000000000000ull,
        0x0000000000000000ull,
        0x4000000000000000ull,

        0xc873d6d54a7bb0cfull,
        0xe933d8d723a70aadull,
        0xbb124b65129c96fdull,
        0x00000008335dc163ull,
        0x0000000000000000ull,
        0x0000000000000000ull,
        0x0000000000000000ull
    }; /* TODO: split off.  Above is for 450 bits */
    
    word_t scalar2[7];
    convert_to_signed_window_form(scalar2,scalar,prepared_data,7);
    
    /* const int n=3, t=5, s=30; */
    int i,j,k;
    
    struct tw_niels_t ni;
    
    for (i=0; i<s; i++) {
        if (i) p448_tw_extensible_double(working);
        
        for (j=0; j<n; j++) {
            int tab = 0;
 			
 			/*
             * PERF: This computation takes about 1.5µs on SBR, i.e. 2-3% of the
 			 * time of a keygen or sign op.  Surely it is possible to speed it up.
             */
            for (k=0; k<t; k++) {
                int bit = (s-1-i) + k*s + j*(s*t);
                if (bit < 7*WORD_BITS) {
                    tab |= (scalar2[bit/WORD_BITS] >> (bit%WORD_BITS) & 1) << k;
                }
            }
            
            mask_t invert = (tab>>(t-1))-1;
            tab ^= invert;
            tab &= (1<<(t-1)) - 1;
            
            constant_time_lookup_niels(&ni, table + (j<<(t-1)), 1<<(t-1), tab);
            niels_cond_negate(&ni, invert);
            if (i||j) {
                p448_tw_extensible_add_niels(working, &ni);
            } else {
                convert_tw_niels_to_tw_extensible(working, &ni);
            }
        }
    }
 }

 void
 simultaneous_invert_p448(
    struct p448_t *out,
    const struct p448_t *in,
    int n
 ) {
  if (!n) return;
  
  p448_copy(&out[1], &in[0]);
  int i;
  for (i=1; i<n-1; i++) {
      p448_mul(&out[i+1], &out[i], &in[i]);
  }
  p448_mul(&out[0], &out[n-1], &in[n-1]);
  
  struct p448_t tmp;
  p448_inverse(&tmp, &out[0]);
  p448_copy(&out[0], &tmp);
  
  /* at this point, out[0] = product(in[i]) ^ -1
   * out[i] = product(in[0]..in[i-1]) if i != 0
   */
  for (i=n-1; i>0; i--) {
      p448_mul(&tmp, &out[i], &out[0]);
      p448_copy(&out[i], &tmp);
      
      p448_mul(&tmp, &out[0], &in[i]);
      p448_copy(&out[0], &tmp);
  }
 }

 mask_t
 precompute_for_combs(
  struct tw_niels_t *out,
  const struct tw_extensible_t *const_base,
  int n,
  int t,
  int s
 ) {
    if (s < 1) return 0;
  
    struct tw_extensible_t working, start;
    copy_tw_extensible(&working, const_base);
    struct tw_pniels_t pn_tmp;
  
    struct tw_pniels_t *doubles = (struct tw_pniels_t *) malloc(sizeof(*doubles) * (t-1));
    struct p448_t *zs  = (struct p448_t *) malloc(sizeof(*zs) * (n<<(t-1)));
    struct p448_t *zis = (struct p448_t *) malloc(sizeof(*zis) * (n<<(t-1)));
  
    if (!doubles || !zs || !zis) {
        free(doubles);
        free(zs);
        free(zis);
        return 0;
    }
  
    int i,j,k;
    for (i=0; i<n; i++) {

        /* doubling phase */
        for (j=0; j<t; j++) {
            if (j) {
                convert_tw_extensible_to_tw_pniels(&pn_tmp, &working);
                p448_tw_extensible_add_pniels(&start, &pn_tmp);
            } else {
                copy_tw_extensible(&start, &working);
            }

            if (j==t-1 && i==n-1) {
                break;
            }

            p448_tw_extensible_double(&working);
            if (j<t-1) {
                convert_tw_extensible_to_tw_pniels(&doubles[j], &working);
            }

            for (k=0; k<s-1; k++) {
                p448_tw_extensible_double(&working);
            }
        }

        /* Gray-code phase */
        for (j=0;; j++) {
            int gray = j ^ (j>>1);
            int idx = ((i+1)<<(t-1))-1 ^ gray;

            convert_tw_extensible_to_tw_pniels(&pn_tmp, &start);
            copy_tw_niels(&out[idx], &pn_tmp.n);
            p448_copy(&zs[idx], &pn_tmp.z);
 			
            if (j >= (1<<(t-1)) - 1) break;
            int delta = (j+1) ^ ((j+1)>>1) ^ gray;

            for (k=0; delta>1; k++)
                delta >>=1;
            
            if (gray & (1<<k)) {
                /* start += doubles[k] */
                p448_tw_extensible_add_pniels(&start, &doubles[k]);
            } else {
                /* start -= doubles[k] */
                /* PERF: uncond negate */
                copy_tw_pniels(&pn_tmp, &doubles[k]);
                pniels_cond_negate(&pn_tmp, -1);
                p448_tw_extensible_add_pniels(&start, &pn_tmp);
            }
            
            
        }
    }
 	
    simultaneous_invert_p448(zis, zs, n<<(t-1));

    p448_t product;
    for (i=0; i<n<<(t-1); i++) {
        p448_mul(&product, &out[i].a, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].a, &product);
        
        p448_mul(&product, &out[i].b, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].b, &product);
        
        p448_mul(&product, &out[i].c, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].c, &product);
    }
 	
 	mask_t ret = ~p448_is_zero(&zis[0]);

    free(doubles);
    free(zs);
    free(zis);

    return ret;
 }

 mask_t
 precompute_for_wnaf(
    struct tw_niels_t *out,
    const struct tw_extensible_t *const_base,
    int tbits
 ) {
    int i;
    struct p448_t *zs  = (struct p448_t *) malloc(sizeof(*zs)<<tbits);
    struct p448_t *zis = (struct p448_t *) malloc(sizeof(*zis)<<tbits);

    if (!zs || !zis) {
        free(zs);
        free(zis);
        return 0;
    }

    struct tw_extensible_t base;
    copy_tw_extensible(&base,const_base);
    
    struct tw_pniels_t twop, tmp;
    
    convert_tw_extensible_to_tw_pniels(&tmp, &base);
    p448_copy(&zs[0], &tmp.z);
    copy_tw_niels(&out[0], &tmp.n);

    if (tbits > 0) {
        p448_tw_extensible_double(&base);
        convert_tw_extensible_to_tw_pniels(&twop, &base);
        p448_tw_extensible_add_pniels(&base, &tmp);
        
        convert_tw_extensible_to_tw_pniels(&tmp, &base);
        p448_copy(&zs[1], &tmp.z);
        copy_tw_niels(&out[1], &tmp.n);

        for (i=2; i < 1<<tbits; i++) {
            p448_tw_extensible_add_pniels(&base, &twop);
            convert_tw_extensible_to_tw_pniels(&tmp, &base);
            p448_copy(&zs[i], &tmp.z);
            copy_tw_niels(&out[i], &tmp.n);
        }
    }
    
    simultaneous_invert_p448(zis, zs, 1<<tbits);

    p448_t product;
    for (i=0; i<1<<tbits; i++) {
        p448_mul(&product, &out[i].a, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].a, &product);
        
        p448_mul(&product, &out[i].b, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].b, &product);
        
        p448_mul(&product, &out[i].c, &zis[i]);
        p448_strong_reduce(&product);
        p448_copy(&out[i].c, &product);
    }

    free(zs);
    free(zis);

    return -1;
 }

 struct smvt_control {
  int power, addend;
 };

 static int
 recode_wnaf(
    struct smvt_control *control, /* [nbits/(tableBits+1) + 3] */
    const word_t *scalar,
    int nbits,
    int tableBits)
 {
    int current = 0, position=0, i;

    /* PERF: negate scalar if it's large
     * PERF: this is a pretty simplistic algorithm.  I'm sure there's a faster one...
     */
    for (i=nbits-1; i >= -2 - tableBits; i--) {
        int bit = (i >= 0)
            ? (scalar[i/WORD_BITS] >> (i%WORD_BITS)) & 1
            : 0;

        current = 2*current + bit;

        /*
         * Sizing: |current| >= 2^(tableBits+1) -> |current| = 2^0
         * So current loses (tableBits+1) bits every time.  It otherwise gains
         * 1 bit per iteration.  The number of iterations is
         * (nbits + 2 + tableBits), and an additional control word is added at
         * the end.  So the total number of control words is at most
         * ceil((nbits+1) / (tableBits+1)) + 2 = floor((nbits)/(tableBits+1)) + 2.
         * There's also the stopper with power -1, for a total of +3.
         */
        if (current >= (2<<tableBits) || current <= -1 - (2<<tableBits)) {
            int delta = (current + 1) >> 1;
            current = -(current & 1);

            int j;
            for (j=i; (delta & 1) == 0; j++) {
                delta >>= 1;
            }
            control[position].power = j+1;
            control[position].addend = delta;
            position++;
            assert(position <= nbits/(tableBits+1) + 2);
        }
    }
  
    control[position].power = -1;
    control[position].addend = 0;
    return position;
 }


 static void
 prepare_wnaf_table(
    struct tw_pniels_t *output,
    struct tw_extensible_t *working,
    int tbits
 ) {
    convert_tw_extensible_to_tw_pniels(&output[0], working);

    if (tbits == 0) return;

    p448_tw_extensible_double(working);
    struct tw_pniels_t twop;
    convert_tw_extensible_to_tw_pniels(&twop, working);

    p448_tw_extensible_add_pniels(working, &output[0]);
    convert_tw_extensible_to_tw_pniels(&output[1], working);

    for (int i=2; i < 1<<tbits; i++) {
        p448_tw_extensible_add_pniels(working, &twop);
        convert_tw_extensible_to_tw_pniels(&output[i], working);
    }
 }

 int
 edwards_scalar_multiply_vt(
    struct tw_extensible_t *working,
    const uint64_t scalar[7]
 ) {
    /* HACK: not 448? */
    const int nbits=448, table_bits = 3;
    struct smvt_control control[nbits/(table_bits+1)+3];
    
    int control_bits = recode_wnaf(control, scalar, nbits, table_bits);
  
    struct tw_pniels_t precmp[1<<table_bits];
    prepare_wnaf_table(precmp, working, table_bits);
  
    if (control_bits > 0) {
        assert(control[0].addend > 0);
        assert(control[0].power >= 0);
        convert_tw_pniels_to_tw_extensible(working, &precmp[control[0].addend >> 1]);
    } else {
        set_identity_tw_extensible(working);
        return control_bits;
    }
  
    int conti = 1, i;
  
    struct tw_pniels_t neg;
    for (i = control[0].power - 1; i >= 0; i--) {
        p448_tw_extensible_double(working);

        if (i == control[conti].power) {
            assert(control[conti].addend);

            if (control[conti].addend > 0) {
                p448_tw_extensible_add_pniels(working, &precmp[control[conti].addend >> 1]);
            } else {
                /* PERF: uncond negate */
                copy_tw_pniels(&neg, &precmp[(-control[conti].addend) >> 1]);
                pniels_cond_negate(&neg, -1);
                p448_tw_extensible_add_pniels(working, &neg);
            }
            conti++;
            assert(conti <= control_bits);
        }
    }
    return control_bits; /* TODO: don't return anything, this is just for testing */
 }

 void
 edwards_scalar_multiply_vt_pre(
    struct tw_extensible_t *working,
    const uint64_t scalar[7],
    const struct tw_niels_t *precmp,
    int table_bits
 ) {
    /* HACK: not 448? */
    const int nbits=448;
    struct smvt_control control[nbits/(table_bits+1)+3];
    
    int control_bits = recode_wnaf(control, scalar, nbits, table_bits);
  
    if (control_bits > 0) {
        assert(control[0].addend > 0);
        assert(control[0].power >= 0);
        convert_tw_niels_to_tw_extensible(working, &precmp[control[0].addend >> 1]);
    } else {
        set_identity_tw_extensible(working);
        return;
    }
  
    int conti = 1, i;
  
    struct tw_niels_t neg;
    for (i = control[0].power - 1; i >= 0; i--) {
        p448_tw_extensible_double(working);

        if (i == control[conti].power) {
            assert(control[conti].addend);

            if (control[conti].addend > 0) {
                p448_tw_extensible_add_niels(working, &precmp[control[conti].addend >> 1]);
            } else {
                /* PERF: uncond negate */
                copy_tw_niels(&neg, &precmp[(-control[conti].addend) >> 1]);
                niels_cond_negate(&neg, -1);
                p448_tw_extensible_add_niels(working, &neg);
            }
            conti++;
            assert(conti <= control_bits);
        }
    }
 }

 int
 edwards_combo_var_fixed_vt(
    struct tw_extensible_t *working,
    const uint64_t scalar_var[7],
    const uint64_t scalar_pre[7],
    const struct tw_niels_t *precmp,
    int table_bits_pre
 ) {
    /* HACK: not 448? */
    const int nbits_var=448, nbits_pre=448, table_bits_var = 3;
    struct smvt_control control_var[nbits_var/(table_bits_var+1)+3];
    struct smvt_control control_pre[nbits_pre/(table_bits_pre+1)+3];
    
    int ncb_var = recode_wnaf(control_var, scalar_var, nbits_var, table_bits_var);
    int ncb_pre = recode_wnaf(control_pre, scalar_pre, nbits_pre, table_bits_pre);
    (void)ncb_var;
    (void)ncb_pre;
  
    struct tw_pniels_t precmp_var[1<<table_bits_var];
    prepare_wnaf_table(precmp_var, working, table_bits_var);
  
    int contp=0, contv=0, i;
  
    i = control_var[0].power;
    if (i > control_pre[0].power) {
        convert_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[0].addend >> 1]);
        contv++;
    } else if (i == control_pre[0].power && i >=0 ) {
        convert_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[0].addend >> 1]);
        p448_tw_extensible_add_niels(working, &precmp[control_pre[0].addend >> 1]);
        contv++; contp++;
    } else {
        i = control_pre[0].power;
        convert_tw_niels_to_tw_extensible(working, &precmp[control_pre[0].addend >> 1]);
        contp++;
    }
    
    if (i < 0) {
        set_identity_tw_extensible(working);
        return ncb_pre;
    }
    
    struct tw_pniels_t pneg;
    struct tw_niels_t neg;
    for (i--; i >= 0; i--) {
        p448_tw_extensible_double(working);

        if (i == control_var[contv].power) {
            assert(control_var[contv].addend);

            if (control_var[contv].addend > 0) {
                p448_tw_extensible_add_pniels(working, &precmp_var[control_var[contv].addend >> 1]);
            } else {
                /* PERF: uncond negate */
                copy_tw_pniels(&pneg, &precmp_var[(-control_var[contv].addend) >> 1]);
                pniels_cond_negate(&pneg, -1);
                p448_tw_extensible_add_pniels(working, &pneg);
            }
            contv++;
        }

        if (i == control_pre[contp].power) {
            assert(control_pre[contp].addend);

            if (control_pre[contp].addend > 0) {
                p448_tw_extensible_add_niels(working, &precmp[control_pre[contp].addend >> 1]);
            } else {
                /* PERF: uncond negate */
                copy_tw_niels(&neg, &precmp[(-control_pre[contp].addend) >> 1]);
                niels_cond_negate(&neg, -1);
                p448_tw_extensible_add_niels(working, &neg);
            }
            contp++;
        }
    }
    
    assert(contv == ncb_var);
    assert(contp == ncb_pre);
    
    return ncb_pre;
 }



--- a/scalarmul.h
+++ b/scalarmul.h
@@ -0,0 +1,112 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __P448_ALGO_H__
 #define __P448_ALGO_H__ 1

 #include "ec_point.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 /*
 * Out = scalar * in, encoded in inverse square root
 * format.
 *
 * nbits is the number of bits in scalar.
 *
 * The scalar is to be presented in little-endian form,
 * meaning that scalar[0] contains the least significant
 * word of the scalar.
 * 
 * If the point "in" is on the curve, the return
 * value will be set (to -1).
 *
 * If the point "in" is not on the curve, then the
 * output will be incorrect.  If the scalar is even,
 * this condition will be detected by returning 0,
 * unless the output is the identity point (0; TODO).
 * If the scalar is odd, the value returned will be
 * set (to -1; TODO).
 *
 * The input and output points are always even.
 * Therefore on a cofactor-4 curve like Goldilocks,
 * it is sufficient for security to make the scalar
 * even.  (TODO: detect when i/o has cofactor?)
 *
 * This function takes constant time, depending on
 * nbits but not on in or scalar.
 */
 mask_t
 p448_montgomery_ladder(
    struct p448_t *out,
    const struct p448_t *in,
    const uint64_t *scalar,
    int nbits,
    int n_extra_doubles
 );

 void
 edwards_scalar_multiply(
    struct tw_extensible_t *working,
    const uint64_t scalar[7]
    /* TODO? int nbits */
 );
    
 mask_t
 precompute_for_combs(
  struct tw_niels_t *out,
  const struct tw_extensible_t *const_base,
  int n,
  int t,
  int s
 );
    
 void
 edwards_comb(
    struct tw_extensible_t *working,
    const word_t scalar[7],
    const struct tw_niels_t *table,
    int n,
    int t,
    int s
 );

 /* TODO: void.  int is just for diagnostic purposes. */
 int
 edwards_scalar_multiply_vt(
    struct tw_extensible_t *working,
    const uint64_t scalar[7]
 );
    
 void
 edwards_scalar_multiply_vt_pre(
    struct tw_extensible_t *working,
    const uint64_t scalar[7],
    const struct tw_niels_t *precmp,
    int table_bits
 );

 mask_t
 precompute_for_wnaf(
    struct tw_niels_t *out,
    const struct tw_extensible_t *const_base,
    int tbits
 ); /* TODO: attr don't ignore... */

 /* TODO: void.  int is just for diagnostic purposes. */
 int
 edwards_combo_var_fixed_vt(
    struct tw_extensible_t *working,
    const uint64_t scalar_var[7],
    const uint64_t scalar_pre[7],
    const struct tw_niels_t *precmp,
    int table_bits_pre
 );

 #ifdef __cplusplus
 };
 #endif

 #endif /* __P448_ALGO_H__ */
--- a/word.h
+++ b/word.h
@@ -0,0 +1,55 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #ifndef __WORD_H__
 #define __WORD_H__

 #include <stdint.h>

 typedef uint64_t word_t;
 typedef __uint128_t dword_t;
 typedef int64_t sword_t;
 typedef __int128_t dsword_t;

 static const int WORD_BITS = sizeof(word_t) * 8;

 /* TODO: vector width for procs like ARM; gcc support */
 typedef uint64_t mask_t, vecmask_t __attribute__((ext_vector_type(4)));

 static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -1;

 /* FIXME this only works on clang */
 typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
 typedef int64_t  int64x2_t __attribute__((ext_vector_type(2)));
 typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
 typedef int64_t  int64x4_t __attribute__((ext_vector_type(4)));
 typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
 typedef int32_t  int32x4_t __attribute__((ext_vector_type(4)));
 typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
 typedef int32_t  int32x8_t __attribute__((ext_vector_type(8)));

 #if __AVX2__
 typedef uint32x8_t big_register_t;
 typedef uint64x4_t uint64xn_t;
 #elif __SSE2__ || __ARM_NEON__
 typedef uint32x4_t big_register_t;
 typedef uint64x2_t uint64xn_t;
 #elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__
 typedef uint64_t big_register_t, uint64xn_t;
 #else
 typedef uint64_t uint64xn_t;
 typedef uint32_t big_register_t;
 #endif


 #if __AVX2__ || __SSE2__ || __ARM_NEON__
 static __inline__ big_register_t
 br_is_zero(big_register_t x) {
    return (big_register_t)(x == (big_register_t)0);
 }
 #else
 #error "Todo: constant-time equality on vectorless platforms"
 #endif

 #endif /* __WORD_H__ */
--- a/x86-64-arith.h
+++ b/x86-64-arith.h
@@ -0,0 +1,246 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #ifndef __X86_64_ARITH_H__
 #define __X86_64_ARITH_H__

 #include <stdint.h>

 /* TODO: non x86-64 versions of these.
 * TODO: autogenerate
 */

 static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
  #ifndef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rax;"
       "mulq %[b];"
       : [c]"=a"(c), [d]"=d"(d)
       : [b]"m"(*b), [a]"m"(*a)
       : "cc");
  return (((__uint128_t)(d))<<64) | c;
  #else
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx;"
       "mulx %[b], %[c], %[d];"
       : [c]"=r"(c), [d]"=r"(d)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx");
  return (((__uint128_t)(d))<<64) | c;
  #endif
 }

 static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
  #ifndef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rax;"
       "mulq %[b];"
       : [c]"=a"(c), [d]"=d"(d)
       : [b]"m"(*b), [a]"r"(a)
       : "cc");
  return (((__uint128_t)(d))<<64) | c;
  #else
  uint64_t c,d;
  __asm__ volatile
      ("mulx %[b], %[c], %[d];"
       : [c]"=r"(c), [d]"=r"(d)
       : [b]"m"(*b), [a]"d"(a));
  return (((__uint128_t)(d))<<64) | c;
  #endif
 }

 static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
  #ifndef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rax; "
       "addq %%rax, %%rax; "
       "mulq %[b];"
       : [c]"=a"(c), [d]"=d"(d)
       : [b]"m"(*b), [a]"m"(*a)
       : "cc");
  return (((__uint128_t)(d))<<64) | c;
  #else
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx;"
       "leaq (,%%rdx,2), %%rdx;"
       "mulx %[b], %[c], %[d];"
       : [c]"=r"(c), [d]"=r"(d)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx");
  return (((__uint128_t)(d))<<64) | c;
  #endif
 }

 static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "addq %[c], %[lo]; "
       "adcq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "mulq %[b]; "
       "addq %%rax, %[lo]; "
       "adcq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("mulx %[b], %[c], %[d]; "
       "addq %[c], %[lo]; "
       "adcq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"d"(a)
       : "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "mulq %[b]; "
       "addq %%rax, %[lo]; "
       "adcq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"r"(a)
       : "rax", "rdx", "cc");
  #endif
  
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "addq %%rdx, %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "addq %[c], %[lo]; "
       "adcq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "addq %%rax, %%rax; "
       "mulq %[b]; "
       "addq %%rax, %[lo]; "
       "adcq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "subq %[c], %[lo]; "
       "sbbq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "mulq %[b]; "
       "subq %%rax, %[lo]; "
       "sbbq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "addq %%rdx, %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "subq %[c], %[lo]; "
       "sbbq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "addq %%rax, %%rax; "
       "mulq %[b]; "
       "subq %%rax, %[lo]; "
       "sbbq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  *acc = (((__uint128_t)(hi))<<64) | lo;
  
 }

 static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t c,d, lo = *acc, hi = *acc>>64;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "subq %[lo], %[c]; "
       "sbbq %[hi], %[d]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  *acc = (((__uint128_t)(d))<<64) | c;
 }

 static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) {
  return ((__uint128_t)(a)) * b;
 }

 static __inline__ __int128_t widemuls(int64_t a, int64_t b) {
  return ((__int128_t)(a)) * b;
 }
 
 static __inline__ uint64_t opacify(uint64_t x) {
  __asm__ volatile("" : "+r"(x));
  return x;
 }

 static __inline__ mask_t is_zero(uint64_t x) {
  __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
  return ~x;
 }

 #endif /* __X86_64_ARITH_H__ */