Big changes for curve flexibility. For details see HISTORY.txt.

Very experimental Ed480-Ridinghood support is now in. It's not fully optimized, but in general the current build is 8-15% slower than Goldilocks. It only works on arch_x86_64, though arch_ref64 support ought to be easy. Support on other arches will be trickier, which is of course why I chose Goldilocks over Ridinghood in the first place. Next up, E-521. Hopefully. The code is starting to get spread out over a lot of files. Some are per field*arch, some per field, some per curve, some global. It's hard to do much about this, though, with a rather ugly .c.inc system. There's currently no way to make a Ridinghood eBAT. In fact, I haven't tested eBAT support in this commit. I also haven't tested NEON, but at least ARCH_32 works on Intel.
10 years ago · 1f480b0f95
--- a/HISTORY.txt
+++ b/HISTORY.txt
@@ -1,3 +1,44 @@
 October 23, 2014:
    Pushing through changes for curve flexibility.  First up is
    Ed480-Ridinghood, because it has the same number of words.  Next
    is E-521.
    
    Experimental support for Ed480-Ridinghood.  To use, compile with
        make ... FIELD=p480 -XCFLAGS=-DGOLDI_FIELD_BITS=480
    
    I still need to figure out what to do about the fact that the library
    is called "goldilocks", but it will soon support curves that are not
    ed448-goldilocks, at least experimentally.
        
    Currently the whole system's header "goldilocks.h" doesn't have
    a simpler way to override field size, but it does work (as a hack)
    with -DGOLDI_FIELD_BITS=...
    
    There is no support yet for coexistence of multiple fields in one
    library.  The field routines will have unique names, but scalarmul*
    won't, and the top-level goldilocks routines have fixed names.
    
    Current timings on Haswell:
        Goldilocks: 178kcy keygen, 536kcy ecdh
        Ridinghood: 193kcy keygen, 617kcy ecdh
    
    Note that Ridinghood ECDH does worse than 480/448.  This is at least
    in part because I haven't calculated the overflow handling limits yet
    in ec_point.h (this is a disadvantage of dropping the automated
    tool for generating that file).  So I'm reducing much more often
    than I need to.  (There's a really loud TODO in ec_point.h for that.)
    
    Also, I haven't tested the limits on these reductions in a while, so
    it could be that there are actual (security-critical) bugs in this
    area, at least for p448.  Now that there's field flexibility, it's
    probably a good idea to make a field impl with extra words to check
    this.
    
    Furthermore, field_mulw_scc will perform differently on these two
    curves based on whether the curve constant is positive or negative.
    I should probably go optimize the "hot" routines like montgomery_step
    to have separate cases for positive and negative.

 September 29, 2014:
    Yesterday I put in some more architecture detection, but it should
    really be based on the arch directory, because what's in there really
--- a/+ 9
+++ b/+ 9
@@ -20,12 +20,13 @@ else
 ARCH ?= arch_arm_32
 endif

 FIELD ?= p448

 WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
 	 -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)
 	 
 	 
 INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH)
 INCFLAGS = -Isrc/include -Iinclude -Isrc/$(FIELD) -Isrc/$(FIELD)/$(ARCH)
 LANGFLAGS = -std=c99 -fno-strict-aliasing
 GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC
 OFLAGS = -O3
@@ -63,7 +64,8 @@ ASFLAGS = $(ARCHFLAGS)
 HEADERS= Makefile $(shell find . -name "*.h") build/timestamp

 LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \
  build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o build/magic.o build/arithmetic.o
  build/$(FIELD).o build/ec_point.o build/scalarmul.o build/sha512.o build/magic.o \
 	build/f_arithmetic.o build/arithmetic.o

 TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \
 	build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o build/magic.o
@@ -113,7 +115,10 @@ build/%.s: src/%.c $(HEADERS)
 build/%.s: test/%.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $<

 build/%.s: src/$(ARCH)/%.c $(HEADERS)
 build/%.s: src/$(FIELD)/$(ARCH)/%.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $<

 build/%.s: src/$(FIELD)/%.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $<

 doc/timestamp:
@@ -131,7 +136,7 @@ $(BATNAME): include/* src/* src/*/* test/batarch.map
          targ="$@/crypto_$$prim/ed448goldilocks"; \
 	  (while read arch where; do \
 	    mkdir -p $$targ/`basename $$arch`; \
 	    cp include/*.h src/*.c src/include/*.h src/bat/$$prim.c src/$$where/*.c src/$$where/*.h $$targ/`basename $$arch`; \
 	    cp include/*.h src/*.c src/include/*.h src/bat/$$prim.c src/p448/$$where/*.c src/p448/$$where/*.h src/p448/*.c src/p448/*.h $$targ/`basename $$arch`; \
 	    cp src/bat/api_$$prim.h $$targ/`basename $$arch`/api.h; \
 	    perl -p -i -e 's/.*endif.*GOLDILOCKS_CONFIG_H/#define SUPERCOP_WONT_LET_ME_OPEN_FILES 1\n\n$$&/' $$targ/`basename $$arch`/config.h; \
 	    perl -p -i -e 's/SYSNAME/'`basename $(BATNAME)`_`basename $$arch`'/g' $$targ/`basename $$arch`/api.h;  \
--- a/include/goldilocks.h
+++ b/include/goldilocks.h
@@ -22,14 +22,18 @@
 #define GOLDI_IMPLEMENT_SIGNATURES 1
 #endif

 /** The size of the Goldilocks field, in bits. */
 /** The size of the Goldilocks field, in bits. 
 * Ifdef'd so you can override when testing experimental Ed480-Ridinghood or E-521.
 */
 #ifndef GOLDI_FIELD_BITS
 #define GOLDI_FIELD_BITS          448
 #endif

 /** The size of the Goldilocks scalars, in bits. */
 #define GOLDI_SCALAR_BITS         446
 #define GOLDI_SCALAR_BITS         (GOLDI_FIELD_BITS-2)

 /** The same size, in bytes. */
 #define GOLDI_FIELD_BYTES         (GOLDI_FIELD_BITS/8)
 #define GOLDI_FIELD_BYTES         ((GOLDI_FIELD_BITS+7)/8)

 /** The size of a Goldilocks public key, in bytes. */
 #define GOLDI_PUBLIC_KEY_BYTES    GOLDI_FIELD_BYTES
--- a/include/ridinghood.h
+++ b/include/ridinghood.h
@@ -0,0 +1,376 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 /**
 * @file goldilocks.h
 * @author Mike Hamburg
 * @brief Goldilocks high-level functions.
 */
 #ifndef __GOLDILOCKS_H__
 #define __GOLDILOCKS_H__ 1

 #include <stdint.h>

 #ifndef GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
 /** If nonzero, implement precomputation for verify and ECDH. */
 #define GOLDI_IMPLEMENT_PRECOMPUTED_KEYS 1
 #endif

 #ifndef GOLDI_IMPLEMENT_SIGNATURES
 /** If nonzero, implement signatures. */
 #define GOLDI_IMPLEMENT_SIGNATURES 1
 #endif

 /** The size of the Goldilocks field, in bits. */
 #define GOLDI_FIELD_BITS          448

 /** The size of the Goldilocks scalars, in bits. */
 #define GOLDI_SCALAR_BITS         446

 /** The same size, in bytes. */
 #define GOLDI_FIELD_BYTES         (GOLDI_FIELD_BITS/8)

 /** The size of a Goldilocks public key, in bytes. */
 #define GOLDI_PUBLIC_KEY_BYTES    GOLDI_FIELD_BYTES

 /** The extra bytes in a Goldilocks private key for the symmetric key. */
 #define GOLDI_SYMKEY_BYTES        32

 /** The size of a shared secret. */
 #define GOLDI_SHARED_SECRET_BYTES 64

 /** The size of a Goldilocks private key, in bytes. */
 #define GOLDI_PRIVATE_KEY_BYTES   (2*GOLDI_FIELD_BYTES + GOLDI_SYMKEY_BYTES)

 /** The size of a Goldilocks signature, in bytes. */
 #define GOLDI_SIGNATURE_BYTES     (2*GOLDI_FIELD_BYTES)

 /**
 * @brief Serialized form of a Goldilocks public key.
 *
 * @warning This isn't even my final form!
 */
 struct goldilocks_public_key_t {
    uint8_t opaque[GOLDI_PUBLIC_KEY_BYTES]; /**< Serialized data. */
 };

 /**
 * @brief Serialized form of a Goldilocks private key.
 *
 * Contains 56 bytes of actual private key, 56 bytes of
 * public key, and 32 bytes of symmetric key for randomization.
 *
 * @warning This isn't even my final form!
 */
 struct goldilocks_private_key_t {
    uint8_t opaque[GOLDI_PRIVATE_KEY_BYTES]; /**< Serialized data. */
 };

 #ifdef __cplusplus
 extern "C" {
 #endif

 /*
  * Status codes returned by the goldilocks_* entry points.  Zero means
  * success; every failure code is a distinct nonzero value in the 448xx
  * range, except where noted below.
  */

 /** @brief No error. */
 static const int GOLDI_EOK      = 0;

 /** @brief Error: your key or other state is corrupt. */
 static const int GOLDI_ECORRUPT = 44801;

 /** @brief Error: other party's key is corrupt. */
 static const int GOLDI_EINVAL   = 44802;

 /** @brief Error: not enough entropy. */
 static const int GOLDI_ENODICE  = 44804;

 /** @brief Error: you need to initialize the library first. */
 static const int GOLDI_EUNINIT  = 44805;

 /**
  * @brief Error: called init() but we are already initialized.
  *
  * NOTE(review): this has the same numeric value as GOLDI_EUNINIT (44805),
  * so a caller cannot tell the two conditions apart by comparing return
  * codes.  Confirm whether a distinct value was intended before relying on it.
  */
 static const int GOLDI_EALREADYINIT  = 44805;

 /**
 * @brief Initialize Goldilocks' precomputed tables and
 * random number generator.  This function must be called before
 * any of the other Goldilocks routines (except
 * goldilocks_shared_secret in the current version) and should be
 * called only once per process.
 *
 * There is currently no way to tear down this state.  It is possible
 * that a future version of this library will not require this function.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_EALREADYINIT Already initialized.
 * @retval GOLDI_ECORRUPT Memory is corrupted, or another thread is already init'ing.
 * @retval Nonzero An error occurred.
 */
 int
 goldilocks_init (void)
 __attribute__((warn_unused_result,visibility ("default")));


 /**
 * @brief Generate a new random keypair.
 * @param [out] privkey The generated private key.
 * @param [out] pubkey The generated public key.
 *
 * @warning This isn't even my final form!
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_ENODICE Insufficient entropy.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 */
 int
 goldilocks_keygen (
    struct goldilocks_private_key_t *privkey,
    struct goldilocks_public_key_t *pubkey
 ) __attribute__((warn_unused_result,nonnull(1,2),visibility ("default")));

 /**
 * @brief Derive a key from its compressed form.
 * @param [out] privkey The derived private key.
 * @param [in] proto The compressed or proto-key, which must be 32 random bytes.
 *
 * @warning This isn't even my final form!
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 */
 int
 goldilocks_derive_private_key (
    struct goldilocks_private_key_t *privkey,
    const unsigned char proto[GOLDI_SYMKEY_BYTES]
 ) __attribute__((nonnull(1,2),visibility ("default")));

 /**
 * @brief Compress a private key (by copying out the proto-key)
 * @param [out] proto The proto-key.
 * @param [in] privkey The private key.
 *
 * @warning This isn't even my final form!
 * @todo test.
 *
 * This function returns void; it does not report a status code.
 */
 void
 goldilocks_underive_private_key (
    unsigned char proto[GOLDI_SYMKEY_BYTES],
    const struct goldilocks_private_key_t *privkey
 ) __attribute__((nonnull(1,2),visibility ("default")));

 /**
 * @brief Extract the public key from a private key.
 *
 * This is essentially a memcpy from the public part of the privkey.
 *    
 * @param [out] pubkey The extracted public key.
 * @param [in] privkey The private key.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_ECORRUPT The private key is corrupt.
 */
 int
 goldilocks_private_to_public (
    struct goldilocks_public_key_t *pubkey,
    const struct goldilocks_private_key_t *privkey
 ) __attribute__((nonnull(1,2),visibility ("default")));

 /**
 * @brief Generate a Diffie-Hellman shared secret in constant time.
 *
 * This function uses some compile-time flags whose merit remains to
 * be decided.
 *
 * If the flag EXPERIMENT_ECDH_OBLITERATE_CT is set, prepend 40 bytes
 * of zeros to the secret before hashing.  In the case that the other
 * party's key is detectably corrupt, instead the symmetric part
 * of the secret key is used to produce a pseudorandom value.
 *
 * If EXPERIMENT_ECDH_STIR_IN_PUBKEYS is set, the sum and product of
 * the two parties' public keys is prepended to the hash.
 *
 * In the current version, this function can safely be run even without
 * goldilocks_init().  But this property is not guaranteed for future
 * versions, so call it anyway.
 *
 * @warning This isn't even my final form!
 *
 * @param [out] shared The shared secret established with the other party.
 * @param [in] my_privkey My private key.
 * @param [in] your_pubkey The other party's public key.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_ECORRUPT My key is corrupt.
 * @retval GOLDI_EINVAL   The other party's key is corrupt.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 */
 int
 goldilocks_shared_secret (
    uint8_t shared[GOLDI_SHARED_SECRET_BYTES],
    const struct goldilocks_private_key_t *my_privkey,
    const struct goldilocks_public_key_t *your_pubkey
 ) __attribute__((warn_unused_result,nonnull(1,2,3),visibility ("default")));

 #if GOLDI_IMPLEMENT_SIGNATURES
 /**
 * @brief Sign a message.
 *
 * The signature is deterministic, using the symmetric secret found in the
 * secret key to form a nonce.
 *
 * The technique used in signing is a modified Schnorr system, like EdDSA.
 *
 * @warning This isn't even my final form!
 *
 * @param [out] signature_out Space for the output signature.
 * @param [in] message The message to be signed.
 * @param [in] message_len The length of the message to be signed.
 * @param [in] privkey My private key.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_ECORRUPT My key is corrupt.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 */
 int
 goldilocks_sign (
    uint8_t signature_out[GOLDI_SIGNATURE_BYTES],
    const uint8_t *message,
    uint64_t message_len,
    const struct goldilocks_private_key_t *privkey
 ) __attribute__((nonnull(1,2,4),visibility ("default")));

 /**
 * @brief Verify a signature.
 *
 * This function is fairly strict.  It will correctly detect when
 * the signature has the wrong cofactor component, or when the sig
 * values aren't less than p or q.
 * 
 * Currently this function does not detect when the public key is weird,
 * eg 0, has cofactor, etc.  As a result, a party with a bogus public
 * key could create signatures that succeed on some systems and fail on
 * others.
 *
 * @warning This isn't even my final form!
 *
 * @param [in] signature The signature.
 * @param [in] message The message to be verified.
 * @param [in] message_len The length of the message to be verified.
 * @param [in] pubkey The signer's public key.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_EINVAL The public key or signature is corrupt.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 */
 int
 goldilocks_verify (
    const uint8_t signature[GOLDI_SIGNATURE_BYTES],
    const uint8_t *message,
    uint64_t message_len,
    const struct goldilocks_public_key_t *pubkey
 ) __attribute__((warn_unused_result,nonnull(1,2,4),visibility ("default")));
 #endif

 #if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS

 /** A public key which has been expanded by precomputation for higher speed. */
 struct goldilocks_precomputed_public_key_t;

 /**
 * @brief Expand a public key by precomputation.
 *
 * @todo Give actual error returns, instead of ambiguous NULL.
 *
 * @warning This isn't even my final form!
 *
 * @param [in] pub The public key.
 * @retval NULL We ran out of memory, or another error occurred (the cause is not reported; see the todo above).
 */
 struct goldilocks_precomputed_public_key_t *
 goldilocks_precompute_public_key (
    const struct goldilocks_public_key_t *pub
 ) __attribute__((warn_unused_result,nonnull(1),visibility ("default")));

 /**
 * @brief Overwrite an expanded public key with zeros, then destroy it.
 *
 * If the input is NULL, this function does nothing.
 *
 * @param [in] precom The public key.
 */
 void
 goldilocks_destroy_precomputed_public_key (
    struct goldilocks_precomputed_public_key_t *precom
 ) __attribute__((visibility ("default")));

 /**
 * @brief Verify a signature.
 *
 * This function is fairly strict.  It will correctly detect when
 * the signature has the wrong cofactor component, or when the sig
 * values aren't less than p or q.
 *
 * @warning This isn't even my final form!
 *
 * @param [in] signature The signature.
 * @param [in] message The message to be verified.
 * @param [in] message_len The length of the message to be verified.
 * @param [in] pubkey The signer's public key, expanded by precomputation.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_EINVAL The public key or signature is corrupt.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 */
 int
 goldilocks_verify_precomputed (
   const uint8_t signature[GOLDI_SIGNATURE_BYTES],
   const uint8_t *message,
   uint64_t message_len,
   const struct goldilocks_precomputed_public_key_t *pubkey
 ) __attribute__((warn_unused_result,nonnull(1,2,4),visibility ("default")));
   
 /**
 * @brief Generate a Diffie-Hellman shared secret in constant time.
 * Uses a precomputation on the other party's public key for efficiency.
 *
 * This function uses some compile-time flags whose merit remains to
 * be decided.
 *
 * If the flag EXPERIMENT_ECDH_OBLITERATE_CT is set, prepend 40 bytes
 * of zeros to the secret before hashing.  In the case that the other
 * party's key is detectably corrupt, instead the symmetric part
 * of the secret key is used to produce a pseudorandom value.
 *
 * If EXPERIMENT_ECDH_STIR_IN_PUBKEYS is set, the sum and product of
 * the two parties' public keys is prepended to the hash.
 *
 * In the current version, this function can safely be run even without
 * goldilocks_init().  But this property is not guaranteed for future
 * versions, so call it anyway.
 *
 * @warning This isn't even my final form!
 *
 * @param [out] shared The shared secret established with the other party.
 * @param [in] my_privkey My private key.
 * @param [in] your_pubkey The other party's precomputed public key.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_ECORRUPT My key is corrupt.
 * @retval GOLDI_EINVAL   The other party's key is corrupt.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 */
 int
 goldilocks_shared_secret_precomputed (
   uint8_t shared[GOLDI_SHARED_SECRET_BYTES],
   const struct goldilocks_private_key_t *my_privkey,
   const struct goldilocks_precomputed_public_key_t *your_pubkey
 ) __attribute__((warn_unused_result,nonnull(1,2,3),visibility ("default")));

 #endif /* GOLDI_IMPLEMENT_PRECOMPUTED_KEYS */

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __GOLDILOCKS_H__ */
--- a/src/ec_point.c
+++ b/src/ec_point.c
@@ -12,7 +12,8 @@
 #include "ec_point.h"
 #include "magic.h"

 #define is32 (GOLDI_BITS == 32)
 #define is32 (GOLDI_BITS == 32 || FIELD_BITS == 480)
 /* TODO XXX PERF FIXME: better detection of overflow conditions */

 /* I wanted to just use if (is32)
 * But clang's -Wunreachable-code flags it.
@@ -52,60 +53,6 @@ field_mulw_scc_wr (
        field_weak_reduce(out);
 }

 /*
  * Repeated squaring: y = x^(2^n), i.e. x squared n times.
  * n must be positive (asserted).  The squarings ping-pong between y and a
  * local temporary, so the loop always consumes two squarings per pass.
  */
 static __inline__ void
 field_sqrn (
    field_t *__restrict__ y,
    const field_t *x,
    int n
 ) {
    field_t scratch;
    assert(n>0);

    /* Peel off one squaring (odd n) or two (even n) so the result lands
     * in y and the remaining count is even. */
    if ((n & 1) == 0) {
        field_sqr(&scratch, x);
        field_sqr(y, &scratch);
        n -= 2;
    } else {
        field_sqr(y, x);
        n -= 1;
    }

    /* Remaining squarings, two at a time: y -> scratch -> y. */
    while (n != 0) {
        field_sqr(&scratch, y);
        field_sqr(y, &scratch);
        n -= 2;
    }
 }

 /*
  * Raise x to the fixed exponent 2^446 - 2^222 - 1 with a hand-written
  * addition chain of squarings and multiplications, and store the result
  * in a.  (Presumably this is (p-3)/4 for p = 2^448 - 2^224 - 1, which
  * would make the result an inverse square root of x up to sign -- the
  * modulus is not visible here, so confirm against the field definition.)
  *
  * a is written only by the final multiplication, and that multiplication
  * does not read x.  L0..L2 are scratch values; the per-line comments give
  * the power of x each one holds after that step.
  */
 void 
 field_isr ( /* TODO: MAGIC */
    struct field_t*       a,
    const struct field_t* x
 ) {
    struct field_t L0, L1, L2;
    field_sqr  (   &L1,     x );            /* L1 = x^2 */
    field_mul  (   &L2,     x,   &L1 );     /* L2 = x^3 */
    field_sqr  (   &L1,   &L2 );            /* L1 = x^6 */
    field_mul  (   &L2,     x,   &L1 );     /* L2 = x^(2^3 - 1) */
    field_sqrn (   &L1,   &L2,     3 );     /* L1 = x^(2^6 - 2^3) */
    field_mul  (   &L0,   &L2,   &L1 );     /* L0 = x^(2^6 - 1) */
    field_sqrn (   &L1,   &L0,     3 );     /* L1 = x^(2^9 - 2^3) */
    field_mul  (   &L0,   &L2,   &L1 );     /* L0 = x^(2^9 - 1) */
    field_sqrn (   &L2,   &L0,     9 );     /* L2 = x^(2^18 - 2^9) */
    field_mul  (   &L1,   &L0,   &L2 );     /* L1 = x^(2^18 - 1) */
    field_sqr  (   &L0,   &L1 );            /* L0 = x^(2^19 - 2) */
    field_mul  (   &L2,     x,   &L0 );     /* L2 = x^(2^19 - 1) */
    field_sqrn (   &L0,   &L2,    18 );     /* L0 = x^(2^37 - 2^18) */
    field_mul  (   &L2,   &L1,   &L0 );     /* L2 = x^(2^37 - 1) */
    field_sqrn (   &L0,   &L2,    37 );     /* L0 = x^(2^74 - 2^37) */
    field_mul  (   &L1,   &L2,   &L0 );     /* L1 = x^(2^74 - 1) */
    field_sqrn (   &L0,   &L1,    37 );     /* L0 = x^(2^111 - 2^37) */
    field_mul  (   &L1,   &L2,   &L0 );     /* L1 = x^(2^111 - 1) */
    field_sqrn (   &L0,   &L1,   111 );     /* L0 = x^(2^222 - 2^111) */
    field_mul  (   &L2,   &L1,   &L0 );     /* L2 = x^(2^222 - 1) */
    field_sqr  (   &L0,   &L2 );            /* L0 = x^(2^223 - 2) */
    field_mul  (   &L1,     x,   &L0 );     /* L1 = x^(2^223 - 1) */
    field_sqrn (   &L0,   &L1,   223 );     /* L0 = x^(2^446 - 2^223) */
    field_mul  (     a,   &L2,   &L0 );     /* a  = x^(2^446 - 2^222 - 1) */
 }

 void
 add_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  d,
@@ -396,7 +343,7 @@ montgomery_step (
    field_sqr  ( &a->za, &a->zd );
    field_sqr  ( &a->xd,   &L0 );
    field_sqr  (   &L0,   &L1 );
    field_mulw ( &a->zd, &a->xd, 1-EDWARDS_D );
    field_mulw_scc ( &a->zd, &a->xd, 1-EDWARDS_D ); /* FIXME PERF MULW */
    field_sub  (   &L1, &a->xd,   &L0 );
    field_bias (   &L1,     2 );
    IF32( field_weak_reduce(   &L1 ) );
@@ -444,11 +391,9 @@ serialize_montgomery (
    field_mul  (   &L3,   &L1,   &L2 );
    field_copy (   &L2, &a->z0 );
    field_addw (   &L2,     1 );
    field_sqr  (   &L1,   &L2 );
    field_mulw (   &L2,   &L1, 1-EDWARDS_D );
    field_neg  (   &L1,   &L2 );
    field_sqr  (   &L0,   &L2 );
    field_mulw_scc_wr (   &L1,   &L0, EDWARDS_D-1 );
    field_add  (   &L2, &a->z0, &a->z0 );
    field_bias (   &L2,     1 );
    field_add  (   &L0,   &L2,   &L2 );
    field_add  (   &L2,   &L0,   &L1 );
    IF32( field_weak_reduce(   &L2 ) );
@@ -512,13 +457,9 @@ untwist_and_double_and_serialize (
    IF32( field_weak_reduce(     b ) );
    field_sqr  (   &L2, &a->z );
    field_sqr  (   &L1,   &L2 );
    field_add  (   &L2,     b,     b );
    field_mulw (     b,   &L2, 1-EDWARDS_D );
    field_neg  (   &L2,     b );
    field_bias (   &L2,     2 );
    field_mulw (   &L0,   &L2, 1-EDWARDS_D );
    field_neg  (     b,   &L0 );
    field_bias (     b,     2 );
    field_add  (   b,     b,     b );
    field_mulw_scc (     &L2,   b, EDWARDS_D-1 );
    field_mulw_scc (   b,   &L2, EDWARDS_D-1 );
    field_mul  (   &L0,   &L2,   &L1 );
    field_mul  (   &L2,     b,   &L0 );
    field_isr  (   &L0,   &L2 );
@@ -654,10 +595,8 @@ deserialize_affine (
    field_copy (   &L3,   &L1 );
    field_addw (   &L3,     1 );
    field_sqr  (   &L2,   &L3 );
    field_mulw (   &L3,   &L2, 1-EDWARDS_D );
    field_neg  ( &a->x,   &L3 );
    field_add  (   &L3,   &L1,   &L1 );
    field_bias (   &L3,     1 );
    field_mulw_scc (   &a->x,   &L2, EDWARDS_D-1 ); /* PERF MULW */
    field_add  (   &L3,   &L1,   &L1 ); /* FIXME: i adjusted the bias here, was it right? */
    field_add  ( &a->y,   &L3,   &L3 );
    field_add  (   &L3, &a->y, &a->x );
    IF32( field_weak_reduce(   &L3 ) );
@@ -694,11 +633,9 @@ deserialize_and_twist_approx (
    field_sqr  ( &a->z,    sz );
    field_copy ( &a->y, &a->z );
    field_addw ( &a->y,     1 );
    field_sqr  ( &a->x, &a->y );
    field_mulw ( &a->y, &a->x, 1-EDWARDS_D );
    field_neg  ( &a->x, &a->y );
    field_sqr  ( &L0, &a->y );
    field_mulw_scc ( &a->x, &L0, EDWARDS_D-1 );
    field_add  ( &a->y, &a->z, &a->z );
    field_bias ( &a->y,     1 );
    field_add  ( &a->u, &a->y, &a->y );
    field_add  ( &a->y, &a->u, &a->x );
    IF32( field_weak_reduce( &a->y ) );
--- a/src/include/ec_point.h
+++ b/src/include/ec_point.h
@@ -543,8 +543,6 @@ copy_tw_pniels (
    field_copy ( &a->z, &ds->z );
 }



 #ifdef __cplusplus
 }; /* extern "C" */
 #endif
--- a/src/include/field.h
+++ b/src/include/field.h
@@ -1,40 +1,16 @@
 /**
 * @file field.h
 * @brief Field switch code.
 * @brief Generic field header.
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 */

 #ifndef __FIELD_H__
 #define __FIELD_H__

 #include <string.h>
 #include "constant_time.h"

 #include "p448.h"
 #define FIELD_BITS           448
 #define field_t              p448_t
 #define field_mul            p448_mul
 #define field_sqr            p448_sqr
 #define field_add            p448_add
 #define field_sub            p448_sub
 #define field_mulw           p448_mulw
 #define field_addw           p448_addw
 #define field_subw           p448_subw
 #define field_neg            p448_neg
 #define field_set_ui         p448_set_ui
 #define field_bias           p448_bias
 #define field_cond_neg       p448_cond_neg
 #define field_inverse        p448_inverse
 #define field_eq             p448_eq
 #define field_isr            p448_isr
 #define field_simultaneous_invert p448_simultaneous_invert
 #define field_weak_reduce    p448_weak_reduce
 #define field_strong_reduce  p448_strong_reduce
 #define field_serialize      p448_serialize
 #define field_deserialize    p448_deserialize
 #define field_is_zero        p448_is_zero
 #include "f_field.h"

 /** @brief Bytes in a field element */
 #define FIELD_BYTES          (1+(FIELD_BITS-1)/8)
@@ -42,6 +18,22 @@
 /**
  * @brief Words in a field element
  *
  * NOTE(review): FIELD_BITS is a bit count but sizeof(word_t) is a byte
  * count, so this divides bits by bytes and yields roughly 8x the number
  * of machine words needed to hold FIELD_BITS bits.  Compare
  * GOLDI_FIELD_WORDS, which divides by WORD_BITS.  Confirm the intended
  * meaning before using this macro.
  */
 #define FIELD_WORDS          (1+(FIELD_BITS-1)/sizeof(word_t))

 /* TODO: standardize notation */
 /** @brief The number of words in the Goldilocks field. */
 #define GOLDI_FIELD_WORDS DIV_CEIL(FIELD_BITS,WORD_BITS)

 /** @brief The number of bits in the Goldilocks curve's cofactor (cofactor=4). */
 #define COFACTOR_BITS 2

 /** @brief The number of bits in a Goldilocks scalar. */
 #define SCALAR_BITS (FIELD_BITS - COFACTOR_BITS)

 /** @brief The number of bytes in a Goldilocks scalar. */
 #define SCALAR_BYTES (1+(SCALAR_BITS)/8)

 /** @brief The number of words in a Goldilocks scalar. */
 #define SCALAR_WORDS WORDS_FOR_BITS(SCALAR_BITS)

 /**
 * @brief For GMP tests: little-endian representation of the field modulus.
 */
@@ -119,5 +111,31 @@ field_eq (
    const struct field_t *a,
    const struct field_t *b
 );
    
 /**
  * Repeated squaring: set y = x^(2^n), i.e. square x a total of n times.
  *
  * n must be positive (asserted).  Squarings alternate between y and a
  * local temporary, so the loop always performs two squarings per pass.
  */
 static __inline__ void
 __attribute__((unused,always_inline))
 field_sqrn (
    field_t *__restrict__ y,
    const field_t *x,
    int n
 ) {
    field_t scratch;
    assert(n>0);

    /* Peel off one squaring (odd n) or two (even n) so the result lands
     * in y and the remaining count is even. */
    if ((n & 1) == 0) {
        field_sqr(&scratch, x);
        field_sqr(y, &scratch);
        n -= 2;
    } else {
        field_sqr(y, x);
        n -= 1;
    }

    /* Remaining squarings, two at a time: y -> scratch -> y. */
    while (n != 0) {
        field_sqr(&scratch, y);
        field_sqr(y, &scratch);
        n -= 2;
    }
 }

 #endif /* __FIELD_H__ */
 #endif // __FIELD_H__
--- a/src/include/magic.h
+++ b/src/include/magic.h
@@ -4,16 +4,24 @@
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @brief Goldilocks magic numbers (group orders, coefficients, algo params etc).
 * @brief Curve-independent declarations of magic numbers.
 */


 #ifndef __GOLDI_MAGIC_H__
 #define __GOLDI_MAGIC_H__ 1

 #include "word.h"
 #include "p448.h"
 #include "ec_point.h"

 /**
 * @brief If true, use wider tables for the precomputed combs.
 */
 #ifndef USE_BIG_COMBS
 #if defined(__ARM_NEON__)
 #define USE_BIG_COMBS 1
 #else
 #define USE_BIG_COMBS (WORD_BITS==64)
 #endif
 #endif

 /* TODO: standardize notation */

@@ -32,16 +40,13 @@
 /** @brief The number of words in a Goldilocks scalar. */
 #define SCALAR_WORDS WORDS_FOR_BITS(SCALAR_BITS)

 #include "f_magic.h"

 /**
 * @brief sqrt(d-1), used for point formats and twisting.
 */
 extern const struct field_t sqrt_d_minus_1;

 /**
 * @brief The Edwards "d" term for this curve.
 */
 static const int64_t EDWARDS_D = -39081;

 /**
 * @brief The base point for Goldilocks.
 */
@@ -76,34 +81,10 @@ extern const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS];
 */
 #define SCALARMUL_WNAF_COMBO_TABLE_BITS 4

 /**
 * @brief If true, use wider tables for the precomputed combs.
 */
 #ifndef USE_BIG_COMBS
 #if defined(__ARM_NEON__)
 #define USE_BIG_COMBS 1
 #else
 #define USE_BIG_COMBS (WORD_BITS==64)
 #endif
 #endif

 /** @brief The number of combs to use for signed comb algo */
 #define COMB_N (USE_BIG_COMBS ? 5  : 8)

 /** @brief The number of teeth of the combs for signed comb algo */
 #define COMB_T (USE_BIG_COMBS ? 5  : 4)

 /** @brief The spacing of the combs for signed comb algo */
 #define COMB_S (USE_BIG_COMBS ? 18 : 14)

 /**
 * @brief The bit width of the precomputed WNAF tables.  Size is 2^this elements.
 */
 #define WNAF_PRECMP_BITS 5

 /**
 * @brief crandom magic structure guard constant = "return 4", cf xkcd #221
 */
 #define CRANDOM_MAGIC 0x72657475726e2034ull

 #endif /* __GOLDI_MAGIC_H__ */
--- a/src/include/word.h
+++ b/src/include/word.h
@@ -37,9 +37,12 @@ typedef int64_t sword_t;
 typedef __int128_t dsword_t;
 #define PRIxWORD PRIx64
 #define PRIxWORDfull "%016" PRIx64
 #define PRIxWORD58   "%014" PRIx64
 #define PRIxWORD56   "%014" PRIx64
 /* Zero-padded 15-hex-digit (60-bit) format.  <inttypes.h> has no PRIx60,
  * so use the 64-bit conversion like the sibling PRIxWORD* macros. */
 #define PRIxWORD60   "%015" PRIx64
 #define U64LE(x) x##ull
 #define U58LE(x) x##ull
 #define U56LE(x) x##ull
 #define U60LE(x) x##ull
 #define letohWORD letoh64
 #define GOLDI_BITS 64
 #else
@@ -51,9 +54,11 @@ typedef int32_t sword_t;
 typedef int64_t dsword_t;
 #define PRIxWORD PRIx32
 #define PRIxWORDfull "%08" PRIx32
 #define PRIxWORD58   "%07" PRIx32
 #define PRIxWORD56   "%07" PRIx32
 #define U64LE(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
 #define U58LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
 #define U58LE(x) (x##ull)&((1ull<<29)-1), (x##ull)>>29
 #define U56LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
 #define U60LE(x) (x##ull)&((1ull<<30)-1), (x##ull)>>30
 #define letohWORD letoh32
 #define GOLDI_BITS 32
 #endif
--- a/src/p448/arch_32/arch_config.h
+++ b/src/p448/arch_32/arch_config.h
--- a/src/p448/arch_32/p448.c
+++ b/src/p448/arch_32/p448.c
--- a/src/p448/arch_32/p448.h
+++ b/src/p448/arch_32/p448.h
--- a/src/p448/arch_arm_32/arch_config.h
+++ b/src/p448/arch_arm_32/arch_config.h
--- a/src/p448/arch_arm_32/p448.c
+++ b/src/p448/arch_arm_32/p448.c
--- a/src/p448/arch_arm_32/p448.h
+++ b/src/p448/arch_arm_32/p448.h
--- a/src/p448/arch_neon/arch_config.h
+++ b/src/p448/arch_neon/arch_config.h
--- a/src/p448/arch_neon/neon_emulation.h
+++ b/src/p448/arch_neon/neon_emulation.h
--- a/src/p448/arch_neon/p448.c
+++ b/src/p448/arch_neon/p448.c
--- a/src/p448/arch_neon/p448.h
+++ b/src/p448/arch_neon/p448.h
--- a/src/p448/arch_neon_experimental/arch_config.h
+++ b/src/p448/arch_neon_experimental/arch_config.h
--- a/src/p448/arch_neon_experimental/p448.c
+++ b/src/p448/arch_neon_experimental/p448.c
--- a/src/p448/arch_neon_experimental/p448.h
+++ b/src/p448/arch_neon_experimental/p448.h
--- a/src/p448/arch_ref64/arch_config.h
+++ b/src/p448/arch_ref64/arch_config.h
--- a/src/p448/arch_ref64/p448.c
+++ b/src/p448/arch_ref64/p448.c
--- a/src/p448/arch_ref64/p448.h
+++ b/src/p448/arch_ref64/p448.h
--- a/src/p448/arch_x86_64/arch_config.h
+++ b/src/p448/arch_x86_64/arch_config.h
--- a/src/p448/arch_x86_64/p448.c
+++ b/src/p448/arch_x86_64/p448.c
--- a/src/p448/arch_x86_64/p448.h
+++ b/src/p448/arch_x86_64/p448.h
--- a/src/p448/arch_x86_64/x86-64-arith.h
+++ b/src/p448/arch_x86_64/x86-64-arith.h
--- a/src/p448/f_arithmetic.c
+++ b/src/p448/f_arithmetic.c
@@ -0,0 +1,43 @@
 /**
 * @cond internal
 * @file f_arithmetic.c
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @brief Field-specific arithmetic.
 */

 #include "ec_point.h"

 /*
  * Raise x to the fixed exponent 2^446 - 2^222 - 1 with a hand-written
  * addition chain of squarings and multiplications, and store the result
  * in a.  (Presumably this is (p-3)/4 for p = 2^448 - 2^224 - 1, which
  * would make the result an inverse square root of x up to sign -- the
  * modulus is not visible here, so confirm against the field definition.)
  *
  * a is written only by the final multiplication, and that multiplication
  * does not read x.  L0..L2 are scratch values; the per-line comments give
  * the power of x each one holds after that step.
  */
 void 
 field_isr (
    struct field_t*       a,
    const struct field_t* x
 ) {
    struct field_t L0, L1, L2;
    field_sqr  (   &L1,     x );            /* L1 = x^2 */
    field_mul  (   &L2,     x,   &L1 );     /* L2 = x^3 */
    field_sqr  (   &L1,   &L2 );            /* L1 = x^6 */
    field_mul  (   &L2,     x,   &L1 );     /* L2 = x^(2^3 - 1) */
    field_sqrn (   &L1,   &L2,     3 );     /* L1 = x^(2^6 - 2^3) */
    field_mul  (   &L0,   &L2,   &L1 );     /* L0 = x^(2^6 - 1) */
    field_sqrn (   &L1,   &L0,     3 );     /* L1 = x^(2^9 - 2^3) */
    field_mul  (   &L0,   &L2,   &L1 );     /* L0 = x^(2^9 - 1) */
    field_sqrn (   &L2,   &L0,     9 );     /* L2 = x^(2^18 - 2^9) */
    field_mul  (   &L1,   &L0,   &L2 );     /* L1 = x^(2^18 - 1) */
    field_sqr  (   &L0,   &L1 );            /* L0 = x^(2^19 - 2) */
    field_mul  (   &L2,     x,   &L0 );     /* L2 = x^(2^19 - 1) */
    field_sqrn (   &L0,   &L2,    18 );     /* L0 = x^(2^37 - 2^18) */
    field_mul  (   &L2,   &L1,   &L0 );     /* L2 = x^(2^37 - 1) */
    field_sqrn (   &L0,   &L2,    37 );     /* L0 = x^(2^74 - 2^37) */
    field_mul  (   &L1,   &L2,   &L0 );     /* L1 = x^(2^74 - 1) */
    field_sqrn (   &L0,   &L1,    37 );     /* L0 = x^(2^111 - 2^37) */
    field_mul  (   &L1,   &L2,   &L0 );     /* L1 = x^(2^111 - 1) */
    field_sqrn (   &L0,   &L1,   111 );     /* L0 = x^(2^222 - 2^111) */
    field_mul  (   &L2,   &L1,   &L0 );     /* L2 = x^(2^222 - 1) */
    field_sqr  (   &L0,   &L2 );            /* L0 = x^(2^223 - 2) */
    field_mul  (   &L1,     x,   &L0 );     /* L1 = x^(2^223 - 1) */
    field_sqrn (   &L0,   &L1,   223 );     /* L0 = x^(2^446 - 2^223) */
    field_mul  (     a,   &L2,   &L0 );     /* a  = x^(2^446 - 2^222 - 1) */
 }
--- a/src/p448/f_field.h
+++ b/src/p448/f_field.h
@@ -0,0 +1,39 @@
 /**
 * @file f_field.h
 * @brief Field-specific code.
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 */
 #ifndef __F_FIELD_H__
 #define __F_FIELD_H__ 1

 #include <string.h>
 #include "constant_time.h"

 #include "p448.h"
 /*
 * Field selection for p448: FIELD_BITS is the field size in bits, and each
 * generic field_* name below is an alias for the matching p448_* symbol, so
 * code written against field_* resolves to the p448 implementation.
 */
 #define FIELD_BITS           448
 #define field_t              p448_t
 #define field_mul            p448_mul
 #define field_sqr            p448_sqr
 #define field_add            p448_add
 #define field_sub            p448_sub
 #define field_mulw           p448_mulw
 #define field_addw           p448_addw
 #define field_subw           p448_subw
 #define field_neg            p448_neg
 #define field_set_ui         p448_set_ui
 #define field_bias           p448_bias
 #define field_cond_neg       p448_cond_neg
 #define field_inverse        p448_inverse
 #define field_eq             p448_eq
 #define field_isr            p448_isr
 #define field_simultaneous_invert p448_simultaneous_invert
 #define field_weak_reduce    p448_weak_reduce
 #define field_strong_reduce  p448_strong_reduce
 #define field_serialize      p448_serialize
 #define field_deserialize    p448_deserialize
 #define field_is_zero        p448_is_zero

 #endif /* __F_FIELD_H__ */
--- a/src/p448/f_magic.h
+++ b/src/p448/f_magic.h
@@ -0,0 +1,35 @@
 /**
 * @file f_magic.h
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @brief Goldilocks magic numbers (group orders, coefficients, algo params etc).
 */

 #ifndef __GOLDI_F_MAGIC_H__
 #define __GOLDI_F_MAGIC_H__ 1

 #include "field.h"
 #include "ec_point.h"

 /**
 * @brief The Edwards "d" term for this curve.
 */
 static const int64_t EDWARDS_D = -39081;

 /*
 * Signed-comb parameters.  Each is chosen by USE_BIG_COMBS (defined
 * elsewhere).  COMB_N * COMB_T * COMB_S is 5*5*18 = 450 with big combs and
 * 8*4*14 = 448 otherwise; presumably the product must cover the scalar
 * length -- confirm against the comb code.
 */

 /** @brief The number of combs to use for signed comb algo */
 #define COMB_N (USE_BIG_COMBS ? 5  : 8)

 /** @brief The number of teeth of the combs for signed comb algo */
 #define COMB_T (USE_BIG_COMBS ? 5  : 4)

 /** @brief The spacing of the combs for signed comb algo */
 #define COMB_S (USE_BIG_COMBS ? 18 : 14)

 /**
 * @brief crandom magic structure guard constant = "return 4", cf xkcd #221
 *
 * The bytes 72 65 74 75 72 6e 20 34 are the ASCII text "return 4".
 */
 #define CRANDOM_MAGIC 0x72657475726e2034ull

 #endif /* __GOLDI_F_MAGIC_H__ */
--- a/src/p448/field.h
+++ b/src/p448/field.h
@@ -0,0 +1,123 @@
 /**
 * @file field.h
 * @brief Field switch code.
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 */
 #ifndef __FIELD_H__
 #define __FIELD_H__

 #include <string.h>
 #include "constant_time.h"

 #include "p448.h"
 /*
 * Field switch: FIELD_BITS is the field size in bits, and every generic
 * field_* name below is an alias for the matching p448_* symbol.  Note that
 * the functions declared later in this header under field_* names are
 * therefore really declarations of the p448_* symbols.
 */
 #define FIELD_BITS           448
 #define field_t              p448_t
 #define field_mul            p448_mul
 #define field_sqr            p448_sqr
 #define field_add            p448_add
 #define field_sub            p448_sub
 #define field_mulw           p448_mulw
 #define field_addw           p448_addw
 #define field_subw           p448_subw
 #define field_neg            p448_neg
 #define field_set_ui         p448_set_ui
 #define field_bias           p448_bias
 #define field_cond_neg       p448_cond_neg
 #define field_inverse        p448_inverse
 #define field_eq             p448_eq
 #define field_isr            p448_isr
 #define field_simultaneous_invert p448_simultaneous_invert
 #define field_weak_reduce    p448_weak_reduce
 #define field_strong_reduce  p448_strong_reduce
 #define field_serialize      p448_serialize
 #define field_deserialize    p448_deserialize
 #define field_is_zero        p448_is_zero

 /** @brief Bytes in a field element (FIELD_BITS rounded up to whole bytes) */
 #define FIELD_BYTES          (1+(FIELD_BITS-1)/8)

 /**
 * @brief Words in a field element (FIELD_BITS rounded up to whole words).
 *
 * sizeof(word_t) counts bytes, so it is scaled by 8 to get bits per word
 * before dividing the bit count by it.
 */
 #define FIELD_WORDS          (1+(FIELD_BITS-1)/(8*sizeof(word_t)))

 /**
 * @brief For GMP tests: little-endian representation of the field modulus.
 *
 * Holds FIELD_BYTES bytes; the definition lives in a separate source file.
 */
 extern const uint8_t FIELD_MODULUS[FIELD_BYTES];

 /**
 * Copy the field element b into a.
 *
 * Both pointers are restrict-qualified, so the two elements must not
 * overlap.
 *
 * @param [out] a Destination element.
 * @param [in]  b Source element.
 */
 static inline void
 __attribute__((unused,always_inline))        
 field_copy (
    struct field_t *__restrict__ a,
    const struct field_t *__restrict__ b
 ) {
    const size_t element_bytes = sizeof(struct field_t);
    memcpy(a, b, element_bytes);
 }

 /**
 * Conditionally negate a field element in place.
 *
 * The negation is always computed and biased by 2; constant_time_select
 * then decides, from the mask, whether a keeps its value or takes the
 * negated one.
 *
 * @param [inout] a        Element to negate in place.
 * @param [in]    doNegate Mask choosing between the negated and original value.
 */
 static inline void
 __attribute__((unused,always_inline)) 
 field_cond_neg(
    field_t *a,
    mask_t doNegate
 ) {
    struct field_t flipped;

    field_neg(&flipped, a);
    field_bias(&flipped, 2);

    constant_time_select(a, &flipped, a, sizeof(flipped), doNegate);
 }

 /**
 * Returns 1/sqrt(+- x).
 * 
 * The Legendre symbol of the result is the same as that of the
 * input.
 * 
 * If x=0, returns 0.
 *
 * @param [out] a The result.
 * @param [in]  x The element whose inverse square root is taken.
 */
 void
 field_isr (
    struct field_t*       a,
    const struct field_t* x
 );
    
 /**
 * Batch inverts out[i] = 1/in[i]
 * 
 * If any input is zero, all the outputs will be zero.
 *
 * @param [out] out Array of n results; restrict-qualified, so it must not
 *                  alias in.
 * @param [in]  in  Array of n elements to invert.
 * @param [in]  n   Number of elements in each array.
 */     
 void
 field_simultaneous_invert (
    struct field_t *__restrict__ out,
    const struct field_t *in,
    unsigned int n
 );

 /**
 * Returns 1/x.
 * 
 * If x=0, returns 0.
 *
 * @param [out] a The result.
 * @param [in]  x The element to invert.
 */
 void
 field_inverse (
    struct field_t*       a,
    const struct field_t* x
 );

 /**
 * Returns -1 if a==b, 0 otherwise.
 *
 * The result is a mask_t, so "-1" is the all-ones mask.
 *
 * @param [in] a First element.
 * @param [in] b Second element.
 */
 mask_t
 field_eq (
    const struct field_t *a,
    const struct field_t *b
 );

 #endif /* __FIELD_H__ */
--- a/src/p448/magic.c
+++ b/src/p448/magic.c
@@ -39,10 +39,10 @@ const struct affine_t goldilocks_base_point = {
       0x4c63d96,0x4609845,0xf3932d9,0x1b4faff, 0x6147eaa,0xa2692ff,0x9cecfa9,0x297ea0e
    }},
 #else
    {{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7),
       U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa),
       U58LE(0x5f065c3c59d070), U58LE(0x3a6a26adf73324),
       U58LE(0x1b4faff4609845), U58LE(0x297ea0ea2692ff)
    {{ U56LE(0xf0de840aed939f), U56LE(0xc170033f4ba0c7),
       U56LE(0xf3932d94c63d96), U56LE(0x9cecfa96147eaa),
       U56LE(0x5f065c3c59d070), U56LE(0x3a6a26adf73324),
       U56LE(0x1b4faff4609845), U56LE(0x297ea0ea2692ff)
    }},
 #endif
    {{ 19 }}
@@ -69,13 +69,13 @@ sqrt_d_minus_1 = {{
    0xbdeea38,0x748734a,0x5a189aa,0x49443b8,
    0x6f14c06,0x0b25b7a,0x51e65ca,0x12fec0c
 #else
    U58LE(0xd2e21836749f46),
    U58LE(0x888db42b4f0179),
    U58LE(0x5a189aabdeea38),
    U58LE(0x51e65ca6f14c06),
    U58LE(0xa49f7b424d9770),
    U58LE(0xdcac4628c5f656),
    U58LE(0x49443b8748734a),
    U58LE(0x12fec0c0b25b7a)
    U56LE(0xd2e21836749f46),
    U56LE(0x888db42b4f0179),
    U56LE(0x5a189aabdeea38),
    U56LE(0x51e65ca6f14c06),
    U56LE(0xa49f7b424d9770),
    U56LE(0xdcac4628c5f656),
    U56LE(0x49443b8748734a),
    U56LE(0x12fec0c0b25b7a)
 #endif
 }};
--- a/src/p480/arch_x86_64/arch_config.h
+++ b/src/p480/arch_x86_64/arch_config.h
@@ -0,0 +1 @@
 /* Word width, in bits, selected for the p480 arch_x86_64 build. */
 #define WORD_BITS 64
--- a/src/p480/arch_x86_64/p480.c
+++ b/src/p480/arch_x86_64/p480.c
@@ -0,0 +1,435 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "p480.h"
 #include "x86-64-arith.h"

 /**
 * Field multiplication: cs = as * bs.
 *
 * Elements are eight limbs masked to 60 bits on output.  The routine splits
 * each operand into a low half (limbs 0-3) and a high half (limbs 4-7),
 * forms the half-sums aa = a_lo + a_hi, bb = b_lo + b_hi and
 * bbb = bb + b_hi, and accumulates 128-bit partial products for output
 * limbs in the pairs (3,7), (0,4), (1,5), (2,6).  The final carries are
 * folded into limbs 4 and 0, which matches reduction modulo
 * 2^480 - 2^240 - 1 (the modulus used by p480_strong_reduce).
 *
 * NOTE(review): this looks like a Karatsuba-style split; the exact term
 * bookkeeping was not re-derived here.  Outputs are only weakly reduced:
 * limbs 0 and 4 receive an unmasked carry at the end.
 *
 * @param [out] cs Product; restrict-qualified, must not alias as or bs.
 * @param [in]  as First factor.
 * @param [in]  bs Second factor.
 */
 void
 p480_mul (
    p480_t *__restrict__ cs,
    const p480_t *as,
    const p480_t *bs
 ) {
    const uint64_t *a = as->limb, *b = bs->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<60) - 1;  

    uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))), bbb[4] __attribute__((aligned(32)));

    /* For some reason clang doesn't vectorize this without prompting? */
    unsigned int i;
    for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
        ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i]; 
        ((uint64xn_t*)bbb)[i] = ((const uint64xn_t*)bb)[i] + ((const uint64xn_t*)(&b[4]))[i];     
    }
    /*
     * Scalar equivalent of the vector loop above:
     *
     * for (int i=0; i<4; i++) {
     *     aa[i]  = a[i]  + a[i+4];
     *     bb[i]  = b[i]  + b[i+4];
     *     bbb[i] = bb[i] + b[i+4];
     * }
     */

    /* Output limbs 3 and 7 (stored now, carry-adjusted at the end). */
    accum2  = widemul(&a[0],&b[3]);
    accum0  = widemul(&aa[0],&bb[3]);
    accum1  = widemul(&a[4],&b[7]);

    mac(&accum2, &a[1], &b[2]);
    mac(&accum0, &aa[1], &bb[2]);
    mac(&accum1, &a[5], &b[6]);

    mac(&accum2, &a[2], &b[1]);
    mac(&accum0, &aa[2], &bb[1]);
    mac(&accum1, &a[6], &b[5]);

    mac(&accum2, &a[3], &b[0]);
    mac(&accum0, &aa[3], &bb[0]);
    mac(&accum1, &a[7], &b[4]);

    accum0 -= accum2;
    accum1 += accum2;

    c[3] = ((uint64_t)(accum1)) & mask;
    c[7] = ((uint64_t)(accum0)) & mask;

    accum0 >>= 60;
    accum1 >>= 60;
    
    /* Output limbs 0 and 4. */
    mac(&accum0, &aa[1],&bb[3]);
    mac(&accum1, &a[5], &b[7]);
    mac(&accum0, &aa[2], &bb[2]);
    mac(&accum1, &a[6], &b[6]);
    mac(&accum0, &aa[3], &bb[1]);
    accum1 += accum0;

    accum2 = widemul(&a[0],&b[0]);
    accum1 -= accum2;
    accum0 += accum2;
    
    msb(&accum0, &a[1], &b[3]);
    msb(&accum0, &a[2], &b[2]);
    mac(&accum1, &a[7], &b[5]);
    msb(&accum0, &a[3], &b[1]);
    mac(&accum1, &aa[0], &bb[0]);
    mac(&accum0, &a[4], &b[4]);

    c[0] = ((uint64_t)(accum0)) & mask;
    c[4] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 60;
    accum1 >>= 60;

    /* Output limbs 1 and 5. */
    accum2  = widemul(&a[2],&b[7]);
    mac(&accum0, &a[6], &bb[3]);
    mac(&accum1, &aa[2], &bbb[3]);

    mac(&accum2, &a[3], &b[6]);
    mac(&accum0, &a[7], &bb[2]);
    mac(&accum1, &aa[3], &bbb[2]);

    mac(&accum2, &a[0],&b[1]);
    mac(&accum1, &aa[0], &bb[1]);
    mac(&accum0, &a[4], &b[5]);

    mac(&accum2, &a[1], &b[0]);
    mac(&accum1, &aa[1], &bb[0]);
    mac(&accum0, &a[5], &b[4]);

    accum1 -= accum2;
    accum0 += accum2;

    c[1] = ((uint64_t)(accum0)) & mask;
    c[5] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 60;
    accum1 >>= 60;

    /* Output limbs 2 and 6. */
    accum2  = widemul(&a[3],&b[7]);
    mac(&accum0, &a[7], &bb[3]);
    mac(&accum1, &aa[3], &bbb[3]);

    mac(&accum2, &a[0],&b[2]);
    mac(&accum1, &aa[0], &bb[2]);
    mac(&accum0, &a[4], &b[6]);

    mac(&accum2, &a[1], &b[1]);
    mac(&accum1, &aa[1], &bb[1]);
    mac(&accum0, &a[5], &b[5]);

    mac(&accum2, &a[2], &b[0]);
    mac(&accum1, &aa[2], &bb[0]);
    mac(&accum0, &a[6], &b[4]);

    accum1 -= accum2;
    accum0 += accum2;

    c[2] = ((uint64_t)(accum0)) & mask;
    c[6] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 60;
    accum1 >>= 60;

    /* Add the carries out of limbs 2 and 6 into the stored limbs 3 and 7. */
    accum0 += c[3];
    accum1 += c[7];
    c[3] = ((uint64_t)(accum0)) & mask;
    c[7] = ((uint64_t)(accum1)) & mask;

    /* we could almost stop here, but it wouldn't be stable, so... */

    /*
     * accum0 is now the carry out of limb 3 (goes to limb 4); accum1 is the
     * carry out of limb 7, which wraps to both limb 4 and limb 0.
     */
    accum0 >>= 60;
    accum1 >>= 60;
    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
    c[0] += ((uint64_t)(accum1));
 }

 /**
 * Multiply a field element by a single 64-bit word: cs = as * b.
 *
 * Two carry chains run side by side, one over limbs 0-3 and one over
 * limbs 4-7, each emitting 60-bit limbs.  The carry out of limb 3 moves
 * into limb 4; the carry out of limb 7 wraps into both limb 4 and limb 0.
 * Limbs 5 and 1 absorb whatever overflows from those last additions
 * without being re-masked.
 *
 * @param [out] cs Product; restrict-qualified, must not alias as.
 * @param [in]  as Field element factor.
 * @param [in]  b  Word factor.
 */
 void
 p480_mulw (
    p480_t *__restrict__ cs,
    const p480_t *as,
    uint64_t b
 ) {
    const uint64_t limb_mask = (1ull<<60) - 1;
    const uint64_t *in = as->limb;
    uint64_t *out = cs->limb;
    __uint128_t lo_chain, hi_chain;
    int k;

    /* The first limb of each half starts its chain. */
    lo_chain = widemul_rm(b, &in[0]);
    hi_chain = widemul_rm(b, &in[4]);

    out[0] = lo_chain & limb_mask;
    lo_chain >>= 60;
    out[4] = hi_chain & limb_mask;
    hi_chain >>= 60;

    /* Remaining three limbs of each half. */
    for (k=1; k<4; k++) {
        mac_rm(&lo_chain, b, &in[k]);
        mac_rm(&hi_chain, b, &in[k+4]);

        out[k] = lo_chain & limb_mask;
        lo_chain >>= 60;
        out[k+4] = hi_chain & limb_mask;
        hi_chain >>= 60;
    }

    /* Limb 4 takes the carry from limb 3 plus the wrapped carry from limb 7. */
    lo_chain += hi_chain + out[4];
    out[4] = lo_chain & limb_mask;
    out[5] += lo_chain >> 60;

    /* Limb 0 takes the wrapped carry from limb 7. */
    hi_chain += out[0];
    out[0] = hi_chain & limb_mask;
    out[1] += hi_chain >> 60;
 }

 /**
 * Field squaring: cs = as^2.
 *
 * Same layout as p480_mul (eight limbs masked to 60 bits, output pairs
 * (3,7), (0,4), (1,5), (2,6), final carries folded into limbs 4 and 0), but
 * symmetric cross terms are computed once and doubled: either with the
 * *2 helpers (mac2, msb2, widemul2) or, for limbs 3 and 7, by shifting
 * the accumulated sum left by one.
 *
 * NOTE(review): the term bookkeeping was not re-derived here.  Outputs are
 * only weakly reduced: limbs 0 and 4 receive an unmasked carry at the end.
 *
 * @param [out] cs Square; restrict-qualified, must not alias as.
 * @param [in]  as Element to square.
 */
 void
 p480_sqr (
    p480_t *__restrict__ cs,
    const p480_t *as
 ) {
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<60) - 1;  

    uint64_t aa[4] __attribute__((aligned(32)));

    /* For some reason clang doesn't vectorize this without prompting? */
    /* aa[i] = a[i] + a[i+4], computed a vector at a time. */
    unsigned int i;
    for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
      ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
    }

    /* Output limbs 3 and 7: half of the cross terms, doubled below. */
    accum2  = widemul(&a[0],&a[3]);
    accum0  = widemul(&aa[0],&aa[3]);
    accum1  = widemul(&a[4],&a[7]);

    mac(&accum2, &a[1], &a[2]);
    mac(&accum0, &aa[1], &aa[2]);
    mac(&accum1, &a[5], &a[6]);

    accum0 -= accum2;
    accum1 += accum2;

    /* Doubling: low limb is (sum<<1) & mask, so the carry is sum>>59. */
    c[3] = ((uint64_t)(accum1))<<1 & mask;
    c[7] = ((uint64_t)(accum0))<<1 & mask;

    accum0 >>= 59;
    accum1 >>= 59;

    /* Output limbs 0 and 4. */
    mac2(&accum0, &aa[1],&aa[3]);
    mac2(&accum1, &a[5], &a[7]);
    mac(&accum0, &aa[2], &aa[2]);
    accum1 += accum0;

    msb2(&accum0, &a[1], &a[3]);
    mac(&accum1, &a[6], &a[6]);
    
    accum2 = widemul(&a[0],&a[0]);
    accum1 -= accum2;
    accum0 += accum2;

    msb(&accum0, &a[2], &a[2]);
    mac(&accum1, &aa[0], &aa[0]);
    mac(&accum0, &a[4], &a[4]);

    c[0] = ((uint64_t)(accum0)) & mask;
    c[4] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 60;
    accum1 >>= 60;

    /* Output limbs 1 and 5. */
    accum2  = widemul2(&aa[2],&aa[3]);
    msb2(&accum0, &a[2], &a[3]);
    mac2(&accum1, &a[6], &a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2  = widemul2(&a[0],&a[1]);
    mac2(&accum1, &aa[0], &aa[1]);
    mac2(&accum0, &a[4], &a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[1] = ((uint64_t)(accum0)) & mask;
    c[5] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 60;
    accum1 >>= 60;

    /* Output limbs 2 and 6. */
    accum2  = widemul(&aa[3],&aa[3]);
    msb(&accum0, &a[3], &a[3]);
    mac(&accum1, &a[7], &a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2  = widemul2(&a[0],&a[2]);
    mac2(&accum1, &aa[0], &aa[2]);
    mac2(&accum0, &a[4], &a[6]);

    mac(&accum2, &a[1], &a[1]);
    mac(&accum1, &aa[1], &aa[1]);
    mac(&accum0, &a[5], &a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[2] = ((uint64_t)(accum0)) & mask;
    c[6] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 60;
    accum1 >>= 60;

    /* Add the carries out of limbs 2 and 6 into the stored limbs 3 and 7. */
    accum0 += c[3];
    accum1 += c[7];
    c[3] = ((uint64_t)(accum0)) & mask;
    c[7] = ((uint64_t)(accum1)) & mask;

    /* we could almost stop here, but it wouldn't be stable, so... */

    /*
     * accum0 is now the carry out of limb 3 (goes to limb 4); accum1 is the
     * carry out of limb 7, which wraps to both limb 4 and limb 0.
     */
    accum0 >>= 60;
    accum1 >>= 60;
    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
    c[0] += ((uint64_t)(accum1));
 }

 /**
 * Fully reduce a field element in place.
 *
 * The modulus p has every 60-bit limb all-ones except limb 4, whose low bit
 * is clear (p = 2^480 - 2^240 - 1).  The routine folds the overflow of the
 * top limb back in, subtracts p with a signed borrow chain, and then adds p
 * back under a mask when the subtraction went negative.
 *
 * @param [inout] a Element to reduce; on return every limb fits in 60 bits.
 */
 void
 p480_strong_reduce (
    p480_t *a
 ) {
    const uint64_t limb_mask = (1ull<<60)-1;

    /*
     * First, clear the bits above 2^480: they wrap to limb 4 and limb 0.
     * The single subtraction below relies on the total now being below 2p.
     */
    const uint64_t overflow = a->limb[7]>>60;
    a->limb[4] += overflow;
    a->limb[0] += overflow;
    a->limb[7] &= limb_mask;

    /* Compute total_value - p.  No need to reduce mod p. */
    __int128_t scarry = 0;
    int i;
    for (i=0; i<8; i++) {
        const uint64_t p_limb = (i==4) ? limb_mask-1 : limb_mask;
        scarry = scarry + a->limb[i] - p_limb;
        a->limb[i] = scarry & limb_mask;
        scarry >>= 60;
    }

    /*
     * Uncommon case: the value was >= p, so scarry = 0 and a holds x - p.
     * Common case: the value was < p, so scarry = -1 and a holds
     * x - p + 2^480; adding p back carries off the top and restores x.
     */
    assert(is_zero(scarry) | is_zero(scarry+1));

    uint64_t scarry_mask = scarry & limb_mask;
    __uint128_t carry = 0;

    /* Add back p masked by scarry_mask (limb 4 of p has its low bit clear). */
    for (i=0; i<8; i++) {
        const uint64_t addend = (i==4) ? (scarry_mask & ~1ull) : scarry_mask;
        carry = carry + a->limb[i] + addend;
        a->limb[i] = carry & limb_mask;
        carry >>= 60;
    }

    assert(is_zero(carry + scarry));
 }

 mask_t
 p480_is_zero (
    const struct p480_t *a
 ) {
    struct p480_t b;
    p480_copy(&b,a);
    p480_strong_reduce(&b);

    uint64_t any = 0;
    int i;
    for (i=0; i<8; i++) {
        any |= b.limb[i];
    }
    return is_zero(any);
 }

 /**
 * Serialize a field element to 60 little-endian bytes.
 *
 * The element is strongly reduced first.  Limbs are then packed two at a
 * time: a pair of 60-bit limbs is 120 bits, i.e. 15 bytes, so the four
 * pairs fill exactly 60 bytes.
 *
 * @param [out] serial Output buffer; 60 bytes are written.
 * @param [in]  x      Element to serialize (not modified).
 */
 void
 p480_serialize (
    uint8_t *serial,
    const struct p480_t *x
 ) {
    uint8_t *out = serial;
    int pair, nbyte;
    p480_t red;

    p480_copy(&red, x);
    p480_strong_reduce(&red);

    word_t r = 0;
    for (pair=0; pair<4; pair++) {
        /* Even limb: emit its low 56 bits, leaving 4 bits in r. */
        r = red.limb[2*pair];
        for (nbyte=0; nbyte<7; nbyte++) {
            *out++ = r;
            r >>= 8;
        }
        assert(r<16);

        /* Odd limb sits above those 4 bits: 64 bits, i.e. 8 bytes. */
        r += red.limb[2*pair+1]<<4;
        for (nbyte=0; nbyte<8; nbyte++) {
            *out++ = r;
            r >>= 8;
        }
        assert(r==0);
    }
 }

 /**
 * Deserialize 60 little-endian bytes into a field element.
 *
 * Bytes are unpacked 15 at a time into a pair of 60-bit limbs.  The value
 * is then compared against p without data-dependent branches.
 *
 * @param [out] x      Receives the decoded limbs (written even on failure).
 * @param [in]  serial 60 input bytes.
 * @return All-ones mask if the encoded value is less than p, 0 otherwise.
 */
 mask_t
 p480_deserialize (
    p480_t *x,
    const uint8_t serial[60]
 ) {
    const uint8_t *in = serial;
    int pair, nbyte, i;

    for (pair=0; pair<4; pair++) {
        word_t r = 0;

        /* 8 bytes: the even limb plus the low 4 bits of the odd limb. */
        for (nbyte=0; nbyte<8; nbyte++) {
            r |= ((word_t)(*in++))<<(8*nbyte);
        }
        x->limb[2*pair] = r & ((1ull<<60)-1);
        r >>= 60;

        /* 7 more bytes: the remaining 56 bits of the odd limb. */
        for (nbyte=0; nbyte<7; nbyte++) {
            r |= ((word_t)(*in++))<<(8*nbyte+4);
        }
        x->limb[2*pair+1] = r;
    }

    /* Check for reduction.
     *
     * ge ends up equal to 60 ones if and only if x >= p.
     *
     * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
     *
     * Limbs 0-3 and 5-7 of p are all ones, so they simply AND into ge.
     * Limb 4 of p is 1110: ge propagates if the limb is exactly 1110, and
     * is set outright if the limb is 1111.
     */
    const word_t limb_mask = (1ull<<60)-1;
    word_t ge = -1;
    for (i=0; i<8; i++) {
        if (i == 4) {
            ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ limb_mask);
        } else {
            ge &= x->limb[i];
        }
    }

    return ~is_zero(ge ^ limb_mask);
 }

--- a/src/p480/arch_x86_64/p480.h
+++ b/src/p480/arch_x86_64/p480.h
@@ -0,0 +1,257 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __p480_H__
 #define __p480_H__ 1

 #include <stdint.h>
 #include <assert.h>

 #include "word.h"

 /**
 * A p480 field element: eight 64-bit words, each holding a limb that the
 * arithmetic in this implementation masks to 60 bits (limb[0] is least
 * significant).  The 32-byte alignment is presumably for the vector-typed
 * accesses used by the inline functions below.
 */
 typedef struct p480_t {
  uint64_t limb[8];
 } __attribute__((aligned(32))) p480_t;

 #ifdef __cplusplus
 extern "C" {
 #endif

 /** Set out to the small integer x (limb 0 = x, all other limbs 0). */
 static __inline__ void
 p480_set_ui (
    p480_t *out,
    uint64_t x
 ) __attribute__((unused,always_inline));

 /** Limb-wise addition out = a + b; no carry propagation or reduction. */
 static __inline__ void
 p480_add (
    p480_t *out,
    const p480_t *a,
    const p480_t *b
 ) __attribute__((unused,always_inline));
             
 /** Limb-wise subtraction out = a - b; limbs wrap modulo 2^64. */
 static __inline__ void
 p480_sub (
    p480_t *out,
    const p480_t *a,
    const p480_t *b
 ) __attribute__((unused,always_inline));
             
 /** Limb-wise negation out = -a; limbs wrap modulo 2^64. */
 static __inline__ void
 p480_neg (
    p480_t *out,
    const p480_t *a
 ) __attribute__((unused,always_inline));

 /** Add the word x to limb 0 of a, in place. */
 static __inline__ void
 p480_addw (
    p480_t *a,
    uint64_t x
 ) __attribute__((unused,always_inline));
             
 /** Subtract the word x from limb 0 of a, in place. */
 static __inline__ void
 p480_subw (
    p480_t *a,
    uint64_t x
 ) __attribute__((unused,always_inline));
             
 /** Copy a into out. */
 static __inline__ void
 p480_copy (
    p480_t *out,
    const p480_t *a
 ) __attribute__((unused,always_inline));
             
 /** Carry-propagate once so each limb is 60 bits plus a small carry. */
 static __inline__ void
 p480_weak_reduce (
    p480_t *inout
 ) __attribute__((unused,always_inline));
             
 /** Fully reduce inout in place; every limb fits in 60 bits afterwards. */
 void
 p480_strong_reduce (
    p480_t *inout
 );

 /** Return a mask that is all ones when in is zero mod p, 0 otherwise. */
 mask_t
 p480_is_zero (
    const p480_t *in
 );
  
 /** Add amount times the limbs of p to inout, limb by limb, in place. */
 static __inline__ void
 p480_bias (
    p480_t *inout,
    int amount
 ) __attribute__((unused,always_inline));
         
 /** Field multiplication out = a * b; out must not alias the inputs. */
 void
 p480_mul (
    p480_t *__restrict__ out,
    const p480_t *a,
    const p480_t *b
 );

 /** Multiply by a single word: out = a * b; out must not alias a. */
 void
 p480_mulw (
    p480_t *__restrict__ out,
    const p480_t *a,
    uint64_t b
 );

 /** Field squaring out = a^2; out must not alias a. */
 void
 p480_sqr (
    p480_t *__restrict__ out,
    const p480_t *a
 );

 /** Write the fully reduced value of x as 60 little-endian bytes. */
 void
 p480_serialize (
    uint8_t *serial,
    const struct p480_t *x
 );

 /**
 * Read 60 little-endian bytes into x.
 * Returns an all-ones mask if the value is less than p, 0 otherwise.
 */
 mask_t
 p480_deserialize (
    p480_t *x,
    const uint8_t serial[60]
 );

 /* -------------- Inline functions begin here -------------- */

 /**
 * Set a field element to a small unsigned integer.
 *
 * @param [out] out Element to initialise: limb 0 becomes x, limbs 1-7 zero.
 * @param [in]  x   Value to store.
 */
 void
 p480_set_ui (
    p480_t *out,
    uint64_t x
 ) {
    unsigned int idx = 8;

    /* Clear limbs 7 down to 1, then store the value in limb 0. */
    while (--idx) {
        out->limb[idx] = 0;
    }
    out->limb[0] = x;
 }

 /**
 * Limb-wise addition: out->limb[i] = a->limb[i] + b->limb[i] for every
 * limb, done one uint64xn_t vector at a time.  No carries are propagated
 * and nothing is reduced.
 *
 * @param [out] out Sum.
 * @param [in]  a   First addend.
 * @param [in]  b   Second addend.
 */
 void
 p480_add (
    p480_t *out,
    const p480_t *a,
    const p480_t *b
 ) {
    uint64xn_t *dst = (uint64xn_t*)out;
    const uint64xn_t *lhs = (const uint64xn_t*)a;
    const uint64xn_t *rhs = (const uint64xn_t*)b;
    unsigned int v;

    for (v=0; v<sizeof(*out)/sizeof(uint64xn_t); v++) {
        dst[v] = lhs[v] + rhs[v];
    }
 }

 /**
 * Limb-wise subtraction: out->limb[i] = a->limb[i] - b->limb[i] for every
 * limb, done one uint64xn_t vector at a time.  Limbs wrap modulo 2^64; no
 * borrows are propagated and nothing is reduced.
 *
 * @param [out] out Difference.
 * @param [in]  a   Minuend.
 * @param [in]  b   Subtrahend.
 */
 void
 p480_sub (
    p480_t *out,
    const p480_t *a,
    const p480_t *b
 ) {
    uint64xn_t *dst = (uint64xn_t*)out;
    const uint64xn_t *lhs = (const uint64xn_t*)a;
    const uint64xn_t *rhs = (const uint64xn_t*)b;
    unsigned int v;

    for (v=0; v<sizeof(*out)/sizeof(uint64xn_t); v++) {
        dst[v] = lhs[v] - rhs[v];
    }
 }

 void
 p480_neg (
    struct p480_t *out,
    const p480_t *a
 ) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)out)[i] = -((const uint64xn_t*)a)[i];
    }
    /*
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
        out->limb[i] = -a->limb[i];
    }
    */
 }

 /**
 * Add a word to a field element in place.  Only limb 0 is touched; no
 * carry is propagated.
 *
 * @param [inout] a Element to modify.
 * @param [in]    x Word to add.
 */
 void
 p480_addw (
    p480_t *a,
    uint64_t x
 ) {
    uint64_t *low = &a->limb[0];
    *low = *low + x;
 }
             
 /**
 * Subtract a word from a field element in place.  Only limb 0 is touched;
 * it wraps modulo 2^64 and no borrow is propagated.
 *
 * @param [inout] a Element to modify.
 * @param [in]    x Word to subtract.
 */
 void
 p480_subw (
    p480_t *a,
    uint64_t x
 ) {
    uint64_t *low = &a->limb[0];
    *low = *low - x;
 }

 /**
 * Copy a field element, one big_register_t at a time.
 *
 * @param [out] out Destination element.
 * @param [in]  a   Source element.
 */
 void
 p480_copy (
    p480_t *out,
    const p480_t *a
 ) {
    big_register_t *dst = (big_register_t *)out;
    const big_register_t *src = (const big_register_t *)a;
    unsigned int r;

    for (r=0; r<sizeof(*out)/sizeof(big_register_t); r++) {
        dst[r] = src[r];
    }
 }

 /**
 * Add amt copies of p to a field element, limb by limb, in place.
 *
 * Every limb of p is 2^60 - 1 except limb 4, which is 2^60 - 2, so limb 4
 * receives amt less than the others.  The value mod p is unchanged; the
 * limbs just gain headroom (e.g. after a negation or subtraction).
 *
 * @param [inout] a   Element to bias.
 * @param [in]    amt Number of multiples of p to add.
 */
 void
 p480_bias (
    p480_t *a,
    int amt
 ) {
    uint64_t all_ones_part = ((1ull<<60)-1)*amt;
    uint64_t limb4_part = all_ones_part-amt;

 #if __AVX2__
    /* Two 4-limb vectors; limb 4 is lane 0 of the second one. */
    uint64x4_t *quad = (uint64x4_t*) a;
    uint64x4_t plain4 = {all_ones_part,all_ones_part,all_ones_part,all_ones_part};
    uint64x4_t special4 = {limb4_part,all_ones_part,all_ones_part,all_ones_part};
    quad[0] += plain4;
    quad[1] += special4;
 #elif __SSE2__
    /* Four 2-limb vectors; limb 4 is lane 0 of the third one. */
    uint64x2_t *duo = (uint64x2_t*) a;
    uint64x2_t plain2 = {all_ones_part,all_ones_part};
    uint64x2_t special2 = {limb4_part,all_ones_part};
    duo[0] += plain2;
    duo[1] += plain2;
    duo[2] += special2;
    duo[3] += plain2;
 #else
    unsigned int k;
    for (k=0; k<sizeof(*a)/sizeof(uint64_t); k++) {
        if (k==4) {
            a->limb[k] += limb4_part;
        } else {
            a->limb[k] += all_ones_part;
        }
    }
 #endif
 }

 /**
 * Weakly reduce a field element in place.
 *
 * Each limb keeps its low 60 bits and receives the bits above 60 of the
 * limb below it.  The overflow of limb 7 wraps around into limb 4 and
 * limb 0.  Limbs may still slightly exceed 60 bits afterwards.
 *
 * @param [inout] a Element to reduce.
 */
 void
 p480_weak_reduce (
    p480_t *a
 ) {
    /* PERF: use pshufb/palignr if anyone cares about speed of this */
    const uint64_t limb_mask = (1ull<<60) - 1;
    const uint64_t wrap = a->limb[7] >> 60;
    int k = 7;

    a->limb[4] += wrap;

    /* Walk downwards so each step still sees the unreduced limb below. */
    while (k > 0) {
        a->limb[k] = (a->limb[k] & limb_mask) + (a->limb[k-1]>>60);
        k--;
    }

    a->limb[0] = (a->limb[0] & limb_mask) + wrap;
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __p480_H__ */
--- a/src/p480/arch_x86_64/x86-64-arith.h
+++ b/src/p480/arch_x86_64/x86-64-arith.h
@@ -0,0 +1,279 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #ifndef __X86_64_ARITH_H__
 #define __X86_64_ARITH_H__

 #include <stdint.h>

 /* TODO: non x86-64 versions of these.
 * FUTURE: autogenerate
 */

 /**
 * Full 64x64 -> 128-bit unsigned multiply of two words held in memory.
 *
 * @param a Pointer to the first factor.
 * @param b Pointer to the second factor.
 * @return (*a) * (*b) as a 128-bit value.
 */
 static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
  #ifndef __BMI2__
  uint64_t c,d;
  /*
   * rax is written by the movq before %[b] is read, so the rax output must
   * be early-clobber ("=&a"); otherwise the compiler may address *b
   * through rax.
   */
  __asm__ volatile
      ("movq %[a], %%rax;"
       "mulq %[b];"
       : [c]"=&a"(c), [d]"=d"(d)
       : [b]"m"(*b), [a]"m"(*a)
       : "cc");
  return (((__uint128_t)(d))<<64) | c;
  #else
  uint64_t c,d;
  /* mulx takes its implicit factor from rdx and leaves the flags alone. */
  __asm__ volatile
      ("movq %[a], %%rdx;"
       "mulx %[b], %[c], %[d];"
       : [c]"=r"(c), [d]"=r"(d)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx");
  return (((__uint128_t)(d))<<64) | c;
  #endif
 }

 /**
 * Full 64x64 -> 128-bit unsigned multiply, register times memory.
 *
 * @param a First factor, passed by value.
 * @param b Pointer to the second factor.
 * @return a * (*b) as a 128-bit value.
 */
 static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
  #ifndef __BMI2__
  uint64_t c,d;
  /*
   * rax is written by the movq before %[b] is read, so the rax output must
   * be early-clobber ("=&a"); otherwise the compiler may address *b
   * through rax.
   */
  __asm__ volatile
      ("movq %[a], %%rax;"
       "mulq %[b];"
       : [c]"=&a"(c), [d]"=d"(d)
       : [b]"m"(*b), [a]"r"(a)
       : "cc");
  return (((__uint128_t)(d))<<64) | c;
  #else
  uint64_t c,d;
  /* The "d" constraint places a directly in rdx, mulx's implicit factor. */
  __asm__ volatile
      ("mulx %[b], %[c], %[d];"
       : [c]"=r"(c), [d]"=r"(d)
       : [b]"m"(*b), [a]"d"(a));
  return (((__uint128_t)(d))<<64) | c;
  #endif
 }

 /**
 * Doubled wide multiply: (2 * (*a)) * (*b) as a 128-bit value.
 *
 * The doubling is done in a 64-bit register before the multiply, so the top
 * bit of *a is discarded; callers must keep *a below 2^63.
 *
 * @param a Pointer to the factor that gets doubled.
 * @param b Pointer to the other factor.
 * @return ((2 * (*a)) mod 2^64) * (*b).
 */
 static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
  #ifndef __BMI2__
  uint64_t c,d;
  /*
   * rax is written before %[b] is read, so the rax output must be
   * early-clobber ("=&a"); otherwise the compiler may address *b
   * through rax.
   */
  __asm__ volatile
      ("movq %[a], %%rax; "
       "addq %%rax, %%rax; "
       "mulq %[b];"
       : [c]"=&a"(c), [d]"=d"(d)
       : [b]"m"(*b), [a]"m"(*a)
       : "cc");
  return (((__uint128_t)(d))<<64) | c;
  #else
  uint64_t c,d;
  /* leaq doubles rdx without touching the flags, so no "cc" clobber. */
  __asm__ volatile
      ("movq %[a], %%rdx;"
       "leaq (,%%rdx,2), %%rdx;"
       "mulx %[b], %[c], %[d];"
       : [c]"=r"(c), [d]"=r"(d)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx");
  return (((__uint128_t)(d))<<64) | c;
  #endif
 }

 /**
 * Multiply-accumulate: *acc += (*a) * (*b), with a 128-bit accumulator.
 *
 * The accumulator is split into two 64-bit halves, updated with an
 * add / add-with-carry pair, and reassembled.  Overflow past 128 bits is
 * discarded.
 *
 * @param acc Accumulator, updated in place.
 * @param a   Pointer to the first factor.
 * @param b   Pointer to the second factor.
 */
 static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "addq %[c], %[lo]; "
       "adcq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "mulq %[b]; "
       "addq %%rax, %[lo]; "
       "adcq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 /**
 * Double multiply-accumulate: computes (*a) * (*b) once and adds it to
 * both *acc and *acc2 (128-bit accumulators; overflow past 128 bits is
 * discarded).
 *
 * @param acc  First accumulator, updated in place.
 * @param acc2 Second accumulator, updated in place.
 * @param a    Pointer to the first factor.
 * @param b    Pointer to the second factor.
 */
 static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  uint64_t lo2 = *acc2, hi2 = *acc2>>64;
  
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "addq %[c], %[lo]; "
       "adcq %[d], %[hi]; "
       "addq %[c], %[lo2]; "
       "adcq %[d], %[hi2]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "mulq %[b]; "
       "addq %%rax, %[lo]; "
       "adcq %%rdx, %[hi]; "
       "addq %%rax, %[lo2]; "
       "adcq %%rdx, %[hi2]; "
       : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  
  *acc = (((__uint128_t)(hi))<<64) | lo;
  *acc2 = (((__uint128_t)(hi2))<<64) | lo2;
 }

 /**
 * Multiply-accumulate, register times memory: *acc += a * (*b), with a
 * 128-bit accumulator.  Overflow past 128 bits is discarded.
 *
 * @param acc Accumulator, updated in place.
 * @param a   First factor, passed by value (placed in rdx on the BMI2 path).
 * @param b   Pointer to the second factor.
 */
 static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("mulx %[b], %[c], %[d]; "
       "addq %[c], %[lo]; "
       "adcq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"d"(a)
       : "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "mulq %[b]; "
       "addq %%rax, %[lo]; "
       "adcq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"r"(a)
       : "rax", "rdx", "cc");
  #endif
  
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 /**
 * Doubled multiply-accumulate: *acc += (2 * (*a)) * (*b), with a 128-bit
 * accumulator.
 *
 * The doubling happens in a 64-bit register before the multiply, so the
 * top bit of *a is discarded.
 *
 * @param acc Accumulator, updated in place.
 * @param a   Pointer to the factor that gets doubled.
 * @param b   Pointer to the other factor.
 */
 static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "addq %%rdx, %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "addq %[c], %[lo]; "
       "adcq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "addq %%rax, %%rax; "
       "mulq %[b]; "
       "addq %%rax, %[lo]; "
       "adcq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 /**
 * Multiply-subtract: *acc -= (*a) * (*b), with a 128-bit accumulator.
 *
 * Uses a subtract / subtract-with-borrow pair on the two 64-bit halves;
 * the result wraps modulo 2^128.
 *
 * @param acc Accumulator, updated in place.
 * @param a   Pointer to the first factor.
 * @param b   Pointer to the second factor.
 */
 static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "subq %[c], %[lo]; "
       "sbbq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "mulq %[b]; "
       "subq %%rax, %[lo]; "
       "sbbq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  *acc = (((__uint128_t)(hi))<<64) | lo;
 }

 /**
 * Doubled multiply-subtract: *acc -= (2 * (*a)) * (*b), with a 128-bit
 * accumulator that wraps modulo 2^128.
 *
 * The doubling happens in a 64-bit register before the multiply, so the
 * top bit of *a is discarded.
 *
 * @param acc Accumulator, updated in place.
 * @param a   Pointer to the factor that gets doubled.
 * @param b   Pointer to the other factor.
 */
 static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t lo = *acc, hi = *acc>>64;
  #ifdef __BMI2__
  uint64_t c,d;
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "addq %%rdx, %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "subq %[c], %[lo]; "
       "sbbq %[d], %[hi]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  __asm__ volatile
      ("movq %[a], %%rax; "
       "addq %%rax, %%rax; "
       "mulq %[b]; "
       "subq %%rax, %[lo]; "
       "sbbq %%rdx, %[hi]; "
       : [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rax", "rdx", "cc");
  #endif
  *acc = (((__uint128_t)(hi))<<64) | lo;
  
 }

 /**
 * Multiply and reverse-subtract: *acc = (*a) * (*b) - *acc, with a 128-bit
 * accumulator that wraps modulo 2^128.
 *
 * Like the other helpers in this file, this uses mulx only when __BMI2__ is
 * defined and falls back to mulq otherwise, so it is safe to call in builds
 * that target CPUs without BMI2.
 *
 * @param acc Accumulator; read as the subtrahend, overwritten with the result.
 * @param a   Pointer to the first factor.
 * @param b   Pointer to the second factor.
 */
 static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
  uint64_t c,d, lo = *acc, hi = *acc>>64;
  #ifdef __BMI2__
  __asm__ volatile
      ("movq %[a], %%rdx; "
       "mulx %[b], %[c], %[d]; "
       "subq %[lo], %[c]; "
       "sbbq %[hi], %[d]; "
       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
       : [b]"m"(*b), [a]"m"(*a)
       : "rdx", "cc");
  #else
  /*
   * The product lands in rdx:rax.  Both outputs are early-clobber so that
   * lo, hi and the memory operands are never allocated to rax or rdx.
   */
  __asm__ volatile
      ("movq %[a], %%rax; "
       "mulq %[b]; "
       "subq %[lo], %%rax; "
       "sbbq %[hi], %%rdx; "
       : [c]"=&a"(c), [d]"=&d"(d)
       : [b]"m"(*b), [a]"m"(*a), [lo]"r"(lo), [hi]"r"(hi)
       : "cc");
  #endif
  *acc = (((__uint128_t)(d))<<64) | c;
 }

 /**
 * Unsigned 64x64 -> 128-bit multiply of two values, in plain C.
 *
 * @param a First factor.
 * @param b Second factor.
 * @return a * b widened to 128 bits.
 */
 static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) {
  __uint128_t wide = a;
  wide *= b;
  return wide;
 }

 /**
 * Signed 64x64 -> 128-bit multiply of two values, in plain C.
 *
 * @param a First factor.
 * @param b Second factor.
 * @return a * b widened to 128 bits.
 */
 static __inline__ __int128_t widemuls(int64_t a, int64_t b) {
  __int128_t wide = a;
  wide *= b;
  return wide;
 }
 
 /**
 * Return x unchanged after passing it through an empty asm statement.
 *
 * The "+r" constraint tells the compiler the asm may have modified x, so
 * it cannot carry knowledge of the value across this call.
 */
 static __inline__ uint64_t opacify(uint64_t x) {
  __asm__ volatile("" : "+r"(x));
  return x;
 }

 /**
 * Branch-free zero test.
 *
 * neg sets the carry flag exactly when x is nonzero; sbb x,x then yields
 * 0 - carry, i.e. 0 for x == 0 and all ones otherwise.  The complement is
 * returned.
 *
 * @param x Value to test.
 * @return All ones if x == 0, 0 otherwise.
 */
 static __inline__ mask_t is_zero(uint64_t x) {
  __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
  return ~x;
 }

 #endif /* __X86_64_ARITH_H__ */
--- a/src/p480/f_arithmetic.c
+++ b/src/p480/f_arithmetic.c
@@ -0,0 +1,43 @@
 /**
 * @cond internal
 * @file f_arithmetic.c
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @brief Field-specific arithmetic.
 */

 #include "ec_point.h"

 /**
 * Field-specific inverse-square-root exponentiation.
 *
 * Raises x to the fixed power 2^478 - 2^238 - 1 using an addition chain of
 * squarings and multiplications.  That exponent equals (p-3)/4 for
 * p = 2^480 - 2^240 - 1, the modulus spelled out by FIELD_MODULUS in the
 * p480 magic.c.
 *
 * The per-line exponent notes assume field_sqrn(out, in, n) squares its
 * input n times; field_sqrn is defined elsewhere -- confirm there.
 *
 * @param [out] a Receives x^(2^478 - 2^238 - 1).
 * @param [in]  x The field element to exponentiate.
 */
 void 
 field_isr (
    struct field_t*       a,
    const struct field_t* x
 ) {
    struct field_t L0, L1, L2, L3;
    field_sqr  (   &L2,     x );          /* L2 = x^2 */
    field_mul  (   &L1,     x,   &L2 );   /* L1 = x^(2^2-1) */
    field_sqrn (   &L0,   &L1,     2 );   /* L0 = L1^(2^2) */
    field_mul  (   &L2,   &L1,   &L0 );   /* L2 = x^(2^4-1) */
    field_sqrn (   &L0,   &L2,     4 );   /* L0 = L2^(2^4) */
    field_mul  (   &L1,   &L2,   &L0 );   /* L1 = x^(2^8-1) */
    field_sqr  (   &L0,   &L1 );          /* L0 = L1^2 */
    field_mul  (   &L2,     x,   &L0 );   /* L2 = x^(2^9-1) */
    field_sqrn (   &L0,   &L2,     8 );   /* L0 = L2^(2^8) */
    field_mul  (   &L2,   &L1,   &L0 );   /* L2 = x^(2^17-1) */
    field_sqrn (   &L0,   &L2,    17 );   /* L0 = L2^(2^17) */
    field_mul  (   &L1,   &L2,   &L0 );   /* L1 = x^(2^34-1) */
    field_sqrn (   &L0,   &L1,    17 );   /* L0 = L1^(2^17) */
    field_mul  (   &L1,   &L2,   &L0 );   /* L1 = x^(2^51-1) */
    field_sqrn (   &L3,   &L1,    17 );   /* L3 = L1^(2^17) */
    field_mul  (   &L0,   &L2,   &L3 );   /* L0 = x^(2^68-1) */
    field_sqrn (   &L2,   &L0,    51 );   /* L2 = L0^(2^51) */
    field_mul  (   &L0,   &L1,   &L2 );   /* L0 = x^(2^119-1) */
    field_sqrn (   &L1,   &L0,   119 );   /* L1 = L0^(2^119) */
    field_mul  (   &L2,   &L0,   &L1 );   /* L2 = x^(2^238-1) */
    field_sqr  (   &L0,   &L2 );          /* L0 = L2^2 */
    field_mul  (   &L1,     x,   &L0 );   /* L1 = x^(2^239-1) */
    field_sqrn (   &L0,   &L1,   239 );   /* L0 = x^(2^478-2^239) */
    field_mul  (     a,   &L2,   &L0 );   /* a  = x^(2^478-2^238-1) */
 }
--- a/src/p480/f_field.h
+++ b/src/p480/f_field.h
@@ -0,0 +1,39 @@
 /**
 * @file f_field.h
 * @brief Field-specific code.
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 */
 #ifndef __F_FIELD_H__
 #define __F_FIELD_H__ 1

 #include <string.h>
 #include "constant_time.h"

 #include "p480.h"
 /*
 * Field selection for p480: FIELD_BITS is the field size in bits, and each
 * generic field_* name below is an alias for the matching p480_* symbol, so
 * code written against field_* resolves to the p480 implementation.
 */
 #define FIELD_BITS           480
 #define field_t              p480_t
 #define field_mul            p480_mul
 #define field_sqr            p480_sqr
 #define field_add            p480_add
 #define field_sub            p480_sub
 #define field_mulw           p480_mulw
 #define field_addw           p480_addw
 #define field_subw           p480_subw
 #define field_neg            p480_neg
 #define field_set_ui         p480_set_ui
 #define field_bias           p480_bias
 #define field_cond_neg       p480_cond_neg
 #define field_inverse        p480_inverse
 #define field_eq             p480_eq
 #define field_isr            p480_isr
 #define field_simultaneous_invert p480_simultaneous_invert
 #define field_weak_reduce    p480_weak_reduce
 #define field_strong_reduce  p480_strong_reduce
 #define field_serialize      p480_serialize
 #define field_deserialize    p480_deserialize
 #define field_is_zero        p480_is_zero

 #endif /* __F_FIELD_H__ */
--- a/src/p480/f_magic.h
+++ b/src/p480/f_magic.h
@@ -0,0 +1,35 @@
 /**
 * @file f_magic.h
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @brief Goldilocks magic numbers (group orders, coefficients, algo params etc).
 */

 #ifndef __GOLDI_F_MAGIC_H__
 #define __GOLDI_F_MAGIC_H__ 1

 #include "field.h"
 #include "ec_point.h"

 /**
 * @brief The Edwards "d" term for this curve.
 */
 static const int64_t EDWARDS_D = 53825;

 /** @brief The number of combs to use for signed comb algo */
 #define COMB_N (USE_BIG_COMBS ? 6  : 5)

 /** @brief The number of teeth of the combs for signed comb algo */
 #define COMB_T (USE_BIG_COMBS ? 5  : 4)

 /** @brief The spacing of the combs for signed comb algo */
 #define COMB_S (USE_BIG_COMBS ? 16 : 24)

 /**
 * @brief crandom magic structure guard constant = "return 4", cf xkcd #221
 */
 #define CRANDOM_MAGIC 0x72657475726e2034ull

 #endif /* __GOLDI_F_MAGIC_H__ */
--- a/src/p480/magic.c
+++ b/src/p480/magic.c
@@ -0,0 +1,68 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "field.h"
 #include "magic.h"
 #include "barrett_field.h"

 /* FUTURE: automatically generate this file? */

 /*
  * Byte encoding of the field modulus: 60 bytes, all 0xFF except the
  * single 0xfe flagged with the marker comment (byte index 30).
  * Read least-significant byte first, that is 2^480 - 2^240 - 1.
  * NOTE(review): the little-endian reading is assumed from the position
  * of the 0xfe byte -- confirm against field_serialize.
  */
 const uint8_t FIELD_MODULUS[FIELD_BYTES] = {
      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 /*!*/ 0xfe, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
 };

 /*
  * Two scalar-sized constants stored back to back (the array holds
  * 2*SCALAR_WORDS words; the blank line separates the two halves).
  *
  * The last entry of each half is written as a bare word rather than
  * through U64LE.  NOTE(review): presumably this keeps the initializer
  * count equal to SCALAR_WORDS both when U64LE yields one word and when
  * it yields two -- confirm against the U64LE definition.
  *
  * NOTE(review): how these values are derived is not visible in this
  * file; see the scalar multiplication code that consumes them.
  */
 const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = {
    U64LE(0x58b51bc56ea8f0c4),
    U64LE(0xd361f6a2348b50c9),
    U64LE(0x08089c139c0002ae),
    U64LE(0x0001d2ac3d9503a0),
    U64LE(0x0000000000000000),
    U64LE(0x0000000000000000),
    U64LE(0x0000000000000000),
    0x40000000,
    
    U64LE(0xcb9c25073e36965b),
    U64LE(0x6f2d48d8460f1661),
    U64LE(0x0ab6256f7aaaae3e),
    U64LE(0x00026e3afcc6af80),
    U64LE(0x0000000000000000),
    U64LE(0x0000000000000000),
    U64LE(0x0000000000000000),
    0x00000000
 };

 /*
  * Base point in affine form.  The first coordinate is spelled out as
  * eight U60LE limbs; the second coordinate is the small constant 5
  * (remaining limbs are zero-initialized).
  *
  * NOTE(review): the first coordinate is written with U60LE and exactly
  * eight limbs, so this initializer looks tied to one particular limb
  * layout -- confirm before building for a different arch.
  */
 const struct affine_t goldilocks_base_point = {
    {{
        U60LE(0x849ff7f845c30d3),
        U60LE(0x7dda488553a4c5b),
        U60LE(0x1d3a2d9844831ea),
        U60LE(0xb33ecf6ade470a2),
        U60LE(0x8b3cb95210bd3c3),
        U60LE(0xfc955e59aeefa65),
        U60LE(0x3ab247cd530013c),
        U60LE(0x7ca42af3d564280)
    }},
    {{ 5 }}
 };

 /*
  * Low words used to describe the prime group order; the array is sized
  * to hold 240 bits, rounded up to whole words.
  */
 static const word_t curve_prime_order_lo[(240+WORD_BITS-1)/WORD_BITS] = {
    U64LE(0x72e70941cf8da597),
    U64LE(0x9bcb52361183c598),
    U64LE(0x02ad895bdeaaab8f),
    U64LE(0x9b8ebf31abe0)
 };
 /*
  * Barrett descriptor for the prime group order.  The four initializers
  * are, in order: a word count, a bit shift, the number of words in
  * curve_prime_order_lo, and a pointer to that array.
  * NOTE(review): the exact meaning of the first two fields is defined by
  * struct barrett_prime_t in barrett_field.h; presumably the order is a
  * power of two minus the value in curve_prime_order_lo -- confirm there.
  */
 const struct barrett_prime_t curve_prime_order = {
    GOLDI_FIELD_WORDS,
    30 % WORD_BITS,
    sizeof(curve_prime_order_lo)/sizeof(curve_prime_order_lo[0]),
    curve_prime_order_lo
 };

 /*
  * Square root of (d - 1) as a field element.  With EDWARDS_D = 53825
  * (f_magic.h), d - 1 = 53824 = 232^2, so the root is the small integer
  * 232 and only the first limb is nonzero.
  */
 const struct field_t
 sqrt_d_minus_1 = {{
    232 /* Whoa, it comes out even. */
 }};
--- a/src/p521/f_arithmetic.c
+++ b/src/p521/f_arithmetic.c
@@ -0,0 +1,43 @@
 /**
 * @cond internal
 * @file f_arithmetic.c
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @brief Field-specific arithmetic.
 */

 #include "ec_point.h"

 /**
  * @brief Raise x to the fixed power 2^519 - 1 and store the result in a.
  *
  * The exponent is built with a square-and-multiply addition chain; the
  * per-line comments track the power of x held in each temporary.  They
  * assume field_sqr squares, field_mul multiplies, and
  * field_sqrn(out,in,n) squares n times (defined elsewhere -- confirm).
  *
  * If the modulus is p = 2^521 - 1 (assumed from FIELD_BITS -- confirm),
  * then 2^519 - 1 = (p-3)/4, so x * a^2 = x^((p-1)/2), which is +1 or -1
  * for nonzero x: a is an inverse square root of +x or -x.
  *
  * @param [out] a Receives x^(2^519 - 1).
  * @param [in]  x The input; read again late in the chain, so whether a
  *                may alias x depends on field_mul -- not verified here.
  */
 void 
 field_isr (
    struct field_t*       a,
    const struct field_t* x
 ) {
    struct field_t L0, L1, L2;
    field_sqr  (   &L1,     x );         /* L1 = x^2                     */
    field_mul  (   &L0,     x,   &L1 );  /* L0 = x^(2^2 - 1)             */
    field_sqrn (   &L2,   &L0,     2 );  /* L2 = x^(2^4 - 2^2)           */
    field_mul  (   &L1,   &L0,   &L2 );  /* L1 = x^(2^4 - 1)             */
    field_sqrn (   &L2,   &L1,     4 );  /* L2 = x^(2^8 - 2^4)           */
    field_mul  (   &L0,   &L1,   &L2 );  /* L0 = x^(2^8 - 1)             */
    field_sqrn (   &L2,   &L0,     8 );  /* L2 = x^(2^16 - 2^8)          */
    field_mul  (   &L1,   &L0,   &L2 );  /* L1 = x^(2^16 - 1)            */
    field_sqrn (   &L2,   &L1,    16 );  /* L2 = x^(2^32 - 2^16)         */
    field_mul  (   &L0,   &L1,   &L2 );  /* L0 = x^(2^32 - 1)            */
    field_sqrn (   &L2,   &L0,    32 );  /* L2 = x^(2^64 - 2^32)         */
    field_mul  (   &L1,   &L0,   &L2 );  /* L1 = x^(2^64 - 1)            */
    field_sqr  (   &L2,   &L1 );         /* L2 = x^(2^65 - 2)            */
    field_mul  (   &L0,     x,   &L2 );  /* L0 = x^(2^65 - 1)            */
    field_sqrn (   &L2,   &L0,    64 );  /* L2 = x^(2^129 - 2^64)        */
    field_mul  (   &L0,   &L1,   &L2 );  /* L0 = x^(2^129 - 1)           */
    field_sqrn (   &L2,   &L0,   129 );  /* L2 = x^(2^258 - 2^129)       */
    field_mul  (   &L1,   &L0,   &L2 );  /* L1 = x^(2^258 - 1)           */
    field_sqr  (   &L2,   &L1 );         /* L2 = x^(2^259 - 2)           */
    field_mul  (   &L0,     x,   &L2 );  /* L0 = x^(2^259 - 1)           */
    field_sqrn (   &L2,   &L0,   259 );  /* L2 = x^(2^518 - 2^259)       */
    field_mul  (   &L1,   &L0,   &L2 );  /* L1 = x^(2^518 - 1)           */
    field_sqr  (   &L0,   &L1 );         /* L0 = x^(2^519 - 2)           */
    field_mul  (     a,     x,   &L0 );  /* a  = x^(2^519 - 1)           */
 }
--- a/src/p521/f_field.h
+++ b/src/p521/f_field.h
@@ -0,0 +1,39 @@
 /**
 * @file f_field.h
 * @brief Field-specific code.
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 */
 #ifndef __F_FIELD_H__
 #define __F_FIELD_H__ 1

 #include <string.h>
 #include "constant_time.h"

 /*
  * Generic-name bindings for the 521-bit field.
  *
  * Every field_* name below is a plain textual alias for the matching
  * p521_* name, so code written against the field_* names is compiled
  * against this field without source changes.  The p521_* names
  * themselves are presumably declared in "p521.h" -- confirm there.
  */
 #include "p521.h"
 #define FIELD_BITS           521
 #define field_t              p521_t
 #define field_mul            p521_mul
 #define field_sqr            p521_sqr
 #define field_add            p521_add
 #define field_sub            p521_sub
 #define field_mulw           p521_mulw
 #define field_addw           p521_addw
 #define field_subw           p521_subw
 #define field_neg            p521_neg
 #define field_set_ui         p521_set_ui
 #define field_bias           p521_bias
 #define field_cond_neg       p521_cond_neg
 #define field_inverse        p521_inverse
 #define field_eq             p521_eq
 #define field_isr            p521_isr
 #define field_simultaneous_invert p521_simultaneous_invert
 #define field_weak_reduce    p521_weak_reduce
 #define field_strong_reduce  p521_strong_reduce
 #define field_serialize      p521_serialize
 #define field_deserialize    p521_deserialize
 #define field_is_zero        p521_is_zero

 #endif /* __F_FIELD_H__ */
--- a/test/bench.c
+++ b/test/bench.c
@@ -39,13 +39,12 @@ static void q448_randomize( struct crandom_state_t *crand, word_t sk[SCALAR_WORD
 }

 static void field_print( const char *descr, const struct field_t *a ) {
    field_t b;
    field_copy(&b, a);
    field_strong_reduce(&b);
    int j;
    unsigned char ser[FIELD_BYTES];
    field_serialize(ser,a);
    printf("%s = 0x", descr);
    for (j=sizeof(*a)/sizeof(a->limb[0])-1; j>=0; j--) {
        printf(PRIxWORD58, b.limb[j]);
    for (j=FIELD_BYTES - 1; j>=0; j--) {
        printf("%02x", ser[j]);
    }
    printf("\n");
 }
@@ -58,7 +57,7 @@ field_print_full (
    int j;
    printf("%s = 0x", descr);
    for (j=15; j>=0; j--) {
        printf("%02" PRIxWORD "_" PRIxWORD58 " ",
        printf("%02" PRIxWORD "_" PRIxWORD56 " ",
            a->limb[j]>>28, a->limb[j]&((1<<28)-1));
    }
    printf("\n");
--- a/test/test.c
+++ b/test/test.c
@@ -84,13 +84,12 @@ void field_print (
    const char *descr,
    const struct field_t *a
 ) {
    field_t b;
    field_copy(&b, a);
    field_strong_reduce(&b);
    int j;
    unsigned char ser[FIELD_BYTES];
    field_serialize(ser,a);
    printf("%s = 0x", descr);
    for (j=FIELD_WORDS - 1; j>=0; j--) {
        printf(PRIxWORD58, b.limb[LIMBPERM(j)]);
    for (j=FIELD_BYTES - 1; j>=0; j--) {
        printf("%02x", ser[j]);
    }
    printf("\n");
 }
--- a/test/test_arithmetic.c
+++ b/test/test_arithmetic.c
@@ -22,6 +22,8 @@ static mask_t mpz_to_field (

 static mask_t field_assert_eq_gmp(
    const char *descr,
    const struct field_t *a,
    const struct field_t *b,
    const struct field_t *x,
    const mpz_t y,
    float lowBound,
@@ -40,7 +42,7 @@ static mask_t field_assert_eq_gmp(
    
    unsigned int i;
    for (i=0; i<sizeof(*x)/sizeof(x->limb[0]); i++) {
        int radix_bits = sizeof(x->limb[0]) * 448 / sizeof(*x);
        int radix_bits = sizeof(x->limb[0]) * FIELD_BITS / sizeof(*x);
        word_t yardstick = (i==sizeof(*x)/sizeof(x->limb[0])/2) ?
            (1ull<<radix_bits) - 2 : (1ull<<radix_bits) - 1; // FIELD_MAGIC
        if (x->limb[i] < yardstick * lowBound || x->limb[i] > yardstick * highBound) {
@@ -54,6 +56,8 @@ static mask_t field_assert_eq_gmp(
    if (memcmp(xser,yser,FIELD_BYTES)) {
        youfail();
        printf("    Failed arithmetic test %s\n", descr);
        field_print("    a", a);
        field_print("    b", b);
        field_print("    goldi", x);
        printf("    gmp   = 0x");
        int j;
@@ -82,28 +86,30 @@ static mask_t test_add_sub (
    
    field_add(&tt,&xx,&yy);
    mpz_add(t,x,y);
    succ &= field_assert_eq_gmp("add",&tt,t,0,2.1);
    succ &= field_assert_eq_gmp("add",&xx,&yy,&tt,t,0,2.1);
    
    field_sub(&tt,&xx,&yy);
    field_bias(&tt,2);
    mpz_sub(t,x,y);
    succ &= field_assert_eq_gmp("sub",&tt,t,0,3.1);
    succ &= field_assert_eq_gmp("sub",&xx,&yy,&tt,t,0,3.1);
    
    field_copy(&tt,&xx);
    field_addw(&tt,word);
    mpz_add_ui(t,x,word);
    succ &= field_assert_eq_gmp("addw",&tt,t,0,2.1);
    succ &= field_assert_eq_gmp("addw",&xx,&yy,&tt,t,0,2.1);
    
    field_copy(&tt,&xx);
    field_subw(&tt,word);
    field_bias(&tt,1);
    mpz_sub_ui(t,x,word);
    succ &= field_assert_eq_gmp("subw",&tt,t,0,2.1);
    
    succ &= field_assert_eq_gmp("subw",&xx,&yy,&tt,t,0,2.1);

    /*
    if (!succ) {
        field_print("    x", &xx);
        field_print("    y", &yy);
    }
    */
    
    mpz_clear(t);
    
@@ -124,19 +130,19 @@ static mask_t test_mul_sqr (
    
    field_mul(&tt,&xx,&yy);
    mpz_mul(t,x,y);
    succ &= field_assert_eq_gmp("mul",&tt,t,0,1.1);
    succ &= field_assert_eq_gmp("mul",&xx,&yy,&tt,t,0,1.1);
    
    field_mulw(&tt,&xx,word);
    mpz_mul_ui(t,x,word);
    succ &= field_assert_eq_gmp("mulw",&tt,t,0,1.1);
    succ &= field_assert_eq_gmp("mulw",&xx,&yy,&tt,t,0,1.1);
    
    field_sqr(&tt,&xx);
    mpz_mul(t,x,x);
    succ &= field_assert_eq_gmp("sqrx",&tt,t,0,1.1);
    succ &= field_assert_eq_gmp("sqrx",&xx,&yy,&tt,t,0,1.1);
    
    field_sqr(&tt,&yy);
    mpz_mul(t,y,y);
    succ &= field_assert_eq_gmp("sqy",&tt,t,0,1.1);
    succ &= field_assert_eq_gmp("sqy",&xx,&yy,&tt,t,0,1.1);
    
    if (!succ) {
        field_print("    x", &xx);
@@ -148,6 +154,36 @@ static mask_t test_mul_sqr (
    return succ;
 }

 /**
  * Check field_isr on one input: with s = isr(x), the product x * s^2
  * must equal +1 or -1 (or x itself must be zero).
  *
  * The check is run on yy = x * s^2 and is AND-ed with the result of the
  * mpz -> field conversion, so a wrong field_isr is actually reported.
  * Previously the conversion mask was OR-ed with the comparisons (so the
  * test could not fail once the conversion succeeded) and the
  * comparisons looked at s^2 instead of x * s^2.
  *
  * @param x The value to test, as a GMP integer.
  * @return All-ones mask on success, zero on failure (same convention as
  *         the other test_* helpers in this file).
  */
 static mask_t test_isr (
    const mpz_t x
 ) {
    struct field_t xx,yy,ss,tt;
    mask_t succ, ok;
    succ = mpz_to_field(&xx,x);
    
    field_isr(&ss,&xx);
    field_sqr(&tt,&ss);       /* tt = s^2     */
    field_mul(&yy,&xx,&tt);   /* yy = x * s^2 */
    
    /* isr(0) is 0, so x * s^2 is 0 rather than +-1; accept that case. */
    ok = field_is_zero(&xx);
    
    field_addw(&yy,1);
    ok |= field_is_zero(&yy); /* x * s^2 == -1 */
    
    field_subw(&yy,2);
    field_bias(&yy,1);        /* keep limbs nonnegative after subtracting */
    ok |= field_is_zero(&yy); /* x * s^2 == +1 */
    
    field_addw(&yy,1);        /* undo the +1/-2 so yy prints as x * s^2 */
    succ &= ok;
    if (~succ) {
        youfail();
        printf("ISR failure.\n");
        field_print("    x", &xx);
        field_print("    s", &ss);
        field_print("    t", &tt);
        field_print("    y", &yy);
    }
    
    return succ;
 }

 int test_arithmetic (void) {
    int j, ntests = 100000;
    
@@ -168,8 +204,8 @@ int test_arithmetic (void) {
        if (j<256) {
            mpz_set_ui(x,0);
            mpz_set_ui(y,0);
            mpz_setbit(x,(j%16)*28); // FIELD_MAGIC
            mpz_setbit(y,(j/16)*28); // FIELD_MAGIC
            mpz_setbit(x,(j%16)*28);
            mpz_setbit(y,(j/16)*28);
        } else if (j&1) {
            mpz_rrandomb(x, state, FIELD_BITS);
            mpz_rrandomb(y, state, FIELD_BITS);
@@ -183,6 +219,9 @@ int test_arithmetic (void) {
        succ &= test_add_sub(x,y,word);
        succ &= test_mul_sqr(x,y,word);
        
        if (j < 1000)
            succ &= test_isr(x);
        
        // TODO: test neg, cond_neg, set_ui, wrd, srd, inv, ...?
    }
    
--- a/test/test_pointops.c
+++ b/test/test_pointops.c
@@ -3,6 +3,7 @@
 #include <stdio.h>

 #include "ec_point.h"
 #include "magic.h"
 #include "field.h"
 #include "crandom.h"

@@ -256,6 +257,15 @@ int test_pointops (void) {
    struct crandom_state_t crand;
    crandom_init_from_buffer(&crand, "test_pointops random initializer");
    
    struct extensible_t ext_base;
    if (!validate_affine(&goldilocks_base_point)) {
        youfail();
        printf("  Base point isn't on the curve.\n");
        return -1;
    }
    convert_affine_to_extensible(&ext_base, &goldilocks_base_point);
    if (!validate_ext(&ext_base, 2, "base")) return -1;
    
    int i, ret;
    for (i=0; i<1000; i++) {
        uint8_t ser[FIELD_BYTES];
--- a/test/test_scalarmul.c
+++ b/test/test_scalarmul.c
@@ -39,8 +39,14 @@ single_scalarmul_compatibility_test (
    if (!succ) {
        return 1;
    }
    
    struct { int n,t,s; } params[] = {{5,5,18},{3,5,30},{4,4,28},{1,2,224}}; // FIELD_MAGIC

 #if FIELD_BITS == 448
    struct { int n,t,s; } params[] = {{5,5,18},{3,5,30},{4,4,28},{1,2,224}};
 #elif FIELD_BITS == 480
    struct { int n,t,s; } params[] = {{5,6,16},{6,5,16},{4,5,24},{4,4,30},{1,2,240}};
 #else
    struct { int n,t,s; } params[] = {{5,5,(SCALAR_BITS+24)/25},{1,2,(SCALAR_BITS+1)/2}};
 #endif
    const int nparams = sizeof(params)/sizeof(params[0]);
    struct fixed_base_table_t fbt;
    const int nsizes = 6;