From 1f480b0f95fb66adf8a01e3d32cf9621629821de Mon Sep 17 00:00:00 2001
From: Mike Hamburg <mike@shiftleft.org>
Date: Thu, 23 Oct 2014 17:41:51 -0700
Subject: [PATCH] Big changes for curve flexibility.  For details see
 HISTORY.txt.

Very experimental Ed480-Ridinghood support is now in.  It's not fully optimized,
but in general the current build is 8-15% slower than Goldilocks.  It only works on
arch_x86_64, though arch_ref64 support ought to be easy.  Support on other arches
will be trickier, which is of course why I chose Goldilocks over Ridinghood in the
first place.

Next up, E-521.  Hopefully.

The code is starting to get spread out over a lot of files.  Some are per field*arch,
some per field, some per curve, some global.  It's hard to do much about this, though,
with a rather ugly .c.inc system.

There's currently no way to make a Ridinghood eBAT.  In fact, I haven't tested eBAT
support in this commit.  I also haven't tested NEON, but at least ARCH_32 works on
Intel.
---
 HISTORY.txt                                   |  41 ++
 Makefile                                      |  13 +-
 include/goldilocks.h                          |  10 +-
 include/ridinghood.h                          | 376 +++++++++++++++
 src/ec_point.c                                |  87 +---
 src/include/ec_point.h                        |   2 -
 src/include/field.h                           |  74 +--
 src/include/magic.h                           |  47 +-
 src/include/word.h                            |  11 +-
 src/{ => p448}/arch_32/arch_config.h          |   0
 src/{ => p448}/arch_32/p448.c                 |   0
 src/{ => p448}/arch_32/p448.h                 |   0
 src/{ => p448}/arch_arm_32/arch_config.h      |   0
 src/{ => p448}/arch_arm_32/p448.c             |   0
 src/{ => p448}/arch_arm_32/p448.h             |   0
 src/{ => p448}/arch_neon/arch_config.h        |   0
 src/{ => p448}/arch_neon/neon_emulation.h     |   0
 src/{ => p448}/arch_neon/p448.c               |   0
 src/{ => p448}/arch_neon/p448.h               |   0
 .../arch_neon_experimental/arch_config.h      |   0
 src/{ => p448}/arch_neon_experimental/p448.c  |   0
 src/{ => p448}/arch_neon_experimental/p448.h  |   0
 src/{ => p448}/arch_ref64/arch_config.h       |   0
 src/{ => p448}/arch_ref64/p448.c              |   0
 src/{ => p448}/arch_ref64/p448.h              |   0
 src/{ => p448}/arch_x86_64/arch_config.h      |   0
 src/{ => p448}/arch_x86_64/p448.c             |   0
 src/{ => p448}/arch_x86_64/p448.h             |   0
 src/{ => p448}/arch_x86_64/x86-64-arith.h     |   0
 src/p448/f_arithmetic.c                       |  43 ++
 src/p448/f_field.h                            |  39 ++
 src/p448/f_magic.h                            |  35 ++
 src/p448/field.h                              | 123 +++++
 src/{ => p448}/magic.c                        |  24 +-
 src/p480/arch_x86_64/arch_config.h            |   1 +
 src/p480/arch_x86_64/p480.c                   | 435 ++++++++++++++++++
 src/p480/arch_x86_64/p480.h                   | 257 +++++++++++
 src/p480/arch_x86_64/x86-64-arith.h           | 279 +++++++++++
 src/p480/f_arithmetic.c                       |  43 ++
 src/p480/f_field.h                            |  39 ++
 src/p480/f_magic.h                            |  35 ++
 src/p480/magic.c                              |  68 +++
 src/p521/f_arithmetic.c                       |  43 ++
 src/p521/f_field.h                            |  39 ++
 test/bench.c                                  |  11 +-
 test/test.c                                   |   9 +-
 test/test_arithmetic.c                        |  63 ++-
 test/test_pointops.c                          |  10 +
 test/test_scalarmul.c                         |  10 +-
 49 files changed, 2082 insertions(+), 185 deletions(-)
 create mode 100644 include/ridinghood.h
 rename src/{ => p448}/arch_32/arch_config.h (100%)
 rename src/{ => p448}/arch_32/p448.c (100%)
 rename src/{ => p448}/arch_32/p448.h (100%)
 rename src/{ => p448}/arch_arm_32/arch_config.h (100%)
 rename src/{ => p448}/arch_arm_32/p448.c (100%)
 rename src/{ => p448}/arch_arm_32/p448.h (100%)
 rename src/{ => p448}/arch_neon/arch_config.h (100%)
 rename src/{ => p448}/arch_neon/neon_emulation.h (100%)
 rename src/{ => p448}/arch_neon/p448.c (100%)
 rename src/{ => p448}/arch_neon/p448.h (100%)
 rename src/{ => p448}/arch_neon_experimental/arch_config.h (100%)
 rename src/{ => p448}/arch_neon_experimental/p448.c (100%)
 rename src/{ => p448}/arch_neon_experimental/p448.h (100%)
 rename src/{ => p448}/arch_ref64/arch_config.h (100%)
 rename src/{ => p448}/arch_ref64/p448.c (100%)
 rename src/{ => p448}/arch_ref64/p448.h (100%)
 rename src/{ => p448}/arch_x86_64/arch_config.h (100%)
 rename src/{ => p448}/arch_x86_64/p448.c (100%)
 rename src/{ => p448}/arch_x86_64/p448.h (100%)
 rename src/{ => p448}/arch_x86_64/x86-64-arith.h (100%)
 create mode 100644 src/p448/f_arithmetic.c
 create mode 100644 src/p448/f_field.h
 create mode 100644 src/p448/f_magic.h
 create mode 100644 src/p448/field.h
 rename src/{ => p448}/magic.c (82%)
 create mode 100644 src/p480/arch_x86_64/arch_config.h
 create mode 100644 src/p480/arch_x86_64/p480.c
 create mode 100644 src/p480/arch_x86_64/p480.h
 create mode 100644 src/p480/arch_x86_64/x86-64-arith.h
 create mode 100644 src/p480/f_arithmetic.c
 create mode 100644 src/p480/f_field.h
 create mode 100644 src/p480/f_magic.h
 create mode 100644 src/p480/magic.c
 create mode 100644 src/p521/f_arithmetic.c
 create mode 100644 src/p521/f_field.h

diff --git a/HISTORY.txt b/HISTORY.txt
index 9a4a24e..c6eba60 100644
--- a/HISTORY.txt
+++ b/HISTORY.txt
@@ -1,3 +1,44 @@
+October 23, 2014:
+    Pushing through changes for curve flexibility.  First up is
+    Ed480-Ridinghood, because it has the same number of words.  Next
+    is E-521.
+    
+    Experimental support for Ed480-Ridinghood.  To use, compile with
+        make ... FIELD=p480 -XCFLAGS=-DGOLDI_FIELD_BITS=480
+    
+    I still need to figure out what to do about the fact that the library
+    is called "goldilocks", but it will soon support curves that are not
+    ed448-goldilocks, at least experimentally.
+        
+    Currently the whole system's header "goldilocks.h" doesn't have
+    a simpler way to override field size, but it does work (as a hack)
+    with -DGOLDI_FIELD_BITS=...
+    
+    There is no support yet for coexistence of multiple fields in one
+    library.  The field routines will have unique names, but scalarmul*
+    won't, and the top-level goldilocks routines have fixed names.
+    
+    Current timings on Haswell:
+        Goldilocks: 178kcy keygen, 536kcy ecdh
+        Ridinghood: 193kcy keygen, 617kcy ecdh
+    
+    Note that Ridinghood ECDH does worse than 480/448.  This is at least
+    in part because I haven't calculated the overflow handling limits yet
+    in ec_point.h (this is a disadvantage of dropping the automated
+    tool for generating that file).  So I'm reducing much more often
+    than I need to.  (There's a really loud TODO in ec_point.h for that.)
+    
+    Also, I haven't tested the limits on these reductions in a while, so
+    it could be that there are actual (security-critical) bugs in this
+    area, at least for p448.  Now that there's field flexibility, it's
+    probably a good idea to make a field impl with extra words to check
+    this.
+    
+    Furthermore, field_mulw_scc will perform differently on these two
+    curves based on whether the curve constant is positive or negative.
+    I should probably go optimize the "hot" routines like montgomery_step
+    to have separate cases for positive and negative.
+
 September 29, 2014:
     Yesterday I put in some more architecture detection, but it should
     really be based on the arch directory, because what's in there really
diff --git a/Makefile b/Makefile
index b9d53a9..83a8066 100644
--- a/Makefile
+++ b/Makefile
@@ -20,12 +20,13 @@ else
 ARCH ?= arch_arm_32
 endif
 
+FIELD ?= p448
 
 WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
 	 -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)
 	 
 	 
-INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH)
+INCFLAGS = -Isrc/include -Iinclude -Isrc/$(FIELD) -Isrc/$(FIELD)/$(ARCH)
 LANGFLAGS = -std=c99 -fno-strict-aliasing
 GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC
 OFLAGS = -O3
@@ -63,7 +64,8 @@ ASFLAGS = $(ARCHFLAGS)
 HEADERS= Makefile $(shell find . -name "*.h") build/timestamp
 
 LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \
-  build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o build/magic.o build/arithmetic.o
+  build/$(FIELD).o build/ec_point.o build/scalarmul.o build/sha512.o build/magic.o \
+	build/f_arithmetic.o build/arithmetic.o
 
 TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \
 	build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o build/magic.o
@@ -113,7 +115,10 @@ build/%.s: src/%.c $(HEADERS)
 build/%.s: test/%.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $<
 
-build/%.s: src/$(ARCH)/%.c $(HEADERS)
+build/%.s: src/$(FIELD)/$(ARCH)/%.c $(HEADERS)
+	$(CC) $(CFLAGS) -S -c -o $@ $<
+
+build/%.s: src/$(FIELD)/%.c $(HEADERS)
 	$(CC) $(CFLAGS) -S -c -o $@ $<
 
 doc/timestamp:
@@ -131,7 +136,7 @@ $(BATNAME): include/* src/* src/*/* test/batarch.map
           targ="$@/crypto_$$prim/ed448goldilocks"; \
 	  (while read arch where; do \
 	    mkdir -p $$targ/`basename $$arch`; \
-	    cp include/*.h src/*.c src/include/*.h src/bat/$$prim.c src/$$where/*.c src/$$where/*.h $$targ/`basename $$arch`; \
+	    cp include/*.h src/*.c src/include/*.h src/bat/$$prim.c src/p448/$$where/*.c src/p448/$$where/*.h src/p448/*.c src/p448/*.h $$targ/`basename $$arch`; \
 	    cp src/bat/api_$$prim.h $$targ/`basename $$arch`/api.h; \
 	    perl -p -i -e 's/.*endif.*GOLDILOCKS_CONFIG_H/#define SUPERCOP_WONT_LET_ME_OPEN_FILES 1\n\n$$&/' $$targ/`basename $$arch`/config.h; \
 	    perl -p -i -e 's/SYSNAME/'`basename $(BATNAME)`_`basename $$arch`'/g' $$targ/`basename $$arch`/api.h;  \
diff --git a/include/goldilocks.h b/include/goldilocks.h
index 2c3919c..1631c2f 100644
--- a/include/goldilocks.h
+++ b/include/goldilocks.h
@@ -22,14 +22,18 @@
 #define GOLDI_IMPLEMENT_SIGNATURES 1
 #endif
 
-/** The size of the Goldilocks field, in bits. */
+/** The size of the Goldilocks field, in bits. 
+ * Ifdef'd so you can override when testing experimental Ed480-Ridinghood or E-521.
+ */
+#ifndef GOLDI_FIELD_BITS
 #define GOLDI_FIELD_BITS          448
+#endif
 
 /** The size of the Goldilocks scalars, in bits. */
-#define GOLDI_SCALAR_BITS         446
+#define GOLDI_SCALAR_BITS         (GOLDI_FIELD_BITS-2)
 
 /** The same size, in bytes. */
-#define GOLDI_FIELD_BYTES         (GOLDI_FIELD_BITS/8)
+#define GOLDI_FIELD_BYTES         ((GOLDI_FIELD_BITS+7)/8)
 
 /** The size of a Goldilocks public key, in bytes. */
 #define GOLDI_PUBLIC_KEY_BYTES    GOLDI_FIELD_BYTES
diff --git a/include/ridinghood.h b/include/ridinghood.h
new file mode 100644
index 0000000..2c3919c
--- /dev/null
+++ b/include/ridinghood.h
@@ -0,0 +1,376 @@
+/* Copyright (c) 2014 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+/**
+ * @file ridinghood.h
+ * @author Mike Hamburg
+ * @brief Goldilocks high-level functions.
+ */
+#ifndef __GOLDILOCKS_H__
+#define __GOLDILOCKS_H__ 1
+
+#include <stdint.h>
+
+#ifndef GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
+/** If nonzero, implement precomputation for verify and ECDH. */
+#define GOLDI_IMPLEMENT_PRECOMPUTED_KEYS 1
+#endif
+
+#ifndef GOLDI_IMPLEMENT_SIGNATURES
+/** If nonzero, implement signatures. */
+#define GOLDI_IMPLEMENT_SIGNATURES 1
+#endif
+
+/** The size of the Goldilocks field, in bits. */
+#define GOLDI_FIELD_BITS          448
+
+/** The size of the Goldilocks scalars, in bits. */
+#define GOLDI_SCALAR_BITS         446
+
+/** The same size, in bytes. */
+#define GOLDI_FIELD_BYTES         (GOLDI_FIELD_BITS/8)
+
+/** The size of a Goldilocks public key, in bytes. */
+#define GOLDI_PUBLIC_KEY_BYTES    GOLDI_FIELD_BYTES
+
+/** The extra bytes in a Goldilocks private key for the symmetric key. */
+#define GOLDI_SYMKEY_BYTES        32
+
+/** The size of a shared secret. */
+#define GOLDI_SHARED_SECRET_BYTES 64
+
+/** The size of a Goldilocks private key, in bytes. */
+#define GOLDI_PRIVATE_KEY_BYTES   (2*GOLDI_FIELD_BYTES + GOLDI_SYMKEY_BYTES)
+
+/** The size of a Goldilocks signature, in bytes. */
+#define GOLDI_SIGNATURE_BYTES     (2*GOLDI_FIELD_BYTES)
+
+/**
+ * @brief Serialized form of a Goldilocks public key.
+ *
+ * @warning This isn't even my final form!
+ */
+struct goldilocks_public_key_t {
+    uint8_t opaque[GOLDI_PUBLIC_KEY_BYTES]; /**< Serialized data. */
+};
+
+/**
+ * @brief Serialized form of a Goldilocks private key.
+ *
+ * Contains 56 bytes of actual private key, 56 bytes of
+ * public key, and 32 bytes of symmetric key for randomization.
+ *
+ * @warning This isn't even my final form!
+ */
+struct goldilocks_private_key_t {
+    uint8_t opaque[GOLDI_PRIVATE_KEY_BYTES]; /**< Serialized data. */
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @brief No error. */
+static const int GOLDI_EOK      = 0;
+
+/** @brief Error: your key or other state is corrupt. */
+static const int GOLDI_ECORRUPT = 44801;
+
+/** @brief Error: other party's key is corrupt. */
+static const int GOLDI_EINVAL   = 44802;
+
+/** @brief Error: not enough entropy. */
+static const int GOLDI_ENODICE  = 44804;
+
+/** @brief Error: you need to initialize the library first. */
+static const int GOLDI_EUNINIT  = 44805;
+
+/** @brief Error: called init() but we are already initialized.  NOTE: currently the same value (44805) as GOLDI_EUNINIT, so the two cannot be told apart by value. */
+static const int GOLDI_EALREADYINIT  = 44805;
+
+/**
+ * @brief Initialize Goldilocks' precomputed tables and
+ * random number generator.  This function must be called before
+ * any of the other Goldilocks routines (except
+ * goldilocks_shared_secret in the current version) and should be
+ * called only once per process.
+ *
+ * There is currently no way to tear down this state.  It is possible
+ * that a future version of this library will not require this function.
+ *
+ * @retval GOLDI_EOK Success.
+ * @retval GOLDI_EALREADYINIT Already initialized.
+ * @retval GOLDI_ECORRUPT Memory is corrupted, or another thread is already init'ing.
+ * @retval Nonzero An error occurred.
+ */
+int
+goldilocks_init (void)
+__attribute__((warn_unused_result,visibility ("default")));
+
+
+/**
+ * @brief Generate a new random keypair.
+ * @param [out] privkey The generated private key.
+ * @param [out] pubkey The generated public key.
+ *
+ * @warning This isn't even my final form!
+ *
+ * @retval GOLDI_EOK Success.
+ * @retval GOLDI_ENODICE Insufficient entropy.
+ * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
+ */
+int
+goldilocks_keygen (
+    struct goldilocks_private_key_t *privkey,
+    struct goldilocks_public_key_t *pubkey
+) __attribute__((warn_unused_result,nonnull(1,2),visibility ("default")));
+
+/**
+ * @brief Derive a key from its compressed form.
+ * @param [out] privkey The derived private key.
+ * @param [in] proto The compressed or proto-key, which must be 32 random bytes.
+ *
+ * @warning This isn't even my final form!
+ *
+ * @retval GOLDI_EOK Success.
+ * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
+ */
+int
+goldilocks_derive_private_key (
+    struct goldilocks_private_key_t *privkey,
+    const unsigned char proto[GOLDI_SYMKEY_BYTES]
+) __attribute__((nonnull(1,2),visibility ("default")));
+
+/**
+ * @brief Compress a private key (by copying out the proto-key)
+ * @param [out] proto The proto-key.
+ * @param [in] privkey The private key.
+ *
+ * @warning This isn't even my final form!
+ * @todo test.
+ *
+ * @note This function is declared void, so it reports no status code
+ *       (neither GOLDI_EOK nor GOLDI_EUNINIT is returned to the caller).
+ */
+void
+goldilocks_underive_private_key (
+    unsigned char proto[GOLDI_SYMKEY_BYTES],
+    const struct goldilocks_private_key_t *privkey
+) __attribute__((nonnull(1,2),visibility ("default")));
+
+/**
+ * @brief Extract the public key from a private key.
+ *
+ * This is essentially a memcpy from the public part of the privkey.
+ *    
+ * @param [out] pubkey The extracted public key.
+ * @param [in] privkey The private key.
+ *
+ * @retval GOLDI_EOK Success.
+ * @retval GOLDI_ECORRUPT The private key is corrupt.
+ */
+int
+goldilocks_private_to_public (
+    struct goldilocks_public_key_t *pubkey,
+    const struct goldilocks_private_key_t *privkey
+) __attribute__((nonnull(1,2),visibility ("default")));
+
+/**
+ * @brief Generate a Diffie-Hellman shared secret in constant time.
+ *
+ * This function uses some compile-time flags whose merit remains to
+ * be decided.
+ *
+ * If the flag EXPERIMENT_ECDH_OBLITERATE_CT is set, prepend 40 bytes
+ * of zeros to the secret before hashing.  In the case that the other
+ * party's key is detectably corrupt, instead the symmetric part
+ * of the secret key is used to produce a pseudorandom value.
+ *
+ * If EXPERIMENT_ECDH_STIR_IN_PUBKEYS is set, the sum and product of
+ * the two parties' public keys is prepended to the hash.
+ *
+ * In the current version, this function can safely be run even without
+ * goldilocks_init().  But this property is not guaranteed for future
+ * versions, so call it anyway.
+ *
+ * @warning This isn't even my final form!
+ *
+ * @param [out] shared The shared secret established with the other party.
+ * @param [in] my_privkey My private key.
+ * @param [in] your_pubkey The other party's public key.
+ *
+ * @retval GOLDI_EOK Success.
+ * @retval GOLDI_ECORRUPT My key is corrupt.
+ * @retval GOLDI_EINVAL   The other party's key is corrupt.
+ * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
+ */
+int
+goldilocks_shared_secret (
+    uint8_t shared[GOLDI_SHARED_SECRET_BYTES],
+    const struct goldilocks_private_key_t *my_privkey,
+    const struct goldilocks_public_key_t *your_pubkey
+) __attribute__((warn_unused_result,nonnull(1,2,3),visibility ("default")));
+
+#if GOLDI_IMPLEMENT_SIGNATURES
+/**
+ * @brief Sign a message.
+ *
+ * The signature is deterministic, using the symmetric secret found in the
+ * secret key to form a nonce.
+ *
+ * The technique used in signing is a modified Schnorr system, like EdDSA.
+ *
+ * @warning This isn't even my final form!
+ *
+ * @param [out] signature_out Space for the output signature.
+ * @param [in] message The message to be signed.
+ * @param [in] message_len The length of the message to be signed.
+ * @param [in] privkey My private key.
+ *
+ * @retval GOLDI_EOK Success.
+ * @retval GOLDI_ECORRUPT My key is corrupt.
+ * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
+ */
+int
+goldilocks_sign (
+    uint8_t signature_out[GOLDI_SIGNATURE_BYTES],
+    const uint8_t *message,
+    uint64_t message_len,
+    const struct goldilocks_private_key_t *privkey
+) __attribute__((nonnull(1,2,4),visibility ("default")));
+
+/**
+ * @brief Verify a signature.
+ *
+ * This function is fairly strict.  It will correctly detect when
+ * the signature has the wrong cofactor component, or when the sig
+ * values aren't less than p or q.
+ * 
+ * Currently this function does not detect when the public key is weird,
+ * eg 0, has cofactor, etc.  As a result, a party with a bogus public
+ * key could create signatures that succeed on some systems and fail on
+ * others.
+ *
+ * @warning This isn't even my final form!
+ *
+ * @param [in] signature The signature.
+ * @param [in] message The message to be verified.
+ * @param [in] message_len The length of the message to be verified.
+ * @param [in] pubkey The signer's public key.
+ *
+ * @retval GOLDI_EOK Success.
+ * @retval GOLDI_EINVAL The public key or signature is corrupt.
+ * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
+ */
+int
+goldilocks_verify (
+    const uint8_t signature[GOLDI_SIGNATURE_BYTES],
+    const uint8_t *message,
+    uint64_t message_len,
+    const struct goldilocks_public_key_t *pubkey
+) __attribute__((warn_unused_result,nonnull(1,2,4),visibility ("default")));
+#endif
+
+#if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS
+
+/** A public key which has been expanded by precomputation for higher speed. */
+struct goldilocks_precomputed_public_key_t;
+
+/**
+ * @brief Expand a public key by precomputation.
+ *
+ * @todo Give actual error returns, instead of ambiguous NULL.
+ *
+ * @warning This isn't even my final form!
+ *
+ * @param [in] pub The public key.
+ * @retval NULL We ran out of memory, or another (currently unspecified) error occurred.
+ */
+struct goldilocks_precomputed_public_key_t *
+goldilocks_precompute_public_key (
+    const struct goldilocks_public_key_t *pub
+) __attribute__((warn_unused_result,nonnull(1),visibility ("default")));
+
+/**
+ * @brief Overwrite an expanded public key with zeros, then destroy it.
+ *
+ * If the input is NULL, this function does nothing.
+ *
+ * @param [in] precom The public key.
+ */
+void
+goldilocks_destroy_precomputed_public_key (
+    struct goldilocks_precomputed_public_key_t *precom
+) __attribute__((visibility ("default")));
+
+/**
+ * @brief Verify a signature.
+ *
+ * This function is fairly strict.  It will correctly detect when
+ * the signature has the wrong cofactor component, or when the sig
+ * values aren't less than p or q.
+ *
+ * @warning This isn't even my final form!
+ *
+ * @param [in] signature The signature.
+ * @param [in] message The message to be verified.
+ * @param [in] message_len The length of the message to be verified.
+ * @param [in] pubkey The signer's public key, expanded by precomputation.
+ *
+ * @retval GOLDI_EOK Success.
+ * @retval GOLDI_EINVAL The public key or signature is corrupt.
+ * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
+ */
+int
+goldilocks_verify_precomputed (
+   const uint8_t signature[GOLDI_SIGNATURE_BYTES],
+   const uint8_t *message,
+   uint64_t message_len,
+   const struct goldilocks_precomputed_public_key_t *pubkey
+) __attribute__((warn_unused_result,nonnull(1,2,4),visibility ("default")));
+   
+/**
+ * @brief Generate a Diffie-Hellman shared secret in constant time.
+ * Uses a precomputation on the other party's public key for efficiency.
+ *
+ * This function uses some compile-time flags whose merit remains to
+ * be decided.
+ *
+ * If the flag EXPERIMENT_ECDH_OBLITERATE_CT is set, prepend 40 bytes
+ * of zeros to the secret before hashing.  In the case that the other
+ * party's key is detectably corrupt, instead the symmetric part
+ * of the secret key is used to produce a pseudorandom value.
+ *
+ * If EXPERIMENT_ECDH_STIR_IN_PUBKEYS is set, the sum and product of
+ * the two parties' public keys is prepended to the hash.
+ *
+ * In the current version, this function can safely be run even without
+ * goldilocks_init().  But this property is not guaranteed for future
+ * versions, so call it anyway.
+ *
+ * @warning This isn't even my final form!
+ *
+ * @param [out] shared The shared secret established with the other party.
+ * @param [in] my_privkey My private key.
+ * @param [in] your_pubkey The other party's precomputed public key.
+ *
+ * @retval GOLDI_EOK Success.
+ * @retval GOLDI_ECORRUPT My key is corrupt.
+ * @retval GOLDI_EINVAL   The other party's key is corrupt.
+ * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
+ */
+int
+goldilocks_shared_secret_precomputed (
+   uint8_t shared[GOLDI_SHARED_SECRET_BYTES],
+   const struct goldilocks_private_key_t *my_privkey,
+   const struct goldilocks_precomputed_public_key_t *your_pubkey
+) __attribute__((warn_unused_result,nonnull(1,2,3),visibility ("default")));
+
+#endif /* GOLDI_IMPLEMENT_PRECOMPUTED_KEYS */
+
+#ifdef __cplusplus
+}; /* extern "C" */
+#endif
+
+#endif /* __GOLDILOCKS_H__ */
diff --git a/src/ec_point.c b/src/ec_point.c
index eabc3a3..0b9c8bf 100644
--- a/src/ec_point.c
+++ b/src/ec_point.c
@@ -12,7 +12,8 @@
 #include "ec_point.h"
 #include "magic.h"
 
-#define is32 (GOLDI_BITS == 32)
+#define is32 (GOLDI_BITS == 32 || FIELD_BITS == 480)
+/* TODO XXX PERF FIXME: better detection of overflow conditions */
 
 /* I wanted to just use if (is32)
  * But clang's -Wunreachable-code flags it.
@@ -52,60 +53,6 @@ field_mulw_scc_wr (
         field_weak_reduce(out);
 }
 
-static __inline__ void
-field_sqrn (
-    field_t *__restrict__ y,
-    const field_t *x,
-    int n
-) {
-    field_t tmp;
-    assert(n>0);
-    if (n&1) {
-        field_sqr(y,x);
-        n--;
-    } else {
-        field_sqr(&tmp,x);
-        field_sqr(y,&tmp);
-        n-=2;
-    }
-    for (; n; n-=2) {
-        field_sqr(&tmp,y);
-        field_sqr(y,&tmp);
-    }
-}
-
-void 
-field_isr ( /* TODO: MAGIC */
-    struct field_t*       a,
-    const struct field_t* x
-) {
-    struct field_t L0, L1, L2;
-    field_sqr  (   &L1,     x );
-    field_mul  (   &L2,     x,   &L1 );
-    field_sqr  (   &L1,   &L2 );
-    field_mul  (   &L2,     x,   &L1 );
-    field_sqrn (   &L1,   &L2,     3 );
-    field_mul  (   &L0,   &L2,   &L1 );
-    field_sqrn (   &L1,   &L0,     3 );
-    field_mul  (   &L0,   &L2,   &L1 );
-    field_sqrn (   &L2,   &L0,     9 );
-    field_mul  (   &L1,   &L0,   &L2 );
-    field_sqr  (   &L0,   &L1 );
-    field_mul  (   &L2,     x,   &L0 );
-    field_sqrn (   &L0,   &L2,    18 );
-    field_mul  (   &L2,   &L1,   &L0 );
-    field_sqrn (   &L0,   &L2,    37 );
-    field_mul  (   &L1,   &L2,   &L0 );
-    field_sqrn (   &L0,   &L1,    37 );
-    field_mul  (   &L1,   &L2,   &L0 );
-    field_sqrn (   &L0,   &L1,   111 );
-    field_mul  (   &L2,   &L1,   &L0 );
-    field_sqr  (   &L0,   &L2 );
-    field_mul  (   &L1,     x,   &L0 );
-    field_sqrn (   &L0,   &L1,   223 );
-    field_mul  (     a,   &L2,   &L0 );
-}
-
 void
 add_tw_niels_to_tw_extensible (
     struct tw_extensible_t*  d,
@@ -396,7 +343,7 @@ montgomery_step (
     field_sqr  ( &a->za, &a->zd );
     field_sqr  ( &a->xd,   &L0 );
     field_sqr  (   &L0,   &L1 );
-    field_mulw ( &a->zd, &a->xd, 1-EDWARDS_D );
+    field_mulw_scc ( &a->zd, &a->xd, 1-EDWARDS_D ); /* FIXME PERF MULW */
     field_sub  (   &L1, &a->xd,   &L0 );
     field_bias (   &L1,     2 );
     IF32( field_weak_reduce(   &L1 ) );
@@ -444,11 +391,9 @@ serialize_montgomery (
     field_mul  (   &L3,   &L1,   &L2 );
     field_copy (   &L2, &a->z0 );
     field_addw (   &L2,     1 );
-    field_sqr  (   &L1,   &L2 );
-    field_mulw (   &L2,   &L1, 1-EDWARDS_D );
-    field_neg  (   &L1,   &L2 );
+    field_sqr  (   &L0,   &L2 );
+    field_mulw_scc_wr (   &L1,   &L0, EDWARDS_D-1 );
     field_add  (   &L2, &a->z0, &a->z0 );
-    field_bias (   &L2,     1 );
     field_add  (   &L0,   &L2,   &L2 );
     field_add  (   &L2,   &L0,   &L1 );
     IF32( field_weak_reduce(   &L2 ) );
@@ -512,13 +457,9 @@ untwist_and_double_and_serialize (
     IF32( field_weak_reduce(     b ) );
     field_sqr  (   &L2, &a->z );
     field_sqr  (   &L1,   &L2 );
-    field_add  (   &L2,     b,     b );
-    field_mulw (     b,   &L2, 1-EDWARDS_D );
-    field_neg  (   &L2,     b );
-    field_bias (   &L2,     2 );
-    field_mulw (   &L0,   &L2, 1-EDWARDS_D );
-    field_neg  (     b,   &L0 );
-    field_bias (     b,     2 );
+    field_add  (   b,     b,     b );
+    field_mulw_scc (     &L2,   b, EDWARDS_D-1 );
+    field_mulw_scc (   b,   &L2, EDWARDS_D-1 );
     field_mul  (   &L0,   &L2,   &L1 );
     field_mul  (   &L2,     b,   &L0 );
     field_isr  (   &L0,   &L2 );
@@ -654,10 +595,8 @@ deserialize_affine (
     field_copy (   &L3,   &L1 );
     field_addw (   &L3,     1 );
     field_sqr  (   &L2,   &L3 );
-    field_mulw (   &L3,   &L2, 1-EDWARDS_D );
-    field_neg  ( &a->x,   &L3 );
-    field_add  (   &L3,   &L1,   &L1 );
-    field_bias (   &L3,     1 );
+    field_mulw_scc (   &a->x,   &L2, EDWARDS_D-1 ); /* PERF MULW */
+    field_add  (   &L3,   &L1,   &L1 ); /* FIXME: i adjusted the bias here, was it right? */
     field_add  ( &a->y,   &L3,   &L3 );
     field_add  (   &L3, &a->y, &a->x );
     IF32( field_weak_reduce(   &L3 ) );
@@ -694,11 +633,9 @@ deserialize_and_twist_approx (
     field_sqr  ( &a->z,    sz );
     field_copy ( &a->y, &a->z );
     field_addw ( &a->y,     1 );
-    field_sqr  ( &a->x, &a->y );
-    field_mulw ( &a->y, &a->x, 1-EDWARDS_D );
-    field_neg  ( &a->x, &a->y );
+    field_sqr  ( &L0, &a->y );
+    field_mulw_scc ( &a->x, &L0, EDWARDS_D-1 );
     field_add  ( &a->y, &a->z, &a->z );
-    field_bias ( &a->y,     1 );
     field_add  ( &a->u, &a->y, &a->y );
     field_add  ( &a->y, &a->u, &a->x );
     IF32( field_weak_reduce( &a->y ) );
diff --git a/src/include/ec_point.h b/src/include/ec_point.h
index 657ee88..74bbe91 100644
--- a/src/include/ec_point.h
+++ b/src/include/ec_point.h
@@ -543,8 +543,6 @@ copy_tw_pniels (
     field_copy ( &a->z, &ds->z );
 }
 
-
-
 #ifdef __cplusplus
 }; /* extern "C" */
 #endif
diff --git a/src/include/field.h b/src/include/field.h
index bf36e95..b3160a7 100644
--- a/src/include/field.h
+++ b/src/include/field.h
@@ -1,40 +1,16 @@
 /**
  * @file field.h
- * @brief Field switch code.
+ * @brief Generic field header.
  * @copyright
  *   Copyright (c) 2014 Cryptography Research, Inc.  \n
  *   Released under the MIT License.  See LICENSE.txt for license information.
  * @author Mike Hamburg
  */
+
 #ifndef __FIELD_H__
 #define __FIELD_H__
 
-#include <string.h>
-#include "constant_time.h"
-
-#include "p448.h"
-#define FIELD_BITS           448
-#define field_t              p448_t
-#define field_mul            p448_mul
-#define field_sqr            p448_sqr
-#define field_add            p448_add
-#define field_sub            p448_sub
-#define field_mulw           p448_mulw
-#define field_addw           p448_addw
-#define field_subw           p448_subw
-#define field_neg            p448_neg
-#define field_set_ui         p448_set_ui
-#define field_bias           p448_bias
-#define field_cond_neg       p448_cond_neg
-#define field_inverse        p448_inverse
-#define field_eq             p448_eq
-#define field_isr            p448_isr
-#define field_simultaneous_invert p448_simultaneous_invert
-#define field_weak_reduce    p448_weak_reduce
-#define field_strong_reduce  p448_strong_reduce
-#define field_serialize      p448_serialize
-#define field_deserialize    p448_deserialize
-#define field_is_zero        p448_is_zero
+#include "f_field.h"
 
 /** @brief Bytes in a field element */
 #define FIELD_BYTES          (1+(FIELD_BITS-1)/8)
@@ -42,6 +18,22 @@
 /** @brief Words in a field element */
 #define FIELD_WORDS          (1+(FIELD_BITS-1)/sizeof(word_t))
 
+/* TODO: standardize notation */
+/** @brief The number of words in the Goldilocks field. */
+#define GOLDI_FIELD_WORDS DIV_CEIL(FIELD_BITS,WORD_BITS)
+
+/** @brief The number of bits in the Goldilocks curve's cofactor (cofactor=4). */
+#define COFACTOR_BITS 2
+
+/** @brief The number of bits in a Goldilocks scalar. */
+#define SCALAR_BITS (FIELD_BITS - COFACTOR_BITS)
+
+/** @brief The number of bytes in a Goldilocks scalar. */
+#define SCALAR_BYTES (1+(SCALAR_BITS)/8)
+
+/** @brief The number of words in a Goldilocks scalar. */
+#define SCALAR_WORDS WORDS_FOR_BITS(SCALAR_BITS)
+
 /**
  * @brief For GMP tests: little-endian representation of the field modulus.
  */
@@ -119,5 +111,31 @@ field_eq (
     const struct field_t *a,
     const struct field_t *b
 );
+    
+/**
+ * Set y = x^(2^n), i.e. square x a total of n times; n must be positive (asserted).
+ */
+static __inline__ void
+__attribute__((unused,always_inline))
+field_sqrn (
+    field_t *__restrict__ y,
+    const field_t *x,
+    int n
+) {
+    field_t tmp;
+    assert(n>0);
+    if (n&1) {
+        field_sqr(y,x);
+        n--;
+    } else {
+        field_sqr(&tmp,x);
+        field_sqr(y,&tmp);
+        n-=2;
+    }
+    for (; n; n-=2) {
+        field_sqr(&tmp,y);
+        field_sqr(y,&tmp);
+    }
+}
 
-#endif /* __FIELD_H__ */
+#endif // __FIELD_H__
diff --git a/src/include/magic.h b/src/include/magic.h
index 70be081..c7e296d 100644
--- a/src/include/magic.h
+++ b/src/include/magic.h
@@ -4,16 +4,24 @@
  *   Copyright (c) 2014 Cryptography Research, Inc.  \n
  *   Released under the MIT License.  See LICENSE.txt for license information.
  * @author Mike Hamburg
- * @brief Goldilocks magic numbers (group orders, coefficients, algo params etc).
+ * @brief Curve-independent declarations of magic numbers.
  */
 
-
 #ifndef __GOLDI_MAGIC_H__
 #define __GOLDI_MAGIC_H__ 1
 
 #include "word.h"
-#include "p448.h"
-#include "ec_point.h"
+
+/**
+ * @brief If true, use wider tables for the precomputed combs.
+ */
+#ifndef USE_BIG_COMBS
+#if defined(__ARM_NEON__)
+#define USE_BIG_COMBS 1
+#else
+#define USE_BIG_COMBS (WORD_BITS==64)
+#endif
+#endif
 
 /* TODO: standardize notation */
 
@@ -32,16 +40,13 @@
 /** @brief The number of words in the Goldilocks field. */
 #define SCALAR_WORDS WORDS_FOR_BITS(SCALAR_BITS)
 
+#include "f_magic.h"
+
 /**
  * @brief sqrt(d-1), used for point formats and twisting.
  */
 extern const struct field_t sqrt_d_minus_1;
 
-/**
- * @brief The Edwards "d" term for this curve.
- */
-static const int64_t EDWARDS_D = -39081;
-
 /**
  * @brief The base point for Goldilocks.
  */
@@ -76,34 +81,10 @@ extern const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS];
  */
 #define SCALARMUL_WNAF_COMBO_TABLE_BITS 4
 
-/**
- * @brief If true, use wider tables for the precomputed combs.
- */
-#ifndef USE_BIG_COMBS
-#if defined(__ARM_NEON__)
-#define USE_BIG_COMBS 1
-#else
-#define USE_BIG_COMBS (WORD_BITS==64)
-#endif
-#endif
-
-/** @brief The number of combs to use for signed comb algo */
-#define COMB_N (USE_BIG_COMBS ? 5  : 8)
-
-/** @brief The number of teeth of the combs for signed comb algo */
-#define COMB_T (USE_BIG_COMBS ? 5  : 4)
-
-/** @brief The spacing the of combs for signed comb algo */
-#define COMB_S (USE_BIG_COMBS ? 18 : 14)
-
 /**
  * @brief The bit width of the precomputed WNAF tables.  Size is 2^this elements.
  */
 #define WNAF_PRECMP_BITS 5
 
-/**
- * @brief crandom magic structure guard constant = "return 4", cf xkcd #221
- */
-#define CRANDOM_MAGIC 0x72657475726e2034ull
 
 #endif /* __GOLDI_MAGIC_H__ */
diff --git a/src/include/word.h b/src/include/word.h
index ddc8d36..297bb96 100644
--- a/src/include/word.h
+++ b/src/include/word.h
@@ -37,9 +37,12 @@ typedef int64_t sword_t;
 typedef __int128_t dsword_t;
 #define PRIxWORD PRIx64
 #define PRIxWORDfull "%016" PRIx64
-#define PRIxWORD58   "%014" PRIx64
+#define PRIxWORD56   "%014" PRIx64
+#define PRIxWORD60   "%015" PRIx64 /* 15 hex digits, i.e. 60 bits, printed from a 64-bit word */
 #define U64LE(x) x##ull
 #define U58LE(x) x##ull
+#define U56LE(x) x##ull
+#define U60LE(x) x##ull
 #define letohWORD letoh64
 #define GOLDI_BITS 64
 #else
@@ -51,9 +54,11 @@ typedef int32_t sword_t;
 typedef int64_t dsword_t;
 #define PRIxWORD PRIx32
 #define PRIxWORDfull "%08" PRIx32
-#define PRIxWORD58   "%07" PRIx32
+#define PRIxWORD56   "%07" PRIx32
 #define U64LE(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
-#define U58LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
+#define U58LE(x) (x##ull)&((1ull<<29)-1), (x##ull)>>29
+#define U56LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
+#define U60LE(x) (x##ull)&((1ull<<30)-1), (x##ull)>>30
 #define letohWORD letoh32
 #define GOLDI_BITS 32
 #endif
diff --git a/src/arch_32/arch_config.h b/src/p448/arch_32/arch_config.h
similarity index 100%
rename from src/arch_32/arch_config.h
rename to src/p448/arch_32/arch_config.h
diff --git a/src/arch_32/p448.c b/src/p448/arch_32/p448.c
similarity index 100%
rename from src/arch_32/p448.c
rename to src/p448/arch_32/p448.c
diff --git a/src/arch_32/p448.h b/src/p448/arch_32/p448.h
similarity index 100%
rename from src/arch_32/p448.h
rename to src/p448/arch_32/p448.h
diff --git a/src/arch_arm_32/arch_config.h b/src/p448/arch_arm_32/arch_config.h
similarity index 100%
rename from src/arch_arm_32/arch_config.h
rename to src/p448/arch_arm_32/arch_config.h
diff --git a/src/arch_arm_32/p448.c b/src/p448/arch_arm_32/p448.c
similarity index 100%
rename from src/arch_arm_32/p448.c
rename to src/p448/arch_arm_32/p448.c
diff --git a/src/arch_arm_32/p448.h b/src/p448/arch_arm_32/p448.h
similarity index 100%
rename from src/arch_arm_32/p448.h
rename to src/p448/arch_arm_32/p448.h
diff --git a/src/arch_neon/arch_config.h b/src/p448/arch_neon/arch_config.h
similarity index 100%
rename from src/arch_neon/arch_config.h
rename to src/p448/arch_neon/arch_config.h
diff --git a/src/arch_neon/neon_emulation.h b/src/p448/arch_neon/neon_emulation.h
similarity index 100%
rename from src/arch_neon/neon_emulation.h
rename to src/p448/arch_neon/neon_emulation.h
diff --git a/src/arch_neon/p448.c b/src/p448/arch_neon/p448.c
similarity index 100%
rename from src/arch_neon/p448.c
rename to src/p448/arch_neon/p448.c
diff --git a/src/arch_neon/p448.h b/src/p448/arch_neon/p448.h
similarity index 100%
rename from src/arch_neon/p448.h
rename to src/p448/arch_neon/p448.h
diff --git a/src/arch_neon_experimental/arch_config.h b/src/p448/arch_neon_experimental/arch_config.h
similarity index 100%
rename from src/arch_neon_experimental/arch_config.h
rename to src/p448/arch_neon_experimental/arch_config.h
diff --git a/src/arch_neon_experimental/p448.c b/src/p448/arch_neon_experimental/p448.c
similarity index 100%
rename from src/arch_neon_experimental/p448.c
rename to src/p448/arch_neon_experimental/p448.c
diff --git a/src/arch_neon_experimental/p448.h b/src/p448/arch_neon_experimental/p448.h
similarity index 100%
rename from src/arch_neon_experimental/p448.h
rename to src/p448/arch_neon_experimental/p448.h
diff --git a/src/arch_ref64/arch_config.h b/src/p448/arch_ref64/arch_config.h
similarity index 100%
rename from src/arch_ref64/arch_config.h
rename to src/p448/arch_ref64/arch_config.h
diff --git a/src/arch_ref64/p448.c b/src/p448/arch_ref64/p448.c
similarity index 100%
rename from src/arch_ref64/p448.c
rename to src/p448/arch_ref64/p448.c
diff --git a/src/arch_ref64/p448.h b/src/p448/arch_ref64/p448.h
similarity index 100%
rename from src/arch_ref64/p448.h
rename to src/p448/arch_ref64/p448.h
diff --git a/src/arch_x86_64/arch_config.h b/src/p448/arch_x86_64/arch_config.h
similarity index 100%
rename from src/arch_x86_64/arch_config.h
rename to src/p448/arch_x86_64/arch_config.h
diff --git a/src/arch_x86_64/p448.c b/src/p448/arch_x86_64/p448.c
similarity index 100%
rename from src/arch_x86_64/p448.c
rename to src/p448/arch_x86_64/p448.c
diff --git a/src/arch_x86_64/p448.h b/src/p448/arch_x86_64/p448.h
similarity index 100%
rename from src/arch_x86_64/p448.h
rename to src/p448/arch_x86_64/p448.h
diff --git a/src/arch_x86_64/x86-64-arith.h b/src/p448/arch_x86_64/x86-64-arith.h
similarity index 100%
rename from src/arch_x86_64/x86-64-arith.h
rename to src/p448/arch_x86_64/x86-64-arith.h
diff --git a/src/p448/f_arithmetic.c b/src/p448/f_arithmetic.c
new file mode 100644
index 0000000..82f35b8
--- /dev/null
+++ b/src/p448/f_arithmetic.c
@@ -0,0 +1,43 @@
+/**
+ * @cond internal
+ * @file f_arithmetic.c
+ * @copyright
+ *   Copyright (c) 2014 Cryptography Research, Inc.  \n
+ *   Released under the MIT License.  See LICENSE.txt for license information.
+ * @author Mike Hamburg
+ * @brief Field-specific arithmetic.
+ */
+
+#include "ec_point.h"
+/* a = x^(2^446 - 2^222 - 1), via a fixed addition chain; that exponent is (p-3)/4 if p = 2^448 - 2^224 - 1 (modulus not shown here -- confirm). */
+void 
+field_isr (
+    struct field_t*       a,
+    const struct field_t* x
+) {
+    struct field_t L0, L1, L2;
+    field_sqr  (   &L1,     x );
+    field_mul  (   &L2,     x,   &L1 );
+    field_sqr  (   &L1,   &L2 );
+    field_mul  (   &L2,     x,   &L1 ); /* L2 = x^(2^3-1) */
+    field_sqrn (   &L1,   &L2,     3 );
+    field_mul  (   &L0,   &L2,   &L1 ); /* L0 = x^(2^6-1) */
+    field_sqrn (   &L1,   &L0,     3 );
+    field_mul  (   &L0,   &L2,   &L1 ); /* L0 = x^(2^9-1) */
+    field_sqrn (   &L2,   &L0,     9 );
+    field_mul  (   &L1,   &L0,   &L2 ); /* L1 = x^(2^18-1) */
+    field_sqr  (   &L0,   &L1 );
+    field_mul  (   &L2,     x,   &L0 ); /* L2 = x^(2^19-1) */
+    field_sqrn (   &L0,   &L2,    18 );
+    field_mul  (   &L2,   &L1,   &L0 ); /* L2 = x^(2^37-1) */
+    field_sqrn (   &L0,   &L2,    37 );
+    field_mul  (   &L1,   &L2,   &L0 ); /* L1 = x^(2^74-1) */
+    field_sqrn (   &L0,   &L1,    37 );
+    field_mul  (   &L1,   &L2,   &L0 ); /* L1 = x^(2^111-1) */
+    field_sqrn (   &L0,   &L1,   111 );
+    field_mul  (   &L2,   &L1,   &L0 ); /* L2 = x^(2^222-1) */
+    field_sqr  (   &L0,   &L2 );
+    field_mul  (   &L1,     x,   &L0 ); /* L1 = x^(2^223-1) */
+    field_sqrn (   &L0,   &L1,   223 ); /* L0 = x^(2^446-2^223) */
+    field_mul  (     a,   &L2,   &L0 ); /* a  = x^(2^446-2^222-1) */
+}
diff --git a/src/p448/f_field.h b/src/p448/f_field.h
new file mode 100644
index 0000000..c743c8d
--- /dev/null
+++ b/src/p448/f_field.h
@@ -0,0 +1,39 @@
+/**
+ * @file f_field.h
+ * @brief Field-specific code.
+ * @copyright
+ *   Copyright (c) 2014 Cryptography Research, Inc.  \n
+ *   Released under the MIT License.  See LICENSE.txt for license information.
+ * @author Mike Hamburg
+ */
+#ifndef __F_FIELD_H__
+#define __F_FIELD_H__ 1
+
+#include <string.h>
+#include "constant_time.h"
+
+#include "p448.h"
+#define FIELD_BITS           448
+#define field_t              p448_t
+#define field_mul            p448_mul
+#define field_sqr            p448_sqr
+#define field_add            p448_add
+#define field_sub            p448_sub
+#define field_mulw           p448_mulw
+#define field_addw           p448_addw
+#define field_subw           p448_subw
+#define field_neg            p448_neg
+#define field_set_ui         p448_set_ui
+#define field_bias           p448_bias
+#define field_cond_neg       p448_cond_neg
+#define field_inverse        p448_inverse
+#define field_eq             p448_eq
+#define field_isr            p448_isr
+#define field_simultaneous_invert p448_simultaneous_invert
+#define field_weak_reduce    p448_weak_reduce
+#define field_strong_reduce  p448_strong_reduce
+#define field_serialize      p448_serialize
+#define field_deserialize    p448_deserialize
+#define field_is_zero        p448_is_zero
+
+#endif /* __F_FIELD_H__ */
diff --git a/src/p448/f_magic.h b/src/p448/f_magic.h
new file mode 100644
index 0000000..9e1365a
--- /dev/null
+++ b/src/p448/f_magic.h
@@ -0,0 +1,35 @@
+/**
+ * @file f_magic.h
+ * @copyright
+ *   Copyright (c) 2014 Cryptography Research, Inc.  \n
+ *   Released under the MIT License.  See LICENSE.txt for license information.
+ * @author Mike Hamburg
+ * @brief Goldilocks magic numbers (group orders, coefficients, algo params etc).
+ */
+
+#ifndef __GOLDI_F_MAGIC_H__
+#define __GOLDI_F_MAGIC_H__ 1
+
+#include "field.h"
+#include "ec_point.h"
+
+/**
+ * @brief The Edwards "d" term for this curve.
+ */
+static const int64_t EDWARDS_D = -39081;
+
+/** @brief The number of combs to use for signed comb algo */
+#define COMB_N (USE_BIG_COMBS ? 5  : 8)
+
+/** @brief The number of teeth of the combs for signed comb algo */
+#define COMB_T (USE_BIG_COMBS ? 5  : 4)
+
+/** @brief The spacing of the combs for signed comb algo */
+#define COMB_S (USE_BIG_COMBS ? 18 : 14)
+
+/**
+ * @brief crandom magic structure guard constant = "return 4", cf xkcd #221
+ */
+#define CRANDOM_MAGIC 0x72657475726e2034ull
+
+#endif /* __GOLDI_F_MAGIC_H__ */
diff --git a/src/p448/field.h b/src/p448/field.h
new file mode 100644
index 0000000..bf36e95
--- /dev/null
+++ b/src/p448/field.h
@@ -0,0 +1,123 @@
+/**
+ * @file field.h
+ * @brief Field switch code.
+ * @copyright
+ *   Copyright (c) 2014 Cryptography Research, Inc.  \n
+ *   Released under the MIT License.  See LICENSE.txt for license information.
+ * @author Mike Hamburg
+ */
+#ifndef __FIELD_H__
+#define __FIELD_H__
+
+#include <string.h>
+#include "constant_time.h"
+
+#include "p448.h"
+#define FIELD_BITS           448
+#define field_t              p448_t
+#define field_mul            p448_mul
+#define field_sqr            p448_sqr
+#define field_add            p448_add
+#define field_sub            p448_sub
+#define field_mulw           p448_mulw
+#define field_addw           p448_addw
+#define field_subw           p448_subw
+#define field_neg            p448_neg
+#define field_set_ui         p448_set_ui
+#define field_bias           p448_bias
+#define field_cond_neg       p448_cond_neg
+#define field_inverse        p448_inverse
+#define field_eq             p448_eq
+#define field_isr            p448_isr
+#define field_simultaneous_invert p448_simultaneous_invert
+#define field_weak_reduce    p448_weak_reduce
+#define field_strong_reduce  p448_strong_reduce
+#define field_serialize      p448_serialize
+#define field_deserialize    p448_deserialize
+#define field_is_zero        p448_is_zero
+
+/** @brief Bytes in a field element */
+#define FIELD_BYTES          (1+(FIELD_BITS-1)/8)
+
+/** @brief Words in a field element.  NOTE(review): divides a bit count by sizeof(word_t), which is a byte count, so this looks larger than the true word count -- confirm intent. */
+#define FIELD_WORDS          (1+(FIELD_BITS-1)/sizeof(word_t))
+
+/**
+ * @brief For GMP tests: little-endian representation of the field modulus.
+ */
+extern const uint8_t FIELD_MODULUS[FIELD_BYTES];
+
+/**
+ * Copy field element b into a (destination first).  Both pointers are __restrict__, so they must not overlap.
+ */
+static inline void
+__attribute__((unused,always_inline))        
+field_copy (
+    struct field_t *__restrict__ a,
+    const struct field_t *__restrict__ b
+) {
+    memcpy(a,b,sizeof(*a)); /* whole-struct byte copy */
+}
+
+/**
+ * Negate a in place if doNegate.  The negation is always computed; constant_time_select (constant_time.h, not shown) picks the result.
+ */
+static inline void
+__attribute__((unused,always_inline)) 
+field_cond_neg(
+    field_t *a,
+    mask_t doNegate
+) {
+	struct field_t negated;
+    field_neg(&negated, a);
+    field_bias(&negated, 2); /* presumably adds 2*p limb-wise so the negated limbs are non-negative -- confirm */
+	constant_time_select(a, &negated, a, sizeof(negated), doNegate); /* NOTE(review): argument order/mask polarity defined elsewhere */
+}
+
+/**
+ * Returns 1/sqrt(+- x).
+ * 
+ * The Legendre symbol of the result is the same as that of the
+ * input.
+ * 
+ * If x=0, returns 0.
+ */
+void
+field_isr (
+    struct field_t*       a,
+    const struct field_t* x
+);
+    
+/**
+ * Batch inverts out[i] = 1/in[i]
+ * 
+ * If any input is zero, all the outputs will be zero.
+ */     
+void
+field_simultaneous_invert (
+    struct field_t *__restrict__ out,
+    const struct field_t *in,
+    unsigned int n
+);
+
+/**
+ * Returns 1/x.
+ * 
+ * If x=0, returns 0.
+ */
+void
+field_inverse (
+    struct field_t*       a,
+    const struct field_t* x
+);
+
+/**
+ * Returns -1 if a==b, 0 otherwise.
+ */
+mask_t
+field_eq (
+    const struct field_t *a,
+    const struct field_t *b
+);
+
+#endif /* __FIELD_H__ */
diff --git a/src/magic.c b/src/p448/magic.c
similarity index 82%
rename from src/magic.c
rename to src/p448/magic.c
index 5157e14..b1e7ca5 100644
--- a/src/magic.c
+++ b/src/p448/magic.c
@@ -39,10 +39,10 @@ const struct affine_t goldilocks_base_point = {
        0x4c63d96,0x4609845,0xf3932d9,0x1b4faff, 0x6147eaa,0xa2692ff,0x9cecfa9,0x297ea0e
     }},
 #else
-    {{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7),
-       U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa),
-       U58LE(0x5f065c3c59d070), U58LE(0x3a6a26adf73324),
-       U58LE(0x1b4faff4609845), U58LE(0x297ea0ea2692ff)
+    {{ U56LE(0xf0de840aed939f), U56LE(0xc170033f4ba0c7),
+       U56LE(0xf3932d94c63d96), U56LE(0x9cecfa96147eaa),
+       U56LE(0x5f065c3c59d070), U56LE(0x3a6a26adf73324),
+       U56LE(0x1b4faff4609845), U56LE(0x297ea0ea2692ff)
     }},
 #endif
     {{ 19 }}
@@ -69,13 +69,13 @@ sqrt_d_minus_1 = {{
     0xbdeea38,0x748734a,0x5a189aa,0x49443b8,
     0x6f14c06,0x0b25b7a,0x51e65ca,0x12fec0c
 #else
-    U58LE(0xd2e21836749f46),
-    U58LE(0x888db42b4f0179),
-    U58LE(0x5a189aabdeea38),
-    U58LE(0x51e65ca6f14c06),
-    U58LE(0xa49f7b424d9770),
-    U58LE(0xdcac4628c5f656),
-    U58LE(0x49443b8748734a),
-    U58LE(0x12fec0c0b25b7a)
+    U56LE(0xd2e21836749f46),
+    U56LE(0x888db42b4f0179),
+    U56LE(0x5a189aabdeea38),
+    U56LE(0x51e65ca6f14c06),
+    U56LE(0xa49f7b424d9770),
+    U56LE(0xdcac4628c5f656),
+    U56LE(0x49443b8748734a),
+    U56LE(0x12fec0c0b25b7a)
 #endif
 }};
diff --git a/src/p480/arch_x86_64/arch_config.h b/src/p480/arch_x86_64/arch_config.h
new file mode 100644
index 0000000..58758cc
--- /dev/null
+++ b/src/p480/arch_x86_64/arch_config.h
@@ -0,0 +1 @@
+#define WORD_BITS 64
diff --git a/src/p480/arch_x86_64/p480.c b/src/p480/arch_x86_64/p480.c
new file mode 100644
index 0000000..6110373
--- /dev/null
+++ b/src/p480/arch_x86_64/p480.c
@@ -0,0 +1,435 @@
+/* Copyright (c) 2014 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+#include "p480.h"
+#include "x86-64-arith.h"
+/* cs = as * bs on 8 limbs of 60 bits, Karatsuba-style over the split at limb 4; cs is __restrict__ and must not alias an input. */
+void
+p480_mul (
+    p480_t *__restrict__ cs,
+    const p480_t *as,
+    const p480_t *bs
+) {
+    const uint64_t *a = as->limb, *b = bs->limb;
+    uint64_t *c = cs->limb;
+
+    __uint128_t accum0 = 0, accum1 = 0, accum2;
+    uint64_t mask = (1ull<<60) - 1; /* low 60 bits = one limb */
+
+    uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))), bbb[4] __attribute__((aligned(32)));
+
+    /* For some reason clang doesn't vectorize this without prompting? */
+    unsigned int i;
+    for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
+        ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i]; /* aa = a_lo + a_hi */
+        ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i]; /* bb = b_lo + b_hi */
+        ((uint64xn_t*)bbb)[i] = ((const uint64xn_t*)bb)[i] + ((const uint64xn_t*)(&b[4]))[i]; /* bbb = b_lo + 2*b_hi */
+    }
+    /*
+     * Scalar equivalent of the vector loop above, for i in 0..3:
+     *   aa[i]  = a[i]  + a[i+4];
+     *   bb[i]  = b[i]  + b[i+4];
+     *   bbb[i] = bb[i] + b[i+4];
+     */
+
+    accum2  = widemul(&a[0],&b[3]);
+    accum0  = widemul(&aa[0],&bb[3]);
+    accum1  = widemul(&a[4],&b[7]);
+
+    mac(&accum2, &a[1], &b[2]);
+    mac(&accum0, &aa[1], &bb[2]);
+    mac(&accum1, &a[5], &b[6]);
+
+    mac(&accum2, &a[2], &b[1]);
+    mac(&accum0, &aa[2], &bb[1]);
+    mac(&accum1, &a[6], &b[5]);
+
+    mac(&accum2, &a[3], &b[0]);
+    mac(&accum0, &aa[3], &bb[0]);
+    mac(&accum1, &a[7], &b[4]);
+
+    accum0 -= accum2;
+    accum1 += accum2;
+
+    c[3] = ((uint64_t)(accum1)) & mask; /* provisional: the carry from limb 2 is added further down */
+    c[7] = ((uint64_t)(accum0)) & mask; /* provisional: the carry from limb 6 is added further down */
+
+    accum0 >>= 60;
+    accum1 >>= 60;
+    /* The carries out of columns 3/7 seed the accumulators for output limbs 0 and 4. */
+    mac(&accum0, &aa[1],&bb[3]);
+    mac(&accum1, &a[5], &b[7]);
+    mac(&accum0, &aa[2], &bb[2]);
+    mac(&accum1, &a[6], &b[6]);
+    mac(&accum0, &aa[3], &bb[1]);
+    accum1 += accum0;
+
+    accum2 = widemul(&a[0],&b[0]);
+    accum1 -= accum2;
+    accum0 += accum2;
+    
+    msb(&accum0, &a[1], &b[3]);
+    msb(&accum0, &a[2], &b[2]);
+    mac(&accum1, &a[7], &b[5]);
+    msb(&accum0, &a[3], &b[1]);
+    mac(&accum1, &aa[0], &bb[0]);
+    mac(&accum0, &a[4], &b[4]);
+
+    c[0] = ((uint64_t)(accum0)) & mask;
+    c[4] = ((uint64_t)(accum1)) & mask;
+
+    accum0 >>= 60;
+    accum1 >>= 60;
+
+    accum2  = widemul(&a[2],&b[7]);
+    mac(&accum0, &a[6], &bb[3]);
+    mac(&accum1, &aa[2], &bbb[3]);
+
+    mac(&accum2, &a[3], &b[6]);
+    mac(&accum0, &a[7], &bb[2]);
+    mac(&accum1, &aa[3], &bbb[2]);
+
+    mac(&accum2, &a[0],&b[1]);
+    mac(&accum1, &aa[0], &bb[1]);
+    mac(&accum0, &a[4], &b[5]);
+
+    mac(&accum2, &a[1], &b[0]);
+    mac(&accum1, &aa[1], &bb[0]);
+    mac(&accum0, &a[5], &b[4]);
+
+    accum1 -= accum2;
+    accum0 += accum2;
+
+    c[1] = ((uint64_t)(accum0)) & mask;
+    c[5] = ((uint64_t)(accum1)) & mask;
+
+    accum0 >>= 60;
+    accum1 >>= 60;
+
+    accum2  = widemul(&a[3],&b[7]);
+    mac(&accum0, &a[7], &bb[3]);
+    mac(&accum1, &aa[3], &bbb[3]);
+
+    mac(&accum2, &a[0],&b[2]);
+    mac(&accum1, &aa[0], &bb[2]);
+    mac(&accum0, &a[4], &b[6]);
+
+    mac(&accum2, &a[1], &b[1]);
+    mac(&accum1, &aa[1], &bb[1]);
+    mac(&accum0, &a[5], &b[5]);
+
+    mac(&accum2, &a[2], &b[0]);
+    mac(&accum1, &aa[2], &bb[0]);
+    mac(&accum0, &a[6], &b[4]);
+
+    accum1 -= accum2;
+    accum0 += accum2;
+
+    c[2] = ((uint64_t)(accum0)) & mask;
+    c[6] = ((uint64_t)(accum1)) & mask;
+
+    accum0 >>= 60;
+    accum1 >>= 60;
+
+    accum0 += c[3]; /* fold the limb-2 carry into the provisional limb 3 */
+    accum1 += c[7]; /* fold the limb-6 carry into the provisional limb 7 */
+    c[3] = ((uint64_t)(accum0)) & mask;
+    c[7] = ((uint64_t)(accum1)) & mask;
+
+    /* we could almost stop here, but it wouldn't be stable, so... */
+
+    accum0 >>= 60; /* carry out of limb 3 */
+    accum1 >>= 60; /* carry out of limb 7 */
+    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); /* unmasked add: limb 4 may exceed 60 bits */
+    c[0] += ((uint64_t)(accum1)); /* the limb-7 carry wraps to limbs 0 and 4 (2^480 = 2^240 + 1 mod p) */
+}
+/* cs = as * b for a single-word multiplier b; the low and high limb halves are processed as two independent carry chains. */
+void
+p480_mulw (
+    p480_t *__restrict__ cs,
+    const p480_t *as,
+    uint64_t b
+) {
+    const uint64_t *a = as->limb;
+    uint64_t *c = cs->limb;
+
+    __uint128_t accum0, accum4; /* carry chains for limbs 0..3 and 4..7 */
+    uint64_t mask = (1ull<<60) - 1; /* low 60 bits = one limb */
+
+    accum0 = widemul_rm(b, &a[0]);
+    accum4 = widemul_rm(b, &a[4]);
+
+    c[0] = accum0 & mask; accum0 >>= 60;
+    c[4] = accum4 & mask; accum4 >>= 60;
+
+    mac_rm(&accum0, b, &a[1]);
+    mac_rm(&accum4, b, &a[5]);
+
+    c[1] = accum0 & mask; accum0 >>= 60;
+    c[5] = accum4 & mask; accum4 >>= 60;
+
+    mac_rm(&accum0, b, &a[2]);
+    mac_rm(&accum4, b, &a[6]);
+
+    c[2] = accum0 & mask; accum0 >>= 60;
+    c[6] = accum4 & mask; accum4 >>= 60;
+
+    mac_rm(&accum0, b, &a[3]);
+    mac_rm(&accum4, b, &a[7]);
+
+    c[3] = accum0 & mask; accum0 >>= 60;
+    c[7] = accum4 & mask; accum4 >>= 60;
+    /* Here accum0 is the carry out of limb 3 and accum4 the carry out of limb 7. */
+    accum0 += accum4 + c[4]; /* limb 4 receives both carries */
+    c[4] = accum0 & mask;
+    c[5] += accum0 >> 60; /* unmasked add: limb 5 may exceed 60 bits */
+
+    accum4 += c[0]; /* limb 0 receives the limb-7 carry */
+    c[0] = accum4 & mask;
+    c[1] += accum4 >> 60; /* unmasked add: limb 1 may exceed 60 bits */
+}
+/* cs = as^2, following the column order of p480_mul; cross terms use widemul2/mac2/msb2 (mac2/msb2 are defined outside this view). */
+void
+p480_sqr (
+    p480_t *__restrict__ cs,
+    const p480_t *as
+) {
+    const uint64_t *a = as->limb;
+    uint64_t *c = cs->limb;
+
+    __uint128_t accum0 = 0, accum1 = 0, accum2;
+    uint64_t mask = (1ull<<60) - 1; /* low 60 bits = one limb */
+
+    uint64_t aa[4] __attribute__((aligned(32)));
+
+    /* For some reason clang doesn't vectorize this without prompting? */
+    unsigned int i;
+    for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
+      ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i]; /* aa = a_lo + a_hi */
+    }
+
+    accum2  = widemul(&a[0],&a[3]);
+    accum0  = widemul(&aa[0],&aa[3]);
+    accum1  = widemul(&a[4],&a[7]);
+
+    mac(&accum2, &a[1], &a[2]);
+    mac(&accum0, &aa[1], &aa[2]);
+    mac(&accum1, &a[5], &a[6]);
+
+    accum0 -= accum2;
+    accum1 += accum2;
+
+    c[3] = ((uint64_t)(accum1))<<1 & mask; /* each cross product was counted once; <<1 doubles it */
+    c[7] = ((uint64_t)(accum0))<<1 & mask;
+
+    accum0 >>= 59; /* shift by 59 rather than 60: carry of the doubled value */
+    accum1 >>= 59;
+
+    mac2(&accum0, &aa[1],&aa[3]);
+    mac2(&accum1, &a[5], &a[7]);
+    mac(&accum0, &aa[2], &aa[2]);
+    accum1 += accum0;
+
+    msb2(&accum0, &a[1], &a[3]);
+    mac(&accum1, &a[6], &a[6]);
+    
+    accum2 = widemul(&a[0],&a[0]);
+    accum1 -= accum2;
+    accum0 += accum2;
+
+    msb(&accum0, &a[2], &a[2]);
+    mac(&accum1, &aa[0], &aa[0]);
+    mac(&accum0, &a[4], &a[4]);
+
+    c[0] = ((uint64_t)(accum0)) & mask;
+    c[4] = ((uint64_t)(accum1)) & mask;
+
+    accum0 >>= 60;
+    accum1 >>= 60;
+
+    accum2  = widemul2(&aa[2],&aa[3]);
+    msb2(&accum0, &a[2], &a[3]);
+    mac2(&accum1, &a[6], &a[7]);
+
+    accum1 += accum2;
+    accum0 += accum2;
+
+    accum2  = widemul2(&a[0],&a[1]);
+    mac2(&accum1, &aa[0], &aa[1]);
+    mac2(&accum0, &a[4], &a[5]);
+
+    accum1 -= accum2;
+    accum0 += accum2;
+
+    c[1] = ((uint64_t)(accum0)) & mask;
+    c[5] = ((uint64_t)(accum1)) & mask;
+
+    accum0 >>= 60;
+    accum1 >>= 60;
+
+    accum2  = widemul(&aa[3],&aa[3]);
+    msb(&accum0, &a[3], &a[3]);
+    mac(&accum1, &a[7], &a[7]);
+
+    accum1 += accum2;
+    accum0 += accum2;
+
+    accum2  = widemul2(&a[0],&a[2]);
+    mac2(&accum1, &aa[0], &aa[2]);
+    mac2(&accum0, &a[4], &a[6]);
+
+    mac(&accum2, &a[1], &a[1]);
+    mac(&accum1, &aa[1], &aa[1]);
+    mac(&accum0, &a[5], &a[5]);
+
+    accum1 -= accum2;
+    accum0 += accum2;
+
+    c[2] = ((uint64_t)(accum0)) & mask;
+    c[6] = ((uint64_t)(accum1)) & mask;
+
+    accum0 >>= 60;
+    accum1 >>= 60;
+
+    accum0 += c[3]; /* fold the limb-2 carry into the provisional limb 3 */
+    accum1 += c[7]; /* fold the limb-6 carry into the provisional limb 7 */
+    c[3] = ((uint64_t)(accum0)) & mask;
+    c[7] = ((uint64_t)(accum1)) & mask;
+
+    /* we could almost stop here, but it wouldn't be stable, so... */
+
+    accum0 >>= 60; /* carry out of limb 3 */
+    accum1 >>= 60; /* carry out of limb 7 */
+    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); /* unmasked add: limb 4 may exceed 60 bits */
+    c[0] += ((uint64_t)(accum1)); /* the limb-7 carry wraps to limbs 0 and 4 */
+}
+/* Reduce a in place to its canonical form: subtract p, then add p back (masked) if the subtraction borrowed. */
+void
+p480_strong_reduce (
+    p480_t *a
+) {
+    uint64_t mask = (1ull<<60)-1;
+
+    /* first, clear high: bits 60 and up of limb 7 are folded into limbs 4 and 0 */
+    a->limb[4] += a->limb[7]>>60;
+    a->limb[0] += a->limb[7]>>60;
+    a->limb[7] &= mask;
+
+    /* now the total is less than 2^480 - 2^(480-60) + 2^(480-60+8) < 2p */
+
+    /* compute total_value - p.  No need to reduce mod p. */
+
+    __int128_t scarry = 0; /* signed running borrow/carry */
+    int i;
+    for (i=0; i<8; i++) {
+        scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask); /* limbs of p: mask everywhere except mask-1 at limb 4 */
+        a->limb[i] = scarry & mask;
+        scarry >>= 60;
+    }
+
+    /* uncommon case: it was >= p, so now scarry = 0 and this = x
+    * common case: it was < p, so now scarry = -1 and this = x - p + 2^480
+    * so let's add back in p.  will carry back off the top for 2^480.
+    */
+
+    assert(is_zero(scarry) | is_zero(scarry+1));
+
+    uint64_t scarry_mask = scarry & mask; /* 60 ones if the subtraction borrowed, else 0 */
+    __uint128_t carry = 0;
+
+    /* add it back */
+    for (i=0; i<8; i++) {
+        carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask); /* adds p's limb when scarry_mask is set, else 0 */
+        a->limb[i] = carry & mask;
+        carry >>= 60;
+    }
+
+    assert(is_zero(carry + scarry)); /* the final carry out cancels the earlier borrow */
+}
+/* Test whether a is zero: strongly reduce a copy, OR its limbs together, and return is_zero() of that (is_zero is declared elsewhere). */
+mask_t
+p480_is_zero (
+    const struct p480_t *a
+) {
+    struct p480_t b;
+    p480_copy(&b,a); /* work on a copy; the input is const */
+    p480_strong_reduce(&b);
+
+    uint64_t any = 0; /* nonzero iff some reduced limb is nonzero */
+    int i;
+    for (i=0; i<8; i++) {
+        any |= b.limb[i];
+    }
+    return is_zero(any);
+}
+/* Write the strongly reduced value of x to serial as 60 little-endian bytes: each pair of 60-bit limbs becomes 15 bytes. */
+void
+p480_serialize (
+    uint8_t *serial,
+    const struct p480_t *x
+) {
+    int i,j,k=0;
+    p480_t red;
+    p480_copy(&red, x);
+    p480_strong_reduce(&red);
+    word_t r = 0;
+    for (i=0; i<8; i+=2) {
+        r = red.limb[i];
+        for (j=0; j<7; j++) { /* low 56 bits of the even limb */
+            serial[k++] = r;
+            r >>= 8;
+        }
+        assert(r<16); /* 4 bits of the even limb remain */
+        r += red.limb[i+1]<<4; /* odd limb sits above those 4 bits: 64 bits total */
+        for (j=0; j<8; j++) {
+            serial[k++] = r;
+            r >>= 8;
+        }
+        assert(r==0);
+    }
+}
+/* Unpack 60 little-endian bytes into eight 60-bit limbs of x, and return a mask derived from comparing x against p (see return). */
+mask_t
+p480_deserialize (
+    p480_t *x,
+    const uint8_t serial[60]
+) {
+    int i,j,k=0;
+
+    for (i=0; i<8; i+=2) { /* 15 bytes per pair of limbs */
+        word_t r = 0;
+        for (j=0; j<8; j++) {
+            r |= ((word_t)serial[k++])<<(8*j);
+        }
+        x->limb[i] = r & ((1ull<<60)-1);
+        r >>= 60; /* 4 leftover bits start the odd limb */
+        for (j=0; j<7; j++) {
+            r |= ((word_t)serial[k++])<<(8*j+4);
+        }
+        x->limb[i+1] = r;
+    }
+    
+    /* Check for reduction.
+     *
+     * The idea is to create a variable ge which is all ones (rather, 60 ones)
+     * if and only if the low $i$ words of $x$ are >= those of p.
+     *
+     * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
+     */
+    word_t ge = -1, mask = (1ull<<60)-1;
+    for (i=0; i<4; i++) {
+        ge &= x->limb[i];
+    }
+    
+    /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
+    ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
+    
+    /* Propagate the rest */
+    for (i=5; i<8; i++) {
+        ge &= x->limb[i];
+    }
+    
+    return ~is_zero(ge ^ mask); /* complement of "ge == mask", i.e. of "x >= p"; is_zero is declared elsewhere */
+}
+
diff --git a/src/p480/arch_x86_64/p480.h b/src/p480/arch_x86_64/p480.h
new file mode 100644
index 0000000..a49c6d0
--- /dev/null
+++ b/src/p480/arch_x86_64/p480.h
@@ -0,0 +1,257 @@
+/* Copyright (c) 2014 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+#ifndef __p480_H__
+#define __p480_H__ 1
+
+#include <stdint.h>
+#include <assert.h>
+
+#include "word.h"
+/* Field element: eight 64-bit words, limb[0] least significant, used as 60-bit limbs by p480.c; 32-byte aligned. */
+typedef struct p480_t {
+  uint64_t limb[8];
+} __attribute__((aligned(32))) p480_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static __inline__ void
+p480_set_ui (
+    p480_t *out,
+    uint64_t x
+) __attribute__((unused,always_inline));
+
+static __inline__ void
+p480_add (
+    p480_t *out,
+    const p480_t *a,
+    const p480_t *b
+) __attribute__((unused,always_inline));
+             
+static __inline__ void
+p480_sub (
+    p480_t *out,
+    const p480_t *a,
+    const p480_t *b
+) __attribute__((unused,always_inline));
+             
+static __inline__ void
+p480_neg (
+    p480_t *out,
+    const p480_t *a
+) __attribute__((unused,always_inline));
+
+static __inline__ void
+p480_addw (
+    p480_t *a,
+    uint64_t x
+) __attribute__((unused,always_inline));
+             
+static __inline__ void
+p480_subw (
+    p480_t *a,
+    uint64_t x
+) __attribute__((unused,always_inline));
+             
+static __inline__ void
+p480_copy (
+    p480_t *out,
+    const p480_t *a
+) __attribute__((unused,always_inline));
+             
+static __inline__ void
+p480_weak_reduce (
+    p480_t *inout
+) __attribute__((unused,always_inline));
+             
+void
+p480_strong_reduce (
+    p480_t *inout
+);
+
+mask_t
+p480_is_zero (
+    const p480_t *in
+);
+  
+static __inline__ void
+p480_bias (
+    p480_t *inout,
+    int amount
+) __attribute__((unused,always_inline));
+         
+void
+p480_mul (
+    p480_t *__restrict__ out,
+    const p480_t *a,
+    const p480_t *b
+);
+
+void
+p480_mulw (
+    p480_t *__restrict__ out,
+    const p480_t *a,
+    uint64_t b
+);
+
+void
+p480_sqr (
+    p480_t *__restrict__ out,
+    const p480_t *a
+);
+
+void
+p480_serialize (
+    uint8_t *serial,
+    const struct p480_t *x
+);
+
+mask_t
+p480_deserialize (
+    p480_t *x,
+    const uint8_t serial[60]
+);
+
+/* -------------- Inline functions begin here -------------- */
+/* Set out to the integer x: limb 0 takes x as-is, all other limbs are zeroed. */
+void
+p480_set_ui (
+    p480_t *out,
+    uint64_t x
+) {
+    int i;
+    out->limb[0] = x; /* stored unmasked, so x above 60 bits leaves limb 0 unreduced */
+    for (i=1; i<8; i++) {
+      out->limb[i] = 0;
+    }
+}
+/* out = a + b, limb by limb; no carry propagation or reduction is done here. */
+void
+p480_add (
+    p480_t *out,
+    const p480_t *a,
+    const p480_t *b
+) {
+    unsigned int i;
+    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) { /* uint64xn_t is declared elsewhere; presumably a vector of uint64_t */
+        ((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)b)[i];
+    }
+    /* Scalar form of the loop above, kept for reference:
+    unsigned int i;
+    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
+        out->limb[i] = a->limb[i] + b->limb[i];
+    }
+    */
+}
+/* out = a - b, limb by limb; limbs wrap modulo 2^64 and no borrow propagation or reduction is done here. */
+void
+p480_sub (
+    p480_t *out,
+    const p480_t *a,
+    const p480_t *b
+) {
+    unsigned int i;
+    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) { /* uint64xn_t is declared elsewhere; presumably a vector of uint64_t */
+        ((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] - ((const uint64xn_t*)b)[i];
+    }
+    /* Scalar form of the loop above, kept for reference:
+    unsigned int i;
+    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
+        out->limb[i] = a->limb[i] - b->limb[i];
+    }
+    */
+}
+/* out = -a, limb by limb (each limb negated modulo 2^64); no bias or reduction is applied here. */
+void
+p480_neg (
+    struct p480_t *out,
+    const p480_t *a
+) {
+    unsigned int i;
+    for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) { /* uint64xn_t is declared elsewhere; presumably a vector of uint64_t */
+        ((uint64xn_t*)out)[i] = -((const uint64xn_t*)a)[i];
+    }
+    /* Scalar form of the loop above, kept for reference:
+    unsigned int i;
+    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
+        out->limb[i] = -a->limb[i];
+    }
+    */
+}
+/* a += x, applied to limb 0 only; no carry propagation. */
+void
+p480_addw (
+    p480_t *a,
+    uint64_t x
+) {
+  a->limb[0] += x;
+}
+/* a -= x, applied to limb 0 only; limb 0 wraps modulo 2^64 and no borrow is propagated. */
+void
+p480_subw (
+    p480_t *a,
+    uint64_t x
+) {
+  a->limb[0] -= x;
+}
+/* out = a, copied in big_register_t-sized chunks (big_register_t is declared elsewhere). */
+void
+p480_copy (
+    p480_t *out,
+    const p480_t *a
+) {
+    unsigned int i;
+    for (i=0; i<sizeof(*out)/sizeof(big_register_t); i++) { /* assumes sizeof(p480_t) is a multiple of sizeof(big_register_t) -- confirm */
+        ((big_register_t *)out)[i] = ((const big_register_t *)a)[i];
+    }
+}
+/* Add amt*p to a limb-wise: every limb gains amt*(2^60-1), except limb 4 which gains amt*(2^60-2). */
+void
+p480_bias (
+    p480_t *a,
+    int amt
+) {
+    uint64_t co1 = ((1ull<<60)-1)*amt, co2 = co1-amt;
+    /* co1 is for the all-ones limbs of p, co2 for limb 4; the three variants below add the same values. */
+#if __AVX2__
+    uint64x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1}; /* hi covers limbs 4..7 */
+    uint64x4_t *aa = (uint64x4_t*) a;
+    aa[0] += lo;
+    aa[1] += hi;
+#elif __SSE2__
+    uint64x2_t lo = {co1,co1}, hi = {co2,co1}; /* hi covers limbs 4..5 */
+    uint64x2_t *aa = (uint64x2_t*) a;
+    aa[0] += lo;
+    aa[1] += lo;
+    aa[2] += hi;
+    aa[3] += lo;
+#else
+    unsigned int i;
+    for (i=0; i<sizeof(*a)/sizeof(uint64_t); i++) {
+        a->limb[i] += (i==4) ? co2 : co1;
+    }
+#endif
+}
+/* One carry pass: each limb keeps its low 60 bits and receives the overflow of the limb below; limb 7's overflow wraps to limbs 0 and 4. */
+void
+p480_weak_reduce (
+    p480_t *a
+) {
+    /* PERF: use pshufb/palignr if anyone cares about speed of this */
+    uint64_t mask = (1ull<<60) - 1;
+    uint64_t tmp = a->limb[7] >> 60; /* overflow of the top limb, captured before the loop rewrites it */
+    int i;
+    a->limb[4] += tmp;
+    for (i=7; i>0; i--) { /* top-down, so limb[i-1] is still unmodified when read */
+        a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>60);
+    }
+    a->limb[0] = (a->limb[0] & mask) + tmp;
+}
+
+#ifdef __cplusplus
+}; /* extern "C" */
+#endif
+
+#endif /* __p480_H__ */
diff --git a/src/p480/arch_x86_64/x86-64-arith.h b/src/p480/arch_x86_64/x86-64-arith.h
new file mode 100644
index 0000000..32ee832
--- /dev/null
+++ b/src/p480/arch_x86_64/x86-64-arith.h
@@ -0,0 +1,279 @@
+/* Copyright (c) 2014 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+#ifndef __X86_64_ARITH_H__
+#define __X86_64_ARITH_H__
+
+#include <stdint.h>
+
+/* TODO: non x86-64 versions of these.
+ * FUTURE: autogenerate
+ */
+
+static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) { /* returns the full 128-bit product (*a) * (*b) */
+  #ifndef __BMI2__
+  uint64_t c,d;
+  __asm__ volatile /* outputs are early-clobber (&): rax is written before %[b] is read, so the address of *b must not be allocated to it */
+      ("movq %[a], %%rax;"
+       "mulq %[b];"
+       : [c]"=&a"(c), [d]"=&d"(d)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "cc");
+  return (((__uint128_t)(d))<<64) | c;
+  #else
+  uint64_t c,d;
+  __asm__ volatile /* mulx takes its implicit multiplicand from rdx, hence the rdx clobber */
+      ("movq %[a], %%rdx;"
+       "mulx %[b], %[c], %[d];"
+       : [c]"=r"(c), [d]"=r"(d)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rdx");
+  return (((__uint128_t)(d))<<64) | c;
+  #endif
+}
+
+static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) { /* returns the full 128-bit product a * (*b); a is passed by value */
+  #ifndef __BMI2__
+  uint64_t c,d;
+  __asm__ volatile /* outputs are early-clobber (&): rax is written before %[b] is read, so the address of *b must not be allocated to it */
+      ("movq %[a], %%rax;"
+       "mulq %[b];"
+       : [c]"=&a"(c), [d]"=&d"(d)
+       : [b]"m"(*b), [a]"r"(a)
+       : "cc");
+  return (((__uint128_t)(d))<<64) | c;
+  #else
+  uint64_t c,d;
+  __asm__ volatile /* a is bound to rdx by the "d" constraint, where mulx expects its implicit multiplicand */
+      ("mulx %[b], %[c], %[d];"
+       : [c]"=r"(c), [d]"=r"(d)
+       : [b]"m"(*b), [a]"d"(a));
+  return (((__uint128_t)(d))<<64) | c;
+  #endif
+}
+
+static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) { /* returns (2*(*a) mod 2^64) * (*b) as a 128-bit value */
+  #ifndef __BMI2__
+  uint64_t c,d;
+  __asm__ volatile /* outputs are early-clobber (&): rax is written before %[b] is read, so the address of *b must not be allocated to it */
+      ("movq %[a], %%rax; "
+       "addq %%rax, %%rax; "
+       "mulq %[b];"
+       : [c]"=&a"(c), [d]"=&d"(d)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "cc");
+  return (((__uint128_t)(d))<<64) | c;
+  #else
+  uint64_t c,d;
+  __asm__ volatile /* the doubling uses lea, so this path does not touch the flags */
+      ("movq %[a], %%rdx;"
+       "leaq (,%%rdx,2), %%rdx;"
+       "mulx %[b], %[c], %[d];"
+       : [c]"=r"(c), [d]"=r"(d)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rdx");
+  return (((__uint128_t)(d))<<64) | c;
+  #endif
+}
+
+static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
+  uint64_t lo = *acc, hi = *acc>>64;
+  /* Multiply-accumulate: *acc += (*a) * (*b), with *acc split into 64-bit halves for add/adc. */
+  #ifdef __BMI2__
+  uint64_t c,d;
+  __asm__ volatile
+      ("movq %[a], %%rdx; "
+       "mulx %[b], %[c], %[d]; "
+       "addq %[c], %[lo]; "
+       "adcq %[d], %[hi]; "
+       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rdx", "cc");
+  #else
+  __asm__ volatile
+      ("movq %[a], %%rax; "
+       "mulq %[b]; "
+       "addq %%rax, %[lo]; "
+       "adcq %%rdx, %[hi]; "
+       : [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rax", "rdx", "cc");
+  #endif
+  /* Reassemble the accumulator; any carry out of the high half is dropped. */
+  *acc = (((__uint128_t)(hi))<<64) | lo;
+}
+
+static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
+  uint64_t lo = *acc, hi = *acc>>64;
+  uint64_t lo2 = *acc2, hi2 = *acc2>>64;
+  /* Double multiply-accumulate: the single product (*a) * (*b) is added to both *acc and *acc2. */
+  #ifdef __BMI2__
+  uint64_t c,d;
+  __asm__ volatile
+      ("movq %[a], %%rdx; "
+       "mulx %[b], %[c], %[d]; "
+       "addq %[c], %[lo]; "
+       "adcq %[d], %[hi]; "
+       "addq %[c], %[lo2]; "
+       "adcq %[d], %[hi2]; "
+       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rdx", "cc");
+  #else
+  __asm__ volatile
+      ("movq %[a], %%rax; "
+       "mulq %[b]; "
+       "addq %%rax, %[lo]; "
+       "adcq %%rdx, %[hi]; "
+       "addq %%rax, %[lo2]; "
+       "adcq %%rdx, %[hi2]; "
+       : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rax", "rdx", "cc");
+  #endif
+  /* Reassemble both accumulators; carries out of either high half are dropped. */
+  *acc = (((__uint128_t)(hi))<<64) | lo;
+  *acc2 = (((__uint128_t)(hi2))<<64) | lo2;
+}
+
+static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
+  uint64_t lo = *acc, hi = *acc>>64;
+  /* Multiply-accumulate with a passed by value: *acc += a * (*b). */
+  #ifdef __BMI2__
+  uint64_t c,d;
+  __asm__ volatile
+      ("mulx %[b], %[c], %[d]; "
+       "addq %[c], %[lo]; "
+       "adcq %[d], %[hi]; "
+       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"d"(a)
+       : "cc");
+  #else
+  __asm__ volatile
+      ("movq %[a], %%rax; "
+       "mulq %[b]; "
+       "addq %%rax, %[lo]; "
+       "adcq %%rdx, %[hi]; "
+       : [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"r"(a)
+       : "rax", "rdx", "cc");
+  #endif
+  /* Reassemble the accumulator; any carry out of the high half is dropped. */
+  *acc = (((__uint128_t)(hi))<<64) | lo;
+}
+
+static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
+  uint64_t lo = *acc, hi = *acc>>64;
+  /* Doubled multiply-accumulate: *acc += (2*(*a) mod 2^64) * (*b); the doubling is a 64-bit add, so the top bit of *a is lost. */
+  #ifdef __BMI2__
+  uint64_t c,d;
+  __asm__ volatile
+      ("movq %[a], %%rdx; "
+       "addq %%rdx, %%rdx; "
+       "mulx %[b], %[c], %[d]; "
+       "addq %[c], %[lo]; "
+       "adcq %[d], %[hi]; "
+       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rdx", "cc");
+  #else
+  __asm__ volatile
+      ("movq %[a], %%rax; "
+       "addq %%rax, %%rax; "
+       "mulq %[b]; "
+       "addq %%rax, %[lo]; "
+       "adcq %%rdx, %[hi]; "
+       : [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rax", "rdx", "cc");
+  #endif
+  /* Reassemble the accumulator; any carry out of the high half is dropped. */
+  *acc = (((__uint128_t)(hi))<<64) | lo;
+}
+
+static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { /* multiply-subtract: *acc -= (*a) * (*b) */
+  uint64_t lo = *acc, hi = *acc>>64; /* 64-bit halves of the accumulator, updated with sub/sbb */
+  #ifdef __BMI2__
+  uint64_t c,d;
+  __asm__ volatile
+      ("movq %[a], %%rdx; "
+       "mulx %[b], %[c], %[d]; "
+       "subq %[c], %[lo]; "
+       "sbbq %[d], %[hi]; "
+       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rdx", "cc");
+  #else
+  __asm__ volatile
+      ("movq %[a], %%rax; "
+       "mulq %[b]; "
+       "subq %%rax, %[lo]; "
+       "sbbq %%rdx, %[hi]; "
+       : [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rax", "rdx", "cc");
+  #endif
+  *acc = (((__uint128_t)(hi))<<64) | lo; /* reassemble; a borrow out of the high half is dropped */
+}
+
+static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { /* doubled multiply-subtract: *acc -= (2*(*a) mod 2^64) * (*b) */
+  uint64_t lo = *acc, hi = *acc>>64; /* 64-bit halves of the accumulator, updated with sub/sbb */
+  #ifdef __BMI2__
+  uint64_t c,d;
+  __asm__ volatile
+      ("movq %[a], %%rdx; "
+       "addq %%rdx, %%rdx; "
+       "mulx %[b], %[c], %[d]; "
+       "subq %[c], %[lo]; "
+       "sbbq %[d], %[hi]; "
+       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rdx", "cc");
+  #else
+  __asm__ volatile
+      ("movq %[a], %%rax; "
+       "addq %%rax, %%rax; "
+       "mulq %[b]; "
+       "subq %%rax, %[lo]; "
+       "sbbq %%rdx, %[hi]; "
+       : [lo]"+r"(lo), [hi]"+r"(hi)
+       : [b]"m"(*b), [a]"m"(*a)
+       : "rax", "rdx", "cc");
+  #endif
+  *acc = (((__uint128_t)(hi))<<64) | lo;
+  /* The doubling is a 64-bit add, so the top bit of *a is lost; a borrow out of the high half is dropped. */
+}
+
+static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
+  /*
+   * Reverse multiply-subtract: *acc = (*a) * (*b) - *acc, modulo 2^128.
+   *
+   * Written in plain C rather than inline asm: the asm form used mulx
+   * with no __BMI2__ guard, unlike every other helper in this file, so it
+   * would fault on CPUs without BMI2.  Unsigned __uint128_t arithmetic
+   * wraps, giving the same result as the mulx/subq/sbbq sequence.
+   */
+  const __uint128_t prod = ((__uint128_t)(*a)) * (*b);
+  *acc = prod - *acc;
+}
+
+static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) {
+  return (__uint128_t)a * (__uint128_t)b; /* unsigned 64x64 -> 128-bit product */
+}
+
+static __inline__ __int128_t widemuls(int64_t a, int64_t b) {
+  return (__int128_t)a * (__int128_t)b; /* signed 64x64 -> 128-bit product */
+}
+ 
+static __inline__ uint64_t opacify(uint64_t x) { /* returns x unchanged */
+  __asm__ volatile("" : "+r"(x)); /* empty asm with x as an in/out register operand: the compiler must assume x may have been modified */
+  return x;
+}
+
+static __inline__ mask_t is_zero(uint64_t x) { /* returns ~0 (converted to mask_t) when x == 0, and 0 otherwise, without branching */
+  __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x) : : "cc"); /* neg sets CF iff x != 0; sbb then leaves -CF in x; both modify the flags, hence "cc" */
+  return ~x;
+}
+
+#endif /* __X86_64_ARITH_H__ */
diff --git a/src/p480/f_arithmetic.c b/src/p480/f_arithmetic.c
new file mode 100644
index 0000000..d616e42
--- /dev/null
+++ b/src/p480/f_arithmetic.c
@@ -0,0 +1,43 @@
+/**
+ * @cond internal
+ * @file f_arithmetic.c
+ * @copyright
+ *   Copyright (c) 2014 Cryptography Research, Inc.  \n
+ *   Released under the MIT License.  See LICENSE.txt for license information.
+ * @author Mike Hamburg
+ * @brief Field-specific arithmetic.
+ */
+
+#include "ec_point.h"
+
+void 
+field_isr (
+    struct field_t*       a,
+    const struct field_t* x
+) { /* a = x^(2^478 - 2^238 - 1), which is (p-3)/4 for p = 2^480 - 2^240 - 1; then x*a^2 = x^((p-1)/2) */
+    struct field_t L0, L1, L2, L3; /* notes below track the exponent of x, assuming field_sqrn(o,i,n) squares n times */
+    field_sqr  (   &L2,     x );           /* L2 = x^2 */
+    field_mul  (   &L1,     x,   &L2 );    /* L1 = x^(2^2 - 1) */
+    field_sqrn (   &L0,   &L1,     2 );    /* L0 = x^(2^4 - 2^2) */
+    field_mul  (   &L2,   &L1,   &L0 );    /* L2 = x^(2^4 - 1) */
+    field_sqrn (   &L0,   &L2,     4 );    /* L0 = x^(2^8 - 2^4) */
+    field_mul  (   &L1,   &L2,   &L0 );    /* L1 = x^(2^8 - 1) */
+    field_sqr  (   &L0,   &L1 );           /* L0 = x^(2^9 - 2) */
+    field_mul  (   &L2,     x,   &L0 );    /* L2 = x^(2^9 - 1) */
+    field_sqrn (   &L0,   &L2,     8 );    /* L0 = x^(2^17 - 2^8) */
+    field_mul  (   &L2,   &L1,   &L0 );    /* L2 = x^(2^17 - 1) */
+    field_sqrn (   &L0,   &L2,    17 );    /* L0 = x^(2^34 - 2^17) */
+    field_mul  (   &L1,   &L2,   &L0 );    /* L1 = x^(2^34 - 1) */
+    field_sqrn (   &L0,   &L1,    17 );    /* L0 = x^(2^51 - 2^17) */
+    field_mul  (   &L1,   &L2,   &L0 );    /* L1 = x^(2^51 - 1) */
+    field_sqrn (   &L3,   &L1,    17 );    /* L3 = x^(2^68 - 2^17) */
+    field_mul  (   &L0,   &L2,   &L3 );    /* L0 = x^(2^68 - 1) */
+    field_sqrn (   &L2,   &L0,    51 );    /* L2 = x^(2^119 - 2^51) */
+    field_mul  (   &L0,   &L1,   &L2 );    /* L0 = x^(2^119 - 1) */
+    field_sqrn (   &L1,   &L0,   119 );    /* L1 = x^(2^238 - 2^119) */
+    field_mul  (   &L2,   &L0,   &L1 );    /* L2 = x^(2^238 - 1) */
+    field_sqr  (   &L0,   &L2 );           /* L0 = x^(2^239 - 2) */
+    field_mul  (   &L1,     x,   &L0 );    /* L1 = x^(2^239 - 1) */
+    field_sqrn (   &L0,   &L1,   239 );    /* L0 = x^(2^478 - 2^239) */
+    field_mul  (     a,   &L2,   &L0 );    /* a  = x^(2^478 - 2^239 + 2^238 - 1) = x^(2^478 - 2^238 - 1) */
+}
diff --git a/src/p480/f_field.h b/src/p480/f_field.h
new file mode 100644
index 0000000..397f83d
--- /dev/null
+++ b/src/p480/f_field.h
@@ -0,0 +1,39 @@
+/**
+ * @file f_field.h
+ * @brief Field-specific code.
+ * @copyright
+ *   Copyright (c) 2014 Cryptography Research, Inc.  \n
+ *   Released under the MIT License.  See LICENSE.txt for license information.
+ * @author Mike Hamburg
+ */
+#ifndef __F_FIELD_H__
+#define __F_FIELD_H__ 1
+
+#include <string.h>
+#include "constant_time.h"
+
+#include "p480.h" /* the macros below alias the generic field_* names to their p480_* counterparts */
+#define FIELD_BITS           480
+#define field_t              p480_t
+#define field_mul            p480_mul
+#define field_sqr            p480_sqr
+#define field_add            p480_add
+#define field_sub            p480_sub
+#define field_mulw           p480_mulw
+#define field_addw           p480_addw
+#define field_subw           p480_subw
+#define field_neg            p480_neg
+#define field_set_ui         p480_set_ui
+#define field_bias           p480_bias
+#define field_cond_neg       p480_cond_neg
+#define field_inverse        p480_inverse
+#define field_eq             p480_eq
+#define field_isr            p480_isr
+#define field_simultaneous_invert p480_simultaneous_invert
+#define field_weak_reduce    p480_weak_reduce
+#define field_strong_reduce  p480_strong_reduce
+#define field_serialize      p480_serialize
+#define field_deserialize    p480_deserialize
+#define field_is_zero        p480_is_zero
+
+#endif /* __F_FIELD_H__ */
diff --git a/src/p480/f_magic.h b/src/p480/f_magic.h
new file mode 100644
index 0000000..d5d095a
--- /dev/null
+++ b/src/p480/f_magic.h
@@ -0,0 +1,35 @@
+/**
+ * @file f_magic.h
+ * @copyright
+ *   Copyright (c) 2014 Cryptography Research, Inc.  \n
+ *   Released under the MIT License.  See LICENSE.txt for license information.
+ * @author Mike Hamburg
+ * @brief Goldilocks magic numbers (group orders, coefficients, algo params etc).
+ */
+
+#ifndef __GOLDI_F_MAGIC_H__
+#define __GOLDI_F_MAGIC_H__ 1
+
+#include "field.h"
+#include "ec_point.h"
+
+/**
+ * @brief The Edwards "d" term for this curve.
+ */
+static const int64_t EDWARDS_D = 53825;
+
+/** @brief The number of combs to use for signed comb algo */
+#define COMB_N (USE_BIG_COMBS ? 6  : 5)
+
+/** @brief The number of teeth of the combs for signed comb algo */
+#define COMB_T (USE_BIG_COMBS ? 5  : 4)
+
+/** @brief The spacing of the combs for signed comb algo (COMB_N*COMB_T*COMB_S is 480 in both configurations) */
+#define COMB_S (USE_BIG_COMBS ? 16 : 24)
+
+/**
+ * @brief crandom magic structure guard constant = "return 4", cf xkcd #221
+ */
+#define CRANDOM_MAGIC 0x72657475726e2034ull
+
+#endif /* __GOLDI_F_MAGIC_H__ */
diff --git a/src/p480/magic.c b/src/p480/magic.c
new file mode 100644
index 0000000..ee90a0a
--- /dev/null
+++ b/src/p480/magic.c
@@ -0,0 +1,68 @@
+/* Copyright (c) 2014 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+#include "field.h"
+#include "magic.h"
+#include "barrett_field.h"
+
+/* FUTURE: automatically generate this file? */
+
+const uint8_t FIELD_MODULUS[FIELD_BYTES] = { /* 60 bytes, least-significant first: every bit set except bit 240, i.e. 2^480 - 2^240 - 1 */
+      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+/*!*/ 0xfe, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = { /* two constants of SCALAR_WORDS words each, stored back to back */
+    U64LE(0x58b51bc56ea8f0c4), /* first constant; NOTE(review): presumably least-significant word first -- confirm against U64LE */
+    U64LE(0xd361f6a2348b50c9),
+    U64LE(0x08089c139c0002ae),
+    U64LE(0x0001d2ac3d9503a0),
+    U64LE(0x0000000000000000),
+    U64LE(0x0000000000000000),
+    U64LE(0x0000000000000000),
+    0x40000000, /* bare literal, not U64LE; NOTE(review): presumably so it stays one word_t on every word size -- confirm */
+    /* Second constant. */
+    U64LE(0xcb9c25073e36965b),
+    U64LE(0x6f2d48d8460f1661),
+    U64LE(0x0ab6256f7aaaae3e),
+    U64LE(0x00026e3afcc6af80),
+    U64LE(0x0000000000000000),
+    U64LE(0x0000000000000000),
+    U64LE(0x0000000000000000),
+    0x00000000
+};
+
+const struct affine_t goldilocks_base_point = { /* curve base point in affine form; test_pointops checks it with validate_affine() */
+    {{ /* first coordinate: eight 60-bit limbs; NOTE(review): presumably x, least-significant limb first -- confirm */
+        U60LE(0x849ff7f845c30d3),
+        U60LE(0x7dda488553a4c5b),
+        U60LE(0x1d3a2d9844831ea),
+        U60LE(0xb33ecf6ade470a2),
+        U60LE(0x8b3cb95210bd3c3),
+        U60LE(0xfc955e59aeefa65),
+        U60LE(0x3ab247cd530013c),
+        U60LE(0x7ca42af3d564280)
+    }},
+    {{ 5 }} /* second coordinate: the small value 5, remaining limbs zero-initialized; NOTE(review): presumably y -- confirm */
+};
+
+static const word_t curve_prime_order_lo[(240+WORD_BITS-1)/WORD_BITS] = { /* 240 bits of data: three full 64-bit entries plus a 48-bit top entry */
+    U64LE(0x72e70941cf8da597),
+    U64LE(0x9bcb52361183c598),
+    U64LE(0x02ad895bdeaaab8f),
+    U64LE(0x9b8ebf31abe0)
+};
+const struct barrett_prime_t curve_prime_order = { /* NOTE(review): field meanings below are inferred -- confirm against barrett_field.h */
+    GOLDI_FIELD_WORDS, /* presumably the word count of the modulus */
+    30 % WORD_BITS, /* presumably the bit offset of the top bit within its word (478 = 7*64 + 30 = 14*32 + 30) */
+    sizeof(curve_prime_order_lo)/sizeof(curve_prime_order_lo[0]), /* number of words in curve_prime_order_lo */
+    curve_prime_order_lo /* the low-order table above */
+};
+
+const struct field_t
+sqrt_d_minus_1 = {{
+    232 /* Whoa, it comes out even: 232^2 = 53824 = EDWARDS_D - 1 (EDWARDS_D is 53825 in f_magic.h). */
+}};
diff --git a/src/p521/f_arithmetic.c b/src/p521/f_arithmetic.c
new file mode 100644
index 0000000..7fbdfb8
--- /dev/null
+++ b/src/p521/f_arithmetic.c
@@ -0,0 +1,43 @@
+/**
+ * @cond internal
+ * @file f_arithmetic.c
+ * @copyright
+ *   Copyright (c) 2014 Cryptography Research, Inc.  \n
+ *   Released under the MIT License.  See LICENSE.txt for license information.
+ * @author Mike Hamburg
+ * @brief Field-specific arithmetic.
+ */
+
+#include "ec_point.h"
+
+void 
+field_isr (
+    struct field_t*       a,
+    const struct field_t* x
+) { /* a = x^(2^519 - 1), which is (p-3)/4 if p = 2^521 - 1 (presumed from FIELD_BITS; confirm against p521.h) */
+    struct field_t L0, L1, L2; /* notes below track the exponent of x, assuming field_sqrn(o,i,n) squares n times */
+    field_sqr  (   &L1,     x );           /* L1 = x^2 */
+    field_mul  (   &L0,     x,   &L1 );    /* L0 = x^(2^2 - 1) */
+    field_sqrn (   &L2,   &L0,     2 );    /* L2 = x^(2^4 - 2^2) */
+    field_mul  (   &L1,   &L0,   &L2 );    /* L1 = x^(2^4 - 1) */
+    field_sqrn (   &L2,   &L1,     4 );    /* L2 = x^(2^8 - 2^4) */
+    field_mul  (   &L0,   &L1,   &L2 );    /* L0 = x^(2^8 - 1) */
+    field_sqrn (   &L2,   &L0,     8 );    /* L2 = x^(2^16 - 2^8) */
+    field_mul  (   &L1,   &L0,   &L2 );    /* L1 = x^(2^16 - 1) */
+    field_sqrn (   &L2,   &L1,    16 );    /* L2 = x^(2^32 - 2^16) */
+    field_mul  (   &L0,   &L1,   &L2 );    /* L0 = x^(2^32 - 1) */
+    field_sqrn (   &L2,   &L0,    32 );    /* L2 = x^(2^64 - 2^32) */
+    field_mul  (   &L1,   &L0,   &L2 );    /* L1 = x^(2^64 - 1) */
+    field_sqr  (   &L2,   &L1 );           /* L2 = x^(2^65 - 2) */
+    field_mul  (   &L0,     x,   &L2 );    /* L0 = x^(2^65 - 1) */
+    field_sqrn (   &L2,   &L0,    64 );    /* L2 = x^(2^129 - 2^64) */
+    field_mul  (   &L0,   &L1,   &L2 );    /* L0 = x^(2^129 - 1) */
+    field_sqrn (   &L2,   &L0,   129 );    /* L2 = x^(2^258 - 2^129) */
+    field_mul  (   &L1,   &L0,   &L2 );    /* L1 = x^(2^258 - 1) */
+    field_sqr  (   &L2,   &L1 );           /* L2 = x^(2^259 - 2) */
+    field_mul  (   &L0,     x,   &L2 );    /* L0 = x^(2^259 - 1) */
+    field_sqrn (   &L2,   &L0,   259 );    /* L2 = x^(2^518 - 2^259) */
+    field_mul  (   &L1,   &L0,   &L2 );    /* L1 = x^(2^518 - 1) */
+    field_sqr  (   &L0,   &L1 );           /* L0 = x^(2^519 - 2) */
+    field_mul  (     a,     x,   &L0 );    /* a  = x^(2^519 - 1) */
+}
diff --git a/src/p521/f_field.h b/src/p521/f_field.h
new file mode 100644
index 0000000..f17fe3d
--- /dev/null
+++ b/src/p521/f_field.h
@@ -0,0 +1,39 @@
+/**
+ * @file f_field.h
+ * @brief Field-specific code.
+ * @copyright
+ *   Copyright (c) 2014 Cryptography Research, Inc.  \n
+ *   Released under the MIT License.  See LICENSE.txt for license information.
+ * @author Mike Hamburg
+ */
+#ifndef __F_FIELD_H__
+#define __F_FIELD_H__ 1
+
+#include <string.h>
+#include "constant_time.h"
+
+#include "p521.h" /* the macros below alias the generic field_* names to their p521_* counterparts */
+#define FIELD_BITS           521
+#define field_t              p521_t
+#define field_mul            p521_mul
+#define field_sqr            p521_sqr
+#define field_add            p521_add
+#define field_sub            p521_sub
+#define field_mulw           p521_mulw
+#define field_addw           p521_addw
+#define field_subw           p521_subw
+#define field_neg            p521_neg
+#define field_set_ui         p521_set_ui
+#define field_bias           p521_bias
+#define field_cond_neg       p521_cond_neg
+#define field_inverse        p521_inverse
+#define field_eq             p521_eq
+#define field_isr            p521_isr
+#define field_simultaneous_invert p521_simultaneous_invert
+#define field_weak_reduce    p521_weak_reduce
+#define field_strong_reduce  p521_strong_reduce
+#define field_serialize      p521_serialize
+#define field_deserialize    p521_deserialize
+#define field_is_zero        p521_is_zero
+
+#endif /* __F_FIELD_H__ */
diff --git a/test/bench.c b/test/bench.c
index 028844f..399337d 100644
--- a/test/bench.c
+++ b/test/bench.c
@@ -39,13 +39,12 @@ static void q448_randomize( struct crandom_state_t *crand, word_t sk[SCALAR_WORD
 }
 
 static void field_print( const char *descr, const struct field_t *a ) {
-    field_t b;
-    field_copy(&b, a);
-    field_strong_reduce(&b);
     int j;
+    unsigned char ser[FIELD_BYTES];
+    field_serialize(ser,a);
     printf("%s = 0x", descr);
-    for (j=sizeof(*a)/sizeof(a->limb[0])-1; j>=0; j--) {
-        printf(PRIxWORD58, b.limb[j]);
+    for (j=FIELD_BYTES - 1; j>=0; j--) {
+        printf("%02x", ser[j]);
     }
     printf("\n");
 }
@@ -58,7 +57,7 @@ field_print_full (
     int j;
     printf("%s = 0x", descr);
     for (j=15; j>=0; j--) {
-        printf("%02" PRIxWORD "_" PRIxWORD58 " ",
+        printf("%02" PRIxWORD "_" PRIxWORD56 " ",
             a->limb[j]>>28, a->limb[j]&((1<<28)-1));
     }
     printf("\n");
diff --git a/test/test.c b/test/test.c
index 3c25700..d3c41b4 100644
--- a/test/test.c
+++ b/test/test.c
@@ -84,13 +84,12 @@ void field_print (
     const char *descr,
     const struct field_t *a
 ) {
-    field_t b;
-    field_copy(&b, a);
-    field_strong_reduce(&b);
     int j;
+    unsigned char ser[FIELD_BYTES];
+    field_serialize(ser,a);
     printf("%s = 0x", descr);
-    for (j=FIELD_WORDS - 1; j>=0; j--) {
-        printf(PRIxWORD58, b.limb[LIMBPERM(j)]);
+    for (j=FIELD_BYTES - 1; j>=0; j--) {
+        printf("%02x", ser[j]);
     }
     printf("\n");
 }
diff --git a/test/test_arithmetic.c b/test/test_arithmetic.c
index a956c03..4d2d7d1 100644
--- a/test/test_arithmetic.c
+++ b/test/test_arithmetic.c
@@ -22,6 +22,8 @@ static mask_t mpz_to_field (
 
 static mask_t field_assert_eq_gmp(
     const char *descr,
+    const struct field_t *a,
+    const struct field_t *b,
     const struct field_t *x,
     const mpz_t y,
     float lowBound,
@@ -40,7 +42,7 @@ static mask_t field_assert_eq_gmp(
     
     unsigned int i;
     for (i=0; i<sizeof(*x)/sizeof(x->limb[0]); i++) {
-        int radix_bits = sizeof(x->limb[0]) * 448 / sizeof(*x);
+        int radix_bits = sizeof(x->limb[0]) * FIELD_BITS / sizeof(*x);
         word_t yardstick = (i==sizeof(*x)/sizeof(x->limb[0])/2) ?
             (1ull<<radix_bits) - 2 : (1ull<<radix_bits) - 1; // FIELD_MAGIC
         if (x->limb[i] < yardstick * lowBound || x->limb[i] > yardstick * highBound) {
@@ -54,6 +56,8 @@ static mask_t field_assert_eq_gmp(
     if (memcmp(xser,yser,FIELD_BYTES)) {
         youfail();
         printf("    Failed arithmetic test %s\n", descr);
+        field_print("    a", a);
+        field_print("    b", b);
         field_print("    goldi", x);
         printf("    gmp   = 0x");
         int j;
@@ -82,28 +86,30 @@ static mask_t test_add_sub (
     
     field_add(&tt,&xx,&yy);
     mpz_add(t,x,y);
-    succ &= field_assert_eq_gmp("add",&tt,t,0,2.1);
+    succ &= field_assert_eq_gmp("add",&xx,&yy,&tt,t,0,2.1);
     
     field_sub(&tt,&xx,&yy);
     field_bias(&tt,2);
     mpz_sub(t,x,y);
-    succ &= field_assert_eq_gmp("sub",&tt,t,0,3.1);
+    succ &= field_assert_eq_gmp("sub",&xx,&yy,&tt,t,0,3.1);
     
     field_copy(&tt,&xx);
     field_addw(&tt,word);
     mpz_add_ui(t,x,word);
-    succ &= field_assert_eq_gmp("addw",&tt,t,0,2.1);
+    succ &= field_assert_eq_gmp("addw",&xx,&yy,&tt,t,0,2.1);
     
     field_copy(&tt,&xx);
     field_subw(&tt,word);
     field_bias(&tt,1);
     mpz_sub_ui(t,x,word);
-    succ &= field_assert_eq_gmp("subw",&tt,t,0,2.1);
-    
+    succ &= field_assert_eq_gmp("subw",&xx,&yy,&tt,t,0,2.1);
+
+    /*
     if (!succ) {
         field_print("    x", &xx);
         field_print("    y", &yy);
     }
+    */
     
     mpz_clear(t);
     
@@ -124,19 +130,19 @@ static mask_t test_mul_sqr (
     
     field_mul(&tt,&xx,&yy);
     mpz_mul(t,x,y);
-    succ &= field_assert_eq_gmp("mul",&tt,t,0,1.1);
+    succ &= field_assert_eq_gmp("mul",&xx,&yy,&tt,t,0,1.1);
     
     field_mulw(&tt,&xx,word);
     mpz_mul_ui(t,x,word);
-    succ &= field_assert_eq_gmp("mulw",&tt,t,0,1.1);
+    succ &= field_assert_eq_gmp("mulw",&xx,&yy,&tt,t,0,1.1);
     
     field_sqr(&tt,&xx);
     mpz_mul(t,x,x);
-    succ &= field_assert_eq_gmp("sqrx",&tt,t,0,1.1);
+    succ &= field_assert_eq_gmp("sqrx",&xx,&yy,&tt,t,0,1.1);
     
     field_sqr(&tt,&yy);
     mpz_mul(t,y,y);
-    succ &= field_assert_eq_gmp("sqy",&tt,t,0,1.1);
+    succ &= field_assert_eq_gmp("sqy",&xx,&yy,&tt,t,0,1.1);
     
     if (!succ) {
         field_print("    x", &xx);
@@ -148,6 +154,36 @@ static mask_t test_mul_sqr (
     return succ;
 }
 
+static mask_t test_isr (
+    const mpz_t x
+) {
+    /* s = field_isr(x) must satisfy x*s^2 == +1 or -1 (for nonzero x); returns an all-ones mask on success. */
+    struct field_t xx,yy,ss,tt;
+    mask_t succ = mpz_to_field(&xx,x), pm1;
+    
+    field_isr(&ss,&xx);
+    field_sqr(&tt,&ss);
+    field_mul(&yy,&xx,&tt);
+    
+    field_copy(&tt,&yy);            /* test a scratch copy so yy survives for the failure report */
+    field_addw(&tt,1);              /* tt = x*s^2 + 1 */
+    pm1 = field_is_zero(&tt);       /* set iff x*s^2 == -1 */
+    
+    field_subw(&tt,2);              /* tt = x*s^2 - 1 ... */
+    field_bias(&tt,1);              /* ... biased as in the subw test, before the zero check */
+    pm1 |= field_is_zero(&tt);      /* set iff x*s^2 == +1 */
+    
+    succ &= pm1;                    /* the conversion AND one of the two checks must succeed */
+    if (~succ) {
+        youfail();
+        printf("ISR failure.\n");
+        field_print("    x", &xx);
+        field_print("    s", &ss);
+        field_print("    t", &yy);
+    }
+    return succ;
+}
+
 int test_arithmetic (void) {
     int j, ntests = 100000;
     
@@ -168,8 +204,8 @@ int test_arithmetic (void) {
         if (j<256) {
             mpz_set_ui(x,0);
             mpz_set_ui(y,0);
-            mpz_setbit(x,(j%16)*28); // FIELD_MAGIC
-            mpz_setbit(y,(j/16)*28); // FIELD_MAGIC
+            mpz_setbit(x,(j%16)*28);
+            mpz_setbit(y,(j/16)*28);
         } else if (j&1) {
             mpz_rrandomb(x, state, FIELD_BITS);
             mpz_rrandomb(y, state, FIELD_BITS);
@@ -183,6 +219,9 @@ int test_arithmetic (void) {
         succ &= test_add_sub(x,y,word);
         succ &= test_mul_sqr(x,y,word);
         
+        if (j < 1000)
+            succ &= test_isr(x);
+        
         // TODO: test neg, cond_neg, set_ui, wrd, srd, inv, ...?
     }
     
diff --git a/test/test_pointops.c b/test/test_pointops.c
index 8907608..6d4230d 100644
--- a/test/test_pointops.c
+++ b/test/test_pointops.c
@@ -3,6 +3,7 @@
 #include <stdio.h>
 
 #include "ec_point.h"
+#include "magic.h"
 #include "field.h"
 #include "crandom.h"
 
@@ -256,6 +257,15 @@ int test_pointops (void) {
     struct crandom_state_t crand;
     crandom_init_from_buffer(&crand, "test_pointops random initializer");
     
+    struct extensible_t ext_base;
+    if (!validate_affine(&goldilocks_base_point)) {
+        youfail();
+        printf("  Base point isn't on the curve.\n");
+        return -1;
+    }
+    convert_affine_to_extensible(&ext_base, &goldilocks_base_point);
+    if (!validate_ext(&ext_base, 2, "base")) return -1;
+    
     int i, ret;
     for (i=0; i<1000; i++) {
         uint8_t ser[FIELD_BYTES];
diff --git a/test/test_scalarmul.c b/test/test_scalarmul.c
index 82989b9..80636cf 100644
--- a/test/test_scalarmul.c
+++ b/test/test_scalarmul.c
@@ -39,8 +39,14 @@ single_scalarmul_compatibility_test (
     if (!succ) {
         return 1;
     }
-    
-    struct { int n,t,s; } params[] = {{5,5,18},{3,5,30},{4,4,28},{1,2,224}}; // FIELD_MAGIC
+
+#if FIELD_BITS == 448
+    struct { int n,t,s; } params[] = {{5,5,18},{3,5,30},{4,4,28},{1,2,224}};
+#elif FIELD_BITS == 480
+    struct { int n,t,s; } params[] = {{5,6,16},{6,5,16},{4,5,24},{4,4,30},{1,2,240}};
+#else
+    struct { int n,t,s; } params[] = {{5,5,(SCALAR_BITS+24)/25},{1,2,(SCALAR_BITS+1)/2}};
+#endif
     const int nparams = sizeof(params)/sizeof(params[0]);
     struct fixed_base_table_t fbt;
     const int nsizes = 6;