Begin ref impl, currently an arch option (arch_ref64).

10 years ago · a9c72b5a8d
--- a/HISTORY.txt
+++ b/HISTORY.txt
@@ -1,3 +1,13 @@
 September 18, 2014:
    Begin work on a "ref" implementation.  Currently this is just the
    arch_ref64 architecture.  The ref implementation always weak_reduces
    after arithmetic, and doesn't use vectors or other hackery.  Currently
    it still must declare field elements as vector aligned, though, because
    other code outside the arch directory can be vectorized.

    Change goldilocks.c to use field_eq instead of calling deep into field
    apis.

 September 6, 2014:
    Pull in minor changes from David Leon Gil and Nicholas Wilson, with
    some adjustments.  I hope the adjustments don't break their compiles.
--- a/src/arch_ref64/ec_point.c
+++ b/src/arch_ref64/ec_point.c
@@ -0,0 +1,825 @@
 /**
 * @cond internal
 * @file ec_point.c
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @warning This file was automatically generated.
 */

 #include "ec_point.h"


 /**
  * Raise x to the fixed power 2^446 - 2^222 - 1 and store the result in a.
  *
  * The exponent is reached with a fixed chain of squarings and
  * multiplications (no data-dependent branches in this function).  The
  * per-line exponents below assume p448_sqrn(out, in, n) performs n
  * successive squarings -- its definition is not in this file.
  *
  * Elsewhere in this file the result r is used as an inverse square root:
  * is_square() checks x * r^2 == 1, and p448_inverse() forms x * r^4.
  *
  * @param a  Output.  Written only by the final multiplication.
  * @param x  Input.  Read throughout, so it is never used as a destination.
  */
 void
 p448_isr (
    struct p448_t*       a,
    const struct p448_t* x
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L1,     x );
    p448_mul  (   &L2,     x,   &L1 );
    p448_sqr  (   &L1,   &L2 );
    p448_mul  (   &L2,     x,   &L1 );  /* L2 = x^(2^3 - 1) */
    p448_sqrn (   &L1,   &L2,     3 );
    p448_mul  (   &L0,   &L2,   &L1 );  /* L0 = x^(2^6 - 1) */
    p448_sqrn (   &L1,   &L0,     3 );
    p448_mul  (   &L0,   &L2,   &L1 );  /* L0 = x^(2^9 - 1) */
    p448_sqrn (   &L2,   &L0,     9 );
    p448_mul  (   &L1,   &L0,   &L2 );  /* L1 = x^(2^18 - 1) */
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (   &L2,     x,   &L0 );  /* L2 = x^(2^19 - 1) */
    p448_sqrn (   &L0,   &L2,    18 );
    p448_mul  (   &L2,   &L1,   &L0 );  /* L2 = x^(2^37 - 1) */
    p448_sqrn (   &L0,   &L2,    37 );
    p448_mul  (   &L1,   &L2,   &L0 );  /* L1 = x^(2^74 - 1) */
    p448_sqrn (   &L0,   &L1,    37 );
    p448_mul  (   &L1,   &L2,   &L0 );  /* L1 = x^(2^111 - 1) */
    p448_sqrn (   &L0,   &L1,   111 );
    p448_mul  (   &L2,   &L1,   &L0 );  /* L2 = x^(2^222 - 1) */
    p448_sqr  (   &L0,   &L2 );
    p448_mul  (   &L1,     x,   &L0 );  /* L1 = x^(2^223 - 1) */
    p448_sqrn (   &L0,   &L1,   223 );
    p448_mul  (     a,   &L2,   &L0 );  /* a  = x^(2^446 - 2^222 - 1) */
 }

 void
 p448_inverse (
    struct p448_t*       a,
    const struct p448_t* x
 ) {
    struct p448_t L0, L1;
    p448_isr  (   &L0,     x );
    p448_sqr  (   &L1,   &L0 );
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (     a,     x,   &L0 );
 }

 /**
  * Update the extensible point d in place using the Niels-form value e
  * (by its name, d += e on the twisted curve).
  *
  * With (x, y, z, t, u) the incoming fields of d, the code computes
  *   A = e->a * (y - x),  B = e->b * (x + y),  C = e->c * (u * t)
  * and then stores
  *   d->u = A + B,          d->t = B - A,
  *   d->z = (C + z)*(z - C),
  *   d->x = (z - C)*(B - A),
  *   d->y = (C + z)*(A + B).
  *
  * Fields of d are reused as scratch, so statement order matters.
  *
  * @param d  Point updated in place.
  * @param e  Niels-form operand; only read.
  */
 void
 add_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 ) {
    struct p448_t L0, L1;
    p448_sub  (   &L1, &d->y, &d->x );
    p448_mul  (   &L0, &e->a,   &L1 );
    p448_add  (   &L1, &d->x, &d->y );
    p448_mul  ( &d->y, &e->b,   &L1 );
    p448_mul  (   &L1, &d->u, &d->t );
    p448_mul  ( &d->x, &e->c,   &L1 );
    p448_add  ( &d->u,   &L0, &d->y );
    p448_sub  ( &d->t, &d->y,   &L0 );
    p448_sub  ( &d->y, &d->z, &d->x );
    p448_add  (   &L0, &d->x, &d->z );
    p448_mul  ( &d->z,   &L0, &d->y );
    p448_mul  ( &d->x, &d->y, &d->t );
    p448_mul  ( &d->y,   &L0, &d->u );
 }

 /**
  * Update the extensible point d in place using the Niels-form value e
  * (by its name, d -= e on the twisted curve).
  *
  * With (x, y, z, t, u) the incoming fields of d, the code computes
  *   A = e->b * (y - x),  B = e->a * (x + y),  C = e->c * (u * t)
  * and then stores
  *   d->u = A + B,          d->t = B - A,
  *   d->z = (z - C)*(C + z),
  *   d->x = (C + z)*(B - A),
  *   d->y = (z - C)*(A + B).
  *
  * Compared with add_tw_niels_to_tw_extensible, the roles of e->a and e->b
  * are exchanged and the sign of the C term is flipped.
  *
  * @param d  Point updated in place.
  * @param e  Niels-form operand; only read.
  */
 void
 sub_tw_niels_from_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 ) {
    struct p448_t L0, L1;
    p448_sub  (   &L1, &d->y, &d->x );
    p448_mul  (   &L0, &e->b,   &L1 );
    p448_add  (   &L1, &d->x, &d->y );
    p448_mul  ( &d->y, &e->a,   &L1 );
    p448_mul  (   &L1, &d->u, &d->t );
    p448_mul  ( &d->x, &e->c,   &L1 );
    p448_add  ( &d->u,   &L0, &d->y );
    p448_sub  ( &d->t, &d->y,   &L0 );
    p448_add  ( &d->y, &d->x, &d->z );
    p448_sub  (   &L0, &d->z, &d->x );
    p448_mul  ( &d->z,   &L0, &d->y );
    p448_mul  ( &d->x, &d->y, &d->t );
    p448_mul  ( &d->y,   &L0, &d->u );
 }

 void
 add_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 ) {
    struct p448_t L0;
    p448_mul  (   &L0, &e->z, &a->z );
    p448_copy ( &e->z,   &L0 );
    add_tw_niels_to_tw_extensible(     e, &a->n );
 }

 void
 sub_tw_pniels_from_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 ) {
    struct p448_t L0;
    p448_mul  (   &L0, &e->z, &a->z );
    p448_copy ( &e->z,   &L0 );
    sub_tw_niels_from_tw_extensible(     e, &a->n );
 }

 /**
  * Replace the twisted extensible point a by (per its name) its double,
  * in place.
  *
  * With (x, y, z) the incoming coordinates, the code forms
  *   S = x^2 + y^2,  P = (x + y)^2 - S (= 2xy),  D = y^2 - x^2,
  *   E = 2z^2 - D
  * and stores u = S, t = P, z = D*E, x = E*P, y = D*S.
  * The incoming t and u are overwritten without being read.
  *
  * @param a  Point updated in place.
  */
 void
 double_tw_extensible (
    struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->x );
    p448_sqr  (   &L0, &a->y );
    p448_add  ( &a->u,   &L2,   &L0 );
    p448_add  ( &a->t, &a->y, &a->x );
    p448_sqr  (   &L1, &a->t );
    p448_sub  ( &a->t,   &L1, &a->u );
    p448_sub  (   &L1,   &L0,   &L2 );
    p448_sqr  ( &a->x, &a->z );
    p448_add  ( &a->z, &a->x, &a->x );
    p448_sub  (   &L0, &a->z,   &L1 );
    p448_mul  ( &a->z,   &L1,   &L0 );
    p448_mul  ( &a->x,   &L0, &a->t );
    p448_mul  ( &a->y,   &L1, &a->u );
 }

 /**
  * Replace the (untwisted) extensible point a by (per its name) its double,
  * in place.
  *
  * With (x, y, z) the incoming coordinates, the code forms
  *   S = x^2 + y^2,  P = (x + y)^2 - S (= 2xy),  D = y^2 - x^2,
  *   E = 2z^2 - S
  * and stores t = P, u = D, z = S*E, x = E*P, y = S*D.
  * The incoming t and u are overwritten without being read.
  *
  * @param a  Point updated in place.
  */
 void
 double_extensible (
    struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->x );
    p448_sqr  (   &L0, &a->y );
    p448_add  (   &L1,   &L2,   &L0 );
    p448_add  ( &a->t, &a->y, &a->x );
    p448_sqr  ( &a->u, &a->t );
    p448_sub  ( &a->t, &a->u,   &L1 );
    p448_sub  ( &a->u,   &L0,   &L2 );
    p448_sqr  ( &a->x, &a->z );
    p448_add  ( &a->z, &a->x, &a->x );
    p448_sub  (   &L0, &a->z,   &L1 );
    p448_mul  ( &a->z,   &L1,   &L0 );
    p448_mul  ( &a->x,   &L0, &a->t );
    p448_mul  ( &a->y,   &L1, &a->u );
 }

 /**
  * Compute a twisted extensible point b from the untwisted point a
  * (per its name, the image of 2a on the twist).
  *
  * With (x, y, z) the coordinates of a, the code forms
  *   S = x^2 + y^2,  P = (x + y)^2 - S (= 2xy),  D = y^2 - x^2,
  *   E = 2z^2 - S
  * and stores b->u = S, b->t = P, b->z = D*E, b->x = E*P, b->y = D*S.
  * a->t and a->u are not read.  Fields of b serve as scratch before their
  * final values are written.
  *
  * @param b  Output point.
  * @param a  Input point; only read.
  */
 void
 twist_and_double (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    struct p448_t L0;
    p448_sqr  ( &b->x, &a->x );
    p448_sqr  ( &b->z, &a->y );
    p448_add  ( &b->u, &b->x, &b->z );
    p448_add  ( &b->t, &a->y, &a->x );
    p448_sqr  (   &L0, &b->t );
    p448_sub  ( &b->t,   &L0, &b->u );
    p448_sub  (   &L0, &b->z, &b->x );
    p448_sqr  ( &b->x, &a->z );
    p448_add  ( &b->z, &b->x, &b->x );
    p448_sub  ( &b->y, &b->z, &b->u );
    p448_mul  ( &b->z,   &L0, &b->y );
    p448_mul  ( &b->x, &b->y, &b->t );
    p448_mul  ( &b->y,   &L0, &b->u );
 }

 /**
  * Compute an untwisted extensible point b from the twisted point a
  * (per its name, the image of 2a on the untwisted curve).
  *
  * With (x, y, z) the coordinates of a, the code forms
  *   S = x^2 + y^2,  P = (x + y)^2 - S (= 2xy),  D = y^2 - x^2,
  *   E = 2z^2 - D
  * and stores b->t = P, b->u = D, b->z = S*E, b->x = E*P, b->y = S*D.
  * a->t and a->u are not read.  Fields of b serve as scratch before their
  * final values are written.
  *
  * @param b  Output point.
  * @param a  Input point; only read.
  */
 void
 untwist_and_double (
    struct extensible_t*          b,
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0;
    p448_sqr  ( &b->x, &a->x );
    p448_sqr  ( &b->z, &a->y );
    p448_add  (   &L0, &b->x, &b->z );
    p448_add  ( &b->t, &a->y, &a->x );
    p448_sqr  ( &b->u, &b->t );
    p448_sub  ( &b->t, &b->u,   &L0 );
    p448_sub  ( &b->u, &b->z, &b->x );
    p448_sqr  ( &b->x, &a->z );
    p448_add  ( &b->z, &b->x, &b->x );
    p448_sub  ( &b->y, &b->z, &b->u );
    p448_mul  ( &b->z,   &L0, &b->y );
    p448_mul  ( &b->x, &b->y, &b->t );
    p448_mul  ( &b->y,   &L0, &b->u );
 }

 /**
  * Build the projective-Niels form b of the twisted affine point a:
  *   b->n.a = y - x,  b->n.b = x + y,  b->n.c = -78164 * x * y,  b->z = 2.
  *
  * b->z is used as scratch for the scaled product before being set to 2.
  *
  * @param b  Output.
  * @param a  Input affine point; only read.
  */
 void
 convert_tw_affine_to_tw_pniels (
    struct tw_pniels_t*       b,
    const struct tw_affine_t* a
 ) {
    p448_sub  ( &b->n.a, &a->y, &a->x );
    p448_add  ( &b->n.b, &a->x, &a->y );
    p448_mul  ( &b->n.c, &a->y, &a->x );
    p448_mulw ( &b->z, &b->n.c, 78164 );
    p448_neg  ( &b->n.c, &b->z );
    p448_set_ui( &b->z,     2 );
 }

 /**
  * Lift the twisted affine point a to extensible coordinates:
  * x and y are copied, z is set to 1, and (t, u) are set to (x, y).
  *
  * @param b  Output.
  * @param a  Input affine point; only read.
  */
 void
 convert_tw_affine_to_tw_extensible (
    struct tw_extensible_t*   b,
    const struct tw_affine_t* a
 ) {
    p448_copy ( &b->x, &a->x );
    p448_copy ( &b->y, &a->y );
    p448_set_ui( &b->z,     1 );
    p448_copy ( &b->t, &a->x );
    p448_copy ( &b->u, &a->y );
 }

 /**
  * Lift the affine point a to extensible coordinates:
  * x and y are copied, z is set to 1, and (t, u) are set to (x, y).
  *
  * @param b  Output.
  * @param a  Input affine point; only read.
  */
 void
 convert_affine_to_extensible (
    struct extensible_t*   b,
    const struct affine_t* a
 ) {
    p448_copy ( &b->x, &a->x );
    p448_copy ( &b->y, &a->y );
    p448_set_ui( &b->z,     1 );
    p448_copy ( &b->t, &a->x );
    p448_copy ( &b->u, &a->y );
 }

 /**
  * Build the projective-Niels form b of the twisted extensible point a:
  *   b->n.a = y - x,  b->n.b = x + y,  b->n.c = -78164 * u * t,
  *   b->z = 2 * a->z.
  *
  * b->z is used as scratch for the scaled product before receiving 2z.
  *
  * @param b  Output.
  * @param a  Input point; only read.
  */
 void
 convert_tw_extensible_to_tw_pniels (
    struct tw_pniels_t*           b,
    const struct tw_extensible_t* a
 ) {
    p448_sub  ( &b->n.a, &a->y, &a->x );
    p448_add  ( &b->n.b, &a->x, &a->y );
    p448_mul  ( &b->n.c, &a->u, &a->t );
    p448_mulw ( &b->z, &b->n.c, 78164 );
    p448_neg  ( &b->n.c, &b->z );
    p448_add  ( &b->z, &a->z, &a->z );
 }

 /**
  * Expand the projective-Niels value d into extensible coordinates:
  *   e->u = b + a,  e->t = b - a,
  *   e->x = z * (b - a),  e->y = z * (b + a),  e->z = z^2,
  * where a, b are d->n.a, d->n.b and z is d->z.  d->n.c is not read.
  *
  * @param e  Output.
  * @param d  Input; only read.
  */
 void
 convert_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* d
 ) {
    p448_add  ( &e->u, &d->n.b, &d->n.a );
    p448_sub  ( &e->t, &d->n.b, &d->n.a );
    p448_mul  ( &e->x, &d->z, &e->t );
    p448_mul  ( &e->y, &d->z, &e->u );
    p448_sqr  ( &e->z, &d->z );
 }

 /**
  * Expand the Niels value d into extensible coordinates:
  *   e->y = b + a,  e->x = b - a,  e->z = 1,  e->t = e->x,  e->u = e->y,
  * where a, b are d->a, d->b.  d->c is not read.
  *
  * @param e  Output.
  * @param d  Input; only read.
  */
 void
 convert_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  e,
    const struct tw_niels_t* d
 ) {
    p448_add  ( &e->y, &d->b, &d->a );
    p448_sub  ( &e->x, &d->b, &d->a );
    p448_set_ui( &e->z,     1 );
    p448_copy ( &e->t, &e->x );
    p448_copy ( &e->u, &e->y );
 }

 /**
  * Advance the Montgomery state a by one step, in place.
  *
  * All four of xa, za, xd, zd are rewritten from their previous values;
  * z0 is only read (it multiplies the squared sum written to xa).  The
  * constant 39082 scales the squared (zd + xd) term.
  *
  * NOTE(review): presumably one Montgomery-ladder step (differential add
  * into xa/za, doubling into xd/zd) -- confirm against the caller.
  *
  * State fields double as scratch, so statement order matters.
  *
  * @param a  Ladder state updated in place.
  */
 void
 montgomery_step (
    struct montgomery_t* a
 ) {
    struct p448_t L0, L1;
    p448_add  (   &L0, &a->zd, &a->xd );
    p448_sub  (   &L1, &a->xd, &a->zd );
    p448_sub  ( &a->zd, &a->xa, &a->za );
    p448_mul  ( &a->xd,   &L0, &a->zd );
    p448_add  ( &a->zd, &a->za, &a->xa );
    p448_mul  ( &a->za,   &L1, &a->zd );
    p448_add  ( &a->xa, &a->za, &a->xd );
    p448_sqr  ( &a->zd, &a->xa );
    p448_mul  ( &a->xa, &a->z0, &a->zd );
    p448_sub  ( &a->zd, &a->xd, &a->za );
    p448_sqr  ( &a->za, &a->zd );
    p448_sqr  ( &a->xd,   &L0 );
    p448_sqr  (   &L0,   &L1 );
    p448_mulw ( &a->zd, &a->xd, 39082 );
    p448_sub  (   &L1, &a->xd,   &L0 );
    p448_mul  ( &a->xd,   &L0, &a->zd );
    p448_sub  (   &L0, &a->zd,   &L1 );
    p448_mul  ( &a->zd,   &L0,   &L1 );
 }

 /**
  * Initialise the Montgomery state a from the field element sbz:
  *   z0 = sbz^2,  (xd, zd) = (1, 0),  (xa, za) = (1, z0).
  *
  * @param a    Output state; every field is written.
  * @param sbz  Input element; only read.
  */
 void
 deserialize_montgomery (
    struct montgomery_t* a,
    const struct p448_t* sbz
 ) {
    p448_sqr  ( &a->z0,   sbz );
    p448_set_ui( &a->xd,     1 );
    p448_set_ui( &a->zd,     0 );
    p448_set_ui( &a->xa,     1 );
    p448_copy ( &a->za, &a->z0 );
 }

 /**
  * Derive a single field element b from the Montgomery state a and the
  * starting element sbz, and report whether the derivation was consistent.
  *
  * The state a is only read.  The special case a->zd == 0 is handled with
  * masks rather than branches: a substitute factor is selected with
  * p448_mask, 1 is added to the radicand input, and the final output is
  * masked with the complement of the zd-is-zero mask.
  *
  * NOTE(review): p448_mask is defined outside this file; it is assumed to
  * AND every limb of its source with the given mask -- confirm.
  *
  * @param b    Output element.
  * @param a    Ladder state; only read.
  * @param sbz  The element the ladder was started from; only read.
  * @return     L1 | L0, where L1 is the mask "radicand * isr(radicand)^2
  *             == 1" and L0 is the mask "sbz is zero".
  */
 mask_t
 serialize_montgomery (
    struct p448_t*             b,
    const struct montgomery_t* a,
    const struct p448_t*       sbz
 ) {
    mask_t L0, L1, L2;
    struct p448_t L3, L4, L5, L6;
    p448_mul  (   &L6, &a->z0, &a->zd );
    p448_sub  (   &L4,   &L6, &a->xd );
    p448_mul  (   &L6, &a->za,   &L4 );
    p448_mul  (   &L5, &a->z0, &a->xd );
    p448_sub  (   &L4,   &L5, &a->zd );
    p448_mul  (   &L3, &a->xa,   &L4 );
    p448_add  (   &L5,   &L3,   &L6 );
    p448_sub  (   &L4,   &L6,   &L3 );
    p448_mul  (   &L6,   &L4,   &L5 );
    p448_copy (   &L5, &a->z0 );
    p448_addw (   &L5,     1 );
    p448_sqr  (   &L4,   &L5 );
    p448_mulw (   &L5,   &L4, 39082 );
    p448_neg  (   &L4,   &L5 );
    p448_add  (   &L3, &a->z0, &a->z0 );
    p448_add  (   &L5,   &L3,   &L3 );
    p448_add  (   &L3,   &L5,   &L4 );
    p448_mul  (   &L5, &a->xd,   &L3 );
    /* L1 is the zd-is-zero mask; negating an all-ones mask gives the word 1. */
       L1 = p448_is_zero( &a->zd );
       L2 = -   L1;
    p448_mask (   &L4,   &L5,    L1 );
    p448_add  (   &L5,   &L4, &a->zd );
       L0 = ~   L1;
    p448_mul  (   &L4,   sbz,   &L6 );
    p448_addw (   &L4,    L2 );
    p448_mul  (   &L6,   &L5,   &L4 );
    p448_mul  (   &L4,   &L6,   &L5 );
    p448_mul  (   &L5,   &L6, &a->xd );
    p448_mul  (   &L6,   &L4,   &L5 );
    p448_isr  (   &L3,   &L6 );
    p448_mul  (   &L5,   &L4,   &L3 );
    p448_sqr  (   &L4,   &L3 );
    p448_mul  (   &L3,   &L6,   &L4 );
    p448_mask (     b,   &L5,    L0 );
    /* Consistency check: L6 * isr(L6)^2 must equal 1. */
    p448_subw (   &L3,     1 );
       L1 = p448_is_zero(   &L3 );
       L0 = p448_is_zero(   sbz );
    return    L1 |    L0;
 }

 /**
  * Derive a single field element b from the extensible point a.
  *
  * Only a->x, a->y and a->z are read.  b is first used as scratch for
  * (z + y) and later receives the final product L1 * isr(L2).
  *
  * NOTE(review): the last two operations (the square and multiply after b
  * is written) only update the temporaries L1 and L0, which are not read
  * again; they look like a discarded isr consistency check left in by the
  * generator.
  *
  * @param b  Output element.
  * @param a  Input point; only read.
  */
 void
 serialize_extensible (
    struct p448_t*             b,
    const struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sub  (   &L0, &a->y, &a->z );
    p448_add  (     b, &a->z, &a->y );
    p448_mul  (   &L1, &a->z, &a->x );
    p448_mul  (   &L2,   &L0,   &L1 );
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_mul  (   &L0,   &L2,     b );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_isr  (   &L0,   &L2 );
    p448_mul  (     b,   &L1,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,   &L2,   &L1 );
 }

 /**
  * Derive a single field element b from the twisted extensible point a
  * (per its name: untwist, double, then serialize, fused into one pass).
  *
  * Only a->x, a->y and a->z are read.  b is reused as scratch several
  * times; its final value is L1 * (x*y), written by the last statement.
  *
  * NOTE(review): the square into b and the multiply into L0 immediately
  * before the final statement produce values that are never read (b is
  * overwritten, L0 is dead); they look like a discarded isr consistency
  * check left in by the generator.
  *
  * @param b  Output element.
  * @param a  Input point; only read.
  */
 void
 untwist_and_double_and_serialize (
    struct p448_t*                b,
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_mul  (   &L3, &a->y, &a->x );
    p448_add  (     b, &a->y, &a->x );
    p448_sqr  (   &L1,     b );
    p448_add  (   &L2,   &L3,   &L3 );
    p448_sub  (     b,   &L1,   &L2 );   /* b = x^2 + y^2 */
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1,   &L2 );          /* L1 = z^4 */
    p448_add  (   &L2,     b,     b );
    p448_mulw (     b,   &L2, 39082 );
    p448_neg  (   &L2,     b );
    p448_mulw (   &L0,   &L2, 39082 );
    p448_neg  (     b,   &L0 );
    p448_mul  (   &L0,   &L2,   &L1 );
    p448_mul  (   &L2,     b,   &L0 );
    p448_isr  (   &L0,   &L2 );
    p448_mul  (   &L1,     b,   &L0 );
    p448_sqr  (     b,   &L0 );
    p448_mul  (   &L0,   &L2,     b );
    p448_mul  (     b,   &L1,   &L3 );
 }

 /**
  * Map the untwisted extensible point a to a twisted extensible point b
  * with z = 1 (per its name, intended for even points).
  *
  * Only a->x, a->y and a->z are read.  Fields of b are used as scratch
  * throughout.  The result is normalised: b->z = 1 and (t, u) = (x, y).
  *
  * Special case handled without branching: when a->z - a->y is zero, 1 is
  * added to b->y (negating the all-ones zero mask yields the word 1).
  *
  * @param b  Output point.
  * @param a  Input point; only read.
  */
 void
 twist_even (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    mask_t L0, L1;
    p448_sqr  ( &b->y, &a->z );
    p448_sqr  ( &b->z, &a->x );
    p448_sub  ( &b->u, &b->y, &b->z );
    p448_sub  ( &b->z, &a->z, &a->x );
    p448_mul  ( &b->y, &b->z, &a->y );
    p448_sub  ( &b->z, &a->z, &a->y );
    p448_mul  ( &b->x, &b->z, &b->y );
    p448_mul  ( &b->t, &b->x, &b->u );
    p448_mul  ( &b->y, &b->x, &b->t );
    p448_isr  ( &b->t, &b->y );
    p448_mul  ( &b->u, &b->x, &b->t );
    p448_sqr  ( &b->x, &b->t );
    p448_mul  ( &b->t, &b->y, &b->x );
    p448_mul  ( &b->x, &a->x, &b->u );
    p448_mul  ( &b->y, &a->y, &b->u );
    /* b->z still holds a->z - a->y here. */
       L1 = p448_is_zero( &b->z );
       L0 = -   L1;
    p448_addw ( &b->y,    L0 );
    p448_set_ui( &b->z,     1 );
    p448_copy ( &b->t, &b->x );
    p448_copy ( &b->u, &b->y );
 }

 /**
  * Map the untwisted extensible point a to a twisted extensible point b
  * (per its name, intended for tests only).
  *
  * Only a->x, a->y and a->z are read.  Fields of b are used as scratch
  * throughout; on exit (t, u) = (x, y).
  *
  * Special cases are handled with masks rather than branches:
  *  - when a->z - a->x is zero, 1 is added to b->x;
  *  - when a->z - a->y is zero, 1 is added to b->y;
  *  - b->z is set to (is_zero(a->y) + 1), i.e. 0 when a->y is zero and 1
  *    otherwise (assuming the zero mask is all-ones and mask_t arithmetic
  *    wraps -- mask_t is defined outside this file).
  *
  * @param b  Output point.
  * @param a  Input point; only read.
  */
 void
 test_only_twist (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3;
    p448_sqr  ( &b->u, &a->z );
    p448_sqr  ( &b->y, &a->x );
    p448_sub  ( &b->z, &b->u, &b->y );
    p448_add  ( &b->y, &b->z, &b->z );
    p448_add  ( &b->u, &b->y, &b->y );
    p448_sub  ( &b->y, &a->z, &a->x );
    p448_mul  ( &b->x, &b->y, &a->y );
    p448_sub  ( &b->z, &a->z, &a->y );
    p448_mul  ( &b->t, &b->z, &b->x );
    p448_mul  (   &L3, &b->t, &b->u );
    p448_mul  ( &b->x, &b->t,   &L3 );
    p448_isr  (   &L2, &b->x );
    p448_mul  ( &b->u, &b->t,   &L2 );
    p448_sqr  (   &L3,   &L2 );
    p448_mul  ( &b->t, &b->x,   &L3 );
    p448_add  (   &L3, &a->y, &a->x );
    p448_sub  (   &L2, &a->x, &a->y );
    p448_mul  ( &b->x, &b->t,   &L2 );
    p448_add  (   &L2, &b->x,   &L3 );
    p448_sub  ( &b->t,   &L3, &b->x );
    p448_mul  ( &b->x,   &L2, &b->u );
    /* b->y still holds a->z - a->x here. */
       L0 = p448_is_zero( &b->y );
       L1 = -   L0;
    p448_addw ( &b->x,    L1 );
    p448_mul  ( &b->y, &b->t, &b->u );
    /* b->z still holds a->z - a->y here. */
       L0 = p448_is_zero( &b->z );
       L1 = -   L0;
    p448_addw ( &b->y,    L1 );
       L1 = p448_is_zero( &a->y );
       L0 =    L1 +     1;
    p448_set_ui( &b->z,    L0 );
    p448_copy ( &b->t, &b->x );
    p448_copy ( &b->u, &b->y );
 }

 mask_t
 is_square (
    const struct p448_t* x
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3;
    p448_isr  (   &L2,     x );
    p448_sqr  (   &L3,   &L2 );
    p448_mul  (   &L2,     x,   &L3 );
    p448_subw (   &L2,     1 );
       L1 = p448_is_zero(   &L2 );
       L0 = p448_is_zero(     x );
    return    L1 |    L0;
 }

 mask_t
 is_even_pt (
    const struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1, &a->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    return is_square (   &L0 );
 }

 mask_t
 is_even_tw (
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1, &a->x );
    p448_add  (   &L0,   &L1,   &L2 );
    return is_square (   &L0 );
 }

 /**
  * Reconstruct the affine point a from the single field element sz and
  * report whether the reconstruction was consistent.
  *
  * a->x and a->y are used as scratch before their final values are
  * written:
  *   a->x = sz * 2 * (intermediate), and a->y = (sz^2 + 1) * L3.
  * A single p448_isr call supplies the inverse square root used by both.
  *
  * @param a   Output point; written even when the returned mask is zero.
  * @param sz  Input element; only read.
  * @return    The mask p448_is_zero(L0 - 1), where L0 is the isr
  *            consistency product formed just before the outputs are
  *            written.
  */
 mask_t
 deserialize_affine (
    struct affine_t*     a,
    const struct p448_t* sz
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_sqr  (   &L1,    sz );
    p448_copy (   &L3,   &L1 );
    p448_addw (   &L3,     1 );
    p448_sqr  ( &a->x,   &L3 );
    p448_mulw (   &L3, &a->x, 39082 );
    p448_neg  ( &a->x,   &L3 );
    p448_add  (   &L3,   &L1,   &L1 );
    p448_add  ( &a->y,   &L3,   &L3 );
    p448_add  (   &L3, &a->y, &a->x );
    p448_copy ( &a->y,   &L1 );
    p448_subw ( &a->y,     1 );
    p448_neg  ( &a->x, &a->y );
    p448_mul  ( &a->y, &a->x,   &L3 );
    p448_sqr  (   &L2, &a->x );
    p448_mul  (   &L0,   &L2, &a->y );
    p448_mul  ( &a->y, &a->x,   &L0 );
    p448_isr  (   &L3, &a->y );
    p448_mul  ( &a->y,   &L2,   &L3 );
    p448_sqr  (   &L2,   &L3 );
    p448_mul  (   &L3,   &L0,   &L2 );
    p448_mul  (   &L0, &a->x,   &L3 );
    p448_add  (   &L2, &a->y, &a->y );
    p448_mul  ( &a->x,    sz,   &L2 );
    p448_addw (   &L1,     1 );
    p448_mul  ( &a->y,   &L1,   &L3 );
    p448_subw (   &L0,     1 );
    return p448_is_zero(   &L0 );
 }

 /**
  * Reconstruct a twisted extensible point a from the field element sz,
  * using the caller-supplied constant sdm1, and report whether the
  * reconstruction was consistent.
  *
  * Every field of a is used as scratch.  On exit a->z = 1 and
  * (t, u) = (x, y).
  *
  * NOTE(review): the "approx" in the name is not explained by the code
  * visible here; sdm1 is only ever used as a multiplier -- confirm its
  * expected value against the caller.
  *
  * @param a     Output point; written even when the returned mask is zero.
  * @param sdm1  Constant multiplier; only read.
  * @param sz    Input element; only read.
  * @return      The mask p448_is_zero(a->t - 1), taken just before t is
  *              overwritten with its final value.
  */
 mask_t
 deserialize_and_twist_approx (
    struct tw_extensible_t* a,
    const struct p448_t*    sdm1,
    const struct p448_t*    sz
 ) {
    struct p448_t L0, L1;
    p448_sqr  ( &a->z,    sz );
    p448_copy ( &a->y, &a->z );
    p448_addw ( &a->y,     1 );
    p448_sqr  ( &a->x, &a->y );
    p448_mulw ( &a->y, &a->x, 39082 );
    p448_neg  ( &a->x, &a->y );
    p448_add  ( &a->y, &a->z, &a->z );
    p448_add  ( &a->u, &a->y, &a->y );
    p448_add  ( &a->y, &a->u, &a->x );
    p448_sqr  ( &a->x, &a->z );
    p448_subw ( &a->x,     1 );
    p448_neg  ( &a->u, &a->x );
    p448_mul  ( &a->x,  sdm1, &a->u );
    p448_mul  (   &L0, &a->x, &a->y );
    p448_mul  ( &a->t,   &L0, &a->y );
    p448_mul  ( &a->u, &a->x, &a->t );
    p448_mul  ( &a->t, &a->u,   &L0 );
    p448_mul  ( &a->y, &a->x, &a->t );
    p448_isr  (   &L0, &a->y );
    p448_mul  ( &a->y, &a->u,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  ( &a->u, &a->t,   &L1 );
    p448_mul  ( &a->t, &a->x, &a->u );
    p448_add  ( &a->x,    sz,    sz );
    p448_mul  (   &L0, &a->u, &a->x );
    p448_copy ( &a->x, &a->z );
    p448_subw ( &a->x,     1 );
    p448_neg  (   &L1, &a->x );
    p448_mul  ( &a->x,   &L1,   &L0 );
    p448_mul  (   &L0, &a->u, &a->y );
    p448_addw ( &a->z,     1 );
    p448_mul  ( &a->y, &a->z,   &L0 );
    /* Consistency check on the isr result; t is rewritten below. */
    p448_subw ( &a->t,     1 );
    mask_t ret = p448_is_zero( &a->t );
    p448_set_ui( &a->z,     1 );
    p448_copy ( &a->t, &a->x );
    p448_copy ( &a->u, &a->y );
    return ret;
 }

 /**
  * Set a to the constant point with x = t = u = 0 and y = z = 1
  * (the identity, per the function name).
  *
  * @param a  Output; every field is written.
  */
 void
 set_identity_extensible (
    struct extensible_t* a
 ) {
    /* Coordinates that are zero. */
    p448_set_ui(&a->x, 0);
    p448_set_ui(&a->t, 0);
    p448_set_ui(&a->u, 0);

    /* Coordinates that are one. */
    p448_set_ui(&a->y, 1);
    p448_set_ui(&a->z, 1);
 }

 /**
  * Set a to the constant point with x = t = u = 0 and y = z = 1
  * (the identity, per the function name).
  *
  * @param a  Output; every field is written.
  */
 void
 set_identity_tw_extensible (
    struct tw_extensible_t* a
 ) {
    /* Coordinates that are zero. */
    p448_set_ui(&a->x, 0);
    p448_set_ui(&a->t, 0);
    p448_set_ui(&a->u, 0);

    /* Coordinates that are one. */
    p448_set_ui(&a->y, 1);
    p448_set_ui(&a->z, 1);
 }

 /**
  * Set a to the affine point (0, 1) (the identity, per the function name).
  *
  * @param a  Output; both coordinates are written.
  */
 void
 set_identity_affine (
    struct affine_t* a
 ) {
    p448_set_ui(&a->y, 1);
    p448_set_ui(&a->x, 0);
 }

 mask_t
 eq_affine (
    const struct affine_t* a,
    const struct affine_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2;
    p448_sub  (   &L2, &a->x, &b->x );
       L1 = p448_is_zero(   &L2 );
    p448_sub  (   &L2, &a->y, &b->y );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 mask_t
 eq_extensible (
    const struct extensible_t* a,
    const struct extensible_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 mask_t
 eq_tw_extensible (
    const struct tw_extensible_t* a,
    const struct tw_extensible_t* b
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4;
    p448_mul  (   &L4, &b->z, &a->x );
    p448_mul  (   &L3, &a->z, &b->x );
    p448_sub  (   &L2,   &L4,   &L3 );
       L1 = p448_is_zero(   &L2 );
    p448_mul  (   &L4, &b->z, &a->y );
    p448_mul  (   &L3, &a->z, &b->y );
    p448_sub  (   &L2,   &L4,   &L3 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 /**
  * Map the field element r to an affine point a (per its name, an
  * Elligator-2 style injection).
  *
  * a->x and a->y are used as scratch from the first statement onward, so
  * a must not overlap r.  A single p448_isr call is used; there is no
  * return value and no failure path.
  *
  * Special case handled without branching: when the temporary L8 is zero
  * at the end, 1 is added to a->y (negating the all-ones zero mask yields
  * the word 1).
  *
  * NOTE(review): several multipliers (e.g. 6108985600, 6109454568,
  * 3054649120) do not fit in 32 bits; this relies on p448_mulw taking a
  * 64-bit word, as declared in p448.c.
  *
  * @param a  Output point.
  * @param r  Input element; only read.
  */
 void
 elligator_2s_inject (
    struct affine_t*     a,
    const struct p448_t* r
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5, L6, L7, L8;
    p448_sqr  ( &a->x,     r );
    p448_sqr  (   &L3, &a->x );
    p448_copy ( &a->y,   &L3 );
    p448_subw ( &a->y,     1 );
    p448_neg  (   &L4, &a->y );
    p448_sqr  (   &L2,   &L4 );
    p448_mulw (   &L7,   &L2, 1527402724 );
    p448_mulw (   &L8,   &L3, 6108985600 );
    p448_add  ( &a->y,   &L8,   &L7 );
    p448_mulw (   &L8,   &L2, 6109454568 );
    p448_sub  (   &L7, &a->y,   &L8 );
    p448_mulw (   &L6, &a->y, 78160 );
    p448_mul  (   &L5,   &L7,   &L6 );
    p448_mul  (   &L8,   &L5,   &L4 );
    p448_mul  (   &L4,   &L5,   &L6 );
    p448_mul  (   &L5,   &L7,   &L8 );
    p448_mul  (   &L8,   &L5,   &L4 );
    p448_mul  (   &L4,   &L7,   &L8 );
    p448_isr  (   &L6,   &L4 );
    p448_mul  (   &L4,   &L5,   &L6 );
    p448_sqr  (   &L5,   &L6 );
    p448_mul  (   &L6,   &L8,   &L5 );
    p448_mul  (   &L8,   &L7,   &L6 );
    p448_mul  (   &L7,   &L8,   &L6 );
    p448_copy (   &L6, &a->x );
    p448_subw (   &L6,     1 );
    p448_addw ( &a->x,     1 );
    p448_mul  (   &L5, &a->x,   &L8 );
    p448_sub  ( &a->x,   &L6,   &L5 );
    p448_mul  (   &L5,   &L4, &a->x );
    p448_mulw (   &L4,   &L5, 78160 );
    p448_neg  ( &a->x,   &L4 );
    p448_add  (   &L4,   &L3,   &L3 );
    p448_add  (   &L3,   &L4,   &L2 );
    p448_subw (   &L3,     2 );
    p448_mul  (   &L2,   &L3,   &L8 );
    p448_mulw (   &L3,   &L2, 3054649120 );
    p448_add  (   &L2,   &L3, &a->y );
    p448_mul  ( &a->y,   &L7,   &L2 );
       L1 = p448_is_zero(   &L8 );
       L0 = -   L1;
    p448_addw ( &a->y,    L0 );
 }

 /**
  * Check that the affine point a satisfies
  *   x^2 + y^2 - 1 + 39081 * x^2 * y^2 == 0,
  * i.e. x^2 + y^2 = 1 - 39081 * x^2 * y^2.
  *
  * @param a  Point to check; only read.
  * @return   The p448_is_zero mask of the left-hand side above.
  */
 mask_t
 validate_affine (
    const struct affine_t* a
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_sqr  (   &L0, &a->y );
    p448_sqr  (   &L2, &a->x );
    p448_add  (   &L3,   &L2,   &L0 );
    p448_subw (   &L3,     1 );          /* L3 = x^2 + y^2 - 1 */
    p448_mulw (   &L1,   &L2, 39081 );
    p448_neg  (   &L2,   &L1 );
    p448_mul  (   &L1,   &L0,   &L2 );   /* L1 = -39081 * x^2 * y^2 */
    p448_sub  (   &L0,   &L3,   &L1 );
    return p448_is_zero(   &L0 );
 }

 /**
  * Check the two invariants of a twisted extensible point (stated in the
  * comments below).  In the second invariant the code realises the d term
  * by multiplying by 39081 and negating, i.e. d = -39081.
  *
  * The p448_addw(..., 0) calls add zero to their operand.
  *
  * @param ext  Point to check; only read.
  * @return     The AND of the two p448_is_zero masks, one per invariant.
  */
 mask_t
 validate_tw_extensible (
    const struct tw_extensible_t* ext
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5;
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L2, &ext->t, &ext->u );
    p448_mul  (   &L4, &ext->z,   &L2 );
    p448_addw (   &L4,     0 );
    p448_mul  (   &L3, &ext->x, &ext->y );
    p448_neg  (   &L2,   &L3 );
    p448_add  (   &L3,   &L2,   &L4 );
       L1 = p448_is_zero(   &L3 );
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
     */
    p448_sqr  (   &L4, &ext->y );
    p448_neg  (   &L2,   &L4 );
    p448_addw (   &L2,     0 );
    p448_sqr  (   &L3, &ext->x );
    p448_add  (   &L4,   &L3,   &L2 );
    p448_sqr  (   &L5, &ext->u );
    p448_sqr  (   &L3, &ext->t );
    p448_mul  (   &L2,   &L3,   &L5 );
    p448_mulw (   &L3,   &L2, 39081 );
    p448_neg  (   &L5,   &L3 );
    p448_add  (   &L3,   &L5,   &L4 );
    p448_neg  (   &L5,   &L2 );
    p448_add  (   &L4,   &L5,   &L3 );
    p448_sqr  (   &L3, &ext->z );
    p448_add  (   &L2,   &L3,   &L4 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }

 /**
  * Check the two invariants of an untwisted extensible point (stated in
  * the comments below).  In the first invariant the code realises the d
  * term by multiplying by 39081 and negating, i.e. d = -39081.
  *
  * The p448_addw(..., 0) calls add zero to their operand.
  *
  * @param ext  Point to check; only read.
  * @return     The AND of the two p448_is_zero masks, one per invariant.
  */
 mask_t
 validate_extensible (
    const struct extensible_t* ext
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5;
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 - x^2 - y^2 + z^2
     */
    p448_sqr  (   &L4, &ext->y );
    p448_neg  (   &L3,   &L4 );
    p448_addw (   &L3,     0 );
    p448_sqr  (   &L2, &ext->z );
    p448_add  (   &L4,   &L2,   &L3 );
    p448_sqr  (   &L5, &ext->u );
    p448_sqr  (   &L2, &ext->t );
    p448_mul  (   &L3,   &L2,   &L5 );
    p448_mulw (   &L5,   &L3, 39081 );
    p448_neg  (   &L2,   &L5 );
    p448_add  (   &L3,   &L2,   &L4 );
    p448_sqr  (   &L2, &ext->x );
    p448_neg  (   &L4,   &L2 );
    p448_add  (   &L2,   &L4,   &L3 );
       L1 = p448_is_zero(   &L2 );
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L3, &ext->t, &ext->u );
    p448_mul  (   &L4, &ext->z,   &L3 );
    p448_addw (   &L4,     0 );
    p448_mul  (   &L2, &ext->x, &ext->y );
    p448_neg  (   &L3,   &L2 );
    p448_add  (   &L2,   &L3,   &L4 );
       L0 = p448_is_zero(   &L2 );
    return    L1 &    L0;
 }


--- a/src/arch_ref64/p448.c
+++ b/src/arch_ref64/p448.c
@@ -0,0 +1,477 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

 #include "p448.h"

 /* Full 64x64 -> 128-bit unsigned product of a and b. */
 static __inline__ __uint128_t widemul(
    const uint64_t a,
    const uint64_t b
 ) {
    __uint128_t product = a;   /* widen first so the multiply is 128-bit */
    product *= b;
    return product;
 }

 /*
  * Branch-free zero test: returns all-ones (2^64 - 1) when a == 0 and 0
  * otherwise.  Subtracting 1 in 128-bit arithmetic borrows into the upper
  * 64 bits only when a is zero; those upper bits are the result.
  * This relies on the compiler not turning the arithmetic into a branch.
  */
 static __inline__ uint64_t is_zero(uint64_t a) {
    __uint128_t widened = a;
    widened -= 1;
    return (uint64_t)(widened >> 64);
 }

 /**
  * Field multiplication: cs = as * bs, with limbs in radix 2^56.
  *
  * Each operand is treated as a low half (limbs 0..3) and a high half
  * (limbs 4..7).  The code precomputes aa = a_lo + a_hi, bb = b_lo + b_hi
  * and bbb = bb + b_hi, then for each output column accumulates three
  * 128-bit sums (low*low in accum2, sum*sum in accum1, high*high in
  * accum0) and combines them with accum1 -= accum2; accum0 += accum2.
  * Column i produces c[i] from accum0 and c[i+4] from accum1.
  *
  * After the four columns, the two carries are folded back: the carry out
  * of the high half is added into both c[0] and c[4], the carry out of
  * the low half into c[4].  The last carries are added into c[1] and c[5]
  * without masking, so those two limbs may exceed 56 bits on return.
  *
  * cs is declared __restrict__ and is written while as/bs are still being
  * read, so the output must not alias either input.
  *
  * I_HATE_UNROLLED_LOOPS is a local set to 0, so the unrolled branch is
  * the one that runs; the looped branch is kept as the readable form of
  * the same computation.
  *
  * @param cs  Output element.
  * @param as  First factor; only read.
  * @param bs  Second factor; only read.
  */
 void
 p448_mul (
    p448_t *__restrict__ cs,
    const p448_t *as,
    const p448_t *bs
 ) {
    const uint64_t *a = as->limb, *b = bs->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<56) - 1;  

    uint64_t aa[4], bb[4], bbb[4];

    unsigned int i;
    for (i=0; i<4; i++) {
        aa[i]  = a[i] + a[i+4];
        bb[i]  = b[i] + b[i+4];
        bbb[i] = bb[i] + b[i+4];
    }

    int I_HATE_UNROLLED_LOOPS = 0;

    if (I_HATE_UNROLLED_LOOPS) {
        /* The compiler probably won't unroll this,
         * so it's like 80% slower.
         */
        for (i=0; i<4; i++) {
            accum2 = 0;

            unsigned int j;
            /* Terms that land directly in column i. */
            for (j=0; j<=i; j++) {
                accum2 += widemul(a[j],   b[i-j]);
                accum1 += widemul(aa[j], bb[i-j]);
                accum0 += widemul(a[j+4], b[i-j+4]);
            }
            /* Terms that wrap around; i-j is negative here, but the
             * unsigned index wraps back into range after the +8 / +4. */
            for (; j<4; j++) {
                accum2 += widemul(a[j],   b[i-j+8]);
                accum1 += widemul(aa[j], bbb[i-j+4]);
                accum0 += widemul(a[j+4], bb[i-j+4]);
            }

            accum1 -= accum2;
            accum0 += accum2;

            c[i]   = ((uint64_t)(accum0)) & mask;
            c[i+4] = ((uint64_t)(accum1)) & mask;

            accum0 >>= 56;
            accum1 >>= 56;
        }
    } else {
        /* Column 0 -> c[0], c[4] */
        accum2  = widemul(a[0],  b[0]);
        accum1 += widemul(aa[0], bb[0]);
        accum0 += widemul(a[4],  b[4]);

        accum2 += widemul(a[1],  b[7]);
        accum1 += widemul(aa[1], bbb[3]);
        accum0 += widemul(a[5],  bb[3]);

        accum2 += widemul(a[2],  b[6]);
        accum1 += widemul(aa[2], bbb[2]);
        accum0 += widemul(a[6],  bb[2]);

        accum2 += widemul(a[3],  b[5]);
        accum1 += widemul(aa[3], bbb[1]);
        accum0 += widemul(a[7],  bb[1]);

        accum1 -= accum2;
        accum0 += accum2;

        c[0] = ((uint64_t)(accum0)) & mask;
        c[4] = ((uint64_t)(accum1)) & mask;

        accum0 >>= 56;
        accum1 >>= 56;

        /* Column 1 -> c[1], c[5] */
        accum2  = widemul(a[0],  b[1]);
        accum1 += widemul(aa[0], bb[1]);
        accum0 += widemul(a[4],  b[5]);

        accum2 += widemul(a[1],  b[0]);
        accum1 += widemul(aa[1], bb[0]);
        accum0 += widemul(a[5],  b[4]);

        accum2 += widemul(a[2],  b[7]);
        accum1 += widemul(aa[2], bbb[3]);
        accum0 += widemul(a[6],  bb[3]);

        accum2 += widemul(a[3],  b[6]);
        accum1 += widemul(aa[3], bbb[2]);
        accum0 += widemul(a[7],  bb[2]);

        accum1 -= accum2;
        accum0 += accum2;

        c[1] = ((uint64_t)(accum0)) & mask;
        c[5] = ((uint64_t)(accum1)) & mask;

        accum0 >>= 56;
        accum1 >>= 56;

        /* Column 2 -> c[2], c[6] */
        accum2  = widemul(a[0],  b[2]);
        accum1 += widemul(aa[0], bb[2]);
        accum0 += widemul(a[4],  b[6]);

        accum2 += widemul(a[1],  b[1]);
        accum1 += widemul(aa[1], bb[1]);
        accum0 += widemul(a[5],  b[5]);

        accum2 += widemul(a[2],  b[0]);
        accum1 += widemul(aa[2], bb[0]);
        accum0 += widemul(a[6],  b[4]);

        accum2 += widemul(a[3],  b[7]);
        accum1 += widemul(aa[3], bbb[3]);
        accum0 += widemul(a[7],  bb[3]);

        accum1 -= accum2;
        accum0 += accum2;

        c[2] = ((uint64_t)(accum0)) & mask;
        c[6] = ((uint64_t)(accum1)) & mask;

        accum0 >>= 56;
        accum1 >>= 56;

        /* Column 3 -> c[3], c[7] */
        accum2  = widemul(a[0],  b[3]);
        accum1 += widemul(aa[0], bb[3]);
        accum0 += widemul(a[4],  b[7]);

        accum2 += widemul(a[1],  b[2]);
        accum1 += widemul(aa[1], bb[2]);
        accum0 += widemul(a[5],  b[6]);

        accum2 += widemul(a[2],  b[1]);
        accum1 += widemul(aa[2], bb[1]);
        accum0 += widemul(a[6],  b[5]);

        accum2 += widemul(a[3],  b[0]);
        accum1 += widemul(aa[3], bb[0]);
        accum0 += widemul(a[7],  b[4]);

        accum1 -= accum2;
        accum0 += accum2;

        c[3] = ((uint64_t)(accum0)) & mask;
        c[7] = ((uint64_t)(accum1)) & mask;

        accum0 >>= 56;
        accum1 >>= 56;
    } /* !I_HATE_UNROLLED_LOOPS */

    /* Fold the two top carries back: accum1 (carry out of limb 7) goes to
     * both limb 0 and limb 4; accum0 (carry out of limb 3) goes to limb 4. */
    accum0 += accum1;
    accum0 += c[4];
    accum1 += c[0];
    c[4] = ((uint64_t)(accum0)) & mask;
    c[0] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Final carries are added without masking. */
    c[5] += ((uint64_t)(accum0));
    c[1] += ((uint64_t)(accum1));
 }

 /**
  * Multiply a field element by a 64-bit word: cs = as * b, with limbs in
  * radix 2^56.
  *
  * The low half (limbs 0..3) and the high half (limbs 4..7) are processed
  * as two independent carry chains.  Afterwards the carry out of the high
  * half is added into both limb 0 and limb 4, and the carry out of the low
  * half into limb 4.  The final carries are added into limbs 1 and 5
  * without masking, so those two limbs may exceed 56 bits on return.
  *
  * cs is declared __restrict__, so it must not alias as.
  *
  * @param cs  Output element.
  * @param as  Input element; only read.
  * @param b   Word multiplier.
  */
 void
 p448_mulw (
    p448_t *__restrict__ cs,
    const p448_t *as,
    uint64_t b
 ) {
    const uint64_t limb_mask = (1ull<<56) - 1;
    const uint64_t *in = as->limb;
    uint64_t *out = cs->limb;
    __uint128_t lo_chain = 0, hi_chain = 0;
    int k;

    for (k = 0; k < 4; k++) {
        lo_chain += widemul(b, in[k]);
        hi_chain += widemul(b, in[k+4]);

        out[k]   = lo_chain & limb_mask;
        lo_chain >>= 56;

        out[k+4] = hi_chain & limb_mask;
        hi_chain >>= 56;
    }

    /* Limb 4 absorbs the carry out of limb 3 and the carry out of limb 7. */
    lo_chain += hi_chain + out[4];
    out[4]  = lo_chain & limb_mask;
    out[5] += lo_chain >> 56;

    /* Limb 0 absorbs the carry out of limb 7. */
    hi_chain += out[0];
    out[0]  = hi_chain & limb_mask;
    out[1] += hi_chain >> 56;
 }

 /**
  * Field squaring: cs = as^2, with limbs in radix 2^56.
  *
  * Uses the same half-splitting as p448_mul (aa = a_lo + a_hi), but
  * exploits symmetry: each cross product is computed once and doubled,
  * either by doubling one factor (widemul(2*x, y)) or, for the first
  * column handled (limbs 3 and 7), by storing the sum shifted left by one
  * and carrying with a 55-bit shift.
  *
  * Columns are produced in the order (3,7), (0,4), (1,5), (2,6); the carry
  * out of column (2,6) is then added into the previously stored c[3] and
  * c[7].  The final carries are folded into c[0] and c[4] without masking,
  * so those two limbs may exceed 56 bits on return.
  *
  * cs is declared __restrict__ and is written while as is still being
  * read, so the output must not alias the input.
  *
  * @param cs  Output element.
  * @param as  Input element; only read.
  */
 void
 p448_sqr (
    p448_t *__restrict__ cs,
    const p448_t *as
 ) {
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<56) - 1;  

    uint64_t aa[4];

    /* For some reason clang doesn't vectorize this without prompting? */
    unsigned int i;
    for (i=0; i<4; i++) {
        aa[i] = a[i] + a[i+4];
    }

    /* Column (3,7): half of the cross terms; doubled via <<1 below. */
    accum2  = widemul(a[0],a[3]);
    accum0  = widemul(aa[0],aa[3]);
    accum1  = widemul(a[4],a[7]);

    accum2 += widemul(a[1], a[2]);
    accum0 += widemul(aa[1], aa[2]);
    accum1 += widemul(a[5], a[6]);

    accum0 -= accum2;
    accum1 += accum2;

    c[3] = ((uint64_t)(accum1))<<1 & mask;
    c[7] = ((uint64_t)(accum0))<<1 & mask;

    /* Shift by 55, not 56: the sums above are still undoubled. */
    accum0 >>= 55;
    accum1 >>= 55;

    /* Column (0,4). */
    accum0 += widemul(2*aa[1],aa[3]);
    accum1 += widemul(2*a[5], a[7]);
    accum0 += widemul(aa[2], aa[2]);
    accum1 += accum0;

    accum0 -= widemul(2*a[1], a[3]);
    accum1 += widemul(a[6], a[6]);
    
    accum2 = widemul(a[0],a[0]);
    accum1 -= accum2;
    accum0 += accum2;

    accum0 -= widemul(a[2], a[2]);
    accum1 += widemul(aa[0], aa[0]);
    accum0 += widemul(a[4], a[4]);

    c[0] = ((uint64_t)(accum0)) & mask;
    c[4] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Column (1,5). */
    accum2  = widemul(2*aa[2],aa[3]);
    accum0 -= widemul(2*a[2], a[3]);
    accum1 += widemul(2*a[6], a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2  = widemul(2*a[0],a[1]);
    accum1 += widemul(2*aa[0], aa[1]);
    accum0 += widemul(2*a[4], a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[1] = ((uint64_t)(accum0)) & mask;
    c[5] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Column (2,6). */
    accum2  = widemul(aa[3],aa[3]);
    accum0 -= widemul(a[3], a[3]);
    accum1 += widemul(a[7], a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2  = widemul(2*a[0],a[2]);
    accum1 += widemul(2*aa[0], aa[2]);
    accum0 += widemul(2*a[4], a[6]);

    accum2 += widemul(a[1], a[1]);
    accum1 += widemul(aa[1], aa[1]);
    accum0 += widemul(a[5], a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[2] = ((uint64_t)(accum0)) & mask;
    c[6] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Carry from column (2,6) into the values stored first. */
    accum0 += c[3];
    accum1 += c[7];
    c[3] = ((uint64_t)(accum0)) & mask;
    c[7] = ((uint64_t)(accum1)) & mask;

    /* we could almost stop here, but it wouldn't be stable, so... */

    /* Fold the top carries: the carry out of limb 7 goes to limbs 0 and 4,
     * the carry out of limb 3 goes to limb 4. */
    accum0 >>= 56;
    accum1 >>= 56;
    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
    c[0] += ((uint64_t)(accum1));
 }

 /**
  * Bring a into its unique fully-reduced form, in place: every limb ends
  * up below 2^56 and the value below p.
  *
  * p has all limbs equal to 2^56 - 1 except limb 4, which is 2^56 - 2
  * (hence the (i==4) special case in both loops).
  *
  * Steps:
  *  1. Fold any bits of limb 7 above bit 55 into limbs 0 and 4.
  *  2. Subtract p with a signed borrow chain.
  *  3. If that went negative (borrow == -1), add p back, using the borrow
  *     as a mask so that no branch depends on the value.
  *
  * The signed borrow relies on >> of a negative __int128_t being an
  * arithmetic shift, which is implementation-defined in C.
  *
  * NOTE(review): the bound stated after step 1 presumes the input limbs
  * are already only slightly above 56 bits (as left by the multiply and
  * square routines) -- confirm for other callers.
  *
  * @param a  Element reduced in place.
  */
 void
 p448_strong_reduce (
    p448_t *a
 ) {
    uint64_t mask = (1ull<<56)-1;

    /* first, clear high */
    a->limb[4] += a->limb[7]>>56;
    a->limb[0] += a->limb[7]>>56;
    a->limb[7] &= mask;

    /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */

    /* compute total_value - p.  No need to reduce mod p. */

    __int128_t scarry = 0;
    int i;
    for (i=0; i<8; i++) {
        scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask);
        a->limb[i] = scarry & mask;
        scarry >>= 56;
    }

    /* uncommon case: it was >= p, so now scarry = 0 and this = x
    * common case: it was < p, so now scarry = -1 and this = x - p + 2^448
    * so let's add back in p.  will carry back off the top for 2^448.
    */

    assert(is_zero(scarry) | is_zero(scarry+1));

    /* 2^56 - 1 when the subtraction borrowed, 0 otherwise. */
    uint64_t scarry_mask = scarry & mask;
    __uint128_t carry = 0;

    /* add it back */
    for (i=0; i<8; i++) {
        carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask);
        a->limb[i] = carry & mask;
        carry >>= 56;
    }

    /* The carry off the top must cancel the earlier borrow exactly. */
    assert(is_zero(carry + scarry));
 }

 mask_t
 p448_is_zero (
    const struct p448_t *a
 ) {
    struct p448_t b;
    p448_copy(&b,a);
    p448_strong_reduce(&b);

    uint64_t any = 0;
    int i;
    for (i=0; i<8; i++) {
        any |= b.limb[i];
    }
    return is_zero(any);
 }

 /**
  * Write x to serial as 56 little-endian bytes: 7 bytes per limb, limb 0
  * first.  The value is strongly reduced first (on a private copy), so the
  * encoding is canonical.
  *
  * @param serial  Output buffer; bytes 0..55 are written.
  * @param x       Element to encode; only read.
  */
 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
 ) {
    p448_t red;
    int i;

    p448_copy(&red, x);
    p448_strong_reduce(&red);

    for (i=0; i<8; i++) {
        uint8_t *dst = &serial[7*i];
        int bytes_left = 7;

        while (bytes_left-- > 0) {
            *dst++ = red.limb[i];   /* keeps the low 8 bits */
            red.limb[i] >>= 8;
        }

        /* A reduced limb fits in 56 bits, so nothing may remain. */
        assert(red.limb[i] == 0);
    }
 }

 /**
  * Read 56 little-endian bytes into x (7 bytes per limb, limb 0 first) and
  * report whether the encoded value is below p.
  *
  * x is always written, whatever the result.
  *
  * The range check is branch-free.  p has every limb equal to 2^56 - 1
  * except limb 4, which is 2^56 - 2, so "value >= p" holds exactly when
  * limbs 5..7 are all-ones and either limb 4 is all-ones, or limb 4 is
  * 2^56 - 2 and limbs 0..3 are all-ones.
  *
  * @param x       Output element.
  * @param serial  56 input bytes; only read.
  * @return        All-ones if the value is < p, 0 if it is >= p.
  */
 mask_t
 p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 ) {
    const uint64_t limb_mask = (1ull<<56)-1;
    int limb_idx, byte_idx;

    for (limb_idx = 0; limb_idx < 8; limb_idx++) {
        uint64_t word = 0;
        for (byte_idx = 0; byte_idx < 7; byte_idx++) {
            word |= ((uint64_t)serial[7*limb_idx+byte_idx])<<(8*byte_idx);
        }
        x->limb[limb_idx] = word;
    }

    /* at_least_p ends up equal to limb_mask iff the value is >= p. */
    uint64_t at_least_p = -1;

    /* Low four limbs: stays all-ones only if each of them is all-ones. */
    for (limb_idx = 0; limb_idx < 4; limb_idx++) {
        at_least_p &= x->limb[limb_idx];
    }

    /* Limb 4: keep the running value if the limb is 2^56 - 2 (adding 1
     * gives all-ones); force all-ones if the limb itself is all-ones. */
    at_least_p = (at_least_p & (x->limb[4] + 1))
               | is_zero(x->limb[4] ^ limb_mask);

    /* High three limbs must all be all-ones. */
    for (limb_idx = 5; limb_idx < 8; limb_idx++) {
        at_least_p &= x->limb[limb_idx];
    }

    return ~is_zero(at_least_p ^ limb_mask);
 }

 /**
  * Invert n field elements with a single call to p448_inverse:
  * out[i] = 1 / in[i] for 0 <= i < n.
  *
  * Method: build the running products in[0], in[0]*in[1], ... in out[1..],
  * put the full product in out[0] and invert it once, then walk backwards
  * peeling one factor off at a time.
  *
  * Constraints and edge cases:
  *  - out is declared __restrict__ and is written while in is still being
  *    read, so the two arrays must not overlap.
  *  - n == 0 does nothing; n == 1 is a plain p448_inverse.
  *  - If any in[i] is zero the full product is zero, p448_inverse maps it
  *    to zero, and every out[i] comes back as zero.
  *
  * The loop index is unsigned, matching n, so no signed conversion of n
  * takes place.
  *
  * @param out  Output array of n elements.
  * @param in   Input array of n elements; only read.
  * @param n    Number of elements.
  */
 void
 simultaneous_invert_p448(
    struct p448_t *__restrict__ out,
    const struct p448_t *in,
    unsigned int n
 ) {
   if (n==0) {
       return;
   } else if (n==1) {
       p448_inverse(out,in);
       return;
   }
   
   /* out[i] = in[0] * ... * in[i-1] for 1 <= i <= n-1 */
   p448_copy(&out[1], &in[0]);
   unsigned int i;
   for (i=1; i+1<n; i++) {
       p448_mul(&out[i+1], &out[i], &in[i]);
   }
   p448_mul(&out[0], &out[n-1], &in[n-1]);
   
   /* p448_mul's output may not alias an input, hence the temporary. */
   struct p448_t tmp;
   p448_inverse(&tmp, &out[0]);
   p448_copy(&out[0], &tmp);
   
   /* at this point, out[0] = product(in[i]) ^ -1
    * out[i] = product(in[0]..in[i-1]) if i != 0
    */
   for (i=n-1; i>0; i--) {
       /* out[i] = prefix product * inverse of product(in[0..i]) = 1/in[i] */
       p448_mul(&tmp, &out[i], &out[0]);
       p448_copy(&out[i], &tmp);
       
       /* drop in[i] from the running inverse */
       p448_mul(&tmp, &out[0], &in[i]);
       p448_copy(&out[0], &tmp);
   }
 }
--- a/src/arch_ref64/p448.h
+++ b/src/arch_ref64/p448.h
@@ -0,0 +1,373 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __P448_H__
 #define __P448_H__ 1

 #include <stdint.h>
 #include <assert.h>
 #include <string.h>

 #include "word.h"

 /**
  * A field element, stored as eight 64-bit limbs, least significant first.
  *
  * The serialization and reduction code treats each limb as a 56-bit digit
  * (radix 2^56); bits above 56 hold carries that p448_weak_reduce folds back
  * in.  The 32-byte alignment is kept so that code outside this arch
  * directory may still be vectorized.
  */
 typedef struct p448_t {
   uint64_t limb[8];
 } __attribute__((aligned(32))) p448_t;

 #ifdef __cplusplus
 extern "C" {
 #endif

 /**
  * Set out to the word x: limb 0 receives x as-is and limbs 1..7 are zeroed.
  */
 static __inline__ void
 p448_set_ui (
    p448_t *out,
    uint64_t x
 ) __attribute__((unused));
           
 /**
  * Conditionally exchange a and b without branching on the condition.
  *
  * Each limb pair is XOR-swapped under do_swap: an all-ones mask swaps the
  * two elements, a zero mask leaves both untouched.
  */
 static __inline__ void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t do_swap
 ) __attribute__((unused));

 /**
  * out = a + b, added limb by limb and then passed through p448_weak_reduce.
  */
 static __inline__ void
 p448_add (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) __attribute__((unused));
             
 /**
  * out = a - b.  A limb-wise copy of 2p is added to every limb difference
  * before the result is passed through p448_weak_reduce.
  */
 static __inline__ void
 p448_sub (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) __attribute__((unused));
             
 /**
  * out = -a, computed limb by limb as 2p - a and then passed through
  * p448_weak_reduce.
  */
 static __inline__ void
 p448_neg (
    p448_t *out,
    const p448_t *a
 ) __attribute__((unused));
            
 /**
  * Conditionally negate a in place without branching on the condition.
  *
  * The negation is always computed; each limb of a is then selected from the
  * negated value where doNegate has bits set and from the old value elsewhere.
  */
 static __inline__ void
 p448_cond_neg (
    p448_t *a,
    mask_t doNegate
 ) __attribute__((unused));

 /**
  * Add the word x to a in place.
  *
  * x is added to limb 0, whatever exceeds 56 bits is carried into limb 1,
  * and limb 0 is masked back to 56 bits.  No further carry propagation or
  * reduction is performed.
  */
 static __inline__ void
 p448_addw (
    p448_t *a,
    uint64_t x
 ) __attribute__((unused));
             
 /**
  * Subtract the word x from a in place.
  *
  * x is subtracted from limb 0, one limb-wise copy of p is added with
  * p448_really_bias (presumably to compensate for limb 0 wrapping below
  * zero), and the result is passed through p448_weak_reduce.
  */
 static __inline__ void
 p448_subw (
    p448_t *a,
    uint64_t x
 ) __attribute__((unused));
             
 /**
  * Copy the field element a into out.
  */
 static __inline__ void
 p448_copy (
    p448_t *out,
    const p448_t *a
 ) __attribute__((unused));
             
 /**
  * Carry-propagate inout in place.
  *
  * Bits 56 and above of each limb are moved into the next limb up; the
  * overflow of the top limb is added to both limb 0 and limb 4.  Limbs may
  * still slightly exceed 56 bits afterwards, since the incoming carries are
  * added after masking.
  */
 static __inline__ void
 p448_weak_reduce (
    p448_t *inout
 ) __attribute__((unused));
             
 /**
  * Reduce inout in place.  Defined out of line.
  *
  * NOTE(review): the definition is not in this header.  p448_serialize calls
  * this before emitting bytes and then asserts that every limb fits in 56
  * bits, so this presumably produces the fully reduced representative --
  * confirm against the implementation.
  */
 void
 p448_strong_reduce (
    p448_t *inout
 );

 /**
  * Test whether in is zero, returning a mask.  Defined out of line.
  *
  * NOTE(review): presumably returns all ones when in is congruent to zero
  * and zero otherwise; p448_eq relies on it that way -- confirm against the
  * implementation.
  */
 mask_t
 p448_is_zero (
    const p448_t *in
 );

 /**
  * Bias hook kept for API compatibility; in this architecture it does
  * nothing, because arithmetic here always weak-reduces its result.
  * Use p448_really_bias to actually add a multiple of p.
  */
 static __inline__ void
 p448_bias (
    p448_t *inout,
    int amount
 ) __attribute__((unused));

 /**
  * Add amount copies of p to inout, limb by limb, without reducing.
  *
  * Every limb gets amount*(2^56-1) added, except limb 4 which gets
  * amount*(2^56-2).
  */
 static __inline__ void
 p448_really_bias (
    p448_t *inout,
    int amount
 ) __attribute__((unused));
         
 /**
  * Field multiplication, out = a * b.  Defined out of line.
  *
  * out is declared restrict, so it must not alias a or b; callers in this
  * file multiply into a temporary and copy when they need an in-place update.
  */
 void
 p448_mul (
    p448_t *__restrict__ out,
    const p448_t *a,
    const p448_t *b
 );

 /**
  * Multiply the field element a by the word b into out.  Defined out of line.
  *
  * out is declared restrict, so it must not alias a.
  */
 void
 p448_mulw (
    p448_t *__restrict__ out,
    const p448_t *a,
    uint64_t b
 );

 /**
  * Field squaring, out = a^2.  Defined out of line.
  *
  * out is declared restrict, so it must not alias a.
  */
 void
 p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
 );
         
 /**
  * Square x a total of n times, leaving the result in y (y = x^(2^n)).
  *
  * n must be positive; this is enforced with assert().  y is declared
  * restrict, so it must not alias x.
  */
 static __inline__ void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) __attribute__((unused));

 /**
  * Write x to serial as 56 bytes.  Defined out of line.
  *
  * A copy of x is strongly reduced first; each of the eight limbs is then
  * emitted as seven bytes, least significant byte first.  x itself is not
  * modified.
  */
 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
 );

 /**
  * Read 56 little-endian bytes into x, seven bytes per limb.  Defined out of
  * line.
  *
  * x is always written.  The return value is a mask derived from a
  * comparison of the decoded value against p: presumably all ones when the
  * value is below p and zero when it is not reduced (this depends on
  * is_zero() from word.h -- confirm).
  */
 mask_t
 p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 );
    
 /**
  * a = b AND mask, applied to every limb.  With an all-ones mask this copies
  * b into a; with a zero mask it clears a.
  */
 static __inline__ void
 p448_mask(
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) __attribute__((unused));

 /**
 * Computes the field inverse 1/x and stores it in a.
 *
 * If x=0, the stored result is 0.
 *
 * Defined out of line.  Callers in this file never pass the same object
 * for a and x; whether aliasing is allowed is not visible here.
 */
 void
 p448_inverse (
   struct p448_t*       a,
   const struct p448_t* x
 );
       
 /**
  * Invert n field elements with a single call to p448_inverse.
  *
  * On return out[i] holds the inverse of in[i] for each i < n.  The work is
  * done through the product of all inputs, so one inversion plus a number of
  * multiplications linear in n suffices.  n == 0 is a no-op.  out is declared
  * restrict and must not overlap in.
  */
 void
 simultaneous_invert_p448 (
    struct p448_t *__restrict__ out,
    const struct p448_t *in,
    unsigned int n
 );

 /**
  * Compare a and b for equality as field elements.
  *
  * Returns p448_is_zero() of the difference a - b, so the result is a mask
  * rather than a boolean.
  */
 static inline mask_t
 p448_eq (
    const struct p448_t *a,
    const struct p448_t *b
 ) __attribute__((always_inline,unused));

 /* -------------- Inline functions begin here -------------- */

 /* Load the word x into out: clear every limb, then store x in limb 0. */
 void
 p448_set_ui (
    p448_t *out,
    uint64_t x
 ) {
    memset(out->limb, 0, sizeof(out->limb));
    out->limb[0] = x;
 }
            
 /* Exchange a and b under control of doswap, with no branch on the mask.
  * The masked XOR difference of each limb pair is folded into both sides:
  * an all-ones mask swaps the limbs, a zero mask changes nothing. */
 void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t doswap
 ) {
    uint64_t *pa = a->limb;
    uint64_t *pb = b->limb;
    const uint64_t *const end = pa + 8;
    
    for (; pa != end; pa++, pb++) {
        const uint64_t delta = (*pa ^ *pb) & doswap;
        *pa ^= delta;
        *pb ^= delta;
    }
 }

 /* out = a + b: sum the limbs independently, then let p448_weak_reduce
  * propagate the carries. */
 void
 p448_add (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) {
    int k = 8;
    while (k-- > 0) {
        out->limb[k] = a->limb[k] + b->limb[k];
    }
    p448_weak_reduce(out);
 }

 /* out = a - b.  A limb-wise copy of 2p (2*(2^56-1) everywhere, two less in
  * limb 4) is added to each limb difference before weak reduction. */
 void
 p448_sub (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) {
    const uint64_t two_p_limb  = ((1ull<<56)-1)*2;
    const uint64_t two_p_limb4 = two_p_limb - 2;
    unsigned int k;
    
    for (k=0; k<8; k++) {
        uint64_t bias = two_p_limb;
        if (k == 4) {
            bias = two_p_limb4;
        }
        out->limb[k] = a->limb[k] + bias - b->limb[k];
    }
    p448_weak_reduce(out);
 }

 /* out = -a, formed limb by limb as 2p - a and then weak-reduced.  The limbs
  * of 2p are 2*(2^56-1), except limb 4 which is two less. */
 void
 p448_neg (
    struct p448_t *out,
    const p448_t *a
 ) {
    const uint64_t two_p_limb  = ((1ull<<56)-1)*2;
    const uint64_t two_p_limb4 = two_p_limb - 2;
    unsigned int k;
    
    for (k=0; k<8; k++) {
        uint64_t minuend = two_p_limb;
        if (k == 4) {
            minuend = two_p_limb4;
        }
        out->limb[k] = minuend - a->limb[k];
    }
    p448_weak_reduce(out);
 }

 /* Negate a in place when doNegate is set, with no branch on the mask.
  * The negation is always computed; each limb then takes its bits from the
  * negated value where doNegate is set and from the old value elsewhere. */
 void
 p448_cond_neg(
    struct p448_t *a,
    mask_t doNegate
 ) {
    struct p448_t flipped;
    int k;
    
    p448_neg(&flipped, a);
    p448_bias(&flipped, 2);
    
    for (k=0; k<8; k++) {
        const uint64_t kept  = a->limb[k]      & ~doNegate;
        const uint64_t taken = flipped.limb[k] &  doNegate;
        a->limb[k] = kept | taken;
    }
 }

 /* Add the word x to a: the sum goes through limb 0, its bits above 56 are
  * carried into limb 1, and limb 0 keeps only the low 56 bits.  Nothing is
  * carried beyond limb 1. */
 void
 p448_addw (
    p448_t *a,
    uint64_t x
 ) {
   const uint64_t sum = a->limb[0] + x;
   a->limb[1] += sum >> 56;
   a->limb[0] = sum & ((1ull<<56)-1);
 }
             
 /* Subtract the word x from a.  One limb-wise copy of p is added and x is
  * taken off limb 0 (the two steps commute, limb arithmetic being modulo
  * 2^64), then the carries are propagated by p448_weak_reduce. */
 void
 p448_subw (
    p448_t *a,
    uint64_t x
 ) {
   p448_really_bias(a, 1);
   a->limb[0] -= x;
   p448_weak_reduce(a);
 }

 /* Copy the whole field element a into out. */
 void
 p448_copy (
    p448_t *out,
    const p448_t *a
 ) {
    *out = *a;
 }

 /* Add amt copies of p to a, limb by limb, with no reduction afterwards.
  * Every limb of p is 2^56-1 except limb 4, which is one less, hence the
  * separate constant for that position. */
 void
 p448_really_bias (
    p448_t *a,
    int amt
 ) {
    const uint64_t co1 = ((1ull<<56)-1)*amt;
    const uint64_t co2 = co1-amt;
    
    a->limb[0] += co1;
    a->limb[1] += co1;
    a->limb[2] += co1;
    a->limb[3] += co1;
    a->limb[4] += co2;
    a->limb[5] += co1;
    a->limb[6] += co1;
    a->limb[7] += co1;
 }

 /* Intentionally a no-op in this architecture: arithmetic here always
  * weak-reduces its result, so callers' bias requests need no action.
  * The casts to void only silence unused-parameter warnings. */
 void
 p448_bias (
    p448_t *a,
    int amt
 ) {
    (void) a;
    (void) amt;
 }

 /* Propagate carries once through the limbs of a.
  *
  * The overflow of the top limb (its bits 56 and up) wraps around into both
  * limb 0 and limb 4.  It is added to limb 4 up front so that it takes part
  * in limb 4's own carry; every limb is then masked to 56 bits and receives
  * the carry of the limb below it.  Limbs can end up marginally above 56
  * bits because the carry is added after masking. */
 void
 p448_weak_reduce (
    p448_t *a
 ) {
    const uint64_t low56 = (1ull<<56) - 1;
    const uint64_t top = a->limb[7] >> 56;
    uint64_t carry_in = top;
    uint64_t carry_out;
    int k;
    
    a->limb[4] += top;
    for (k=0; k<8; k++) {
        /* grab this limb's overflow before overwriting it */
        carry_out = a->limb[k] >> 56;
        a->limb[k] = (a->limb[k] & low56) + carry_in;
        carry_in = carry_out;
    }
 }

 /* y = x^(2^n): square x n times (n > 0, checked by assert).
  *
  * p448_sqr takes a restrict output, so squarings bounce between y and a
  * scratch element.  The first step consumes one squaring when n is odd and
  * two when it is even, which leaves an even count for the paired loop and
  * guarantees the final result lands in y. */
 void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) {
    p448_t scratch;
    int remaining;
    
    assert(n>0);
    
    if (n&1) {
        p448_sqr(y,x);
        remaining = n-1;
    } else {
        p448_sqr(&scratch,x);
        p448_sqr(y,&scratch);
        remaining = n-2;
    }
    
    while (remaining != 0) {
        p448_sqr(&scratch,y);
        p448_sqr(y,&scratch);
        remaining -= 2;
    }
 }

 mask_t
 p448_eq (
    const struct p448_t *a,
    const struct p448_t *b
 ) {
    struct p448_t ra, rb;
    p448_copy(&ra, a);
    p448_copy(&rb, b);
    p448_sub(&ra, &ra, &rb);
    return p448_is_zero(&ra);
 }

 /* a = b AND mask on every limb: an all-ones mask copies b into a, a zero
  * mask clears a. */
 void
 p448_mask (
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) {
    const uint64_t *src = b->limb;
    uint64_t *dst = a->limb;
    int k;
    
    for (k=7; k>=0; k--) {
        dst[k] = src[k] & mask;
    }
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __P448_H__ */
--- a/src/goldilocks.c
+++ b/src/goldilocks.c
@@ -442,11 +442,8 @@ goldilocks_verify (
        goldilocks_global.wnafs, WNAF_PRECMP_BITS );
    
    untwist_and_double_and_serialize( &pk, &pk_text );
    field_sub(&eph, &eph, &pk);
    field_bias(&eph, 2);
    
    succ = field_is_zero(&eph);
    

    succ = field_eq(&eph, &pk);
    return succ ? 0 : GOLDI_EINVAL;
 }
 #endif
@@ -533,11 +530,8 @@ goldilocks_verify_precomputed (
    if (!succ) return GOLDI_EINVAL;
    
    untwist_and_double_and_serialize( &pk, &pk_text );
    field_sub(&eph, &eph, &pk);
    field_bias(&eph, 2);
    
    succ = field_is_zero(&eph);
    

    succ = field_eq(&eph, &pk);
    return succ ? 0 : GOLDI_EINVAL;
 }

--- a/src/include/field.h
+++ b/src/include/field.h
@@ -24,6 +24,7 @@
 #define field_cond_neg       p448_cond_neg
 #define field_serialize      p448_serialize
 #define field_deserialize    p448_deserialize
 #define field_eq             p448_eq
 #define field_is_zero        p448_is_zero
 #define simultaneous_invert  simultaneous_invert_p448 /* FUTURE: consistency */