Added really_memset, thanks David Leon Gil.

Trying to work around an apparent GCC bug on SSE2, thanks Samuel Neves. Added an experimental NEON arch. It's fast. It's not yet GCC clean. It needs some more work on general cleanliness too.
11 years ago · 04b955eabe
--- a/HISTORY.txt
+++ b/HISTORY.txt
@@ -1,3 +1,23 @@
 August 4, 2014:
    Experiments and bug fixes.

    Add really_memset = memset_s (except not because I'm setting -std=c99),
    thanks David Leon Gil.  I think I put it in the right places.

    Try to work around what I think is a compiler bug in GCC -O3 on non-AVX
    platforms.  I can't seem to work around it as -Os, so I'm just flagging
    a warning (-Werror makes it an error) for now.  Will take more
    investigation.  Thanks Samuel Neves.

    Added an experimental (not ready yet!) ARM NEON implementation in
    arch_neon_experimental.  This implementation seems to work, but needs
    more testing.  It is currently asm-heavy and not GCC clean.  I am
    planning to have a flag for it to use intrinsics instead of asm;
    currently the intrinsics are commented out.  On clang this does ECDH
    in 1850kcy on my BeagleBone Black, comparable to Curve41417.  Once this
    is ready, I will probably move it to arch_neon proper, since arch_neon
    isn't particularly tuned.

 July 11, 2014:
    This is mostly a cleanup release.

--- a/+ 3
+++ b/+ 3
@@ -22,7 +22,7 @@ endif


 WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
 	 -Wmissing-declarations -Wunused-function $(EXWARN)
 	 -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)
 	 
 	 
 INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH)
@@ -36,8 +36,8 @@ ARCHFLAGS += -mfpu=neon
 else
 ARCHFLAGS += -mfpu=vfpv3-d16
 endif
 ARCHFLAGS += -mcpu=cortex-a9 # FIXME
 GENFLAGS = -DN_TESTS_BASE=1000 # sooooo sloooooow
 ARCHFLAGS += -mcpu=cortex-a8 # FIXME
 GENFLAGS += -DN_TESTS_BASE=1000 # sooooo sloooooow
 else
 ARCHFLAGS += -maes -mavx2 -mbmi2 #TODO
 endif
--- a/README.txt
+++ b/README.txt
@@ -13,7 +13,7 @@ game protection system out of Stanford, and are (c) 2011 Stanford
 University. All of these files are usable under the MIT license contained in
 LICENSE.txt.

 The Makefile is set for my 2013 MacBook Air. You can `make runbench` to run
 The Makefile is set for my 2013 MacBook Air. You can `make bench` to run
 a completely arbitrary set of benchmarks and tests, or `make
 build/goldilocks.so` to build a stripped-down version of the library. For
 non-Haswell platforms, you need to replace -mavx2 -mbmi2 by an appropriate
--- a/src/arch_neon/p448.c
+++ b/src/arch_neon/p448.c
@@ -39,7 +39,7 @@ xx_vaddup_s64(int64x2_t x) {
 #include "neon_emulation.h"
 #endif /* ARM_NEON */

 static inline void __attribute__((gnu_inline,always_inline))
 static inline void __attribute__((gnu_inline,always_inline,unused))
 smlal (
    uint64_t *acc,
    const uint32_t a,
@@ -48,7 +48,7 @@ smlal (
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
 }

 static inline void __attribute__((gnu_inline,always_inline))
 static inline void __attribute__((gnu_inline,always_inline,unused))
 smlal2 (
    uint64_t *acc,
    const uint32_t a,
@@ -57,7 +57,7 @@ smlal2 (
    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
 }

 static inline void __attribute__((gnu_inline,always_inline))
 static inline void __attribute__((gnu_inline,always_inline,unused))
 smull (
    uint64_t *acc,
    const uint32_t a,
@@ -66,7 +66,7 @@ smull (
    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
 }

 static inline void __attribute__((gnu_inline,always_inline))
 static inline void __attribute__((gnu_inline,always_inline,unused))
 smull2 (
    uint64_t *acc,
    const uint32_t a,
@@ -84,6 +84,7 @@ p448_mul (
    const uint32_t *a = as->limb, *b = bs->limb;
    uint32_t *c = cs->limb;
    
    
    const int32x2_t
        *val = (const int32x2_t *)a,
        *vbl = (const int32x2_t *)b,
@@ -109,155 +110,170 @@ p448_mul (
    
    accumx0a = vmull_lane_s32(          delta = val[1] + vah[1], vbh[3], 0);
    accumx1a = vmull_lane_s32(          delta, vbh[3], 1);
    accumx2a = vmull_lane_s32(          delta = val[2] + vah[2], vbh[3], 0);
    accumx3a = vmull_lane_s32(          delta, vbh[3], 1);
    accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[2], 0);
    accumx0a = vmlal_lane_s32(accumx0a, delta = val[2] + vah[2], vbh[2], 0);
    accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[2], 1);
    accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0);
    accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1);
    accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[1], 0);
    accumx0a = vmlal_lane_s32(accumx0a, delta = val[3] + vah[3], vbh[1], 0);
    accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[1], 1);
    accumx2b = vmull_lane_s32(          delta = val[0] + vah[0], vbh[1], 0);
    accumx3b = vmull_lane_s32(          delta, vbh[1], 1);
    accumx0b = vmull_lane_s32(          delta, vbh[0], 0);
    accumx0b = vmull_lane_s32(          delta = val[0] + vah[0], vbh[0], 0);
    accumx1b = vmull_lane_s32(          delta, vbh[0], 1);
    accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0);
    accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1);
    accumx0b = vmlal_lane_s32(accumx0b, vah[1], vbl[3], 0);
    accumx1b = vmlal_lane_s32(accumx1b, vah[1], vbl[3], 1);
    accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0);
    accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1);
    accumx0b = vmlal_lane_s32(accumx0b, vah[2], vbl[2], 0);
    accumx1b = vmlal_lane_s32(accumx1b, vah[2], vbl[2], 1);
    accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0);
    accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1);
    accumx0b = vmlal_lane_s32(accumx0b, vah[3], vbl[1], 0);
    accumx1b = vmlal_lane_s32(accumx1b, vah[3], vbl[1], 1);
    accumx2b += accumx2a;
    accumx3b += accumx3a;
    accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0);
    accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1);
    accumx0b += accumx0a;
    accumx1b += accumx1a;
    accumx0a = vmlal_lane_s32(accumx0a, vah[0], vbl[0], 0);
    accumx1a = vmlal_lane_s32(accumx1a, vah[0], vbl[0], 1);
    accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0);
    accumx3a = vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1);
    accumx0a = vmlal_lane_s32(accumx0a, val[1], delta = vbl[3] - vbh[3], 0);
    accumx1a = vmlal_lane_s32(accumx1a, val[1], delta, 1);
    accumx2a = vmlal_lane_s32(accumx2a, val[2], delta, 0);
    accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1);
    accumx0a = vmlal_lane_s32(accumx0a, val[2], delta = vbl[2] - vbh[2], 0);
    accumx1a = vmlal_lane_s32(accumx1a, val[2], delta, 1);
    accumx2a = vmlal_lane_s32(accumx2a, val[3], delta, 0);
    accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1);
    accumx0a = vmlal_lane_s32(accumx0a, val[3], delta = vbl[1] - vbh[1], 0);
    accumx1a = vmlal_lane_s32(accumx1a, val[3], delta, 1);
    accumx2a += accumx2b;
    accumx3a += accumx3b;
    accumx2b = vmlal_lane_s32(accumx2b, val[0], delta, 0);
    accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1);
    accumx0a += accumx0b;
    accumx1a += accumx1b;
    accumx0b = vmlal_lane_s32(accumx0b, val[0], delta = vbl[0] - vbh[0], 0);
    accumx1b = vmlal_lane_s32(accumx1b, val[0], delta, 1);
    accumx2b = vmlal_lane_s32(accumx2b, val[1], delta, 0);
    accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1);
    xx_vtrnq_s64(&accumx0a, &accumx0b);
    xx_vtrnq_s64(&accumx1a, &accumx1b);
    xx_vtrnq_s64(&accumx2a, &accumx2b);
    xx_vtrnq_s64(&accumx3a, &accumx3b);
    accumx0b += accumx1a;
    accumx0b = vsraq_n_s64(accumx0b,accumx0a,28);
    accumx1b = vsraq_n_s64(accumx1b,accumx0b,28);
    trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b));
    vcl[0] = trn_res.val[1] & vmask;
    vch[0] = trn_res.val[0] & vmask;
    
    
    
    
    accumx2a = vmull_lane_s32(          delta = val[2] + vah[2], vbh[3], 0);
    accumx3a = vmull_lane_s32(          delta, vbh[3], 1);
    accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0);
    accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1);
    accumx2b = vmull_lane_s32(          delta = val[0] + vah[0], vbh[1], 0);
    accumx3b = vmull_lane_s32(          delta, vbh[1], 1);
    accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0);
    accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1);
    accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0);
    accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1);
    accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0);
    accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1);
    accumx2b += accumx2a;
    accumx3b += accumx3a;
    accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0);
    accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1);
    accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0);
    accumx3a = vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1);
    accumx2a = vmlal_lane_s32(accumx2a, val[2], delta = vbl[3] - vbh[3], 0);
    accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1);
    accumx2a = vmlal_lane_s32(accumx2a, val[3], delta = vbl[2] - vbh[2], 0);
    accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1);
    accumx2a += accumx2b;
    accumx3a += accumx3b;
    accumx2b = vmlal_lane_s32(accumx2b, val[0], delta = vbl[1] - vbh[1], 0);
    accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1);
    accumx2b = vmlal_lane_s32(accumx2b, val[1], delta = vbl[0] - vbh[0], 0);
    accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1);
    xx_vtrnq_s64(&accumx2a, &accumx2b);
    xx_vtrnq_s64(&accumx3a, &accumx3b);
    accumx2a += accumx1b;
    accumx2b += accumx3a;
    accumx2b = vsraq_n_s64(accumx2b,accumx2a,28);
    accumx3b = vsraq_n_s64(accumx3b,accumx2b,28);
    trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b));
    vcl[0] = trn_res.val[1] & vmask;
    vch[0] = trn_res.val[0] & vmask;
    trn_res = vtrn_s32(vmovn_s64(accumx2a), vmovn_s64(accumx2b));
    vcl[1] = trn_res.val[1] & vmask;
    vch[1] = trn_res.val[0] & vmask;
    carry = accumx3b;
    
    
    
    
    accumx4a = vmull_lane_s32(          delta = val[3] + vah[3], vbh[3], 0);
    accumx5a = vmull_lane_s32(          delta, vbh[3], 1);
    accumx6b = vmull_lane_s32(          delta = val[0] + vah[0], vbh[3], 0);
    accumx7b = vmull_lane_s32(          delta, vbh[3], 1);
    accumx4b = accumx4a;
    accumx5b = accumx5a;
    accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[2], 0);
    accumx4b = vmlal_lane_s32(accumx4b, delta = val[0] + vah[0], vbh[2], 0);
    accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[2], 1);
    accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0);
    accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1);
    accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[1], 0);
    accumx4b = vmlal_lane_s32(accumx4b, delta = val[1] + vah[1], vbh[1], 0);
    accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[1], 1);
    accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0);
    accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1);
    accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[0], 0);
    accumx4b = vmlal_lane_s32(accumx4b, delta = val[2] + vah[2], vbh[0], 0);
    accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[0], 1);
    accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0);
    accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[0], 1);
    accumx4b = vmlal_lane_s32(accumx4b, vah[3], vbl[3], 0);
    accumx5b = vmlal_lane_s32(accumx5b, vah[3], vbl[3], 1);
    accumx6a = accumx6b;
    accumx7a = accumx7b;
    accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0);
    accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1);
    accumx4a += accumx4b;
    accumx5a += accumx5b;
    accumx4a = vmlal_lane_s32(accumx4a, vah[0], vbl[2], 0);
    accumx5a = vmlal_lane_s32(accumx5a, vah[0], vbl[2], 1);
    accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0);
    accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1);
    accumx4a = vmlal_lane_s32(accumx4a, vah[1], vbl[1], 0);
    accumx5a = vmlal_lane_s32(accumx5a, vah[1], vbl[1], 1);
    accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0);
    accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1);
    accumx4a = vmlal_lane_s32(accumx4a, vah[2], vbl[0], 0);
    accumx5a = vmlal_lane_s32(accumx5a, vah[2], vbl[0], 1);
    accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0);
    accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1);
    accumx4a = vmlal_lane_s32(accumx4a, val[3], delta = vbl[3] - vbh[3], 0);
    accumx5a = vmlal_lane_s32(accumx5a, val[3], delta, 1);
    /**/
    accumx6b = vmlal_lane_s32(accumx6b, val[0], delta, 0);
    accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1);
    accumx4b = vmlal_lane_s32(accumx4b, val[0], delta = vbl[2] - vbh[2], 0);
    accumx5b = vmlal_lane_s32(accumx5b, val[0], delta, 1);
    accumx6b = vmlal_lane_s32(accumx6b, val[1], delta, 0);
    accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1);
    accumx4b = vmlal_lane_s32(accumx4b, val[1], delta = vbl[1] - vbh[1], 0);
    accumx5b = vmlal_lane_s32(accumx5b, val[1], delta, 1);
    accumx6b = vmlal_lane_s32(accumx6b, val[2], delta, 0);
    accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1);
    accumx4b = vmlal_lane_s32(accumx4b, val[2], delta = vbl[0] - vbh[0], 0);
    accumx5b = vmlal_lane_s32(accumx5b, val[2], delta, 1);
    accumx6b = vmlal_lane_s32(accumx6b, val[3], delta, 0);
    accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1);
    
    xx_vtrnq_s64(&accumx4a, &accumx4b);
    xx_vtrnq_s64(&accumx5a, &accumx5b);
    xx_vtrnq_s64(&accumx6a, &accumx6b);
    xx_vtrnq_s64(&accumx7a, &accumx7b);
    accumx4a += carry;
    accumx4b += accumx5a;
    accumx4b = vsraq_n_s64(accumx4b,accumx4a,28);
    accumx5b = vsraq_n_s64(accumx5b,accumx4b,28);
    accumx6a += accumx5b;
    accumx6b += accumx7a;
    
    trn_res = vtrn_s32(vmovn_s64(accumx4a), vmovn_s64(accumx4b));
    vcl[2] = trn_res.val[1] & vmask;
    vch[2] = trn_res.val[0] & vmask;
    
    
    
    
    accumx6b = vmull_lane_s32(          delta = val[0] + vah[0], vbh[3], 0);
    accumx7b = vmull_lane_s32(          delta, vbh[3], 1);
    accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0);
    accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1);
    accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0);
    accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1);
    accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0);
    accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[0], 1);
    accumx6a = accumx6b;
    accumx7a = accumx7b;
    accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0);
    accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1);
    accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0);
    accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1);
    accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0);
    accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1);
    accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0);
    accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1);
    /**/
    accumx6b = vmlal_lane_s32(accumx6b, val[0], delta = vbl[3] - vbh[3], 0);
    accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1);
    accumx6b = vmlal_lane_s32(accumx6b, val[1], delta = vbl[2] - vbh[2], 0);
    accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1);
    accumx6b = vmlal_lane_s32(accumx6b, val[2], delta = vbl[1] - vbh[1], 0);
    accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1);
    accumx6b = vmlal_lane_s32(accumx6b, val[3], delta = vbl[0] - vbh[0], 0);
    accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1);

    xx_vtrnq_s64(&accumx6a, &accumx6b);
    xx_vtrnq_s64(&accumx7a, &accumx7b);
    accumx6a += accumx5b;
    accumx6b += accumx7a;
    
    accumx6b = vsraq_n_s64(accumx6b,accumx6a,28);
    accumx7b = vsraq_n_s64(accumx7b,accumx6b,28);
    trn_res = vtrn_s32(vmovn_s64(accumx6a), vmovn_s64(accumx6b));
    vcl[3] = trn_res.val[1] & vmask;
    vch[3] = trn_res.val[0] & vmask;
    
    
    accumx7b = xx_vaddup_s64(accumx7b);

    int32x2_t t0 = vcl[0], t1 = vch[0];
--- a/src/arch_neon_experimental/ec_point.c
+++ b/src/arch_neon_experimental/ec_point.c
@@ -0,0 +1,962 @@
 /**
 * @cond internal
 * @file ec_point.c
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 * @warning This file was automatically generated.
 */

 #include "ec_point.h"


 /**
  * Raise x to a fixed power using a hard-coded chain of p448_sqr, p448_sqrn
  * and p448_mul calls, and store the result in a.
  *
  * NOTE(review): assuming p448_sqrn(o, i, n) performs n successive squarings,
  * the chain evaluates x^(2^446 - 2^222 - 1).  If the field modulus is
  * p = 2^448 - 2^224 - 1, that exponent is (p-3)/4, which would make the
  * result an inverse square root of x up to sign -- confirm against the
  * p448 field header.
  *
  * The per-step exponent comments below rely on the same assumption.
  *
  * @param a  Output element.  It is written only by the final p448_mul,
  *           whose inputs are both locals.
  * @param x  Input element; only read.
  */
 void
 p448_isr (
    struct p448_t*       a,
    const struct p448_t* x
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L1,     x );
    p448_mul  (   &L2,     x,   &L1 );
    p448_sqr  (   &L1,   &L2 );
    /* L2 = x^(2^3 - 1) */
    p448_mul  (   &L2,     x,   &L1 );
    p448_sqrn (   &L1,   &L2,     3 );
    p448_mul  (   &L0,   &L2,   &L1 );
    p448_sqrn (   &L1,   &L0,     3 );
    /* L0 = x^(2^9 - 1) */
    p448_mul  (   &L0,   &L2,   &L1 );
    p448_sqrn (   &L2,   &L0,     9 );
    /* L1 = x^(2^18 - 1) */
    p448_mul  (   &L1,   &L0,   &L2 );
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (   &L2,     x,   &L0 );
    p448_sqrn (   &L0,   &L2,    18 );
    /* L2 = x^(2^37 - 1) */
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_sqrn (   &L0,   &L2,    37 );
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_sqrn (   &L0,   &L1,    37 );
    /* L1 = x^(2^111 - 1) */
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_sqrn (   &L0,   &L1,   111 );
    /* L2 = x^(2^222 - 1) */
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_sqr  (   &L0,   &L2 );
    /* L1 = x^(2^223 - 1) */
    p448_mul  (   &L1,     x,   &L0 );
    p448_sqrn (   &L0,   &L1,   223 );
    /* a = x^(2^222 - 1) * x^(2^446 - 2^223) */
    p448_mul  (     a,   &L2,   &L0 );
 }

 void
 p448_inverse (
    struct p448_t*       a,
    const struct p448_t* x
 ) {
    struct p448_t L0, L1;
    p448_isr  (   &L0,     x );
    p448_sqr  (   &L1,   &L0 );
    p448_sqr  (   &L0,   &L1 );
    p448_mul  (     a,     x,   &L0 );
 }

 /**
  * Update the point d in place from its own fields (x, y, z, t, u) and the
  * three fields (a, b, c) of e.  Going by the function name, this adds e to d.
  *
  * The inline formulas assume p448_sub(o, p, q) computes p - q,
  * p448_add(o, p, q) computes p + q and p448_mul(o, p, q) computes p * q;
  * p448_bias / p448_weak_reduce are presumably limb-range maintenance after a
  * subtraction -- TODO confirm against the p448 field header.
  *
  * @param d  Point updated in place; all five fields are overwritten.
  * @param e  Operand; only its a, b and c fields are read.
  */
 void
 add_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 ) {
    struct p448_t L0, L1;
    /* L0 = e->a * (y - x) */
    p448_sub  (   &L1, &d->y, &d->x );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L0, &e->a,   &L1 );
    /* y = e->b * (x + y) */
    p448_add  (   &L1, &d->x, &d->y );
    p448_mul  ( &d->y, &e->b,   &L1 );
    /* x = e->c * (u * t) */
    p448_mul  (   &L1, &d->u, &d->t );
    p448_mul  ( &d->x, &e->c,   &L1 );
    /* u = L0 + y, t = y - L0 */
    p448_add  ( &d->u,   &L0, &d->y );
    p448_sub  ( &d->t, &d->y,   &L0 );
    p448_bias ( &d->t,     2 );
    p448_weak_reduce( &d->t );
    /* y = z - x, L0 = x + z */
    p448_sub  ( &d->y, &d->z, &d->x );
    p448_bias ( &d->y,     2 );
    p448_weak_reduce( &d->y );
    p448_add  (   &L0, &d->x, &d->z );
    /* Final outputs: z = L0 * y, x = y * t, y = L0 * u. */
    p448_mul  ( &d->z,   &L0, &d->y );
    p448_mul  ( &d->x, &d->y, &d->t );
    p448_mul  ( &d->y,   &L0, &d->u );
 }

 /**
  * Update the point d in place from its own fields (x, y, z, t, u) and the
  * three fields (a, b, c) of e.  Going by the function name, this subtracts
  * e from d.
  *
  * Compared with add_tw_niels_to_tw_extensible, the roles of e->a and e->b
  * are exchanged, and the sum x + z / difference z - x land in d->y / L0
  * respectively instead of the other way round.
  *
  * The inline formulas assume p448_sub(o, p, q) computes p - q,
  * p448_add(o, p, q) computes p + q and p448_mul(o, p, q) computes p * q;
  * p448_bias / p448_weak_reduce are presumably limb-range maintenance after a
  * subtraction -- TODO confirm against the p448 field header.
  *
  * @param d  Point updated in place; all five fields are overwritten.
  * @param e  Operand; only its a, b and c fields are read.
  */
 void
 sub_tw_niels_from_tw_extensible (
    struct tw_extensible_t*  d,
    const struct tw_niels_t* e
 ) {
    struct p448_t L0, L1;
    /* L0 = e->b * (y - x) */
    p448_sub  (   &L1, &d->y, &d->x );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L0, &e->b,   &L1 );
    /* y = e->a * (x + y) */
    p448_add  (   &L1, &d->x, &d->y );
    p448_mul  ( &d->y, &e->a,   &L1 );
    /* x = e->c * (u * t) */
    p448_mul  (   &L1, &d->u, &d->t );
    p448_mul  ( &d->x, &e->c,   &L1 );
    /* u = L0 + y, t = y - L0 */
    p448_add  ( &d->u,   &L0, &d->y );
    p448_sub  ( &d->t, &d->y,   &L0 );
    p448_bias ( &d->t,     2 );
    p448_weak_reduce( &d->t );
    /* y = x + z, L0 = z - x */
    p448_add  ( &d->y, &d->x, &d->z );
    p448_sub  (   &L0, &d->z, &d->x );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    /* Final outputs: z = L0 * y, x = y * t, y = L0 * u. */
    p448_mul  ( &d->z,   &L0, &d->y );
    p448_mul  ( &d->x, &d->y, &d->t );
    p448_mul  ( &d->y,   &L0, &d->u );
 }

 void
 add_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 ) {
    struct p448_t L0;
    p448_mul  (   &L0, &e->z, &a->z );
    p448_copy ( &e->z,   &L0 );
    add_tw_niels_to_tw_extensible(     e, &a->n );
 }

 void
 sub_tw_pniels_from_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* a
 ) {
    struct p448_t L0;
    p448_mul  (   &L0, &e->z, &a->z );
    p448_copy ( &e->z,   &L0 );
    sub_tw_niels_from_tw_extensible(     e, &a->n );
 }

 /**
  * Update the point a in place from its x, y and z fields; all five fields
  * (x, y, z, t, u) are overwritten.  Going by the function name, this is a
  * point doubling.
  *
  * The inline formulas assume the p448_sqr/add/sub/mul helpers compute the
  * operation their name suggests with the first argument as output;
  * p448_bias / p448_weak_reduce are presumably limb-range maintenance --
  * TODO confirm against the p448 field header.
  *
  * @param a  Point updated in place.
  */
 void
 double_tw_extensible (
    struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    /* L2 = x^2, L0 = y^2, u = x^2 + y^2 */
    p448_sqr  (   &L2, &a->x );
    p448_sqr  (   &L0, &a->y );
    p448_add  ( &a->u,   &L2,   &L0 );
    /* t = (x + y)^2 - u */
    p448_add  ( &a->t, &a->y, &a->x );
    p448_sqr  (   &L1, &a->t );
    p448_sub  ( &a->t,   &L1, &a->u );
    p448_bias ( &a->t,     3 );
    p448_weak_reduce( &a->t );
    /* L1 = y^2 - x^2 */
    p448_sub  (   &L1,   &L0,   &L2 );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    /* a->x is used as scratch for z^2; z = 2 * z^2; L0 = z - L1 */
    p448_sqr  ( &a->x, &a->z );
    p448_bias ( &a->x,     1 );
    p448_add  ( &a->z, &a->x, &a->x );
    p448_sub  (   &L0, &a->z,   &L1 );
    p448_weak_reduce(   &L0 );
    /* Final outputs: z = L1 * L0, x = L0 * t, y = L1 * u. */
    p448_mul  ( &a->z,   &L1,   &L0 );
    p448_mul  ( &a->x,   &L0, &a->t );
    p448_mul  ( &a->y,   &L1, &a->u );
 }

 /**
  * Update the point a in place from its x, y and z fields; all five fields
  * (x, y, z, t, u) are overwritten.  Going by the function name, this is a
  * point doubling.
  *
  * The inline formulas assume the p448_sqr/add/sub/mul helpers compute the
  * operation their name suggests with the first argument as output;
  * p448_bias / p448_weak_reduce are presumably limb-range maintenance --
  * TODO confirm against the p448 field header.
  *
  * @param a  Point updated in place.
  */
 void
 double_extensible (
    struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    /* L2 = x^2, L0 = y^2, L1 = x^2 + y^2 */
    p448_sqr  (   &L2, &a->x );
    p448_sqr  (   &L0, &a->y );
    p448_add  (   &L1,   &L2,   &L0 );
    /* t = (x + y)^2 - L1; a->u holds the square temporarily */
    p448_add  ( &a->t, &a->y, &a->x );
    p448_sqr  ( &a->u, &a->t );
    p448_sub  ( &a->t, &a->u,   &L1 );
    p448_bias ( &a->t,     3 );
    p448_weak_reduce( &a->t );
    /* u = y^2 - x^2 */
    p448_sub  ( &a->u,   &L0,   &L2 );
    p448_bias ( &a->u,     2 );
    p448_weak_reduce( &a->u );
    /* a->x is used as scratch for z^2; z = 2 * z^2; L0 = z - L1 */
    p448_sqr  ( &a->x, &a->z );
    p448_bias ( &a->x,     2 );
    p448_add  ( &a->z, &a->x, &a->x );
    p448_sub  (   &L0, &a->z,   &L1 );
    p448_weak_reduce(   &L0 );
    /* Final outputs: z = L1 * L0, x = L0 * t, y = L1 * u. */
    p448_mul  ( &a->z,   &L1,   &L0 );
    p448_mul  ( &a->x,   &L0, &a->t );
    p448_mul  ( &a->y,   &L1, &a->u );
 }

 /**
  * Compute all five fields of b (x, y, z, t, u) from the x, y and z fields
  * of a.  Going by the function name, this doubles a and maps the result to
  * the twisted representation.
  *
  * The inline formulas assume the p448_sqr/add/sub/mul helpers compute the
  * operation their name suggests with the first argument as output;
  * p448_bias / p448_weak_reduce are presumably limb-range maintenance --
  * TODO confirm against the p448 field header.
  *
  * @param b  Output point; several of its fields double as scratch space
  *           before receiving their final value.
  * @param a  Input point; only x, y and z are read.
  */
 void
 twist_and_double (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    struct p448_t L0;
    /* b->x = x^2, b->z = y^2, u = x^2 + y^2 */
    p448_sqr  ( &b->x, &a->x );
    p448_sqr  ( &b->z, &a->y );
    p448_add  ( &b->u, &b->x, &b->z );
    /* t = (x + y)^2 - u */
    p448_add  ( &b->t, &a->y, &a->x );
    p448_sqr  (   &L0, &b->t );
    p448_sub  ( &b->t,   &L0, &b->u );
    p448_bias ( &b->t,     3 );
    p448_weak_reduce( &b->t );
    /* L0 = y^2 - x^2 */
    p448_sub  (   &L0, &b->z, &b->x );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    /* b->x = z^2 (scratch), b->z = 2 * z^2, b->y = b->z - u */
    p448_sqr  ( &b->x, &a->z );
    p448_bias ( &b->x,     2 );
    p448_add  ( &b->z, &b->x, &b->x );
    p448_sub  ( &b->y, &b->z, &b->u );
    p448_weak_reduce( &b->y );
    /* Final outputs: z = L0 * y, x = y * t, y = L0 * u. */
    p448_mul  ( &b->z,   &L0, &b->y );
    p448_mul  ( &b->x, &b->y, &b->t );
    p448_mul  ( &b->y,   &L0, &b->u );
 }

 /**
  * Compute all five fields of b (x, y, z, t, u) from the x, y and z fields
  * of a.  Going by the function name, this doubles a and maps the result
  * from the twisted representation back to the untwisted one.
  *
  * The inline formulas assume the p448_sqr/add/sub/mul helpers compute the
  * operation their name suggests with the first argument as output;
  * p448_bias / p448_weak_reduce are presumably limb-range maintenance --
  * TODO confirm against the p448 field header.
  *
  * @param b  Output point; several of its fields double as scratch space
  *           before receiving their final value.
  * @param a  Input point; only x, y and z are read.
  */
 void
 untwist_and_double (
    struct extensible_t*          b,
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0;
    /* b->x = x^2, b->z = y^2, L0 = x^2 + y^2 */
    p448_sqr  ( &b->x, &a->x );
    p448_sqr  ( &b->z, &a->y );
    p448_add  (   &L0, &b->x, &b->z );
    /* t = (x + y)^2 - L0; b->u holds the square temporarily */
    p448_add  ( &b->t, &a->y, &a->x );
    p448_sqr  ( &b->u, &b->t );
    p448_sub  ( &b->t, &b->u,   &L0 );
    p448_bias ( &b->t,     3 );
    p448_weak_reduce( &b->t );
    /* u = y^2 - x^2 */
    p448_sub  ( &b->u, &b->z, &b->x );
    p448_bias ( &b->u,     2 );
    p448_weak_reduce( &b->u );
    /* b->x = z^2 (scratch), b->z = 2 * z^2, b->y = b->z - u */
    p448_sqr  ( &b->x, &a->z );
    p448_bias ( &b->x,     1 );
    p448_add  ( &b->z, &b->x, &b->x );
    p448_sub  ( &b->y, &b->z, &b->u );
    p448_weak_reduce( &b->y );
    /* Final outputs: z = L0 * y, x = y * t, y = L0 * u. */
    p448_mul  ( &b->z,   &L0, &b->y );
    p448_mul  ( &b->x, &b->y, &b->t );
    p448_mul  ( &b->y,   &L0, &b->u );
 }

 /**
  * Fill b from the x and y fields of a:
  *   b->n.a = y - x,
  *   b->n.b = x + y,
  *   b->n.c = negation of (x * y) scaled by the word constant 78164,
  *   b->z   = 2.
  *
  * The formulas assume the p448_sub/add/mul/mulw/neg/set_ui helpers compute
  * the operation their name suggests with the first argument as output;
  * p448_bias / p448_weak_reduce are presumably limb-range maintenance --
  * TODO confirm against the p448 field header.
  *
  * @param b  Output; b->z is used as scratch for the scaled product before
  *           it receives its final value.
  * @param a  Input point; only read.
  */
 void
 convert_tw_affine_to_tw_pniels (
    struct tw_pniels_t*       b,
    const struct tw_affine_t* a
 ) {
    p448_sub  ( &b->n.a, &a->y, &a->x );
    p448_bias ( &b->n.a,     2 );
    p448_weak_reduce( &b->n.a );
    p448_add  ( &b->n.b, &a->x, &a->y );
    p448_weak_reduce( &b->n.b );
    p448_mul  ( &b->n.c, &a->y, &a->x );
    /* b->z temporarily holds the scaled product; it is reset below. */
    p448_mulw ( &b->z, &b->n.c, 78164 );
    p448_neg  ( &b->n.c, &b->z );
    p448_bias ( &b->n.c,     2 );
    p448_weak_reduce( &b->n.c );
    p448_set_ui( &b->z,     2 );
 }

 /**
  * Populate b from a: x and t both receive a->x, y and u both receive a->y,
  * and z is set to the small constant 1.
  *
  * @param b  Output point; all five fields are written.
  * @param a  Input point; only read.
  */
 void
 convert_tw_affine_to_tw_extensible (
    struct tw_extensible_t*   b,
    const struct tw_affine_t* a
 ) {
    /* Fields sourced from a->x. */
    p448_copy(&b->x, &a->x);
    p448_copy(&b->t, &a->x);
    /* Fields sourced from a->y. */
    p448_copy(&b->y, &a->y);
    p448_copy(&b->u, &a->y);
    p448_set_ui(&b->z, 1);
 }

 /**
  * Populate b from a: x and t both receive a->x, y and u both receive a->y,
  * and z is set to the small constant 1.
  *
  * @param b  Output point; all five fields are written.
  * @param a  Input point; only read.
  */
 void
 convert_affine_to_extensible (
    struct extensible_t*   b,
    const struct affine_t* a
 ) {
    /* Fields sourced from a->x. */
    p448_copy(&b->x, &a->x);
    p448_copy(&b->t, &a->x);
    /* Fields sourced from a->y. */
    p448_copy(&b->y, &a->y);
    p448_copy(&b->u, &a->y);
    p448_set_ui(&b->z, 1);
 }

 /**
  * Fill b from the fields of a:
  *   b->n.a = y - x,
  *   b->n.b = x + y,
  *   b->n.c = negation of (u * t) scaled by the word constant 78164,
  *   b->z   = z + z.
  *
  * The formulas assume the p448_sub/add/mul/mulw/neg helpers compute the
  * operation their name suggests with the first argument as output;
  * p448_bias / p448_weak_reduce are presumably limb-range maintenance --
  * TODO confirm against the p448 field header.
  *
  * @param b  Output; b->z is used as scratch for the scaled product before
  *           it receives its final value.
  * @param a  Input point; all five fields are read.
  */
 void
 convert_tw_extensible_to_tw_pniels (
    struct tw_pniels_t*           b,
    const struct tw_extensible_t* a
 ) {
    p448_sub  ( &b->n.a, &a->y, &a->x );
    p448_bias ( &b->n.a,     2 );
    p448_weak_reduce( &b->n.a );
    p448_add  ( &b->n.b, &a->x, &a->y );
    p448_weak_reduce( &b->n.b );
    p448_mul  ( &b->n.c, &a->u, &a->t );
    /* b->z temporarily holds the scaled product; it is overwritten below. */
    p448_mulw ( &b->z, &b->n.c, 78164 );
    p448_neg  ( &b->n.c, &b->z );
    p448_bias ( &b->n.c,     2 );
    p448_weak_reduce( &b->n.c );
    p448_add  ( &b->z, &a->z, &a->z );
    p448_weak_reduce( &b->z );
 }

 /**
  * Fill e from the fields of d:
  *   e->u = n.b + n.a,
  *   e->t = n.b - n.a,
  *   e->x = z * e->t,
  *   e->y = z * e->u,
  *   e->z = z^2.
  *
  * The formulas assume the p448_add/sub/mul/sqr helpers compute the
  * operation their name suggests with the first argument as output;
  * p448_bias / p448_weak_reduce are presumably limb-range maintenance --
  * TODO confirm against the p448 field header.
  *
  * @param e  Output point; all five fields are written.
  * @param d  Input; its n.a, n.b and z fields are read (n.c is not used).
  */
 void
 convert_tw_pniels_to_tw_extensible (
    struct tw_extensible_t*   e,
    const struct tw_pniels_t* d
 ) {
    p448_add  ( &e->u, &d->n.b, &d->n.a );
    p448_sub  ( &e->t, &d->n.b, &d->n.a );
    p448_bias ( &e->t,     2 );
    p448_weak_reduce( &e->t );
    p448_mul  ( &e->x, &d->z, &e->t );
    p448_mul  ( &e->y, &d->z, &e->u );
    p448_sqr  ( &e->z, &d->z );
 }

 /**
  * Fill e from the fields of d:
  *   e->y = b + a,
  *   e->x = b - a,
  *   e->z = 1,
  *   e->t = copy of e->x,
  *   e->u = copy of e->y.
  *
  * The formulas assume the p448_add/sub/copy/set_ui helpers compute the
  * operation their name suggests with the first argument as output;
  * p448_bias / p448_weak_reduce are presumably limb-range maintenance --
  * TODO confirm against the p448 field header.
  *
  * @param e  Output point; all five fields are written.
  * @param d  Input; its a and b fields are read (c is not used).
  */
 void
 convert_tw_niels_to_tw_extensible (
    struct tw_extensible_t*  e,
    const struct tw_niels_t* d
 ) {
    p448_add  ( &e->y, &d->b, &d->a );
    p448_weak_reduce( &e->y );
    p448_sub  ( &e->x, &d->b, &d->a );
    p448_bias ( &e->x,     2 );
    p448_weak_reduce( &e->x );
    p448_set_ui( &e->z,     1 );
    p448_copy ( &e->t, &e->x );
    p448_copy ( &e->u, &e->y );
 }

 /**
  * Advance the state a in place: the xd, zd, xa and za fields are all
  * rewritten from their previous values, and z0 is read but never written.
  * Going by the function and field names, this is presumably one step of a
  * Montgomery ladder (combined differential add and double) -- TODO confirm
  * against the caller.
  *
  * The word constant 39082 is passed to p448_mulw to scale the square of
  * (zd + xd).  Its meaning is not established by this file; presumably a
  * curve-derived constant.
  *
  * p448_bias / p448_weak_reduce follow each subtraction and are presumably
  * limb-range maintenance -- TODO confirm against the p448 field header.
  *
  * @param a  Ladder state updated in place.
  */
 void
 montgomery_step (
    struct montgomery_t* a
 ) {
    struct p448_t L0, L1;
    /* L0 = zd + xd and L1 = xd - zd, taken from the incoming state. */
    p448_add  (   &L0, &a->zd, &a->xd );
    p448_sub  (   &L1, &a->xd, &a->zd );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    /* From here on zd and xd serve as scratch until their final values. */
    p448_sub  ( &a->zd, &a->xa, &a->za );
    p448_bias ( &a->zd,     2 );
    p448_weak_reduce( &a->zd );
    p448_mul  ( &a->xd,   &L0, &a->zd );
    p448_add  ( &a->zd, &a->za, &a->xa );
    p448_mul  ( &a->za,   &L1, &a->zd );
    /* New xa = z0 * (za + xd)^2. */
    p448_add  ( &a->xa, &a->za, &a->xd );
    p448_sqr  ( &a->zd, &a->xa );
    p448_mul  ( &a->xa, &a->z0, &a->zd );
    /* New za = (xd - za)^2. */
    p448_sub  ( &a->zd, &a->xd, &a->za );
    p448_bias ( &a->zd,     2 );
    p448_weak_reduce( &a->zd );
    p448_sqr  ( &a->za, &a->zd );
    /* xd = L0^2, L0 = L1^2, using the values saved at the top. */
    p448_sqr  ( &a->xd,   &L0 );
    p448_sqr  (   &L0,   &L1 );
    p448_mulw ( &a->zd, &a->xd, 39082 );
    p448_sub  (   &L1, &a->xd,   &L0 );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    /* New xd = L0 * zd. */
    p448_mul  ( &a->xd,   &L0, &a->zd );
    p448_sub  (   &L0, &a->zd,   &L1 );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    /* New zd = L0 * L1. */
    p448_mul  ( &a->zd,   &L0,   &L1 );
 }

 /**
  * Initialise the state a from sbz: z0 receives the square of sbz, za
  * receives a copy of z0, xd and xa are set to 1 and zd is set to 0.
  *
  * @param a    State to initialise; all five fields are written.
  * @param sbz  Input element; only read.
  */
 void
 deserialize_montgomery (
    struct montgomery_t* a,
    const struct p448_t* sbz
 ) {
    /* The squaring comes first: it is the only step that reads sbz. */
    p448_sqr(&a->z0, sbz);
    p448_copy(&a->za, &a->z0);
    /* Constant-valued fields. */
    p448_set_ui(&a->xa, 1);
    p448_set_ui(&a->xd, 1);
    p448_set_ui(&a->zd, 0);
 }

 /**
  * Compute an output element b from the state a and the element sbz, and
  * return a mask.
  *
  * The returned value is the bitwise OR of two p448_is_zero results: one on
  * a value derived from the p448_isr call (after p448_subw by 1), and one on
  * sbz itself.
  *
  * NOTE(review): this has the same shape as the check in is_square, so the
  * return value presumably reports whether the isr input was a square (or
  * sbz was zero) -- confirm against the caller.
  *
  * NOTE(review): mask_t values are negated (L6 = -L5) and complemented
  * (L4 = ~L5) before being passed to p448_addw / p448_mask.  This presumably
  * relies on p448_is_zero returning either 0 or all-ones -- confirm against
  * its definition.
  *
  * @param b    Output element; written once, by p448_mask.
  * @param a    State; its z0, xd, zd, xa and za fields are only read.
  * @param sbz  Input element; only read.
  * @return     L5 | L4 as described above.
  */
 mask_t
 serialize_montgomery (
    struct p448_t*             b,
    const struct montgomery_t* a,
    const struct p448_t*       sbz
 ) {
    mask_t L4, L5, L6;
    struct p448_t L0, L1, L2, L3;
    /* L3 = za * (z0 * zd - xd) */
    p448_mul  (   &L3, &a->z0, &a->zd );
    p448_sub  (   &L1,   &L3, &a->xd );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L3, &a->za,   &L1 );
    /* L0 = xa * (z0 * xd - zd) */
    p448_mul  (   &L2, &a->z0, &a->xd );
    p448_sub  (   &L1,   &L2, &a->zd );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L0, &a->xa,   &L1 );
    /* L3 = (L3 - L0) * (L0 + L3) */
    p448_add  (   &L2,   &L0,   &L3 );
    p448_sub  (   &L1,   &L3,   &L0 );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  (   &L3,   &L1,   &L2 );
    /* L1 = negation of 39082 * (z0 + 1)^2 */
    p448_copy (   &L2, &a->z0 );
    p448_addw (   &L2,     1 );
    p448_sqr  (   &L1,   &L2 );
    p448_mulw (   &L2,   &L1, 39082 );
    p448_neg  (   &L1,   &L2 );
    /* L2 = 4 * z0 + L1 */
    p448_add  (   &L2, &a->z0, &a->z0 );
    p448_bias (   &L2,     1 );
    p448_add  (   &L0,   &L2,   &L2 );
    p448_add  (   &L2,   &L0,   &L1 );
    p448_weak_reduce(   &L2 );
    p448_mul  (   &L0, &a->xd,   &L2 );
    /* L5 flags zd == 0; it selects L0 via p448_mask and adjusts L1 via L6. */
       L5 = p448_is_zero( &a->zd );
       L6 = -   L5;
    p448_mask (   &L1,   &L0,    L5 );
    p448_add  (   &L2,   &L1, &a->zd );
       L4 = ~   L5;
    p448_mul  (   &L1,   sbz,   &L3 );
    p448_addw (   &L1,    L6 );
    p448_mul  (   &L3,   &L2,   &L1 );
    p448_mul  (   &L1,   &L3,   &L2 );
    p448_mul  (   &L2,   &L3, &a->xd );
    p448_mul  (   &L3,   &L1,   &L2 );
    p448_isr  (   &L0,   &L3 );
    p448_mul  (   &L2,   &L1,   &L0 );
    /* L0 = L3 * isr(L3)^2, then compared against 1 below. */
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,   &L3,   &L1 );
    /* b receives L2 filtered by the complement of the zd == 0 flag. */
    p448_mask (     b,   &L2,    L4 );
    p448_subw (   &L0,     1 );
    p448_bias (   &L0,     1 );
       L5 = p448_is_zero(   &L0 );
       L4 = p448_is_zero(   sbz );
    return    L5 |    L4;
 }

 /**
  * Compute an output element b from the x, y and z fields of a (t and u are
  * not read).
  *
  * b is first used as scratch for z + y, and receives its final value from
  * the multiplication that follows the p448_isr call.
  *
  * NOTE(review): the last two calls write L1 and L0, which are never read
  * afterwards in this function.  They mirror the squareness check in
  * is_square but the result is discarded -- presumably generator residue;
  * confirm before removing.
  *
  * @param b  Output element.
  * @param a  Input point; only read.
  */
 void
 serialize_extensible (
    struct p448_t*             b,
    const struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    /* L0 = y - z, b = z + y (scratch use of the output). */
    p448_sub  (   &L0, &a->y, &a->z );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    p448_add  (     b, &a->z, &a->y );
    p448_mul  (   &L1, &a->z, &a->x );
    p448_mul  (   &L2,   &L0,   &L1 );
    p448_mul  (   &L1,   &L2,   &L0 );
    p448_mul  (   &L0,   &L2,     b );
    p448_mul  (   &L2,   &L1,   &L0 );
    p448_isr  (   &L0,   &L2 );
    /* Final value of b. */
    p448_mul  (     b,   &L1,   &L0 );
    /* Results of the next two calls are not used (see header note). */
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,   &L2,   &L1 );
 }

 /**
  * Compute an output element b from the x, y and z fields of a (t and u are
  * not read).  Going by the function name, this fuses the untwist-and-double
  * step with serialization.
  *
  * b is used as scratch several times and receives its final value from the
  * last multiplication.  The word constant 39082 is applied twice through
  * p448_mulw, each time followed by p448_neg.
  *
  * NOTE(review): the p448_sqr into b and the p448_mul into L0 just before
  * the final multiplication produce values that are never read (b is
  * overwritten by the last call, and L0 is not used again).  They mirror the
  * squareness check in is_square but the result is discarded -- presumably
  * generator residue; confirm before removing.
  *
  * @param b  Output element.
  * @param a  Input point; only read.
  */
 void
 untwist_and_double_and_serialize (
    struct p448_t*                b,
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2, L3;
    /* L3 = y * x is kept until the final multiplication. */
    p448_mul  (   &L3, &a->y, &a->x );
    /* b = (y + x)^2 - 2 * L3 */
    p448_add  (     b, &a->y, &a->x );
    p448_sqr  (   &L1,     b );
    p448_add  (   &L2,   &L3,   &L3 );
    p448_sub  (     b,   &L1,   &L2 );
    p448_bias (     b,     3 );
    p448_weak_reduce(     b );
    /* L1 = z^4 */
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1,   &L2 );
    p448_add  (   &L2,     b,     b );
    p448_mulw (     b,   &L2, 39082 );
    p448_neg  (   &L2,     b );
    p448_bias (   &L2,     2 );
    p448_mulw (   &L0,   &L2, 39082 );
    p448_neg  (     b,   &L0 );
    p448_bias (     b,     2 );
    p448_mul  (   &L0,   &L2,   &L1 );
    p448_mul  (   &L2,     b,   &L0 );
    p448_isr  (   &L0,   &L2 );
    p448_mul  (   &L1,     b,   &L0 );
    /* Results of the next two calls are not used (see header note). */
    p448_sqr  (     b,   &L0 );
    p448_mul  (   &L0,   &L2,     b );
    /* Final value of b. */
    p448_mul  (     b,   &L1,   &L3 );
 }

 /**
  * Compute all five fields of b from the x, y and z fields of a.  The fields
  * of b are used as scratch along the way; on exit b->z is set to 1 and
  * b->t / b->u are copies of b->x / b->y.
  *
  * Going by the function name, this maps a to the twisted representation
  * and is presumably only valid for "even" points -- TODO confirm against
  * the caller.
  *
  * NOTE(review): the p448_sqr into b->x and p448_mul into b->t that follow
  * the p448_isr call produce values that are overwritten before being read
  * (b->x by the next multiplication, b->t by the final copy).  They mirror
  * the squareness check in is_square but the result is discarded --
  * presumably generator residue; confirm before removing.
  *
  * NOTE(review): L0 = -L1 is passed to p448_addw; this presumably relies on
  * p448_is_zero returning either 0 or all-ones -- confirm against its
  * definition.
  *
  * @param b  Output point.
  * @param a  Input point; only x, y and z are read.
  */
 void
 twist_even (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    mask_t L0, L1;
    /* u = z^2 - x^2 */
    p448_sqr  ( &b->y, &a->z );
    p448_sqr  ( &b->z, &a->x );
    p448_sub  ( &b->u, &b->y, &b->z );
    p448_bias ( &b->u,     2 );
    p448_weak_reduce( &b->u );
    /* b->y = (z - x) * y */
    p448_sub  ( &b->z, &a->z, &a->x );
    p448_bias ( &b->z,     2 );
    p448_weak_reduce( &b->z );
    p448_mul  ( &b->y, &b->z, &a->y );
    /* b->z = z - y; it is tested for zero further down. */
    p448_sub  ( &b->z, &a->z, &a->y );
    p448_bias ( &b->z,     2 );
    p448_weak_reduce( &b->z );
    p448_mul  ( &b->x, &b->z, &b->y );
    p448_mul  ( &b->t, &b->x, &b->u );
    p448_mul  ( &b->y, &b->x, &b->t );
    p448_isr  ( &b->t, &b->y );
    p448_mul  ( &b->u, &b->x, &b->t );
    /* Results of the next two calls are not used (see header note). */
    p448_sqr  ( &b->x, &b->t );
    p448_mul  ( &b->t, &b->y, &b->x );
    /* Scale the input x and y by the common factor held in b->u. */
    p448_mul  ( &b->x, &a->x, &b->u );
    p448_mul  ( &b->y, &a->y, &b->u );
       L1 = p448_is_zero( &b->z );
       L0 = -   L1;
    p448_addw ( &b->y,    L0 );
    p448_weak_reduce( &b->y );
    p448_set_ui( &b->z,     1 );
    p448_copy ( &b->t, &b->x );
    p448_copy ( &b->u, &b->y );
 }

 /**
  * Compute all five fields of b from the x, y and z fields of a.  The fields
  * of b are used as scratch along the way; on exit b->t / b->u are copies of
  * b->x / b->y, and b->z is set through p448_set_ui from a mask-derived
  * value.
  *
  * Going by the function name, this maps a to the twisted representation
  * and is intended for tests only -- TODO confirm against the caller.
  *
  * NOTE(review): mask_t values are negated (L3 = -L2) before p448_addw, and
  * L2 = L3 + 1 is passed to p448_set_ui.  If p448_is_zero returns all-ones
  * for "zero" and 0 otherwise, b->z becomes 0 when a->y is zero and 1
  * otherwise -- confirm against the definition of p448_is_zero.
  *
  * @param b  Output point.
  * @param a  Input point; only x, y and z are read.
  */
 void
 test_only_twist (
    struct tw_extensible_t*    b,
    const struct extensible_t* a
 ) {
    mask_t L2, L3;
    struct p448_t L0, L1;
    /* u = 4 * (z^2 - x^2), built by two successive doublings. */
    p448_sqr  ( &b->u, &a->z );
    p448_sqr  ( &b->y, &a->x );
    p448_sub  ( &b->z, &b->u, &b->y );
    p448_bias ( &b->z,     2 );
    p448_add  ( &b->y, &b->z, &b->z );
    p448_add  ( &b->u, &b->y, &b->y );
    p448_weak_reduce( &b->u );
    /* b->y = z - x; it is tested for zero further down. */
    p448_sub  ( &b->y, &a->z, &a->x );
    p448_bias ( &b->y,     2 );
    p448_weak_reduce( &b->y );
    p448_mul  ( &b->x, &b->y, &a->y );
    /* b->z = z - y; it is tested for zero further down. */
    p448_sub  ( &b->z, &a->z, &a->y );
    p448_bias ( &b->z,     2 );
    p448_weak_reduce( &b->z );
    p448_mul  ( &b->t, &b->z, &b->x );
    p448_mul  (   &L1, &b->t, &b->u );
    p448_mul  ( &b->x, &b->t,   &L1 );
    p448_isr  (   &L0, &b->x );
    p448_mul  ( &b->u, &b->t,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    /* Unlike in twist_even, this product is consumed below. */
    p448_mul  ( &b->t, &b->x,   &L1 );
    /* L1 = y + x, L0 = x - y */
    p448_add  (   &L1, &a->y, &a->x );
    p448_weak_reduce(   &L1 );
    p448_sub  (   &L0, &a->x, &a->y );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    p448_mul  ( &b->x, &b->t,   &L0 );
    p448_add  (   &L0, &b->x,   &L1 );
    p448_sub  ( &b->t,   &L1, &b->x );
    p448_bias ( &b->t,     2 );
    p448_weak_reduce( &b->t );
    p448_mul  ( &b->x,   &L0, &b->u );
    /* Adjust x by a mask-derived word when (z - x) was zero. */
       L2 = p448_is_zero( &b->y );
       L3 = -   L2;
    p448_addw ( &b->x,    L3 );
    p448_weak_reduce( &b->x );
    p448_mul  ( &b->y, &b->t, &b->u );
    /* Adjust y by a mask-derived word when (z - y) was zero. */
       L2 = p448_is_zero( &b->z );
       L3 = -   L2;
    p448_addw ( &b->y,    L3 );
    p448_weak_reduce( &b->y );
       L3 = p448_is_zero( &a->y );
       L2 =    L3 +     1;
    p448_set_ui( &b->z,    L2 );
    p448_copy ( &b->t, &b->x );
    p448_copy ( &b->u, &b->y );
 }

 mask_t
 is_square (
    const struct p448_t* x
 ) {
    mask_t L2, L3;
    struct p448_t L0, L1;
    p448_isr  (   &L0,     x );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  (   &L0,     x,   &L1 );
    p448_subw (   &L0,     1 );
    p448_bias (   &L0,     1 );
       L3 = p448_is_zero(   &L0 );
       L2 = p448_is_zero(     x );
    return    L3 |    L2;
 }

 mask_t
 is_even_pt (
    const struct extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1, &a->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
    p448_weak_reduce(   &L0 );
    return is_square (   &L0 );
 }

 mask_t
 is_even_tw (
    const struct tw_extensible_t* a
 ) {
    struct p448_t L0, L1, L2;
    p448_sqr  (   &L2, &a->z );
    p448_sqr  (   &L1, &a->x );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_weak_reduce(   &L0 );
    return is_square (   &L0 );
 }

 /**
  * Compute the affine point a from the single field element sz.
  *
  * a->x and a->y are used as scratch space and then receive the result.
  *
  * @param a  Output point.
  * @param sz Input field element.
  * @return p448_is_zero(check - 1), where check is the product formed from
  *         the p448_isr result below; presumably this signals whether sz
  *         decoded to a valid point -- confirm against the callers.
  */
 mask_t
 deserialize_affine (
    struct affine_t*     a,
    const struct p448_t* sz
 ) {
    struct p448_t L0, L1, L2, L3;
    /* L1 = sz^2 */
    p448_sqr  (   &L1,    sz );
    /* a->x = -39082 * (sz^2 + 1)^2 */
    p448_copy (   &L3,   &L1 );
    p448_addw (   &L3,     1 );
    p448_sqr  ( &a->x,   &L3 );
    p448_mulw (   &L3, &a->x, 39082 );
    p448_neg  ( &a->x,   &L3 );
    /* L3 = 4*sz^2 - 39082 * (sz^2 + 1)^2 */
    p448_add  (   &L3,   &L1,   &L1 );
    p448_bias (   &L3,     1 );
    p448_add  ( &a->y,   &L3,   &L3 );
    p448_add  (   &L3, &a->y, &a->x );
    p448_weak_reduce(   &L3 );
    /* a->x = 1 - sz^2 */
    p448_copy ( &a->y,   &L1 );
    p448_subw ( &a->y,     1 );
    p448_neg  ( &a->x, &a->y );
    p448_bias ( &a->x,     2 );
    p448_weak_reduce( &a->x );
    /* Build the p448_isr argument a->y = a->x^4 * L3 (L0 = a->x^3 * L3). */
    p448_mul  ( &a->y, &a->x,   &L3 );
    p448_sqr  (   &L2, &a->x );
    p448_mul  (   &L0,   &L2, &a->y );
    p448_mul  ( &a->y, &a->x,   &L0 );
    p448_isr  (   &L3, &a->y );
    /* a->y = a->x^2 * isr; L3 = L0 * isr^2; L0 = a->x * L3 is the check value */
    p448_mul  ( &a->y,   &L2,   &L3 );
    p448_sqr  (   &L2,   &L3 );
    p448_mul  (   &L3,   &L0,   &L2 );
    p448_mul  (   &L0, &a->x,   &L3 );
    /* Output x = sz * 2 * a->y */
    p448_add  (   &L2, &a->y, &a->y );
    p448_mul  ( &a->x,    sz,   &L2 );
    /* Output y = (sz^2 + 1) * L3 */
    p448_addw (   &L1,     1 );
    p448_mul  ( &a->y,   &L1,   &L3 );
    /* Success mask: check value == 1 */
    p448_subw (   &L0,     1 );
    p448_bias (   &L0,     1 );
    return p448_is_zero(   &L0 );
 }

 /**
  * Compute the tw_extensible_t point a from the field element sz, using the
  * caller-supplied constant sdm1.
  *
  * All fields of a are used as scratch space.  On return a->z = 1, a->t is a
  * copy of a->x and a->u is a copy of a->y.
  *
  * @param a    Output point.
  * @param sdm1 Constant multiplied into the computation (presumably
  *             sqrt(d-1), cf. sqrt_d_minus_1 -- confirm against the callers).
  * @param sz   Input field element.
  * @return p448_is_zero(check - 1), where check is the a->t product formed
  *         after the p448_isr call; presumably the success indicator.
  */
 mask_t
 deserialize_and_twist_approx (
    struct tw_extensible_t* a,
    const struct p448_t*    sdm1,
    const struct p448_t*    sz
 ) {
    struct p448_t L0, L1;
    /* a->z = sz^2 */
    p448_sqr  ( &a->z,    sz );
    /* a->x = -39082 * (sz^2 + 1)^2 */
    p448_copy ( &a->y, &a->z );
    p448_addw ( &a->y,     1 );
    p448_sqr  ( &a->x, &a->y );
    p448_mulw ( &a->y, &a->x, 39082 );
    p448_neg  ( &a->x, &a->y );
    /* a->y = 4*sz^2 - 39082 * (sz^2 + 1)^2 */
    p448_add  ( &a->y, &a->z, &a->z );
    p448_bias ( &a->y,     1 );
    p448_add  ( &a->u, &a->y, &a->y );
    p448_add  ( &a->y, &a->u, &a->x );
    p448_weak_reduce( &a->y );
    /* a->u = 1 - sz^4 */
    p448_sqr  ( &a->x, &a->z );
    p448_subw ( &a->x,     1 );
    p448_neg  ( &a->u, &a->x );
    p448_bias ( &a->u,     2 );
    p448_weak_reduce( &a->u );
    /* a->x = sdm1 * (1 - sz^4); build the p448_isr argument in a->y */
    p448_mul  ( &a->x,  sdm1, &a->u );
    p448_mul  (   &L0, &a->x, &a->y );
    p448_mul  ( &a->t,   &L0, &a->y );
    p448_mul  ( &a->u, &a->x, &a->t );
    p448_mul  ( &a->t, &a->u,   &L0 );
    p448_mul  ( &a->y, &a->x, &a->t );
    p448_isr  (   &L0, &a->y );
    /* a->y = a->u * isr; a->u = a->t * isr^2; a->t = a->x * a->u is the check value */
    p448_mul  ( &a->y, &a->u,   &L0 );
    p448_sqr  (   &L1,   &L0 );
    p448_mul  ( &a->u, &a->t,   &L1 );
    p448_mul  ( &a->t, &a->x, &a->u );
    /* L0 = a->u * 2*sz */
    p448_add  ( &a->x,    sz,    sz );
    p448_mul  (   &L0, &a->u, &a->x );
    /* Output x = (1 - sz^2) * L0 */
    p448_copy ( &a->x, &a->z );
    p448_subw ( &a->x,     1 );
    p448_neg  (   &L1, &a->x );
    p448_bias (   &L1,     2 );
    p448_weak_reduce(   &L1 );
    p448_mul  ( &a->x,   &L1,   &L0 );
    /* Output y = (sz^2 + 1) * a->u * a->y */
    p448_mul  (   &L0, &a->u, &a->y );
    p448_addw ( &a->z,     1 );
    p448_mul  ( &a->y, &a->z,   &L0 );
    /* Success mask: check value == 1 */
    p448_subw ( &a->t,     1 );
    p448_bias ( &a->t,     1 );
    mask_t ret = p448_is_zero( &a->t );
    /* Output is scaled so that z = 1, hence t = x and u = y. */
    p448_set_ui( &a->z,     1 );
    p448_copy ( &a->t, &a->x );
    p448_copy ( &a->u, &a->y );
    return ret;
 }

 /**
  * Overwrite a with the coordinates x = 0, y = 1, z = 1, t = 0, u = 0.
  *
  * @param a Point to overwrite.
  */
 void
 set_identity_extensible (
    struct extensible_t* a
 ) {
    /* Coordinates that are zero. */
    p448_set_ui(&a->x, 0);
    p448_set_ui(&a->t, 0);
    p448_set_ui(&a->u, 0);

    /* Coordinates that are one. */
    p448_set_ui(&a->y, 1);
    p448_set_ui(&a->z, 1);
 }

 /**
  * Overwrite a with the coordinates x = 0, y = 1, z = 1, t = 0, u = 0.
  *
  * @param a Point to overwrite.
  */
 void
 set_identity_tw_extensible (
    struct tw_extensible_t* a
 ) {
    /* Coordinates that are zero. */
    p448_set_ui(&a->x, 0);
    p448_set_ui(&a->t, 0);
    p448_set_ui(&a->u, 0);

    /* Coordinates that are one. */
    p448_set_ui(&a->y, 1);
    p448_set_ui(&a->z, 1);
 }

 /**
  * Overwrite a with the affine coordinates (x, y) = (0, 1).
  *
  * @param a Point to overwrite.
  */
 void
 set_identity_affine (
    struct affine_t* a
 ) {
    p448_set_ui(&a->y, 1);
    p448_set_ui(&a->x, 0);
 }

 mask_t
 eq_affine (
    const struct affine_t* a,
    const struct affine_t* b
 ) {
    mask_t L1, L2;
    struct p448_t L0;
    p448_sub  (   &L0, &a->x, &b->x );
    p448_bias (   &L0,     2 );
       L2 = p448_is_zero(   &L0 );
    p448_sub  (   &L0, &a->y, &b->y );
    p448_bias (   &L0,     2 );
       L1 = p448_is_zero(   &L0 );
    return    L2 &    L1;
 }

 mask_t
 eq_extensible (
    const struct extensible_t* a,
    const struct extensible_t* b
 ) {
    mask_t L3, L4;
    struct p448_t L0, L1, L2;
    p448_mul  (   &L2, &b->z, &a->x );
    p448_mul  (   &L1, &a->z, &b->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    p448_mul  (   &L2, &b->z, &a->y );
    p448_mul  (   &L1, &a->z, &b->y );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L3 = p448_is_zero(   &L0 );
    return    L4 &    L3;
 }

 mask_t
 eq_tw_extensible (
    const struct tw_extensible_t* a,
    const struct tw_extensible_t* b
 ) {
    mask_t L3, L4;
    struct p448_t L0, L1, L2;
    p448_mul  (   &L2, &b->z, &a->x );
    p448_mul  (   &L1, &a->z, &b->x );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    p448_mul  (   &L2, &b->z, &a->y );
    p448_mul  (   &L1, &a->z, &b->y );
    p448_sub  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     2 );
       L3 = p448_is_zero(   &L0 );
    return    L4 &    L3;
 }

 /**
  * Compute the affine point a from the field element r.
  *
  * a->x and a->y serve as scratch space during the computation and hold the
  * result on return.  The function has no return value and performs no
  * validity check of its own.
  *
  * NOTE(review): by its name this is an Elligator-2 style injection of r onto
  * the curve; the large p448_mulw constants are curve-specific and their
  * derivation is not visible here.
  *
  * @param a Output point.
  * @param r Input field element.
  */
 void
 elligator_2s_inject (
    struct affine_t*     a,
    const struct p448_t* r
 ) {
    mask_t L0, L1;
    struct p448_t L2, L3, L4, L5, L6, L7, L8;
    /* a->x = r^2; L3 = r^4 */
    p448_sqr  ( &a->x,     r );
    p448_sqr  (   &L3, &a->x );
    /* L4 = 1 - r^4; L2 = (1 - r^4)^2 */
    p448_copy ( &a->y,   &L3 );
    p448_subw ( &a->y,     1 );
    p448_neg  (   &L4, &a->y );
    p448_bias (   &L4,     2 );
    p448_weak_reduce(   &L4 );
    p448_sqr  (   &L2,   &L4 );
    /* a->y = 6108985600 * r^4 + 1527402724 * (1 - r^4)^2 */
    p448_mulw (   &L7,   &L2, 1527402724 );
    p448_mulw (   &L8,   &L3, 6108985600 );
    p448_add  ( &a->y,   &L8,   &L7 );
    p448_weak_reduce( &a->y );
    /* L7 = a->y - 6109454568 * (1 - r^4)^2 */
    p448_mulw (   &L8,   &L2, 6109454568 );
    p448_sub  (   &L7, &a->y,   &L8 );
    p448_bias (   &L7,     2 );
    p448_weak_reduce(   &L7 );
    /* L6 = 78160 * a->y; the products below assemble the p448_isr argument in L4 */
    p448_mulw (   &L6, &a->y, 78160 );
    p448_mul  (   &L5,   &L7,   &L6 );
    p448_mul  (   &L8,   &L5,   &L4 );
    p448_mul  (   &L4,   &L5,   &L6 );
    p448_mul  (   &L5,   &L7,   &L8 );
    p448_mul  (   &L8,   &L5,   &L4 );
    p448_mul  (   &L4,   &L7,   &L8 );
    p448_isr  (   &L6,   &L4 );
    /* Combine the p448_isr result with the saved products. */
    p448_mul  (   &L4,   &L5,   &L6 );
    p448_sqr  (   &L5,   &L6 );
    p448_mul  (   &L6,   &L8,   &L5 );
    p448_mul  (   &L8,   &L7,   &L6 );
    p448_mul  (   &L7,   &L8,   &L6 );
    /* a->x = (r^2 - 1) - (r^2 + 1) * L8 */
    p448_copy (   &L6, &a->x );
    p448_subw (   &L6,     1 );
    p448_addw ( &a->x,     1 );
    p448_mul  (   &L5, &a->x,   &L8 );
    p448_sub  ( &a->x,   &L6,   &L5 );
    p448_bias ( &a->x,     3 );
    p448_weak_reduce( &a->x );
    /* Output x = -78160 * L4 * a->x */
    p448_mul  (   &L5,   &L4, &a->x );
    p448_mulw (   &L4,   &L5, 78160 );
    p448_neg  ( &a->x,   &L4 );
    p448_bias ( &a->x,     2 );
    p448_weak_reduce( &a->x );
    /* L3 = 2*r^4 + (1 - r^4)^2 - 2 */
    p448_add  (   &L4,   &L3,   &L3 );
    p448_add  (   &L3,   &L4,   &L2 );
    p448_subw (   &L3,     2 );
    p448_bias (   &L3,     1 );
    p448_weak_reduce(   &L3 );
    /* Output y = L7 * (3054649120 * L3 * L8 + a->y) */
    p448_mul  (   &L2,   &L3,   &L8 );
    p448_mulw (   &L3,   &L2, 3054649120 );
    p448_add  (   &L2,   &L3, &a->y );
    p448_mul  ( &a->y,   &L7,   &L2 );
    /* Add the negated p448_is_zero mask of L8 to the output y. */
       L1 = p448_is_zero(   &L8 );
       L0 = -   L1;
    p448_addw ( &a->y,    L0 );
    p448_weak_reduce( &a->y );
 }

 mask_t
 validate_affine (
    const struct affine_t* a
 ) {
    struct p448_t L0, L1, L2, L3;
    p448_sqr  (   &L0, &a->y );
    p448_sqr  (   &L2, &a->x );
    p448_add  (   &L3,   &L2,   &L0 );
    p448_subw (   &L3,     1 );
    p448_mulw (   &L1,   &L2, 39081 );
    p448_neg  (   &L2,   &L1 );
    p448_bias (   &L2,     2 );
    p448_mul  (   &L1,   &L0,   &L2 );
    p448_sub  (   &L0,   &L3,   &L1 );
    p448_bias (   &L0,     3 );
    return p448_is_zero(   &L0 );
 }

 /**
  * Check the two invariants spelled out in the body for a twisted
  * extensible point.
  *
  * @param ext Point to check.
  * @return The AND of the two p448_is_zero masks, one per invariant.
  */
 mask_t
 validate_tw_extensible (
    const struct tw_extensible_t* ext
 ) {
    mask_t L4, L5;
    struct p448_t L0, L1, L2, L3;
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L1, &ext->t, &ext->u );
    p448_mul  (   &L2, &ext->z,   &L1 );
    /* Adding 0 leaves L2 unchanged. */
    p448_addw (   &L2,     0 );
    p448_mul  (   &L0, &ext->x, &ext->y );
    p448_neg  (   &L1,   &L0 );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_bias (   &L0,     2 );
       L5 = p448_is_zero(   &L0 );
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
     */
    /* L2 = x^2 - y^2 */
    p448_sqr  (   &L2, &ext->y );
    p448_neg  (   &L1,   &L2 );
    p448_addw (   &L1,     0 );
    p448_sqr  (   &L0, &ext->x );
    p448_add  (   &L2,   &L0,   &L1 );
    /* L1 = t^2 * u^2; the d*t^2*u^2 term is formed as -39081 * L1 */
    p448_sqr  (   &L3, &ext->u );
    p448_sqr  (   &L0, &ext->t );
    p448_mul  (   &L1,   &L0,   &L3 );
    p448_mulw (   &L0,   &L1, 39081 );
    p448_neg  (   &L3,   &L0 );
    p448_add  (   &L0,   &L3,   &L2 );
    /* Subtract t^2*u^2, then add z^2. */
    p448_neg  (   &L3,   &L1 );
    p448_add  (   &L2,   &L3,   &L0 );
    p448_sqr  (   &L1, &ext->z );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_bias (   &L0,     4 );
       L4 = p448_is_zero(   &L0 );
    return    L5 &    L4;
 }

 /**
  * Check the two invariants spelled out in the body for an extensible point.
  *
  * @param ext Point to check.
  * @return The AND of the two p448_is_zero masks, one per invariant.
  */
 mask_t
 validate_extensible (
    const struct extensible_t* ext
 ) {
    mask_t L4, L5;
    struct p448_t L0, L1, L2, L3;
    /*
     * Check invariant:
     * 0 = d*t^2*u^2 - x^2 - y^2 + z^2
     */
    /* L2 = z^2 - y^2 */
    p448_sqr  (   &L2, &ext->y );
    p448_neg  (   &L1,   &L2 );
    /* Adding 0 leaves L1 unchanged. */
    p448_addw (   &L1,     0 );
    p448_sqr  (   &L0, &ext->z );
    p448_add  (   &L2,   &L0,   &L1 );
    /* The d*t^2*u^2 term is formed as -39081 * t^2 * u^2 */
    p448_sqr  (   &L3, &ext->u );
    p448_sqr  (   &L0, &ext->t );
    p448_mul  (   &L1,   &L0,   &L3 );
    p448_mulw (   &L3,   &L1, 39081 );
    p448_neg  (   &L0,   &L3 );
    p448_add  (   &L1,   &L0,   &L2 );
    /* Subtract x^2. */
    p448_sqr  (   &L0, &ext->x );
    p448_neg  (   &L2,   &L0 );
    p448_add  (   &L0,   &L2,   &L1 );
    p448_bias (   &L0,     4 );
       L5 = p448_is_zero(   &L0 );
    /*
     * Check invariant:
     * 0 = -x*y + z*t*u
     */
    p448_mul  (   &L1, &ext->t, &ext->u );
    p448_mul  (   &L2, &ext->z,   &L1 );
    p448_addw (   &L2,     0 );
    p448_mul  (   &L0, &ext->x, &ext->y );
    p448_neg  (   &L1,   &L0 );
    p448_add  (   &L0,   &L1,   &L2 );
    p448_bias (   &L0,     2 );
       L4 = p448_is_zero(   &L0 );
    return    L5 &    L4;
 }


--- a/src/arch_neon_experimental/p448.c
+++ b/src/arch_neon_experimental/p448.c
--- a/src/arch_neon_experimental/p448.h
+++ b/src/arch_neon_experimental/p448.h
@@ -0,0 +1,376 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */
 #ifndef __P448_H__
 #define __P448_H__ 1

 #include "word.h"

 #include <stdint.h>
 #include <assert.h>

 /**
  * Field element, stored as 16 limbs of type uint32_t.
  *
  * The helpers in this header treat each limb as a 28-bit digit (see the
  * (1<<28)-1 masks in p448_set_ui and p448_weak_reduce), with the spare high
  * bits absorbing carries until the next p448_weak_reduce.  The struct is
  * 32-byte aligned because the helpers access it through vector-typed
  * pointers.
  */
 typedef struct p448_t {
   uint32_t limb[16];
 } __attribute__((aligned(32))) p448_t;

 /*
  * Index permutation for the limb array: rotates the 4-bit index x left by
  * one bit, so indices k and k+8 (k = 0..7) map to the adjacent slots 2k and
  * 2k+1.  Used as limb[LIMBPERM(j)] to reach digit j.
  */
 #define LIMBPERM(x) (((x)<<1 | (x)>>3) & 15)
 /* Signals to other files (e.g. constant tables) that limbs are stored in LIMBPERM order. */
 #define USE_NEON_PERM 1

 #ifdef __cplusplus
 extern "C" {
 #endif

 /** Set out to the small integer x; all remaining limbs are cleared. */
 static __inline__ void
 p448_set_ui (
    p448_t *out,
    uint64_t x
 ) __attribute__((unused,always_inline));
           
 /**
  * Exchange a and b when do_swap is set.  The swap is done by masking with a
  * register-wide mask built from do_swap, not by branching on it.
  */
 static __inline__ void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t do_swap
 ) __attribute__((unused,always_inline));

 /** out = a + b, added limb by limb with no carry propagation. */
 static __inline__ void
 p448_add (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) __attribute__((unused,always_inline));
             
 /**
  * out = a - b, subtracted limb by limb with no borrow propagation.
  * p448_eq and p448_cond_neg follow such raw results with p448_bias.
  */
 static __inline__ void
 p448_sub (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) __attribute__((unused,always_inline));
             
 /** out = -a, negated limb by limb. */
 static __inline__ void
 p448_neg (
    p448_t *out,
    const p448_t *a
 ) __attribute__((unused,always_inline));
            
 /**
  * Replace a by its (biased) negation when doNegate is set; selection is
  * done by masking rather than branching.
  */
 static __inline__ void
 p448_cond_neg (
    p448_t *a,
    mask_t doNegate
 ) __attribute__((unused,always_inline));

 /** Add the word x to limb[0] of a. */
 static __inline__ void
 p448_addw (
    p448_t *a,
    uint32_t x
 ) __attribute__((unused,always_inline));
             
 /** Subtract the word x from limb[0] of a. */
 static __inline__ void
 p448_subw (
    p448_t *a,
    uint32_t x
 ) __attribute__((unused,always_inline));
             
 /** Copy a into out. */
 static __inline__ void
 p448_copy (
    p448_t *out,
    const p448_t *a
 ) __attribute__((unused,always_inline));
             
 /**
  * Propagate carries: every limb is masked to 28 bits and receives the bits
  * above 28 of its predecessor; the carries out of the top limbs are folded
  * back into the low limbs.
  */
 static __inline__ void
 p448_weak_reduce (
    p448_t *inout
 ) __attribute__((unused,always_inline));
             
 /*
  * Defined outside this header.  NOTE(review): presumably reduces inout to
  * its unique canonical representative -- confirm in p448.c.
  */
 void
 p448_strong_reduce (
    p448_t *inout
 );

 /*
  * Defined outside this header.  Returns a mask_t that callers combine with
  * & and |.  NOTE(review): presumably all ones when in is zero modulo the
  * field prime and 0 otherwise -- confirm in p448.c.
  */
 mask_t
 p448_is_zero (
    const p448_t *in
 );
             
 /**
  * Add amount times a fixed limb pattern (2^28-1 in every limb except one,
  * which gets 2^28-2) to inout.  Used after limb-wise subtraction or
  * negation; presumably the pattern is the field prime, so the value is
  * unchanged modulo it.
  */
 static __inline__ void
 p448_bias (
    p448_t *inout,
    int amount
 ) __attribute__((unused,always_inline));

 /*
  * out = a * b.  Defined outside this header.  out is declared __restrict__,
  * so it must not alias either input.
  */
 void
 p448_mul (
    p448_t *__restrict__ out,
    const p448_t *a,
    const p448_t *b
 );

 /*
  * out = a * b for a word-sized multiplier b.  Defined outside this header.
  * out is declared __restrict__, so it must not alias a.
  */
 void
 p448_mulw (
    p448_t *__restrict__ out,
    const p448_t *a,
    uint64_t b
 );

 /*
  * out = a^2.  Defined outside this header.  out is declared __restrict__,
  * so it must not alias a.
  */
 void
 p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
 );
         
 /** y = x squared n times in a row (x^(2^n)); n must be positive. */
 static __inline__ void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) __attribute__((unused,always_inline));

 /*
  * Defined outside this header.  NOTE(review): presumably writes the 56-byte
  * encoding of x to serial (cf. p448_deserialize) -- confirm in p448.c.
  */
 void
 p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
 );

 /*
  * Defined outside this header.  Reads a 56-byte encoding into x and returns
  * a mask_t; presumably the mask reports whether the encoding was valid.
  */
 mask_t
 p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 );
    
 /** a = b AND mask, applied to every limb. */
 static __inline__ void
 p448_mask(
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) __attribute__((unused,always_inline));

 /**
 * Returns 1/x.
 * 
 * If x=0, returns 0.
 */
 void
 p448_inverse (
   struct p448_t*       a,
   const struct p448_t* x
 );
       
 /*
  * Defined outside this header.  NOTE(review): presumably inverts the n
  * elements of in into out in one batch -- confirm in p448.c.  out is
  * declared __restrict__, so it must not alias in.
  */
 void
 simultaneous_invert_p448 (
    struct p448_t *__restrict__ out,
    const struct p448_t *in,
    unsigned int n
 );

 /**
  * Compare a and b: both are copied and weakly reduced, subtracted, biased,
  * and the p448_is_zero mask of the difference is returned.
  */
 static inline mask_t
 p448_eq (
    const struct p448_t *a,
    const struct p448_t *b
 ) __attribute__((always_inline,unused));

 /* -------------- Inline functions begin here -------------- */

 /**
  * Set out to the integer x.
  *
  * Limbs hold 28 bits each and are stored in LIMBPERM order, so digits 0, 1
  * and 2 live at array indices 0, 2 and 4.  x is split across all three
  * digits so that every bit of the 64-bit input is kept; a single 32-bit limb
  * cannot hold the up-to-36-bit value x>>28.  For x < 2^56 only indices 0 and
  * 2 are non-zero.
  *
  * @param out Field element to overwrite.
  * @param x   Value to store.
  */
 void
 p448_set_ui (
    p448_t *out,
    uint64_t x
 ) {
    const uint32_t limb_mask = ((uint32_t)1 << 28) - 1;
    int i;
    for (i=0; i<16; i++) {
      out->limb[i] = 0;
    }
    out->limb[0] = (uint32_t)(x & limb_mask);          /* digit 0: LIMBPERM(0) == 0 */
    out->limb[2] = (uint32_t)((x >> 28) & limb_mask);  /* digit 1: LIMBPERM(1) == 2 */
    out->limb[4] = (uint32_t)(x >> 56);                /* digit 2: LIMBPERM(2) == 4 */
 }
            
 /**
  * Exchange the contents of a and b when doswap is set, leave both untouched
  * otherwise.  The exchange is an XOR-swap gated by a register-wide mask, so
  * the same instructions run either way.
  *
  * @param a      First element.
  * @param b      Second element.
  * @param doswap Swap selector, expanded with br_set_to_mask.
  */
 void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t doswap
 ) {
    big_register_t *lanes_a = (big_register_t*)a;
    big_register_t *lanes_b = (big_register_t*)b;
    big_register_t select = br_set_to_mask(doswap);
    unsigned int lane;

    for (lane=0; lane<sizeof(*a)/sizeof(*lanes_a); lane++) {
        /* Bits that differ between the two operands, kept only when swapping. */
        big_register_t flip = select & (lanes_a[lane] ^ lanes_b[lane]);
        lanes_a[lane] = lanes_a[lane] ^ flip;
        lanes_b[lane] = lanes_b[lane] ^ flip;
    }
 }

 /**
  * out = a + b, added limb by limb through uint32xn_t vectors.  No carries
  * are propagated between limbs.
  *
  * @param out Destination.
  * @param a   First summand.
  * @param b   Second summand.
  */
 void
 p448_add (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) {
    uint32xn_t *sum = (uint32xn_t*)out;
    const uint32xn_t *lhs = (const uint32xn_t*)a;
    const uint32xn_t *rhs = (const uint32xn_t*)b;
    unsigned int v;

    for (v=0; v<sizeof(*out)/sizeof(uint32xn_t); v++) {
        sum[v] = lhs[v] + rhs[v];
    }
 }

 /**
  * out = a - b, subtracted limb by limb through uint32xn_t vectors.  No
  * borrows are propagated, so individual limbs may wrap around.
  *
  * @param out Destination.
  * @param a   Minuend.
  * @param b   Subtrahend.
  */
 void
 p448_sub (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
 ) {
    uint32xn_t *diff = (uint32xn_t*)out;
    const uint32xn_t *lhs = (const uint32xn_t*)a;
    const uint32xn_t *rhs = (const uint32xn_t*)b;
    unsigned int v;

    for (v=0; v<sizeof(*out)/sizeof(uint32xn_t); v++) {
        diff[v] = lhs[v] - rhs[v];
    }
 }

 /**
  * out = -a, negated limb by limb through uint32xn_t vectors.  Limbs wrap
  * around as unsigned values.
  *
  * @param out Destination.
  * @param a   Element to negate.
  */
 void
 p448_neg (
    p448_t *out,
    const p448_t *a
 ) {
    uint32xn_t *dst = (uint32xn_t*)out;
    const uint32xn_t *src = (const uint32xn_t*)a;
    unsigned int v;

    for (v=0; v<sizeof(*out)/sizeof(uint32xn_t); v++) {
        dst[v] = -src[v];
    }
 }

 /**
  * Replace a by its negation when doNegate is set, leave it untouched
  * otherwise.  The negated value (p448_neg followed by p448_bias with 2) is
  * always computed; a register-wide mask then picks which one is stored.
  *
  * @param a        Element to negate conditionally, in place.
  * @param doNegate Selector, expanded with br_set_to_mask.
  */
 void
 p448_cond_neg(
    p448_t *a,
    mask_t doNegate
 ) {
    struct p448_t flipped;
    big_register_t *keep = (big_register_t *)a;
    big_register_t *alt = (big_register_t*)&flipped;
    big_register_t select = br_set_to_mask(doNegate);
    unsigned int lane;

    p448_neg(&flipped, a);
    p448_bias(&flipped, 2);

    for (lane=0; lane<sizeof(*a)/sizeof(*keep); lane++) {
        /* Bitwise select: take alt where select is set, keep elsewhere. */
        keep[lane] = keep[lane] ^ ((keep[lane] ^ alt[lane]) & select);
    }
 }

 /**
  * Add the word x to the lowest limb of a.  No carry is propagated.
  *
  * @param a Element to modify in place.
  * @param x Word to add.
  */
 void
 p448_addw (
    p448_t *a,
    uint32_t x
 ) {
    uint32_t *low = &a->limb[0];
    *low = *low + x;
 }
             
 /**
  * Subtract the word x from the lowest limb of a.  No borrow is propagated,
  * so the limb may wrap around.
  *
  * @param a Element to modify in place.
  * @param x Word to subtract.
  */
 void
 p448_subw (
    p448_t *a,
    uint32_t x
 ) {
    uint32_t *low = &a->limb[0];
    *low = *low - x;
 }

 /**
  * Copy the field element a into out (plain struct assignment).
  *
  * @param out Destination.
  * @param a   Source.
  */
 void
 p448_copy (
    p448_t *out,
    const p448_t *a
 ) {
   *out = *a;
 }

 /**
  * Add amt times a fixed limb pattern to a: every limb receives
  * amt * (2^28 - 1), except the limb at array index 1, which receives amt
  * less than that.
  *
  * NOTE(review): with the LIMBPERM layout array index 1 is digit 8, so the
  * pattern looks like the field prime 2^448 - 2^224 - 1 and the call leaves
  * the value unchanged modulo it -- confirm.
  *
  * @param a   Element to modify in place.
  * @param amt Multiple of the pattern to add.
  */
 void
 p448_bias (
    p448_t *a,
    int amt
 ) {
    uint32_t full = ((1ull<<28)-1)*amt;
    uint32_t reduced = full-amt;
    uint32x4_t first = {full,reduced,full,full};
    uint32x4_t rest = {full,full,full,full};
    uint32x4_t *quad = (uint32x4_t*) a;
    int q;

    /* Only the first group of four limbs contains the special limb. */
    quad[0] += first;
    for (q=1; q<4; q++) {
        quad[q] += rest;
    }
 }

 /**
  * Propagate carries so that every limb is brought back to roughly 28 bits.
  *
  * The element is viewed as eight 2-lane vectors aa[0..7]; lane 0 of aa[k] is
  * limb[2k] and lane 1 is limb[2k+1].
  *
  * @param a Element to reduce in place.
  */
 void
 p448_weak_reduce (
    p448_t *a
 ) {

    /*
     * vmask keeps the low 28 bits of each lane, vm2 selects lane 1 only, and
     * tmp holds the carries out of the last vector, taken before the loop
     * overwrites it.
     */
    uint32x2_t *aa = (uint32x2_t*) a, vmask = {(1ull<<28)-1, (1ull<<28)-1}, vm2 = {0,-1},
       tmp = vshr_n_u32(aa[7],28);
       
    int i;
    /*
     * Walk from the top down so that aa[i-1] is still unreduced when it is
     * read: each vector becomes its own low 28 bits plus the previous
     * vector shifted right by 28 (vsra = shift right and accumulate).
     */
    for (i=7; i>=1; i--) {
        aa[i] = vsra_n_u32(aa[i] & vmask, aa[i-1], 28);
    }
    /*
     * Fold the top carries into aa[0]: lane 0 gets tmp's lane 1, lane 1 gets
     * tmp's lane 0 plus tmp's lane 1 (vrev64 swaps the two lanes).
     */
    aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp&vm2);
 }

 /**
  * y = x squared n times in a row, i.e. x^(2^n).
  *
  * Squarings are done in pairs through a scratch element so that the last
  * one always lands in y; an odd n is handled by one initial squaring
  * straight into y.
  *
  * @param y Destination; must not alias x (declared __restrict__).
  * @param x Element to square.
  * @param n Number of squarings; asserted to be positive.
  */
 void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) {
    p448_t scratch;
    assert(n>0);

    if ((n&1) != 0) {
        /* Odd count: one squaring puts the running value in y. */
        p448_sqr(y,x);
        n -= 1;
    } else {
        /* Even count: start with a pair, ending in y. */
        p448_sqr(&scratch,x);
        p448_sqr(y,&scratch);
        n -= 2;
    }

    while (n != 0) {
        p448_sqr(&scratch,y);
        p448_sqr(y,&scratch);
        n -= 2;
    }
 }

 mask_t
 p448_eq (
    const struct p448_t *a,
    const struct p448_t *b
 ) {
    struct p448_t ra, rb;
    p448_copy(&ra, a);
    p448_copy(&rb, b);
    p448_weak_reduce(&ra);
    p448_weak_reduce(&rb);
    p448_sub(&ra, &ra, &rb);
    p448_bias(&ra, 2);
    return p448_is_zero(&ra);
 }

 /**
  * a = b AND mask, applied to each limb in turn.
  *
  * @param a    Destination.
  * @param b    Source.
  * @param mask Word ANDed into every limb of b.
  */
 void
 p448_mask (
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) {
    unsigned int limb_idx = 0;
    while (limb_idx < sizeof(*a)/sizeof(a->limb[0])) {
        a->limb[limb_idx] = b->limb[limb_idx] & mask;
        limb_idx++;
    }
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif

 #endif /* __P448_H__ */
--- a/src/crandom.c
+++ b/src/crandom.c
@@ -466,7 +466,7 @@ crandom_generate(
        unsigned long long copy = (length > state->fill) ? state->fill : length;
        state->fill -= copy;
        memcpy(output, state->buffer + state->fill, copy);
        memset(state->buffer + state->fill, 0, copy);
        really_memset(state->buffer + state->fill, 0, copy);
        output += copy; length -= copy;
    }

@@ -484,5 +484,5 @@ crandom_destroy(
         */
    }

    memset(state, 0, sizeof(*state));
    really_memset(state, 0, sizeof(*state));
 }
--- a/src/goldilocks.c
+++ b/src/goldilocks.c
@@ -340,7 +340,7 @@ goldilocks_sign (
    word_t skw[GOLDI_FIELD_WORDS];
    mask_t succ = barrett_deserialize(skw,privkey->opaque,&curve_prime_order);
    if (!succ) {
        memset(skw,0,sizeof(skw));
        really_memset(skw,0,sizeof(skw));
        return GOLDI_ECORRUPT;
    }
        
@@ -389,9 +389,9 @@ goldilocks_sign (
        
    memcpy(signature_out, signature_tmp, GOLDI_FIELD_BYTES);
    barrett_serialize(signature_out+GOLDI_FIELD_BYTES, tk, GOLDI_FIELD_BYTES);
    memset((unsigned char *)tk,0,sizeof(tk));
    memset((unsigned char *)skw,0,sizeof(skw));
    memset((unsigned char *)challenge,0,sizeof(challenge));
    really_memset((unsigned char *)tk,0,sizeof(tk));
    really_memset((unsigned char *)skw,0,sizeof(skw));
    really_memset((unsigned char *)challenge,0,sizeof(challenge));
    
    /* response = 2(nonce_secret - sk*challenge)
     * Nonce = 8[nonce_secret]*G
@@ -494,7 +494,7 @@ goldilocks_destroy_precomputed_public_key (
 ) {
    if (!precom) return;
    destroy_fixed_base(&precom->table);
    memset(&precom->pub.opaque, 0, sizeof(precom->pub));
    really_memset(&precom->pub.opaque, 0, sizeof(precom->pub));
    free(precom);
 }

--- a/src/include/word.h
+++ b/src/include/word.h
@@ -146,11 +146,17 @@ typedef word_t vecmask_t __attribute__((vector_size(32)));
    }
 #endif

 #if __AVX2__ || __SSE2__
 #if __AVX2__
 static __inline__ big_register_t
 br_is_zero(big_register_t x) {
    return (big_register_t)(x == br_set_to_mask(0));
 }
 #elif __SSE2__
 static __inline__ big_register_t
 br_is_zero(big_register_t x) {
    return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128());
    //return (big_register_t)(x == br_set_to_mask(0));
 }
 #elif __ARM_NEON__
 static __inline__ big_register_t
 br_is_zero(big_register_t x) {
@@ -179,7 +185,25 @@ static inline uint64_t
 letoh64 (uint64_t x) { return x; }
 #endif


 /**
  * Really call memset, in a way that prevents the compiler from optimizing it out.
  *
  * Two implementations are provided.  memset_s is used only when the library
  * offers Annex K (__STDC_LIB_EXT1__) AND the translation unit asked for it by
  * defining __STDC_WANT_LIB_EXT1__ as 1 before including <string.h>; without
  * that request memset_s is not declared.  Otherwise the bytes are written
  * one at a time through a volatile pointer, which the compiler may not
  * elide.
  *
  * @param p The object to zeroize.
  * @param c The char to set it to (probably zero).
  * @param s The size of the object.
  */
 #if defined(__STDC_LIB_EXT1__) && defined(__STDC_WANT_LIB_EXT1__) && (__STDC_WANT_LIB_EXT1__ == 1)
 /* Not normally reached, because we're -std=c99. */
 static __inline__ void __attribute__((always_inline,unused))
 really_memset(void *p, char c, size_t s) {
    /* smax == n, so the only runtime-constraint failures are p == NULL or an
     * oversized s; the return value carries nothing callers could act on. */
    (void)memset_s(p,s,c,s);
 }
 #else
 static __inline__ void __attribute__((always_inline,unused))
 really_memset(void *p, char c, size_t s) {
    volatile char *pv = (volatile char *)p;
    size_t i;
    for (i=0; i<s; i++) pv[i] = c;
 }
 #endif

 /**
 * Allocate memory which is sufficiently aligned to be used for the
--- a/src/magic.c
+++ b/src/magic.c
@@ -27,11 +27,17 @@ const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = {
 };

 const struct affine_t goldilocks_base_point = {
 #ifdef USE_NEON_PERM
    {{ 0xaed939f,0xc59d070,0xf0de840,0x5f065c3, 0xf4ba0c7,0xdf73324,0xc170033,0x3a6a26a,
       0x4c63d96,0x4609845,0xf3932d9,0x1b4faff, 0x6147eaa,0xa2692ff,0x9cecfa9,0x297ea0e
    }},
 #else
    {{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7),
       U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa),
       U58LE(0x5f065c3c59d070), U58LE(0x3a6a26adf73324),
       U58LE(0x1b4faff4609845), U58LE(0x297ea0ea2692ff)
    }},
 #endif
    {{ 19 }}
 };

@@ -50,6 +56,12 @@ const struct barrett_prime_t curve_prime_order = {

 const struct field_t
 sqrt_d_minus_1 = {{
 #ifdef USE_NEON_PERM
    0x6749f46,0x24d9770,0xd2e2183,0xa49f7b4,
    0xb4f0179,0x8c5f656,0x888db42,0xdcac462,
    0xbdeea38,0x748734a,0x5a189aa,0x49443b8,
    0x6f14c06,0x0b25b7a,0x51e65ca,0x12fec0c
 #else
    U58LE(0xd2e21836749f46),
    U58LE(0x888db42b4f0179),
    U58LE(0x5a189aabdeea38),
@@ -58,4 +70,5 @@ sqrt_d_minus_1 = {{
    U58LE(0xdcac4628c5f656),
    U58LE(0x49443b8748734a),
    U58LE(0x12fec0c0b25b7a)
 #endif
 }};
--- a/src/scalarmul.c
+++ b/src/scalarmul.c
@@ -63,7 +63,15 @@ cond_negate_tw_pniels (
    cond_negate_tw_niels(&n->n, doNegate);
 }

 static __inline__ void
 #if (defined(__GNUC__) && !defined(__clang__) && defined(__x86_64__) && !defined(__AVX2__))
  /* This works around an apparent compiler bug in GCC, thanks Samuel Neves */
  static void __attribute__((optimize("O1")))
  #ifdef __OPTIMIZE_SIZE__
    #warning "There's a bug in here somewhere with GCC -Os on non-AVX2 platforms"
  #endif
 #else
  static __inline__ void
 #endif
 constant_time_lookup_tw_pniels (
    struct tw_pniels_t *out,
    const struct tw_pniels_t *in,
@@ -76,7 +84,7 @@ constant_time_lookup_tw_pniels (
    int j;
    unsigned int k;
    
    memset(out, 0, sizeof(*out));
    really_memset(out, 0, sizeof(*out));
    for (j=0; j<nin; j++, big_i-=big_one) {
        big_register_t mask = br_is_zero(big_i);
        for (k=0; k<sizeof(*out)/sizeof(*o); k++) {
@@ -98,7 +106,7 @@ constant_time_lookup_tw_niels (
    int j;
    unsigned int k;
    
    memset(out, 0, sizeof(*out));
    really_memset(out, 0, sizeof(*out));
    for (j=0; j<nin; j++, big_i-=big_one) {
        big_register_t mask = br_is_zero(big_i);
        for (k=0; k<sizeof(*out)/sizeof(*o); k++) {
@@ -449,7 +457,7 @@ precompute_fixed_base (
  struct tw_niels_t *prealloc
 ) {
    if (s < 1 || t < 1 || n < 1 || n*t*s < SCALAR_BITS) {
        memset(out, 0, sizeof(*out));
        really_memset(out, 0, sizeof(*out));
        return 0;
    }
    
@@ -478,8 +486,8 @@ precompute_fixed_base (
        free(doubles);
        free(zs);
        free(zis);
        memset(out, 0, sizeof(*out));
        memset(table, 0, sizeof(*table) * (n<<(t-1)));
        really_memset(out, 0, sizeof(*out));
        really_memset(table, 0, sizeof(*table) * (n<<(t-1)));
        if (!prealloc) free(table);
        return 0;
    }
@@ -593,9 +601,9 @@ precompute_fixed_base (
    free(zis);

    if (unlikely(!ret)) {
        memset(table, 0, sizeof(*table) * (n<<(t-1)));
        really_memset(table, 0, sizeof(*table) * (n<<(t-1)));
        if (!prealloc) free(table);
        memset(out, 0, sizeof(*out));
        really_memset(out, 0, sizeof(*out));
        return 0;
    }

@@ -607,12 +615,12 @@ destroy_fixed_base (
    struct fixed_base_table_t *table
 ) {
    if (table->table) {
        memset(table->table,0,sizeof(*table->table)*(table->n<<(table->t-1)));
        really_memset(table->table,0,sizeof(*table->table)*(table->n<<(table->t-1)));
    }
    if (table->own_table) {
        free(table->table);
    }
    memset(table,0,sizeof(*table));
    really_memset(table,0,sizeof(*table));
 }

 mask_t
--- a/test/bench.c
+++ b/test/bench.c
@@ -108,33 +108,33 @@ int main(int argc, char **argv) {
    q448_randomize(&crand, sk);
    
    when = now();
    for (i=0; i<nbase*1000; i++) {
    for (i=0; i<nbase*5000; i++) {
        p448_mul(&c, &b, &a);
    }
    when = now() - when;
    printf("mul:         %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<nbase*1000; i++) {
    for (i=0; i<nbase*5000; i++) {
        p448_sqr(&c, &a);
    }
    when = now() - when;
    printf("sqr:         %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<nbase*500; i++) {
        p448_mul(&c, &b, &a);
        p448_mul(&a, &b, &c);
    for (i=0; i<nbase*5000; i++) {
        p448_mulw(&c, &b, 1234562);
    }
    when = now() - when;
    printf("mul dep:     %5.1fns\n", when * 1e9 / i / 2);
    printf("mulw:        %5.1fns\n", when * 1e9 / i);
    
    when = now();
    for (i=0; i<nbase*1000; i++) {
        p448_mulw(&c, &b, 1234562);
    for (i=0; i<nbase*500; i++) {
        p448_mul(&c, &b, &a);
        p448_mul(&a, &b, &c);
    }
    when = now() - when;
    printf("mulw:        %5.1fns\n", when * 1e9 / i);
    printf("mul dep:     %5.1fns\n", when * 1e9 / i / 2);
    
    when = now();
    for (i=0; i<nbase*10; i++) {
--- a/test/test.c
+++ b/test/test.c
@@ -3,6 +3,9 @@
 #include <stdio.h>
 #include <string.h>

 #ifndef LIMBPERM
 #define LIMBPERM(x) (x)
 #endif

 int failed_tests, n_tests, failed_this_test, running_a_test;

@@ -87,7 +90,7 @@ void p448_print (
    int j;
    printf("%s = 0x", descr);
    for (j=sizeof(*a)/sizeof(word_t)-1; j>=0; j--) {
        printf(PRIxWORD58, b.limb[j]);
        printf(PRIxWORD58, b.limb[LIMBPERM(j)]);
    }
    printf("\n");
 }
--- a/test/test_arithmetic.c
+++ b/test/test_arithmetic.c
@@ -170,7 +170,12 @@ int test_arithmetic () {
    int bits = sizeof(word_t) * 448 / sizeof(p448_t);
    
    for (j=0; j<ntests; j++) {
        if (j&1) {
        if (j<256) {
            mpz_set_ui(x,0);
            mpz_set_ui(y,0);
            mpz_setbit(x,(j%16)*28);
            mpz_setbit(y,(j/16)*28);
        } else if (j&1) {
            mpz_rrandomb(x, state, 448);
            mpz_rrandomb(y, state, 448);
        } else {