WARNING: This commit is largely untested.

Continuing demagication and factoring of field code. Removing high-level ops from p448.h and putting them in field.h. That way they won't need rewriting for new fields and architectures. Create constant_time.h which contains constant-time lookups, condswaps, etc. That way the code is the same on all architectures, instead of varying depending on whether the field size is a multiple of the vector register size. I should still add a constant_time_select to factor out field_cond_negate. TODO: I need to test this for correctness and performance on various platforms. It works on my Mac, but since Yosemite the timing is totally unpredictable (background tasks? variable boost?).
10 years ago · c6d69dec2e
--- a/src/arch_32/p448.h
+++ b/src/arch_32/p448.h
@@ -22,13 +22,6 @@ p448_set_ui (
    p448_t *out,
    uint64_t x
 ) __attribute__((unused,always_inline));
           
 static __inline__ void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t do_swap
 ) __attribute__((unused,always_inline));

 static __inline__ void
 p448_add (
@@ -114,13 +107,6 @@ p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
 );
         
 static __inline__ void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) __attribute__((unused,always_inline));

 void
 p448_serialize (
@@ -133,24 +119,6 @@ p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 );
    
 static __inline__ void
 p448_mask(
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) __attribute__((unused,always_inline));

 /**
 * Returns 1/x.
 * 
 * If x=0, returns 0.
 */
 void
 p448_inverse (
   struct p448_t*       a,
   const struct p448_t* x
 );

 static inline mask_t
 p448_eq (
@@ -172,24 +140,6 @@ p448_set_ui (
      out->limb[i] = 0;
    }
 }
            
 /**
 * Conditionally exchange two field elements without branching on the
 * condition.
 *
 * Both elements are walked as arrays of big_register_t.  Wherever a bit
 * of the widened mask is set, the corresponding bits of *a and *b trade
 * places; where it is clear they are left alone.  Callers presumably pass
 * 0 or all-ones in doswap (TODO confirm against br_set_to_mask).
 */
 void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t doswap
 ) {
    const big_register_t select = br_set_to_mask(doswap);
    big_register_t *lhs = (big_register_t*)a;
    big_register_t *rhs = (big_register_t*)b;
    unsigned int lane = 0;

    /* Masked XOR swap, one wide register at a time. */
    while (lane < sizeof(*a)/sizeof(*lhs)) {
        big_register_t delta = (lhs[lane] ^ rhs[lane]) & select;
        lhs[lane] ^= delta;
        rhs[lane] ^= delta;
        lane++;
    }
 }

 void
 p448_add (
@@ -315,28 +265,6 @@ p448_weak_reduce (
    a->limb[0] = (a->limb[0] & mask) + tmp;
 }

 /**
 * Square x repeatedly: y = x^(2^n).
 *
 * n must be positive (checked with assert).  Squarings are issued in
 * pairs through a scratch element, so that neither the input nor the
 * output of a single p448_sqr call is the same object; an odd n is
 * handled by one leading squaring straight from x into y.
 */
 void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) {
    p448_t scratch;
    assert(n>0);

    if ((n & 1) == 0) {
        /* Even count: first pair goes x -> scratch -> y. */
        p448_sqr(&scratch, x);
        p448_sqr(y, &scratch);
        n -= 2;
    } else {
        /* Odd count: peel off a single squaring. */
        p448_sqr(y, x);
        n -= 1;
    }

    /* n is now even; finish in pairs y -> scratch -> y. */
    while (n != 0) {
        p448_sqr(&scratch, y);
        p448_sqr(y, &scratch);
        n -= 2;
    }
 }

 mask_t
 p448_eq (
    const struct p448_t *a,
@@ -352,18 +280,6 @@ p448_eq (
    return p448_is_zero(&ra);
 }

 /**
 * Limb-wise a = b & mask.
 *
 * Every limb of b is ANDed with the same mask word and stored into the
 * matching limb of a.
 */
 void
 p448_mask (
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) {
    unsigned int idx = 0;
    while (idx < sizeof(*a)/sizeof(a->limb[0])) {
        a->limb[idx] = mask & b->limb[idx];
        idx++;
    }
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif
--- a/src/arch_arm_32/p448.h
+++ b/src/arch_arm_32/p448.h
@@ -22,13 +22,6 @@ p448_set_ui (
    p448_t *out,
    uint64_t x
 ) __attribute__((unused,always_inline));
           
 static __inline__ void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t do_swap
 ) __attribute__((unused,always_inline));

 static __inline__ void
 p448_add (
@@ -114,13 +107,6 @@ p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
 );
         
 static __inline__ void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) __attribute__((unused,always_inline));

 void
 p448_serialize (
@@ -133,24 +119,6 @@ p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 );
    
 static __inline__ void
 p448_mask(
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) __attribute__((unused,always_inline));

 /**
 * Returns 1/x.
 * 
 * If x=0, returns 0.
 */
 void
 p448_inverse (
   struct p448_t*       a,
   const struct p448_t* x
 );
   
 static inline mask_t
 p448_eq (
@@ -172,28 +140,6 @@ p448_set_ui (
      out->limb[i] = 0;
    }
 }
            
 /**
 * Conditionally exchange two field elements without branching on the
 * condition.
 *
 * The mask is widened to a big_register_t (via vdupq_n_u32 when
 * __ARM_NEON__ is set, otherwise used directly), and the elements are
 * processed one big_register_t at a time with a masked XOR swap.
 * Callers presumably pass 0 or all-ones in doswap (TODO confirm).
 */
 void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t doswap
 ) {
    big_register_t *lhs = (big_register_t*)a;
    big_register_t *rhs = (big_register_t*)b;
 #if __ARM_NEON__
    big_register_t select = vdupq_n_u32(doswap);
 #else
    big_register_t select = doswap;
 #endif
    unsigned int lane = 0;

    /* Bits are exchanged only where the mask is set. */
    while (lane < sizeof(*a)/sizeof(*lhs)) {
        big_register_t delta = (lhs[lane] ^ rhs[lane]) & select;
        lhs[lane] ^= delta;
        rhs[lane] ^= delta;
        lane++;
    }
 }

 void
 p448_add (
@@ -323,28 +269,6 @@ p448_weak_reduce (
    a->limb[0] = (a->limb[0] & mask) + tmp;
 }

 /**
 * Square x repeatedly: y = x^(2^n).
 *
 * n must be positive (checked with assert).  Squarings are issued in
 * pairs through a scratch element; an odd n is handled by one leading
 * squaring straight from x into y.
 */
 void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) {
    p448_t scratch;
    assert(n>0);

    if ((n & 1) == 0) {
        /* Even count: first pair goes x -> scratch -> y. */
        p448_sqr(&scratch, x);
        p448_sqr(y, &scratch);
        n -= 2;
    } else {
        /* Odd count: peel off a single squaring. */
        p448_sqr(y, x);
        n -= 1;
    }

    /* n is now even; finish in pairs y -> scratch -> y. */
    while (n != 0) {
        p448_sqr(&scratch, y);
        p448_sqr(y, &scratch);
        n -= 2;
    }
 }

 mask_t
 p448_eq (
    const struct p448_t *a,
@@ -360,18 +284,6 @@ p448_eq (
    return p448_is_zero(&ra);
 }

 /**
 * Limb-wise a = b & mask.
 *
 * Every limb of b is ANDed with the same mask word and stored into the
 * matching limb of a.
 */
 void
 p448_mask (
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) {
    unsigned int idx = 0;
    while (idx < sizeof(*a)/sizeof(a->limb[0])) {
        a->limb[idx] = mask & b->limb[idx];
        idx++;
    }
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif
--- a/src/arch_neon/p448.h
+++ b/src/arch_neon/p448.h
@@ -22,13 +22,6 @@ p448_set_ui (
    p448_t *out,
    uint64_t x
 ) __attribute__((unused,always_inline));
           
 static __inline__ void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t do_swap
 ) __attribute__((unused,always_inline));

 static __inline__ void
 p448_add (
@@ -114,13 +107,6 @@ p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
 );
         
 static __inline__ void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) __attribute__((unused,always_inline));

 void
 p448_serialize (
@@ -133,24 +119,6 @@ p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 );
    
 static __inline__ void
 p448_mask(
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) __attribute__((unused,always_inline));

 /**
 * Returns 1/x.
 * 
 * If x=0, returns 0.
 */
 void
 p448_inverse (
   struct p448_t*       a,
   const struct p448_t* x
 );

 static inline mask_t
 p448_eq (
@@ -172,24 +140,6 @@ p448_set_ui (
      out->limb[i] = 0;
    }
 }
            
 /**
 * Conditionally exchange two field elements without branching on the
 * condition.
 *
 * Both elements are walked as arrays of big_register_t.  Wherever a bit
 * of the widened mask is set, the corresponding bits of *a and *b trade
 * places.  Callers presumably pass 0 or all-ones in doswap (TODO confirm
 * against br_set_to_mask).
 */
 void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t doswap
 ) {
    const big_register_t select = br_set_to_mask(doswap);
    big_register_t *lhs = (big_register_t*)a;
    big_register_t *rhs = (big_register_t*)b;
    unsigned int lane = 0;

    /* Masked XOR swap, one wide register at a time. */
    while (lane < sizeof(*a)/sizeof(*lhs)) {
        big_register_t delta = (lhs[lane] ^ rhs[lane]) & select;
        lhs[lane] ^= delta;
        rhs[lane] ^= delta;
        lane++;
    }
 }

 void
 p448_add (
@@ -315,28 +265,6 @@ p448_weak_reduce (
    a->limb[0] = (a->limb[0] & mask) + tmp;
 }

 /**
 * Square x repeatedly: y = x^(2^n).
 *
 * n must be positive (checked with assert).  Squarings are issued in
 * pairs through a scratch element; an odd n is handled by one leading
 * squaring straight from x into y.
 */
 void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) {
    p448_t scratch;
    assert(n>0);

    if ((n & 1) == 0) {
        /* Even count: first pair goes x -> scratch -> y. */
        p448_sqr(&scratch, x);
        p448_sqr(y, &scratch);
        n -= 2;
    } else {
        /* Odd count: peel off a single squaring. */
        p448_sqr(y, x);
        n -= 1;
    }

    /* n is now even; finish in pairs y -> scratch -> y. */
    while (n != 0) {
        p448_sqr(&scratch, y);
        p448_sqr(y, &scratch);
        n -= 2;
    }
 }

 mask_t
 p448_eq (
    const struct p448_t *a,
@@ -352,18 +280,6 @@ p448_eq (
    return p448_is_zero(&ra);
 }

 /**
 * Limb-wise a = b & mask.
 *
 * Every limb of b is ANDed with the same mask word and stored into the
 * matching limb of a.
 */
 void
 p448_mask (
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) {
    unsigned int idx = 0;
    while (idx < sizeof(*a)/sizeof(a->limb[0])) {
        a->limb[idx] = mask & b->limb[idx];
        idx++;
    }
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif
--- a/src/arch_neon_experimental/p448.h
+++ b/src/arch_neon_experimental/p448.h
@@ -25,13 +25,6 @@ p448_set_ui (
    p448_t *out,
    uint64_t x
 ) __attribute__((unused,always_inline));
           
 static __inline__ void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t do_swap
 ) __attribute__((unused,always_inline));

 static __inline__ void
 p448_add (
@@ -117,13 +110,6 @@ p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
 );
         
 static __inline__ void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) __attribute__((unused,always_inline));

 void
 p448_serialize (
@@ -136,24 +122,6 @@ p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 );
    
 static __inline__ void
 p448_mask(
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) __attribute__((unused,always_inline));

 /**
 * Returns 1/x.
 * 
 * If x=0, returns 0.
 */
 void
 p448_inverse (
   struct p448_t*       a,
   const struct p448_t* x
 );

 static inline mask_t
 p448_eq (
@@ -175,24 +143,6 @@ p448_set_ui (
    out->limb[0] = x & ((1<<28)-1);
    out->limb[2] = x>>28;
 }
            
 /**
 * Conditionally exchange two field elements without branching on the
 * condition.
 *
 * Both elements are walked as arrays of big_register_t.  Wherever a bit
 * of the widened mask is set, the corresponding bits of *a and *b trade
 * places.  Callers presumably pass 0 or all-ones in doswap (TODO confirm
 * against br_set_to_mask).
 */
 void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t doswap
 ) {
    const big_register_t select = br_set_to_mask(doswap);
    big_register_t *lhs = (big_register_t*)a;
    big_register_t *rhs = (big_register_t*)b;
    unsigned int lane = 0;

    /* Masked XOR swap, one wide register at a time. */
    while (lane < sizeof(*a)/sizeof(*lhs)) {
        big_register_t delta = (lhs[lane] ^ rhs[lane]) & select;
        lhs[lane] ^= delta;
        rhs[lane] ^= delta;
        lane++;
    }
 }

 void
 p448_add (
@@ -313,28 +263,6 @@ p448_weak_reduce (
    aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp&vm2);
 }

 /**
 * Square x repeatedly: y = x^(2^n).
 *
 * n must be positive (checked with assert).  Squarings are issued in
 * pairs through a scratch element; an odd n is handled by one leading
 * squaring straight from x into y.
 */
 void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) {
    p448_t scratch;
    assert(n>0);

    if ((n & 1) == 0) {
        /* Even count: first pair goes x -> scratch -> y. */
        p448_sqr(&scratch, x);
        p448_sqr(y, &scratch);
        n -= 2;
    } else {
        /* Odd count: peel off a single squaring. */
        p448_sqr(y, x);
        n -= 1;
    }

    /* n is now even; finish in pairs y -> scratch -> y. */
    while (n != 0) {
        p448_sqr(&scratch, y);
        p448_sqr(y, &scratch);
        n -= 2;
    }
 }

 mask_t
 p448_eq (
    const struct p448_t *a,
@@ -350,18 +278,6 @@ p448_eq (
    return p448_is_zero(&ra);
 }

 /**
 * Limb-wise a = b & mask.
 *
 * Every limb of b is ANDed with the same mask word and stored into the
 * matching limb of a.
 */
 void
 p448_mask (
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) {
    unsigned int idx = 0;
    while (idx < sizeof(*a)/sizeof(a->limb[0])) {
        a->limb[idx] = mask & b->limb[idx];
        idx++;
    }
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif
--- a/src/arch_ref64/p448.h
+++ b/src/arch_ref64/p448.h
@@ -23,13 +23,6 @@ p448_set_ui (
    p448_t *out,
    uint64_t x
 ) __attribute__((unused));
           
 static __inline__ void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t do_swap
 ) __attribute__((unused));

 static __inline__ void
 p448_add (
@@ -121,13 +114,6 @@ p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
 );
         
 static __inline__ void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) __attribute__((unused));

 void
 p448_serialize (
@@ -140,24 +126,6 @@ p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 );
    
 static __inline__ void
 p448_mask(
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) __attribute__((unused));

 /**
 * Returns 1/x.
 * 
 * If x=0, returns 0.
 */
 void
 p448_inverse (
   struct p448_t*       a,
   const struct p448_t* x
 );

 static inline mask_t
 p448_eq (
@@ -178,20 +146,6 @@ p448_set_ui (
      out->limb[i] = 0;
    }
 }
            
 /**
 * Conditionally exchange two field elements without branching on the
 * condition (portable 64-bit reference version).
 *
 * Operates on the first 8 limbs of each element.  Bits are exchanged
 * wherever the corresponding bit of doswap is set; callers presumably
 * pass 0 or all-ones (TODO confirm).
 */
 void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t doswap
 ) {
    unsigned int limb = 0;
    while (limb < 8) {
        /* Masked XOR swap of one limb. */
        uint64_t delta = (a->limb[limb] ^ b->limb[limb]) & doswap;
        a->limb[limb] ^= delta;
        b->limb[limb] ^= delta;
        limb++;
    }
 }

 void
 p448_add (
@@ -313,28 +267,6 @@ p448_weak_reduce (
    a->limb[0] = (a->limb[0] & mask) + tmp;
 }

 /**
 * Square x repeatedly: y = x^(2^n).
 *
 * n must be positive (checked with assert).  Squarings are issued in
 * pairs through a scratch element; an odd n is handled by one leading
 * squaring straight from x into y.
 */
 void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) {
    p448_t scratch;
    assert(n>0);

    if ((n & 1) == 0) {
        /* Even count: first pair goes x -> scratch -> y. */
        p448_sqr(&scratch, x);
        p448_sqr(y, &scratch);
        n -= 2;
    } else {
        /* Odd count: peel off a single squaring. */
        p448_sqr(y, x);
        n -= 1;
    }

    /* n is now even; finish in pairs y -> scratch -> y. */
    while (n != 0) {
        p448_sqr(&scratch, y);
        p448_sqr(y, &scratch);
        n -= 2;
    }
 }

 mask_t
 p448_eq (
    const struct p448_t *a,
@@ -347,18 +279,6 @@ p448_eq (
    return p448_is_zero(&ra);
 }

 /**
 * Limb-wise a = b & mask over the first 8 limbs (portable 64-bit
 * reference version).
 */
 void
 p448_mask (
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) {
    unsigned int idx = 0;
    while (idx < 8) {
        a->limb[idx] = mask & b->limb[idx];
        idx++;
    }
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif
--- a/src/arch_x86_64/p448.h
+++ b/src/arch_x86_64/p448.h
@@ -22,13 +22,6 @@ p448_set_ui (
    p448_t *out,
    uint64_t x
 ) __attribute__((unused,always_inline));
           
 static __inline__ void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t do_swap
 ) __attribute__((unused,always_inline));

 static __inline__ void
 p448_add (
@@ -114,13 +107,6 @@ p448_sqr (
    p448_t *__restrict__ out,
    const p448_t *a
 );
         
 static __inline__ void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) __attribute__((unused,always_inline));

 void
 p448_serialize (
@@ -133,24 +119,6 @@ p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
 );
    
 static __inline__ void
 p448_mask(
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) __attribute__((unused,always_inline));

 /**
 * Returns 1/x.
 * 
 * If x=0, returns 0.
 */
 void
 p448_inverse (
   struct p448_t*       a,
   const struct p448_t* x
 );

 static inline mask_t
 p448_eq (
@@ -171,24 +139,6 @@ p448_set_ui (
      out->limb[i] = 0;
    }
 }
            
 /**
 * Conditionally exchange two field elements without branching on the
 * condition.
 *
 * Both elements are walked as arrays of big_register_t.  Wherever a bit
 * of the widened mask is set, the corresponding bits of *a and *b trade
 * places.  Callers presumably pass 0 or all-ones in doswap (TODO confirm
 * against br_set_to_mask).
 */
 void
 p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t doswap
 ) {
    const big_register_t select = br_set_to_mask(doswap);
    big_register_t *lhs = (big_register_t*)a;
    big_register_t *rhs = (big_register_t*)b;
    unsigned int lane = 0;

    /* Masked XOR swap, one wide register at a time. */
    while (lane < sizeof(*a)/sizeof(*lhs)) {
        big_register_t delta = (lhs[lane] ^ rhs[lane]) & select;
        lhs[lane] ^= delta;
        rhs[lane] ^= delta;
        lane++;
    }
 }

 void
 p448_add (
@@ -331,55 +281,6 @@ p448_weak_reduce (
    a->limb[0] = (a->limb[0] & mask) + tmp;
 }

 /**
 * Square x repeatedly: y = x^(2^n).
 *
 * n must be positive (checked with assert).  Squarings are issued in
 * pairs through a scratch element; an odd n is handled by one leading
 * squaring straight from x into y.
 */
 void
 p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
 ) {
    p448_t scratch;
    assert(n>0);

    if ((n & 1) == 0) {
        /* Even count: first pair goes x -> scratch -> y. */
        p448_sqr(&scratch, x);
        p448_sqr(y, &scratch);
        n -= 2;
    } else {
        /* Odd count: peel off a single squaring. */
        p448_sqr(y, x);
        n -= 1;
    }

    /* n is now even; finish in pairs y -> scratch -> y. */
    while (n != 0) {
        p448_sqr(&scratch, y);
        p448_sqr(y, &scratch);
        n -= 2;
    }
 }

 mask_t
 p448_eq (
    const struct p448_t *a,
    const struct p448_t *b
 ) {
    struct p448_t ra, rb;
    p448_copy(&ra, a);
    p448_copy(&rb, b);
    p448_weak_reduce(&ra);
    p448_weak_reduce(&rb);
    p448_sub(&ra, &ra, &rb);
    p448_bias(&ra, 2);
    return p448_is_zero(&ra);
 }

 /**
 * Limb-wise a = b & mask.
 *
 * Every limb of b is ANDed with the same mask word and stored into the
 * matching limb of a.
 */
 void
 p448_mask (
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
 ) {
    unsigned int idx = 0;
    while (idx < sizeof(*a)/sizeof(a->limb[0])) {
        a->limb[idx] = mask & b->limb[idx];
        idx++;
    }
 }

 #ifdef __cplusplus
 }; /* extern "C" */
 #endif
--- a/src/arithmetic.c
+++ b/src/arithmetic.c
@@ -11,6 +11,21 @@
 #include "field.h"
 #include "ec_point.h" // TODO

 mask_t
 field_eq (
    const struct field_t *a,
    const struct field_t *b
 ) {
    struct field_t ra, rb;
    field_copy(&ra, a);
    field_copy(&rb, b);
    field_weak_reduce(&ra);
    field_weak_reduce(&rb);
    field_sub(&ra, &ra, &rb);
    field_bias(&ra, 2);
    return field_is_zero(&ra);
 }

 void
 field_inverse (
    struct field_t*       a,
--- a/src/ec_point.c
+++ b/src/ec_point.c
@@ -52,8 +52,30 @@ field_mulw_scc_wr (
        field_weak_reduce(out);
 }

 void
 field_isr (
 /**
 * Square x repeatedly: y = x^(2^n).
 *
 * n must be positive (checked with assert).  Squarings are issued in
 * pairs through a scratch element; an odd n is handled by one leading
 * squaring straight from x into y.
 */
 static __inline__ void
 field_sqrn (
    field_t *__restrict__ y,
    const field_t *x,
    int n
 ) {
    field_t scratch;
    assert(n>0);

    if ((n & 1) == 0) {
        /* Even count: first pair goes x -> scratch -> y. */
        field_sqr(&scratch, x);
        field_sqr(y, &scratch);
        n -= 2;
    } else {
        /* Odd count: peel off a single squaring. */
        field_sqr(y, x);
        n -= 1;
    }

    /* n is now even; finish in pairs y -> scratch -> y. */
    while (n != 0) {
        field_sqr(&scratch, y);
        field_sqr(y, &scratch);
        n -= 2;
    }
 }

 void 
 field_isr ( /* TODO: MAGIC */
    struct field_t*       a,
    const struct field_t* x
 ) {
@@ -433,7 +455,7 @@ serialize_montgomery (
    field_mul  (   &L0, &a->xd,   &L2 );
       L5 = field_is_zero( &a->zd );
       L6 = -   L5;
    field_mask (   &L1,   &L0,    L5 );
    constant_time_mask (   &L1,   &L0, sizeof(L1), L5 );
    field_add  (   &L2,   &L1, &a->zd );
       L4 = ~   L5;
    field_mul  (   &L1,   sbz,   &L3 );
@@ -446,7 +468,7 @@ serialize_montgomery (
    field_mul  (   &L2,   &L1,   &L0 );
    field_sqr  (   &L1,   &L0 );
    field_mul  (   &L0,   &L3,   &L1 );
    field_mask (     b,   &L2,    L4 );
    constant_time_mask (     b,   &L2, sizeof(L1), L4 );
    field_subw (   &L0,     1 );
    field_bias (   &L0,     1 );
       L5 = field_is_zero(   &L0 );
--- a/src/include/constant_time.h
+++ b/src/include/constant_time.h
@@ -0,0 +1,230 @@
 /**
 * @file constant_time.h
 * @copyright
 *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 *   Released under the MIT License.  See LICENSE.txt for license information.
 * @author Mike Hamburg
 *
 * @brief Constant-time routines.
 */

 #ifndef __CONSTANT_TIME_H__
 #define __CONSTANT_TIME_H__ 1

 #include "word.h"

 /*
 * Constant-time operations on hopefully-compile-time-sized memory
 * regions.  Needed for flexibility / demagication: not all fields
 * have sizes which are multiples of the vector width, necessitating
 * a change from the Ed448 versions.
 *
 * These routines would be much simpler to define at the byte level,
 * but if not vectorized they would be a significant fraction of the
 * runtime.  Eg on NEON-less ARM, constant_time_lookup is like 15% of
 * signing time, vs 6% on Haswell with its fancy AVX2 vectors.
 *
 * If the compiler could do a good job of autovectorizing the code,
 * we could just leave it with the byte definition.  But that's unlikely
 * on most deployed compilers, especially if you consider that pcmpeq[size]
 * is much faster than moving a scalar to the vector unit (which is what
 * a naive autovectorizer will do with constant_time_lookup on Intel).
 *
 * Instead, we're putting our trust in the loop unroller and unswitcher.
 * 
 * TODO: verify correctness and performance on each platform, to make sure
 * that there are no regressions.
 */


 /**
 * Unaligned big (possibly vector) register.
 *
 * Wrapping big_register_t in a packed struct lets the routines below
 * access one through a pointer that is not aligned to the register
 * width: they cast a byte address to unaligned_br_t* and go through the
 * single member instead of dereferencing a big_register_t* directly.
 */
 typedef struct {
    big_register_t unaligned;
 } __attribute__((packed)) unaligned_br_t;

 /**
 * Unaligned word register, for architectures where that matters.
 *
 * Same idea as unaligned_br_t but for a single word_t: the packed
 * struct is what the routines below cast to when a word-sized access
 * may land on an address that is not word aligned.
 */
 typedef struct {
    word_t unaligned;
 } __attribute__((packed)) unaligned_word_t;

 /**
 * @brief Constant-time conditional swap.
 *
 * Exchanges elem_bytes bytes between *a_ and *b_ wherever the mask bit is
 * set, without branching on the mask.  Callers are expected to pass a
 * doswap of 0 (no swap) or all-ones (swap).
 *
 * The region is processed in three stages: big_register_t-sized chunks,
 * then word_t-sized chunks, then single bytes.  Which stages run, and
 * whether packed (unaligned) accesses are used, depends only on
 * elem_bytes, never on the data or the mask.
 *
 * *a_ and *b_ must not alias.  Also, they must be at least as aligned
 * as their sizes, if the CPU cares about that sort of thing.
 *
 * @param a_         First region.
 * @param b_         Second region.
 * @param elem_bytes Number of bytes to process; any value, including 0
 *                   and values smaller than sizeof(big_register_t).
 * @param doswap     Swap mask.
 */
 static __inline__ void
 __attribute__((unused,always_inline))
 constant_time_cond_swap (
    void *__restrict__ a_,
    void *__restrict__ b_,
    word_t elem_bytes,
    mask_t doswap
 ) {
    word_t k;
    unsigned char *a = (unsigned char *)a_;
    unsigned char *b = (unsigned char *)b_;

    big_register_t br_mask = br_set_to_mask(doswap);

    /*
     * Vector-sized chunks.  The bound is written as k+size <= elem_bytes
     * instead of k <= elem_bytes-size: the subtraction is unsigned and
     * wraps when elem_bytes < sizeof(big_register_t), which would send
     * the loop far past the end of both buffers.
     */
    for (k=0; k+sizeof(big_register_t)<=elem_bytes; k+=sizeof(big_register_t)) {
        if (elem_bytes % sizeof(big_register_t)) {
            /* unaligned */
            big_register_t delta =
                ((unaligned_br_t*)(&a[k]))->unaligned
              ^ ((unaligned_br_t*)(&b[k]))->unaligned;
            delta &= br_mask;
            ((unaligned_br_t*)(&a[k]))->unaligned ^= delta;
            ((unaligned_br_t*)(&b[k]))->unaligned ^= delta;
        } else {
            /* aligned */
            big_register_t delta =
                *((big_register_t*)(&a[k]))
              ^ *((big_register_t*)(&b[k]));
            delta &= br_mask;
            *((big_register_t*)(&a[k])) ^= delta;
            *((big_register_t*)(&b[k])) ^= delta;
        }
    }

    /* Word-sized chunks, only if the vector remainder holds at least one. */
    if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
        for (; k+sizeof(word_t)<=elem_bytes; k+=sizeof(word_t)) {
            if (elem_bytes % sizeof(word_t)) {
                /* unaligned */
                word_t delta =
                    ((unaligned_word_t*)(&a[k]))->unaligned
                  ^ ((unaligned_word_t*)(&b[k]))->unaligned;
                delta &= doswap;
                ((unaligned_word_t*)(&a[k]))->unaligned ^= delta;
                ((unaligned_word_t*)(&b[k]))->unaligned ^= delta;
            } else {
                /* aligned */
                word_t delta =
                    *((word_t*)(&a[k]))
                  ^ *((word_t*)(&b[k]));
                delta &= doswap;
                *((word_t*)(&a[k])) ^= delta;
                *((word_t*)(&b[k])) ^= delta;
            }
        }
    }

    /* Trailing bytes, if elem_bytes is not a whole number of words. */
    if (elem_bytes % sizeof(word_t)) {
        for (; k<elem_bytes; k+=1) {
            /* Named delta, not xor: xor is a reserved alternative token in C++. */
            unsigned char delta = a[k] ^ b[k];
            delta &= doswap;
            a[k] ^= delta;
            b[k] ^= delta;
        }
    }
 }

 /**
 * @brief Constant-time equivalent of memcpy(out, table + elem_bytes*idx, elem_bytes);
 *
 * Every one of the n_table entries is read, and each is ANDed with a mask
 * that is nonzero only for entry idx before being ORed into the output,
 * so the memory access pattern does not depend on idx.  The output is
 * cleared with really_memset first and only ever ORed into; if idx is not
 * below n_table no entry matches (output presumably stays all-zero --
 * TODO confirm really_memset semantics).
 *
 * The table must be at least as aligned as elem_bytes.  The output must be vector aligned.
 * The table and output must not alias.
 *
 * @param out_       Destination, elem_bytes long.
 * @param table_     Array of n_table entries of elem_bytes each.
 * @param elem_bytes Size of one entry; any value, including values
 *                   smaller than sizeof(big_register_t).
 * @param n_table    Number of entries in the table.
 * @param idx        Index of the entry to fetch.
 */
 static __inline__ void
 __attribute__((unused,always_inline))
 constant_time_lookup (
    void *__restrict__ out_,
    const void *table_,
    word_t elem_bytes,
    word_t n_table,
    word_t idx
 ) {
    big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx);

    /* Can't do pointer arithmetic on void* */
    unsigned char *out = (unsigned char *)out_;
    const unsigned char *table = (const unsigned char *)table_;
    word_t j,k;

    really_memset(out, 0, elem_bytes);

    /* big_i counts down from idx, so it reaches zero exactly when j == idx. */
    for (j=0; j<n_table; j++, big_i-=big_one) {
        big_register_t br_mask = br_is_zero(big_i);

        /*
         * Vector-sized chunks.  The bound is written as k+size <= elem_bytes
         * instead of k <= elem_bytes-size: the subtraction is unsigned and
         * wraps when elem_bytes < sizeof(big_register_t), which would read
         * and write far out of bounds.
         */
        for (k=0; k+sizeof(big_register_t)<=elem_bytes; k+=sizeof(big_register_t)) {
            if (elem_bytes % sizeof(big_register_t)) {
                /* input unaligned, output aligned */
                *(big_register_t *)(out+k) |= br_mask & ((const unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned;
            } else {
                /* aligned */
                *(big_register_t *)(out+k) |= br_mask & *(const big_register_t*)(&table[k+j*elem_bytes]);
            }
        }

        /* Scalar selector for the word and byte tails. */
        word_t mask = word_is_zero(idx^j);
        if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
            for (; k+sizeof(word_t)<=elem_bytes; k+=sizeof(word_t)) {
                if (elem_bytes % sizeof(word_t)) {
                    /* input unaligned, output aligned */
                    *(word_t *)(out+k) |= mask & ((const unaligned_word_t*)(&table[k+j*elem_bytes]))->unaligned;
                } else {
                    /* aligned */
                    *(word_t *)(out+k) |= mask & *(const word_t*)(&table[k+j*elem_bytes]);
                }
            }
        }

        /* Trailing bytes, if elem_bytes is not a whole number of words. */
        if (elem_bytes % sizeof(word_t)) {
            for (; k<elem_bytes; k+=1) {
                out[k] |= mask & table[k+j*elem_bytes];
            }
        }
    }
 }

 /**
 * @brief Constant-time a = b&mask.
 *
 * Copies elem_bytes bytes from *b_ to *a_, ANDing every chunk with the
 * mask on the way.  Processing goes in big_register_t chunks, then
 * word_t chunks, then single bytes; which stages run depends only on
 * elem_bytes.
 *
 * The input and output must be at least as aligned as elem_bytes.
 * The output pointer is __restrict__-qualified, so the input must not
 * overlap it.
 *
 * @param a_         Destination, elem_bytes long.
 * @param b_         Source, elem_bytes long.
 * @param elem_bytes Number of bytes to process; any value, including 0
 *                   and values smaller than sizeof(big_register_t).
 * @param mask       Word mask applied to every chunk (for the byte tail
 *                   only its low 8 bits take part).
 */
 static __inline__ void
 __attribute__((unused,always_inline))
 constant_time_mask (
    void *__restrict__ a_,
    const void *b_,
    word_t elem_bytes,
    mask_t mask
 ) {
    unsigned char *a = (unsigned char *)a_;
    const unsigned char *b = (const unsigned char *)b_;

    word_t k;
    big_register_t br_mask = br_set_to_mask(mask);

    /*
     * Vector-sized chunks.  The bound is written as k+size <= elem_bytes
     * instead of k <= elem_bytes-size: the subtraction is unsigned and
     * wraps when elem_bytes < sizeof(big_register_t), which would send
     * the loop far past the end of both buffers.
     */
    for (k=0; k+sizeof(big_register_t)<=elem_bytes; k+=sizeof(big_register_t)) {
        if (elem_bytes % sizeof(big_register_t)) {
            /* unaligned */
            ((unaligned_br_t*)(&a[k]))->unaligned = br_mask & ((const unaligned_br_t*)(&b[k]))->unaligned;
        } else {
            /* aligned */
            *(big_register_t *)(a+k) = br_mask & *(const big_register_t*)(&b[k]);
        }
    }

    /* Word-sized chunks, only if the vector remainder holds at least one. */
    if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
        for (; k+sizeof(word_t)<=elem_bytes; k+=sizeof(word_t)) {
            if (elem_bytes % sizeof(word_t)) {
                /* unaligned */
                ((unaligned_word_t*)(&a[k]))->unaligned = mask & ((const unaligned_word_t*)(&b[k]))->unaligned;
            } else {
                /* aligned */
                *(word_t *)(a+k) = mask & *(const word_t*)(&b[k]);
            }
        }
    }

    /* Trailing bytes, if elem_bytes is not a whole number of words. */
    if (elem_bytes % sizeof(word_t)) {
        for (; k<elem_bytes; k+=1) {
            a[k] = mask & b[k];
        }
    }
 }

 #endif /* __CONSTANT_TIME_H__ */
--- a/src/include/ec_point.h
+++ b/src/include/ec_point.h
@@ -11,6 +11,7 @@
 #define __CC_INCLUDED_EC_POINT_H__

 #include "field.h"
 #include "constant_time.h"

 #ifdef __cplusplus
 extern "C" {
@@ -150,43 +151,6 @@ copy_tw_pniels (
    const struct tw_pniels_t* ds
 ) __attribute__((unused,always_inline));

 /**
 * Returns 1/sqrt(+- x).
 * 
 * The Legendre symbol of the result is the same as that of the
 * input.
 * 
 * If x=0, returns 0.
 */
 void
 field_isr (
    struct field_t*       a,
    const struct field_t* x
 );
    
 /**
 * Batch inverts out[i] = 1/in[i]
 * 
 * If any input is zero, all the outputs will be zero.
 */     
 void
 field_simultaneous_invert (
    struct p448_t *__restrict__ out,
    const struct p448_t *in,
    unsigned int n
 );

 /**
 * Returns 1/x.
 * 
 * If x=0, returns 0.
 */
 void
 field_inverse (
    struct field_t*       a,
    const struct field_t* x
 );

 /**
 * Add two points on a twisted Edwards curve, one in Extensible form
 * and the other in half-Niels form.
@@ -490,7 +454,7 @@ cond_negate_tw_niels (
    struct tw_niels_t *n,
    mask_t doNegate
 ) {
    field_cond_swap(&n->a, &n->b, doNegate);
    constant_time_cond_swap(&n->a, &n->b, sizeof(n->a), doNegate);
    field_cond_neg(&n->c, doNegate);
 }

--- a/src/include/field.h
+++ b/src/include/field.h
@@ -9,21 +9,13 @@
 #ifndef __FIELD_H__
 #define __FIELD_H__

 #include "p448.h"
 #include <string.h>

 #include "p448.h"
 #define FIELD_BITS           448
 #define FIELD_BYTES          (1+(FIELD_BITS-1)/8)
 #define FIELD_WORDS          (1+(FIELD_BITS-1)/sizeof(word_t))

 /**
 * @brief For GMP tests: little-endian representation of the field modulus.
 */
 extern const uint8_t FIELD_MODULUS[FIELD_BYTES];

 #define field_t              p448_t
 #define field_mul            p448_mul
 #define field_sqr            p448_sqr
 #define field_sqrn           p448_sqrn
 #define field_add            p448_add
 #define field_sub            p448_sub
 #define field_mulw           p448_mulw
@@ -32,15 +24,80 @@ extern const uint8_t FIELD_MODULUS[FIELD_BYTES];
 #define field_neg            p448_neg
 #define field_set_ui         p448_set_ui
 #define field_bias           p448_bias
 #define field_copy           p448_copy
 #define field_mask           p448_mask
 #define field_weak_reduce    p448_weak_reduce
 #define field_strong_reduce  p448_strong_reduce
 #define field_cond_swap      p448_cond_swap
 #define field_cond_neg       p448_cond_neg
 #define field_serialize      p448_serialize
 #define field_deserialize    p448_deserialize
 #define field_eq             p448_eq
 #define field_is_zero        p448_is_zero

 /** @brief Bytes in a field element */
 #define FIELD_BYTES          (1+(FIELD_BITS-1)/8)

 /**
 * @brief Words in a field element.
 *
 * NOTE(review): FIELD_BITS is a bit count while sizeof(word_t) is a byte
 * count, so this evaluates to 1 + (FIELD_BITS-1) / bytes-per-word rather
 * than dividing by bits-per-word.  That over-estimates the word count;
 * confirm whether any caller relies on the larger value before changing it.
 */
 #define FIELD_WORDS          (1+(FIELD_BITS-1)/sizeof(word_t))

 /**
 * @brief For GMP tests: little-endian representation of the field modulus.
 */
 extern const uint8_t FIELD_MODULUS[FIELD_BYTES];

 /**
 * Copy field element *b into *a.
 *
 * A plain byte copy of the whole structure.  Both pointers are
 * __restrict__-qualified, so source and destination must be distinct
 * objects.
 */
 static inline void
 __attribute__((unused,always_inline))        
 field_copy (
    struct field_t *__restrict__ a,
    const struct field_t *__restrict__ b
 ) {
    const size_t nbytes = sizeof(*a);
    memcpy(a, b, nbytes);
 }

 /**
 * Returns 1/sqrt(+- x).
 * 
 * The Legendre symbol of the result is the same as that of the
 * input.
 * 
 * If x=0, returns 0.
 */
 void
 field_isr (
    struct field_t*       a,
    const struct field_t* x
 );
    
 /**
 * Batch inverts out[i] = 1/in[i]
 * 
 * If any input is zero, all the outputs will be zero.
 */     
 void
 field_simultaneous_invert (
    struct p448_t *__restrict__ out,
    const struct p448_t *in,
    unsigned int n
 );

 /**
 * Returns 1/x.
 * 
 * If x=0, returns 0.
 */
 void
 field_inverse (
    struct field_t*       a,
    const struct field_t* x
 );

 /**
 * Returns -1 if a==b, 0 otherwise.
 */
 mask_t
 field_eq (
    const struct field_t *a,
    const struct field_t *b
 );

 #endif /* __FIELD_H__ */
--- a/src/include/word.h
+++ b/src/include/word.h
@@ -143,6 +143,15 @@ typedef word_t vecmask_t __attribute__((vector_size(32)));
        return (big_register_t)x;
    }
 #endif
    
 /**
 * Return -1 if x==0, and 0 otherwise.
 *
 * Branch-free: x is widened to dword_t and decremented, and the bits
 * above WORD_BITS are returned.  The decrement borrows into those upper
 * bits only when x was zero.  Assumes dword_t is unsigned and wider than
 * WORD_BITS (TODO confirm against its definition).
 */
 static __inline__ mask_t
 __attribute__((always_inline,unused))
 word_is_zero(word_t x) {
    dword_t widened = (dword_t)(x);
    widened -= 1;
    return (mask_t)(widened >> WORD_BITS);
 }

 #if __AVX2__
 static __inline__ big_register_t
--- a/src/scalarmul.c
+++ b/src/scalarmul.c
@@ -11,6 +11,7 @@
 #include "intrinsics.h"
 #include "scalarmul.h"
 #include "barrett_field.h"
 #include "constant_time.h"

 mask_t
 montgomery_ladder (
@@ -29,15 +30,15 @@ montgomery_ladder (
        word_t w = scalar[j];
        for (i=n; i>=0; i--) {
            mask_t flip = -((w>>i)&1);
            field_cond_swap(&mont.xa,&mont.xd,flip^pflip);
            field_cond_swap(&mont.za,&mont.zd,flip^pflip);
            constant_time_cond_swap(&mont.xa,&mont.xd,sizeof(mont.xd),flip^pflip);
            constant_time_cond_swap(&mont.za,&mont.zd,sizeof(mont.xd),flip^pflip);
            montgomery_step(&mont);
            pflip = flip;
        }
        n = WORD_BITS-1;
    }
    field_cond_swap(&mont.xa,&mont.xd,pflip);
    field_cond_swap(&mont.za,&mont.zd,pflip);
    constant_time_cond_swap(&mont.xa,&mont.xd,sizeof(mont.xd),pflip);
    constant_time_cond_swap(&mont.za,&mont.zd,sizeof(mont.xd),pflip);
    
    assert(n_extra_doubles < INT_MAX);
    for (j=0; j<(int)n_extra_doubles; j++) {
@@ -47,6 +48,29 @@ montgomery_ladder (
    return serialize_montgomery(out, &mont, in);
 }

 /**
 * Fetch entry idx of a table of nin tw_pniels_t structures into *out.
 *
 * Typed wrapper that forwards to constant_time_lookup with the element
 * size of struct tw_pniels_t.
 */
 static __inline__ void
 __attribute__((unused,always_inline))
 constant_time_lookup_tw_pniels (
    struct tw_pniels_t *out,
    const struct tw_pniels_t *in,
    int nin,
    int idx
 ) {
    constant_time_lookup(out, in, sizeof(struct tw_pniels_t), nin, idx);
 }

 /**
 * Fetch entry idx of a table of nin tw_niels_t structures into *out.
 *
 * Typed wrapper that forwards to constant_time_lookup with the element
 * size of struct tw_niels_t.
 */
 static __inline__ void
 __attribute__((unused,always_inline))
 constant_time_lookup_tw_niels (
    struct tw_niels_t *out,
    const struct tw_niels_t *in,
    int nin,
    int idx
 ) {
    constant_time_lookup(out, in, sizeof(struct tw_niels_t), nin, idx);
 }

 /*
 static __inline__ void
 constant_time_lookup_tw_pniels (
    struct tw_pniels_t *out,
@@ -90,6 +114,7 @@ constant_time_lookup_tw_niels (
        }
    }
 }
 */

 static void
 convert_to_signed_window_form (