diff --git a/include/decaf.h b/include/decaf.h
index 7609f34..9ed3a53 100644
--- a/include/decaf.h
+++ b/include/decaf.h
@@ -396,9 +396,6 @@ void decaf_448_precomputed_scalarmul (
  * @param [in] scalar1 A first scalar to multiply by.
  * @param [in] base2 A second point to be scaled.
  * @param [in] scalar2 A second scalar to multiply by.
- *
- * @TODO: test
- * @TODO: define vartime/precomp version of this for performance??
  */
 void decaf_448_point_double_scalarmul (
     decaf_448_point_t combo,
@@ -408,6 +405,30 @@ void decaf_448_point_double_scalarmul (
     const decaf_448_scalar_t scalar2
 ) API_VIS NONNULL5 NOINLINE;
 
+/**
+ * @brief Multiply two base points by two scalars:
+ * scaled = scalar1*base1 + scalar2*base2.
+ *
+ * Otherwise quivalent to two calls to decaf_448_point_scalarmul, but may be
+ * faster.
+ *
+ * @param [out] scaled The scaled point base*scalar
+ * @param [in] base1 A precomputed first point to be scaled.
+ * @param [in] scalar1 A first scalar to multiply by.
+ * @param [in] base2 A second point to be scaled.
+ * @param [in] scalar2 A second scalar to multiply by.
+ *
+ * @warning: This function takes variable time, and may leak the scalars
+ * used.  It is designed for signature verification.
+ */
+void decaf_448_precomputed_double_scalarmul_non_secret (
+    decaf_448_point_t combo,
+    const decaf_448_precomputed_s *base1,
+    const decaf_448_scalar_t scalar1,
+    const decaf_448_point_t base2,
+    const decaf_448_scalar_t scalar2
+) API_VIS NONNULL5 NOINLINE;
+
 /**
  * @brief Test that a point is valid, for debugging purposes.
  *
diff --git a/include/decaf_crypto.h b/include/decaf_crypto.h
index ae370b8..a86bfc7 100644
--- a/include/decaf_crypto.h
+++ b/include/decaf_crypto.h
@@ -85,6 +85,9 @@ void decaf_448_private_to_public (
  *
  * @retval DECAF_SUCCESS Key exchange was successful.
  * @retval DECAF_FAILURE Key exchange failed.
+ *
+ * @warning This is a pretty silly shared secret computation
+ * and will almost definitely change in the future.
  */
 decaf_bool_t
 decaf_448_shared_secret (
diff --git a/src/decaf.c b/src/decaf.c
index d134447..106fa00 100644
--- a/src/decaf.c
+++ b/src/decaf.c
@@ -804,3 +804,13 @@ void decaf_448_precomputed_scalarmul (
 ) {
     decaf_448_point_scalarmul(a,b->p[0],scalar);
 }
+
+void decaf_448_precomputed_double_scalarmul_non_secret (
+    decaf_448_point_t combo,
+    const decaf_448_precomputed_s *base1,
+    const decaf_448_scalar_t scalar1,
+    const decaf_448_point_t base2,
+    const decaf_448_scalar_t scalar2
+) {
+    decaf_448_point_double_scalarmul(combo, base1->p[0], scalar1, base2, scalar2);
+}
diff --git a/src/decaf_crypto.c b/src/decaf_crypto.c
index 886f287..7b16f0c 100644
--- a/src/decaf_crypto.c
+++ b/src/decaf_crypto.c
@@ -186,9 +186,9 @@ decaf_448_verify_shake (
     ret &= decaf_448_point_decode(pubpoint, pub, DECAF_FALSE);
     ret &= decaf_448_scalar_decode(response, &sig[DECAF_448_SER_BYTES]);
 
-    decaf_448_point_double_scalarmul (
+    decaf_448_precomputed_double_scalarmul_non_secret (
         pubpoint,
-        decaf_448_point_base, response,
+        decaf_448_precomputed_base, response,
         pubpoint, challenge
     );
 
diff --git a/src/decaf_fast.c b/src/decaf_fast.c
index 3809e8a..b5e7d35 100644
--- a/src/decaf_fast.c
+++ b/src/decaf_fast.c
@@ -13,10 +13,6 @@
 #include <string.h>
 #include "field.h"
 
- /* TODO REMOVE */
-#include "constant_time.h"
-#include <stdio.h>
-
 #define WBITS DECAF_WORD_BITS
 
 #if WBITS == 64
@@ -184,9 +180,17 @@ siv gf_add_nr ( gf c, const gf a, const gf b ) {
 }
 
 /** Constant time, x = is_z ? z : y */
-sv cond_sel(gf x, const gf y, const gf z, decaf_bool_t is_z) {
-    //FOR_LIMB(i, x[i] = (y[i] & ~is_z) | (z[i] & is_z) );
+siv cond_sel(gf x, const gf y, const gf z, decaf_bool_t is_z) {
+    big_register_t br_mask = br_set_to_mask(is_z);
+    big_register_t *out = (big_register_t *)x;
+    const big_register_t *y_ = (const big_register_t *)y, *z_ = (const big_register_t *)z;
+    word_t k;
+    for (k=0; k<sizeof(gf)/sizeof(big_register_t); k++) {
+        out[k] = (~br_mask & y_[k]) | (br_mask & z_[k]);
+    }
+    /*
     constant_time_select(x,z,y,sizeof(gf),is_z);
+    */
 }
 
 /** Constant time, if (neg) x=-x; */
@@ -209,9 +213,6 @@ siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) {
         x->limb[i] ^= s;
         y->limb[i] ^= s;
     }
-    /*
-    constant_time_cond_swap(x,y,sizeof(gf),swap);
-    */
 }
 
 /**
@@ -791,6 +792,27 @@ snv add_niels_to_pt (
     if (!before_double) gf_mul ( d->t, b, c );
 }
 
+snv sub_niels_from_pt (
+    decaf_448_point_t d,
+    const niels_t e,
+    decaf_bool_t before_double
+) {
+    gf a, b, c;
+    gf_sub_nr ( b, d->y, d->x );
+    gf_mul ( a, e->b, b );
+    gf_add_nr ( b, d->x, d->y );
+    gf_mul ( d->y, e->a, b );
+    gf_mul ( d->x, e->c, d->t );
+    gf_add_nr ( c, a, d->y );
+    gf_sub_nr ( b, d->y, a );
+    gf_add_nr ( d->y, d->z, d->x );
+    gf_sub_nr ( a, d->z, d->x );
+    gf_mul ( d->z, a, d->y );
+    gf_mul ( d->x, d->y, b );
+    gf_mul ( d->y, a, c );
+    if (!before_double) gf_mul ( d->t, b, c );
+}
+
 sv add_pniels_to_pt (
     decaf_448_point_t p,
     const pniels_t pn,
@@ -802,8 +824,42 @@ sv add_pniels_to_pt (
     add_niels_to_pt( p, pn->n, before_double );
 }
 
+sv sub_pniels_from_pt (
+    decaf_448_point_t p,
+    const pniels_t pn,
+    decaf_bool_t before_double
+) {
+    gf L0;
+    gf_mul ( L0, p->z, pn->z );
+    gf_cpy ( p->z, L0 );
+    sub_niels_from_pt( p, pn->n, before_double );
+}
+
 extern const decaf_448_scalar_t decaf_448_point_scalarmul_adjustment;
 
+/* TODO: get rid of big_register_t dependencies? */
+siv constant_time_lookup_xx (
+    void *__restrict__ out_,
+    const void *table_,
+    word_t elem_bytes,
+    word_t n_table,
+    word_t idx
+) {
+    big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx);
+    big_register_t *out = (big_register_t *)out_;
+    const unsigned char *table = (const unsigned char *)table_;
+    word_t j,k;
+    
+    for (k=0; k<elem_bytes/sizeof(big_register_t); k++)
+        out[k] = 0;
+    for (j=0; j<n_table; j++, big_i-=big_one) {        
+        big_register_t br_mask = br_is_zero(big_i);
+        for (k=0; k<elem_bytes/sizeof(big_register_t); k++) {
+            out[k] |= br_mask & *(const big_register_t*)(&table[k*sizeof(big_register_t)+j*elem_bytes]);
+        }
+    }
+}
+
 void decaf_448_point_scalarmul (
     decaf_448_point_t a,
     const decaf_448_point_t b,
@@ -838,7 +894,7 @@ void decaf_448_point_scalarmul (
         inv = (bits>>(WINDOW-1))-1;
     bits ^= inv;
     
-    constant_time_lookup(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_T_MASK);
+    constant_time_lookup_xx(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_T_MASK);
     cond_neg_niels(pn->n, inv);
     pniels_to_pt(tmp, pn);
 
@@ -861,7 +917,7 @@ void decaf_448_point_scalarmul (
         bits ^= inv;
     
         /* Add in from table.  Compute t only on last iteration. */
-        constant_time_lookup(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_T_MASK);
+        constant_time_lookup_xx(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_T_MASK);
         cond_neg_niels(pn->n, inv);
         add_pniels_to_pt(tmp, pn, i ? -1 : 0);
     }
@@ -1104,7 +1160,7 @@ siv constant_time_lookup_niels (
     int nelts,
     int idx
 ) {
-    constant_time_lookup(ni, table, sizeof(niels_s), nelts, idx);
+    constant_time_lookup_xx(ni, table, sizeof(niels_s), nelts, idx);
 }
 
 void decaf_448_precomputed_scalarmul (
@@ -1112,25 +1168,26 @@ void decaf_448_precomputed_scalarmul (
     const decaf_448_precomputed_s *table,
     const decaf_448_scalar_t scalar
 ) {
-    unsigned int i,j,k;
+    int i;
+    unsigned j,k;
     const unsigned int n = 5, t = 5, s = 18; // TODO MAGIC
     
-    decaf_448_scalar_t scalar2;
-    decaf_448_scalar_add(scalar2, scalar, decaf_448_precomputed_scalarmul_adjustment);
-    decaf_448_halve(scalar2,scalar2,decaf_448_scalar_p);
+    decaf_448_scalar_t scalar1x;
+    decaf_448_scalar_add(scalar1x, scalar, decaf_448_precomputed_scalarmul_adjustment);
+    decaf_448_halve(scalar1x,scalar1x,decaf_448_scalar_p);
     
     niels_t ni;
     
-    for (i=0; i<s; i++) {
-        if (i) decaf_448_point_double(out,out);
+    for (i=s-1; i>=0; i--) {
+        if (i != (int)s-1) decaf_448_point_double(out,out);
         
         for (j=0; j<n; j++) {
             int tab = 0;
          
             for (k=0; k<t; k++) {
-                unsigned int bit = (s-1-i) + k*s + j*(s*t);
+                unsigned int bit = i + s*(k + j*t);
                 if (bit < SCALAR_WORDS * WBITS) {
-                    tab |= (scalar2->limb[bit/WBITS] >> (bit%WBITS) & 1) << k;
+                    tab |= (scalar1x->limb[bit/WBITS] >> (bit%WBITS) & 1) << k;
                 }
             }
             
@@ -1141,8 +1198,8 @@ void decaf_448_precomputed_scalarmul (
             constant_time_lookup_niels(ni, &table->table[j<<(t-1)], 1<<(t-1), tab);
 
             cond_neg_niels(ni, invert);
-            if (i||j) {
-                add_niels_to_pt(out, ni, j==n-1 && i<s-1);
+            if ((i!=s-1)||j) {
+                add_niels_to_pt(out, ni, j==n-1 && i);
             } else {
                 niels_to_pt(out, ni);
             }
@@ -1280,3 +1337,158 @@ decaf_bool_t decaf_448_direct_scalarmul (
     return succ;
 }
 
+/**
+ * @cond internal
+ * Control for variable-time scalar multiply algorithms.
+ */
+struct smvt_control {
+  int power, addend;
+};
+
+static int recode_wnaf (
+    struct smvt_control *control, /* [nbits/(tableBits+1) + 3] */
+    const decaf_448_scalar_t scalar,
+    unsigned int tableBits
+) {
+    int current = 0, i, j;
+    unsigned int position = 0;
+
+    /* PERF: negate scalar if it's large
+     * PERF: this is a pretty simplistic algorithm.  I'm sure there's a faster one...
+     */
+    for (i=DECAF_448_SCALAR_BITS-1; i >= 0; i--) {
+        int bit = (scalar->limb[i/WORD_BITS] >> (i%WORD_BITS)) & 1;
+        current = 2*current + bit;
+
+        /*
+         * Sizing: |current| >= 2^(tableBits+1) -> |current| = 2^0
+         * So current loses (tableBits+1) bits every time.  It otherwise gains
+         * 1 bit per iteration.  The number of iterations is
+         * (nbits + 2 + tableBits), and an additional control word is added at
+         * the end.  So the total number of control words is at most
+         * ceil((nbits+1) / (tableBits+1)) + 2 = floor((nbits)/(tableBits+1)) + 2.
+         * There's also the stopper with power -1, for a total of +3.
+         */
+        if (current >= (2<<tableBits) || current <= -1 - (2<<tableBits)) {
+            int delta = (current + 1) >> 1; /* |delta| < 2^tablebits */
+            current = -(current & 1);
+
+            for (j=i; (delta & 1) == 0; j++) {
+                delta >>= 1;
+            }
+            control[position].power = j+1;
+            control[position].addend = delta;
+            position++;
+            assert(position <= DECAF_448_SCALAR_BITS/(tableBits+1) + 2);
+        }
+    }
+    
+    if (current) {
+        for (j=0; (current & 1) == 0; j++) {
+            current >>= 1;
+        }
+        control[position].power = j;
+        control[position].addend = current;
+        position++;
+        assert(position <= DECAF_448_SCALAR_BITS/(tableBits+1) + 2);
+    }
+    
+  
+    control[position].power = -1;
+    control[position].addend = 0;
+    return position;
+}
+
+sv prepare_wnaf_table(
+    pniels_t *output,
+    decaf_448_point_t working,
+    unsigned int tbits
+) {
+    int i;
+    pt_to_pniels(output[0], working);
+
+    if (tbits == 0) return;
+
+    decaf_448_point_double(working,working);
+    pniels_t twop;
+    pt_to_pniels(twop, working);
+
+    add_pniels_to_pt(working, output[0],0);
+    pt_to_pniels(output[1], working);
+
+    for (i=2; i < 1<<tbits; i++) {
+        add_pniels_to_pt(working, twop,0);
+        pt_to_pniels(output[i], working);
+    }
+}
+
+void decaf_448_precomputed_double_scalarmul_non_secret (
+    decaf_448_point_t combo,
+    const decaf_448_precomputed_s *base1,
+    const decaf_448_scalar_t scalar1,
+    const decaf_448_point_t base2,
+    const decaf_448_scalar_t scalar2
+) {
+    int i;
+    unsigned j,k;
+    const unsigned int n = 5, t = 5;
+    const int s = 18; // TODO MAGIC
+    
+    decaf_448_scalar_t scalar1x;
+    decaf_448_scalar_add(scalar1x, scalar1, decaf_448_precomputed_scalarmul_adjustment);
+    decaf_448_halve(scalar1x,scalar1x,decaf_448_scalar_p);
+    
+    decaf_448_point_copy(combo, base2);
+    const int table_bits = 4; // TODO MAGIC
+    struct smvt_control control[DECAF_448_SCALAR_BITS/(table_bits+1)+3];
+    
+    int control_bits = recode_wnaf(control, scalar2, table_bits);
+  
+    pniels_t precmp[1<<table_bits];
+    prepare_wnaf_table(precmp, combo, table_bits);
+    
+    decaf_448_point_copy(combo, decaf_448_point_identity);
+
+    int conti = 0;
+    for (i = control[0].power; i >= 0; i--) {
+
+        if (i == control[conti].power) {
+            decaf_448_point_double_internal(combo,combo,0);
+            assert(control[conti].addend);
+
+            if (control[conti].addend > 0) {
+                add_pniels_to_pt(combo, precmp[control[conti].addend >> 1], i>=s); // TODO PERF: internal
+            } else {
+                sub_pniels_from_pt(combo, precmp[(-control[conti].addend) >> 1], i>=s); // TODO PERF: internal
+            }
+            conti++;
+            assert(conti <= control_bits);
+        } else {
+            decaf_448_point_double_internal(combo,combo,i>=s);
+        }
+        
+        if (i < s) {
+            /* comb component */
+            for (j=0; j<n; j++) {
+                int tab = 0;
+         
+                for (k=0; k<t; k++) {
+                    unsigned int bit = i + s*(k + j*t);
+                    if (bit < SCALAR_WORDS * WBITS) {
+                        tab |= (scalar1x->limb[bit/WBITS] >> (bit%WBITS) & 1) << k;
+                    }
+                }
+            
+                decaf_bool_t invert = (tab>>(t-1))-1;
+                tab ^= invert;
+                tab &= (1<<(t-1)) - 1;
+
+                if (invert) {
+                    sub_niels_from_pt(combo, base1->table[(j<<(t-1)) + tab], j==n-1 && i);
+                } else {
+                    add_niels_to_pt(combo, base1->table[(j<<(t-1)) + tab], j==n-1 && i);
+                }
+            }
+        }
+    }
+}