diff --git a/src/decaf.c b/src/decaf.c
index 00303f1..ae201b3 100644
--- a/src/decaf.c
+++ b/src/decaf.c
@@ -1710,54 +1710,51 @@ static int recode_wnaf (
     const scalar_t scalar,
     unsigned int tableBits
 ) {
-    int current = 0, i, j;
-    unsigned int position = 0;
+    unsigned int table_size = SCALAR_BITS/(tableBits+1) + 3;
+    unsigned int position = table_size - 1; /* at the end */
+    
+    /* place the end marker */
+    control[position].power = -1;
+    control[position].addend = 0;
+    position--;
 
-    /* PERF: negate scalar if it's large
-     * PERF: this is a pretty simplistic algorithm.  I'm sure there's a faster one...
-     * PERF MINOR: not technically WNAF, since last digits can be adjacent.  Could be rtl.
+    /* PERF: Could negate the scalar if it's large, but that would require more cases
+     * in the code that consumes the result, all for an expected saving of about 1/5 op.
+     * Probably not worth it.
      */
-    for (i=SCALAR_BITS-1; i >= 0; i--) {
-        int bit = (scalar->limb[i/WBITS] >> (i%WBITS)) & 1;
-        current = 2*current + bit;
-
-        /*
-         * Sizing: |current| >= 2^(tableBits+1) -> |current| = 2^0
-         * So current loses (tableBits+1) bits every time.  It otherwise gains
-         * 1 bit per iteration.  The number of iterations is
-         * (nbits + 2 + tableBits), and an additional control word is added at
-         * the end.  So the total number of control words is at most
-         * ceil((nbits+1) / (tableBits+1)) + 2 = floor((nbits)/(tableBits+1)) + 2.
-         * There's also the stopper with power -1, for a total of +3.
-         */
-        if (current >= (2<<tableBits) || current <= -1 - (2<<tableBits)) {
-            int delta = (current + 1) >> 1; /* |delta| < 2^tablebits */
-            current = -(current & 1);
-
-            for (j=i; (delta & 1) == 0; j++) {
-                delta >>= 1;
-            }
-            control[position].power = j+1;
+    
+    uint64_t current = scalar->limb[0] & 0xFFFF;
+    uint32_t mask = (1<<(tableBits+1))-1;
+
+    unsigned int w;
+    const unsigned int B_OVER_16 = sizeof(scalar->limb[0]) / 2;
+    for (w = 1; w<(SCALAR_BITS-1)/16+3; w++) {
+        if (w < (SCALAR_BITS-1)/16+1) {
+            /* Refill the 16 high bits of current */
+            current += (uint32_t)((scalar->limb[w/B_OVER_16]>>(16*(w%B_OVER_16)))<<16);
+        }
+        
+        while (current & 0xFFFF) {
+            assert(position >= 0);
+            uint32_t pos = __builtin_ctz((uint32_t)current), odd = (uint32_t)current >> pos;
+            int32_t delta = odd & mask;
+            if (odd & 1<<(tableBits+1)) delta -= (1<<(tableBits+1));
+            current -= delta << pos;
+            control[position].power = pos + 16*(w-1);
             control[position].addend = delta;
-            position++;
-            assert(position <= SCALAR_BITS/(tableBits+1) + 2);
+            position--;
         }
+        current >>= 16;
     }
+    assert(current==0);
     
-    if (current) {
-        for (j=0; (current & 1) == 0; j++) {
-            current >>= 1;
-        }
-        control[position].power = j;
-        control[position].addend = current;
-        position++;
-        assert(position <= SCALAR_BITS/(tableBits+1) + 2);
+    position++;
+    unsigned int n = table_size - position;
+    unsigned int i;
+    for (i=0; i<n; i++) {
+        control[i] = control[i+position];
     }
-    
-  
-    control[position].power = -1;
-    control[position].addend = 0;
-    return position;
+    return n-1;
 }
 
 static void