diff --git a/src/decaf.c b/src/decaf.c index 00303f1..ae201b3 100644 --- a/src/decaf.c +++ b/src/decaf.c @@ -1710,54 +1710,51 @@ static int recode_wnaf ( const scalar_t scalar, unsigned int tableBits ) { - int current = 0, i, j; - unsigned int position = 0; + unsigned int table_size = SCALAR_BITS/(tableBits+1) + 3; + unsigned int position = table_size - 1; /* at the end */ + + /* place the end marker */ + control[position].power = -1; + control[position].addend = 0; + position--; - /* PERF: negate scalar if it's large - * PERF: this is a pretty simplistic algorithm. I'm sure there's a faster one... - * PERF MINOR: not technically WNAF, since last digits can be adjacent. Could be rtl. + /* PERF: Could negate scalar if it's large. But then would need more cases + * in the actual code that uses it, all for an expected reduction of like 1/5 op. + * Probably not worth it. */ - for (i=SCALAR_BITS-1; i >= 0; i--) { - int bit = (scalar->limb[i/WBITS] >> (i%WBITS)) & 1; - current = 2*current + bit; - - /* - * Sizing: |current| >= 2^(tableBits+1) -> |current| = 2^0 - * So current loses (tableBits+1) bits every time. It otherwise gains - * 1 bit per iteration. The number of iterations is - * (nbits + 2 + tableBits), and an additional control word is added at - * the end. So the total number of control words is at most - * ceil((nbits+1) / (tableBits+1)) + 2 = floor((nbits)/(tableBits+1)) + 2. - * There's also the stopper with power -1, for a total of +3. - */ - if (current >= (2<> 1; /* |delta| < 2^tablebits */ - current = -(current & 1); - - for (j=i; (delta & 1) == 0; j++) { - delta >>= 1; - } - control[position].power = j+1; + + uint64_t current = scalar->limb[0] & 0xFFFF; + uint32_t mask = (1<<(tableBits+1))-1; + + unsigned int w; + const unsigned int B_OVER_16 = sizeof(scalar->limb[0]) / 2; + for (w = 1; w<(SCALAR_BITS-1)/16+3; w++) { + if (w < (SCALAR_BITS-1)/16+1) { + /* Refill the 16 high bits of current */ + current += (uint32_t)((scalar->limb[w/B_OVER_16]>>(16*(w%B_OVER_16)))<<16); + } + + while (current & 0xFFFF) { + assert(position >= 0); + uint32_t pos = __builtin_ctz((uint32_t)current), odd = (uint32_t)current >> pos; + int32_t delta = odd & mask; + if (odd & 1<<(tableBits+1)) delta -= (1<<(tableBits+1)); + current -= delta << pos; + control[position].power = pos + 16*(w-1); control[position].addend = delta; - position++; - assert(position <= SCALAR_BITS/(tableBits+1) + 2); + position--; } + current >>= 16; } + assert(current==0); - if (current) { - for (j=0; (current & 1) == 0; j++) { - current >>= 1; - } - control[position].power = j; - control[position].addend = current; - position++; - assert(position <= SCALAR_BITS/(tableBits+1) + 2); + position++; + unsigned int n = table_size - position; + unsigned int i; + for (i=0; i