| @@ -1710,54 +1710,51 @@ static int recode_wnaf ( | |||||
| const scalar_t scalar, | const scalar_t scalar, | ||||
| unsigned int tableBits | unsigned int tableBits | ||||
| ) { | ) { | ||||
| int current = 0, i, j; | |||||
| unsigned int position = 0; | |||||
| unsigned int table_size = SCALAR_BITS/(tableBits+1) + 3; | |||||
| unsigned int position = table_size - 1; /* at the end */ | |||||
| /* place the end marker */ | |||||
| control[position].power = -1; | |||||
| control[position].addend = 0; | |||||
| position--; | |||||
| /* PERF: negate scalar if it's large | |||||
| * PERF: this is a pretty simplistic algorithm. I'm sure there's a faster one... | |||||
| * PERF MINOR: not technically WNAF, since last digits can be adjacent. Could be rtl. | |||||
| /* PERF: Could negate scalar if it's large. But then would need more cases | |||||
| * in the actual code that uses it, all for an expected reduction of like 1/5 op. | |||||
| * Probably not worth it. | |||||
| */ | */ | ||||
| for (i=SCALAR_BITS-1; i >= 0; i--) { | |||||
| int bit = (scalar->limb[i/WBITS] >> (i%WBITS)) & 1; | |||||
| current = 2*current + bit; | |||||
| /* | |||||
| * Sizing: |current| >= 2^(tableBits+1) -> |current| = 2^0 | |||||
| * So current loses (tableBits+1) bits every time. It otherwise gains | |||||
| * 1 bit per iteration. The number of iterations is | |||||
| * (nbits + 2 + tableBits), and an additional control word is added at | |||||
| * the end. So the total number of control words is at most | |||||
| * ceil((nbits+1) / (tableBits+1)) + 2 = floor((nbits)/(tableBits+1)) + 2. | |||||
| * There's also the stopper with power -1, for a total of +3. | |||||
| */ | |||||
| if (current >= (2<<tableBits) || current <= -1 - (2<<tableBits)) { | |||||
| int delta = (current + 1) >> 1; /* |delta| < 2^tablebits */ | |||||
| current = -(current & 1); | |||||
| for (j=i; (delta & 1) == 0; j++) { | |||||
| delta >>= 1; | |||||
| } | |||||
| control[position].power = j+1; | |||||
| uint64_t current = scalar->limb[0] & 0xFFFF; | |||||
| uint32_t mask = (1<<(tableBits+1))-1; | |||||
| unsigned int w; | |||||
| const unsigned int B_OVER_16 = sizeof(scalar->limb[0]) / 2; | |||||
| for (w = 1; w<(SCALAR_BITS-1)/16+3; w++) { | |||||
| if (w < (SCALAR_BITS-1)/16+1) { | |||||
| /* Refill the 16 high bits of current */ | |||||
| current += (uint32_t)((scalar->limb[w/B_OVER_16]>>(16*(w%B_OVER_16)))<<16); | |||||
| } | |||||
| while (current & 0xFFFF) { | |||||
| assert(position >= 0); | |||||
| uint32_t pos = __builtin_ctz((uint32_t)current), odd = (uint32_t)current >> pos; | |||||
| int32_t delta = odd & mask; | |||||
| if (odd & 1<<(tableBits+1)) delta -= (1<<(tableBits+1)); | |||||
| current -= delta << pos; | |||||
| control[position].power = pos + 16*(w-1); | |||||
| control[position].addend = delta; | control[position].addend = delta; | ||||
| position++; | |||||
| assert(position <= SCALAR_BITS/(tableBits+1) + 2); | |||||
| position--; | |||||
| } | } | ||||
| current >>= 16; | |||||
| } | } | ||||
| assert(current==0); | |||||
| if (current) { | |||||
| for (j=0; (current & 1) == 0; j++) { | |||||
| current >>= 1; | |||||
| } | |||||
| control[position].power = j; | |||||
| control[position].addend = current; | |||||
| position++; | |||||
| assert(position <= SCALAR_BITS/(tableBits+1) + 2); | |||||
| position++; | |||||
| unsigned int n = table_size - position; | |||||
| unsigned int i; | |||||
| for (i=0; i<n; i++) { | |||||
| control[i] = control[i+position]; | |||||
| } | } | ||||
| control[position].power = -1; | |||||
| control[position].addend = 0; | |||||
| return position; | |||||
| return n-1; | |||||
| } | } | ||||
| static void | static void | ||||