@@ -109,13 +109,6 @@ cond_neg(gf x, mask_t neg) {
 static INLINE void
 cond_swap(gf x, gf_s *__restrict__ y, mask_t swap) {
     constant_time_cond_swap(x,y,sizeof(gf_s),swap);
-    /*
-    UNROLL for (unsigned int i=0; i<sizeof(x->limb)/sizeof(x->limb[0]); i++) {
-        decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap;
-        x->limb[i] ^= s;
-        y->limb[i] ^= s;
-    }
-    */
 }

 /** Inverse square root using addition chain. */
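The removed comment was an unrolled form of the same mask-based swap that constant_time_cond_swap performs generically. For reference, a minimal standalone sketch of the technique (not the library's exact implementation; the word type here is a stand-in):

    #include <stddef.h>
    #include <stdint.h>

    typedef uint64_t word_t;   /* stand-in for the library's word type */

    /* Swap a and b if swap is all-ones, leave them if swap is all-zeros.
     * No secret-dependent branches or memory indices. */
    static void cond_swap_sketch(word_t *a, word_t *b, size_t nwords, word_t swap) {
        for (size_t i = 0; i < nwords; i++) {
            word_t s = (a[i] ^ b[i]) & swap;   /* 0, or a[i]^b[i] */
            a[i] ^= s;
            b[i] ^= s;
        }
    }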
@@ -133,7 +126,7 @@ static void
 gf_invert(gf y, const gf x) {
     gf t1, t2;
     gf_sqr(t1, x); // o^2
-    decaf_bool_t ret = gf_isqrt_chk(t2, t1, 0); // +-1/sqrt(o^2) = +-1/o
+    mask_t ret = gf_isqrt_chk(t2, t1, 0); // +-1/sqrt(o^2) = +-1/o
     (void)ret; assert(ret);
     gf_sqr(t1, t2);
     gf_mul(t2, t1, x); // not direct to y in case of alias.
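The sign ambiguity from gf_isqrt_chk cancels in the squaring; per the code's own comments, for nonzero x the chain computes:

    t2 = +-1/sqrt(x^2) = +-1/x
    t1 = t2^2          = 1/x^2
    y  = t1 * x        = 1/x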
@@ -142,7 +135,7 @@ gf_invert(gf y, const gf x) {
 /** Mul by signed int. Not constant-time WRT the sign of that int. */
 static INLINE void
-gf_mulw_sgn(gf c, const gf a, int w) {
+gf_mulw_sgn(gf c, const gf a, int32_t w) {
     if (w>0) {
         gf_mulw(c, a, w);
     } else {
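The else branch (just past this hunk) handles negative w by multiplying by -w and negating. A hedged sketch of the whole helper as it reads after the change, assuming the ZERO field constant used elsewhere in this file:

    /* Multiply by a signed 32-bit constant.  The branch on the sign of w
     * is acceptable only because w is a public curve constant, never a
     * secret -- hence the comment above. */
    static INLINE void
    gf_mulw_sgn_sketch(gf c, const gf a, int32_t w) {
        if (w>0) {
            gf_mulw(c, a, w);
        } else {
            gf_mulw(c, a, -w);   /* |w|; note -INT32_MIN would overflow */
            gf_sub(c, ZERO, c);  /* c = 0 - c */
        }
    }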
@@ -152,7 +145,7 @@ gf_mulw_sgn(gf c, const gf a, int w) {
 }

 /** Return high bit of x = low bit of 2x mod p */
-static decaf_word_t hibit(const gf x) {
+static mask_t hibit(const gf x) {
     gf y;
     gf_add(y,x,x);
     gf_strong_reduce(y);
@@ -161,7 +154,7 @@ static decaf_word_t hibit(const gf x) {
 #if COFACTOR==8
 /** Return high bit of x/2 = low bit of x mod p */
-static decaf_word_t lobit(const gf x) {
+static mask_t lobit(const gf x) {
     gf y;
     gf_copy(y,x);
     gf_strong_reduce(y);
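Both helpers return a full-width mask (all-zeros or all-ones) rather than a 0/1 value, which is what makes mask_t the honest return type. The conversion idiom, as a standalone sketch (typedefs stand in for the library's types):

    #include <stdint.h>

    typedef uint64_t word_t;
    typedef uint64_t mask_t;

    /* Turn the low bit of a canonically reduced word into an all-zeros /
     * all-ones mask -- the shape hibit/lobit return for constant-time selects. */
    static mask_t bit_to_mask_sketch(word_t w) {
        return -(mask_t)(w & 1);   /* 0 -> 0, 1 -> all-ones */
    }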
@@ -873,9 +866,9 @@ static INLINE void
 constant_time_lookup_xx (
     void *__restrict__ out_,
     const void *table_,
-    decaf_word_t elem_bytes,
-    decaf_word_t n_table,
-    decaf_word_t idx
+    word_t elem_bytes,
+    word_t n_table,
+    word_t idx
 ) {
     constant_time_lookup(out_,table_,elem_bytes,n_table,idx);
 }
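constant_time_lookup reads every table entry and keeps only the one whose index matches, so the memory access pattern is independent of idx. A minimal standalone sketch of that technique (byte-wise; not the library's exact code):

    #include <stddef.h>
    #include <stdint.h>

    static void ct_lookup_sketch(uint8_t *out, const uint8_t *table,
                                 size_t elem_bytes, size_t n_table, size_t idx) {
        for (size_t j = 0; j < elem_bytes; j++) out[j] = 0;
        for (size_t i = 0; i < n_table; i++) {
            size_t x = i ^ idx;   /* zero iff this is the wanted entry */
            /* branch-free: mask = 0xFF iff x == 0 */
            uint8_t mask = (uint8_t)-(uint8_t)((x - 1u) >> (sizeof(size_t)*8 - 1));
            for (size_t j = 0; j < elem_bytes; j++)
                out[j] |= table[i*elem_bytes + j] & mask;
        }
    }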
@@ -928,12 +921,12 @@ void API_NS(point_scalarmul) (
     for (; i>=0; i-=WINDOW) {
         /* Fetch another block of bits */
-        decaf_word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS);
+        word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS);
         if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
             bits ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
         }
         bits &= WINDOW_MASK;
-        decaf_word_t inv = (bits>>(WINDOW-1))-1;
+        mask_t inv = (bits>>(WINDOW-1))-1;
         bits ^= inv;

         /* Add in from table. Compute t only on last iteration. */
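The inv lines are a branch-free signed-digit recoding: the window's top bit picks the sign, XOR with the resulting all-zeros/all-ones mask takes the complement, and inv is reused afterwards to conditionally negate the looked-up table entry. A small standalone demonstration of the mask mechanics (WINDOW = 5 chosen for concreteness):

    #include <stdint.h>
    #include <stdio.h>

    #define WINDOW 5
    #define WINDOW_MASK ((1u << WINDOW) - 1)

    int main(void) {
        for (uint32_t d = 0; d < (1u << WINDOW); d += 5) {
            uint32_t bits = d & WINDOW_MASK;
            uint32_t inv = (bits >> (WINDOW-1)) - 1;  /* 0 if top bit set, else all-ones */
            bits ^= inv;                              /* complement iff "negative" digit */
            printf("digit %2u -> table index %2u, negate: %s\n",
                   d, bits & (WINDOW_MASK >> 1), inv ? "yes" : "no");
        }
        return 0;
    }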
@@ -993,7 +986,7 @@ void API_NS(point_double_scalarmul) (
     for (; i>=0; i-=WINDOW) {
         /* Fetch another block of bits */
-        decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
+        word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
                      bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS);
         if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
             bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
@@ -1001,8 +994,8 @@ void API_NS(point_double_scalarmul) (
         }
         bits1 &= WINDOW_MASK;
         bits2 &= WINDOW_MASK;
-        decaf_word_t inv1 = (bits1>>(WINDOW-1))-1;
-        decaf_word_t inv2 = (bits2>>(WINDOW-1))-1;
+        mask_t inv1 = (bits1>>(WINDOW-1))-1;
+        mask_t inv2 = (bits2>>(WINDOW-1))-1;
         bits1 ^= inv1;
         bits2 ^= inv2;
@@ -1079,16 +1072,16 @@ void API_NS(point_dual_scalarmul) (
         }

         /* Fetch another block of bits */
-        decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
-                     bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS);
+        word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
+               bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS);
         if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
             bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
             bits2 ^= scalar2x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
         }
         bits1 &= WINDOW_MASK;
         bits2 &= WINDOW_MASK;
-        decaf_word_t inv1 = (bits1>>(WINDOW-1))-1;
-        decaf_word_t inv2 = (bits2>>(WINDOW-1))-1;
+        mask_t inv1 = (bits1>>(WINDOW-1))-1;
+        mask_t inv2 = (bits2>>(WINDOW-1))-1;
         bits1 ^= inv1;
         bits2 ^= inv2;
@@ -66,7 +66,7 @@ void gf_strong_reduce (gf inout);
 void gf_add (gf out, const gf a, const gf b);
 void gf_sub (gf out, const gf a, const gf b);
 void gf_mul (gf_s *__restrict__ out, const gf a, const gf b);
-void gf_mulw (gf_s *__restrict__ out, const gf a, uint64_t b);
+void gf_mulw (gf_s *__restrict__ out, const gf a, uint32_t b);
 void gf_sqr (gf_s *__restrict__ out, const gf a);
 void gf_serialize (uint8_t *serial, const gf x);
 void gf_isr(gf a, const gf x); /** a^2 x = 1, QNR, or 0 if x=0 */
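Narrowing b to uint32_t reflects how gf_mulw is called: the callers pass small, public curve constants, and the 28-bit-limb backends below now assert b < 2^28. A caller-side guard for that bound might look like this (illustrative only; 39081 is a hypothetical example constant, not a claim about any particular call site):

    #include <stdint.h>

    _Static_assert(39081u < (1u << 28), "gf_mulw operand must fit in 28 bits");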
@@ -51,7 +51,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[1] += accum;
 }

-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint32_t *a = as->limb, maske = ((1<<26)-1), masko = ((1<<25)-1);
     uint32_t blo = b & maske, bhi = b>>26, bhi2 = 2*bhi;
     uint32_t *c = cs->limb;
@@ -34,7 +34,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[1] += accum;
 }

-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
     int i;
@@ -4,6 +4,7 @@
 #include "f_field.h"

+/** Requires: input limbs < 9*2^51 */
 void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
     uint64_t *c = cs->limb;
@@ -65,6 +66,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     ai = a[4];
     mac_rm(&accum1, ai, &b[0]);
+    /* Here accum1 < 5*(9*2^51)^2 */
     c[3] = accum0 & mask;
     accum1 += shrld(accum0, 51);
@@ -72,13 +74,16 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
      * = 2^(-13 + <13)
-     * PERF: good enough to fit into uint64_t?
+     * PERF: good enough to fit into uint64_t.
      */
     uint64_t a1 = shrld(accum1,51);
-    accum1 = (__uint128_t)a1 * 19 + c0;
+    /* Here a1 < (5*(9*2^51)^2 + small) >> 51 = 405 * 2^51 + small
+     * a1 * 19 + c0 < (405*19+1)*2^51 + small < 2^13 * 2^51.
+     */
+    accum1 = a1 * 19 + c0;
     c[0] = accum1 & mask;
-    c[1] = c1 + shrld(accum1,51);
+    c[1] = c1 + (accum1>>51);
 }

 void gf_sqr (gf_s *__restrict__ cs, const gf as) {
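The bound comment added above can be checked mechanically. A standalone sketch that mirrors the arithmetic (constants taken from the comment, not from the code):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        /* accum1 < 5*(9*2^51)^2 = 405*2^102, per the comment above */
        __uint128_t accum1_max = (__uint128_t)405 << 102;
        uint64_t a1_max = (uint64_t)(accum1_max >> 51);   /* = 405*2^51 */
        uint64_t c0_max = ((uint64_t)1 << 51) - 1;        /* a 51-bit limb */
        /* (405*19 + 1)*2^51 < 2^13 * 2^51 = 2^64, so plain 64-bit
         * arithmetic cannot overflow where __uint128_t was used before */
        assert(((__uint128_t)a1_max * 19 + c0_max) >> 64 == 0);
        return 0;
    }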
@@ -132,16 +137,15 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
     /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
      * = 2^(-13 + <13)
-     * PERF: good enough to fit into uint64_t?
      */
     uint64_t a1 = shrld(accum1,51);
-    accum1 = (__uint128_t)a1 * 19 + c0;
+    accum1 = a1 * 19 + c0;
     c[0] = accum1 & mask;
-    c[1] = c1 + shrld(accum1,51);
+    c[1] = c1 + (accum1>>51);
 }

-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
     uint64_t *c = cs->limb;
@@ -164,9 +168,9 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
     mac_rm(&accum, b, &a[4]);
     c[4] = accum & mask;
-    accum = shrld(accum,51);
-    accum = accum * 19 + c0;
+    uint64_t a1 = shrld(accum,51);
+    a1 = a1*19+c0;
-    c[0] = accum & mask;
-    c[1] = c1 + shrld(accum,51);
+    c[0] = a1 & mask;
+    c[1] = c1 + (a1>>51);
 }
@@ -60,8 +60,8 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[1] += ((uint32_t)(accum1));
 }

-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
-    const uint32_t bhi = b>>28, blo = b & ((1<<28)-1);
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
+    assert(b<1<<28);
     const uint32_t *a = as->limb;
     uint32_t *c = cs->limb;
@@ -71,20 +71,15 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
     int i;

-    accum0 = widemul(blo, a[0]);
-    accum8 = widemul(blo, a[8]);
-    accum0 += widemul(bhi, a[15]);
-    accum8 += widemul(bhi, a[15] + a[7]);
+    accum0 = widemul(b, a[0]);
+    accum8 = widemul(b, a[8]);
     c[0] = accum0 & mask; accum0 >>= 28;
     c[8] = accum8 & mask; accum8 >>= 28;

     for (i=1; i<8; i++) {
-        accum0 += widemul(blo, a[i]);
-        accum8 += widemul(blo, a[i+8]);
-        accum0 += widemul(bhi, a[i-1]);
-        accum8 += widemul(bhi, a[i+7]);
+        accum0 += widemul(b, a[i]);
+        accum8 += widemul(b, a[i+8]);
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
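With b < 2^28 the bhi cross-terms drop out entirely, and each widemul product fits in 56 bits, so the running 64-bit accumulators have ample headroom across the loop. For reference, a sketch of the widening-multiply primitive these lines assume:

    #include <stdint.h>

    /* 32x32 -> 64 widening multiply, as assumed by the accumulation above */
    static inline uint64_t widemul(uint32_t a, uint32_t b) {
        return (uint64_t)a * b;
    }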
@@ -43,8 +43,8 @@ void gf_bias (gf a, int amt) {
 }

 void gf_weak_reduce (gf a) {
-    uint64_t mask = (1ull<<28) - 1;
-    uint64_t tmp = a->limb[15] >> 28;
+    uint32_t mask = (1ull<<28) - 1;
+    uint32_t tmp = a->limb[15] >> 28;
     a->limb[8] += tmp;
     for (unsigned int i=15; i>0; i--) {
         a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28);
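The carry out of limb 15 feeds limb 8 as well as (in the function's elided last line) limb 0, because with sixteen 28-bit limbs the prime is p = 2^448 - 2^224 - 1, so:

    tmp = a->limb[15] >> 28              (carry of weight 2^448)
    2^448 * tmp == (2^224 + 1) * tmp     (mod p)

i.e. one copy of tmp lands at limb 8 (weight 2^224) and one at limb 0.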
@@ -724,10 +724,10 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
 void gf_mulw (
     gf_s *__restrict__ cs,
     const gf as,
-    uint64_t b
+    uint32_t b
 ) {
     uint32_t mask = (1ull<<28)-1;
-    const uint32_t bhi = b>>28, blo = b & mask;
+    assert(b <= mask);
     const uint32_t *a = as->limb;
     uint32_t *c = cs->limb;
@@ -737,11 +737,9 @@ void gf_mulw (
     int i;
     uint32_t c0, c8, n0, n8;

-    accum0 = widemul(bhi, a[15]);
-    accum8 = widemul(bhi, a[15] + a[7]);
     c0 = a[0]; c8 = a[8];
-    smlal(&accum0, blo, c0);
-    smlal(&accum8, blo, c8);
+    accum0 = widemul(b, c0);
+    accum8 = widemul(b, c8);
     c[0] = accum0 & mask; accum0 >>= 28;
     c[8] = accum8 & mask; accum8 >>= 28;
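smlal accumulates a 32x32 widening product into a 64-bit accumulator in place; on ARM it lowers to the SMLAL instruction, and for operands below 2^28 the signed and unsigned products coincide. A portable sketch of the primitive these blocks assume:

    #include <stdint.h>

    /* Multiply-accumulate-long: *acc += a * b with a 64-bit accumulator */
    static inline void smlal(uint64_t *acc, uint32_t a, uint32_t b) {
        *acc += (uint64_t)a * b;
    }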
@@ -749,10 +747,8 @@ void gf_mulw (
     i=1;
     {
         n0 = a[i]; n8 = a[i+8];
-        smlal(&accum0, bhi, c0);
-        smlal(&accum8, bhi, c8);
-        smlal(&accum0, blo, n0);
-        smlal(&accum8, blo, n8);
+        smlal(&accum0, b, n0);
+        smlal(&accum8, b, n8);
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -760,10 +756,8 @@ void gf_mulw (
     }
     {
         c0 = a[i]; c8 = a[i+8];
-        smlal(&accum0, bhi, n0);
-        smlal(&accum8, bhi, n8);
-        smlal(&accum0, blo, c0);
-        smlal(&accum8, blo, c8);
+        smlal(&accum0, b, c0);
+        smlal(&accum8, b, c8);
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -771,10 +765,8 @@ void gf_mulw (
     }
     {
         n0 = a[i]; n8 = a[i+8];
-        smlal(&accum0, bhi, c0);
-        smlal(&accum8, bhi, c8);
-        smlal(&accum0, blo, n0);
-        smlal(&accum8, blo, n8);
+        smlal(&accum0, b, n0);
+        smlal(&accum8, b, n8);
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -782,10 +774,8 @@ void gf_mulw (
     }
     {
         c0 = a[i]; c8 = a[i+8];
-        smlal(&accum0, bhi, n0);
-        smlal(&accum8, bhi, n8);
-        smlal(&accum0, blo, c0);
-        smlal(&accum8, blo, c8);
+        smlal(&accum0, b, c0);
+        smlal(&accum8, b, c8);
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -793,10 +783,8 @@ void gf_mulw (
     }
     {
         n0 = a[i]; n8 = a[i+8];
-        smlal(&accum0, bhi, c0);
-        smlal(&accum8, bhi, c8);
-        smlal(&accum0, blo, n0);
-        smlal(&accum8, blo, n8);
+        smlal(&accum0, b, n0);
+        smlal(&accum8, b, n8);
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -804,10 +792,8 @@ void gf_mulw (
     }
     {
         c0 = a[i]; c8 = a[i+8];
-        smlal(&accum0, bhi, n0);
-        smlal(&accum8, bhi, n8);
-        smlal(&accum0, blo, c0);
-        smlal(&accum8, blo, c8);
+        smlal(&accum0, b, c0);
+        smlal(&accum8, b, c8);
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -815,10 +801,8 @@ void gf_mulw (
     }
     {
         n0 = a[i]; n8 = a[i+8];
-        smlal(&accum0, bhi, c0);
-        smlal(&accum8, bhi, c8);
-        smlal(&accum0, blo, n0);
-        smlal(&accum8, blo, n8);
+        smlal(&accum0, b, n0);
+        smlal(&accum8, b, n8);
         c[i] = accum0 & mask; accum0 >>= 28;
         c[i+8] = accum8 & mask; accum8 >>= 28;
@@ -549,20 +549,18 @@ void gf_sqr (gf_s *__restrict__ cs, const gf bs) {
     );
 }

-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1};
+    assert(b<(1<<28));

     uint64x2_t accum;
     const uint32x2_t *va = (const uint32x2_t *) as->limb;
     uint32x2_t *vo = (uint32x2_t *) cs->limb;
     uint32x2_t vc, vn;
-    uint32x2_t vb = {b & ((1<<28)-1), b>>28};
-    accum = vmull_lane_u32(va[7], vb, 1);
-    accum = xx_vaddup_u64(vrev128_u64(accum));
+    uint32x2_t vb = {b, 0};
     vc = va[0];
-    accum = vmlal_lane_u32(accum, vc, vb, 0);
+    accum = vmull_lane_u32(vc, vb, 0);
     vo[0] = vmovn_u64(accum) & vmask;
     accum = vshrq_n_u64(accum,28);
@@ -579,7 +577,6 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
     int i;
     for (i=1; i<8; i++) {
         vn = va[i];
-        accum = vmlal_lane_u32(accum, vc, vb, 1);
         accum = vmlal_lane_u32(accum, vn, vb, 0);
         vo[i] = vmovn_u64(accum) & vmask;
         accum = vshrq_n_u64(accum,28);
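On the intrinsics: vmull_lane_u32(a, v, k) is the widening multiply a[i] * v[k] (two 64-bit lanes out, no accumulator argument), while vmlal_lane_u32(acc, a, v, k) is the same with accumulation — which is why the first multiply above becomes a vmull once the old lane-1 (bhi) terms are gone, and why the original four-argument vmull call was corrected here. A small sketch using the same intrinsics, with vb = {b, 0} so lane 0 broadcasts b:

    #include <arm_neon.h>

    /* One accumulation step of the multiply-by-word: acc[i] += a[i] * b */
    static uint64x2_t mulw_step(uint64x2_t acc, uint32x2_t a, uint32_t b) {
        uint32x2_t vb = vset_lane_u32(b, vdup_n_u32(0), 0);   /* vb = {b, 0} */
        return vmlal_lane_u32(acc, a, vb, 0);
    }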
@@ -165,7 +165,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[1] += ((uint64_t)(accum1));
 }

-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint64_t *a = as->limb;
     uint64_t *c = cs->limb;
@@ -139,7 +139,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[0] += ((uint64_t)(accum1));
 }

-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint64_t *a = as->limb;
     uint64_t *c = cs->limb;
@@ -1,6 +1,6 @@
 /**
  * @cond internal
- * @file decaf_crypto.c
+ * @file per_field.c
  * @copyright
  *   Copyright (c) 2015-2016 Cryptography Research, Inc. \n
  *   Released under the MIT License. See LICENSE.txt for license information.