/* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

#include "f_field.h"

/*
 * Field arithmetic on eight-limb elements with 60-bit limbs (mask 2^60 - 1).
 *
 * Notation used in the comments below: an element is split into a low half
 * (limbs 0..3) and a high half (limbs 4..7).  Writing phi for the weight of
 * limb 4, the carry handling in this file (a carry out of limb 7 is added to
 * both limb 0 and limb 4) implements the relation phi^2 = phi + 1.
 *
 * For 4-limb polynomials L = a_lo*b_lo, H = a_hi*b_hi and
 * M = (a_lo+a_hi)*(b_lo+b_hi), with P_k the coefficient of limb weight k
 * (k = 0..6), output column j (j = 0..3) is
 *
 *     c[j]   = L_j + H_j + M_{j+4} - L_{j+4}
 *     c[j+4] = M_j - L_j + H_{j+4} + M_{j+4}
 *
 * The helpers come from f_field.h and are not visible here.  Their use in
 * this file is consistent with: widemul(a,b) = *a * *b, mac: acc += *a * *b,
 * msb: acc -= *a * *b, and widemul2/mac2/msb2 being the doubled variants
 * (TODO confirm against the header).
 */

/*
 * cs = as * bs.
 *
 * cs must not alias as or bs (it is __restrict__-qualified and is written
 * before all inputs have been consumed).  Output limbs 1,2,3,5,6,7 are masked
 * to 60 bits; limbs 0 and 4 receive a final unmasked carry and may slightly
 * exceed 60 bits.
 */
void gf_mul (gf *__restrict__ cs, const gf *as, const gf *bs) {
    const uint64_t *a = as->limb, *b = bs->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<60) - 1;

    uint64_t aa[4]  __attribute__((aligned(32))),
             bb[4]  __attribute__((aligned(32))),
             bbb[4] __attribute__((aligned(32)));

    /*
     * Karatsuba operand sums:
     *   aa  = a_lo + a_hi
     *   bb  = b_lo + b_hi
     *   bbb = bb   + b_hi   (lets columns 1 and 2 fold H_{j+4} into one product)
     *
     * NOTE(review): this loop and the limb-3/limb-7 column below were missing
     * from the damaged copy of this file and have been rebuilt from the
     * identities that the intact columns 0..2 rely on.  The loop is written in
     * scalar form; the original comment here read "For some reason clang
     * doesn't vectorize this without prompting?", so an explicitly vectorised
     * form may be preferable for speed -- confirm against upstream.
     */
    unsigned int i;
    for (i=0; i<4; i++) {
        aa[i]  = a[i] + a[i+4];
        bb[i]  = b[i] + b[i+4];
        bbb[i] = bb[i] + b[i+4];
    }

    /* Column 3: c[3] = L_3 + H_3, c[7] = M_3 - L_3 (no degree-7 terms). */
    accum2  = widemul(&a[0],  &b[3]);
    accum0  = widemul(&aa[0], &bb[3]);
    accum1  = widemul(&a[4],  &b[7]);

    mac(&accum2, &a[1],  &b[2]);
    mac(&accum0, &aa[1], &bb[2]);
    mac(&accum1, &a[5],  &b[6]);

    mac(&accum2, &a[2],  &b[1]);
    mac(&accum0, &aa[2], &bb[1]);
    mac(&accum1, &a[6],  &b[5]);

    mac(&accum2, &a[3],  &b[0]);
    mac(&accum0, &aa[3], &bb[0]);
    mac(&accum1, &a[7],  &b[4]);

    accum0 -= accum2;
    accum1 += accum2;

    c[3] = ((uint64_t)(accum1)) & mask;
    c[7] = ((uint64_t)(accum0)) & mask;

    /* accum0 now carries out of limb 7, accum1 out of limb 3. */
    accum0 >>= 60;
    accum1 >>= 60;

    /* Column 0: the limb-7 carry feeds both c[0] and c[4]; the limb-3 carry
     * feeds c[4] only. */
    mac(&accum0, &aa[1], &bb[3]);
    mac(&accum1, &a[5],  &b[7]);
    mac(&accum0, &aa[2], &bb[2]);
    mac(&accum1, &a[6],  &b[6]);
    mac(&accum0, &aa[3], &bb[1]);
    accum1 += accum0;

    accum2 = widemul(&a[0], &b[0]);
    accum1 -= accum2;
    accum0 += accum2;

    msb(&accum0, &a[1],  &b[3]);
    msb(&accum0, &a[2],  &b[2]);
    mac(&accum1, &a[7],  &b[5]);
    msb(&accum0, &a[3],  &b[1]);
    mac(&accum1, &aa[0], &bb[0]);
    mac(&accum0, &a[4],  &b[4]);

    c[0] = ((uint64_t)(accum0)) & mask;
    c[4] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 60;
    accum1 >>= 60;

    /* Column 1: accum2 collects the terms that are added to c[1] and
     * subtracted from c[5]. */
    accum2 = widemul(&a[2], &b[7]);
    mac(&accum0, &a[6],  &bb[3]);
    mac(&accum1, &aa[2], &bbb[3]);

    mac(&accum2, &a[3],  &b[6]);
    mac(&accum0, &a[7],  &bb[2]);
    mac(&accum1, &aa[3], &bbb[2]);

    mac(&accum2, &a[0],  &b[1]);
    mac(&accum1, &aa[0], &bb[1]);
    mac(&accum0, &a[4],  &b[5]);

    mac(&accum2, &a[1],  &b[0]);
    mac(&accum1, &aa[1], &bb[0]);
    mac(&accum0, &a[5],  &b[4]);

    accum1 -= accum2;
    accum0 += accum2;

    c[1] = ((uint64_t)(accum0)) & mask;
    c[5] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 60;
    accum1 >>= 60;

    /* Column 2: same scheme as column 1. */
    accum2 = widemul(&a[3], &b[7]);
    mac(&accum0, &a[7],  &bb[3]);
    mac(&accum1, &aa[3], &bbb[3]);

    mac(&accum2, &a[0],  &b[2]);
    mac(&accum1, &aa[0], &bb[2]);
    mac(&accum0, &a[4],  &b[6]);

    mac(&accum2, &a[1],  &b[1]);
    mac(&accum1, &aa[1], &bb[1]);
    mac(&accum0, &a[5],  &b[5]);

    mac(&accum2, &a[2],  &b[0]);
    mac(&accum1, &aa[2], &bb[0]);
    mac(&accum0, &a[6],  &b[4]);

    accum1 -= accum2;
    accum0 += accum2;

    c[2] = ((uint64_t)(accum0)) & mask;
    c[6] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 60;
    accum1 >>= 60;

    /* Propagate the column-2 carries into the limbs stored first. */
    accum0 += c[3];
    accum1 += c[7];
    c[3] = ((uint64_t)(accum0)) & mask;
    c[7] = ((uint64_t)(accum1)) & mask;

    /* we could almost stop here, but it wouldn't be stable, so... */
    accum0 >>= 60;
    accum1 >>= 60;
    /* Limb-3 carry goes to limb 4; limb-7 carry wraps to limbs 0 and 4. */
    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
    c[0] += ((uint64_t)(accum1));
}

/*
 * cs = as * b for a single-word scalar b.
 *
 * The low half (limbs 0..3) and high half (limbs 4..7) are multiplied as two
 * independent carry chains, then the two outgoing carries are folded back in:
 * the limb-3 carry into limb 4, the limb-7 carry into limbs 0 and 4.  Limbs 1
 * and 5 absorb the resulting secondary carry without being re-masked.
 */
void gf_mulw (gf *__restrict__ cs, const gf *as, uint64_t b) {
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0, accum4;
    uint64_t mask = (1ull<<60) - 1;

    accum0 = widemul_rm(b, &a[0]);
    accum4 = widemul_rm(b, &a[4]);

    c[0] = accum0 & mask; accum0 >>= 60;
    c[4] = accum4 & mask; accum4 >>= 60;

    mac_rm(&accum0, b, &a[1]);
    mac_rm(&accum4, b, &a[5]);

    c[1] = accum0 & mask; accum0 >>= 60;
    c[5] = accum4 & mask; accum4 >>= 60;

    mac_rm(&accum0, b, &a[2]);
    mac_rm(&accum4, b, &a[6]);

    c[2] = accum0 & mask; accum0 >>= 60;
    c[6] = accum4 & mask; accum4 >>= 60;

    mac_rm(&accum0, b, &a[3]);
    mac_rm(&accum4, b, &a[7]);

    c[3] = accum0 & mask; accum0 >>= 60;
    c[7] = accum4 & mask; accum4 >>= 60;

    /* accum0 = carry out of limb 3, accum4 = carry out of limb 7. */
    accum0 += accum4 + c[4];
    c[4] = accum0 & mask;
    c[5] += accum0 >> 60;

    accum4 += c[0];
    c[0] = accum4 & mask;
    c[1] += accum4 >> 60;
}

/*
 * cs = as^2.
 *
 * Same column scheme as gf_mul with b = a; symmetric cross terms are computed
 * once and doubled (widemul2 / mac2 / msb2).  cs must not alias as.
 */
void gf_sqr (gf *__restrict__ cs, const gf *as) {
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<60) - 1;

    uint64_t aa[4] __attribute__((aligned(32)));

    /*
     * aa = a_lo + a_hi.
     *
     * NOTE(review): as in gf_mul, this loop and the limb-3/limb-7 column were
     * missing from the damaged copy and have been rebuilt; the original
     * comment here read "For some reason clang doesn't vectorize this without
     * prompting?" -- confirm against upstream.
     */
    unsigned int i;
    for (i=0; i<4; i++) {
        aa[i] = a[i] + a[i+4];
    }

    /*
     * Column 3 consists only of cross terms, so the accumulators hold HALF of
     * each sum: the stored limb is (accum << 1) & mask and the carry is
     * accum >> 59 rather than accum >> 60.
     */
    accum2  = widemul(&a[0],  &a[3]);
    accum0  = widemul(&aa[0], &aa[3]);
    accum1  = widemul(&a[4],  &a[7]);

    mac(&accum2, &a[1],  &a[2]);
    mac(&accum0, &aa[1], &aa[2]);
    mac(&accum1, &a[5],  &a[6]);

    accum0 -= accum2;
    accum1 += accum2;

    c[3] = (((uint64_t)(accum1)) << 1) & mask;
    c[7] = (((uint64_t)(accum0)) << 1) & mask;

    accum0 >>= 59;
    accum1 >>= 59;

    /* Column 0. */
    mac2(&accum0, &aa[1], &aa[3]);
    mac2(&accum1, &a[5],  &a[7]);
    mac(&accum0, &aa[2], &aa[2]);
    accum1 += accum0;

    msb2(&accum0, &a[1], &a[3]);
    mac(&accum1, &a[6], &a[6]);

    accum2 = widemul(&a[0], &a[0]);
    accum1 -= accum2;
    accum0 += accum2;

    msb(&accum0, &a[2],  &a[2]);
    mac(&accum1, &aa[0], &aa[0]);
    mac(&accum0, &a[4],  &a[4]);

    c[0] = ((uint64_t)(accum0)) & mask;
    c[4] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 60;
    accum1 >>= 60;

    /* Column 1. */
    accum2 = widemul2(&aa[2], &aa[3]);
    msb2(&accum0, &a[2], &a[3]);
    mac2(&accum1, &a[6], &a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2 = widemul2(&a[0], &a[1]);
    mac2(&accum1, &aa[0], &aa[1]);
    mac2(&accum0, &a[4],  &a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[1] = ((uint64_t)(accum0)) & mask;
    c[5] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 60;
    accum1 >>= 60;

    /* Column 2. */
    accum2 = widemul(&aa[3], &aa[3]);
    msb(&accum0, &a[3], &a[3]);
    mac(&accum1, &a[7], &a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2 = widemul2(&a[0], &a[2]);
    mac2(&accum1, &aa[0], &aa[2]);
    mac2(&accum0, &a[4],  &a[6]);

    mac(&accum2, &a[1],  &a[1]);
    mac(&accum1, &aa[1], &aa[1]);
    mac(&accum0, &a[5],  &a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[2] = ((uint64_t)(accum0)) & mask;
    c[6] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 60;
    accum1 >>= 60;

    /* Propagate the column-2 carries into the limbs stored first. */
    accum0 += c[3];
    accum1 += c[7];
    c[3] = ((uint64_t)(accum0)) & mask;
    c[7] = ((uint64_t)(accum1)) & mask;

    /* we could almost stop here, but it wouldn't be stable, so... */
    accum0 >>= 60;
    accum1 >>= 60;
    /* Limb-3 carry goes to limb 4; limb-7 carry wraps to limbs 0 and 4. */
    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
    c[0] += ((uint64_t)(accum1));
}