diff --git a/HISTORY.txt b/HISTORY.txt
index 1f301e9..3443d68 100644
--- a/HISTORY.txt
+++ b/HISTORY.txt
@@ -1,3 +1,23 @@
+August 4, 2014:
+    Experiments and bug fixes.
+
+    Add really_memset = memset_s (except not actually, because I'm setting -std=c99),
+    thanks David Leon Gil. I think I put it in the right places.
+
+    Try to work around what I think is a compiler bug in GCC -O3 on non-AVX
+    platforms. I can't seem to work around it at -Os, so I'm just flagging
+    a warning (-Werror makes it an error) for now. Will take more
+    investigation. Thanks Samuel Neves.
+
+    Added an experimental (not ready yet!) ARM NEON implementation in
+    arch_neon_experimental. This implementation seems to work, but needs
+    more testing. It is currently asm-heavy and not GCC clean. I am
+    planning to add a flag for it to use intrinsics instead of asm;
+    currently the intrinsics are commented out. On clang this does ECDH
+    in 1850kcy on my BeagleBone Black, comparable to Curve41417. Once this
+    is ready, I will probably move it to arch_neon proper, since arch_neon
+    isn't particularly tuned.
+
 July 11, 2014:
     This is mostly a cleanup release.
diff --git a/Makefile b/Makefile
index 7050e90..aac33e4 100644
--- a/Makefile
+++ b/Makefile
@@ -22,7 +22,7 @@ endif
 WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
-	-Wmissing-declarations -Wunused-function $(EXWARN)
+	-Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)
 INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH)
@@ -36,8 +36,8 @@ ARCHFLAGS += -mfpu=neon
 else
 ARCHFLAGS += -mfpu=vfpv3-d16
 endif
-ARCHFLAGS += -mcpu=cortex-a9 # FIXME
-GENFLAGS = -DN_TESTS_BASE=1000 # sooooo sloooooow
+ARCHFLAGS += -mcpu=cortex-a8 # FIXME
+GENFLAGS += -DN_TESTS_BASE=1000 # sooooo sloooooow
 else
 ARCHFLAGS += -maes -mavx2 -mbmi2 #TODO
 endif
diff --git a/README.txt b/README.txt
index c73c88a..6a53bea 100644
--- a/README.txt
+++ b/README.txt
@@ -13,7 +13,7 @@ game protection system out of Stanford, and are (c) 2011 Stanford University.
 All of these files are usable under the MIT license contained in LICENSE.txt.
-The Makefile is set for my 2013 MacBook Air. You can `make runbench` to run
+The Makefile is set for my 2013 MacBook Air. You can `make bench` to run
 a completely arbitrary set of benchmarks and tests, or `make build/goldilocks.so`
 to build a stripped-down version of the library.
For non-Haswell platforms, you need to replace -mavx2 -mbmi2 by an appropriate diff --git a/src/arch_neon/p448.c b/src/arch_neon/p448.c index fe69639..ac0c051 100644 --- a/src/arch_neon/p448.c +++ b/src/arch_neon/p448.c @@ -39,7 +39,7 @@ xx_vaddup_s64(int64x2_t x) { #include "neon_emulation.h" #endif /* ARM_NEON */ -static inline void __attribute__((gnu_inline,always_inline)) +static inline void __attribute__((gnu_inline,always_inline,unused)) smlal ( uint64_t *acc, const uint32_t a, @@ -48,7 +48,7 @@ smlal ( *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b; } -static inline void __attribute__((gnu_inline,always_inline)) +static inline void __attribute__((gnu_inline,always_inline,unused)) smlal2 ( uint64_t *acc, const uint32_t a, @@ -57,7 +57,7 @@ smlal2 ( *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; } -static inline void __attribute__((gnu_inline,always_inline)) +static inline void __attribute__((gnu_inline,always_inline,unused)) smull ( uint64_t *acc, const uint32_t a, @@ -66,7 +66,7 @@ smull ( *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b; } -static inline void __attribute__((gnu_inline,always_inline)) +static inline void __attribute__((gnu_inline,always_inline,unused)) smull2 ( uint64_t *acc, const uint32_t a, @@ -84,6 +84,7 @@ p448_mul ( const uint32_t *a = as->limb, *b = bs->limb; uint32_t *c = cs->limb; + const int32x2_t *val = (const int32x2_t *)a, *vbl = (const int32x2_t *)b, @@ -109,155 +110,170 @@ p448_mul ( accumx0a = vmull_lane_s32( delta = val[1] + vah[1], vbh[3], 0); accumx1a = vmull_lane_s32( delta, vbh[3], 1); - accumx2a = vmull_lane_s32( delta = val[2] + vah[2], vbh[3], 0); - accumx3a = vmull_lane_s32( delta, vbh[3], 1); - accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[2], 0); + accumx0a = vmlal_lane_s32(accumx0a, delta = val[2] + vah[2], vbh[2], 0); accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[2], 1); - accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0); - accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1); - accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[1], 0); + accumx0a = vmlal_lane_s32(accumx0a, delta = val[3] + vah[3], vbh[1], 0); accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[1], 1); - accumx2b = vmull_lane_s32( delta = val[0] + vah[0], vbh[1], 0); - accumx3b = vmull_lane_s32( delta, vbh[1], 1); - accumx0b = vmull_lane_s32( delta, vbh[0], 0); + accumx0b = vmull_lane_s32( delta = val[0] + vah[0], vbh[0], 0); accumx1b = vmull_lane_s32( delta, vbh[0], 1); - accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0); - accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1); accumx0b = vmlal_lane_s32(accumx0b, vah[1], vbl[3], 0); accumx1b = vmlal_lane_s32(accumx1b, vah[1], vbl[3], 1); - accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0); - accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1); accumx0b = vmlal_lane_s32(accumx0b, vah[2], vbl[2], 0); accumx1b = vmlal_lane_s32(accumx1b, vah[2], vbl[2], 1); - accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0); - accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1); accumx0b = vmlal_lane_s32(accumx0b, vah[3], vbl[1], 0); accumx1b = vmlal_lane_s32(accumx1b, vah[3], vbl[1], 1); - accumx2b += accumx2a; - accumx3b += accumx3a; - accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0); - accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1); accumx0b += accumx0a; accumx1b += accumx1a; accumx0a = vmlal_lane_s32(accumx0a, vah[0], vbl[0], 0); accumx1a = vmlal_lane_s32(accumx1a, vah[0], vbl[0], 1); - accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0); - accumx3a = 
vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1); accumx0a = vmlal_lane_s32(accumx0a, val[1], delta = vbl[3] - vbh[3], 0); accumx1a = vmlal_lane_s32(accumx1a, val[1], delta, 1); - accumx2a = vmlal_lane_s32(accumx2a, val[2], delta, 0); - accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1); accumx0a = vmlal_lane_s32(accumx0a, val[2], delta = vbl[2] - vbh[2], 0); accumx1a = vmlal_lane_s32(accumx1a, val[2], delta, 1); - accumx2a = vmlal_lane_s32(accumx2a, val[3], delta, 0); - accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1); accumx0a = vmlal_lane_s32(accumx0a, val[3], delta = vbl[1] - vbh[1], 0); accumx1a = vmlal_lane_s32(accumx1a, val[3], delta, 1); - accumx2a += accumx2b; - accumx3a += accumx3b; - accumx2b = vmlal_lane_s32(accumx2b, val[0], delta, 0); - accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1); accumx0a += accumx0b; accumx1a += accumx1b; accumx0b = vmlal_lane_s32(accumx0b, val[0], delta = vbl[0] - vbh[0], 0); accumx1b = vmlal_lane_s32(accumx1b, val[0], delta, 1); - accumx2b = vmlal_lane_s32(accumx2b, val[1], delta, 0); - accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1); xx_vtrnq_s64(&accumx0a, &accumx0b); xx_vtrnq_s64(&accumx1a, &accumx1b); - xx_vtrnq_s64(&accumx2a, &accumx2b); - xx_vtrnq_s64(&accumx3a, &accumx3b); accumx0b += accumx1a; accumx0b = vsraq_n_s64(accumx0b,accumx0a,28); accumx1b = vsraq_n_s64(accumx1b,accumx0b,28); + trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b)); + vcl[0] = trn_res.val[1] & vmask; + vch[0] = trn_res.val[0] & vmask; + + + + + accumx2a = vmull_lane_s32( delta = val[2] + vah[2], vbh[3], 0); + accumx3a = vmull_lane_s32( delta, vbh[3], 1); + accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0); + accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1); + accumx2b = vmull_lane_s32( delta = val[0] + vah[0], vbh[1], 0); + accumx3b = vmull_lane_s32( delta, vbh[1], 1); + accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0); + accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1); + accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0); + accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1); + accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0); + accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1); + accumx2b += accumx2a; + accumx3b += accumx3a; + accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0); + accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1); + accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0); + accumx3a = vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1); + accumx2a = vmlal_lane_s32(accumx2a, val[2], delta = vbl[3] - vbh[3], 0); + accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1); + accumx2a = vmlal_lane_s32(accumx2a, val[3], delta = vbl[2] - vbh[2], 0); + accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1); + accumx2a += accumx2b; + accumx3a += accumx3b; + accumx2b = vmlal_lane_s32(accumx2b, val[0], delta = vbl[1] - vbh[1], 0); + accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1); + accumx2b = vmlal_lane_s32(accumx2b, val[1], delta = vbl[0] - vbh[0], 0); + accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1); + xx_vtrnq_s64(&accumx2a, &accumx2b); + xx_vtrnq_s64(&accumx3a, &accumx3b); accumx2a += accumx1b; accumx2b += accumx3a; accumx2b = vsraq_n_s64(accumx2b,accumx2a,28); accumx3b = vsraq_n_s64(accumx3b,accumx2b,28); - trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b)); - vcl[0] = trn_res.val[1] & vmask; - vch[0] = trn_res.val[0] & vmask; trn_res = vtrn_s32(vmovn_s64(accumx2a), vmovn_s64(accumx2b)); vcl[1] = trn_res.val[1] & vmask; vch[1] = trn_res.val[0] 
& vmask; carry = accumx3b; + + + accumx4a = vmull_lane_s32( delta = val[3] + vah[3], vbh[3], 0); accumx5a = vmull_lane_s32( delta, vbh[3], 1); - accumx6b = vmull_lane_s32( delta = val[0] + vah[0], vbh[3], 0); - accumx7b = vmull_lane_s32( delta, vbh[3], 1); accumx4b = accumx4a; accumx5b = accumx5a; - accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[2], 0); + accumx4b = vmlal_lane_s32(accumx4b, delta = val[0] + vah[0], vbh[2], 0); accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[2], 1); - accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0); - accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1); - accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[1], 0); + accumx4b = vmlal_lane_s32(accumx4b, delta = val[1] + vah[1], vbh[1], 0); accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[1], 1); - accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0); - accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1); - accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[0], 0); + accumx4b = vmlal_lane_s32(accumx4b, delta = val[2] + vah[2], vbh[0], 0); accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[0], 1); - accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0); - accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[0], 1); accumx4b = vmlal_lane_s32(accumx4b, vah[3], vbl[3], 0); accumx5b = vmlal_lane_s32(accumx5b, vah[3], vbl[3], 1); - accumx6a = accumx6b; - accumx7a = accumx7b; - accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0); - accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1); accumx4a += accumx4b; accumx5a += accumx5b; accumx4a = vmlal_lane_s32(accumx4a, vah[0], vbl[2], 0); accumx5a = vmlal_lane_s32(accumx5a, vah[0], vbl[2], 1); - accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0); - accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1); accumx4a = vmlal_lane_s32(accumx4a, vah[1], vbl[1], 0); accumx5a = vmlal_lane_s32(accumx5a, vah[1], vbl[1], 1); - accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0); - accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1); accumx4a = vmlal_lane_s32(accumx4a, vah[2], vbl[0], 0); accumx5a = vmlal_lane_s32(accumx5a, vah[2], vbl[0], 1); - accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0); - accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1); accumx4a = vmlal_lane_s32(accumx4a, val[3], delta = vbl[3] - vbh[3], 0); accumx5a = vmlal_lane_s32(accumx5a, val[3], delta, 1); /**/ - accumx6b = vmlal_lane_s32(accumx6b, val[0], delta, 0); - accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1); accumx4b = vmlal_lane_s32(accumx4b, val[0], delta = vbl[2] - vbh[2], 0); accumx5b = vmlal_lane_s32(accumx5b, val[0], delta, 1); - accumx6b = vmlal_lane_s32(accumx6b, val[1], delta, 0); - accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1); accumx4b = vmlal_lane_s32(accumx4b, val[1], delta = vbl[1] - vbh[1], 0); accumx5b = vmlal_lane_s32(accumx5b, val[1], delta, 1); - accumx6b = vmlal_lane_s32(accumx6b, val[2], delta, 0); - accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1); accumx4b = vmlal_lane_s32(accumx4b, val[2], delta = vbl[0] - vbh[0], 0); accumx5b = vmlal_lane_s32(accumx5b, val[2], delta, 1); - accumx6b = vmlal_lane_s32(accumx6b, val[3], delta, 0); - accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1); xx_vtrnq_s64(&accumx4a, &accumx4b); xx_vtrnq_s64(&accumx5a, &accumx5b); - xx_vtrnq_s64(&accumx6a, &accumx6b); - xx_vtrnq_s64(&accumx7a, &accumx7b); accumx4a += carry; accumx4b += accumx5a; accumx4b = vsraq_n_s64(accumx4b,accumx4a,28); accumx5b = vsraq_n_s64(accumx5b,accumx4b,28); - accumx6a += accumx5b; - accumx6b += 
accumx7a; trn_res = vtrn_s32(vmovn_s64(accumx4a), vmovn_s64(accumx4b)); vcl[2] = trn_res.val[1] & vmask; vch[2] = trn_res.val[0] & vmask; + + + + + accumx6b = vmull_lane_s32( delta = val[0] + vah[0], vbh[3], 0); + accumx7b = vmull_lane_s32( delta, vbh[3], 1); + accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0); + accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1); + accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0); + accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1); + accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0); + accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[0], 1); + accumx6a = accumx6b; + accumx7a = accumx7b; + accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0); + accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1); + accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0); + accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1); + accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0); + accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1); + accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0); + accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1); + /**/ + accumx6b = vmlal_lane_s32(accumx6b, val[0], delta = vbl[3] - vbh[3], 0); + accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1); + accumx6b = vmlal_lane_s32(accumx6b, val[1], delta = vbl[2] - vbh[2], 0); + accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1); + accumx6b = vmlal_lane_s32(accumx6b, val[2], delta = vbl[1] - vbh[1], 0); + accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1); + accumx6b = vmlal_lane_s32(accumx6b, val[3], delta = vbl[0] - vbh[0], 0); + accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1); + + xx_vtrnq_s64(&accumx6a, &accumx6b); + xx_vtrnq_s64(&accumx7a, &accumx7b); + accumx6a += accumx5b; + accumx6b += accumx7a; + accumx6b = vsraq_n_s64(accumx6b,accumx6a,28); accumx7b = vsraq_n_s64(accumx7b,accumx6b,28); trn_res = vtrn_s32(vmovn_s64(accumx6a), vmovn_s64(accumx6b)); vcl[3] = trn_res.val[1] & vmask; vch[3] = trn_res.val[0] & vmask; + accumx7b = xx_vaddup_s64(accumx7b); int32x2_t t0 = vcl[0], t1 = vch[0]; diff --git a/src/arch_neon_experimental/ec_point.c b/src/arch_neon_experimental/ec_point.c new file mode 100644 index 0000000..47c325c --- /dev/null +++ b/src/arch_neon_experimental/ec_point.c @@ -0,0 +1,962 @@ +/** + * @cond internal + * @file ec_point.c + * @copyright + * Copyright (c) 2014 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. + * @author Mike Hamburg + * @warning This file was automatically generated. 
+ */ + +#include "ec_point.h" + + +void +p448_isr ( + struct p448_t* a, + const struct p448_t* x +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L1, x ); + p448_mul ( &L2, x, &L1 ); + p448_sqr ( &L1, &L2 ); + p448_mul ( &L2, x, &L1 ); + p448_sqrn ( &L1, &L2, 3 ); + p448_mul ( &L0, &L2, &L1 ); + p448_sqrn ( &L1, &L0, 3 ); + p448_mul ( &L0, &L2, &L1 ); + p448_sqrn ( &L2, &L0, 9 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sqr ( &L0, &L1 ); + p448_mul ( &L2, x, &L0 ); + p448_sqrn ( &L0, &L2, 18 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqrn ( &L0, &L2, 37 ); + p448_mul ( &L1, &L2, &L0 ); + p448_sqrn ( &L0, &L1, 37 ); + p448_mul ( &L1, &L2, &L0 ); + p448_sqrn ( &L0, &L1, 111 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqr ( &L0, &L2 ); + p448_mul ( &L1, x, &L0 ); + p448_sqrn ( &L0, &L1, 223 ); + p448_mul ( a, &L2, &L0 ); +} + +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +) { + struct p448_t L0, L1; + p448_isr ( &L0, x ); + p448_sqr ( &L1, &L0 ); + p448_sqr ( &L0, &L1 ); + p448_mul ( a, x, &L0 ); +} + +void +add_tw_niels_to_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &e->a, &L1 ); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->b, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_weak_reduce( &d->t ); + p448_sub ( &d->y, &d->z, &d->x ); + p448_bias ( &d->y, 2 ); + p448_weak_reduce( &d->y ); + p448_add ( &L0, &d->x, &d->z ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +sub_tw_niels_from_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &e->b, &L1 ); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->a, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_weak_reduce( &d->t ); + p448_add ( &d->y, &d->x, &d->z ); + p448_sub ( &L0, &d->z, &d->x ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +add_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + add_tw_niels_to_tw_extensible( e, &a->n ); +} + +void +sub_tw_pniels_from_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + sub_tw_niels_from_tw_extensible( e, &a->n ); +} + +void +double_tw_extensible ( + struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &a->u, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &L1, &a->t ); + p448_sub ( &a->t, &L1, &a->u ); + p448_bias ( &a->t, 3 ); + p448_weak_reduce( &a->t ); + p448_sub ( &L1, &L0, &L2 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 1 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->z, &L1, 
&L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +double_extensible ( + struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &L1, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &a->u, &a->t ); + p448_sub ( &a->t, &a->u, &L1 ); + p448_bias ( &a->t, 3 ); + p448_weak_reduce( &a->t ); + p448_sub ( &a->u, &L0, &L2 ); + p448_bias ( &a->u, 2 ); + p448_weak_reduce( &a->u ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 2 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->z, &L1, &L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +twist_and_double ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &b->u, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &L0, &b->t ); + p448_sub ( &b->t, &L0, &b->u ); + p448_bias ( &b->t, 3 ); + p448_weak_reduce( &b->t ); + p448_sub ( &L0, &b->z, &b->x ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 2 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +untwist_and_double ( + struct extensible_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &L0, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &b->u, &b->t ); + p448_sub ( &b->t, &b->u, &L0 ); + p448_bias ( &b->t, 3 ); + p448_weak_reduce( &b->t ); + p448_sub ( &b->u, &b->z, &b->x ); + p448_bias ( &b->u, 2 ); + p448_weak_reduce( &b->u ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 1 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +convert_tw_affine_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_affine_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( &b->n.c, &a->y, &a->x ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_set_ui( &b->z, 2 ); +} + +void +convert_tw_affine_to_tw_extensible ( + struct tw_extensible_t* b, + const struct tw_affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_affine_to_extensible ( + struct extensible_t* b, + const struct affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_tw_extensible_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_extensible_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( &b->n.c, &a->u, &a->t ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + 
p448_weak_reduce( &b->n.c ); + p448_add ( &b->z, &a->z, &a->z ); + p448_weak_reduce( &b->z ); +} + +void +convert_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* d +) { + p448_add ( &e->u, &d->n.b, &d->n.a ); + p448_sub ( &e->t, &d->n.b, &d->n.a ); + p448_bias ( &e->t, 2 ); + p448_weak_reduce( &e->t ); + p448_mul ( &e->x, &d->z, &e->t ); + p448_mul ( &e->y, &d->z, &e->u ); + p448_sqr ( &e->z, &d->z ); +} + +void +convert_tw_niels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_niels_t* d +) { + p448_add ( &e->y, &d->b, &d->a ); + p448_weak_reduce( &e->y ); + p448_sub ( &e->x, &d->b, &d->a ); + p448_bias ( &e->x, 2 ); + p448_weak_reduce( &e->x ); + p448_set_ui( &e->z, 1 ); + p448_copy ( &e->t, &e->x ); + p448_copy ( &e->u, &e->y ); +} + +void +montgomery_step ( + struct montgomery_t* a +) { + struct p448_t L0, L1; + p448_add ( &L0, &a->zd, &a->xd ); + p448_sub ( &L1, &a->xd, &a->zd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_sub ( &a->zd, &a->xa, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_weak_reduce( &a->zd ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_add ( &a->zd, &a->za, &a->xa ); + p448_mul ( &a->za, &L1, &a->zd ); + p448_add ( &a->xa, &a->za, &a->xd ); + p448_sqr ( &a->zd, &a->xa ); + p448_mul ( &a->xa, &a->z0, &a->zd ); + p448_sub ( &a->zd, &a->xd, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_weak_reduce( &a->zd ); + p448_sqr ( &a->za, &a->zd ); + p448_sqr ( &a->xd, &L0 ); + p448_sqr ( &L0, &L1 ); + p448_mulw ( &a->zd, &a->xd, 39082 ); + p448_sub ( &L1, &a->xd, &L0 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_sub ( &L0, &a->zd, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->zd, &L0, &L1 ); +} + +void +deserialize_montgomery ( + struct montgomery_t* a, + const struct p448_t* sbz +) { + p448_sqr ( &a->z0, sbz ); + p448_set_ui( &a->xd, 1 ); + p448_set_ui( &a->zd, 0 ); + p448_set_ui( &a->xa, 1 ); + p448_copy ( &a->za, &a->z0 ); +} + +mask_t +serialize_montgomery ( + struct p448_t* b, + const struct montgomery_t* a, + const struct p448_t* sbz +) { + mask_t L4, L5, L6; + struct p448_t L0, L1, L2, L3; + p448_mul ( &L3, &a->z0, &a->zd ); + p448_sub ( &L1, &L3, &a->xd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L3, &a->za, &L1 ); + p448_mul ( &L2, &a->z0, &a->xd ); + p448_sub ( &L1, &L2, &a->zd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &a->xa, &L1 ); + p448_add ( &L2, &L0, &L3 ); + p448_sub ( &L1, &L3, &L0 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L3, &L1, &L2 ); + p448_copy ( &L2, &a->z0 ); + p448_addw ( &L2, 1 ); + p448_sqr ( &L1, &L2 ); + p448_mulw ( &L2, &L1, 39082 ); + p448_neg ( &L1, &L2 ); + p448_add ( &L2, &a->z0, &a->z0 ); + p448_bias ( &L2, 1 ); + p448_add ( &L0, &L2, &L2 ); + p448_add ( &L2, &L0, &L1 ); + p448_weak_reduce( &L2 ); + p448_mul ( &L0, &a->xd, &L2 ); + L5 = p448_is_zero( &a->zd ); + L6 = - L5; + p448_mask ( &L1, &L0, L5 ); + p448_add ( &L2, &L1, &a->zd ); + L4 = ~ L5; + p448_mul ( &L1, sbz, &L3 ); + p448_addw ( &L1, L6 ); + p448_mul ( &L3, &L2, &L1 ); + p448_mul ( &L1, &L3, &L2 ); + p448_mul ( &L2, &L3, &a->xd ); + p448_mul ( &L3, &L1, &L2 ); + p448_isr ( &L0, &L3 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, &L3, &L1 ); + p448_mask ( b, &L2, L4 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + L5 = p448_is_zero( &L0 ); + L4 = p448_is_zero( sbz ); + return L5 | L4; +} + +void +serialize_extensible ( + struct 
p448_t* b, + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sub ( &L0, &a->y, &a->z ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_add ( b, &a->z, &a->y ); + p448_mul ( &L1, &a->z, &a->x ); + p448_mul ( &L2, &L0, &L1 ); + p448_mul ( &L1, &L2, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( &L2, &L1, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( b, &L1, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, &L2, &L1 ); +} + +void +untwist_and_double_and_serialize ( + struct p448_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_mul ( &L3, &a->y, &a->x ); + p448_add ( b, &a->y, &a->x ); + p448_sqr ( &L1, b ); + p448_add ( &L2, &L3, &L3 ); + p448_sub ( b, &L1, &L2 ); + p448_bias ( b, 3 ); + p448_weak_reduce( b ); + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &L2 ); + p448_add ( &L2, b, b ); + p448_mulw ( b, &L2, 39082 ); + p448_neg ( &L2, b ); + p448_bias ( &L2, 2 ); + p448_mulw ( &L0, &L2, 39082 ); + p448_neg ( b, &L0 ); + p448_bias ( b, 2 ); + p448_mul ( &L0, &L2, &L1 ); + p448_mul ( &L2, b, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( &L1, b, &L0 ); + p448_sqr ( b, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( b, &L1, &L3 ); +} + +void +twist_even ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L0, L1; + p448_sqr ( &b->y, &a->z ); + p448_sqr ( &b->z, &a->x ); + p448_sub ( &b->u, &b->y, &b->z ); + p448_bias ( &b->u, 2 ); + p448_weak_reduce( &b->u ); + p448_sub ( &b->z, &a->z, &a->x ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->y, &b->z, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->x, &b->z, &b->y ); + p448_mul ( &b->t, &b->x, &b->u ); + p448_mul ( &b->y, &b->x, &b->t ); + p448_isr ( &b->t, &b->y ); + p448_mul ( &b->u, &b->x, &b->t ); + p448_sqr ( &b->x, &b->t ); + p448_mul ( &b->t, &b->y, &b->x ); + p448_mul ( &b->x, &a->x, &b->u ); + p448_mul ( &b->y, &a->y, &b->u ); + L1 = p448_is_zero( &b->z ); + L0 = - L1; + p448_addw ( &b->y, L0 ); + p448_weak_reduce( &b->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +void +test_only_twist ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L2, L3; + struct p448_t L0, L1; + p448_sqr ( &b->u, &a->z ); + p448_sqr ( &b->y, &a->x ); + p448_sub ( &b->z, &b->u, &b->y ); + p448_bias ( &b->z, 2 ); + p448_add ( &b->y, &b->z, &b->z ); + p448_add ( &b->u, &b->y, &b->y ); + p448_weak_reduce( &b->u ); + p448_sub ( &b->y, &a->z, &a->x ); + p448_bias ( &b->y, 2 ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->x, &b->y, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->t, &b->z, &b->x ); + p448_mul ( &L1, &b->t, &b->u ); + p448_mul ( &b->x, &b->t, &L1 ); + p448_isr ( &L0, &b->x ); + p448_mul ( &b->u, &b->t, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &b->t, &b->x, &L1 ); + p448_add ( &L1, &a->y, &a->x ); + p448_weak_reduce( &L1 ); + p448_sub ( &L0, &a->x, &a->y ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &b->x, &b->t, &L0 ); + p448_add ( &L0, &b->x, &L1 ); + p448_sub ( &b->t, &L1, &b->x ); + p448_bias ( &b->t, 2 ); + p448_weak_reduce( &b->t ); + p448_mul ( &b->x, &L0, &b->u ); + L2 = p448_is_zero( &b->y ); + L3 = - L2; + p448_addw ( &b->x, L3 ); + p448_weak_reduce( &b->x ); + p448_mul ( &b->y, &b->t, &b->u ); + L2 = p448_is_zero( &b->z ); + L3 = - L2; + p448_addw ( &b->y, L3 ); + p448_weak_reduce( &b->y ); + L3 = 
p448_is_zero( &a->y ); + L2 = L3 + 1; + p448_set_ui( &b->z, L2 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +mask_t +is_square ( + const struct p448_t* x +) { + mask_t L2, L3; + struct p448_t L0, L1; + p448_isr ( &L0, x ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, x, &L1 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + L3 = p448_is_zero( &L0 ); + L2 = p448_is_zero( x ); + return L3 | L2; +} + +mask_t +is_even_pt ( + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +is_even_tw ( + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_add ( &L0, &L1, &L2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +deserialize_affine ( + struct affine_t* a, + const struct p448_t* sz +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L1, sz ); + p448_copy ( &L3, &L1 ); + p448_addw ( &L3, 1 ); + p448_sqr ( &a->x, &L3 ); + p448_mulw ( &L3, &a->x, 39082 ); + p448_neg ( &a->x, &L3 ); + p448_add ( &L3, &L1, &L1 ); + p448_bias ( &L3, 1 ); + p448_add ( &a->y, &L3, &L3 ); + p448_add ( &L3, &a->y, &a->x ); + p448_weak_reduce( &L3 ); + p448_copy ( &a->y, &L1 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &a->x, &a->y ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_mul ( &a->y, &a->x, &L3 ); + p448_sqr ( &L2, &a->x ); + p448_mul ( &L0, &L2, &a->y ); + p448_mul ( &a->y, &a->x, &L0 ); + p448_isr ( &L3, &a->y ); + p448_mul ( &a->y, &L2, &L3 ); + p448_sqr ( &L2, &L3 ); + p448_mul ( &L3, &L0, &L2 ); + p448_mul ( &L0, &a->x, &L3 ); + p448_add ( &L2, &a->y, &a->y ); + p448_mul ( &a->x, sz, &L2 ); + p448_addw ( &L1, 1 ); + p448_mul ( &a->y, &L1, &L3 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + return p448_is_zero( &L0 ); +} + +mask_t +deserialize_and_twist_approx ( + struct tw_extensible_t* a, + const struct p448_t* sdm1, + const struct p448_t* sz +) { + struct p448_t L0, L1; + p448_sqr ( &a->z, sz ); + p448_copy ( &a->y, &a->z ); + p448_addw ( &a->y, 1 ); + p448_sqr ( &a->x, &a->y ); + p448_mulw ( &a->y, &a->x, 39082 ); + p448_neg ( &a->x, &a->y ); + p448_add ( &a->y, &a->z, &a->z ); + p448_bias ( &a->y, 1 ); + p448_add ( &a->u, &a->y, &a->y ); + p448_add ( &a->y, &a->u, &a->x ); + p448_weak_reduce( &a->y ); + p448_sqr ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &a->u, &a->x ); + p448_bias ( &a->u, 2 ); + p448_weak_reduce( &a->u ); + p448_mul ( &a->x, sdm1, &a->u ); + p448_mul ( &L0, &a->x, &a->y ); + p448_mul ( &a->t, &L0, &a->y ); + p448_mul ( &a->u, &a->x, &a->t ); + p448_mul ( &a->t, &a->u, &L0 ); + p448_mul ( &a->y, &a->x, &a->t ); + p448_isr ( &L0, &a->y ); + p448_mul ( &a->y, &a->u, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &a->u, &a->t, &L1 ); + p448_mul ( &a->t, &a->x, &a->u ); + p448_add ( &a->x, sz, sz ); + p448_mul ( &L0, &a->u, &a->x ); + p448_copy ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &L1, &a->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &a->x, &L1, &L0 ); + p448_mul ( &L0, &a->u, &a->y ); + p448_addw ( &a->z, 1 ); + p448_mul ( &a->y, &a->z, &L0 ); + p448_subw ( &a->t, 1 ); + p448_bias ( &a->t, 1 ); + mask_t ret = p448_is_zero( &a->t ); + p448_set_ui( &a->z, 1 ); + p448_copy ( &a->t, &a->x ); + p448_copy ( &a->u, &a->y ); + return ret; +} + +void +set_identity_extensible ( + struct extensible_t* a +) { + p448_set_ui( &a->x, 0 
); + p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_tw_extensible ( + struct tw_extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_affine ( + struct affine_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); +} + +mask_t +eq_affine ( + const struct affine_t* a, + const struct affine_t* b +) { + mask_t L1, L2; + struct p448_t L0; + p448_sub ( &L0, &a->x, &b->x ); + p448_bias ( &L0, 2 ); + L2 = p448_is_zero( &L0 ); + p448_sub ( &L0, &a->y, &b->y ); + p448_bias ( &L0, 2 ); + L1 = p448_is_zero( &L0 ); + return L2 & L1; +} + +mask_t +eq_extensible ( + const struct extensible_t* a, + const struct extensible_t* b +) { + mask_t L3, L4; + struct p448_t L0, L1, L2; + p448_mul ( &L2, &b->z, &a->x ); + p448_mul ( &L1, &a->z, &b->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + p448_mul ( &L2, &b->z, &a->y ); + p448_mul ( &L1, &a->z, &b->y ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L3 = p448_is_zero( &L0 ); + return L4 & L3; +} + +mask_t +eq_tw_extensible ( + const struct tw_extensible_t* a, + const struct tw_extensible_t* b +) { + mask_t L3, L4; + struct p448_t L0, L1, L2; + p448_mul ( &L2, &b->z, &a->x ); + p448_mul ( &L1, &a->z, &b->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + p448_mul ( &L2, &b->z, &a->y ); + p448_mul ( &L1, &a->z, &b->y ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L3 = p448_is_zero( &L0 ); + return L4 & L3; +} + +void +elligator_2s_inject ( + struct affine_t* a, + const struct p448_t* r +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5, L6, L7, L8; + p448_sqr ( &a->x, r ); + p448_sqr ( &L3, &a->x ); + p448_copy ( &a->y, &L3 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &L4, &a->y ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_sqr ( &L2, &L4 ); + p448_mulw ( &L7, &L2, 1527402724 ); + p448_mulw ( &L8, &L3, 6108985600 ); + p448_add ( &a->y, &L8, &L7 ); + p448_weak_reduce( &a->y ); + p448_mulw ( &L8, &L2, 6109454568 ); + p448_sub ( &L7, &a->y, &L8 ); + p448_bias ( &L7, 2 ); + p448_weak_reduce( &L7 ); + p448_mulw ( &L6, &a->y, 78160 ); + p448_mul ( &L5, &L7, &L6 ); + p448_mul ( &L8, &L5, &L4 ); + p448_mul ( &L4, &L5, &L6 ); + p448_mul ( &L5, &L7, &L8 ); + p448_mul ( &L8, &L5, &L4 ); + p448_mul ( &L4, &L7, &L8 ); + p448_isr ( &L6, &L4 ); + p448_mul ( &L4, &L5, &L6 ); + p448_sqr ( &L5, &L6 ); + p448_mul ( &L6, &L8, &L5 ); + p448_mul ( &L8, &L7, &L6 ); + p448_mul ( &L7, &L8, &L6 ); + p448_copy ( &L6, &a->x ); + p448_subw ( &L6, 1 ); + p448_addw ( &a->x, 1 ); + p448_mul ( &L5, &a->x, &L8 ); + p448_sub ( &a->x, &L6, &L5 ); + p448_bias ( &a->x, 3 ); + p448_weak_reduce( &a->x ); + p448_mul ( &L5, &L4, &a->x ); + p448_mulw ( &L4, &L5, 78160 ); + p448_neg ( &a->x, &L4 ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_add ( &L4, &L3, &L3 ); + p448_add ( &L3, &L4, &L2 ); + p448_subw ( &L3, 2 ); + p448_bias ( &L3, 1 ); + p448_weak_reduce( &L3 ); + p448_mul ( &L2, &L3, &L8 ); + p448_mulw ( &L3, &L2, 3054649120 ); + p448_add ( &L2, &L3, &a->y ); + p448_mul ( &a->y, &L7, &L2 ); + L1 = p448_is_zero( &L8 ); + L0 = - L1; + p448_addw ( &a->y, L0 ); + p448_weak_reduce( &a->y ); +} + +mask_t +validate_affine ( + const struct affine_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L0, &a->y ); + p448_sqr ( &L2, &a->x ); + p448_add ( &L3, 
&L2, &L0 ); + p448_subw ( &L3, 1 ); + p448_mulw ( &L1, &L2, 39081 ); + p448_neg ( &L2, &L1 ); + p448_bias ( &L2, 2 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sub ( &L0, &L3, &L1 ); + p448_bias ( &L0, 3 ); + return p448_is_zero( &L0 ); +} + +mask_t +validate_tw_extensible ( + const struct tw_extensible_t* ext +) { + mask_t L4, L5; + struct p448_t L0, L1, L2, L3; + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L1, &ext->t, &ext->u ); + p448_mul ( &L2, &ext->z, &L1 ); + p448_addw ( &L2, 0 ); + p448_mul ( &L0, &ext->x, &ext->y ); + p448_neg ( &L1, &L0 ); + p448_add ( &L0, &L1, &L2 ); + p448_bias ( &L0, 2 ); + L5 = p448_is_zero( &L0 ); + /* + * Check invariant: + * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 + */ + p448_sqr ( &L2, &ext->y ); + p448_neg ( &L1, &L2 ); + p448_addw ( &L1, 0 ); + p448_sqr ( &L0, &ext->x ); + p448_add ( &L2, &L0, &L1 ); + p448_sqr ( &L3, &ext->u ); + p448_sqr ( &L0, &ext->t ); + p448_mul ( &L1, &L0, &L3 ); + p448_mulw ( &L0, &L1, 39081 ); + p448_neg ( &L3, &L0 ); + p448_add ( &L0, &L3, &L2 ); + p448_neg ( &L3, &L1 ); + p448_add ( &L2, &L3, &L0 ); + p448_sqr ( &L1, &ext->z ); + p448_add ( &L0, &L1, &L2 ); + p448_bias ( &L0, 4 ); + L4 = p448_is_zero( &L0 ); + return L5 & L4; +} + +mask_t +validate_extensible ( + const struct extensible_t* ext +) { + mask_t L4, L5; + struct p448_t L0, L1, L2, L3; + /* + * Check invariant: + * 0 = d*t^2*u^2 - x^2 - y^2 + z^2 + */ + p448_sqr ( &L2, &ext->y ); + p448_neg ( &L1, &L2 ); + p448_addw ( &L1, 0 ); + p448_sqr ( &L0, &ext->z ); + p448_add ( &L2, &L0, &L1 ); + p448_sqr ( &L3, &ext->u ); + p448_sqr ( &L0, &ext->t ); + p448_mul ( &L1, &L0, &L3 ); + p448_mulw ( &L3, &L1, 39081 ); + p448_neg ( &L0, &L3 ); + p448_add ( &L1, &L0, &L2 ); + p448_sqr ( &L0, &ext->x ); + p448_neg ( &L2, &L0 ); + p448_add ( &L0, &L2, &L1 ); + p448_bias ( &L0, 4 ); + L5 = p448_is_zero( &L0 ); + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L1, &ext->t, &ext->u ); + p448_mul ( &L2, &ext->z, &L1 ); + p448_addw ( &L2, 0 ); + p448_mul ( &L0, &ext->x, &ext->y ); + p448_neg ( &L1, &L0 ); + p448_add ( &L0, &L1, &L2 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + return L5 & L4; +} + + diff --git a/src/arch_neon_experimental/p448.c b/src/arch_neon_experimental/p448.c new file mode 100644 index 0000000..c7ee0f6 --- /dev/null +++ b/src/arch_neon_experimental/p448.c @@ -0,0 +1,1207 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ + +#include "word.h" +#include "p448.h" + +static inline mask_t __attribute__((always_inline)) +is_zero ( + word_t x +) { + dword_t xx = x; + xx--; + return xx >> WORD_BITS; +} + +static __inline__ void __attribute__((gnu_inline,always_inline,unused)) +xx_vtrnq_s64 ( + int64x2_t *x, + int64x2_t *y +) { + __asm__ __volatile__ ("vswp %f0, %e1" : "+w"(*x), "+w"(*y)); +} + +static __inline__ int64x2_t __attribute__((gnu_inline,always_inline,unused)) +xx_vaddup_s64(int64x2_t x) { + __asm__ ("vadd.s64 %f0, %e0" : "+w"(x)); + return x; +} + +static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused)) +xx_vaddup_u64(uint64x2_t x) { + __asm__ ("vadd.s64 %f0, %e0" : "+w"(x)); + return x; +} + +static __inline__ int64x2_t __attribute__((gnu_inline,always_inline,unused)) +vrev128_s64(int64x2_t x) { + __asm__ ("vswp.s64 %e0, %f0" : "+w"(x)); + return x; +} + +static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline)) +vrev128_u64(uint64x2_t x) { + __asm__ ("vswp.s64 %e0, %f0" : "+w"(x)); + return x; +} + +static inline void __attribute__((gnu_inline,always_inline,unused)) +smlal ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b; +} + +static inline void __attribute__((gnu_inline,always_inline,unused)) +smlal2 ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; +} + +static inline void __attribute__((gnu_inline,always_inline,unused)) +smull ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b; +} + +static inline void __attribute__((gnu_inline,always_inline,unused)) +smull2 ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; +} + +static inline int64x2_t __attribute__((always_inline,unused)) +SER(int64x2_t x) { + __asm__ __volatile__("" : "+w"(x)); + return x; +} +#define svmull_lane_s32(a,b,c) SER(vmull_lane_s32(a,b,c)) +#define svmlal_s32(a,b,c) SER(vmlal_s32(a,b,c)) +#define svmlal_lane_s32(a,b,c,d) SER(vmlal_lane_s32(a,b,c,d)) + + +// static inline int64x2_t __attribute__((always_inline,unused)) +// xvmlal_lane_s32(int64x2_t acc, int32x2_t a, int32x2_t b, const int lane) { +// __asm__ volatile ( +// "vmlal.s32 %0, %1, %2[%c3]" +// : "+w"(acc) +// : "w"(a), "w"(b), "i"(lane) +// ); +// return acc; +// } + +void +p448_mul ( + p448_t *__restrict__ cs, + const p448_t *as, + const p448_t *bs +) { + register int32x4_t al0 __asm__("q6"); + register int32x4_t ah0 __asm__("q7"); + register int32x4_t as0 __asm__("q8"); + register int32x4_t al2 __asm__("q9"); + register int32x4_t ah2 __asm__("q10"); + register int32x4_t as2 __asm__("q11"); + + register int32x4_t bl0 __asm__("q0"); + register int32x4_t bh0 __asm__("q1"); + register int32x4_t bs0 __asm__("q2"); + register int32x4_t bl2 __asm__("q3"); + register int32x4_t bh2 __asm__("q4"); + register int32x4_t bs2 __asm__("q5"); + + int32x2_t *vc = (int32x2_t*) cs->limb, *vcasm = vc; + + register int64x2_t acc0a __asm__("q12"); + register int64x2_t acc0b __asm__("q13"); + register int64x2_t acc1a __asm__("q14"); + register int64x2_t acc1b __asm__("q15"); + + __asm__ __volatile__( + + "vld2.32 {%e[al0],%f[al0],%e[ah0],%f[ah0]}, [%[a],:64]!" "\n\t" + "vadd.i32 %[as0], %[al0], %[ah0]" "\n\t" + + "vld2.32 {%e[bl0],%f[bl0],%e[bh0],%f[bh0]}, [%[b],:64]!" 
"\n\t" + "vadd.i32 %f[bs0], %f[bl0], %f[bh0]" "\n\t" + "vsub.i32 %e[bs0], %e[bl0], %e[bh0]" "\n\t" + + "vld2.32 {%e[bl2],%f[bl2],%e[bh2],%f[bh2]}, [%[b],:64]!" "\n\t" + "vadd.i32 %[bs2], %[bl2], %[bh2]" "\n\t" + + "vld2.32 {%e[al2],%f[al2],%e[ah2],%f[ah2]}, [%[a],:64]!" "\n\t" + "vadd.i32 %[as2], %[al2], %[ah2]" "\n\t" + + "vmull.s32 %[a0b], %f[as0], %f[bs2][0]" "\n\t" + "vmlal.s32 %[a0b], %e[as2], %e[bs2][0]" "\n\t" + "vmlal.s32 %[a0b], %f[as2], %f[bs0][0]" "\n\t" + "vmlal.s32 %[a0b], %e[as0], %e[bh0][0]" "\n\t" + + "vmull.s32 %[a1b], %f[as0], %f[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[as2], %e[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %f[as2], %f[bs0][1]" "\n\t" + "vmlal.s32 %[a1b], %e[as0], %e[bh0][1]" "\n\t" + + "vmov %[a0a], %[a0b]" "\n\t" + "vmlal.s32 %[a0a], %f[ah0], %f[bh2][0]" "\n\t" + "vmlal.s32 %[a0a], %e[ah2], %e[bh2][0]" "\n\t" + "vmlal.s32 %[a0a], %f[ah2], %f[bh0][0]" "\n\t" + "vmlal.s32 %[a0a], %e[ah0], %e[bl0][0]" "\n\t" + + "vmlsl.s32 %[a0b], %f[al0], %f[bl2][0]" "\n\t" + "vmlsl.s32 %[a0b], %e[al2], %e[bl2][0]" "\n\t" + "vmlsl.s32 %[a0b], %f[al2], %f[bl0][0]" "\n\t" + "vmlal.s32 %[a0b], %e[al0], %e[bs0][0]" "\n\t" + + "vmov %[a1a], %[a1b]" "\n\t" + "vmlal.s32 %[a1a], %f[ah0], %f[bh2][1]" "\n\t" + "vmlal.s32 %[a1a], %e[ah2], %e[bh2][1]" "\n\t" + "vmlal.s32 %[a1a], %f[ah2], %f[bh0][1]" "\n\t" + "vmlal.s32 %[a1a], %e[ah0], %e[bl0][1]" "\n\t" + + "vswp %f[a0b], %e[a0a]" "\n\t" + + "vmlsl.s32 %[a1b], %f[al0], %f[bl2][1]" "\n\t" + "vmlsl.s32 %[a1b], %e[al2], %e[bl2][1]" "\n\t" + "vmlsl.s32 %[a1b], %f[al2], %f[bl0][1]" "\n\t" + "vmlal.s32 %[a1b], %e[al0], %e[bs0][1]" "\n\t" + + "vsra.s64 %[a0a], %[a0b], #28" "\n\t" + "vsub.i32 %f[bs0], %f[bl0], %f[bh0]" "\n\t" + "vmovn.i64 %e[a0b], %[a0b]" "\n\t" + + "vswp %f[a1b], %e[a1a]" "\n\t" + "vadd.i64 %[a1b], %[a0a], %[a1b]" "\n\t" + + + "vmull.s32 %[a0a], %e[as2], %f[bs2][0]" "\n\t" + "vmovn.i64 %f[a0b], %[a1b]" "\n\t" + "vmlal.s32 %[a0a], %f[as2], %e[bs2][0]" "\n\t" + "vsra.s64 %[a1a], %[a1b], #28" "\n\t" + "vmlal.s32 %[a0a], %e[as0], %f[bh0][0]" "\n\t" + "vbic.i32 %[a0b], #0xf0000000" "\n\t" + "vmlal.s32 %[a0a], %f[as0], %e[bh0][0]" "\n\t" + "vstmia %[c]!, {%e[a0b], %f[a0b]}" "\n\t" + + "vmull.s32 %[a1b], %e[as2], %f[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %f[as2], %e[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[as0], %f[bh0][1]" "\n\t" + "vmlal.s32 %[a1b], %f[as0], %e[bh0][1]" "\n\t" + + "vmov %f[a0b], %f[a0a]" "\n\t" + "vadd.i64 %e[a0b], %e[a0a], %e[a1a]" "\n\t" + "vadd.i64 %e[a0a], %e[a0a], %f[a1a]" "\n\t" + "vmlal.s32 %[a0a], %e[ah2], %f[bh2][0]" "\n\t" + "vmlal.s32 %[a0a], %f[ah2], %e[bh2][0]" "\n\t" + "vmlal.s32 %[a0a], %e[ah0], %f[bl0][0]" "\n\t" + "vmlal.s32 %[a0a], %f[ah0], %e[bl0][0]" "\n\t" + + "vmlsl.s32 %[a0b], %e[al2], %f[bl2][0]" "\n\t" + "vmlsl.s32 %[a0b], %f[al2], %e[bl2][0]" "\n\t" + "vmlal.s32 %[a0b], %e[al0], %f[bs0][0]" "\n\t" + "vmlal.s32 %[a0b], %f[al0], %e[bs0][0]" "\n\t" + + "vmov %[a1a], %[a1b]" "\n\t" + "vmlal.s32 %[a1a], %e[ah2], %f[bh2][1]" "\n\t" + "vmlal.s32 %[a1a], %f[ah2], %e[bh2][1]" "\n\t" + "vmlal.s32 %[a1a], %e[ah0], %f[bl0][1]" "\n\t" + "vmlal.s32 %[a1a], %f[ah0], %e[bl0][1]" "\n\t" + + "vswp %f[a0b], %e[a0a]" "\n\t" + + "vmlsl.s32 %[a1b], %e[al2], %f[bl2][1]" "\n\t" + "vmlsl.s32 %[a1b], %f[al2], %e[bl2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[al0], %f[bs0][1]" "\n\t" + "vmlal.s32 %[a1b], %f[al0], %e[bs0][1]" "\n\t" + + "vsra.s64 %[a0a], %[a0b], #28" "\n\t" + "vsub.i32 %e[bs2], %e[bl2], %e[bh2]" "\n\t" + "vmovn.i64 %e[a0b], %[a0b]" "\n\t" + + "vswp %f[a1b], %e[a1a]" "\n\t" + "vadd.i64 %[a1b], %[a0a], %[a1b]" "\n\t" 
+ + "vmull.s32 %[a0a], %f[as2], %f[bs2][0]" "\n\t" + "vmovn.i64 %f[a0b], %[a1b]" "\n\t" + "vmlal.s32 %[a0a], %e[as0], %e[bh2][0]" "\n\t" + "vsra.s64 %[a1a], %[a1b], #28" "\n\t" + "vmlal.s32 %[a0a], %f[as0], %f[bh0][0]" "\n\t" + "vbic.i32 %[a0b], #0xf0000000" "\n\t" + "vmlal.s32 %[a0a], %e[as2], %e[bh0][0]" "\n\t" + "vstmia %[c]!, {%e[a0b], %f[a0b]}" "\n\t" + + "vmull.s32 %[a1b], %f[as2], %f[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[as0], %e[bh2][1]" "\n\t" + "vmlal.s32 %[a1b], %f[as0], %f[bh0][1]" "\n\t" + "vmlal.s32 %[a1b], %e[as2], %e[bh0][1]" "\n\t" + + "vmov %f[a0b], %f[a0a]" "\n\t" + "vadd.i64 %e[a0b], %e[a0a], %e[a1a]" "\n\t" + "vadd.i64 %e[a0a], %e[a0a], %f[a1a]" "\n\t" + "vmlal.s32 %[a0a], %f[ah2], %f[bh2][0]" "\n\t" + "vmlal.s32 %[a0a], %e[ah0], %e[bl2][0]" "\n\t" + "vmlal.s32 %[a0a], %f[ah0], %f[bl0][0]" "\n\t" + "vmlal.s32 %[a0a], %e[ah2], %e[bl0][0]" "\n\t" + + "vmlsl.s32 %[a0b], %f[al2], %f[bl2][0]" "\n\t" + "vmlal.s32 %[a0b], %e[al0], %e[bs2][0]" "\n\t" + "vmlal.s32 %[a0b], %f[al0], %f[bs0][0]" "\n\t" + "vmlal.s32 %[a0b], %e[al2], %e[bs0][0]" "\n\t" + + "vmov %[a1a], %[a1b]" "\n\t" + "vmlal.s32 %[a1a], %f[ah2], %f[bh2][1]" "\n\t" + "vmlal.s32 %[a1a], %e[ah0], %e[bl2][1]" "\n\t" + "vmlal.s32 %[a1a], %f[ah0], %f[bl0][1]" "\n\t" + "vmlal.s32 %[a1a], %e[ah2], %e[bl0][1]" "\n\t" + + "vswp %f[a0b], %e[a0a]" "\n\t" + + "vmlsl.s32 %[a1b], %f[al2], %f[bl2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[al0], %e[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %f[al0], %f[bs0][1]" "\n\t" + "vmlal.s32 %[a1b], %e[al2], %e[bs0][1]" "\n\t" + + "vsub.i32 %f[bs2], %f[bl2], %f[bh2]" "\n\t" + "vsra.s64 %[a0a], %[a0b], #28" "\n\t" + "vmovn.i64 %e[a0b], %[a0b]" "\n\t" + + "vswp %f[a1b], %e[a1a]" "\n\t" + "vadd.i64 %[a1b], %[a0a], %[a1b]" "\n\t" + + "vmull.s32 %[a0a], %e[as0], %f[bh2][0]" "\n\t" + "vmovn.i64 %f[a0b], %[a1b]" "\n\t" + "vmlal.s32 %[a0a], %f[as0], %e[bh2][0]" "\n\t" + "vsra.s64 %[a1a], %[a1b], #28" "\n\t" + "vmlal.s32 %[a0a], %e[as2], %f[bh0][0]" "\n\t" + "vbic.i32 %[a0b], #0xf0000000" "\n\t" + "vmlal.s32 %[a0a], %f[as2], %e[bh0][0]" "\n\t" + "vstmia %[c]!, {%e[a0b], %f[a0b]}" "\n\t" + + "vmull.s32 %[a1b], %e[as0], %f[bh2][1]" "\n\t" + "vmlal.s32 %[a1b], %f[as0], %e[bh2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[as2], %f[bh0][1]" "\n\t" + "vmlal.s32 %[a1b], %f[as2], %e[bh0][1]" "\n\t" + + "vmov %f[a0b], %f[a0a]" "\n\t" + "vadd.i64 %e[a0b], %e[a0a], %e[a1a]" "\n\t" + "vadd.i64 %e[a0a], %e[a0a], %f[a1a]" "\n\t" + "vmlal.s32 %[a0a], %e[ah0], %f[bl2][0]" "\n\t" + "vmlal.s32 %[a0a], %f[ah0], %e[bl2][0]" "\n\t" + "vmlal.s32 %[a0a], %e[ah2], %f[bl0][0]" "\n\t" + "vmlal.s32 %[a0a], %f[ah2], %e[bl0][0]" "\n\t" + + "vmlal.s32 %[a0b], %e[al0], %f[bs2][0]" "\n\t" + "vmlal.s32 %[a0b], %f[al0], %e[bs2][0]" "\n\t" + "vmlal.s32 %[a0b], %e[al2], %f[bs0][0]" "\n\t" + "vmlal.s32 %[a0b], %f[al2], %e[bs0][0]" "\n\t" + + "vmov %[a1a], %[a1b]" "\n\t" + "vmlal.s32 %[a1a], %e[ah0], %f[bl2][1]" "\n\t" + "vmlal.s32 %[a1a], %f[ah0], %e[bl2][1]" "\n\t" + "vmlal.s32 %[a1a], %e[ah2], %f[bl0][1]" "\n\t" + "vmlal.s32 %[a1a], %f[ah2], %e[bl0][1]" "\n\t" + + "vswp %f[a0b], %e[a0a]" "\n\t" + + "vmlal.s32 %[a1b], %e[al0], %f[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %f[al0], %e[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[al2], %f[bs0][1]" "\n\t" + "vmlal.s32 %[a1b], %f[al2], %e[bs0][1]" "\n\t" + + "vsra.s64 %[a0a], %[a0b], #28" "\n\t" + "vmovn.i64 %e[a0b], %[a0b]" "\n\t" + + "vswp %f[a1b], %e[a1a]" "\n\t" + "vadd.i64 %[a0a], %[a0a], %[a1b]" "\n\t" + + "vmovn.i64 %f[a0b], %[a0a]" "\n\t" + "vsra.s64 %[a1a], %[a0a], #28" "\n\t" + + "vbic.i32 %[a0b], #0xf0000000" 
"\n\t" + + "vswp %e[a1a], %f[a1a]" "\n\t" + + "vstmia %[c]!, {%e[a0b], %f[a0b]}" "\n\t" + "sub %[c], #64" "\n\t" + + "vadd.i64 %f[a1a], %f[a1a], %e[a1a]" "\n\t" + + "vldmia %[c], {%e[a0a], %f[a0a], %e[a0b]}" "\n\t" + "vaddw.s32 %[a1a], %e[a0a]" "\n\t" + "vmovn.i64 %e[a0a], %[a1a]" "\n\t" + "vshr.s64 %[a1a], #28" "\n\t" + + "vaddw.s32 %[a1a], %f[a0a]" "\n\t" + "vmovn.i64 %f[a0a], %[a1a]" "\n\t" + "vshr.s64 %[a1a], #28" "\n\t" + + "vbic.i32 %[a0a], #0xf0000000" "\n\t" + + "vaddw.s32 %[a1a], %e[a0b]" "\n\t" + "vmovn.i64 %e[a0b], %[a1a]" "\n\t" + + "vstmia %[c], {%e[a0a], %f[a0a], %e[a0b]}" "\n\t" + + : [a0a]"=w"(acc0a) + , [a0b]"=w"(acc0b) + , [a1a]"=w"(acc1a) + , [a1b]"=w"(acc1b) + , [a]"+r"(as) + , [b]"+r"(bs) + , [c]"+r"(vcasm) + + , [al0]"=w"(al0) + , [ah0]"=w"(ah0) + , [as0]"=w"(as0) + , [al2]"=w"(al2) + , [ah2]"=w"(ah2) + , [as2]"=w"(as2) + + , [bh0]"=w"(bh0) + , [bh2]"=w"(bh2) + + , [bl0]"=w"(bl0) + , [bl2]"=w"(bl2) + + , [bs0]"=w"(bs0) + , [bs2]"=w"(bs2) + + :: "memory" + ); + + /* + acc0b = vmull_lane_s32( as1, bs3, 0); + acc0b = vmlal_lane_s32(acc0b, as2, bs2, 0); + acc0b = vmlal_lane_s32(acc0b, as3, bs1, 0); + acc0b = vmlal_lane_s32(acc0b, as0, bh0, 0); + + acc1b = vmull_lane_s32( as1, bs3, 1); + acc1b = vmlal_lane_s32(acc1b, as2, bs2, 1); + acc1b = vmlal_lane_s32(acc1b, as3, bs1, 1); + acc1b = vmlal_lane_s32(acc1b, as0, bh0, 1); + + acc0a = acc0b; + acc0a = vmlal_lane_s32(acc0a, ah1, bh3, 0); + acc0a = vmlal_lane_s32(acc0a, ah2, bh2, 0); + acc0a = vmlal_lane_s32(acc0a, ah3, bh1, 0); + acc0a = vmlal_lane_s32(acc0a, ah0, bl0, 0); + + acc1a = acc1b; + acc1a = vmlal_lane_s32(acc1a, ah1, bh3, 1); + acc1a = vmlal_lane_s32(acc1a, ah2, bh2, 1); + acc1a = vmlal_lane_s32(acc1a, ah3, bh1, 1); + acc1a = vmlal_lane_s32(acc1a, ah0, bl0, 1); + + acc0b = vmlsl_lane_s32(acc0b, al1, bl3, 0); + acc0b = vmlsl_lane_s32(acc0b, al2, bl2, 0); + acc0b = vmlsl_lane_s32(acc0b, al3, bl1, 0); + acc0b = vmlal_lane_s32(acc0b, al0, bs0, 0); + + acc1b = vmlsl_lane_s32(acc1b, al1, bl3, 1); + acc1b = vmlsl_lane_s32(acc1b, al2, bl2, 1); + acc1b = vmlsl_lane_s32(acc1b, al3, bl1, 1); + acc1b = vmlal_lane_s32(acc1b, al0, bs0, 1); + + xx_vtrnq_s64(&acc0b, &acc0a); + xx_vtrnq_s64(&acc1b, &acc1a); + + acc0a += acc1b; + vc[0] = vmovn_s64(acc0b) & vmask; + + acc0a = vsraq_n_s64(acc0a,acc0b,28); + vc[1] = vmovn_s64(acc0a) & vmask; + bs1 = bl1 - bh1; + carry = vsraq_n_s64(acc1a,acc0a,28); + + + acc0b = vmull_lane_s32( as2, bs3, 0); + acc0b = vmlal_lane_s32(acc0b, as3, bs2, 0); + acc0b = vmlal_lane_s32(acc0b, as0, bh1, 0); + acc0b = vmlal_lane_s32(acc0b, as1, bh0, 0); + + acc1b = vmull_lane_s32( as2, bs3, 1); + acc1b = vmlal_lane_s32(acc1b, as3, bs2, 1); + acc1b = vmlal_lane_s32(acc1b, as0, bh1, 1); + acc1b = vmlal_lane_s32(acc1b, as1, bh0, 1); + + //acc0a = acc0b; + acc0a = vcombine_s64(vget_low_s64(acc0b) + vget_high_s64(carry), vget_high_s64(acc0b)); + acc0b = vcombine_s64(vget_low_s64(acc0b) + vget_low_s64(carry), vget_high_s64(acc0b)); + acc0a = vmlal_lane_s32(acc0a, ah2, bh3, 0); + acc0a = vmlal_lane_s32(acc0a, ah3, bh2, 0); + acc0a = vmlal_lane_s32(acc0a, ah0, bl1, 0); + acc0a = vmlal_lane_s32(acc0a, ah1, bl0, 0); + + acc1a = acc1b; + acc1a = vmlal_lane_s32(acc1a, ah2, bh3, 1); + acc1a = vmlal_lane_s32(acc1a, ah3, bh2, 1); + acc1a = vmlal_lane_s32(acc1a, ah0, bl1, 1); + acc1a = vmlal_lane_s32(acc1a, ah1, bl0, 1); + + acc0b = vmlsl_lane_s32(acc0b, al2, bl3, 0); + acc0b = vmlsl_lane_s32(acc0b, al3, bl2, 0); + acc0b = vmlal_lane_s32(acc0b, al0, bs1, 0); + acc0b = vmlal_lane_s32(acc0b, al1, bs0, 0); + + acc1b = 
vmlsl_lane_s32(acc1b, al2, bl3, 1); + acc1b = vmlsl_lane_s32(acc1b, al3, bl2, 1); + acc1b = vmlal_lane_s32(acc1b, al0, bs1, 1); + acc1b = vmlal_lane_s32(acc1b, al1, bs0, 1); + + xx_vtrnq_s64(&acc0b, &acc0a); + xx_vtrnq_s64(&acc1b, &acc1a); + //acc0b += carry; + + acc0a += acc1b; + acc0a = vsraq_n_s64(acc0a,acc0b,28); + + vc[2] = vmovn_s64(acc0b) & vmask; + vc[3] = vmovn_s64(acc0a) & vmask; + carry = vsraq_n_s64(acc1a,acc0a,28); + + bs2 = bl2 - bh2; + + acc0b = vmull_lane_s32( as0, bh2, 0); + acc0b = vmlal_lane_s32(acc0b, as1, bh1, 0); + acc0b = vmlal_lane_s32(acc0b, as2, bh0, 0); + acc0b = vmlal_lane_s32(acc0b, as3, bs3, 0); + + acc1b = vmull_lane_s32( as0, bh2, 1); + acc1b = vmlal_lane_s32(acc1b, as1, bh1, 1); + acc1b = vmlal_lane_s32(acc1b, as2, bh0, 1); + acc1b = vmlal_lane_s32(acc1b, as3, bs3, 1); + + //acc0a = acc0b; + acc0a = vcombine_s64(vget_low_s64(acc0b) + vget_high_s64(acc1a), vget_high_s64(acc0b)); + acc0b = vcombine_s64(vget_low_s64(acc0b) + vget_low_s64(acc1a), vget_high_s64(acc0b)); + acc0a = vmlal_lane_s32(acc0a, ah3, bh3, 0); + acc0a = vmlal_lane_s32(acc0a, ah0, bl2, 0); + acc0a = vmlal_lane_s32(acc0a, ah1, bl1, 0); + acc0a = vmlal_lane_s32(acc0a, ah2, bl0, 0); + + acc1a = acc1b; + acc1a = vmlal_lane_s32(acc1a, ah3, bh3, 1); + acc1a = vmlal_lane_s32(acc1a, ah0, bl2, 1); + acc1a = vmlal_lane_s32(acc1a, ah1, bl1, 1); + acc1a = vmlal_lane_s32(acc1a, ah2, bl0, 1); + + acc0b = vmlsl_lane_s32(acc0b, al3, bl3, 0); + acc0b = vmlal_lane_s32(acc0b, al0, bs2, 0); + acc0b = vmlal_lane_s32(acc0b, al1, bs1, 0); + acc0b = vmlal_lane_s32(acc0b, al2, bs0, 0); + + acc1b = vmlsl_lane_s32(acc1b, al3, bl3, 1); + acc1b = vmlal_lane_s32(acc1b, al0, bs2, 1); + acc1b = vmlal_lane_s32(acc1b, al1, bs1, 1); + acc1b = vmlal_lane_s32(acc1b, al2, bs0, 1); + + + __asm__ __volatile__ ("vswp %f0, %e1" : "+w"(acc0b), "+w"(acc0a)); + __asm__ __volatile__ ("vswp %f0, %e1" : "+w"(acc1b), "+w"(acc1a)); + //xx_vtrnq_s64_(acc0b, acc0a); + //xx_vtrnq_s64_(acc1b, acc1a); + + //acc0b += acc1a; + acc0a += acc1b; + acc0a = vsraq_n_s64(acc0a,acc0b,28); + + + vc[4] = vmovn_s64(acc0b) & vmask; + vc[5] = vmovn_s64(acc0a) & vmask; + + bs3 = bl3 - bh3; + acc1a = vsraq_n_s64(acc1a,acc0a,28); + + + acc0b = vmull_lane_s32( as0, bh3, 0); + acc0b = vmlal_lane_s32(acc0b, as1, bh2, 0); + acc0b = vmlal_lane_s32(acc0b, as2, bh1, 0); + acc0b = vmlal_lane_s32(acc0b, as3, bh0, 0); + + acc1b = vmull_lane_s32( as0, bh3, 1); + acc1b = vmlal_lane_s32(acc1b, as1, bh2, 1); + acc1b = vmlal_lane_s32(acc1b, as2, bh1, 1); + acc1b = vmlal_lane_s32(acc1b, as3, bh0, 1); + + //acc0a = acc0b; + acc0a = vcombine_s64(vget_low_s64(acc0b) + vget_high_s64(acc1a), vget_high_s64(acc0b)); + acc0b = vcombine_s64(vget_low_s64(acc0b) + vget_low_s64(acc1a), vget_high_s64(acc0b)); + acc0a = vmlal_lane_s32(acc0a, ah0, bl3, 0); + acc0a = vmlal_lane_s32(acc0a, ah1, bl2, 0); + acc0a = vmlal_lane_s32(acc0a, ah2, bl1, 0); + acc0a = vmlal_lane_s32(acc0a, ah3, bl0, 0); + + acc1a = acc1b; + acc1a = vmlal_lane_s32(acc1a, ah0, bl3, 1); + acc1a = vmlal_lane_s32(acc1a, ah1, bl2, 1); + acc1a = vmlal_lane_s32(acc1a, ah2, bl1, 1); + acc1a = vmlal_lane_s32(acc1a, ah3, bl0, 1); + + acc0b = vmlal_lane_s32(acc0b, al0, bs3, 0); + acc0b = vmlal_lane_s32(acc0b, al1, bs2, 0); + acc0b = vmlal_lane_s32(acc0b, al2, bs1, 0); + acc0b = vmlal_lane_s32(acc0b, al3, bs0, 0); + + acc1b = vmlal_lane_s32(acc1b, al0, bs3, 1); + acc1b = vmlal_lane_s32(acc1b, al1, bs2, 1); + acc1b = vmlal_lane_s32(acc1b, al2, bs1, 1); + acc1b = vmlal_lane_s32(acc1b, al3, bs0, 1); + + __asm__ __volatile__ ("vswp %f0, 
%e1" : "+w"(acc0b), "+w"(acc0a)); + __asm__ __volatile__ ("vswp %f0, %e1" : "+w"(acc1b), "+w"(acc1a)); + //xx_vtrnq_s64_(acc0b, acc0a); + //xx_vtrnq_s64_(acc1b, acc1a); + //acc0b += acc1a; + acc0a += acc1b; + + acc0a = vsraq_n_s64(acc0a,acc0b,28); + + vc[6] = vmovn_s64(acc0b) & vmask; + vc[7] = vmovn_s64(acc0a) & vmask; + + acc1a = vsraq_n_s64(acc1a,acc0a,28); + + acc1a = xx_vaddup_s64(vrev128_s64(acc1a)); + + acc1a = vaddw_s32(acc1a, vc[0]); + vc[0] = vmovn_s64(acc1a) & vmask; + + acc1a = vshrq_n_s64(acc1a,28); + acc1a = vaddw_s32(acc1a, vc[1]); + vc[1] = vmovn_s64(acc1a) & vmask; + + acc1a = vshrq_n_s64(acc1a,28); + vc[2] += vmovn_s64(acc1a);; + */ +} + +void +p448_sqr ( + p448_t *__restrict__ cs, + const p448_t *bs +) { + const p448_t *as = bs; + register int32x4_t as0 __asm__("q6"); + register int32x4_t as2 __asm__("q7"); + + register int32x4_t bl0 __asm__("q0"); + register int32x4_t bh0 __asm__("q1"); + register int32x4_t bs0 __asm__("q2"); + register int32x4_t bl2 __asm__("q3"); + register int32x4_t bh2 __asm__("q4"); + register int32x4_t bs2 __asm__("q5"); + + int32x2_t *vc = (int32x2_t*) cs->limb, *vcasm = vc; + + register int64x2_t acc0a __asm__("q12"); + register int64x2_t acc0b __asm__("q13"); + register int64x2_t acc1a __asm__("q14"); + register int64x2_t acc1b __asm__("q15"); + + __asm__ __volatile__ ( + "vld2.32 {%e[bl0],%f[bl0],%e[bh0],%f[bh0]}, [%[b],:64]!" "\n\t" + "vadd.i32 %f[bs0], %f[bl0], %f[bh0]" "\n\t" + "vsub.i32 %e[bs0], %e[bl0], %e[bh0]" "\n\t" + "vadd.i32 %[as0], %[bl0], %[bh0]" "\n\t" + + "vld2.32 {%e[bl2],%f[bl2],%e[bh2],%f[bh2]}, [%[b],:64]!" "\n\t" + "vadd.i32 %[bs2], %[bl2], %[bh2]" "\n\t" + "vmov %[as2], %[bs2]" "\n\t" + + "vqdmull.s32 %[a0b], %f[as0], %f[bs2][0]" "\n\t" + "vmlal.s32 %[a0b], %e[as2], %e[bs2][0]" "\n\t" + "vmlal.s32 %[a0b], %e[as0], %e[bh0][0]" "\n\t" + + "vqdmull.s32 %[a1b], %f[as0], %f[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[as2], %e[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[as0], %e[bh0][1]" "\n\t" + + "vmov %[a0a], %[a0b]" "\n\t" + "vqdmlal.s32 %[a0a], %f[bh0], %f[bh2][0]" "\n\t" + "vmlal.s32 %[a0a], %e[bh2], %e[bh2][0]" "\n\t" + "vmlal.s32 %[a0a], %e[bh0], %e[bl0][0]" "\n\t" + + "vqdmlsl.s32 %[a0b], %f[bl0], %f[bl2][0]" "\n\t" + "vmlsl.s32 %[a0b], %e[bl2], %e[bl2][0]" "\n\t" + "vmlal.s32 %[a0b], %e[bl0], %e[bs0][0]" "\n\t" + + "vmov %[a1a], %[a1b]" "\n\t" + "vqdmlal.s32 %[a1a], %f[bh0], %f[bh2][1]" "\n\t" + "vmlal.s32 %[a1a], %e[bh2], %e[bh2][1]" "\n\t" + "vmlal.s32 %[a1a], %e[bh0], %e[bl0][1]" "\n\t" + + "vswp %f[a0b], %e[a0a]" "\n\t" + + "vqdmlsl.s32 %[a1b], %f[bl0], %f[bl2][1]" "\n\t" + "vmlsl.s32 %[a1b], %e[bl2], %e[bl2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[bl0], %e[bs0][1]" "\n\t" + + "vsra.s64 %[a0a], %[a0b], #28" "\n\t" + "vsub.i32 %f[bs0], %f[bl0], %f[bh0]" "\n\t" + "vmovn.i64 %e[a0b], %[a0b]" "\n\t" + + "vswp %f[a1b], %e[a1a]" "\n\t" + "vadd.i64 %[a1b], %[a0a], %[a1b]" "\n\t" + + + "vqdmull.s32 %[a0a], %e[as2], %f[bs2][0]" "\n\t" + "vmovn.i64 %f[a0b], %[a1b]" "\n\t" + "vsra.s64 %[a1a], %[a1b], #28" "\n\t" + "vqdmlal.s32 %[a0a], %e[as0], %f[bh0][0]" "\n\t" + "vbic.i32 %[a0b], #0xf0000000" "\n\t" + "vstmia %[c]!, {%e[a0b], %f[a0b]}" "\n\t" + + "vqdmull.s32 %[a1b], %e[as2], %f[bs2][1]" "\n\t" + "vqdmlal.s32 %[a1b], %e[as0], %f[bh0][1]" "\n\t" + + "vmov %f[a0b], %f[a0a]" "\n\t" + "vadd.i64 %e[a0b], %e[a0a], %e[a1a]" "\n\t" + "vadd.i64 %e[a0a], %e[a0a], %f[a1a]" "\n\t" + "vqdmlal.s32 %[a0a], %e[bh2], %f[bh2][0]" "\n\t" + "vqdmlal.s32 %[a0a], %e[bh0], %f[bl0][0]" "\n\t" + + "vqdmlsl.s32 %[a0b], %e[bl2], %f[bl2][0]" "\n\t" + "vqdmlal.s32 
%[a0b], %e[bl0], %f[bs0][0]" "\n\t" + + "vmov %[a1a], %[a1b]" "\n\t" + "vqdmlal.s32 %[a1a], %e[bh2], %f[bh2][1]" "\n\t" + "vqdmlal.s32 %[a1a], %e[bh0], %f[bl0][1]" "\n\t" + + "vswp %f[a0b], %e[a0a]" "\n\t" + + "vqdmlsl.s32 %[a1b], %e[bl2], %f[bl2][1]" "\n\t" + "vqdmlal.s32 %[a1b], %e[bl0], %f[bs0][1]" "\n\t" + + "vsra.s64 %[a0a], %[a0b], #28" "\n\t" + "vsub.i32 %e[bs2], %e[bl2], %e[bh2]" "\n\t" + "vmovn.i64 %e[a0b], %[a0b]" "\n\t" + + "vswp %f[a1b], %e[a1a]" "\n\t" + "vadd.i64 %[a1b], %[a0a], %[a1b]" "\n\t" + + "vmull.s32 %[a0a], %f[as2], %f[bs2][0]" "\n\t" + "vmovn.i64 %f[a0b], %[a1b]" "\n\t" + "vqdmlal.s32 %[a0a], %e[as0], %e[bh2][0]" "\n\t" + "vsra.s64 %[a1a], %[a1b], #28" "\n\t" + "vmlal.s32 %[a0a], %f[as0], %f[bh0][0]" "\n\t" + "vbic.i32 %[a0b], #0xf0000000" "\n\t" + "vstmia %[c]!, {%e[a0b], %f[a0b]}" "\n\t" + + "vmull.s32 %[a1b], %f[as2], %f[bs2][1]" "\n\t" + "vqdmlal.s32 %[a1b], %e[as0], %e[bh2][1]" "\n\t" + "vmlal.s32 %[a1b], %f[as0], %f[bh0][1]" "\n\t" + + "vmov %f[a0b], %f[a0a]" "\n\t" + "vadd.i64 %e[a0b], %e[a0a], %e[a1a]" "\n\t" + "vadd.i64 %e[a0a], %e[a0a], %f[a1a]" "\n\t" + "vmlal.s32 %[a0a], %f[bh2], %f[bh2][0]" "\n\t" + "vqdmlal.s32 %[a0a], %e[bh0], %e[bl2][0]" "\n\t" + "vmlal.s32 %[a0a], %f[bh0], %f[bl0][0]" "\n\t" + + "vmlsl.s32 %[a0b], %f[bl2], %f[bl2][0]" "\n\t" + "vqdmlal.s32 %[a0b], %e[bl0], %e[bs2][0]" "\n\t" + "vmlal.s32 %[a0b], %f[bl0], %f[bs0][0]" "\n\t" + + "vmov %[a1a], %[a1b]" "\n\t" + "vmlal.s32 %[a1a], %f[bh2], %f[bh2][1]" "\n\t" + "vqdmlal.s32 %[a1a], %e[bh0], %e[bl2][1]" "\n\t" + "vmlal.s32 %[a1a], %f[bh0], %f[bl0][1]" "\n\t" + + "vswp %f[a0b], %e[a0a]" "\n\t" + + "vmlsl.s32 %[a1b], %f[bl2], %f[bl2][1]" "\n\t" + "vqdmlal.s32 %[a1b], %e[bl0], %e[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %f[bl0], %f[bs0][1]" "\n\t" + + "vsub.i32 %f[bs2], %f[bl2], %f[bh2]" "\n\t" + "vsra.s64 %[a0a], %[a0b], #28" "\n\t" + "vmovn.i64 %e[a0b], %[a0b]" "\n\t" + + "vswp %f[a1b], %e[a1a]" "\n\t" + "vadd.i64 %[a1b], %[a0a], %[a1b]" "\n\t" + + "vqdmull.s32 %[a0a], %e[as0], %f[bh2][0]" "\n\t" + "vmovn.i64 %f[a0b], %[a1b]" "\n\t" + "vsra.s64 %[a1a], %[a1b], #28" "\n\t" + "vqdmlal.s32 %[a0a], %e[as2], %f[bh0][0]" "\n\t" + "vbic.i32 %[a0b], #0xf0000000" "\n\t" + "vstmia %[c]!, {%e[a0b], %f[a0b]}" "\n\t" + + "vqdmull.s32 %[a1b], %e[as0], %f[bh2][1]" "\n\t" + "vqdmlal.s32 %[a1b], %e[as2], %f[bh0][1]" "\n\t" + + "vmov %f[a0b], %f[a0a]" "\n\t" + "vadd.i64 %e[a0b], %e[a0a], %e[a1a]" "\n\t" + "vadd.i64 %e[a0a], %e[a0a], %f[a1a]" "\n\t" + "vqdmlal.s32 %[a0a], %e[bh0], %f[bl2][0]" "\n\t" + "vqdmlal.s32 %[a0a], %e[bh2], %f[bl0][0]" "\n\t" + + "vqdmlal.s32 %[a0b], %e[bl0], %f[bs2][0]" "\n\t" + "vqdmlal.s32 %[a0b], %e[bl2], %f[bs0][0]" "\n\t" + + "vmov %[a1a], %[a1b]" "\n\t" + "vqdmlal.s32 %[a1a], %e[bh0], %f[bl2][1]" "\n\t" + "vqdmlal.s32 %[a1a], %e[bh2], %f[bl0][1]" "\n\t" + + "vswp %f[a0b], %e[a0a]" "\n\t" + + "vqdmlal.s32 %[a1b], %e[bl0], %f[bs2][1]" "\n\t" + "vqdmlal.s32 %[a1b], %e[bl2], %f[bs0][1]" "\n\t" + + "vsra.s64 %[a0a], %[a0b], #28" "\n\t" + "vmovn.i64 %e[a0b], %[a0b]" "\n\t" + + "vswp %f[a1b], %e[a1a]" "\n\t" + "vadd.i64 %[a0a], %[a0a], %[a1b]" "\n\t" + + "vmovn.i64 %f[a0b], %[a0a]" "\n\t" + "vsra.s64 %[a1a], %[a0a], #28" "\n\t" + + "vbic.i32 %[a0b], #0xf0000000" "\n\t" + + "vswp %e[a1a], %f[a1a]" "\n\t" + + "vstmia %[c]!, {%e[a0b], %f[a0b]}" "\n\t" + "sub %[c], #64" "\n\t" + + "vadd.i64 %f[a1a], %f[a1a], %e[a1a]" "\n\t" + + "vldmia %[c], {%e[a0a], %f[a0a], %e[a0b]}" "\n\t" + "vaddw.s32 %[a1a], %e[a0a]" "\n\t" + "vmovn.i64 %e[a0a], %[a1a]" "\n\t" + "vshr.s64 %[a1a], #28" "\n\t" + + 
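+ /* Final carry pass: the carries left in %[a1a] wrap back onto the bottom of the number (2^448 = 2^224 + 1 mod p, so the top carry feeds both limb 0 and limb 8); vaddw/vmovn/vshr then walk them through the reloaded low limbs, with a vbic re-mask to 28 bits before the closing vstmia. */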
"vaddw.s32 %[a1a], %f[a0a]" "\n\t" + "vmovn.i64 %f[a0a], %[a1a]" "\n\t" + "vshr.s64 %[a1a], #28" "\n\t" + + "vbic.i32 %[a0a], #0xf0000000" "\n\t" + + "vaddw.s32 %[a1a], %e[a0b]" "\n\t" + "vmovn.i64 %e[a0b], %[a1a]" "\n\t" + + "vstmia %[c], {%e[a0a], %f[a0a], %e[a0b]}" "\n\t" + + : [a0a]"=w"(acc0a) + , [a0b]"=w"(acc0b) + , [a1a]"=w"(acc1a) + , [a1b]"=w"(acc1b) + , [a]"+r"(as) + , [b]"+r"(bs) + , [c]"+r"(vcasm) + + , [as0]"=w"(as0) + , [as2]"=w"(as2) + + , [bh0]"=w"(bh0) + , [bh2]"=w"(bh2) + + , [bl0]"=w"(bl0) + , [bl2]"=w"(bl2) + + , [bs0]"=w"(bs0) + , [bs2]"=w"(bs2) + + :: "memory" + ); + + + /* + const int32x2x2_t b0 = vld2_s32((const int32_t *) &bs->limb[0]); + const int32x2x2_t b1 = vld2_s32((const int32_t *) &bs->limb[4]); + const int32x2x2_t b2 = vld2_s32((const int32_t *) &bs->limb[8]); + const int32x2x2_t b3 = vld2_s32((const int32_t *) &bs->limb[12]); + const int32x2_t vbl[4] = { b0.val[0], b1.val[0], b2.val[0], b3.val[0] }; + const int32x2_t vbh[4] = { b0.val[1], b1.val[1], b2.val[1], b3.val[1] }; + int32x2_t vbm[4]; + + int i; + for (i=0; i<4; i++) { + vbm[i] = vbl[i] - vbh[i]; + } + + int32x2_t *vc = (int32x2_t*) cs->limb; + */ + + /* FUTURE possible improvements: + * don't use nega-phi algorithm, so as to avoid extra phi-twiddle at end + * or use phi/nega-phi for everything, montgomery style + * or find some sort of phi algorithm which doesn't have this problem + * break up lanemuls so that only diags get 1mul'd instead of diag 2x2 blocks + * + * These improvements are all pretty minor, but I guess together they might matter? + */ + + + + /* + int32x2_t vmask = {(1<<28) - 1, (1<<28)-1}; + + int64x2_t acc0a, acc0b; + int64x2_t acc1a, acc1b; + int64x2_t acc2a, acc2b; + int64x2_t acc3a, acc3b; + int64x2_t acc4a, acc4b; + int64x2_t acc5a, acc5b; + int64x2_t acc6a, acc6b; + int64x2_t acc7a, acc7b; + int64x2_t carry; + + acc0a = vqdmull_lane_s32( vbh[1], vbh[3], 0); + acc1a = vqdmull_lane_s32( vbh[1], vbh[3], 1); + acc2a = vqdmull_lane_s32( vbh[2], vbh[3], 0); + acc3a = vqdmull_lane_s32( vbh[2], vbh[3], 1); + acc0a = vmlal_lane_s32(acc0a, vbh[2], vbh[2], 0); + acc1a = vmlal_lane_s32(acc1a, vbh[2], vbh[2], 1); + acc2b = acc2a; + acc3b = acc3a; + acc2b = vqdmlal_lane_s32(acc2b, vbh[0], vbh[1], 0); + acc3b = vqdmlal_lane_s32(acc3b, vbh[0], vbh[1], 1); + acc0b = acc0a; + acc1b = acc1a; + acc0b = vmlal_lane_s32(acc0b, vbh[0], vbh[0], 0); + acc1b = vmlal_lane_s32(acc1b, vbh[0], vbh[0], 1); + acc0b = vqdmlal_lane_s32(acc0b, vbl[1], vbl[3], 0); + acc1b = vqdmlal_lane_s32(acc1b, vbl[1], vbl[3], 1); + acc2b = vqdmlal_lane_s32(acc2b, vbl[2], vbl[3], 0); + acc3b = vqdmlal_lane_s32(acc3b, vbl[2], vbl[3], 1); + acc0b = vmlal_lane_s32(acc0b, vbl[2], vbl[2], 0); + acc1b = vmlal_lane_s32(acc1b, vbl[2], vbl[2], 1); + acc2a += acc2b; + acc3a += acc3b; + acc2a = vqdmlal_lane_s32(acc2a, vbl[0], vbl[1], 0); + acc3a = vqdmlal_lane_s32(acc3a, vbl[0], vbl[1], 1); + acc0a += acc0b; + acc1a += acc1b; + acc0a = vmlal_lane_s32(acc0a, vbl[0], vbl[0], 0); + acc1a = vmlal_lane_s32(acc1a, vbl[0], vbl[0], 1); + acc0a = vqdmlsl_lane_s32(acc0a, vbm[1], vbm[3], 0); + acc1a = vqdmlsl_lane_s32(acc1a, vbm[1], vbm[3], 1); + acc0a = vmlsl_lane_s32(acc0a, vbm[2], vbm[2], 0); + acc1a = vmlsl_lane_s32(acc1a, vbm[2], vbm[2], 1); + acc2a = vqdmlsl_lane_s32(acc2a, vbm[2], vbm[3], 0); + acc3a = vqdmlsl_lane_s32(acc3a, vbm[2], vbm[3], 1); + acc0b += acc0a; + acc1b += acc1a; + acc0b = vmlsl_lane_s32(acc0b, vbm[0], vbm[0], 0); + acc1b = vmlsl_lane_s32(acc1b, vbm[0], vbm[0], 1); + acc2b += acc2a; + acc3b += acc3a; + acc2b = 
vqdmlsl_lane_s32(acc2b, vbm[0], vbm[1], 0); + acc3b = vqdmlsl_lane_s32(acc3b, vbm[0], vbm[1], 1); + + xx_vtrnq_s64(&acc0a, &acc0b); + xx_vtrnq_s64(&acc1a, &acc1b); + xx_vtrnq_s64(&acc2a, &acc2b); + xx_vtrnq_s64(&acc3a, &acc3b); + + acc0b += acc1a; + acc0b = vsraq_n_s64(acc0b,acc0a,28); + acc1b = vsraq_n_s64(acc1b,acc0b,28); + acc2a += acc1b; + acc2b += acc3a; + acc2b = vsraq_n_s64(acc2b,acc2a,28); + acc3b = vsraq_n_s64(acc3b,acc2b,28); + + vc[0] = (vmovn_s64(acc0a)) & vmask; + vc[1] = (vmovn_s64(acc0b)) & vmask; + + vc[2] = (vmovn_s64(acc2a)) & vmask; + vc[3] = (vmovn_s64(acc2b)) & vmask; + carry = acc3b; + + acc4a = vmull_lane_s32( vbh[3], vbh[3], 0); + acc5a = vmull_lane_s32( vbh[3], vbh[3], 1); + acc6b = vqdmull_lane_s32( vbh[0], vbh[3], 0); + acc7b = vqdmull_lane_s32( vbh[0], vbh[3], 1); + acc4b = acc4a; + acc5b = acc5a; + acc4b = vqdmlal_lane_s32(acc4b, vbh[0], vbh[2], 0); + acc5b = vqdmlal_lane_s32(acc5b, vbh[0], vbh[2], 1); + acc6b = vqdmlal_lane_s32(acc6b, vbh[1], vbh[2], 0); + acc7b = vqdmlal_lane_s32(acc7b, vbh[1], vbh[2], 1); + acc4b = vmlal_lane_s32(acc4b, vbh[1], vbh[1], 0); + acc5b = vmlal_lane_s32(acc5b, vbh[1], vbh[1], 1); + acc4b = vmlal_lane_s32(acc4b, vbl[3], vbl[3], 0); + acc5b = vmlal_lane_s32(acc5b, vbl[3], vbl[3], 1); + acc6a = acc6b; + acc7a = acc7b; + acc6a = vqdmlal_lane_s32(acc6a, vbl[0], vbl[3], 0); + acc7a = vqdmlal_lane_s32(acc7a, vbl[0], vbl[3], 1); + acc4a += acc4b; + acc5a += acc5b; + acc4a = vqdmlal_lane_s32(acc4a, vbl[0], vbl[2], 0); + acc5a = vqdmlal_lane_s32(acc5a, vbl[0], vbl[2], 1); + acc6a = vqdmlal_lane_s32(acc6a, vbl[1], vbl[2], 0); + acc7a = vqdmlal_lane_s32(acc7a, vbl[1], vbl[2], 1); + acc4a = vmlal_lane_s32(acc4a, vbl[1], vbl[1], 0); + acc5a = vmlal_lane_s32(acc5a, vbl[1], vbl[1], 1); + acc4a = vmlsl_lane_s32(acc4a, vbm[3], vbm[3], 0); + acc5a = vmlsl_lane_s32(acc5a, vbm[3], vbm[3], 1); + acc6b += acc6a; + acc7b += acc7a; + acc6b = vqdmlsl_lane_s32(acc6b, vbm[0], vbm[3], 0); + acc7b = vqdmlsl_lane_s32(acc7b, vbm[0], vbm[3], 1); + acc4b += acc4a; + acc5b += acc5a; + acc4b = vqdmlsl_lane_s32(acc4b, vbm[0], vbm[2], 0); + acc5b = vqdmlsl_lane_s32(acc5b, vbm[0], vbm[2], 1); + acc4b = vmlsl_lane_s32(acc4b, vbm[1], vbm[1], 0); + acc5b = vmlsl_lane_s32(acc5b, vbm[1], vbm[1], 1); + acc6b = vqdmlsl_lane_s32(acc6b, vbm[1], vbm[2], 0); + acc7b = vqdmlsl_lane_s32(acc7b, vbm[1], vbm[2], 1); + + xx_vtrnq_s64(&acc4a, &acc4b); + xx_vtrnq_s64(&acc5a, &acc5b); + xx_vtrnq_s64(&acc6a, &acc6b); + xx_vtrnq_s64(&acc7a, &acc7b); + + acc4a += carry; + acc4b += acc5a; + acc4b = vsraq_n_s64(acc4b,acc4a,28); + acc5b = vsraq_n_s64(acc5b,acc4b,28); + acc6a += acc5b; + acc6b += acc7a; + + + vc[4] = (vmovn_s64(acc4a)) & vmask; + vc[5] = (vmovn_s64(acc4b)) & vmask; + + acc6b = vsraq_n_s64(acc6b,acc6a,28); + acc7b = vsraq_n_s64(acc7b,acc6b,28); + + vc[6] = (vmovn_s64(acc6a)) & vmask; + vc[7] = (vmovn_s64(acc6b)) & vmask; + + acc7a = xx_vaddup_s64(vrev128_s64(acc7b)); + + int32x2_t t0 = vc[0], t1 = vc[1]; + + acc7a = vaddw_s32(acc7a, t0); + t0 = vmovn_s64(acc7a) & vmask; + acc7a = vshrq_n_s64(acc7a,28); + acc7a = vaddw_s32(acc7a, t1); + t1 = vmovn_s64(acc7a) & vmask; + vc[0] = t0; + vc[1] = t1; + acc7a = vshrq_n_s64(acc7a,28); + + vc[2] += vmovn_s64(acc7a); + */ +} + +void +p448_mulw ( + p448_t *__restrict__ cs, + const p448_t *as, + uint64_t b +) { + uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1}; + + uint64x2_t accum; + const uint32x2_t *va = (const uint32x2_t *) as->limb; + uint32x2_t *vo = (uint32x2_t *) cs->limb; + uint32x2_t vc, vn; + uint32x2_t vb = {b & ((1<<28)-1), b>>28}; + 
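+ /* Limbs are stored interleaved (see LIMBPERM in p448.h): va[k] = { limb k, limb k+8 }, so lane 0 of each 64-bit accumulator works on the low half of the number and lane 1 on the high half, while vb holds b split into two 28-bit digits. The product of the top limb pair and the high digit spills past 2^448; since 2^448 = 2^224 + 1 mod p, it is folded back onto limbs 0 and 8 before the main loop. */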
+ accum = vmull_lane_u32(va[7], vb, 1); + accum = xx_vaddup_u64(vrev128_u64(accum)); + + vc = va[0]; + accum = vmlal_lane_u32(accum, vc, vb, 0); + vo[0] = vmovn_u64(accum) & vmask; + accum = vshrq_n_u64(accum,28); + + /* PERF: the right way to do this is to reduce behind, i.e. + * vmull + vmlal round 0 + * vmull + vmlal round 1 + * vmull + vmlal round 2 + * vsraq round 0, 1 + * vmull + vmlal round 3 + * vsraq round 1, 2 + * ... + */ + + int i; + for (i=1; i<8; i++) { + vn = va[i]; + accum = vmlal_lane_u32(accum, vc, vb, 1); + accum = vmlal_lane_u32(accum, vn, vb, 0); + vo[i] = vmovn_u64(accum) & vmask; + accum = vshrq_n_u64(accum,28); + vc = vn; + } + + accum = xx_vaddup_u64(vrev128_u64(accum)); + accum = vaddw_u32(accum, vo[0]); + vo[0] = vmovn_u64(accum) & vmask; + + accum = vshrq_n_u64(accum,28); + vo[1] += vmovn_u64(accum); +} + +/* TODO: vectorize? */ +void +p448_strong_reduce ( + p448_t *a +) { + word_t mask = (1ull<<28)-1; + + /* first, clear high */ + a->limb[1] += a->limb[15]>>28; + a->limb[0] += a->limb[15]>>28; + a->limb[15] &= mask; + + /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */ + + /* compute total_value - p. No need to reduce mod p. */ + + dsword_t scarry = 0; + int i; + for (i=0; i<16; i++) { + scarry = scarry + a->limb[LIMBPERM(i)] - ((i==8)?mask-1:mask); + a->limb[LIMBPERM(i)] = scarry & mask; + scarry >>= 28; + } + + /* uncommon case: it was >= p, so now scarry = 0 and this = x + * common case: it was < p, so now scarry = -1 and this = x - p + 2^448 + * so let's add back in p. will carry back off the top for 2^448. + */ + + assert(is_zero(scarry) | is_zero(scarry+1)); + + word_t scarry_mask = scarry & mask; + dword_t carry = 0; + + /* add it back */ + for (i=0; i<16; i++) { + carry = carry + a->limb[LIMBPERM(i)] + ((i==8)?(scarry_mask&~1):scarry_mask); + a->limb[LIMBPERM(i)] = carry & mask; + carry >>= 28; + } + + assert(is_zero(carry + scarry)); +} + +mask_t +p448_is_zero ( + const struct p448_t *a +) { + struct p448_t b; + p448_copy(&b,a); + p448_strong_reduce(&b); + + uint32_t any = 0; + int i; + for (i=0; i<16; i++) { + any |= b.limb[i]; + } + return is_zero(any); +} + +void +p448_serialize ( + uint8_t *serial, + const struct p448_t *x +) { + int i,j; + p448_t red; + p448_copy(&red, x); + p448_strong_reduce(&red); + + for (i=0; i<8; i++) { + uint64_t limb = red.limb[LIMBPERM(2*i)] + (((uint64_t)red.limb[LIMBPERM(2*i+1)])<<28); + for (j=0; j<7; j++) { + serial[7*i+j] = limb; + limb >>= 8; + } + assert(limb == 0); + } +} + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +) { + int i,j; + for (i=0; i<8; i++) { + uint64_t out = 0; + for (j=0; j<7; j++) { + out |= ((uint64_t)serial[7*i+j])<<(8*j); + } + x->limb[LIMBPERM(2*i)] = out & ((1ull<<28)-1); + x->limb[LIMBPERM(2*i+1)] = out >> 28; + } + + /* Check for reduction. + * + * The idea is to create a variable ge which is all ones (rather, 56 ones) + * if and only if the low $i$ words of $x$ are >= those of p. + * + * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) + */ + uint32_t ge = -1, mask = (1ull<<28)-1; + for (i=0; i<8; i++) { + ge &= x->limb[LIMBPERM(i)]; + } + + /* At this point, ge = 1111 iff bottom are all 1111. 
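+ (Here "1111" is shorthand for an all-ones 28-bit limb, 0xfffffff; the single "1110" limb of p is 0xffffffe, which sits at index 8 in this 16-limb radix-2^28 representation.)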
Now propagate if 1110, or set if 1111 */ + ge = (ge & (x->limb[LIMBPERM(8)] + 1)) | is_zero(x->limb[LIMBPERM(8)] ^ mask); + + /* Propagate the rest */ + for (i=9; i<16; i++) { + ge &= x->limb[LIMBPERM(i)]; + } + + return ~is_zero(ge ^ mask); +} + +void +simultaneous_invert_p448( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +) { + if (n==0) { + return; + } else if (n==1) { + p448_inverse(out,in); + return; + } + + p448_copy(&out[1], &in[0]); + int i; + for (i=1; i<(int) (n-1); i++) { + p448_mul(&out[i+1], &out[i], &in[i]); + } + p448_mul(&out[0], &out[n-1], &in[n-1]); + + struct p448_t tmp; + p448_inverse(&tmp, &out[0]); + p448_copy(&out[0], &tmp); + + /* at this point, out[0] = product(in[i]) ^ -1 + * out[i] = product(in[0]..in[i-1]) if i != 0 + */ + for (i=n-1; i>0; i--) { + p448_mul(&tmp, &out[i], &out[0]); + p448_copy(&out[i], &tmp); + + p448_mul(&tmp, &out[0], &in[i]); + p448_copy(&out[0], &tmp); + } +} diff --git a/src/arch_neon_experimental/p448.h b/src/arch_neon_experimental/p448.h new file mode 100644 index 0000000..4f0be0a --- /dev/null +++ b/src/arch_neon_experimental/p448.h @@ -0,0 +1,376 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ +#ifndef __P448_H__ +#define __P448_H__ 1 + +#include "word.h" + +#include +#include + +typedef struct p448_t { + uint32_t limb[16]; +} __attribute__((aligned(32))) p448_t; + +#define LIMBPERM(x) (((x)<<1 | (x)>>3) & 15) +#define USE_NEON_PERM 1 + +#ifdef __cplusplus +extern "C" { +#endif + +static __inline__ void +p448_set_ui ( + p448_t *out, + uint64_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t do_swap +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_add ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_neg ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_neg ( + p448_t *a, + mask_t doNegate +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_addw ( + p448_t *a, + uint32_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_subw ( + p448_t *a, + uint32_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_copy ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_weak_reduce ( + p448_t *inout +) __attribute__((unused,always_inline)); + +void +p448_strong_reduce ( + p448_t *inout +); + +mask_t +p448_is_zero ( + const p448_t *in +); + +static __inline__ void +p448_bias ( + p448_t *inout, + int amount +) __attribute__((unused,always_inline)); + +void +p448_mul ( + p448_t *__restrict__ out, + const p448_t *a, + const p448_t *b +); + +void +p448_mulw ( + p448_t *__restrict__ out, + const p448_t *a, + uint64_t b +); + +void +p448_sqr ( + p448_t *__restrict__ out, + const p448_t *a +); + +static __inline__ void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) __attribute__((unused,always_inline)); + +void +p448_serialize ( + uint8_t *serial, + const struct p448_t *x +); + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +); + +static __inline__ void +p448_mask( + struct p448_t *a, + const struct 
p448_t *b, + mask_t mask +) __attribute__((unused,always_inline)); + +/** +* Returns 1/x. +* +* If x=0, returns 0. +*/ +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +); + +void +simultaneous_invert_p448 ( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +); + +static inline mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) __attribute__((always_inline,unused)); + +/* -------------- Inline functions begin here -------------- */ + +void +p448_set_ui ( + p448_t *out, + uint64_t x +) { + int i; + for (i=0; i<16; i++) { + out->limb[i] = 0; + } + out->limb[0] = x & ((1<<28)-1); + out->limb[2] = x>>28; +} + +void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t doswap +) { + big_register_t *aa = (big_register_t*)a; + big_register_t *bb = (big_register_t*)b; + big_register_t m = br_set_to_mask(doswap); + + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] - b->limb[i]; + } + */ +} + +void +p448_neg ( + p448_t *out, + const p448_t *a +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = -a->limb[i]; + } + */ +} + +void +p448_cond_neg( + p448_t *a, + mask_t doNegate +) { + unsigned int i; + struct p448_t negated; + big_register_t *aa = (big_register_t *)a; + big_register_t *nn = (big_register_t*)&negated; + big_register_t m = br_set_to_mask(doNegate); + + p448_neg(&negated, a); + p448_bias(&negated, 2); + + for (i=0; ilimb[0] += x; +} + +void +p448_subw ( + p448_t *a, + uint32_t x +) { + a->limb[0] -= x; +} + +void +p448_copy ( + p448_t *out, + const p448_t *a +) { + *out = *a; +} + +void +p448_bias ( + p448_t *a, + int amt +) { + uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt; + uint32x4_t lo = {co1,co2,co1,co1}, hi = {co1,co1,co1,co1}; + uint32x4_t *aa = (uint32x4_t*) a; + aa[0] += lo; + aa[1] += hi; + aa[2] += hi; + aa[3] += hi; +} + +void +p448_weak_reduce ( + p448_t *a +) { + + uint32x2_t *aa = (uint32x2_t*) a, vmask = {(1ull<<28)-1, (1ull<<28)-1}, vm2 = {0,-1}, + tmp = vshr_n_u32(aa[7],28); + + int i; + for (i=7; i>=1; i--) { + aa[i] = vsra_n_u32(aa[i] & vmask, aa[i-1], 28); + } + aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp&vm2); +} + +void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) { + p448_t tmp; + assert(n>0); + if (n&1) { + p448_sqr(y,x); + n--; + } else { + p448_sqr(&tmp,x); + p448_sqr(y,&tmp); + n-=2; + } + for (; n; n-=2) { + p448_sqr(&tmp,y); + p448_sqr(y,&tmp); + } +} + +mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) { + struct p448_t ra, rb; + p448_copy(&ra, a); + p448_copy(&rb, b); + p448_weak_reduce(&ra); + p448_weak_reduce(&rb); + p448_sub(&ra, &ra, &rb); + p448_bias(&ra, 2); + return p448_is_zero(&ra); +} + +void +p448_mask ( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + a->limb[i] = b->limb[i] & mask; + } +} + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __P448_H__ */ diff --git a/src/crandom.c b/src/crandom.c index 4b75f66..b9c1eb0 100644 --- a/src/crandom.c +++ b/src/crandom.c @@ -466,7 +466,7 @@ crandom_generate( unsigned long long copy = (length > state->fill) ? 
state->fill : length; state->fill -= copy; memcpy(output, state->buffer + state->fill, copy); - memset(state->buffer + state->fill, 0, copy); + really_memset(state->buffer + state->fill, 0, copy); output += copy; length -= copy; } @@ -484,5 +484,5 @@ crandom_destroy( */ } - memset(state, 0, sizeof(*state)); + really_memset(state, 0, sizeof(*state)); } diff --git a/src/goldilocks.c b/src/goldilocks.c index 440c8bd..31e38d1 100644 --- a/src/goldilocks.c +++ b/src/goldilocks.c @@ -340,7 +340,7 @@ goldilocks_sign ( word_t skw[GOLDI_FIELD_WORDS]; mask_t succ = barrett_deserialize(skw,privkey->opaque,&curve_prime_order); if (!succ) { - memset(skw,0,sizeof(skw)); + really_memset(skw,0,sizeof(skw)); return GOLDI_ECORRUPT; } @@ -389,9 +389,9 @@ goldilocks_sign ( memcpy(signature_out, signature_tmp, GOLDI_FIELD_BYTES); barrett_serialize(signature_out+GOLDI_FIELD_BYTES, tk, GOLDI_FIELD_BYTES); - memset((unsigned char *)tk,0,sizeof(tk)); - memset((unsigned char *)skw,0,sizeof(skw)); - memset((unsigned char *)challenge,0,sizeof(challenge)); + really_memset((unsigned char *)tk,0,sizeof(tk)); + really_memset((unsigned char *)skw,0,sizeof(skw)); + really_memset((unsigned char *)challenge,0,sizeof(challenge)); /* response = 2(nonce_secret - sk*challenge) * Nonce = 8[nonce_secret]*G @@ -494,7 +494,7 @@ goldilocks_destroy_precomputed_public_key ( ) { if (!precom) return; destroy_fixed_base(&precom->table); - memset(&precom->pub.opaque, 0, sizeof(precom->pub)); + really_memset(&precom->pub.opaque, 0, sizeof(precom->pub)); free(precom); } diff --git a/src/include/word.h b/src/include/word.h index d48d20f..26123bc 100644 --- a/src/include/word.h +++ b/src/include/word.h @@ -146,11 +146,17 @@ typedef word_t vecmask_t __attribute__((vector_size(32))); } #endif -#if __AVX2__ || __SSE2__ +#if __AVX2__ static __inline__ big_register_t br_is_zero(big_register_t x) { return (big_register_t)(x == br_set_to_mask(0)); } +#elif __SSE2__ +static __inline__ big_register_t +br_is_zero(big_register_t x) { + return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128()); + //return (big_register_t)(x == br_set_to_mask(0)); +} #elif __ARM_NEON__ static __inline__ big_register_t br_is_zero(big_register_t x) { @@ -179,7 +185,25 @@ static inline uint64_t letoh64 (uint64_t x) { return x; } #endif - +/** + * Really call memset, in a way that prevents the compiler from optimizing it out. + * @param p The object to zeroize. + * @param c The char to set it to (probably zero). + * @param s The size of the object. 
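+ *
+ * Typical use is really_memset(secret, 0, sizeof(secret)) just before a
+ * secret value goes out of scope, so the zeroization cannot be discarded
+ * as a dead store the way a plain memset call sometimes is.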
+ */ +#ifdef __STDC_LIB_EXT1__ /* which it won't be, because we're -std=c99 */ +static __inline__ void +really_memset(void *p, char c, size_t s) { + memset_s(p,s,c,s); +} +#else +static __inline__ void __attribute__((always_inline,unused)) +really_memset(void *p, char c, size_t s) { + volatile char *pv = (volatile char *)p; + size_t i; + for (i=0; in, doNegate); } -static __inline__ void +#if (defined(__GNUC__) && !defined(__clang__) && defined(__x86_64__) && !defined(__AVX2__)) + /* This works around an apparent compiler bug in GCC, thanks Samuel Neves */ + static void __attribute__((optimize("O1"))) + #ifdef __OPTIMIZE_SIZE__ + #warning "There's a bug in here somewhere with GCC -Os on non-AVX2 platforms" + #endif +#else + static __inline__ void +#endif constant_time_lookup_tw_pniels ( struct tw_pniels_t *out, const struct tw_pniels_t *in, @@ -76,7 +84,7 @@ constant_time_lookup_tw_pniels ( int j; unsigned int k; - memset(out, 0, sizeof(*out)); + really_memset(out, 0, sizeof(*out)); for (j=0; jtable) { - memset(table->table,0,sizeof(*table->table)*(table->n<<(table->t-1))); + really_memset(table->table,0,sizeof(*table->table)*(table->n<<(table->t-1))); } if (table->own_table) { free(table->table); } - memset(table,0,sizeof(*table)); + really_memset(table,0,sizeof(*table)); } mask_t diff --git a/test/bench.c b/test/bench.c index b80be14..0cb96b6 100644 --- a/test/bench.c +++ b/test/bench.c @@ -108,33 +108,33 @@ int main(int argc, char **argv) { q448_randomize(&crand, sk); when = now(); - for (i=0; i #include +#ifndef LIMBPERM +#define LIMBPERM(x) (x) +#endif int failed_tests, n_tests, failed_this_test, running_a_test; @@ -87,7 +90,7 @@ void p448_print ( int j; printf("%s = 0x", descr); for (j=sizeof(*a)/sizeof(word_t)-1; j>=0; j--) { - printf(PRIxWORD58, b.limb[j]); + printf(PRIxWORD58, b.limb[LIMBPERM(j)]); } printf("\n"); } diff --git a/test/test_arithmetic.c b/test/test_arithmetic.c index 7fde48c..51a646c 100644 --- a/test/test_arithmetic.c +++ b/test/test_arithmetic.c @@ -170,7 +170,12 @@ int test_arithmetic () { int bits = sizeof(word_t) * 448 / sizeof(p448_t); for (j=0; j