From 04b955eabe5adb5dfc9df4e4e08325fac220538a Mon Sep 17 00:00:00 2001 From: Mike Hamburg Date: Mon, 4 Aug 2014 20:46:17 -0700 Subject: [PATCH] Added really_memset, thanks David Leon Gil. Trying to work around an apparent GCC bug on SSE2, thanks Samuel Neves. Added an experimental NEON arch. It's fast. It's not yet GCC clean. It needs some more work on general cleanliness too. --- HISTORY.txt | 20 + Makefile | 6 +- README.txt | 2 +- src/arch_neon/p448.c | 162 ++-- src/arch_neon_experimental/ec_point.c | 962 ++++++++++++++++++++ src/arch_neon_experimental/p448.c | 1207 +++++++++++++++++++++++++ src/arch_neon_experimental/p448.h | 376 ++++++++ src/crandom.c | 4 +- src/goldilocks.c | 10 +- src/include/word.h | 28 +- src/magic.c | 13 + src/scalarmul.c | 28 +- test/bench.c | 18 +- test/test.c | 5 +- test/test_arithmetic.c | 7 +- 15 files changed, 2741 insertions(+), 107 deletions(-) create mode 100644 src/arch_neon_experimental/ec_point.c create mode 100644 src/arch_neon_experimental/p448.c create mode 100644 src/arch_neon_experimental/p448.h diff --git a/HISTORY.txt b/HISTORY.txt index 1f301e9..3443d68 100644 --- a/HISTORY.txt +++ b/HISTORY.txt @@ -1,3 +1,23 @@ +August 4, 2014: + Experiments and bug fixes. + + Add really_memset = memset_s (except not because I'm setting -std=c99), + thanks David Leon Gil. I think I put it in the right places. + + Try to work around what I think is a compiler bug in GCC -O3 on non-AVX + platforms. I can't seem to work around it as -Os, so I'm just flagging + a warning (-Werror makes it an error) for now. Will take more + investigation. Thanks Samuel Neves. + + Added an experimental (not ready yet!) ARM NEON implementation in + arch_neon_experimental. This implementation seems to work, but needs + more testing. It is currently asm-heavy and not GCC clean. I am + planning to have a flag for it to use intrinsics instead of asm; + currently the intrinsics are commented out. On clang this does ECDH + in 1850kcy on my BeagleBone Black, comparable to Curve41417. Once this + is ready, I will probably move it to arch_neon proper, since arch_neon + isn't particularly tuned. + July 11, 2014: This is mostly a cleanup release. diff --git a/Makefile b/Makefile index 7050e90..aac33e4 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,7 @@ endif WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ - -Wmissing-declarations -Wunused-function $(EXWARN) + -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN) INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH) @@ -36,8 +36,8 @@ ARCHFLAGS += -mfpu=neon else ARCHFLAGS += -mfpu=vfpv3-d16 endif -ARCHFLAGS += -mcpu=cortex-a9 # FIXME -GENFLAGS = -DN_TESTS_BASE=1000 # sooooo sloooooow +ARCHFLAGS += -mcpu=cortex-a8 # FIXME +GENFLAGS += -DN_TESTS_BASE=1000 # sooooo sloooooow else ARCHFLAGS += -maes -mavx2 -mbmi2 #TODO endif diff --git a/README.txt b/README.txt index c73c88a..6a53bea 100644 --- a/README.txt +++ b/README.txt @@ -13,7 +13,7 @@ game protection system out of Stanford, and are (c) 2011 Stanford University. All of these files are usable under the MIT license contained in LICENSE.txt. -The Makefile is set for my 2013 MacBook Air. You can `make runbench` to run +The Makefile is set for my 2013 MacBook Air. You can `make bench` to run a completely arbitrary set of benchmarks and tests, or `make build/goldilocks.so` to build a stripped-down version of the library. 
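[Note on the really_memset mentioned in the HISTORY.txt entry above: the hunks that actually define it (in src/include/word.h per the diffstat) are not part of this excerpt. As a rough illustration only of the idea behind "really_memset = memset_s" -- a memset the optimizer is not allowed to elide, usually done by writing through a volatile-qualified pointer -- a minimal sketch might look like the following. The name and signature simply mirror the HISTORY.txt note and are assumptions here, not the patch's actual definition.]

    #include <stddef.h>

    /*
     * Sketch only: a memset in the spirit of C11's memset_s.  Writing
     * through a volatile-qualified pointer forces every store to be
     * performed even if the buffer is dead afterwards, so the compiler
     * cannot drop the scrub.  The definition added by this patch may
     * differ in detail.
     */
    static void
    really_memset (void *p, char c, size_t size) {
        volatile char *pv = (volatile char *)p;
        size_t i;
        for (i = 0; i < size; i++) pv[i] = c;
    }

[A typical (hypothetical) call site would be scrubbing secret material before it goes out of scope, e.g. really_memset(&scalar, 0, sizeof(scalar)).]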
For non-Haswell platforms, you need to replace -mavx2 -mbmi2 by an appropriate diff --git a/src/arch_neon/p448.c b/src/arch_neon/p448.c index fe69639..ac0c051 100644 --- a/src/arch_neon/p448.c +++ b/src/arch_neon/p448.c @@ -39,7 +39,7 @@ xx_vaddup_s64(int64x2_t x) { #include "neon_emulation.h" #endif /* ARM_NEON */ -static inline void __attribute__((gnu_inline,always_inline)) +static inline void __attribute__((gnu_inline,always_inline,unused)) smlal ( uint64_t *acc, const uint32_t a, @@ -48,7 +48,7 @@ smlal ( *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b; } -static inline void __attribute__((gnu_inline,always_inline)) +static inline void __attribute__((gnu_inline,always_inline,unused)) smlal2 ( uint64_t *acc, const uint32_t a, @@ -57,7 +57,7 @@ smlal2 ( *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; } -static inline void __attribute__((gnu_inline,always_inline)) +static inline void __attribute__((gnu_inline,always_inline,unused)) smull ( uint64_t *acc, const uint32_t a, @@ -66,7 +66,7 @@ smull ( *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b; } -static inline void __attribute__((gnu_inline,always_inline)) +static inline void __attribute__((gnu_inline,always_inline,unused)) smull2 ( uint64_t *acc, const uint32_t a, @@ -84,6 +84,7 @@ p448_mul ( const uint32_t *a = as->limb, *b = bs->limb; uint32_t *c = cs->limb; + const int32x2_t *val = (const int32x2_t *)a, *vbl = (const int32x2_t *)b, @@ -109,155 +110,170 @@ p448_mul ( accumx0a = vmull_lane_s32( delta = val[1] + vah[1], vbh[3], 0); accumx1a = vmull_lane_s32( delta, vbh[3], 1); - accumx2a = vmull_lane_s32( delta = val[2] + vah[2], vbh[3], 0); - accumx3a = vmull_lane_s32( delta, vbh[3], 1); - accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[2], 0); + accumx0a = vmlal_lane_s32(accumx0a, delta = val[2] + vah[2], vbh[2], 0); accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[2], 1); - accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0); - accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1); - accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[1], 0); + accumx0a = vmlal_lane_s32(accumx0a, delta = val[3] + vah[3], vbh[1], 0); accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[1], 1); - accumx2b = vmull_lane_s32( delta = val[0] + vah[0], vbh[1], 0); - accumx3b = vmull_lane_s32( delta, vbh[1], 1); - accumx0b = vmull_lane_s32( delta, vbh[0], 0); + accumx0b = vmull_lane_s32( delta = val[0] + vah[0], vbh[0], 0); accumx1b = vmull_lane_s32( delta, vbh[0], 1); - accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0); - accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1); accumx0b = vmlal_lane_s32(accumx0b, vah[1], vbl[3], 0); accumx1b = vmlal_lane_s32(accumx1b, vah[1], vbl[3], 1); - accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0); - accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1); accumx0b = vmlal_lane_s32(accumx0b, vah[2], vbl[2], 0); accumx1b = vmlal_lane_s32(accumx1b, vah[2], vbl[2], 1); - accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0); - accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1); accumx0b = vmlal_lane_s32(accumx0b, vah[3], vbl[1], 0); accumx1b = vmlal_lane_s32(accumx1b, vah[3], vbl[1], 1); - accumx2b += accumx2a; - accumx3b += accumx3a; - accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0); - accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1); accumx0b += accumx0a; accumx1b += accumx1a; accumx0a = vmlal_lane_s32(accumx0a, vah[0], vbl[0], 0); accumx1a = vmlal_lane_s32(accumx1a, vah[0], vbl[0], 1); - accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0); - accumx3a = 
vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1); accumx0a = vmlal_lane_s32(accumx0a, val[1], delta = vbl[3] - vbh[3], 0); accumx1a = vmlal_lane_s32(accumx1a, val[1], delta, 1); - accumx2a = vmlal_lane_s32(accumx2a, val[2], delta, 0); - accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1); accumx0a = vmlal_lane_s32(accumx0a, val[2], delta = vbl[2] - vbh[2], 0); accumx1a = vmlal_lane_s32(accumx1a, val[2], delta, 1); - accumx2a = vmlal_lane_s32(accumx2a, val[3], delta, 0); - accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1); accumx0a = vmlal_lane_s32(accumx0a, val[3], delta = vbl[1] - vbh[1], 0); accumx1a = vmlal_lane_s32(accumx1a, val[3], delta, 1); - accumx2a += accumx2b; - accumx3a += accumx3b; - accumx2b = vmlal_lane_s32(accumx2b, val[0], delta, 0); - accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1); accumx0a += accumx0b; accumx1a += accumx1b; accumx0b = vmlal_lane_s32(accumx0b, val[0], delta = vbl[0] - vbh[0], 0); accumx1b = vmlal_lane_s32(accumx1b, val[0], delta, 1); - accumx2b = vmlal_lane_s32(accumx2b, val[1], delta, 0); - accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1); xx_vtrnq_s64(&accumx0a, &accumx0b); xx_vtrnq_s64(&accumx1a, &accumx1b); - xx_vtrnq_s64(&accumx2a, &accumx2b); - xx_vtrnq_s64(&accumx3a, &accumx3b); accumx0b += accumx1a; accumx0b = vsraq_n_s64(accumx0b,accumx0a,28); accumx1b = vsraq_n_s64(accumx1b,accumx0b,28); + trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b)); + vcl[0] = trn_res.val[1] & vmask; + vch[0] = trn_res.val[0] & vmask; + + + + + accumx2a = vmull_lane_s32( delta = val[2] + vah[2], vbh[3], 0); + accumx3a = vmull_lane_s32( delta, vbh[3], 1); + accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0); + accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1); + accumx2b = vmull_lane_s32( delta = val[0] + vah[0], vbh[1], 0); + accumx3b = vmull_lane_s32( delta, vbh[1], 1); + accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0); + accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1); + accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0); + accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1); + accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0); + accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1); + accumx2b += accumx2a; + accumx3b += accumx3a; + accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0); + accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1); + accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0); + accumx3a = vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1); + accumx2a = vmlal_lane_s32(accumx2a, val[2], delta = vbl[3] - vbh[3], 0); + accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1); + accumx2a = vmlal_lane_s32(accumx2a, val[3], delta = vbl[2] - vbh[2], 0); + accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1); + accumx2a += accumx2b; + accumx3a += accumx3b; + accumx2b = vmlal_lane_s32(accumx2b, val[0], delta = vbl[1] - vbh[1], 0); + accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1); + accumx2b = vmlal_lane_s32(accumx2b, val[1], delta = vbl[0] - vbh[0], 0); + accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1); + xx_vtrnq_s64(&accumx2a, &accumx2b); + xx_vtrnq_s64(&accumx3a, &accumx3b); accumx2a += accumx1b; accumx2b += accumx3a; accumx2b = vsraq_n_s64(accumx2b,accumx2a,28); accumx3b = vsraq_n_s64(accumx3b,accumx2b,28); - trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b)); - vcl[0] = trn_res.val[1] & vmask; - vch[0] = trn_res.val[0] & vmask; trn_res = vtrn_s32(vmovn_s64(accumx2a), vmovn_s64(accumx2b)); vcl[1] = trn_res.val[1] & vmask; vch[1] = trn_res.val[0] 
& vmask; carry = accumx3b; + + + accumx4a = vmull_lane_s32( delta = val[3] + vah[3], vbh[3], 0); accumx5a = vmull_lane_s32( delta, vbh[3], 1); - accumx6b = vmull_lane_s32( delta = val[0] + vah[0], vbh[3], 0); - accumx7b = vmull_lane_s32( delta, vbh[3], 1); accumx4b = accumx4a; accumx5b = accumx5a; - accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[2], 0); + accumx4b = vmlal_lane_s32(accumx4b, delta = val[0] + vah[0], vbh[2], 0); accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[2], 1); - accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0); - accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1); - accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[1], 0); + accumx4b = vmlal_lane_s32(accumx4b, delta = val[1] + vah[1], vbh[1], 0); accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[1], 1); - accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0); - accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1); - accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[0], 0); + accumx4b = vmlal_lane_s32(accumx4b, delta = val[2] + vah[2], vbh[0], 0); accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[0], 1); - accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0); - accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[0], 1); accumx4b = vmlal_lane_s32(accumx4b, vah[3], vbl[3], 0); accumx5b = vmlal_lane_s32(accumx5b, vah[3], vbl[3], 1); - accumx6a = accumx6b; - accumx7a = accumx7b; - accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0); - accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1); accumx4a += accumx4b; accumx5a += accumx5b; accumx4a = vmlal_lane_s32(accumx4a, vah[0], vbl[2], 0); accumx5a = vmlal_lane_s32(accumx5a, vah[0], vbl[2], 1); - accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0); - accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1); accumx4a = vmlal_lane_s32(accumx4a, vah[1], vbl[1], 0); accumx5a = vmlal_lane_s32(accumx5a, vah[1], vbl[1], 1); - accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0); - accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1); accumx4a = vmlal_lane_s32(accumx4a, vah[2], vbl[0], 0); accumx5a = vmlal_lane_s32(accumx5a, vah[2], vbl[0], 1); - accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0); - accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1); accumx4a = vmlal_lane_s32(accumx4a, val[3], delta = vbl[3] - vbh[3], 0); accumx5a = vmlal_lane_s32(accumx5a, val[3], delta, 1); /**/ - accumx6b = vmlal_lane_s32(accumx6b, val[0], delta, 0); - accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1); accumx4b = vmlal_lane_s32(accumx4b, val[0], delta = vbl[2] - vbh[2], 0); accumx5b = vmlal_lane_s32(accumx5b, val[0], delta, 1); - accumx6b = vmlal_lane_s32(accumx6b, val[1], delta, 0); - accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1); accumx4b = vmlal_lane_s32(accumx4b, val[1], delta = vbl[1] - vbh[1], 0); accumx5b = vmlal_lane_s32(accumx5b, val[1], delta, 1); - accumx6b = vmlal_lane_s32(accumx6b, val[2], delta, 0); - accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1); accumx4b = vmlal_lane_s32(accumx4b, val[2], delta = vbl[0] - vbh[0], 0); accumx5b = vmlal_lane_s32(accumx5b, val[2], delta, 1); - accumx6b = vmlal_lane_s32(accumx6b, val[3], delta, 0); - accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1); xx_vtrnq_s64(&accumx4a, &accumx4b); xx_vtrnq_s64(&accumx5a, &accumx5b); - xx_vtrnq_s64(&accumx6a, &accumx6b); - xx_vtrnq_s64(&accumx7a, &accumx7b); accumx4a += carry; accumx4b += accumx5a; accumx4b = vsraq_n_s64(accumx4b,accumx4a,28); accumx5b = vsraq_n_s64(accumx5b,accumx4b,28); - accumx6a += accumx5b; - accumx6b += 
accumx7a; trn_res = vtrn_s32(vmovn_s64(accumx4a), vmovn_s64(accumx4b)); vcl[2] = trn_res.val[1] & vmask; vch[2] = trn_res.val[0] & vmask; + + + + + accumx6b = vmull_lane_s32( delta = val[0] + vah[0], vbh[3], 0); + accumx7b = vmull_lane_s32( delta, vbh[3], 1); + accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0); + accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1); + accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0); + accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1); + accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0); + accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[0], 1); + accumx6a = accumx6b; + accumx7a = accumx7b; + accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0); + accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1); + accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0); + accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1); + accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0); + accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1); + accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0); + accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1); + /**/ + accumx6b = vmlal_lane_s32(accumx6b, val[0], delta = vbl[3] - vbh[3], 0); + accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1); + accumx6b = vmlal_lane_s32(accumx6b, val[1], delta = vbl[2] - vbh[2], 0); + accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1); + accumx6b = vmlal_lane_s32(accumx6b, val[2], delta = vbl[1] - vbh[1], 0); + accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1); + accumx6b = vmlal_lane_s32(accumx6b, val[3], delta = vbl[0] - vbh[0], 0); + accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1); + + xx_vtrnq_s64(&accumx6a, &accumx6b); + xx_vtrnq_s64(&accumx7a, &accumx7b); + accumx6a += accumx5b; + accumx6b += accumx7a; + accumx6b = vsraq_n_s64(accumx6b,accumx6a,28); accumx7b = vsraq_n_s64(accumx7b,accumx6b,28); trn_res = vtrn_s32(vmovn_s64(accumx6a), vmovn_s64(accumx6b)); vcl[3] = trn_res.val[1] & vmask; vch[3] = trn_res.val[0] & vmask; + accumx7b = xx_vaddup_s64(accumx7b); int32x2_t t0 = vcl[0], t1 = vch[0]; diff --git a/src/arch_neon_experimental/ec_point.c b/src/arch_neon_experimental/ec_point.c new file mode 100644 index 0000000..47c325c --- /dev/null +++ b/src/arch_neon_experimental/ec_point.c @@ -0,0 +1,962 @@ +/** + * @cond internal + * @file ec_point.c + * @copyright + * Copyright (c) 2014 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. + * @author Mike Hamburg + * @warning This file was automatically generated. 
+ */ + +#include "ec_point.h" + + +void +p448_isr ( + struct p448_t* a, + const struct p448_t* x +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L1, x ); + p448_mul ( &L2, x, &L1 ); + p448_sqr ( &L1, &L2 ); + p448_mul ( &L2, x, &L1 ); + p448_sqrn ( &L1, &L2, 3 ); + p448_mul ( &L0, &L2, &L1 ); + p448_sqrn ( &L1, &L0, 3 ); + p448_mul ( &L0, &L2, &L1 ); + p448_sqrn ( &L2, &L0, 9 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sqr ( &L0, &L1 ); + p448_mul ( &L2, x, &L0 ); + p448_sqrn ( &L0, &L2, 18 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqrn ( &L0, &L2, 37 ); + p448_mul ( &L1, &L2, &L0 ); + p448_sqrn ( &L0, &L1, 37 ); + p448_mul ( &L1, &L2, &L0 ); + p448_sqrn ( &L0, &L1, 111 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqr ( &L0, &L2 ); + p448_mul ( &L1, x, &L0 ); + p448_sqrn ( &L0, &L1, 223 ); + p448_mul ( a, &L2, &L0 ); +} + +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +) { + struct p448_t L0, L1; + p448_isr ( &L0, x ); + p448_sqr ( &L1, &L0 ); + p448_sqr ( &L0, &L1 ); + p448_mul ( a, x, &L0 ); +} + +void +add_tw_niels_to_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &e->a, &L1 ); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->b, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_weak_reduce( &d->t ); + p448_sub ( &d->y, &d->z, &d->x ); + p448_bias ( &d->y, 2 ); + p448_weak_reduce( &d->y ); + p448_add ( &L0, &d->x, &d->z ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +sub_tw_niels_from_tw_extensible ( + struct tw_extensible_t* d, + const struct tw_niels_t* e +) { + struct p448_t L0, L1; + p448_sub ( &L1, &d->y, &d->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &e->b, &L1 ); + p448_add ( &L1, &d->x, &d->y ); + p448_mul ( &d->y, &e->a, &L1 ); + p448_mul ( &L1, &d->u, &d->t ); + p448_mul ( &d->x, &e->c, &L1 ); + p448_add ( &d->u, &L0, &d->y ); + p448_sub ( &d->t, &d->y, &L0 ); + p448_bias ( &d->t, 2 ); + p448_weak_reduce( &d->t ); + p448_add ( &d->y, &d->x, &d->z ); + p448_sub ( &L0, &d->z, &d->x ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &d->z, &L0, &d->y ); + p448_mul ( &d->x, &d->y, &d->t ); + p448_mul ( &d->y, &L0, &d->u ); +} + +void +add_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + add_tw_niels_to_tw_extensible( e, &a->n ); +} + +void +sub_tw_pniels_from_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* a +) { + struct p448_t L0; + p448_mul ( &L0, &e->z, &a->z ); + p448_copy ( &e->z, &L0 ); + sub_tw_niels_from_tw_extensible( e, &a->n ); +} + +void +double_tw_extensible ( + struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &a->u, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &L1, &a->t ); + p448_sub ( &a->t, &L1, &a->u ); + p448_bias ( &a->t, 3 ); + p448_weak_reduce( &a->t ); + p448_sub ( &L1, &L0, &L2 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 1 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->z, &L1, 
&L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +double_extensible ( + struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->x ); + p448_sqr ( &L0, &a->y ); + p448_add ( &L1, &L2, &L0 ); + p448_add ( &a->t, &a->y, &a->x ); + p448_sqr ( &a->u, &a->t ); + p448_sub ( &a->t, &a->u, &L1 ); + p448_bias ( &a->t, 3 ); + p448_weak_reduce( &a->t ); + p448_sub ( &a->u, &L0, &L2 ); + p448_bias ( &a->u, 2 ); + p448_weak_reduce( &a->u ); + p448_sqr ( &a->x, &a->z ); + p448_bias ( &a->x, 2 ); + p448_add ( &a->z, &a->x, &a->x ); + p448_sub ( &L0, &a->z, &L1 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->z, &L1, &L0 ); + p448_mul ( &a->x, &L0, &a->t ); + p448_mul ( &a->y, &L1, &a->u ); +} + +void +twist_and_double ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &b->u, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &L0, &b->t ); + p448_sub ( &b->t, &L0, &b->u ); + p448_bias ( &b->t, 3 ); + p448_weak_reduce( &b->t ); + p448_sub ( &L0, &b->z, &b->x ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 2 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +untwist_and_double ( + struct extensible_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0; + p448_sqr ( &b->x, &a->x ); + p448_sqr ( &b->z, &a->y ); + p448_add ( &L0, &b->x, &b->z ); + p448_add ( &b->t, &a->y, &a->x ); + p448_sqr ( &b->u, &b->t ); + p448_sub ( &b->t, &b->u, &L0 ); + p448_bias ( &b->t, 3 ); + p448_weak_reduce( &b->t ); + p448_sub ( &b->u, &b->z, &b->x ); + p448_bias ( &b->u, 2 ); + p448_weak_reduce( &b->u ); + p448_sqr ( &b->x, &a->z ); + p448_bias ( &b->x, 1 ); + p448_add ( &b->z, &b->x, &b->x ); + p448_sub ( &b->y, &b->z, &b->u ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->z, &L0, &b->y ); + p448_mul ( &b->x, &b->y, &b->t ); + p448_mul ( &b->y, &L0, &b->u ); +} + +void +convert_tw_affine_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_affine_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( &b->n.c, &a->y, &a->x ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + p448_weak_reduce( &b->n.c ); + p448_set_ui( &b->z, 2 ); +} + +void +convert_tw_affine_to_tw_extensible ( + struct tw_extensible_t* b, + const struct tw_affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_affine_to_extensible ( + struct extensible_t* b, + const struct affine_t* a +) { + p448_copy ( &b->x, &a->x ); + p448_copy ( &b->y, &a->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &a->x ); + p448_copy ( &b->u, &a->y ); +} + +void +convert_tw_extensible_to_tw_pniels ( + struct tw_pniels_t* b, + const struct tw_extensible_t* a +) { + p448_sub ( &b->n.a, &a->y, &a->x ); + p448_bias ( &b->n.a, 2 ); + p448_weak_reduce( &b->n.a ); + p448_add ( &b->n.b, &a->x, &a->y ); + p448_weak_reduce( &b->n.b ); + p448_mul ( &b->n.c, &a->u, &a->t ); + p448_mulw ( &b->z, &b->n.c, 78164 ); + p448_neg ( &b->n.c, &b->z ); + p448_bias ( &b->n.c, 2 ); + 
p448_weak_reduce( &b->n.c ); + p448_add ( &b->z, &a->z, &a->z ); + p448_weak_reduce( &b->z ); +} + +void +convert_tw_pniels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_pniels_t* d +) { + p448_add ( &e->u, &d->n.b, &d->n.a ); + p448_sub ( &e->t, &d->n.b, &d->n.a ); + p448_bias ( &e->t, 2 ); + p448_weak_reduce( &e->t ); + p448_mul ( &e->x, &d->z, &e->t ); + p448_mul ( &e->y, &d->z, &e->u ); + p448_sqr ( &e->z, &d->z ); +} + +void +convert_tw_niels_to_tw_extensible ( + struct tw_extensible_t* e, + const struct tw_niels_t* d +) { + p448_add ( &e->y, &d->b, &d->a ); + p448_weak_reduce( &e->y ); + p448_sub ( &e->x, &d->b, &d->a ); + p448_bias ( &e->x, 2 ); + p448_weak_reduce( &e->x ); + p448_set_ui( &e->z, 1 ); + p448_copy ( &e->t, &e->x ); + p448_copy ( &e->u, &e->y ); +} + +void +montgomery_step ( + struct montgomery_t* a +) { + struct p448_t L0, L1; + p448_add ( &L0, &a->zd, &a->xd ); + p448_sub ( &L1, &a->xd, &a->zd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_sub ( &a->zd, &a->xa, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_weak_reduce( &a->zd ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_add ( &a->zd, &a->za, &a->xa ); + p448_mul ( &a->za, &L1, &a->zd ); + p448_add ( &a->xa, &a->za, &a->xd ); + p448_sqr ( &a->zd, &a->xa ); + p448_mul ( &a->xa, &a->z0, &a->zd ); + p448_sub ( &a->zd, &a->xd, &a->za ); + p448_bias ( &a->zd, 2 ); + p448_weak_reduce( &a->zd ); + p448_sqr ( &a->za, &a->zd ); + p448_sqr ( &a->xd, &L0 ); + p448_sqr ( &L0, &L1 ); + p448_mulw ( &a->zd, &a->xd, 39082 ); + p448_sub ( &L1, &a->xd, &L0 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &a->xd, &L0, &a->zd ); + p448_sub ( &L0, &a->zd, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &a->zd, &L0, &L1 ); +} + +void +deserialize_montgomery ( + struct montgomery_t* a, + const struct p448_t* sbz +) { + p448_sqr ( &a->z0, sbz ); + p448_set_ui( &a->xd, 1 ); + p448_set_ui( &a->zd, 0 ); + p448_set_ui( &a->xa, 1 ); + p448_copy ( &a->za, &a->z0 ); +} + +mask_t +serialize_montgomery ( + struct p448_t* b, + const struct montgomery_t* a, + const struct p448_t* sbz +) { + mask_t L4, L5, L6; + struct p448_t L0, L1, L2, L3; + p448_mul ( &L3, &a->z0, &a->zd ); + p448_sub ( &L1, &L3, &a->xd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L3, &a->za, &L1 ); + p448_mul ( &L2, &a->z0, &a->xd ); + p448_sub ( &L1, &L2, &a->zd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &a->xa, &L1 ); + p448_add ( &L2, &L0, &L3 ); + p448_sub ( &L1, &L3, &L0 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L3, &L1, &L2 ); + p448_copy ( &L2, &a->z0 ); + p448_addw ( &L2, 1 ); + p448_sqr ( &L1, &L2 ); + p448_mulw ( &L2, &L1, 39082 ); + p448_neg ( &L1, &L2 ); + p448_add ( &L2, &a->z0, &a->z0 ); + p448_bias ( &L2, 1 ); + p448_add ( &L0, &L2, &L2 ); + p448_add ( &L2, &L0, &L1 ); + p448_weak_reduce( &L2 ); + p448_mul ( &L0, &a->xd, &L2 ); + L5 = p448_is_zero( &a->zd ); + L6 = - L5; + p448_mask ( &L1, &L0, L5 ); + p448_add ( &L2, &L1, &a->zd ); + L4 = ~ L5; + p448_mul ( &L1, sbz, &L3 ); + p448_addw ( &L1, L6 ); + p448_mul ( &L3, &L2, &L1 ); + p448_mul ( &L1, &L3, &L2 ); + p448_mul ( &L2, &L3, &a->xd ); + p448_mul ( &L3, &L1, &L2 ); + p448_isr ( &L0, &L3 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, &L3, &L1 ); + p448_mask ( b, &L2, L4 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + L5 = p448_is_zero( &L0 ); + L4 = p448_is_zero( sbz ); + return L5 | L4; +} + +void +serialize_extensible ( + struct 
p448_t* b, + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sub ( &L0, &a->y, &a->z ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_add ( b, &a->z, &a->y ); + p448_mul ( &L1, &a->z, &a->x ); + p448_mul ( &L2, &L0, &L1 ); + p448_mul ( &L1, &L2, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( &L2, &L1, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( b, &L1, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, &L2, &L1 ); +} + +void +untwist_and_double_and_serialize ( + struct p448_t* b, + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_mul ( &L3, &a->y, &a->x ); + p448_add ( b, &a->y, &a->x ); + p448_sqr ( &L1, b ); + p448_add ( &L2, &L3, &L3 ); + p448_sub ( b, &L1, &L2 ); + p448_bias ( b, 3 ); + p448_weak_reduce( b ); + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &L2 ); + p448_add ( &L2, b, b ); + p448_mulw ( b, &L2, 39082 ); + p448_neg ( &L2, b ); + p448_bias ( &L2, 2 ); + p448_mulw ( &L0, &L2, 39082 ); + p448_neg ( b, &L0 ); + p448_bias ( b, 2 ); + p448_mul ( &L0, &L2, &L1 ); + p448_mul ( &L2, b, &L0 ); + p448_isr ( &L0, &L2 ); + p448_mul ( &L1, b, &L0 ); + p448_sqr ( b, &L0 ); + p448_mul ( &L0, &L2, b ); + p448_mul ( b, &L1, &L3 ); +} + +void +twist_even ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L0, L1; + p448_sqr ( &b->y, &a->z ); + p448_sqr ( &b->z, &a->x ); + p448_sub ( &b->u, &b->y, &b->z ); + p448_bias ( &b->u, 2 ); + p448_weak_reduce( &b->u ); + p448_sub ( &b->z, &a->z, &a->x ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->y, &b->z, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->x, &b->z, &b->y ); + p448_mul ( &b->t, &b->x, &b->u ); + p448_mul ( &b->y, &b->x, &b->t ); + p448_isr ( &b->t, &b->y ); + p448_mul ( &b->u, &b->x, &b->t ); + p448_sqr ( &b->x, &b->t ); + p448_mul ( &b->t, &b->y, &b->x ); + p448_mul ( &b->x, &a->x, &b->u ); + p448_mul ( &b->y, &a->y, &b->u ); + L1 = p448_is_zero( &b->z ); + L0 = - L1; + p448_addw ( &b->y, L0 ); + p448_weak_reduce( &b->y ); + p448_set_ui( &b->z, 1 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +void +test_only_twist ( + struct tw_extensible_t* b, + const struct extensible_t* a +) { + mask_t L2, L3; + struct p448_t L0, L1; + p448_sqr ( &b->u, &a->z ); + p448_sqr ( &b->y, &a->x ); + p448_sub ( &b->z, &b->u, &b->y ); + p448_bias ( &b->z, 2 ); + p448_add ( &b->y, &b->z, &b->z ); + p448_add ( &b->u, &b->y, &b->y ); + p448_weak_reduce( &b->u ); + p448_sub ( &b->y, &a->z, &a->x ); + p448_bias ( &b->y, 2 ); + p448_weak_reduce( &b->y ); + p448_mul ( &b->x, &b->y, &a->y ); + p448_sub ( &b->z, &a->z, &a->y ); + p448_bias ( &b->z, 2 ); + p448_weak_reduce( &b->z ); + p448_mul ( &b->t, &b->z, &b->x ); + p448_mul ( &L1, &b->t, &b->u ); + p448_mul ( &b->x, &b->t, &L1 ); + p448_isr ( &L0, &b->x ); + p448_mul ( &b->u, &b->t, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &b->t, &b->x, &L1 ); + p448_add ( &L1, &a->y, &a->x ); + p448_weak_reduce( &L1 ); + p448_sub ( &L0, &a->x, &a->y ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &b->x, &b->t, &L0 ); + p448_add ( &L0, &b->x, &L1 ); + p448_sub ( &b->t, &L1, &b->x ); + p448_bias ( &b->t, 2 ); + p448_weak_reduce( &b->t ); + p448_mul ( &b->x, &L0, &b->u ); + L2 = p448_is_zero( &b->y ); + L3 = - L2; + p448_addw ( &b->x, L3 ); + p448_weak_reduce( &b->x ); + p448_mul ( &b->y, &b->t, &b->u ); + L2 = p448_is_zero( &b->z ); + L3 = - L2; + p448_addw ( &b->y, L3 ); + p448_weak_reduce( &b->y ); + L3 = 
p448_is_zero( &a->y ); + L2 = L3 + 1; + p448_set_ui( &b->z, L2 ); + p448_copy ( &b->t, &b->x ); + p448_copy ( &b->u, &b->y ); +} + +mask_t +is_square ( + const struct p448_t* x +) { + mask_t L2, L3; + struct p448_t L0, L1; + p448_isr ( &L0, x ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, x, &L1 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + L3 = p448_is_zero( &L0 ); + L2 = p448_is_zero( x ); + return L3 | L2; +} + +mask_t +is_even_pt ( + const struct extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +is_even_tw ( + const struct tw_extensible_t* a +) { + struct p448_t L0, L1, L2; + p448_sqr ( &L2, &a->z ); + p448_sqr ( &L1, &a->x ); + p448_add ( &L0, &L1, &L2 ); + p448_weak_reduce( &L0 ); + return is_square ( &L0 ); +} + +mask_t +deserialize_affine ( + struct affine_t* a, + const struct p448_t* sz +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L1, sz ); + p448_copy ( &L3, &L1 ); + p448_addw ( &L3, 1 ); + p448_sqr ( &a->x, &L3 ); + p448_mulw ( &L3, &a->x, 39082 ); + p448_neg ( &a->x, &L3 ); + p448_add ( &L3, &L1, &L1 ); + p448_bias ( &L3, 1 ); + p448_add ( &a->y, &L3, &L3 ); + p448_add ( &L3, &a->y, &a->x ); + p448_weak_reduce( &L3 ); + p448_copy ( &a->y, &L1 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &a->x, &a->y ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_mul ( &a->y, &a->x, &L3 ); + p448_sqr ( &L2, &a->x ); + p448_mul ( &L0, &L2, &a->y ); + p448_mul ( &a->y, &a->x, &L0 ); + p448_isr ( &L3, &a->y ); + p448_mul ( &a->y, &L2, &L3 ); + p448_sqr ( &L2, &L3 ); + p448_mul ( &L3, &L0, &L2 ); + p448_mul ( &L0, &a->x, &L3 ); + p448_add ( &L2, &a->y, &a->y ); + p448_mul ( &a->x, sz, &L2 ); + p448_addw ( &L1, 1 ); + p448_mul ( &a->y, &L1, &L3 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + return p448_is_zero( &L0 ); +} + +mask_t +deserialize_and_twist_approx ( + struct tw_extensible_t* a, + const struct p448_t* sdm1, + const struct p448_t* sz +) { + struct p448_t L0, L1; + p448_sqr ( &a->z, sz ); + p448_copy ( &a->y, &a->z ); + p448_addw ( &a->y, 1 ); + p448_sqr ( &a->x, &a->y ); + p448_mulw ( &a->y, &a->x, 39082 ); + p448_neg ( &a->x, &a->y ); + p448_add ( &a->y, &a->z, &a->z ); + p448_bias ( &a->y, 1 ); + p448_add ( &a->u, &a->y, &a->y ); + p448_add ( &a->y, &a->u, &a->x ); + p448_weak_reduce( &a->y ); + p448_sqr ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &a->u, &a->x ); + p448_bias ( &a->u, 2 ); + p448_weak_reduce( &a->u ); + p448_mul ( &a->x, sdm1, &a->u ); + p448_mul ( &L0, &a->x, &a->y ); + p448_mul ( &a->t, &L0, &a->y ); + p448_mul ( &a->u, &a->x, &a->t ); + p448_mul ( &a->t, &a->u, &L0 ); + p448_mul ( &a->y, &a->x, &a->t ); + p448_isr ( &L0, &a->y ); + p448_mul ( &a->y, &a->u, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &a->u, &a->t, &L1 ); + p448_mul ( &a->t, &a->x, &a->u ); + p448_add ( &a->x, sz, sz ); + p448_mul ( &L0, &a->u, &a->x ); + p448_copy ( &a->x, &a->z ); + p448_subw ( &a->x, 1 ); + p448_neg ( &L1, &a->x ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &a->x, &L1, &L0 ); + p448_mul ( &L0, &a->u, &a->y ); + p448_addw ( &a->z, 1 ); + p448_mul ( &a->y, &a->z, &L0 ); + p448_subw ( &a->t, 1 ); + p448_bias ( &a->t, 1 ); + mask_t ret = p448_is_zero( &a->t ); + p448_set_ui( &a->z, 1 ); + p448_copy ( &a->t, &a->x ); + p448_copy ( &a->u, &a->y ); + return ret; +} + +void +set_identity_extensible ( + struct extensible_t* a +) { + p448_set_ui( &a->x, 0 
); + p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_tw_extensible ( + struct tw_extensible_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); + p448_set_ui( &a->z, 1 ); + p448_set_ui( &a->t, 0 ); + p448_set_ui( &a->u, 0 ); +} + +void +set_identity_affine ( + struct affine_t* a +) { + p448_set_ui( &a->x, 0 ); + p448_set_ui( &a->y, 1 ); +} + +mask_t +eq_affine ( + const struct affine_t* a, + const struct affine_t* b +) { + mask_t L1, L2; + struct p448_t L0; + p448_sub ( &L0, &a->x, &b->x ); + p448_bias ( &L0, 2 ); + L2 = p448_is_zero( &L0 ); + p448_sub ( &L0, &a->y, &b->y ); + p448_bias ( &L0, 2 ); + L1 = p448_is_zero( &L0 ); + return L2 & L1; +} + +mask_t +eq_extensible ( + const struct extensible_t* a, + const struct extensible_t* b +) { + mask_t L3, L4; + struct p448_t L0, L1, L2; + p448_mul ( &L2, &b->z, &a->x ); + p448_mul ( &L1, &a->z, &b->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + p448_mul ( &L2, &b->z, &a->y ); + p448_mul ( &L1, &a->z, &b->y ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L3 = p448_is_zero( &L0 ); + return L4 & L3; +} + +mask_t +eq_tw_extensible ( + const struct tw_extensible_t* a, + const struct tw_extensible_t* b +) { + mask_t L3, L4; + struct p448_t L0, L1, L2; + p448_mul ( &L2, &b->z, &a->x ); + p448_mul ( &L1, &a->z, &b->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + p448_mul ( &L2, &b->z, &a->y ); + p448_mul ( &L1, &a->z, &b->y ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L3 = p448_is_zero( &L0 ); + return L4 & L3; +} + +void +elligator_2s_inject ( + struct affine_t* a, + const struct p448_t* r +) { + mask_t L0, L1; + struct p448_t L2, L3, L4, L5, L6, L7, L8; + p448_sqr ( &a->x, r ); + p448_sqr ( &L3, &a->x ); + p448_copy ( &a->y, &L3 ); + p448_subw ( &a->y, 1 ); + p448_neg ( &L4, &a->y ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_sqr ( &L2, &L4 ); + p448_mulw ( &L7, &L2, 1527402724 ); + p448_mulw ( &L8, &L3, 6108985600 ); + p448_add ( &a->y, &L8, &L7 ); + p448_weak_reduce( &a->y ); + p448_mulw ( &L8, &L2, 6109454568 ); + p448_sub ( &L7, &a->y, &L8 ); + p448_bias ( &L7, 2 ); + p448_weak_reduce( &L7 ); + p448_mulw ( &L6, &a->y, 78160 ); + p448_mul ( &L5, &L7, &L6 ); + p448_mul ( &L8, &L5, &L4 ); + p448_mul ( &L4, &L5, &L6 ); + p448_mul ( &L5, &L7, &L8 ); + p448_mul ( &L8, &L5, &L4 ); + p448_mul ( &L4, &L7, &L8 ); + p448_isr ( &L6, &L4 ); + p448_mul ( &L4, &L5, &L6 ); + p448_sqr ( &L5, &L6 ); + p448_mul ( &L6, &L8, &L5 ); + p448_mul ( &L8, &L7, &L6 ); + p448_mul ( &L7, &L8, &L6 ); + p448_copy ( &L6, &a->x ); + p448_subw ( &L6, 1 ); + p448_addw ( &a->x, 1 ); + p448_mul ( &L5, &a->x, &L8 ); + p448_sub ( &a->x, &L6, &L5 ); + p448_bias ( &a->x, 3 ); + p448_weak_reduce( &a->x ); + p448_mul ( &L5, &L4, &a->x ); + p448_mulw ( &L4, &L5, 78160 ); + p448_neg ( &a->x, &L4 ); + p448_bias ( &a->x, 2 ); + p448_weak_reduce( &a->x ); + p448_add ( &L4, &L3, &L3 ); + p448_add ( &L3, &L4, &L2 ); + p448_subw ( &L3, 2 ); + p448_bias ( &L3, 1 ); + p448_weak_reduce( &L3 ); + p448_mul ( &L2, &L3, &L8 ); + p448_mulw ( &L3, &L2, 3054649120 ); + p448_add ( &L2, &L3, &a->y ); + p448_mul ( &a->y, &L7, &L2 ); + L1 = p448_is_zero( &L8 ); + L0 = - L1; + p448_addw ( &a->y, L0 ); + p448_weak_reduce( &a->y ); +} + +mask_t +validate_affine ( + const struct affine_t* a +) { + struct p448_t L0, L1, L2, L3; + p448_sqr ( &L0, &a->y ); + p448_sqr ( &L2, &a->x ); + p448_add ( &L3, 
&L2, &L0 ); + p448_subw ( &L3, 1 ); + p448_mulw ( &L1, &L2, 39081 ); + p448_neg ( &L2, &L1 ); + p448_bias ( &L2, 2 ); + p448_mul ( &L1, &L0, &L2 ); + p448_sub ( &L0, &L3, &L1 ); + p448_bias ( &L0, 3 ); + return p448_is_zero( &L0 ); +} + +mask_t +validate_tw_extensible ( + const struct tw_extensible_t* ext +) { + mask_t L4, L5; + struct p448_t L0, L1, L2, L3; + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L1, &ext->t, &ext->u ); + p448_mul ( &L2, &ext->z, &L1 ); + p448_addw ( &L2, 0 ); + p448_mul ( &L0, &ext->x, &ext->y ); + p448_neg ( &L1, &L0 ); + p448_add ( &L0, &L1, &L2 ); + p448_bias ( &L0, 2 ); + L5 = p448_is_zero( &L0 ); + /* + * Check invariant: + * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 + */ + p448_sqr ( &L2, &ext->y ); + p448_neg ( &L1, &L2 ); + p448_addw ( &L1, 0 ); + p448_sqr ( &L0, &ext->x ); + p448_add ( &L2, &L0, &L1 ); + p448_sqr ( &L3, &ext->u ); + p448_sqr ( &L0, &ext->t ); + p448_mul ( &L1, &L0, &L3 ); + p448_mulw ( &L0, &L1, 39081 ); + p448_neg ( &L3, &L0 ); + p448_add ( &L0, &L3, &L2 ); + p448_neg ( &L3, &L1 ); + p448_add ( &L2, &L3, &L0 ); + p448_sqr ( &L1, &ext->z ); + p448_add ( &L0, &L1, &L2 ); + p448_bias ( &L0, 4 ); + L4 = p448_is_zero( &L0 ); + return L5 & L4; +} + +mask_t +validate_extensible ( + const struct extensible_t* ext +) { + mask_t L4, L5; + struct p448_t L0, L1, L2, L3; + /* + * Check invariant: + * 0 = d*t^2*u^2 - x^2 - y^2 + z^2 + */ + p448_sqr ( &L2, &ext->y ); + p448_neg ( &L1, &L2 ); + p448_addw ( &L1, 0 ); + p448_sqr ( &L0, &ext->z ); + p448_add ( &L2, &L0, &L1 ); + p448_sqr ( &L3, &ext->u ); + p448_sqr ( &L0, &ext->t ); + p448_mul ( &L1, &L0, &L3 ); + p448_mulw ( &L3, &L1, 39081 ); + p448_neg ( &L0, &L3 ); + p448_add ( &L1, &L0, &L2 ); + p448_sqr ( &L0, &ext->x ); + p448_neg ( &L2, &L0 ); + p448_add ( &L0, &L2, &L1 ); + p448_bias ( &L0, 4 ); + L5 = p448_is_zero( &L0 ); + /* + * Check invariant: + * 0 = -x*y + z*t*u + */ + p448_mul ( &L1, &ext->t, &ext->u ); + p448_mul ( &L2, &ext->z, &L1 ); + p448_addw ( &L2, 0 ); + p448_mul ( &L0, &ext->x, &ext->y ); + p448_neg ( &L1, &L0 ); + p448_add ( &L0, &L1, &L2 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + return L5 & L4; +} + + diff --git a/src/arch_neon_experimental/p448.c b/src/arch_neon_experimental/p448.c new file mode 100644 index 0000000..c7ee0f6 --- /dev/null +++ b/src/arch_neon_experimental/p448.c @@ -0,0 +1,1207 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. 
+ */ + +#include "word.h" +#include "p448.h" + +static inline mask_t __attribute__((always_inline)) +is_zero ( + word_t x +) { + dword_t xx = x; + xx--; + return xx >> WORD_BITS; +} + +static __inline__ void __attribute__((gnu_inline,always_inline,unused)) +xx_vtrnq_s64 ( + int64x2_t *x, + int64x2_t *y +) { + __asm__ __volatile__ ("vswp %f0, %e1" : "+w"(*x), "+w"(*y)); +} + +static __inline__ int64x2_t __attribute__((gnu_inline,always_inline,unused)) +xx_vaddup_s64(int64x2_t x) { + __asm__ ("vadd.s64 %f0, %e0" : "+w"(x)); + return x; +} + +static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused)) +xx_vaddup_u64(uint64x2_t x) { + __asm__ ("vadd.s64 %f0, %e0" : "+w"(x)); + return x; +} + +static __inline__ int64x2_t __attribute__((gnu_inline,always_inline,unused)) +vrev128_s64(int64x2_t x) { + __asm__ ("vswp.s64 %e0, %f0" : "+w"(x)); + return x; +} + +static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline)) +vrev128_u64(uint64x2_t x) { + __asm__ ("vswp.s64 %e0, %f0" : "+w"(x)); + return x; +} + +static inline void __attribute__((gnu_inline,always_inline,unused)) +smlal ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b; +} + +static inline void __attribute__((gnu_inline,always_inline,unused)) +smlal2 ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; +} + +static inline void __attribute__((gnu_inline,always_inline,unused)) +smull ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b; +} + +static inline void __attribute__((gnu_inline,always_inline,unused)) +smull2 ( + uint64_t *acc, + const uint32_t a, + const uint32_t b +) { + *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; +} + +static inline int64x2_t __attribute__((always_inline,unused)) +SER(int64x2_t x) { + __asm__ __volatile__("" : "+w"(x)); + return x; +} +#define svmull_lane_s32(a,b,c) SER(vmull_lane_s32(a,b,c)) +#define svmlal_s32(a,b,c) SER(vmlal_s32(a,b,c)) +#define svmlal_lane_s32(a,b,c,d) SER(vmlal_lane_s32(a,b,c,d)) + + +// static inline int64x2_t __attribute__((always_inline,unused)) +// xvmlal_lane_s32(int64x2_t acc, int32x2_t a, int32x2_t b, const int lane) { +// __asm__ volatile ( +// "vmlal.s32 %0, %1, %2[%c3]" +// : "+w"(acc) +// : "w"(a), "w"(b), "i"(lane) +// ); +// return acc; +// } + +void +p448_mul ( + p448_t *__restrict__ cs, + const p448_t *as, + const p448_t *bs +) { + register int32x4_t al0 __asm__("q6"); + register int32x4_t ah0 __asm__("q7"); + register int32x4_t as0 __asm__("q8"); + register int32x4_t al2 __asm__("q9"); + register int32x4_t ah2 __asm__("q10"); + register int32x4_t as2 __asm__("q11"); + + register int32x4_t bl0 __asm__("q0"); + register int32x4_t bh0 __asm__("q1"); + register int32x4_t bs0 __asm__("q2"); + register int32x4_t bl2 __asm__("q3"); + register int32x4_t bh2 __asm__("q4"); + register int32x4_t bs2 __asm__("q5"); + + int32x2_t *vc = (int32x2_t*) cs->limb, *vcasm = vc; + + register int64x2_t acc0a __asm__("q12"); + register int64x2_t acc0b __asm__("q13"); + register int64x2_t acc1a __asm__("q14"); + register int64x2_t acc1b __asm__("q15"); + + __asm__ __volatile__( + + "vld2.32 {%e[al0],%f[al0],%e[ah0],%f[ah0]}, [%[a],:64]!" "\n\t" + "vadd.i32 %[as0], %[al0], %[ah0]" "\n\t" + + "vld2.32 {%e[bl0],%f[bl0],%e[bh0],%f[bh0]}, [%[b],:64]!" 
"\n\t" + "vadd.i32 %f[bs0], %f[bl0], %f[bh0]" "\n\t" + "vsub.i32 %e[bs0], %e[bl0], %e[bh0]" "\n\t" + + "vld2.32 {%e[bl2],%f[bl2],%e[bh2],%f[bh2]}, [%[b],:64]!" "\n\t" + "vadd.i32 %[bs2], %[bl2], %[bh2]" "\n\t" + + "vld2.32 {%e[al2],%f[al2],%e[ah2],%f[ah2]}, [%[a],:64]!" "\n\t" + "vadd.i32 %[as2], %[al2], %[ah2]" "\n\t" + + "vmull.s32 %[a0b], %f[as0], %f[bs2][0]" "\n\t" + "vmlal.s32 %[a0b], %e[as2], %e[bs2][0]" "\n\t" + "vmlal.s32 %[a0b], %f[as2], %f[bs0][0]" "\n\t" + "vmlal.s32 %[a0b], %e[as0], %e[bh0][0]" "\n\t" + + "vmull.s32 %[a1b], %f[as0], %f[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[as2], %e[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %f[as2], %f[bs0][1]" "\n\t" + "vmlal.s32 %[a1b], %e[as0], %e[bh0][1]" "\n\t" + + "vmov %[a0a], %[a0b]" "\n\t" + "vmlal.s32 %[a0a], %f[ah0], %f[bh2][0]" "\n\t" + "vmlal.s32 %[a0a], %e[ah2], %e[bh2][0]" "\n\t" + "vmlal.s32 %[a0a], %f[ah2], %f[bh0][0]" "\n\t" + "vmlal.s32 %[a0a], %e[ah0], %e[bl0][0]" "\n\t" + + "vmlsl.s32 %[a0b], %f[al0], %f[bl2][0]" "\n\t" + "vmlsl.s32 %[a0b], %e[al2], %e[bl2][0]" "\n\t" + "vmlsl.s32 %[a0b], %f[al2], %f[bl0][0]" "\n\t" + "vmlal.s32 %[a0b], %e[al0], %e[bs0][0]" "\n\t" + + "vmov %[a1a], %[a1b]" "\n\t" + "vmlal.s32 %[a1a], %f[ah0], %f[bh2][1]" "\n\t" + "vmlal.s32 %[a1a], %e[ah2], %e[bh2][1]" "\n\t" + "vmlal.s32 %[a1a], %f[ah2], %f[bh0][1]" "\n\t" + "vmlal.s32 %[a1a], %e[ah0], %e[bl0][1]" "\n\t" + + "vswp %f[a0b], %e[a0a]" "\n\t" + + "vmlsl.s32 %[a1b], %f[al0], %f[bl2][1]" "\n\t" + "vmlsl.s32 %[a1b], %e[al2], %e[bl2][1]" "\n\t" + "vmlsl.s32 %[a1b], %f[al2], %f[bl0][1]" "\n\t" + "vmlal.s32 %[a1b], %e[al0], %e[bs0][1]" "\n\t" + + "vsra.s64 %[a0a], %[a0b], #28" "\n\t" + "vsub.i32 %f[bs0], %f[bl0], %f[bh0]" "\n\t" + "vmovn.i64 %e[a0b], %[a0b]" "\n\t" + + "vswp %f[a1b], %e[a1a]" "\n\t" + "vadd.i64 %[a1b], %[a0a], %[a1b]" "\n\t" + + + "vmull.s32 %[a0a], %e[as2], %f[bs2][0]" "\n\t" + "vmovn.i64 %f[a0b], %[a1b]" "\n\t" + "vmlal.s32 %[a0a], %f[as2], %e[bs2][0]" "\n\t" + "vsra.s64 %[a1a], %[a1b], #28" "\n\t" + "vmlal.s32 %[a0a], %e[as0], %f[bh0][0]" "\n\t" + "vbic.i32 %[a0b], #0xf0000000" "\n\t" + "vmlal.s32 %[a0a], %f[as0], %e[bh0][0]" "\n\t" + "vstmia %[c]!, {%e[a0b], %f[a0b]}" "\n\t" + + "vmull.s32 %[a1b], %e[as2], %f[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %f[as2], %e[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[as0], %f[bh0][1]" "\n\t" + "vmlal.s32 %[a1b], %f[as0], %e[bh0][1]" "\n\t" + + "vmov %f[a0b], %f[a0a]" "\n\t" + "vadd.i64 %e[a0b], %e[a0a], %e[a1a]" "\n\t" + "vadd.i64 %e[a0a], %e[a0a], %f[a1a]" "\n\t" + "vmlal.s32 %[a0a], %e[ah2], %f[bh2][0]" "\n\t" + "vmlal.s32 %[a0a], %f[ah2], %e[bh2][0]" "\n\t" + "vmlal.s32 %[a0a], %e[ah0], %f[bl0][0]" "\n\t" + "vmlal.s32 %[a0a], %f[ah0], %e[bl0][0]" "\n\t" + + "vmlsl.s32 %[a0b], %e[al2], %f[bl2][0]" "\n\t" + "vmlsl.s32 %[a0b], %f[al2], %e[bl2][0]" "\n\t" + "vmlal.s32 %[a0b], %e[al0], %f[bs0][0]" "\n\t" + "vmlal.s32 %[a0b], %f[al0], %e[bs0][0]" "\n\t" + + "vmov %[a1a], %[a1b]" "\n\t" + "vmlal.s32 %[a1a], %e[ah2], %f[bh2][1]" "\n\t" + "vmlal.s32 %[a1a], %f[ah2], %e[bh2][1]" "\n\t" + "vmlal.s32 %[a1a], %e[ah0], %f[bl0][1]" "\n\t" + "vmlal.s32 %[a1a], %f[ah0], %e[bl0][1]" "\n\t" + + "vswp %f[a0b], %e[a0a]" "\n\t" + + "vmlsl.s32 %[a1b], %e[al2], %f[bl2][1]" "\n\t" + "vmlsl.s32 %[a1b], %f[al2], %e[bl2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[al0], %f[bs0][1]" "\n\t" + "vmlal.s32 %[a1b], %f[al0], %e[bs0][1]" "\n\t" + + "vsra.s64 %[a0a], %[a0b], #28" "\n\t" + "vsub.i32 %e[bs2], %e[bl2], %e[bh2]" "\n\t" + "vmovn.i64 %e[a0b], %[a0b]" "\n\t" + + "vswp %f[a1b], %e[a1a]" "\n\t" + "vadd.i64 %[a1b], %[a0a], %[a1b]" "\n\t" 
+ + "vmull.s32 %[a0a], %f[as2], %f[bs2][0]" "\n\t" + "vmovn.i64 %f[a0b], %[a1b]" "\n\t" + "vmlal.s32 %[a0a], %e[as0], %e[bh2][0]" "\n\t" + "vsra.s64 %[a1a], %[a1b], #28" "\n\t" + "vmlal.s32 %[a0a], %f[as0], %f[bh0][0]" "\n\t" + "vbic.i32 %[a0b], #0xf0000000" "\n\t" + "vmlal.s32 %[a0a], %e[as2], %e[bh0][0]" "\n\t" + "vstmia %[c]!, {%e[a0b], %f[a0b]}" "\n\t" + + "vmull.s32 %[a1b], %f[as2], %f[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[as0], %e[bh2][1]" "\n\t" + "vmlal.s32 %[a1b], %f[as0], %f[bh0][1]" "\n\t" + "vmlal.s32 %[a1b], %e[as2], %e[bh0][1]" "\n\t" + + "vmov %f[a0b], %f[a0a]" "\n\t" + "vadd.i64 %e[a0b], %e[a0a], %e[a1a]" "\n\t" + "vadd.i64 %e[a0a], %e[a0a], %f[a1a]" "\n\t" + "vmlal.s32 %[a0a], %f[ah2], %f[bh2][0]" "\n\t" + "vmlal.s32 %[a0a], %e[ah0], %e[bl2][0]" "\n\t" + "vmlal.s32 %[a0a], %f[ah0], %f[bl0][0]" "\n\t" + "vmlal.s32 %[a0a], %e[ah2], %e[bl0][0]" "\n\t" + + "vmlsl.s32 %[a0b], %f[al2], %f[bl2][0]" "\n\t" + "vmlal.s32 %[a0b], %e[al0], %e[bs2][0]" "\n\t" + "vmlal.s32 %[a0b], %f[al0], %f[bs0][0]" "\n\t" + "vmlal.s32 %[a0b], %e[al2], %e[bs0][0]" "\n\t" + + "vmov %[a1a], %[a1b]" "\n\t" + "vmlal.s32 %[a1a], %f[ah2], %f[bh2][1]" "\n\t" + "vmlal.s32 %[a1a], %e[ah0], %e[bl2][1]" "\n\t" + "vmlal.s32 %[a1a], %f[ah0], %f[bl0][1]" "\n\t" + "vmlal.s32 %[a1a], %e[ah2], %e[bl0][1]" "\n\t" + + "vswp %f[a0b], %e[a0a]" "\n\t" + + "vmlsl.s32 %[a1b], %f[al2], %f[bl2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[al0], %e[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %f[al0], %f[bs0][1]" "\n\t" + "vmlal.s32 %[a1b], %e[al2], %e[bs0][1]" "\n\t" + + "vsub.i32 %f[bs2], %f[bl2], %f[bh2]" "\n\t" + "vsra.s64 %[a0a], %[a0b], #28" "\n\t" + "vmovn.i64 %e[a0b], %[a0b]" "\n\t" + + "vswp %f[a1b], %e[a1a]" "\n\t" + "vadd.i64 %[a1b], %[a0a], %[a1b]" "\n\t" + + "vmull.s32 %[a0a], %e[as0], %f[bh2][0]" "\n\t" + "vmovn.i64 %f[a0b], %[a1b]" "\n\t" + "vmlal.s32 %[a0a], %f[as0], %e[bh2][0]" "\n\t" + "vsra.s64 %[a1a], %[a1b], #28" "\n\t" + "vmlal.s32 %[a0a], %e[as2], %f[bh0][0]" "\n\t" + "vbic.i32 %[a0b], #0xf0000000" "\n\t" + "vmlal.s32 %[a0a], %f[as2], %e[bh0][0]" "\n\t" + "vstmia %[c]!, {%e[a0b], %f[a0b]}" "\n\t" + + "vmull.s32 %[a1b], %e[as0], %f[bh2][1]" "\n\t" + "vmlal.s32 %[a1b], %f[as0], %e[bh2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[as2], %f[bh0][1]" "\n\t" + "vmlal.s32 %[a1b], %f[as2], %e[bh0][1]" "\n\t" + + "vmov %f[a0b], %f[a0a]" "\n\t" + "vadd.i64 %e[a0b], %e[a0a], %e[a1a]" "\n\t" + "vadd.i64 %e[a0a], %e[a0a], %f[a1a]" "\n\t" + "vmlal.s32 %[a0a], %e[ah0], %f[bl2][0]" "\n\t" + "vmlal.s32 %[a0a], %f[ah0], %e[bl2][0]" "\n\t" + "vmlal.s32 %[a0a], %e[ah2], %f[bl0][0]" "\n\t" + "vmlal.s32 %[a0a], %f[ah2], %e[bl0][0]" "\n\t" + + "vmlal.s32 %[a0b], %e[al0], %f[bs2][0]" "\n\t" + "vmlal.s32 %[a0b], %f[al0], %e[bs2][0]" "\n\t" + "vmlal.s32 %[a0b], %e[al2], %f[bs0][0]" "\n\t" + "vmlal.s32 %[a0b], %f[al2], %e[bs0][0]" "\n\t" + + "vmov %[a1a], %[a1b]" "\n\t" + "vmlal.s32 %[a1a], %e[ah0], %f[bl2][1]" "\n\t" + "vmlal.s32 %[a1a], %f[ah0], %e[bl2][1]" "\n\t" + "vmlal.s32 %[a1a], %e[ah2], %f[bl0][1]" "\n\t" + "vmlal.s32 %[a1a], %f[ah2], %e[bl0][1]" "\n\t" + + "vswp %f[a0b], %e[a0a]" "\n\t" + + "vmlal.s32 %[a1b], %e[al0], %f[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %f[al0], %e[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[al2], %f[bs0][1]" "\n\t" + "vmlal.s32 %[a1b], %f[al2], %e[bs0][1]" "\n\t" + + "vsra.s64 %[a0a], %[a0b], #28" "\n\t" + "vmovn.i64 %e[a0b], %[a0b]" "\n\t" + + "vswp %f[a1b], %e[a1a]" "\n\t" + "vadd.i64 %[a0a], %[a0a], %[a1b]" "\n\t" + + "vmovn.i64 %f[a0b], %[a0a]" "\n\t" + "vsra.s64 %[a1a], %[a0a], #28" "\n\t" + + "vbic.i32 %[a0b], #0xf0000000" 
"\n\t" + + "vswp %e[a1a], %f[a1a]" "\n\t" + + "vstmia %[c]!, {%e[a0b], %f[a0b]}" "\n\t" + "sub %[c], #64" "\n\t" + + "vadd.i64 %f[a1a], %f[a1a], %e[a1a]" "\n\t" + + "vldmia %[c], {%e[a0a], %f[a0a], %e[a0b]}" "\n\t" + "vaddw.s32 %[a1a], %e[a0a]" "\n\t" + "vmovn.i64 %e[a0a], %[a1a]" "\n\t" + "vshr.s64 %[a1a], #28" "\n\t" + + "vaddw.s32 %[a1a], %f[a0a]" "\n\t" + "vmovn.i64 %f[a0a], %[a1a]" "\n\t" + "vshr.s64 %[a1a], #28" "\n\t" + + "vbic.i32 %[a0a], #0xf0000000" "\n\t" + + "vaddw.s32 %[a1a], %e[a0b]" "\n\t" + "vmovn.i64 %e[a0b], %[a1a]" "\n\t" + + "vstmia %[c], {%e[a0a], %f[a0a], %e[a0b]}" "\n\t" + + : [a0a]"=w"(acc0a) + , [a0b]"=w"(acc0b) + , [a1a]"=w"(acc1a) + , [a1b]"=w"(acc1b) + , [a]"+r"(as) + , [b]"+r"(bs) + , [c]"+r"(vcasm) + + , [al0]"=w"(al0) + , [ah0]"=w"(ah0) + , [as0]"=w"(as0) + , [al2]"=w"(al2) + , [ah2]"=w"(ah2) + , [as2]"=w"(as2) + + , [bh0]"=w"(bh0) + , [bh2]"=w"(bh2) + + , [bl0]"=w"(bl0) + , [bl2]"=w"(bl2) + + , [bs0]"=w"(bs0) + , [bs2]"=w"(bs2) + + :: "memory" + ); + + /* + acc0b = vmull_lane_s32( as1, bs3, 0); + acc0b = vmlal_lane_s32(acc0b, as2, bs2, 0); + acc0b = vmlal_lane_s32(acc0b, as3, bs1, 0); + acc0b = vmlal_lane_s32(acc0b, as0, bh0, 0); + + acc1b = vmull_lane_s32( as1, bs3, 1); + acc1b = vmlal_lane_s32(acc1b, as2, bs2, 1); + acc1b = vmlal_lane_s32(acc1b, as3, bs1, 1); + acc1b = vmlal_lane_s32(acc1b, as0, bh0, 1); + + acc0a = acc0b; + acc0a = vmlal_lane_s32(acc0a, ah1, bh3, 0); + acc0a = vmlal_lane_s32(acc0a, ah2, bh2, 0); + acc0a = vmlal_lane_s32(acc0a, ah3, bh1, 0); + acc0a = vmlal_lane_s32(acc0a, ah0, bl0, 0); + + acc1a = acc1b; + acc1a = vmlal_lane_s32(acc1a, ah1, bh3, 1); + acc1a = vmlal_lane_s32(acc1a, ah2, bh2, 1); + acc1a = vmlal_lane_s32(acc1a, ah3, bh1, 1); + acc1a = vmlal_lane_s32(acc1a, ah0, bl0, 1); + + acc0b = vmlsl_lane_s32(acc0b, al1, bl3, 0); + acc0b = vmlsl_lane_s32(acc0b, al2, bl2, 0); + acc0b = vmlsl_lane_s32(acc0b, al3, bl1, 0); + acc0b = vmlal_lane_s32(acc0b, al0, bs0, 0); + + acc1b = vmlsl_lane_s32(acc1b, al1, bl3, 1); + acc1b = vmlsl_lane_s32(acc1b, al2, bl2, 1); + acc1b = vmlsl_lane_s32(acc1b, al3, bl1, 1); + acc1b = vmlal_lane_s32(acc1b, al0, bs0, 1); + + xx_vtrnq_s64(&acc0b, &acc0a); + xx_vtrnq_s64(&acc1b, &acc1a); + + acc0a += acc1b; + vc[0] = vmovn_s64(acc0b) & vmask; + + acc0a = vsraq_n_s64(acc0a,acc0b,28); + vc[1] = vmovn_s64(acc0a) & vmask; + bs1 = bl1 - bh1; + carry = vsraq_n_s64(acc1a,acc0a,28); + + + acc0b = vmull_lane_s32( as2, bs3, 0); + acc0b = vmlal_lane_s32(acc0b, as3, bs2, 0); + acc0b = vmlal_lane_s32(acc0b, as0, bh1, 0); + acc0b = vmlal_lane_s32(acc0b, as1, bh0, 0); + + acc1b = vmull_lane_s32( as2, bs3, 1); + acc1b = vmlal_lane_s32(acc1b, as3, bs2, 1); + acc1b = vmlal_lane_s32(acc1b, as0, bh1, 1); + acc1b = vmlal_lane_s32(acc1b, as1, bh0, 1); + + //acc0a = acc0b; + acc0a = vcombine_s64(vget_low_s64(acc0b) + vget_high_s64(carry), vget_high_s64(acc0b)); + acc0b = vcombine_s64(vget_low_s64(acc0b) + vget_low_s64(carry), vget_high_s64(acc0b)); + acc0a = vmlal_lane_s32(acc0a, ah2, bh3, 0); + acc0a = vmlal_lane_s32(acc0a, ah3, bh2, 0); + acc0a = vmlal_lane_s32(acc0a, ah0, bl1, 0); + acc0a = vmlal_lane_s32(acc0a, ah1, bl0, 0); + + acc1a = acc1b; + acc1a = vmlal_lane_s32(acc1a, ah2, bh3, 1); + acc1a = vmlal_lane_s32(acc1a, ah3, bh2, 1); + acc1a = vmlal_lane_s32(acc1a, ah0, bl1, 1); + acc1a = vmlal_lane_s32(acc1a, ah1, bl0, 1); + + acc0b = vmlsl_lane_s32(acc0b, al2, bl3, 0); + acc0b = vmlsl_lane_s32(acc0b, al3, bl2, 0); + acc0b = vmlal_lane_s32(acc0b, al0, bs1, 0); + acc0b = vmlal_lane_s32(acc0b, al1, bs0, 0); + + acc1b = 
vmlsl_lane_s32(acc1b, al2, bl3, 1); + acc1b = vmlsl_lane_s32(acc1b, al3, bl2, 1); + acc1b = vmlal_lane_s32(acc1b, al0, bs1, 1); + acc1b = vmlal_lane_s32(acc1b, al1, bs0, 1); + + xx_vtrnq_s64(&acc0b, &acc0a); + xx_vtrnq_s64(&acc1b, &acc1a); + //acc0b += carry; + + acc0a += acc1b; + acc0a = vsraq_n_s64(acc0a,acc0b,28); + + vc[2] = vmovn_s64(acc0b) & vmask; + vc[3] = vmovn_s64(acc0a) & vmask; + carry = vsraq_n_s64(acc1a,acc0a,28); + + bs2 = bl2 - bh2; + + acc0b = vmull_lane_s32( as0, bh2, 0); + acc0b = vmlal_lane_s32(acc0b, as1, bh1, 0); + acc0b = vmlal_lane_s32(acc0b, as2, bh0, 0); + acc0b = vmlal_lane_s32(acc0b, as3, bs3, 0); + + acc1b = vmull_lane_s32( as0, bh2, 1); + acc1b = vmlal_lane_s32(acc1b, as1, bh1, 1); + acc1b = vmlal_lane_s32(acc1b, as2, bh0, 1); + acc1b = vmlal_lane_s32(acc1b, as3, bs3, 1); + + //acc0a = acc0b; + acc0a = vcombine_s64(vget_low_s64(acc0b) + vget_high_s64(acc1a), vget_high_s64(acc0b)); + acc0b = vcombine_s64(vget_low_s64(acc0b) + vget_low_s64(acc1a), vget_high_s64(acc0b)); + acc0a = vmlal_lane_s32(acc0a, ah3, bh3, 0); + acc0a = vmlal_lane_s32(acc0a, ah0, bl2, 0); + acc0a = vmlal_lane_s32(acc0a, ah1, bl1, 0); + acc0a = vmlal_lane_s32(acc0a, ah2, bl0, 0); + + acc1a = acc1b; + acc1a = vmlal_lane_s32(acc1a, ah3, bh3, 1); + acc1a = vmlal_lane_s32(acc1a, ah0, bl2, 1); + acc1a = vmlal_lane_s32(acc1a, ah1, bl1, 1); + acc1a = vmlal_lane_s32(acc1a, ah2, bl0, 1); + + acc0b = vmlsl_lane_s32(acc0b, al3, bl3, 0); + acc0b = vmlal_lane_s32(acc0b, al0, bs2, 0); + acc0b = vmlal_lane_s32(acc0b, al1, bs1, 0); + acc0b = vmlal_lane_s32(acc0b, al2, bs0, 0); + + acc1b = vmlsl_lane_s32(acc1b, al3, bl3, 1); + acc1b = vmlal_lane_s32(acc1b, al0, bs2, 1); + acc1b = vmlal_lane_s32(acc1b, al1, bs1, 1); + acc1b = vmlal_lane_s32(acc1b, al2, bs0, 1); + + + __asm__ __volatile__ ("vswp %f0, %e1" : "+w"(acc0b), "+w"(acc0a)); + __asm__ __volatile__ ("vswp %f0, %e1" : "+w"(acc1b), "+w"(acc1a)); + //xx_vtrnq_s64_(acc0b, acc0a); + //xx_vtrnq_s64_(acc1b, acc1a); + + //acc0b += acc1a; + acc0a += acc1b; + acc0a = vsraq_n_s64(acc0a,acc0b,28); + + + vc[4] = vmovn_s64(acc0b) & vmask; + vc[5] = vmovn_s64(acc0a) & vmask; + + bs3 = bl3 - bh3; + acc1a = vsraq_n_s64(acc1a,acc0a,28); + + + acc0b = vmull_lane_s32( as0, bh3, 0); + acc0b = vmlal_lane_s32(acc0b, as1, bh2, 0); + acc0b = vmlal_lane_s32(acc0b, as2, bh1, 0); + acc0b = vmlal_lane_s32(acc0b, as3, bh0, 0); + + acc1b = vmull_lane_s32( as0, bh3, 1); + acc1b = vmlal_lane_s32(acc1b, as1, bh2, 1); + acc1b = vmlal_lane_s32(acc1b, as2, bh1, 1); + acc1b = vmlal_lane_s32(acc1b, as3, bh0, 1); + + //acc0a = acc0b; + acc0a = vcombine_s64(vget_low_s64(acc0b) + vget_high_s64(acc1a), vget_high_s64(acc0b)); + acc0b = vcombine_s64(vget_low_s64(acc0b) + vget_low_s64(acc1a), vget_high_s64(acc0b)); + acc0a = vmlal_lane_s32(acc0a, ah0, bl3, 0); + acc0a = vmlal_lane_s32(acc0a, ah1, bl2, 0); + acc0a = vmlal_lane_s32(acc0a, ah2, bl1, 0); + acc0a = vmlal_lane_s32(acc0a, ah3, bl0, 0); + + acc1a = acc1b; + acc1a = vmlal_lane_s32(acc1a, ah0, bl3, 1); + acc1a = vmlal_lane_s32(acc1a, ah1, bl2, 1); + acc1a = vmlal_lane_s32(acc1a, ah2, bl1, 1); + acc1a = vmlal_lane_s32(acc1a, ah3, bl0, 1); + + acc0b = vmlal_lane_s32(acc0b, al0, bs3, 0); + acc0b = vmlal_lane_s32(acc0b, al1, bs2, 0); + acc0b = vmlal_lane_s32(acc0b, al2, bs1, 0); + acc0b = vmlal_lane_s32(acc0b, al3, bs0, 0); + + acc1b = vmlal_lane_s32(acc1b, al0, bs3, 1); + acc1b = vmlal_lane_s32(acc1b, al1, bs2, 1); + acc1b = vmlal_lane_s32(acc1b, al2, bs1, 1); + acc1b = vmlal_lane_s32(acc1b, al3, bs0, 1); + + __asm__ __volatile__ ("vswp %f0, 
%e1" : "+w"(acc0b), "+w"(acc0a)); + __asm__ __volatile__ ("vswp %f0, %e1" : "+w"(acc1b), "+w"(acc1a)); + //xx_vtrnq_s64_(acc0b, acc0a); + //xx_vtrnq_s64_(acc1b, acc1a); + //acc0b += acc1a; + acc0a += acc1b; + + acc0a = vsraq_n_s64(acc0a,acc0b,28); + + vc[6] = vmovn_s64(acc0b) & vmask; + vc[7] = vmovn_s64(acc0a) & vmask; + + acc1a = vsraq_n_s64(acc1a,acc0a,28); + + acc1a = xx_vaddup_s64(vrev128_s64(acc1a)); + + acc1a = vaddw_s32(acc1a, vc[0]); + vc[0] = vmovn_s64(acc1a) & vmask; + + acc1a = vshrq_n_s64(acc1a,28); + acc1a = vaddw_s32(acc1a, vc[1]); + vc[1] = vmovn_s64(acc1a) & vmask; + + acc1a = vshrq_n_s64(acc1a,28); + vc[2] += vmovn_s64(acc1a);; + */ +} + +void +p448_sqr ( + p448_t *__restrict__ cs, + const p448_t *bs +) { + const p448_t *as = bs; + register int32x4_t as0 __asm__("q6"); + register int32x4_t as2 __asm__("q7"); + + register int32x4_t bl0 __asm__("q0"); + register int32x4_t bh0 __asm__("q1"); + register int32x4_t bs0 __asm__("q2"); + register int32x4_t bl2 __asm__("q3"); + register int32x4_t bh2 __asm__("q4"); + register int32x4_t bs2 __asm__("q5"); + + int32x2_t *vc = (int32x2_t*) cs->limb, *vcasm = vc; + + register int64x2_t acc0a __asm__("q12"); + register int64x2_t acc0b __asm__("q13"); + register int64x2_t acc1a __asm__("q14"); + register int64x2_t acc1b __asm__("q15"); + + __asm__ __volatile__ ( + "vld2.32 {%e[bl0],%f[bl0],%e[bh0],%f[bh0]}, [%[b],:64]!" "\n\t" + "vadd.i32 %f[bs0], %f[bl0], %f[bh0]" "\n\t" + "vsub.i32 %e[bs0], %e[bl0], %e[bh0]" "\n\t" + "vadd.i32 %[as0], %[bl0], %[bh0]" "\n\t" + + "vld2.32 {%e[bl2],%f[bl2],%e[bh2],%f[bh2]}, [%[b],:64]!" "\n\t" + "vadd.i32 %[bs2], %[bl2], %[bh2]" "\n\t" + "vmov %[as2], %[bs2]" "\n\t" + + "vqdmull.s32 %[a0b], %f[as0], %f[bs2][0]" "\n\t" + "vmlal.s32 %[a0b], %e[as2], %e[bs2][0]" "\n\t" + "vmlal.s32 %[a0b], %e[as0], %e[bh0][0]" "\n\t" + + "vqdmull.s32 %[a1b], %f[as0], %f[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[as2], %e[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[as0], %e[bh0][1]" "\n\t" + + "vmov %[a0a], %[a0b]" "\n\t" + "vqdmlal.s32 %[a0a], %f[bh0], %f[bh2][0]" "\n\t" + "vmlal.s32 %[a0a], %e[bh2], %e[bh2][0]" "\n\t" + "vmlal.s32 %[a0a], %e[bh0], %e[bl0][0]" "\n\t" + + "vqdmlsl.s32 %[a0b], %f[bl0], %f[bl2][0]" "\n\t" + "vmlsl.s32 %[a0b], %e[bl2], %e[bl2][0]" "\n\t" + "vmlal.s32 %[a0b], %e[bl0], %e[bs0][0]" "\n\t" + + "vmov %[a1a], %[a1b]" "\n\t" + "vqdmlal.s32 %[a1a], %f[bh0], %f[bh2][1]" "\n\t" + "vmlal.s32 %[a1a], %e[bh2], %e[bh2][1]" "\n\t" + "vmlal.s32 %[a1a], %e[bh0], %e[bl0][1]" "\n\t" + + "vswp %f[a0b], %e[a0a]" "\n\t" + + "vqdmlsl.s32 %[a1b], %f[bl0], %f[bl2][1]" "\n\t" + "vmlsl.s32 %[a1b], %e[bl2], %e[bl2][1]" "\n\t" + "vmlal.s32 %[a1b], %e[bl0], %e[bs0][1]" "\n\t" + + "vsra.s64 %[a0a], %[a0b], #28" "\n\t" + "vsub.i32 %f[bs0], %f[bl0], %f[bh0]" "\n\t" + "vmovn.i64 %e[a0b], %[a0b]" "\n\t" + + "vswp %f[a1b], %e[a1a]" "\n\t" + "vadd.i64 %[a1b], %[a0a], %[a1b]" "\n\t" + + + "vqdmull.s32 %[a0a], %e[as2], %f[bs2][0]" "\n\t" + "vmovn.i64 %f[a0b], %[a1b]" "\n\t" + "vsra.s64 %[a1a], %[a1b], #28" "\n\t" + "vqdmlal.s32 %[a0a], %e[as0], %f[bh0][0]" "\n\t" + "vbic.i32 %[a0b], #0xf0000000" "\n\t" + "vstmia %[c]!, {%e[a0b], %f[a0b]}" "\n\t" + + "vqdmull.s32 %[a1b], %e[as2], %f[bs2][1]" "\n\t" + "vqdmlal.s32 %[a1b], %e[as0], %f[bh0][1]" "\n\t" + + "vmov %f[a0b], %f[a0a]" "\n\t" + "vadd.i64 %e[a0b], %e[a0a], %e[a1a]" "\n\t" + "vadd.i64 %e[a0a], %e[a0a], %f[a1a]" "\n\t" + "vqdmlal.s32 %[a0a], %e[bh2], %f[bh2][0]" "\n\t" + "vqdmlal.s32 %[a0a], %e[bh0], %f[bl0][0]" "\n\t" + + "vqdmlsl.s32 %[a0b], %e[bl2], %f[bl2][0]" "\n\t" + "vqdmlal.s32 
%[a0b], %e[bl0], %f[bs0][0]" "\n\t" + + "vmov %[a1a], %[a1b]" "\n\t" + "vqdmlal.s32 %[a1a], %e[bh2], %f[bh2][1]" "\n\t" + "vqdmlal.s32 %[a1a], %e[bh0], %f[bl0][1]" "\n\t" + + "vswp %f[a0b], %e[a0a]" "\n\t" + + "vqdmlsl.s32 %[a1b], %e[bl2], %f[bl2][1]" "\n\t" + "vqdmlal.s32 %[a1b], %e[bl0], %f[bs0][1]" "\n\t" + + "vsra.s64 %[a0a], %[a0b], #28" "\n\t" + "vsub.i32 %e[bs2], %e[bl2], %e[bh2]" "\n\t" + "vmovn.i64 %e[a0b], %[a0b]" "\n\t" + + "vswp %f[a1b], %e[a1a]" "\n\t" + "vadd.i64 %[a1b], %[a0a], %[a1b]" "\n\t" + + "vmull.s32 %[a0a], %f[as2], %f[bs2][0]" "\n\t" + "vmovn.i64 %f[a0b], %[a1b]" "\n\t" + "vqdmlal.s32 %[a0a], %e[as0], %e[bh2][0]" "\n\t" + "vsra.s64 %[a1a], %[a1b], #28" "\n\t" + "vmlal.s32 %[a0a], %f[as0], %f[bh0][0]" "\n\t" + "vbic.i32 %[a0b], #0xf0000000" "\n\t" + "vstmia %[c]!, {%e[a0b], %f[a0b]}" "\n\t" + + "vmull.s32 %[a1b], %f[as2], %f[bs2][1]" "\n\t" + "vqdmlal.s32 %[a1b], %e[as0], %e[bh2][1]" "\n\t" + "vmlal.s32 %[a1b], %f[as0], %f[bh0][1]" "\n\t" + + "vmov %f[a0b], %f[a0a]" "\n\t" + "vadd.i64 %e[a0b], %e[a0a], %e[a1a]" "\n\t" + "vadd.i64 %e[a0a], %e[a0a], %f[a1a]" "\n\t" + "vmlal.s32 %[a0a], %f[bh2], %f[bh2][0]" "\n\t" + "vqdmlal.s32 %[a0a], %e[bh0], %e[bl2][0]" "\n\t" + "vmlal.s32 %[a0a], %f[bh0], %f[bl0][0]" "\n\t" + + "vmlsl.s32 %[a0b], %f[bl2], %f[bl2][0]" "\n\t" + "vqdmlal.s32 %[a0b], %e[bl0], %e[bs2][0]" "\n\t" + "vmlal.s32 %[a0b], %f[bl0], %f[bs0][0]" "\n\t" + + "vmov %[a1a], %[a1b]" "\n\t" + "vmlal.s32 %[a1a], %f[bh2], %f[bh2][1]" "\n\t" + "vqdmlal.s32 %[a1a], %e[bh0], %e[bl2][1]" "\n\t" + "vmlal.s32 %[a1a], %f[bh0], %f[bl0][1]" "\n\t" + + "vswp %f[a0b], %e[a0a]" "\n\t" + + "vmlsl.s32 %[a1b], %f[bl2], %f[bl2][1]" "\n\t" + "vqdmlal.s32 %[a1b], %e[bl0], %e[bs2][1]" "\n\t" + "vmlal.s32 %[a1b], %f[bl0], %f[bs0][1]" "\n\t" + + "vsub.i32 %f[bs2], %f[bl2], %f[bh2]" "\n\t" + "vsra.s64 %[a0a], %[a0b], #28" "\n\t" + "vmovn.i64 %e[a0b], %[a0b]" "\n\t" + + "vswp %f[a1b], %e[a1a]" "\n\t" + "vadd.i64 %[a1b], %[a0a], %[a1b]" "\n\t" + + "vqdmull.s32 %[a0a], %e[as0], %f[bh2][0]" "\n\t" + "vmovn.i64 %f[a0b], %[a1b]" "\n\t" + "vsra.s64 %[a1a], %[a1b], #28" "\n\t" + "vqdmlal.s32 %[a0a], %e[as2], %f[bh0][0]" "\n\t" + "vbic.i32 %[a0b], #0xf0000000" "\n\t" + "vstmia %[c]!, {%e[a0b], %f[a0b]}" "\n\t" + + "vqdmull.s32 %[a1b], %e[as0], %f[bh2][1]" "\n\t" + "vqdmlal.s32 %[a1b], %e[as2], %f[bh0][1]" "\n\t" + + "vmov %f[a0b], %f[a0a]" "\n\t" + "vadd.i64 %e[a0b], %e[a0a], %e[a1a]" "\n\t" + "vadd.i64 %e[a0a], %e[a0a], %f[a1a]" "\n\t" + "vqdmlal.s32 %[a0a], %e[bh0], %f[bl2][0]" "\n\t" + "vqdmlal.s32 %[a0a], %e[bh2], %f[bl0][0]" "\n\t" + + "vqdmlal.s32 %[a0b], %e[bl0], %f[bs2][0]" "\n\t" + "vqdmlal.s32 %[a0b], %e[bl2], %f[bs0][0]" "\n\t" + + "vmov %[a1a], %[a1b]" "\n\t" + "vqdmlal.s32 %[a1a], %e[bh0], %f[bl2][1]" "\n\t" + "vqdmlal.s32 %[a1a], %e[bh2], %f[bl0][1]" "\n\t" + + "vswp %f[a0b], %e[a0a]" "\n\t" + + "vqdmlal.s32 %[a1b], %e[bl0], %f[bs2][1]" "\n\t" + "vqdmlal.s32 %[a1b], %e[bl2], %f[bs0][1]" "\n\t" + + "vsra.s64 %[a0a], %[a0b], #28" "\n\t" + "vmovn.i64 %e[a0b], %[a0b]" "\n\t" + + "vswp %f[a1b], %e[a1a]" "\n\t" + "vadd.i64 %[a0a], %[a0a], %[a1b]" "\n\t" + + "vmovn.i64 %f[a0b], %[a0a]" "\n\t" + "vsra.s64 %[a1a], %[a0a], #28" "\n\t" + + "vbic.i32 %[a0b], #0xf0000000" "\n\t" + + "vswp %e[a1a], %f[a1a]" "\n\t" + + "vstmia %[c]!, {%e[a0b], %f[a0b]}" "\n\t" + "sub %[c], #64" "\n\t" + + "vadd.i64 %f[a1a], %f[a1a], %e[a1a]" "\n\t" + + "vldmia %[c], {%e[a0a], %f[a0a], %e[a0b]}" "\n\t" + "vaddw.s32 %[a1a], %e[a0a]" "\n\t" + "vmovn.i64 %e[a0a], %[a1a]" "\n\t" + "vshr.s64 %[a1a], #28" "\n\t" + + 
"vaddw.s32 %[a1a], %f[a0a]" "\n\t" + "vmovn.i64 %f[a0a], %[a1a]" "\n\t" + "vshr.s64 %[a1a], #28" "\n\t" + + "vbic.i32 %[a0a], #0xf0000000" "\n\t" + + "vaddw.s32 %[a1a], %e[a0b]" "\n\t" + "vmovn.i64 %e[a0b], %[a1a]" "\n\t" + + "vstmia %[c], {%e[a0a], %f[a0a], %e[a0b]}" "\n\t" + + : [a0a]"=w"(acc0a) + , [a0b]"=w"(acc0b) + , [a1a]"=w"(acc1a) + , [a1b]"=w"(acc1b) + , [a]"+r"(as) + , [b]"+r"(bs) + , [c]"+r"(vcasm) + + , [as0]"=w"(as0) + , [as2]"=w"(as2) + + , [bh0]"=w"(bh0) + , [bh2]"=w"(bh2) + + , [bl0]"=w"(bl0) + , [bl2]"=w"(bl2) + + , [bs0]"=w"(bs0) + , [bs2]"=w"(bs2) + + :: "memory" + ); + + + /* + const int32x2x2_t b0 = vld2_s32((const int32_t *) &bs->limb[0]); + const int32x2x2_t b1 = vld2_s32((const int32_t *) &bs->limb[4]); + const int32x2x2_t b2 = vld2_s32((const int32_t *) &bs->limb[8]); + const int32x2x2_t b3 = vld2_s32((const int32_t *) &bs->limb[12]); + const int32x2_t vbl[4] = { b0.val[0], b1.val[0], b2.val[0], b3.val[0] }; + const int32x2_t vbh[4] = { b0.val[1], b1.val[1], b2.val[1], b3.val[1] }; + int32x2_t vbm[4]; + + int i; + for (i=0; i<4; i++) { + vbm[i] = vbl[i] - vbh[i]; + } + + int32x2_t *vc = (int32x2_t*) cs->limb; + */ + + /* FUTURE possible improvements: + * don't use nega-phi algorithm, so as to avoid extra phi-twiddle at end + * or use phi/nega-phi for everything, montgomery style + * or find some sort of phi algorithm which doesn't have this problem + * break up lanemuls so that only diags get 1mul'd instead of diag 2x2 blocks + * + * These improvements are all pretty minor, but I guess together they might matter? + */ + + + + /* + int32x2_t vmask = {(1<<28) - 1, (1<<28)-1}; + + int64x2_t acc0a, acc0b; + int64x2_t acc1a, acc1b; + int64x2_t acc2a, acc2b; + int64x2_t acc3a, acc3b; + int64x2_t acc4a, acc4b; + int64x2_t acc5a, acc5b; + int64x2_t acc6a, acc6b; + int64x2_t acc7a, acc7b; + int64x2_t carry; + + acc0a = vqdmull_lane_s32( vbh[1], vbh[3], 0); + acc1a = vqdmull_lane_s32( vbh[1], vbh[3], 1); + acc2a = vqdmull_lane_s32( vbh[2], vbh[3], 0); + acc3a = vqdmull_lane_s32( vbh[2], vbh[3], 1); + acc0a = vmlal_lane_s32(acc0a, vbh[2], vbh[2], 0); + acc1a = vmlal_lane_s32(acc1a, vbh[2], vbh[2], 1); + acc2b = acc2a; + acc3b = acc3a; + acc2b = vqdmlal_lane_s32(acc2b, vbh[0], vbh[1], 0); + acc3b = vqdmlal_lane_s32(acc3b, vbh[0], vbh[1], 1); + acc0b = acc0a; + acc1b = acc1a; + acc0b = vmlal_lane_s32(acc0b, vbh[0], vbh[0], 0); + acc1b = vmlal_lane_s32(acc1b, vbh[0], vbh[0], 1); + acc0b = vqdmlal_lane_s32(acc0b, vbl[1], vbl[3], 0); + acc1b = vqdmlal_lane_s32(acc1b, vbl[1], vbl[3], 1); + acc2b = vqdmlal_lane_s32(acc2b, vbl[2], vbl[3], 0); + acc3b = vqdmlal_lane_s32(acc3b, vbl[2], vbl[3], 1); + acc0b = vmlal_lane_s32(acc0b, vbl[2], vbl[2], 0); + acc1b = vmlal_lane_s32(acc1b, vbl[2], vbl[2], 1); + acc2a += acc2b; + acc3a += acc3b; + acc2a = vqdmlal_lane_s32(acc2a, vbl[0], vbl[1], 0); + acc3a = vqdmlal_lane_s32(acc3a, vbl[0], vbl[1], 1); + acc0a += acc0b; + acc1a += acc1b; + acc0a = vmlal_lane_s32(acc0a, vbl[0], vbl[0], 0); + acc1a = vmlal_lane_s32(acc1a, vbl[0], vbl[0], 1); + acc0a = vqdmlsl_lane_s32(acc0a, vbm[1], vbm[3], 0); + acc1a = vqdmlsl_lane_s32(acc1a, vbm[1], vbm[3], 1); + acc0a = vmlsl_lane_s32(acc0a, vbm[2], vbm[2], 0); + acc1a = vmlsl_lane_s32(acc1a, vbm[2], vbm[2], 1); + acc2a = vqdmlsl_lane_s32(acc2a, vbm[2], vbm[3], 0); + acc3a = vqdmlsl_lane_s32(acc3a, vbm[2], vbm[3], 1); + acc0b += acc0a; + acc1b += acc1a; + acc0b = vmlsl_lane_s32(acc0b, vbm[0], vbm[0], 0); + acc1b = vmlsl_lane_s32(acc1b, vbm[0], vbm[0], 1); + acc2b += acc2a; + acc3b += acc3a; + acc2b = 
vqdmlsl_lane_s32(acc2b, vbm[0], vbm[1], 0); + acc3b = vqdmlsl_lane_s32(acc3b, vbm[0], vbm[1], 1); + + xx_vtrnq_s64(&acc0a, &acc0b); + xx_vtrnq_s64(&acc1a, &acc1b); + xx_vtrnq_s64(&acc2a, &acc2b); + xx_vtrnq_s64(&acc3a, &acc3b); + + acc0b += acc1a; + acc0b = vsraq_n_s64(acc0b,acc0a,28); + acc1b = vsraq_n_s64(acc1b,acc0b,28); + acc2a += acc1b; + acc2b += acc3a; + acc2b = vsraq_n_s64(acc2b,acc2a,28); + acc3b = vsraq_n_s64(acc3b,acc2b,28); + + vc[0] = (vmovn_s64(acc0a)) & vmask; + vc[1] = (vmovn_s64(acc0b)) & vmask; + + vc[2] = (vmovn_s64(acc2a)) & vmask; + vc[3] = (vmovn_s64(acc2b)) & vmask; + carry = acc3b; + + acc4a = vmull_lane_s32( vbh[3], vbh[3], 0); + acc5a = vmull_lane_s32( vbh[3], vbh[3], 1); + acc6b = vqdmull_lane_s32( vbh[0], vbh[3], 0); + acc7b = vqdmull_lane_s32( vbh[0], vbh[3], 1); + acc4b = acc4a; + acc5b = acc5a; + acc4b = vqdmlal_lane_s32(acc4b, vbh[0], vbh[2], 0); + acc5b = vqdmlal_lane_s32(acc5b, vbh[0], vbh[2], 1); + acc6b = vqdmlal_lane_s32(acc6b, vbh[1], vbh[2], 0); + acc7b = vqdmlal_lane_s32(acc7b, vbh[1], vbh[2], 1); + acc4b = vmlal_lane_s32(acc4b, vbh[1], vbh[1], 0); + acc5b = vmlal_lane_s32(acc5b, vbh[1], vbh[1], 1); + acc4b = vmlal_lane_s32(acc4b, vbl[3], vbl[3], 0); + acc5b = vmlal_lane_s32(acc5b, vbl[3], vbl[3], 1); + acc6a = acc6b; + acc7a = acc7b; + acc6a = vqdmlal_lane_s32(acc6a, vbl[0], vbl[3], 0); + acc7a = vqdmlal_lane_s32(acc7a, vbl[0], vbl[3], 1); + acc4a += acc4b; + acc5a += acc5b; + acc4a = vqdmlal_lane_s32(acc4a, vbl[0], vbl[2], 0); + acc5a = vqdmlal_lane_s32(acc5a, vbl[0], vbl[2], 1); + acc6a = vqdmlal_lane_s32(acc6a, vbl[1], vbl[2], 0); + acc7a = vqdmlal_lane_s32(acc7a, vbl[1], vbl[2], 1); + acc4a = vmlal_lane_s32(acc4a, vbl[1], vbl[1], 0); + acc5a = vmlal_lane_s32(acc5a, vbl[1], vbl[1], 1); + acc4a = vmlsl_lane_s32(acc4a, vbm[3], vbm[3], 0); + acc5a = vmlsl_lane_s32(acc5a, vbm[3], vbm[3], 1); + acc6b += acc6a; + acc7b += acc7a; + acc6b = vqdmlsl_lane_s32(acc6b, vbm[0], vbm[3], 0); + acc7b = vqdmlsl_lane_s32(acc7b, vbm[0], vbm[3], 1); + acc4b += acc4a; + acc5b += acc5a; + acc4b = vqdmlsl_lane_s32(acc4b, vbm[0], vbm[2], 0); + acc5b = vqdmlsl_lane_s32(acc5b, vbm[0], vbm[2], 1); + acc4b = vmlsl_lane_s32(acc4b, vbm[1], vbm[1], 0); + acc5b = vmlsl_lane_s32(acc5b, vbm[1], vbm[1], 1); + acc6b = vqdmlsl_lane_s32(acc6b, vbm[1], vbm[2], 0); + acc7b = vqdmlsl_lane_s32(acc7b, vbm[1], vbm[2], 1); + + xx_vtrnq_s64(&acc4a, &acc4b); + xx_vtrnq_s64(&acc5a, &acc5b); + xx_vtrnq_s64(&acc6a, &acc6b); + xx_vtrnq_s64(&acc7a, &acc7b); + + acc4a += carry; + acc4b += acc5a; + acc4b = vsraq_n_s64(acc4b,acc4a,28); + acc5b = vsraq_n_s64(acc5b,acc4b,28); + acc6a += acc5b; + acc6b += acc7a; + + + vc[4] = (vmovn_s64(acc4a)) & vmask; + vc[5] = (vmovn_s64(acc4b)) & vmask; + + acc6b = vsraq_n_s64(acc6b,acc6a,28); + acc7b = vsraq_n_s64(acc7b,acc6b,28); + + vc[6] = (vmovn_s64(acc6a)) & vmask; + vc[7] = (vmovn_s64(acc6b)) & vmask; + + acc7a = xx_vaddup_s64(vrev128_s64(acc7b)); + + int32x2_t t0 = vc[0], t1 = vc[1]; + + acc7a = vaddw_s32(acc7a, t0); + t0 = vmovn_s64(acc7a) & vmask; + acc7a = vshrq_n_s64(acc7a,28); + acc7a = vaddw_s32(acc7a, t1); + t1 = vmovn_s64(acc7a) & vmask; + vc[0] = t0; + vc[1] = t1; + acc7a = vshrq_n_s64(acc7a,28); + + vc[2] += vmovn_s64(acc7a); + */ +} + +void +p448_mulw ( + p448_t *__restrict__ cs, + const p448_t *as, + uint64_t b +) { + uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1}; + + uint64x2_t accum; + const uint32x2_t *va = (const uint32x2_t *) as->limb; + uint32x2_t *vo = (uint32x2_t *) cs->limb; + uint32x2_t vc, vn; + uint32x2_t vb = {b & ((1<<28)-1), b>>28}; + 
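+    /* Descriptive note (added): multiply by a 64-bit scalar. b is split into
+     * 28-bit halves, vb = { b & ((1<<28)-1), b >> 28 }. The top limb pair's
+     * product with the high half is rotated and folded back in first
+     * (vrev128/xx_vaddup), so each step below only accumulates
+     * va[i]*vb[0] + va[i-1]*vb[1] before masking to 28 bits and carrying. */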
+ accum = vmull_lane_u32(va[7], vb, 1); + accum = xx_vaddup_u64(vrev128_u64(accum)); + + vc = va[0]; + accum = vmlal_lane_u32(accum, vc, vb, 0); + vo[0] = vmovn_u64(accum) & vmask; + accum = vshrq_n_u64(accum,28); + + /* PERF: the right way to do this is to reduce behind, i.e. + * vmull + vmlal round 0 + * vmull + vmlal round 1 + * vmull + vmlal round 2 + * vsraq round 0, 1 + * vmull + vmlal round 3 + * vsraq round 1, 2 + * ... + */ + + int i; + for (i=1; i<8; i++) { + vn = va[i]; + accum = vmlal_lane_u32(accum, vc, vb, 1); + accum = vmlal_lane_u32(accum, vn, vb, 0); + vo[i] = vmovn_u64(accum) & vmask; + accum = vshrq_n_u64(accum,28); + vc = vn; + } + + accum = xx_vaddup_u64(vrev128_u64(accum)); + accum = vaddw_u32(accum, vo[0]); + vo[0] = vmovn_u64(accum) & vmask; + + accum = vshrq_n_u64(accum,28); + vo[1] += vmovn_u64(accum); +} + +/* TODO: vectorize? */ +void +p448_strong_reduce ( + p448_t *a +) { + word_t mask = (1ull<<28)-1; + + /* first, clear high */ + a->limb[1] += a->limb[15]>>28; + a->limb[0] += a->limb[15]>>28; + a->limb[15] &= mask; + + /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */ + + /* compute total_value - p. No need to reduce mod p. */ + + dsword_t scarry = 0; + int i; + for (i=0; i<16; i++) { + scarry = scarry + a->limb[LIMBPERM(i)] - ((i==8)?mask-1:mask); + a->limb[LIMBPERM(i)] = scarry & mask; + scarry >>= 28; + } + + /* uncommon case: it was >= p, so now scarry = 0 and this = x + * common case: it was < p, so now scarry = -1 and this = x - p + 2^448 + * so let's add back in p. will carry back off the top for 2^448. + */ + + assert(is_zero(scarry) | is_zero(scarry+1)); + + word_t scarry_mask = scarry & mask; + dword_t carry = 0; + + /* add it back */ + for (i=0; i<16; i++) { + carry = carry + a->limb[LIMBPERM(i)] + ((i==8)?(scarry_mask&~1):scarry_mask); + a->limb[LIMBPERM(i)] = carry & mask; + carry >>= 28; + } + + assert(is_zero(carry + scarry)); +} + +mask_t +p448_is_zero ( + const struct p448_t *a +) { + struct p448_t b; + p448_copy(&b,a); + p448_strong_reduce(&b); + + uint32_t any = 0; + int i; + for (i=0; i<16; i++) { + any |= b.limb[i]; + } + return is_zero(any); +} + +void +p448_serialize ( + uint8_t *serial, + const struct p448_t *x +) { + int i,j; + p448_t red; + p448_copy(&red, x); + p448_strong_reduce(&red); + + for (i=0; i<8; i++) { + uint64_t limb = red.limb[LIMBPERM(2*i)] + (((uint64_t)red.limb[LIMBPERM(2*i+1)])<<28); + for (j=0; j<7; j++) { + serial[7*i+j] = limb; + limb >>= 8; + } + assert(limb == 0); + } +} + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +) { + int i,j; + for (i=0; i<8; i++) { + uint64_t out = 0; + for (j=0; j<7; j++) { + out |= ((uint64_t)serial[7*i+j])<<(8*j); + } + x->limb[LIMBPERM(2*i)] = out & ((1ull<<28)-1); + x->limb[LIMBPERM(2*i+1)] = out >> 28; + } + + /* Check for reduction. + * + * The idea is to create a variable ge which is all ones (rather, 56 ones) + * if and only if the low $i$ words of $x$ are >= those of p. + * + * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) + */ + uint32_t ge = -1, mask = (1ull<<28)-1; + for (i=0; i<8; i++) { + ge &= x->limb[LIMBPERM(i)]; + } + + /* At this point, ge = 1111 iff bottom are all 1111. 
Now propagate if 1110, or set if 1111 */ + ge = (ge & (x->limb[LIMBPERM(8)] + 1)) | is_zero(x->limb[LIMBPERM(8)] ^ mask); + + /* Propagate the rest */ + for (i=9; i<16; i++) { + ge &= x->limb[LIMBPERM(i)]; + } + + return ~is_zero(ge ^ mask); +} + +void +simultaneous_invert_p448( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +) { + if (n==0) { + return; + } else if (n==1) { + p448_inverse(out,in); + return; + } + + p448_copy(&out[1], &in[0]); + int i; + for (i=1; i<(int) (n-1); i++) { + p448_mul(&out[i+1], &out[i], &in[i]); + } + p448_mul(&out[0], &out[n-1], &in[n-1]); + + struct p448_t tmp; + p448_inverse(&tmp, &out[0]); + p448_copy(&out[0], &tmp); + + /* at this point, out[0] = product(in[i]) ^ -1 + * out[i] = product(in[0]..in[i-1]) if i != 0 + */ + for (i=n-1; i>0; i--) { + p448_mul(&tmp, &out[i], &out[0]); + p448_copy(&out[i], &tmp); + + p448_mul(&tmp, &out[0], &in[i]); + p448_copy(&out[0], &tmp); + } +} diff --git a/src/arch_neon_experimental/p448.h b/src/arch_neon_experimental/p448.h new file mode 100644 index 0000000..4f0be0a --- /dev/null +++ b/src/arch_neon_experimental/p448.h @@ -0,0 +1,376 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ +#ifndef __P448_H__ +#define __P448_H__ 1 + +#include "word.h" + +#include +#include + +typedef struct p448_t { + uint32_t limb[16]; +} __attribute__((aligned(32))) p448_t; + +#define LIMBPERM(x) (((x)<<1 | (x)>>3) & 15) +#define USE_NEON_PERM 1 + +#ifdef __cplusplus +extern "C" { +#endif + +static __inline__ void +p448_set_ui ( + p448_t *out, + uint64_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t do_swap +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_add ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_sub ( + p448_t *out, + const p448_t *a, + const p448_t *b +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_neg ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_cond_neg ( + p448_t *a, + mask_t doNegate +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_addw ( + p448_t *a, + uint32_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_subw ( + p448_t *a, + uint32_t x +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_copy ( + p448_t *out, + const p448_t *a +) __attribute__((unused,always_inline)); + +static __inline__ void +p448_weak_reduce ( + p448_t *inout +) __attribute__((unused,always_inline)); + +void +p448_strong_reduce ( + p448_t *inout +); + +mask_t +p448_is_zero ( + const p448_t *in +); + +static __inline__ void +p448_bias ( + p448_t *inout, + int amount +) __attribute__((unused,always_inline)); + +void +p448_mul ( + p448_t *__restrict__ out, + const p448_t *a, + const p448_t *b +); + +void +p448_mulw ( + p448_t *__restrict__ out, + const p448_t *a, + uint64_t b +); + +void +p448_sqr ( + p448_t *__restrict__ out, + const p448_t *a +); + +static __inline__ void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) __attribute__((unused,always_inline)); + +void +p448_serialize ( + uint8_t *serial, + const struct p448_t *x +); + +mask_t +p448_deserialize ( + p448_t *x, + const uint8_t serial[56] +); + +static __inline__ void +p448_mask( + struct p448_t *a, + const struct 
p448_t *b, + mask_t mask +) __attribute__((unused,always_inline)); + +/** +* Returns 1/x. +* +* If x=0, returns 0. +*/ +void +p448_inverse ( + struct p448_t* a, + const struct p448_t* x +); + +void +simultaneous_invert_p448 ( + struct p448_t *__restrict__ out, + const struct p448_t *in, + unsigned int n +); + +static inline mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) __attribute__((always_inline,unused)); + +/* -------------- Inline functions begin here -------------- */ + +void +p448_set_ui ( + p448_t *out, + uint64_t x +) { + int i; + for (i=0; i<16; i++) { + out->limb[i] = 0; + } + out->limb[0] = x & ((1<<28)-1); + out->limb[2] = x>>28; +} + +void +p448_cond_swap ( + p448_t *a, + p448_t *b, + mask_t doswap +) { + big_register_t *aa = (big_register_t*)a; + big_register_t *bb = (big_register_t*)b; + big_register_t m = br_set_to_mask(doswap); + + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = a->limb[i] - b->limb[i]; + } + */ +} + +void +p448_neg ( + p448_t *out, + const p448_t *a +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + out->limb[i] = -a->limb[i]; + } + */ +} + +void +p448_cond_neg( + p448_t *a, + mask_t doNegate +) { + unsigned int i; + struct p448_t negated; + big_register_t *aa = (big_register_t *)a; + big_register_t *nn = (big_register_t*)&negated; + big_register_t m = br_set_to_mask(doNegate); + + p448_neg(&negated, a); + p448_bias(&negated, 2); + + for (i=0; ilimb[0] += x; +} + +void +p448_subw ( + p448_t *a, + uint32_t x +) { + a->limb[0] -= x; +} + +void +p448_copy ( + p448_t *out, + const p448_t *a +) { + *out = *a; +} + +void +p448_bias ( + p448_t *a, + int amt +) { + uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt; + uint32x4_t lo = {co1,co2,co1,co1}, hi = {co1,co1,co1,co1}; + uint32x4_t *aa = (uint32x4_t*) a; + aa[0] += lo; + aa[1] += hi; + aa[2] += hi; + aa[3] += hi; +} + +void +p448_weak_reduce ( + p448_t *a +) { + + uint32x2_t *aa = (uint32x2_t*) a, vmask = {(1ull<<28)-1, (1ull<<28)-1}, vm2 = {0,-1}, + tmp = vshr_n_u32(aa[7],28); + + int i; + for (i=7; i>=1; i--) { + aa[i] = vsra_n_u32(aa[i] & vmask, aa[i-1], 28); + } + aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp&vm2); +} + +void +p448_sqrn ( + p448_t *__restrict__ y, + const p448_t *x, + int n +) { + p448_t tmp; + assert(n>0); + if (n&1) { + p448_sqr(y,x); + n--; + } else { + p448_sqr(&tmp,x); + p448_sqr(y,&tmp); + n-=2; + } + for (; n; n-=2) { + p448_sqr(&tmp,y); + p448_sqr(y,&tmp); + } +} + +mask_t +p448_eq ( + const struct p448_t *a, + const struct p448_t *b +) { + struct p448_t ra, rb; + p448_copy(&ra, a); + p448_copy(&rb, b); + p448_weak_reduce(&ra); + p448_weak_reduce(&rb); + p448_sub(&ra, &ra, &rb); + p448_bias(&ra, 2); + return p448_is_zero(&ra); +} + +void +p448_mask ( + struct p448_t *a, + const struct p448_t *b, + mask_t mask +) { + unsigned int i; + for (i=0; ilimb[0]); i++) { + a->limb[i] = b->limb[i] & mask; + } +} + +#ifdef __cplusplus +}; /* extern "C" */ +#endif + +#endif /* __P448_H__ */ diff --git a/src/crandom.c b/src/crandom.c index 4b75f66..b9c1eb0 100644 --- a/src/crandom.c +++ b/src/crandom.c @@ -466,7 +466,7 @@ crandom_generate( unsigned long long copy = (length > state->fill) ? 
state->fill : length; state->fill -= copy; memcpy(output, state->buffer + state->fill, copy); - memset(state->buffer + state->fill, 0, copy); + really_memset(state->buffer + state->fill, 0, copy); output += copy; length -= copy; } @@ -484,5 +484,5 @@ crandom_destroy( */ } - memset(state, 0, sizeof(*state)); + really_memset(state, 0, sizeof(*state)); } diff --git a/src/goldilocks.c b/src/goldilocks.c index 440c8bd..31e38d1 100644 --- a/src/goldilocks.c +++ b/src/goldilocks.c @@ -340,7 +340,7 @@ goldilocks_sign ( word_t skw[GOLDI_FIELD_WORDS]; mask_t succ = barrett_deserialize(skw,privkey->opaque,&curve_prime_order); if (!succ) { - memset(skw,0,sizeof(skw)); + really_memset(skw,0,sizeof(skw)); return GOLDI_ECORRUPT; } @@ -389,9 +389,9 @@ goldilocks_sign ( memcpy(signature_out, signature_tmp, GOLDI_FIELD_BYTES); barrett_serialize(signature_out+GOLDI_FIELD_BYTES, tk, GOLDI_FIELD_BYTES); - memset((unsigned char *)tk,0,sizeof(tk)); - memset((unsigned char *)skw,0,sizeof(skw)); - memset((unsigned char *)challenge,0,sizeof(challenge)); + really_memset((unsigned char *)tk,0,sizeof(tk)); + really_memset((unsigned char *)skw,0,sizeof(skw)); + really_memset((unsigned char *)challenge,0,sizeof(challenge)); /* response = 2(nonce_secret - sk*challenge) * Nonce = 8[nonce_secret]*G @@ -494,7 +494,7 @@ goldilocks_destroy_precomputed_public_key ( ) { if (!precom) return; destroy_fixed_base(&precom->table); - memset(&precom->pub.opaque, 0, sizeof(precom->pub)); + really_memset(&precom->pub.opaque, 0, sizeof(precom->pub)); free(precom); } diff --git a/src/include/word.h b/src/include/word.h index d48d20f..26123bc 100644 --- a/src/include/word.h +++ b/src/include/word.h @@ -146,11 +146,17 @@ typedef word_t vecmask_t __attribute__((vector_size(32))); } #endif -#if __AVX2__ || __SSE2__ +#if __AVX2__ static __inline__ big_register_t br_is_zero(big_register_t x) { return (big_register_t)(x == br_set_to_mask(0)); } +#elif __SSE2__ +static __inline__ big_register_t +br_is_zero(big_register_t x) { + return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128()); + //return (big_register_t)(x == br_set_to_mask(0)); +} #elif __ARM_NEON__ static __inline__ big_register_t br_is_zero(big_register_t x) { @@ -179,7 +185,25 @@ static inline uint64_t letoh64 (uint64_t x) { return x; } #endif - +/** + * Really call memset, in a way that prevents the compiler from optimizing it out. + * @param p The object to zeroize. + * @param c The char to set it to (probably zero). + * @param s The size of the object. 
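+ *
+ * Uses memset_s when __STDC_LIB_EXT1__ is available; otherwise the bytes are
+ * written through a volatile pointer so the compiler cannot elide the stores.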
+ */ +#ifdef __STDC_LIB_EXT1__ /* which it won't be, because we're -std=c99 */ +static __inline__ void +really_memset(void *p, char c, size_t s) { + memset_s(p,s,c,s); +} +#else +static __inline__ void __attribute__((always_inline,unused)) +really_memset(void *p, char c, size_t s) { + volatile char *pv = (volatile char *)p; + size_t i; + for (i=0; in, doNegate); } -static __inline__ void +#if (defined(__GNUC__) && !defined(__clang__) && defined(__x86_64__) && !defined(__AVX2__)) + /* This works around an apparent compiler bug in GCC, thanks Samuel Neves */ + static void __attribute__((optimize("O1"))) + #ifdef __OPTIMIZE_SIZE__ + #warning "There's a bug in here somewhere with GCC -Os on non-AVX2 platforms" + #endif +#else + static __inline__ void +#endif constant_time_lookup_tw_pniels ( struct tw_pniels_t *out, const struct tw_pniels_t *in, @@ -76,7 +84,7 @@ constant_time_lookup_tw_pniels ( int j; unsigned int k; - memset(out, 0, sizeof(*out)); + really_memset(out, 0, sizeof(*out)); for (j=0; jtable) { - memset(table->table,0,sizeof(*table->table)*(table->n<<(table->t-1))); + really_memset(table->table,0,sizeof(*table->table)*(table->n<<(table->t-1))); } if (table->own_table) { free(table->table); } - memset(table,0,sizeof(*table)); + really_memset(table,0,sizeof(*table)); } mask_t diff --git a/test/bench.c b/test/bench.c index b80be14..0cb96b6 100644 --- a/test/bench.c +++ b/test/bench.c @@ -108,33 +108,33 @@ int main(int argc, char **argv) { q448_randomize(&crand, sk); when = now(); - for (i=0; i #include +#ifndef LIMBPERM +#define LIMBPERM(x) (x) +#endif int failed_tests, n_tests, failed_this_test, running_a_test; @@ -87,7 +90,7 @@ void p448_print ( int j; printf("%s = 0x", descr); for (j=sizeof(*a)/sizeof(word_t)-1; j>=0; j--) { - printf(PRIxWORD58, b.limb[j]); + printf(PRIxWORD58, b.limb[LIMBPERM(j)]); } printf("\n"); } diff --git a/test/test_arithmetic.c b/test/test_arithmetic.c index 7fde48c..51a646c 100644 --- a/test/test_arithmetic.c +++ b/test/test_arithmetic.c @@ -170,7 +170,12 @@ int test_arithmetic () { int bits = sizeof(word_t) * 448 / sizeof(p448_t); for (j=0; j