Trying to work around an apparent GCC bug on SSE2, thanks Samuel Neves. Added an experimental NEON arch. It's fast. It's not yet GCC clean. It needs some more work on general cleanliness too.
master
@@ -1,3 +1,23 @@ | |||
August 4, 2014: | |||
Experiments and bug fixes. | |||
Add really_memset = memset_s (except not because I'm setting -std=c99), | |||
thanks David Leon Gil. I think I put it in the right places. | |||
Try to work around what I think is a compiler bug in GCC -O3 on non-AVX | |||
platforms. I can't seem to work around it as -Os, so I'm just flagging | |||
a warning (-Werror makes it an error) for now. Will take more | |||
investigation. Thanks Samuel Neves. | |||
Added an experimental (not ready yet!) ARM NEON implementation in | |||
arch_neon_experimental. This implementation seems to work, but needs | |||
more testing. It is currently asm-heavy and not GCC clean. I am | |||
planning to have a flag for it to use intrinsics instead of asm; | |||
currently the intrinsics are commented out. On clang this does ECDH | |||
in 1850kcy on my BeagleBone Black, comparable to Curve41417. Once this | |||
is ready, I will probably move it to arch_neon proper, since arch_neon | |||
isn't particularly tuned. | |||
July 11, 2014: | |||
This is mostly a cleanup release. | |||
@@ -22,7 +22,7 @@ endif | |||
WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ | |||
-Wmissing-declarations -Wunused-function $(EXWARN) | |||
-Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN) | |||
INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH) | |||
@@ -36,8 +36,8 @@ ARCHFLAGS += -mfpu=neon | |||
else | |||
ARCHFLAGS += -mfpu=vfpv3-d16 | |||
endif | |||
ARCHFLAGS += -mcpu=cortex-a9 # FIXME | |||
GENFLAGS = -DN_TESTS_BASE=1000 # sooooo sloooooow | |||
ARCHFLAGS += -mcpu=cortex-a8 # FIXME | |||
GENFLAGS += -DN_TESTS_BASE=1000 # sooooo sloooooow | |||
else | |||
ARCHFLAGS += -maes -mavx2 -mbmi2 #TODO | |||
endif | |||
@@ -13,7 +13,7 @@ game protection system out of Stanford, and are (c) 2011 Stanford | |||
University. All of these files are usable under the MIT license contained in | |||
LICENSE.txt. | |||
The Makefile is set for my 2013 MacBook Air. You can `make runbench` to run | |||
The Makefile is set for my 2013 MacBook Air. You can `make bench` to run | |||
a completely arbitrary set of benchmarks and tests, or `make | |||
build/goldilocks.so` to build a stripped-down version of the library. For | |||
non-Haswell platforms, you need to replace -mavx2 -mbmi2 by an appropriate | |||
@@ -39,7 +39,7 @@ xx_vaddup_s64(int64x2_t x) { | |||
#include "neon_emulation.h" | |||
#endif /* ARM_NEON */ | |||
static inline void __attribute__((gnu_inline,always_inline)) | |||
static inline void __attribute__((gnu_inline,always_inline,unused)) | |||
smlal ( | |||
uint64_t *acc, | |||
const uint32_t a, | |||
@@ -48,7 +48,7 @@ smlal ( | |||
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b; | |||
} | |||
static inline void __attribute__((gnu_inline,always_inline)) | |||
static inline void __attribute__((gnu_inline,always_inline,unused)) | |||
smlal2 ( | |||
uint64_t *acc, | |||
const uint32_t a, | |||
@@ -57,7 +57,7 @@ smlal2 ( | |||
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; | |||
} | |||
static inline void __attribute__((gnu_inline,always_inline)) | |||
static inline void __attribute__((gnu_inline,always_inline,unused)) | |||
smull ( | |||
uint64_t *acc, | |||
const uint32_t a, | |||
@@ -66,7 +66,7 @@ smull ( | |||
*acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b; | |||
} | |||
static inline void __attribute__((gnu_inline,always_inline)) | |||
static inline void __attribute__((gnu_inline,always_inline,unused)) | |||
smull2 ( | |||
uint64_t *acc, | |||
const uint32_t a, | |||
@@ -84,6 +84,7 @@ p448_mul ( | |||
const uint32_t *a = as->limb, *b = bs->limb; | |||
uint32_t *c = cs->limb; | |||
const int32x2_t | |||
*val = (const int32x2_t *)a, | |||
*vbl = (const int32x2_t *)b, | |||
@@ -109,155 +110,170 @@ p448_mul ( | |||
accumx0a = vmull_lane_s32( delta = val[1] + vah[1], vbh[3], 0); | |||
accumx1a = vmull_lane_s32( delta, vbh[3], 1); | |||
accumx2a = vmull_lane_s32( delta = val[2] + vah[2], vbh[3], 0); | |||
accumx3a = vmull_lane_s32( delta, vbh[3], 1); | |||
accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[2], 0); | |||
accumx0a = vmlal_lane_s32(accumx0a, delta = val[2] + vah[2], vbh[2], 0); | |||
accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[2], 1); | |||
accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0); | |||
accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1); | |||
accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[1], 0); | |||
accumx0a = vmlal_lane_s32(accumx0a, delta = val[3] + vah[3], vbh[1], 0); | |||
accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[1], 1); | |||
accumx2b = vmull_lane_s32( delta = val[0] + vah[0], vbh[1], 0); | |||
accumx3b = vmull_lane_s32( delta, vbh[1], 1); | |||
accumx0b = vmull_lane_s32( delta, vbh[0], 0); | |||
accumx0b = vmull_lane_s32( delta = val[0] + vah[0], vbh[0], 0); | |||
accumx1b = vmull_lane_s32( delta, vbh[0], 1); | |||
accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0); | |||
accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1); | |||
accumx0b = vmlal_lane_s32(accumx0b, vah[1], vbl[3], 0); | |||
accumx1b = vmlal_lane_s32(accumx1b, vah[1], vbl[3], 1); | |||
accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0); | |||
accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1); | |||
accumx0b = vmlal_lane_s32(accumx0b, vah[2], vbl[2], 0); | |||
accumx1b = vmlal_lane_s32(accumx1b, vah[2], vbl[2], 1); | |||
accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0); | |||
accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1); | |||
accumx0b = vmlal_lane_s32(accumx0b, vah[3], vbl[1], 0); | |||
accumx1b = vmlal_lane_s32(accumx1b, vah[3], vbl[1], 1); | |||
accumx2b += accumx2a; | |||
accumx3b += accumx3a; | |||
accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0); | |||
accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1); | |||
accumx0b += accumx0a; | |||
accumx1b += accumx1a; | |||
accumx0a = vmlal_lane_s32(accumx0a, vah[0], vbl[0], 0); | |||
accumx1a = vmlal_lane_s32(accumx1a, vah[0], vbl[0], 1); | |||
accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0); | |||
accumx3a = vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1); | |||
accumx0a = vmlal_lane_s32(accumx0a, val[1], delta = vbl[3] - vbh[3], 0); | |||
accumx1a = vmlal_lane_s32(accumx1a, val[1], delta, 1); | |||
accumx2a = vmlal_lane_s32(accumx2a, val[2], delta, 0); | |||
accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1); | |||
accumx0a = vmlal_lane_s32(accumx0a, val[2], delta = vbl[2] - vbh[2], 0); | |||
accumx1a = vmlal_lane_s32(accumx1a, val[2], delta, 1); | |||
accumx2a = vmlal_lane_s32(accumx2a, val[3], delta, 0); | |||
accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1); | |||
accumx0a = vmlal_lane_s32(accumx0a, val[3], delta = vbl[1] - vbh[1], 0); | |||
accumx1a = vmlal_lane_s32(accumx1a, val[3], delta, 1); | |||
accumx2a += accumx2b; | |||
accumx3a += accumx3b; | |||
accumx2b = vmlal_lane_s32(accumx2b, val[0], delta, 0); | |||
accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1); | |||
accumx0a += accumx0b; | |||
accumx1a += accumx1b; | |||
accumx0b = vmlal_lane_s32(accumx0b, val[0], delta = vbl[0] - vbh[0], 0); | |||
accumx1b = vmlal_lane_s32(accumx1b, val[0], delta, 1); | |||
accumx2b = vmlal_lane_s32(accumx2b, val[1], delta, 0); | |||
accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1); | |||
xx_vtrnq_s64(&accumx0a, &accumx0b); | |||
xx_vtrnq_s64(&accumx1a, &accumx1b); | |||
xx_vtrnq_s64(&accumx2a, &accumx2b); | |||
xx_vtrnq_s64(&accumx3a, &accumx3b); | |||
accumx0b += accumx1a; | |||
accumx0b = vsraq_n_s64(accumx0b,accumx0a,28); | |||
accumx1b = vsraq_n_s64(accumx1b,accumx0b,28); | |||
trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b)); | |||
vcl[0] = trn_res.val[1] & vmask; | |||
vch[0] = trn_res.val[0] & vmask; | |||
accumx2a = vmull_lane_s32( delta = val[2] + vah[2], vbh[3], 0); | |||
accumx3a = vmull_lane_s32( delta, vbh[3], 1); | |||
accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0); | |||
accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1); | |||
accumx2b = vmull_lane_s32( delta = val[0] + vah[0], vbh[1], 0); | |||
accumx3b = vmull_lane_s32( delta, vbh[1], 1); | |||
accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0); | |||
accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1); | |||
accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0); | |||
accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1); | |||
accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0); | |||
accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1); | |||
accumx2b += accumx2a; | |||
accumx3b += accumx3a; | |||
accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0); | |||
accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1); | |||
accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0); | |||
accumx3a = vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1); | |||
accumx2a = vmlal_lane_s32(accumx2a, val[2], delta = vbl[3] - vbh[3], 0); | |||
accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1); | |||
accumx2a = vmlal_lane_s32(accumx2a, val[3], delta = vbl[2] - vbh[2], 0); | |||
accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1); | |||
accumx2a += accumx2b; | |||
accumx3a += accumx3b; | |||
accumx2b = vmlal_lane_s32(accumx2b, val[0], delta = vbl[1] - vbh[1], 0); | |||
accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1); | |||
accumx2b = vmlal_lane_s32(accumx2b, val[1], delta = vbl[0] - vbh[0], 0); | |||
accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1); | |||
xx_vtrnq_s64(&accumx2a, &accumx2b); | |||
xx_vtrnq_s64(&accumx3a, &accumx3b); | |||
accumx2a += accumx1b; | |||
accumx2b += accumx3a; | |||
accumx2b = vsraq_n_s64(accumx2b,accumx2a,28); | |||
accumx3b = vsraq_n_s64(accumx3b,accumx2b,28); | |||
trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b)); | |||
vcl[0] = trn_res.val[1] & vmask; | |||
vch[0] = trn_res.val[0] & vmask; | |||
trn_res = vtrn_s32(vmovn_s64(accumx2a), vmovn_s64(accumx2b)); | |||
vcl[1] = trn_res.val[1] & vmask; | |||
vch[1] = trn_res.val[0] & vmask; | |||
carry = accumx3b; | |||
accumx4a = vmull_lane_s32( delta = val[3] + vah[3], vbh[3], 0); | |||
accumx5a = vmull_lane_s32( delta, vbh[3], 1); | |||
accumx6b = vmull_lane_s32( delta = val[0] + vah[0], vbh[3], 0); | |||
accumx7b = vmull_lane_s32( delta, vbh[3], 1); | |||
accumx4b = accumx4a; | |||
accumx5b = accumx5a; | |||
accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[2], 0); | |||
accumx4b = vmlal_lane_s32(accumx4b, delta = val[0] + vah[0], vbh[2], 0); | |||
accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[2], 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1); | |||
accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[1], 0); | |||
accumx4b = vmlal_lane_s32(accumx4b, delta = val[1] + vah[1], vbh[1], 0); | |||
accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[1], 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1); | |||
accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[0], 0); | |||
accumx4b = vmlal_lane_s32(accumx4b, delta = val[2] + vah[2], vbh[0], 0); | |||
accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[0], 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[0], 1); | |||
accumx4b = vmlal_lane_s32(accumx4b, vah[3], vbl[3], 0); | |||
accumx5b = vmlal_lane_s32(accumx5b, vah[3], vbl[3], 1); | |||
accumx6a = accumx6b; | |||
accumx7a = accumx7b; | |||
accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0); | |||
accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1); | |||
accumx4a += accumx4b; | |||
accumx5a += accumx5b; | |||
accumx4a = vmlal_lane_s32(accumx4a, vah[0], vbl[2], 0); | |||
accumx5a = vmlal_lane_s32(accumx5a, vah[0], vbl[2], 1); | |||
accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0); | |||
accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1); | |||
accumx4a = vmlal_lane_s32(accumx4a, vah[1], vbl[1], 0); | |||
accumx5a = vmlal_lane_s32(accumx5a, vah[1], vbl[1], 1); | |||
accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0); | |||
accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1); | |||
accumx4a = vmlal_lane_s32(accumx4a, vah[2], vbl[0], 0); | |||
accumx5a = vmlal_lane_s32(accumx5a, vah[2], vbl[0], 1); | |||
accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0); | |||
accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1); | |||
accumx4a = vmlal_lane_s32(accumx4a, val[3], delta = vbl[3] - vbh[3], 0); | |||
accumx5a = vmlal_lane_s32(accumx5a, val[3], delta, 1); | |||
/**/ | |||
accumx6b = vmlal_lane_s32(accumx6b, val[0], delta, 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1); | |||
accumx4b = vmlal_lane_s32(accumx4b, val[0], delta = vbl[2] - vbh[2], 0); | |||
accumx5b = vmlal_lane_s32(accumx5b, val[0], delta, 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, val[1], delta, 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1); | |||
accumx4b = vmlal_lane_s32(accumx4b, val[1], delta = vbl[1] - vbh[1], 0); | |||
accumx5b = vmlal_lane_s32(accumx5b, val[1], delta, 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, val[2], delta, 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1); | |||
accumx4b = vmlal_lane_s32(accumx4b, val[2], delta = vbl[0] - vbh[0], 0); | |||
accumx5b = vmlal_lane_s32(accumx5b, val[2], delta, 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, val[3], delta, 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1); | |||
xx_vtrnq_s64(&accumx4a, &accumx4b); | |||
xx_vtrnq_s64(&accumx5a, &accumx5b); | |||
xx_vtrnq_s64(&accumx6a, &accumx6b); | |||
xx_vtrnq_s64(&accumx7a, &accumx7b); | |||
accumx4a += carry; | |||
accumx4b += accumx5a; | |||
accumx4b = vsraq_n_s64(accumx4b,accumx4a,28); | |||
accumx5b = vsraq_n_s64(accumx5b,accumx4b,28); | |||
accumx6a += accumx5b; | |||
accumx6b += accumx7a; | |||
trn_res = vtrn_s32(vmovn_s64(accumx4a), vmovn_s64(accumx4b)); | |||
vcl[2] = trn_res.val[1] & vmask; | |||
vch[2] = trn_res.val[0] & vmask; | |||
accumx6b = vmull_lane_s32( delta = val[0] + vah[0], vbh[3], 0); | |||
accumx7b = vmull_lane_s32( delta, vbh[3], 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[0], 1); | |||
accumx6a = accumx6b; | |||
accumx7a = accumx7b; | |||
accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0); | |||
accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1); | |||
accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0); | |||
accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1); | |||
accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0); | |||
accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1); | |||
accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0); | |||
accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1); | |||
/**/ | |||
accumx6b = vmlal_lane_s32(accumx6b, val[0], delta = vbl[3] - vbh[3], 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, val[1], delta = vbl[2] - vbh[2], 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, val[2], delta = vbl[1] - vbh[1], 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1); | |||
accumx6b = vmlal_lane_s32(accumx6b, val[3], delta = vbl[0] - vbh[0], 0); | |||
accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1); | |||
xx_vtrnq_s64(&accumx6a, &accumx6b); | |||
xx_vtrnq_s64(&accumx7a, &accumx7b); | |||
accumx6a += accumx5b; | |||
accumx6b += accumx7a; | |||
accumx6b = vsraq_n_s64(accumx6b,accumx6a,28); | |||
accumx7b = vsraq_n_s64(accumx7b,accumx6b,28); | |||
trn_res = vtrn_s32(vmovn_s64(accumx6a), vmovn_s64(accumx6b)); | |||
vcl[3] = trn_res.val[1] & vmask; | |||
vch[3] = trn_res.val[0] & vmask; | |||
accumx7b = xx_vaddup_s64(accumx7b); | |||
int32x2_t t0 = vcl[0], t1 = vch[0]; | |||
@@ -0,0 +1,962 @@ | |||
/** | |||
* @cond internal | |||
* @file ec_point.c | |||
* @copyright | |||
* Copyright (c) 2014 Cryptography Research, Inc. \n | |||
* Released under the MIT License. See LICENSE.txt for license information. | |||
* @author Mike Hamburg | |||
* @warning This file was automatically generated. | |||
*/ | |||
#include "ec_point.h" | |||
/**
 * Fixed chain of p448_sqr / p448_sqrn / p448_mul calls applied to x.
 *
 * Assuming p448_sqr squares, p448_mul multiplies and p448_sqrn(out, in, n)
 * squares n times (TODO confirm against the field implementation), the chain
 * raises x to the power 2^446 - 2^222 - 1.  If the field modulus is
 * p = 2^448 - 2^224 - 1 (not visible in this file), that exponent equals
 * (p - 3) / 4.  The exponent notes below rely on the same assumptions.
 *
 * The only write to a is the final p448_mul, after the last read of x.
 *
 * @param a  Output field element.
 * @param x  Input field element; only read.
 */
void | |||
p448_isr ( | |||
struct p448_t* a, | |||
const struct p448_t* x | |||
) { | |||
struct p448_t L0, L1, L2; | |||
p448_sqr ( &L1, x ); | |||
p448_mul ( &L2, x, &L1 ); | |||
p448_sqr ( &L1, &L2 ); | |||
p448_mul ( &L2, x, &L1 ); | |||
/* L2 = x^(2^3 - 1) */
p448_sqrn ( &L1, &L2, 3 ); | |||
p448_mul ( &L0, &L2, &L1 ); | |||
/* L0 = x^(2^6 - 1) */
p448_sqrn ( &L1, &L0, 3 ); | |||
p448_mul ( &L0, &L2, &L1 ); | |||
/* L0 = x^(2^9 - 1) */
p448_sqrn ( &L2, &L0, 9 ); | |||
p448_mul ( &L1, &L0, &L2 ); | |||
/* L1 = x^(2^18 - 1) */
p448_sqr ( &L0, &L1 ); | |||
p448_mul ( &L2, x, &L0 ); | |||
/* L2 = x^(2^19 - 1) */
p448_sqrn ( &L0, &L2, 18 ); | |||
p448_mul ( &L2, &L1, &L0 ); | |||
/* L2 = x^(2^37 - 1) */
p448_sqrn ( &L0, &L2, 37 ); | |||
p448_mul ( &L1, &L2, &L0 ); | |||
/* L1 = x^(2^74 - 1) */
p448_sqrn ( &L0, &L1, 37 ); | |||
p448_mul ( &L1, &L2, &L0 ); | |||
/* L1 = x^(2^111 - 1) */
p448_sqrn ( &L0, &L1, 111 ); | |||
p448_mul ( &L2, &L1, &L0 ); | |||
/* L2 = x^(2^222 - 1) */
p448_sqr ( &L0, &L2 ); | |||
p448_mul ( &L1, x, &L0 ); | |||
/* L1 = x^(2^223 - 1) */
p448_sqrn ( &L0, &L1, 223 ); | |||
/* a = x^(2^446 - 2^223) * x^(2^222 - 1) = x^(2^446 - 2^222 - 1) */
p448_mul ( a, &L2, &L0 ); | |||
} | |||
void | |||
p448_inverse ( | |||
struct p448_t* a, | |||
const struct p448_t* x | |||
) { | |||
struct p448_t L0, L1; | |||
p448_isr ( &L0, x ); | |||
p448_sqr ( &L1, &L0 ); | |||
p448_sqr ( &L0, &L1 ); | |||
p448_mul ( a, x, &L0 ); | |||
} | |||
/**
 * Updates the point d in place using the three precomputed values in e.
 *
 * Read in terms of the calls made, the routine forms
 *   A = e->a * (d->y - d->x),
 *   B = e->b * (d->x + d->y),
 *   C = e->c * (d->u * d->t),
 * then stores d->u = A + B, d->t = B - A, and finally
 *   d->z = (d->z + C) * (d->z - C),
 *   d->x = (d->z - C) * (B - A),
 *   d->y = (d->z + C) * (A + B)      (d->z here is the value on entry).
 *
 * Every p448_sub is followed by p448_bias and p448_weak_reduce; presumably
 * the bias keeps limbs non-negative before reduction -- TODO confirm.
 *
 * @param d  Point updated in place; x, y, z, t and u are all rewritten.
 * @param e  Precomputed operand; only read.
 */
void | |||
add_tw_niels_to_tw_extensible ( | |||
struct tw_extensible_t* d, | |||
const struct tw_niels_t* e | |||
) { | |||
struct p448_t L0, L1; | |||
p448_sub ( &L1, &d->y, &d->x ); | |||
p448_bias ( &L1, 2 ); | |||
p448_weak_reduce( &L1 ); | |||
/* L0 = A */
p448_mul ( &L0, &e->a, &L1 ); | |||
p448_add ( &L1, &d->x, &d->y ); | |||
/* d->y temporarily holds B */
p448_mul ( &d->y, &e->b, &L1 ); | |||
p448_mul ( &L1, &d->u, &d->t ); | |||
/* d->x temporarily holds C */
p448_mul ( &d->x, &e->c, &L1 ); | |||
p448_add ( &d->u, &L0, &d->y ); | |||
p448_sub ( &d->t, &d->y, &L0 ); | |||
p448_bias ( &d->t, 2 ); | |||
p448_weak_reduce( &d->t ); | |||
/* d->y = z - C, L0 = z + C */
p448_sub ( &d->y, &d->z, &d->x ); | |||
p448_bias ( &d->y, 2 ); | |||
p448_weak_reduce( &d->y ); | |||
p448_add ( &L0, &d->x, &d->z ); | |||
p448_mul ( &d->z, &L0, &d->y ); | |||
p448_mul ( &d->x, &d->y, &d->t ); | |||
p448_mul ( &d->y, &L0, &d->u ); | |||
} | |||
/**
 * Updates the point d in place using the three precomputed values in e.
 *
 * Same call structure as add_tw_niels_to_tw_extensible, but e->a and e->b
 * swap roles and the (z + C) / (z - C) factors swap places:
 *   A = e->b * (d->y - d->x),
 *   B = e->a * (d->x + d->y),
 *   C = e->c * (d->u * d->t),
 *   d->u = A + B,  d->t = B - A,
 *   d->z = (z - C) * (z + C),
 *   d->x = (z + C) * (B - A),
 *   d->y = (z - C) * (A + B)         (z is d->z on entry).
 *
 * @param d  Point updated in place; x, y, z, t and u are all rewritten.
 * @param e  Precomputed operand; only read.
 */
void | |||
sub_tw_niels_from_tw_extensible ( | |||
struct tw_extensible_t* d, | |||
const struct tw_niels_t* e | |||
) { | |||
struct p448_t L0, L1; | |||
p448_sub ( &L1, &d->y, &d->x ); | |||
p448_bias ( &L1, 2 ); | |||
p448_weak_reduce( &L1 ); | |||
/* L0 = A */
p448_mul ( &L0, &e->b, &L1 ); | |||
p448_add ( &L1, &d->x, &d->y ); | |||
/* d->y temporarily holds B */
p448_mul ( &d->y, &e->a, &L1 ); | |||
p448_mul ( &L1, &d->u, &d->t ); | |||
/* d->x temporarily holds C */
p448_mul ( &d->x, &e->c, &L1 ); | |||
p448_add ( &d->u, &L0, &d->y ); | |||
p448_sub ( &d->t, &d->y, &L0 ); | |||
p448_bias ( &d->t, 2 ); | |||
p448_weak_reduce( &d->t ); | |||
/* d->y = z + C, L0 = z - C */
p448_add ( &d->y, &d->x, &d->z ); | |||
p448_sub ( &L0, &d->z, &d->x ); | |||
p448_bias ( &L0, 2 ); | |||
p448_weak_reduce( &L0 ); | |||
p448_mul ( &d->z, &L0, &d->y ); | |||
p448_mul ( &d->x, &d->y, &d->t ); | |||
p448_mul ( &d->y, &L0, &d->u ); | |||
} | |||
void | |||
add_tw_pniels_to_tw_extensible ( | |||
struct tw_extensible_t* e, | |||
const struct tw_pniels_t* a | |||
) { | |||
struct p448_t L0; | |||
p448_mul ( &L0, &e->z, &a->z ); | |||
p448_copy ( &e->z, &L0 ); | |||
add_tw_niels_to_tw_extensible( e, &a->n ); | |||
} | |||
void | |||
sub_tw_pniels_from_tw_extensible ( | |||
struct tw_extensible_t* e, | |||
const struct tw_pniels_t* a | |||
) { | |||
struct p448_t L0; | |||
p448_mul ( &L0, &e->z, &a->z ); | |||
p448_copy ( &e->z, &L0 ); | |||
sub_tw_niels_from_tw_extensible( e, &a->n ); | |||
} | |||
/**
 * Rewrites the point a in place from squares and sums of its own x, y, z.
 *
 * In terms of the calls made (x, y, z are the values on entry):
 *   a->u = x^2 + y^2
 *   a->t = (x + y)^2 - (x^2 + y^2)
 *   D    = y^2 - x^2
 *   E    = 2*z^2 - D
 *   a->z = D * E,  a->x = E * a->t,  a->y = D * a->u
 *
 * NOTE(review): presumably a point doubling on the twisted curve (from the
 * name) -- the curve equation is not visible here.
 *
 * @param a  Point updated in place; all five fields are rewritten.
 */
void | |||
double_tw_extensible ( | |||
struct tw_extensible_t* a | |||
) { | |||
struct p448_t L0, L1, L2; | |||
p448_sqr ( &L2, &a->x ); | |||
p448_sqr ( &L0, &a->y ); | |||
p448_add ( &a->u, &L2, &L0 ); | |||
p448_add ( &a->t, &a->y, &a->x ); | |||
p448_sqr ( &L1, &a->t ); | |||
p448_sub ( &a->t, &L1, &a->u ); | |||
p448_bias ( &a->t, 3 ); | |||
p448_weak_reduce( &a->t ); | |||
/* L1 = D */
p448_sub ( &L1, &L0, &L2 ); | |||
p448_bias ( &L1, 2 ); | |||
p448_weak_reduce( &L1 ); | |||
/* a->x is scratch for z^2; the bias is applied before doubling */
p448_sqr ( &a->x, &a->z ); | |||
p448_bias ( &a->x, 1 ); | |||
p448_add ( &a->z, &a->x, &a->x ); | |||
/* L0 = E */
p448_sub ( &L0, &a->z, &L1 ); | |||
p448_weak_reduce( &L0 ); | |||
p448_mul ( &a->z, &L1, &L0 ); | |||
p448_mul ( &a->x, &L0, &a->t ); | |||
p448_mul ( &a->y, &L1, &a->u ); | |||
} | |||
/**
 * Rewrites the point a in place from squares and sums of its own x, y, z.
 *
 * In terms of the calls made (x, y, z are the values on entry):
 *   S    = x^2 + y^2
 *   a->t = (x + y)^2 - S
 *   a->u = y^2 - x^2
 *   E    = 2*z^2 - S
 *   a->z = S * E,  a->x = E * a->t,  a->y = S * a->u
 *
 * Differs from double_tw_extensible in which of (x^2 + y^2) and
 * (y^2 - x^2) is subtracted from 2*z^2, and in using a bias of 2 on z^2.
 *
 * @param a  Point updated in place; all five fields are rewritten.
 */
void | |||
double_extensible ( | |||
struct extensible_t* a | |||
) { | |||
struct p448_t L0, L1, L2; | |||
p448_sqr ( &L2, &a->x ); | |||
p448_sqr ( &L0, &a->y ); | |||
/* L1 = S */
p448_add ( &L1, &L2, &L0 ); | |||
p448_add ( &a->t, &a->y, &a->x ); | |||
p448_sqr ( &a->u, &a->t ); | |||
p448_sub ( &a->t, &a->u, &L1 ); | |||
p448_bias ( &a->t, 3 ); | |||
p448_weak_reduce( &a->t ); | |||
p448_sub ( &a->u, &L0, &L2 ); | |||
p448_bias ( &a->u, 2 ); | |||
p448_weak_reduce( &a->u ); | |||
/* a->x is scratch for z^2; the bias is applied before doubling */
p448_sqr ( &a->x, &a->z ); | |||
p448_bias ( &a->x, 2 ); | |||
p448_add ( &a->z, &a->x, &a->x ); | |||
/* L0 = E */
p448_sub ( &L0, &a->z, &L1 ); | |||
p448_weak_reduce( &L0 ); | |||
p448_mul ( &a->z, &L1, &L0 ); | |||
p448_mul ( &a->x, &L0, &a->t ); | |||
p448_mul ( &a->y, &L1, &a->u ); | |||
} | |||
/**
 * Fills b from squares and sums of a's x, y, z.
 *
 * In terms of the calls made:
 *   b->u = a->x^2 + a->y^2
 *   b->t = (a->x + a->y)^2 - b->u
 *   D    = a->y^2 - a->x^2
 *   E    = 2*a->z^2 - b->u
 *   b->z = D * E,  b->x = E * b->t,  b->y = D * b->u
 *
 * a->z is read after several fields of b have already been written, so the
 * statement order assumes b does not overlap a.
 *
 * @param b  Output point; all five fields are written.
 * @param a  Input point; only read.
 */
void | |||
twist_and_double ( | |||
struct tw_extensible_t* b, | |||
const struct extensible_t* a | |||
) { | |||
struct p448_t L0; | |||
p448_sqr ( &b->x, &a->x ); | |||
p448_sqr ( &b->z, &a->y ); | |||
p448_add ( &b->u, &b->x, &b->z ); | |||
p448_add ( &b->t, &a->y, &a->x ); | |||
p448_sqr ( &L0, &b->t ); | |||
p448_sub ( &b->t, &L0, &b->u ); | |||
p448_bias ( &b->t, 3 ); | |||
p448_weak_reduce( &b->t ); | |||
/* L0 = D */
p448_sub ( &L0, &b->z, &b->x ); | |||
p448_bias ( &L0, 2 ); | |||
p448_weak_reduce( &L0 ); | |||
p448_sqr ( &b->x, &a->z ); | |||
p448_bias ( &b->x, 2 ); | |||
p448_add ( &b->z, &b->x, &b->x ); | |||
/* b->y temporarily holds E */
p448_sub ( &b->y, &b->z, &b->u ); | |||
p448_weak_reduce( &b->y ); | |||
p448_mul ( &b->z, &L0, &b->y ); | |||
p448_mul ( &b->x, &b->y, &b->t ); | |||
p448_mul ( &b->y, &L0, &b->u ); | |||
} | |||
/**
 * Fills b from squares and sums of a's x, y, z.
 *
 * In terms of the calls made:
 *   S    = a->x^2 + a->y^2
 *   b->t = (a->x + a->y)^2 - S
 *   b->u = a->y^2 - a->x^2
 *   E    = 2*a->z^2 - b->u
 *   b->z = S * E,  b->x = E * b->t,  b->y = S * b->u
 *
 * a->z is read after several fields of b have already been written, so the
 * statement order assumes b does not overlap a.
 *
 * @param b  Output point; all five fields are written.
 * @param a  Input point; only read.
 */
void | |||
untwist_and_double ( | |||
struct extensible_t* b, | |||
const struct tw_extensible_t* a | |||
) { | |||
struct p448_t L0; | |||
p448_sqr ( &b->x, &a->x ); | |||
p448_sqr ( &b->z, &a->y ); | |||
/* L0 = S */
p448_add ( &L0, &b->x, &b->z ); | |||
p448_add ( &b->t, &a->y, &a->x ); | |||
p448_sqr ( &b->u, &b->t ); | |||
p448_sub ( &b->t, &b->u, &L0 ); | |||
p448_bias ( &b->t, 3 ); | |||
p448_weak_reduce( &b->t ); | |||
p448_sub ( &b->u, &b->z, &b->x ); | |||
p448_bias ( &b->u, 2 ); | |||
p448_weak_reduce( &b->u ); | |||
p448_sqr ( &b->x, &a->z ); | |||
p448_bias ( &b->x, 1 ); | |||
p448_add ( &b->z, &b->x, &b->x ); | |||
/* b->y temporarily holds E */
p448_sub ( &b->y, &b->z, &b->u ); | |||
p448_weak_reduce( &b->y ); | |||
p448_mul ( &b->z, &L0, &b->y ); | |||
p448_mul ( &b->x, &b->y, &b->t ); | |||
p448_mul ( &b->y, &L0, &b->u ); | |||
} | |||
/**
 * Fills b from the coordinates of a.
 *
 * In terms of the calls made:
 *   b->n.a = a->y - a->x
 *   b->n.b = a->x + a->y
 *   b->n.c = -(78164 * a->x * a->y)
 *   b->z   = 2
 *
 * b->z is used as scratch for the p448_mulw result before being set to 2.
 * p448_mulw presumably multiplies by a small integer constant -- TODO
 * confirm; the meaning of 78164 is not established in this file.
 *
 * @param b  Output; all of n.a, n.b, n.c and z are written.
 * @param a  Input point; only read.
 */
void | |||
convert_tw_affine_to_tw_pniels ( | |||
struct tw_pniels_t* b, | |||
const struct tw_affine_t* a | |||
) { | |||
p448_sub ( &b->n.a, &a->y, &a->x ); | |||
p448_bias ( &b->n.a, 2 ); | |||
p448_weak_reduce( &b->n.a ); | |||
p448_add ( &b->n.b, &a->x, &a->y ); | |||
p448_weak_reduce( &b->n.b ); | |||
p448_mul ( &b->n.c, &a->y, &a->x ); | |||
p448_mulw ( &b->z, &b->n.c, 78164 ); | |||
p448_neg ( &b->n.c, &b->z ); | |||
p448_bias ( &b->n.c, 2 ); | |||
p448_weak_reduce( &b->n.c ); | |||
p448_set_ui( &b->z, 2 ); | |||
} | |||
void | |||
convert_tw_affine_to_tw_extensible ( | |||
struct tw_extensible_t* b, | |||
const struct tw_affine_t* a | |||
) { | |||
p448_copy ( &b->x, &a->x ); | |||
p448_copy ( &b->y, &a->y ); | |||
p448_set_ui( &b->z, 1 ); | |||
p448_copy ( &b->t, &a->x ); | |||
p448_copy ( &b->u, &a->y ); | |||
} | |||
void | |||
convert_affine_to_extensible ( | |||
struct extensible_t* b, | |||
const struct affine_t* a | |||
) { | |||
p448_copy ( &b->x, &a->x ); | |||
p448_copy ( &b->y, &a->y ); | |||
p448_set_ui( &b->z, 1 ); | |||
p448_copy ( &b->t, &a->x ); | |||
p448_copy ( &b->u, &a->y ); | |||
} | |||
/**
 * Fills b from the fields of a.
 *
 * In terms of the calls made:
 *   b->n.a = a->y - a->x
 *   b->n.b = a->x + a->y
 *   b->n.c = -(78164 * a->u * a->t)
 *   b->z   = a->z + a->z
 *
 * b->z is used as scratch for the p448_mulw result before receiving its
 * final value.  The meaning of 78164 is not established in this file.
 *
 * @param b  Output; all of n.a, n.b, n.c and z are written.
 * @param a  Input point; only read.
 */
void | |||
convert_tw_extensible_to_tw_pniels ( | |||
struct tw_pniels_t* b, | |||
const struct tw_extensible_t* a | |||
) { | |||
p448_sub ( &b->n.a, &a->y, &a->x ); | |||
p448_bias ( &b->n.a, 2 ); | |||
p448_weak_reduce( &b->n.a ); | |||
p448_add ( &b->n.b, &a->x, &a->y ); | |||
p448_weak_reduce( &b->n.b ); | |||
p448_mul ( &b->n.c, &a->u, &a->t ); | |||
p448_mulw ( &b->z, &b->n.c, 78164 ); | |||
p448_neg ( &b->n.c, &b->z ); | |||
p448_bias ( &b->n.c, 2 ); | |||
p448_weak_reduce( &b->n.c ); | |||
p448_add ( &b->z, &a->z, &a->z ); | |||
p448_weak_reduce( &b->z ); | |||
} | |||
/**
 * Fills e from the fields of d.
 *
 * In terms of the calls made:
 *   e->u = d->n.b + d->n.a
 *   e->t = d->n.b - d->n.a
 *   e->x = d->z * e->t
 *   e->y = d->z * e->u
 *   e->z = d->z^2
 *
 * Unlike convert_tw_niels_to_tw_extensible, the sum stored in e->u is not
 * passed through p448_weak_reduce.
 *
 * @param e  Output point; all five fields are written.
 * @param d  Input; only read.
 */
void | |||
convert_tw_pniels_to_tw_extensible ( | |||
struct tw_extensible_t* e, | |||
const struct tw_pniels_t* d | |||
) { | |||
p448_add ( &e->u, &d->n.b, &d->n.a ); | |||
p448_sub ( &e->t, &d->n.b, &d->n.a ); | |||
p448_bias ( &e->t, 2 ); | |||
p448_weak_reduce( &e->t ); | |||
p448_mul ( &e->x, &d->z, &e->t ); | |||
p448_mul ( &e->y, &d->z, &e->u ); | |||
p448_sqr ( &e->z, &d->z ); | |||
} | |||
/**
 * Fills e from the a and b fields of d (d->c is not read).
 *
 * In terms of the calls made:
 *   e->y = d->b + d->a
 *   e->x = d->b - d->a
 *   e->z = 1
 *   e->t = e->x,  e->u = e->y
 *
 * @param e  Output point; all five fields are written.
 * @param d  Input; only read.
 */
void | |||
convert_tw_niels_to_tw_extensible ( | |||
struct tw_extensible_t* e, | |||
const struct tw_niels_t* d | |||
) { | |||
p448_add ( &e->y, &d->b, &d->a ); | |||
p448_weak_reduce( &e->y ); | |||
p448_sub ( &e->x, &d->b, &d->a ); | |||
p448_bias ( &e->x, 2 ); | |||
p448_weak_reduce( &e->x ); | |||
p448_set_ui( &e->z, 1 ); | |||
p448_copy ( &e->t, &e->x ); | |||
p448_copy ( &e->u, &e->y ); | |||
} | |||
/**
 * Advances the state a in place: xa, za, xd and zd are all rewritten, while
 * z0 is only read.
 *
 * NOTE(review): presumably one step of a Montgomery ladder (from the name),
 * with (xa, za) the "add" pair and (xd, zd) the "double" pair -- confirm.
 * The meaning of the constant 39082 is not established in this file.
 *
 * The state fields double as scratch space, so the statement order below is
 * significant.
 *
 * @param a  Ladder state updated in place.
 */
void | |||
montgomery_step ( | |||
struct montgomery_t* a | |||
) { | |||
struct p448_t L0, L1; | |||
/* L0 = xd + zd, L1 = xd - zd (values on entry) */
p448_add ( &L0, &a->zd, &a->xd ); | |||
p448_sub ( &L1, &a->xd, &a->zd ); | |||
p448_bias ( &L1, 2 ); | |||
p448_weak_reduce( &L1 ); | |||
/* zd is scratch from here on */
p448_sub ( &a->zd, &a->xa, &a->za ); | |||
p448_bias ( &a->zd, 2 ); | |||
p448_weak_reduce( &a->zd ); | |||
p448_mul ( &a->xd, &L0, &a->zd ); | |||
p448_add ( &a->zd, &a->za, &a->xa ); | |||
p448_mul ( &a->za, &L1, &a->zd ); | |||
/* new xa = z0 * (sum of the two cross products)^2 */
p448_add ( &a->xa, &a->za, &a->xd ); | |||
p448_sqr ( &a->zd, &a->xa ); | |||
p448_mul ( &a->xa, &a->z0, &a->zd ); | |||
/* new za = (difference of the two cross products)^2 */
p448_sub ( &a->zd, &a->xd, &a->za ); | |||
p448_bias ( &a->zd, 2 ); | |||
p448_weak_reduce( &a->zd ); | |||
p448_sqr ( &a->za, &a->zd ); | |||
/* xd = L0^2, L0 = L1^2 (squares of the entry sum and difference) */
p448_sqr ( &a->xd, &L0 ); | |||
p448_sqr ( &L0, &L1 ); | |||
p448_mulw ( &a->zd, &a->xd, 39082 ); | |||
p448_sub ( &L1, &a->xd, &L0 ); | |||
p448_bias ( &L1, 2 ); | |||
p448_weak_reduce( &L1 ); | |||
p448_mul ( &a->xd, &L0, &a->zd ); | |||
p448_sub ( &L0, &a->zd, &L1 ); | |||
p448_bias ( &L0, 2 ); | |||
p448_weak_reduce( &L0 ); | |||
p448_mul ( &a->zd, &L0, &L1 ); | |||
} | |||
void | |||
deserialize_montgomery ( | |||
struct montgomery_t* a, | |||
const struct p448_t* sbz | |||
) { | |||
p448_sqr ( &a->z0, sbz ); | |||
p448_set_ui( &a->xd, 1 ); | |||
p448_set_ui( &a->zd, 0 ); | |||
p448_set_ui( &a->xa, 1 ); | |||
p448_copy ( &a->za, &a->z0 ); | |||
} | |||
/**
 * Derives a single field element b from the state a and the element sbz,
 * and returns a mask_t status.
 *
 * What the code visibly does:
 *  - builds a product in L3 from a's fields and sbz, passes it through
 *    p448_isr, and multiplies the result back in;
 *  - writes b = p448_mask(L2, ~p448_is_zero(a->zd));
 *  - returns p448_is_zero(L3 * isr(L3)^2 - 1) | p448_is_zero(sbz).
 *
 * NOTE(review): mask_t is presumably all-ones / all-zeros, so that "- L5"
 * yields 1 or 0 for p448_addw and "~ L5" inverts the selection -- confirm
 * against the mask_t and p448_mask definitions.  The meaning of 39082 is
 * not established in this file.
 *
 * @param b    Output field element.
 * @param a    Input state; only read.
 * @param sbz  Input field element; only read.
 * @return     OR of the two p448_is_zero results described above.
 */
mask_t | |||
serialize_montgomery ( | |||
struct p448_t* b, | |||
const struct montgomery_t* a, | |||
const struct p448_t* sbz | |||
) { | |||
mask_t L4, L5, L6; | |||
struct p448_t L0, L1, L2, L3; | |||
/* L3 = za * (z0*zd - xd) */
p448_mul ( &L3, &a->z0, &a->zd ); | |||
p448_sub ( &L1, &L3, &a->xd ); | |||
p448_bias ( &L1, 2 ); | |||
p448_weak_reduce( &L1 ); | |||
p448_mul ( &L3, &a->za, &L1 ); | |||
/* L0 = xa * (z0*xd - zd) */
p448_mul ( &L2, &a->z0, &a->xd ); | |||
p448_sub ( &L1, &L2, &a->zd ); | |||
p448_bias ( &L1, 2 ); | |||
p448_weak_reduce( &L1 ); | |||
p448_mul ( &L0, &a->xa, &L1 ); | |||
/* L3 = (L3 - L0) * (L0 + L3) */
p448_add ( &L2, &L0, &L3 ); | |||
p448_sub ( &L1, &L3, &L0 ); | |||
p448_bias ( &L1, 2 ); | |||
p448_weak_reduce( &L1 ); | |||
p448_mul ( &L3, &L1, &L2 ); | |||
/* L0 = xd * (4*z0 - 39082*(z0 + 1)^2) */
p448_copy ( &L2, &a->z0 ); | |||
p448_addw ( &L2, 1 ); | |||
p448_sqr ( &L1, &L2 ); | |||
p448_mulw ( &L2, &L1, 39082 ); | |||
p448_neg ( &L1, &L2 ); | |||
p448_add ( &L2, &a->z0, &a->z0 ); | |||
p448_bias ( &L2, 1 ); | |||
p448_add ( &L0, &L2, &L2 ); | |||
p448_add ( &L2, &L0, &L1 ); | |||
p448_weak_reduce( &L2 ); | |||
p448_mul ( &L0, &a->xd, &L2 ); | |||
/* Special-case handling keyed on whether a->zd is zero */
L5 = p448_is_zero( &a->zd ); | |||
L6 = - L5; | |||
p448_mask ( &L1, &L0, L5 ); | |||
p448_add ( &L2, &L1, &a->zd ); | |||
L4 = ~ L5; | |||
p448_mul ( &L1, sbz, &L3 ); | |||
p448_addw ( &L1, L6 ); | |||
p448_mul ( &L3, &L2, &L1 ); | |||
p448_mul ( &L1, &L3, &L2 ); | |||
p448_mul ( &L2, &L3, &a->xd ); | |||
p448_mul ( &L3, &L1, &L2 ); | |||
p448_isr ( &L0, &L3 ); | |||
p448_mul ( &L2, &L1, &L0 ); | |||
/* L0 = L3 * isr(L3)^2, compared against 1 below */
p448_sqr ( &L1, &L0 ); | |||
p448_mul ( &L0, &L3, &L1 ); | |||
p448_mask ( b, &L2, L4 ); | |||
p448_subw ( &L0, 1 ); | |||
p448_bias ( &L0, 1 ); | |||
L5 = p448_is_zero( &L0 ); | |||
L4 = p448_is_zero( sbz ); | |||
return L5 | L4; | |||
} | |||
/**
 * Derives a single field element b from the point a.
 *
 * In terms of the calls made, with D = a->y - a->z, S = a->z + a->y and
 * P = D * (a->z * a->x):
 *   L1 = P * D,  L0 = P * S,  L2 = L1 * L0,
 *   b  = L1 * p448_isr(L2).
 *
 * b is used as scratch for S before receiving the final result.
 *
 * NOTE(review): the last two statements compute L2 * isr(L2)^2 into L0, but
 * that value is neither returned nor read afterwards within this function.
 *
 * @param b  Output field element.
 * @param a  Input point; only x, y and z are read.
 */
void | |||
serialize_extensible ( | |||
struct p448_t* b, | |||
const struct extensible_t* a | |||
) { | |||
struct p448_t L0, L1, L2; | |||
p448_sub ( &L0, &a->y, &a->z ); | |||
p448_bias ( &L0, 2 ); | |||
p448_weak_reduce( &L0 ); | |||
p448_add ( b, &a->z, &a->y ); | |||
p448_mul ( &L1, &a->z, &a->x ); | |||
p448_mul ( &L2, &L0, &L1 ); | |||
p448_mul ( &L1, &L2, &L0 ); | |||
p448_mul ( &L0, &L2, b ); | |||
p448_mul ( &L2, &L1, &L0 ); | |||
p448_isr ( &L0, &L2 ); | |||
p448_mul ( b, &L1, &L0 ); | |||
/* Result below is not consumed in this function (see NOTE above) */
p448_sqr ( &L1, &L0 ); | |||
p448_mul ( &L0, &L2, &L1 ); | |||
} | |||
/**
 * Derives a single field element b from the point a.
 *
 * In terms of the calls made:
 *   L3 = a->y * a->x
 *   Q  = (a->y + a->x)^2 - 2*L3
 *   L2 = -(39082 * 2*Q),   b' = -(39082 * L2)
 *   L0 = L2 * a->z^4,      L2' = b' * L0
 *   b  = (b' * p448_isr(L2')) * L3
 *
 * b is used as scratch throughout and only receives the result in the last
 * statement.  The meaning of 39082 is not established in this file.
 *
 * NOTE(review): the p448_sqr / p448_mul pair just before the final multiply
 * writes b and L0, but b is overwritten by the final statement and L0 is
 * not read again.
 *
 * @param b  Output field element.
 * @param a  Input point; only x, y and z are read.
 */
void | |||
untwist_and_double_and_serialize ( | |||
struct p448_t* b, | |||
const struct tw_extensible_t* a | |||
) { | |||
struct p448_t L0, L1, L2, L3; | |||
p448_mul ( &L3, &a->y, &a->x ); | |||
p448_add ( b, &a->y, &a->x ); | |||
p448_sqr ( &L1, b ); | |||
p448_add ( &L2, &L3, &L3 ); | |||
/* b = Q */
p448_sub ( b, &L1, &L2 ); | |||
p448_bias ( b, 3 ); | |||
p448_weak_reduce( b ); | |||
/* L1 = a->z^4 */
p448_sqr ( &L2, &a->z ); | |||
p448_sqr ( &L1, &L2 ); | |||
p448_add ( &L2, b, b ); | |||
p448_mulw ( b, &L2, 39082 ); | |||
p448_neg ( &L2, b ); | |||
p448_bias ( &L2, 2 ); | |||
p448_mulw ( &L0, &L2, 39082 ); | |||
p448_neg ( b, &L0 ); | |||
p448_bias ( b, 2 ); | |||
p448_mul ( &L0, &L2, &L1 ); | |||
p448_mul ( &L2, b, &L0 ); | |||
p448_isr ( &L0, &L2 ); | |||
p448_mul ( &L1, b, &L0 ); | |||
/* Results of the next two calls are not consumed (see NOTE above) */
p448_sqr ( b, &L0 ); | |||
p448_mul ( &L0, &L2, b ); | |||
p448_mul ( b, &L1, &L3 ); | |||
} | |||
/**
 * Fills b from the point a.
 *
 * What the code visibly does:
 *  - forms a product from (a->z^2 - a->x^2), (a->z - a->x), (a->z - a->y)
 *    and a->y, and passes it through p448_isr to get a scale factor b->u;
 *  - sets b->x = a->x * scale and b->y = a->y * scale;
 *  - adds (- p448_is_zero(a->z - a->y)) to b->y via p448_addw;
 *  - finishes with b->z = 1, b->t = b->x, b->u = b->y.
 *
 * b's fields serve as scratch throughout, so the statement order matters.
 *
 * NOTE(review): mask_t is presumably all-ones / all-zeros so that "- L1" is
 * 1 or 0 -- confirm.  The p448_sqr / p448_mul pair right after the scale
 * factor is computed writes b->x and b->t, both of which are overwritten
 * before being read again.
 *
 * @param b  Output point; all five fields are written.
 * @param a  Input point; only x, y and z are read.
 */
void | |||
twist_even ( | |||
struct tw_extensible_t* b, | |||
const struct extensible_t* a | |||
) { | |||
mask_t L0, L1; | |||
p448_sqr ( &b->y, &a->z ); | |||
p448_sqr ( &b->z, &a->x ); | |||
p448_sub ( &b->u, &b->y, &b->z ); | |||
p448_bias ( &b->u, 2 ); | |||
p448_weak_reduce( &b->u ); | |||
p448_sub ( &b->z, &a->z, &a->x ); | |||
p448_bias ( &b->z, 2 ); | |||
p448_weak_reduce( &b->z ); | |||
p448_mul ( &b->y, &b->z, &a->y ); | |||
/* b->z = a->z - a->y; tested for zero further down */
p448_sub ( &b->z, &a->z, &a->y ); | |||
p448_bias ( &b->z, 2 ); | |||
p448_weak_reduce( &b->z ); | |||
p448_mul ( &b->x, &b->z, &b->y ); | |||
p448_mul ( &b->t, &b->x, &b->u ); | |||
p448_mul ( &b->y, &b->x, &b->t ); | |||
p448_isr ( &b->t, &b->y ); | |||
/* b->u = scale factor */
p448_mul ( &b->u, &b->x, &b->t ); | |||
/* Results of the next two calls are overwritten before use (see NOTE) */
p448_sqr ( &b->x, &b->t ); | |||
p448_mul ( &b->t, &b->y, &b->x ); | |||
p448_mul ( &b->x, &a->x, &b->u ); | |||
p448_mul ( &b->y, &a->y, &b->u ); | |||
L1 = p448_is_zero( &b->z ); | |||
L0 = - L1; | |||
p448_addw ( &b->y, L0 ); | |||
p448_weak_reduce( &b->y ); | |||
p448_set_ui( &b->z, 1 ); | |||
p448_copy ( &b->t, &b->x ); | |||
p448_copy ( &b->u, &b->y ); | |||
} | |||
/**
 * Fills b from the point a.
 *
 * What the code visibly does:
 *  - forms a product from 4*(a->z^2 - a->x^2), (a->z - a->x),
 *    (a->z - a->y) and a->y, and passes it through p448_isr;
 *  - combines the result with (a->x - a->y) and (a->y + a->x) to produce
 *    b->x and b->y;
 *  - adds (- p448_is_zero(a->z - a->x)) to b->x and
 *    (- p448_is_zero(a->z - a->y)) to b->y via p448_addw;
 *  - sets b->z = p448_is_zero(a->y) + 1, then b->t = b->x, b->u = b->y.
 *
 * b's fields serve as scratch throughout, so the statement order matters.
 *
 * NOTE(review): mask_t is presumably all-ones / all-zeros, making "- L2"
 * equal to 1 or 0 and "L3 + 1" equal to 0 or 1 -- confirm.  The name
 * suggests this routine is intended for tests only.
 *
 * @param b  Output point; all five fields are written.
 * @param a  Input point; only x, y and z are read.
 */
void | |||
test_only_twist ( | |||
struct tw_extensible_t* b, | |||
const struct extensible_t* a | |||
) { | |||
mask_t L2, L3; | |||
struct p448_t L0, L1; | |||
/* b->u = 4 * (a->z^2 - a->x^2) */
p448_sqr ( &b->u, &a->z ); | |||
p448_sqr ( &b->y, &a->x ); | |||
p448_sub ( &b->z, &b->u, &b->y ); | |||
p448_bias ( &b->z, 2 ); | |||
p448_add ( &b->y, &b->z, &b->z ); | |||
p448_add ( &b->u, &b->y, &b->y ); | |||
p448_weak_reduce( &b->u ); | |||
/* b->y = a->z - a->x; tested for zero further down */
p448_sub ( &b->y, &a->z, &a->x ); | |||
p448_bias ( &b->y, 2 ); | |||
p448_weak_reduce( &b->y ); | |||
p448_mul ( &b->x, &b->y, &a->y ); | |||
/* b->z = a->z - a->y; tested for zero further down */
p448_sub ( &b->z, &a->z, &a->y ); | |||
p448_bias ( &b->z, 2 ); | |||
p448_weak_reduce( &b->z ); | |||
p448_mul ( &b->t, &b->z, &b->x ); | |||
p448_mul ( &L1, &b->t, &b->u ); | |||
p448_mul ( &b->x, &b->t, &L1 ); | |||
p448_isr ( &L0, &b->x ); | |||
p448_mul ( &b->u, &b->t, &L0 ); | |||
p448_sqr ( &L1, &L0 ); | |||
p448_mul ( &b->t, &b->x, &L1 ); | |||
p448_add ( &L1, &a->y, &a->x ); | |||
p448_weak_reduce( &L1 ); | |||
p448_sub ( &L0, &a->x, &a->y ); | |||
p448_bias ( &L0, 2 ); | |||
p448_weak_reduce( &L0 ); | |||
p448_mul ( &b->x, &b->t, &L0 ); | |||
p448_add ( &L0, &b->x, &L1 ); | |||
p448_sub ( &b->t, &L1, &b->x ); | |||
p448_bias ( &b->t, 2 ); | |||
p448_weak_reduce( &b->t ); | |||
p448_mul ( &b->x, &L0, &b->u ); | |||
L2 = p448_is_zero( &b->y ); | |||
L3 = - L2; | |||
p448_addw ( &b->x, L3 ); | |||
p448_weak_reduce( &b->x ); | |||
p448_mul ( &b->y, &b->t, &b->u ); | |||
L2 = p448_is_zero( &b->z ); | |||
L3 = - L2; | |||
p448_addw ( &b->y, L3 ); | |||
p448_weak_reduce( &b->y ); | |||
L3 = p448_is_zero( &a->y ); | |||
L2 = L3 + 1; | |||
p448_set_ui( &b->z, L2 ); | |||
p448_copy ( &b->t, &b->x ); | |||
p448_copy ( &b->u, &b->y ); | |||
} | |||
mask_t | |||
is_square ( | |||
const struct p448_t* x | |||
) { | |||
mask_t L2, L3; | |||
struct p448_t L0, L1; | |||
p448_isr ( &L0, x ); | |||
p448_sqr ( &L1, &L0 ); | |||
p448_mul ( &L0, x, &L1 ); | |||
p448_subw ( &L0, 1 ); | |||
p448_bias ( &L0, 1 ); | |||
L3 = p448_is_zero( &L0 ); | |||
L2 = p448_is_zero( x ); | |||
return L3 | L2; | |||
} | |||
mask_t | |||
is_even_pt ( | |||
const struct extensible_t* a | |||
) { | |||
struct p448_t L0, L1, L2; | |||
p448_sqr ( &L2, &a->z ); | |||
p448_sqr ( &L1, &a->x ); | |||
p448_sub ( &L0, &L2, &L1 ); | |||
p448_bias ( &L0, 2 ); | |||
p448_weak_reduce( &L0 ); | |||
return is_square ( &L0 ); | |||
} | |||
mask_t | |||
is_even_tw ( | |||
const struct tw_extensible_t* a | |||
) { | |||
struct p448_t L0, L1, L2; | |||
p448_sqr ( &L2, &a->z ); | |||
p448_sqr ( &L1, &a->x ); | |||
p448_add ( &L0, &L1, &L2 ); | |||
p448_weak_reduce( &L0 ); | |||
return is_square ( &L0 ); | |||
} | |||
/**
 * Decode the single field element `sz` into the affine point `a`.
 *
 * a->x and a->y are used as scratch space before receiving the final
 * coordinates, and L0..L3 are reused, so statement order is significant.
 * p448_isr is defined elsewhere (presumably an inverse square root).
 *
 * @param a   Output point; both coordinates are overwritten.
 * @param sz  Serialized value to decode.
 * @return    p448_is_zero of the final check value minus 1; presumably
 *            all-ones on success -- confirm against p448_is_zero.
 */
mask_t | |||
deserialize_affine ( | |||
struct affine_t* a, | |||
const struct p448_t* sz | |||
) { | |||
struct p448_t L0, L1, L2, L3; | |||
/* L1 = sz^2 (kept until the end); a->x = -39082 * (sz^2 + 1)^2 */
p448_sqr ( &L1, sz ); | |||
p448_copy ( &L3, &L1 ); | |||
p448_addw ( &L3, 1 ); | |||
p448_sqr ( &a->x, &L3 ); | |||
p448_mulw ( &L3, &a->x, 39082 ); | |||
p448_neg ( &a->x, &L3 ); | |||
/* L3 = 4*sz^2 - 39082 * (sz^2 + 1)^2 */
p448_add ( &L3, &L1, &L1 ); | |||
p448_bias ( &L3, 1 ); | |||
p448_add ( &a->y, &L3, &L3 ); | |||
p448_add ( &L3, &a->y, &a->x ); | |||
p448_weak_reduce( &L3 ); | |||
/* a->x = 1 - sz^2 */
p448_copy ( &a->y, &L1 ); | |||
p448_subw ( &a->y, 1 ); | |||
p448_neg ( &a->x, &a->y ); | |||
p448_bias ( &a->x, 2 ); | |||
p448_weak_reduce( &a->x ); | |||
/* Build the p448_isr argument from (1 - sz^2) and L3 */
p448_mul ( &a->y, &a->x, &L3 ); | |||
p448_sqr ( &L2, &a->x ); | |||
p448_mul ( &L0, &L2, &a->y ); | |||
p448_mul ( &a->y, &a->x, &L0 ); | |||
p448_isr ( &L3, &a->y ); | |||
p448_mul ( &a->y, &L2, &L3 ); | |||
p448_sqr ( &L2, &L3 ); | |||
p448_mul ( &L3, &L0, &L2 ); | |||
/* L0 is the check value: the function returns whether L0 == 1 */
p448_mul ( &L0, &a->x, &L3 ); | |||
/* Final x = sz * 2 * a->y */
p448_add ( &L2, &a->y, &a->y ); | |||
p448_mul ( &a->x, sz, &L2 ); | |||
/* Final y = (sz^2 + 1) * L3 */
p448_addw ( &L1, 1 ); | |||
p448_mul ( &a->y, &L1, &L3 ); | |||
p448_subw ( &L0, 1 ); | |||
p448_bias ( &L0, 1 ); | |||
return p448_is_zero( &L0 ); | |||
} | |||
/**
 * Decode the field element `sz` directly into a twisted extensible point.
 *
 * The fields of `a` are used as scratch space; on return a->z == 1,
 * a->t == a->x and a->u == a->y.  p448_isr is defined elsewhere
 * (presumably an inverse square root).
 *
 * @param a     Output point; every field is overwritten.
 * @param sdm1  Constant multiplied into the intermediate value; the name
 *              suggests sqrt(d-1) -- confirm with callers.
 * @param sz    Serialized value to decode.
 * @return      p448_is_zero of the check value minus 1; presumably all-ones
 *              on success -- confirm against p448_is_zero.
 */
mask_t | |||
deserialize_and_twist_approx ( | |||
struct tw_extensible_t* a, | |||
const struct p448_t* sdm1, | |||
const struct p448_t* sz | |||
) { | |||
struct p448_t L0, L1; | |||
/* a->z = sz^2 (kept); a->x = -39082 * (sz^2 + 1)^2 */
p448_sqr ( &a->z, sz ); | |||
p448_copy ( &a->y, &a->z ); | |||
p448_addw ( &a->y, 1 ); | |||
p448_sqr ( &a->x, &a->y ); | |||
p448_mulw ( &a->y, &a->x, 39082 ); | |||
p448_neg ( &a->x, &a->y ); | |||
/* a->y = 4*sz^2 - 39082 * (sz^2 + 1)^2 */
p448_add ( &a->y, &a->z, &a->z ); | |||
p448_bias ( &a->y, 1 ); | |||
p448_add ( &a->u, &a->y, &a->y ); | |||
p448_add ( &a->y, &a->u, &a->x ); | |||
p448_weak_reduce( &a->y ); | |||
/* a->u = 1 - sz^4 (note: squares a->z, which already holds sz^2) */
p448_sqr ( &a->x, &a->z ); | |||
p448_subw ( &a->x, 1 ); | |||
p448_neg ( &a->u, &a->x ); | |||
p448_bias ( &a->u, 2 ); | |||
p448_weak_reduce( &a->u ); | |||
/* a->x = sdm1 * (1 - sz^4); build the p448_isr argument from it and a->y */
p448_mul ( &a->x, sdm1, &a->u ); | |||
p448_mul ( &L0, &a->x, &a->y ); | |||
p448_mul ( &a->t, &L0, &a->y ); | |||
p448_mul ( &a->u, &a->x, &a->t ); | |||
p448_mul ( &a->t, &a->u, &L0 ); | |||
p448_mul ( &a->y, &a->x, &a->t ); | |||
p448_isr ( &L0, &a->y ); | |||
p448_mul ( &a->y, &a->u, &L0 ); | |||
p448_sqr ( &L1, &L0 ); | |||
p448_mul ( &a->u, &a->t, &L1 ); | |||
/* a->t is the check value: ret reports whether it equals 1 */
p448_mul ( &a->t, &a->x, &a->u ); | |||
/* Final x = (1 - sz^2) * a->u * 2*sz */
p448_add ( &a->x, sz, sz ); | |||
p448_mul ( &L0, &a->u, &a->x ); | |||
p448_copy ( &a->x, &a->z ); | |||
p448_subw ( &a->x, 1 ); | |||
p448_neg ( &L1, &a->x ); | |||
p448_bias ( &L1, 2 ); | |||
p448_weak_reduce( &L1 ); | |||
p448_mul ( &a->x, &L1, &L0 ); | |||
/* Final y = (sz^2 + 1) * a->u * a->y */
p448_mul ( &L0, &a->u, &a->y ); | |||
p448_addw ( &a->z, 1 ); | |||
p448_mul ( &a->y, &a->z, &L0 ); | |||
p448_subw ( &a->t, 1 ); | |||
p448_bias ( &a->t, 1 ); | |||
mask_t ret = p448_is_zero( &a->t ); | |||
/* Normalise: z = 1, t = x, u = y */
p448_set_ui( &a->z, 1 ); | |||
p448_copy ( &a->t, &a->x ); | |||
p448_copy ( &a->u, &a->y ); | |||
return ret; | |||
} | |||
void | |||
set_identity_extensible ( | |||
struct extensible_t* a | |||
) { | |||
p448_set_ui( &a->x, 0 ); | |||
p448_set_ui( &a->y, 1 ); | |||
p448_set_ui( &a->z, 1 ); | |||
p448_set_ui( &a->t, 0 ); | |||
p448_set_ui( &a->u, 0 ); | |||
} | |||
void | |||
set_identity_tw_extensible ( | |||
struct tw_extensible_t* a | |||
) { | |||
p448_set_ui( &a->x, 0 ); | |||
p448_set_ui( &a->y, 1 ); | |||
p448_set_ui( &a->z, 1 ); | |||
p448_set_ui( &a->t, 0 ); | |||
p448_set_ui( &a->u, 0 ); | |||
} | |||
void | |||
set_identity_affine ( | |||
struct affine_t* a | |||
) { | |||
p448_set_ui( &a->x, 0 ); | |||
p448_set_ui( &a->y, 1 ); | |||
} | |||
mask_t | |||
eq_affine ( | |||
const struct affine_t* a, | |||
const struct affine_t* b | |||
) { | |||
mask_t L1, L2; | |||
struct p448_t L0; | |||
p448_sub ( &L0, &a->x, &b->x ); | |||
p448_bias ( &L0, 2 ); | |||
L2 = p448_is_zero( &L0 ); | |||
p448_sub ( &L0, &a->y, &b->y ); | |||
p448_bias ( &L0, 2 ); | |||
L1 = p448_is_zero( &L0 ); | |||
return L2 & L1; | |||
} | |||
mask_t | |||
eq_extensible ( | |||
const struct extensible_t* a, | |||
const struct extensible_t* b | |||
) { | |||
mask_t L3, L4; | |||
struct p448_t L0, L1, L2; | |||
p448_mul ( &L2, &b->z, &a->x ); | |||
p448_mul ( &L1, &a->z, &b->x ); | |||
p448_sub ( &L0, &L2, &L1 ); | |||
p448_bias ( &L0, 2 ); | |||
L4 = p448_is_zero( &L0 ); | |||
p448_mul ( &L2, &b->z, &a->y ); | |||
p448_mul ( &L1, &a->z, &b->y ); | |||
p448_sub ( &L0, &L2, &L1 ); | |||
p448_bias ( &L0, 2 ); | |||
L3 = p448_is_zero( &L0 ); | |||
return L4 & L3; | |||
} | |||
mask_t | |||
eq_tw_extensible ( | |||
const struct tw_extensible_t* a, | |||
const struct tw_extensible_t* b | |||
) { | |||
mask_t L3, L4; | |||
struct p448_t L0, L1, L2; | |||
p448_mul ( &L2, &b->z, &a->x ); | |||
p448_mul ( &L1, &a->z, &b->x ); | |||
p448_sub ( &L0, &L2, &L1 ); | |||
p448_bias ( &L0, 2 ); | |||
L4 = p448_is_zero( &L0 ); | |||
p448_mul ( &L2, &b->z, &a->y ); | |||
p448_mul ( &L1, &a->z, &b->y ); | |||
p448_sub ( &L0, &L2, &L1 ); | |||
p448_bias ( &L0, 2 ); | |||
L3 = p448_is_zero( &L0 ); | |||
return L4 & L3; | |||
} | |||
/**
 * Map the field element `r` to an affine point `a`.
 *
 * NOTE(review): the name suggests a variant of the Elligator 2 map; that
 * cannot be confirmed from this function alone.  a->x, a->y and L2..L8 are
 * reused as scratch space, so statement order is significant.  p448_isr is
 * defined elsewhere (presumably an inverse square root).
 *
 * @param a  Output point; both coordinates are overwritten.
 * @param r  Input field element.
 */
void | |||
elligator_2s_inject ( | |||
struct affine_t* a, | |||
const struct p448_t* r | |||
) { | |||
mask_t L0, L1; | |||
struct p448_t L2, L3, L4, L5, L6, L7, L8; | |||
/* a->x = r^2 (kept until L1100), L3 = r^4, L4 = 1 - r^4, L2 = (1 - r^4)^2 */
p448_sqr ( &a->x, r ); | |||
p448_sqr ( &L3, &a->x ); | |||
p448_copy ( &a->y, &L3 ); | |||
p448_subw ( &a->y, 1 ); | |||
p448_neg ( &L4, &a->y ); | |||
p448_bias ( &L4, 2 ); | |||
p448_weak_reduce( &L4 ); | |||
p448_sqr ( &L2, &L4 ); | |||
/* a->y = 6108985600*r^4 + 1527402724*(1 - r^4)^2 */
p448_mulw ( &L7, &L2, 1527402724 ); | |||
p448_mulw ( &L8, &L3, 6108985600 ); | |||
p448_add ( &a->y, &L8, &L7 ); | |||
p448_weak_reduce( &a->y ); | |||
/* L7 = a->y - 6109454568*(1 - r^4)^2 */
p448_mulw ( &L8, &L2, 6109454568 ); | |||
p448_sub ( &L7, &a->y, &L8 ); | |||
p448_bias ( &L7, 2 ); | |||
p448_weak_reduce( &L7 ); | |||
/* Build the p448_isr argument from L7, 78160*a->y and (1 - r^4) */
p448_mulw ( &L6, &a->y, 78160 ); | |||
p448_mul ( &L5, &L7, &L6 ); | |||
p448_mul ( &L8, &L5, &L4 ); | |||
p448_mul ( &L4, &L5, &L6 ); | |||
p448_mul ( &L5, &L7, &L8 ); | |||
p448_mul ( &L8, &L5, &L4 ); | |||
p448_mul ( &L4, &L7, &L8 ); | |||
p448_isr ( &L6, &L4 ); | |||
p448_mul ( &L4, &L5, &L6 ); | |||
p448_sqr ( &L5, &L6 ); | |||
p448_mul ( &L6, &L8, &L5 ); | |||
p448_mul ( &L8, &L7, &L6 ); | |||
p448_mul ( &L7, &L8, &L6 ); | |||
/* a->x = (r^2 - 1) - (r^2 + 1)*L8, then scaled by -78160 * L4 */
p448_copy ( &L6, &a->x ); | |||
p448_subw ( &L6, 1 ); | |||
p448_addw ( &a->x, 1 ); | |||
p448_mul ( &L5, &a->x, &L8 ); | |||
p448_sub ( &a->x, &L6, &L5 ); | |||
p448_bias ( &a->x, 3 ); | |||
p448_weak_reduce( &a->x ); | |||
p448_mul ( &L5, &L4, &a->x ); | |||
p448_mulw ( &L4, &L5, 78160 ); | |||
p448_neg ( &a->x, &L4 ); | |||
p448_bias ( &a->x, 2 ); | |||
p448_weak_reduce( &a->x ); | |||
/* L3 = 2*r^4 + (1 - r^4)^2 - 2 */
p448_add ( &L4, &L3, &L3 ); | |||
p448_add ( &L3, &L4, &L2 ); | |||
p448_subw ( &L3, 2 ); | |||
p448_bias ( &L3, 1 ); | |||
p448_weak_reduce( &L3 ); | |||
/* a->y = L7 * (3054649120 * L3 * L8 + a->y) */
p448_mul ( &L2, &L3, &L8 ); | |||
p448_mulw ( &L3, &L2, 3054649120 ); | |||
p448_add ( &L2, &L3, &a->y ); | |||
p448_mul ( &a->y, &L7, &L2 ); | |||
/*
 * Add -is_zero(L8) to y.  NOTE(review): this adds 1 only if p448_is_zero
 * returns an all-ones mask on zero -- confirm.
 */
L1 = p448_is_zero( &L8 ); | |||
L0 = - L1; | |||
p448_addw ( &a->y, L0 ); | |||
p448_weak_reduce( &a->y ); | |||
} | |||
/**
 * Check the curve equation for an affine point.
 *
 * Evaluates x^2 + y^2 - 1 + 39081*x^2*y^2 and tests it for zero.
 *
 * @param a  Point to validate.
 * @return   p448_is_zero of the expression above.
 */
mask_t | |||
validate_affine ( | |||
const struct affine_t* a | |||
) { | |||
struct p448_t L0, L1, L2, L3; | |||
/* L0 = y^2, L2 = x^2, L3 = x^2 + y^2 - 1 */
p448_sqr ( &L0, &a->y ); | |||
p448_sqr ( &L2, &a->x ); | |||
p448_add ( &L3, &L2, &L0 ); | |||
p448_subw ( &L3, 1 ); | |||
/* L1 = y^2 * (-39081 * x^2) */
p448_mulw ( &L1, &L2, 39081 ); | |||
p448_neg ( &L2, &L1 ); | |||
p448_bias ( &L2, 2 ); | |||
p448_mul ( &L1, &L0, &L2 ); | |||
/* L0 = L3 - L1; bias 3 covers the subw above and this subtraction */
p448_sub ( &L0, &L3, &L1 ); | |||
p448_bias ( &L0, 3 ); | |||
return p448_is_zero( &L0 ); | |||
} | |||
/**
 * Check the two invariants of a twisted extensible point that are spelled
 * out in the comments inside the body.
 *
 * The p448_addw(..., 0) calls add zero to limb 0 and have no effect; they
 * look like an artifact of how this code was generated.
 *
 * @param ext  Point to validate.
 * @return     AND of the zero-test masks of the two invariants.
 */
mask_t | |||
validate_tw_extensible ( | |||
const struct tw_extensible_t* ext | |||
) { | |||
mask_t L4, L5; | |||
struct p448_t L0, L1, L2, L3; | |||
/* | |||
* Check invariant: | |||
* 0 = -x*y + z*t*u | |||
*/ | |||
p448_mul ( &L1, &ext->t, &ext->u ); | |||
p448_mul ( &L2, &ext->z, &L1 ); | |||
p448_addw ( &L2, 0 ); | |||
p448_mul ( &L0, &ext->x, &ext->y ); | |||
p448_neg ( &L1, &L0 ); | |||
p448_add ( &L0, &L1, &L2 ); | |||
/* One negation above, so a bias of 2 is applied before the zero test */
p448_bias ( &L0, 2 ); | |||
L5 = p448_is_zero( &L0 ); | |||
/* | |||
* Check invariant: | |||
* 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 | |||
*/ | |||
p448_sqr ( &L2, &ext->y ); | |||
p448_neg ( &L1, &L2 ); | |||
p448_addw ( &L1, 0 ); | |||
p448_sqr ( &L0, &ext->x ); | |||
p448_add ( &L2, &L0, &L1 ); | |||
p448_sqr ( &L3, &ext->u ); | |||
p448_sqr ( &L0, &ext->t ); | |||
p448_mul ( &L1, &L0, &L3 ); | |||
/* d*t^2*u^2 is formed as -(39081 * t^2*u^2) */
p448_mulw ( &L0, &L1, 39081 ); | |||
p448_neg ( &L3, &L0 ); | |||
p448_add ( &L0, &L3, &L2 ); | |||
p448_neg ( &L3, &L1 ); | |||
p448_add ( &L2, &L3, &L0 ); | |||
p448_sqr ( &L1, &ext->z ); | |||
p448_add ( &L0, &L1, &L2 ); | |||
/* Three negated terms were summed; bias of 4 precedes the zero test */
p448_bias ( &L0, 4 ); | |||
L4 = p448_is_zero( &L0 ); | |||
return L5 & L4; | |||
} | |||
/**
 * Check the two invariants of an untwisted extensible point that are
 * spelled out in the comments inside the body.
 *
 * The p448_addw(..., 0) calls add zero to limb 0 and have no effect; they
 * look like an artifact of how this code was generated.
 *
 * @param ext  Point to validate.
 * @return     AND of the zero-test masks of the two invariants.
 */
mask_t | |||
validate_extensible ( | |||
const struct extensible_t* ext | |||
) { | |||
mask_t L4, L5; | |||
struct p448_t L0, L1, L2, L3; | |||
/* | |||
* Check invariant: | |||
* 0 = d*t^2*u^2 - x^2 - y^2 + z^2 | |||
*/ | |||
p448_sqr ( &L2, &ext->y ); | |||
p448_neg ( &L1, &L2 ); | |||
p448_addw ( &L1, 0 ); | |||
p448_sqr ( &L0, &ext->z ); | |||
p448_add ( &L2, &L0, &L1 ); | |||
p448_sqr ( &L3, &ext->u ); | |||
p448_sqr ( &L0, &ext->t ); | |||
p448_mul ( &L1, &L0, &L3 ); | |||
/* d*t^2*u^2 is formed as -(39081 * t^2*u^2) */
p448_mulw ( &L3, &L1, 39081 ); | |||
p448_neg ( &L0, &L3 ); | |||
p448_add ( &L1, &L0, &L2 ); | |||
p448_sqr ( &L0, &ext->x ); | |||
p448_neg ( &L2, &L0 ); | |||
p448_add ( &L0, &L2, &L1 ); | |||
/* Three negated terms were summed; bias of 4 precedes the zero test */
p448_bias ( &L0, 4 ); | |||
L5 = p448_is_zero( &L0 ); | |||
/* | |||
* Check invariant: | |||
* 0 = -x*y + z*t*u | |||
*/ | |||
p448_mul ( &L1, &ext->t, &ext->u ); | |||
p448_mul ( &L2, &ext->z, &L1 ); | |||
p448_addw ( &L2, 0 ); | |||
p448_mul ( &L0, &ext->x, &ext->y ); | |||
p448_neg ( &L1, &L0 ); | |||
p448_add ( &L0, &L1, &L2 ); | |||
/* One negation above, so a bias of 2 is applied before the zero test */
p448_bias ( &L0, 2 ); | |||
L4 = p448_is_zero( &L0 ); | |||
return L5 & L4; | |||
} | |||
@@ -0,0 +1,376 @@ | |||
/* Copyright (c) 2014 Cryptography Research, Inc. | |||
* Released under the MIT License. See LICENSE.txt for license information. | |||
*/ | |||
#ifndef __P448_H__ | |||
#define __P448_H__ 1 | |||
#include "word.h" | |||
#include <stdint.h> | |||
#include <assert.h> | |||
/**
 * Field element: sixteen 32-bit limbs, 32-byte aligned.  The inline helpers
 * in this header mask limbs to 28 bits when reducing, so each limb carries
 * 28 significant bits plus headroom for unreduced sums.
 */
typedef struct p448_t { | |||
uint32_t limb[16]; | |||
} __attribute__((aligned(32))) p448_t; | |||
/*
 * LIMBPERM rotates a 4-bit limb index left by one bit:
 * 0,1,2,...,7 -> 0,2,4,...,14 and 8,9,...,15 -> 1,3,...,15.
 * NOTE(review): appears to map a logical limb index to its position in
 * limb[], interleaving the low and high halves -- confirm with the
 * multiplication code.
 */
#define LIMBPERM(x) (((x)<<1 | (x)>>3) & 15) | |||
/* Tells other translation units that limb[] uses the permuted order. */
#define USE_NEON_PERM 1 | |||
#ifdef __cplusplus | |||
extern "C" { | |||
#endif | |||
/* Set out to the small integer x. */
static __inline__ void | |||
p448_set_ui ( | |||
p448_t *out, | |||
uint64_t x | |||
) __attribute__((unused,always_inline)); | |||
/* Swap a and b where the bits of do_swap are set, without branching. */
static __inline__ void | |||
p448_cond_swap ( | |||
p448_t *a, | |||
p448_t *b, | |||
mask_t do_swap | |||
) __attribute__((unused,always_inline)); | |||
/* out = a + b, limb by limb with no carry propagation. */
static __inline__ void | |||
p448_add ( | |||
p448_t *out, | |||
const p448_t *a, | |||
const p448_t *b | |||
) __attribute__((unused,always_inline)); | |||
/* out = a - b, limb by limb; callers follow up with p448_bias. */
static __inline__ void | |||
p448_sub ( | |||
p448_t *out, | |||
const p448_t *a, | |||
const p448_t *b | |||
) __attribute__((unused,always_inline)); | |||
/* out = -a, limb by limb; callers follow up with p448_bias. */
static __inline__ void | |||
p448_neg ( | |||
p448_t *out, | |||
const p448_t *a | |||
) __attribute__((unused,always_inline)); | |||
/* Replace a with its biased negation where the bits of doNegate are set. */
static __inline__ void | |||
p448_cond_neg ( | |||
p448_t *a, | |||
mask_t doNegate | |||
) __attribute__((unused,always_inline)); | |||
/* Add the word x to limb 0 of a. */
static __inline__ void | |||
p448_addw ( | |||
p448_t *a, | |||
uint32_t x | |||
) __attribute__((unused,always_inline)); | |||
/* Subtract the word x from limb 0 of a. */
static __inline__ void | |||
p448_subw ( | |||
p448_t *a, | |||
uint32_t x | |||
) __attribute__((unused,always_inline)); | |||
/* out = a (struct copy). */
static __inline__ void | |||
p448_copy ( | |||
p448_t *out, | |||
const p448_t *a | |||
) __attribute__((unused,always_inline)); | |||
/* Propagate carries once so every limb is brought back near 28 bits. */
static __inline__ void | |||
p448_weak_reduce ( | |||
p448_t *inout | |||
) __attribute__((unused,always_inline)); | |||
/* Defined elsewhere; presumably reduces to the canonical representative. */
void | |||
p448_strong_reduce ( | |||
p448_t *inout | |||
); | |||
/* Defined elsewhere; returns a mask saying whether in is zero. */
mask_t | |||
p448_is_zero ( | |||
const p448_t *in | |||
); | |||
/* Add amount * (a per-limb constant) so later subtractions stay non-negative. */
static __inline__ void | |||
p448_bias ( | |||
p448_t *inout, | |||
int amount | |||
) __attribute__((unused,always_inline)); | |||
/* Defined elsewhere; out = a * b.  out is __restrict__: it must not alias a or b. */
void | |||
p448_mul ( | |||
p448_t *__restrict__ out, | |||
const p448_t *a, | |||
const p448_t *b | |||
); | |||
/* Defined elsewhere; out = a * b for a word-sized b.  out must not alias a. */
void | |||
p448_mulw ( | |||
p448_t *__restrict__ out, | |||
const p448_t *a, | |||
uint64_t b | |||
); | |||
/* Defined elsewhere; out = a^2.  out must not alias a. */
void | |||
p448_sqr ( | |||
p448_t *__restrict__ out, | |||
const p448_t *a | |||
); | |||
/* y = x squared n times (n > 0).  y must not alias x. */
static __inline__ void | |||
p448_sqrn ( | |||
p448_t *__restrict__ y, | |||
const p448_t *x, | |||
int n | |||
) __attribute__((unused,always_inline)); | |||
/* Defined elsewhere; writes x to serial (56 bytes, judging by p448_deserialize). */
void | |||
p448_serialize ( | |||
uint8_t *serial, | |||
const struct p448_t *x | |||
); | |||
/* Defined elsewhere; reads x from 56 bytes and returns a success mask. */
mask_t | |||
p448_deserialize ( | |||
p448_t *x, | |||
const uint8_t serial[56] | |||
); | |||
/* a = b with every limb ANDed with mask. */
static __inline__ void | |||
p448_mask( | |||
struct p448_t *a, | |||
const struct p448_t *b, | |||
mask_t mask | |||
) __attribute__((unused,always_inline)); | |||
/** | |||
* Returns 1/x. | |||
* | |||
* If x=0, returns 0. | |||
*/ | |||
void | |||
p448_inverse ( | |||
struct p448_t* a, | |||
const struct p448_t* x | |||
); | |||
/* Defined elsewhere; presumably inverts n elements of in into out at once. */
void | |||
simultaneous_invert_p448 ( | |||
struct p448_t *__restrict__ out, | |||
const struct p448_t *in, | |||
unsigned int n | |||
); | |||
static inline mask_t | |||
p448_eq ( | |||
const struct p448_t *a, | |||
const struct p448_t *b | |||
) __attribute__((always_inline,unused)); | |||
/* -------------- Inline functions begin here -------------- */ | |||
void | |||
p448_set_ui ( | |||
p448_t *out, | |||
uint64_t x | |||
) { | |||
int i; | |||
for (i=0; i<16; i++) { | |||
out->limb[i] = 0; | |||
} | |||
out->limb[0] = x & ((1<<28)-1); | |||
out->limb[2] = x>>28; | |||
} | |||
void | |||
p448_cond_swap ( | |||
p448_t *a, | |||
p448_t *b, | |||
mask_t doswap | |||
) { | |||
big_register_t *aa = (big_register_t*)a; | |||
big_register_t *bb = (big_register_t*)b; | |||
big_register_t m = br_set_to_mask(doswap); | |||
unsigned int i; | |||
for (i=0; i<sizeof(*a)/sizeof(*aa); i++) { | |||
big_register_t x = m & (aa[i]^bb[i]); | |||
aa[i] ^= x; | |||
bb[i] ^= x; | |||
} | |||
} | |||
void | |||
p448_add ( | |||
p448_t *out, | |||
const p448_t *a, | |||
const p448_t *b | |||
) { | |||
unsigned int i; | |||
for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) { | |||
((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] + ((const uint32xn_t*)b)[i]; | |||
} | |||
} | |||
void | |||
p448_sub ( | |||
p448_t *out, | |||
const p448_t *a, | |||
const p448_t *b | |||
) { | |||
unsigned int i; | |||
for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) { | |||
((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] - ((const uint32xn_t*)b)[i]; | |||
} | |||
/* | |||
unsigned int i; | |||
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) { | |||
out->limb[i] = a->limb[i] - b->limb[i]; | |||
} | |||
*/ | |||
} | |||
void | |||
p448_neg ( | |||
p448_t *out, | |||
const p448_t *a | |||
) { | |||
unsigned int i; | |||
for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) { | |||
((uint32xn_t*)out)[i] = -((const uint32xn_t*)a)[i]; | |||
} | |||
/* | |||
unsigned int i; | |||
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) { | |||
out->limb[i] = -a->limb[i]; | |||
} | |||
*/ | |||
} | |||
void | |||
p448_cond_neg( | |||
p448_t *a, | |||
mask_t doNegate | |||
) { | |||
unsigned int i; | |||
struct p448_t negated; | |||
big_register_t *aa = (big_register_t *)a; | |||
big_register_t *nn = (big_register_t*)&negated; | |||
big_register_t m = br_set_to_mask(doNegate); | |||
p448_neg(&negated, a); | |||
p448_bias(&negated, 2); | |||
for (i=0; i<sizeof(*a)/sizeof(*aa); i++) { | |||
aa[i] = (aa[i] & ~m) | (nn[i] & m); | |||
} | |||
} | |||
void | |||
p448_addw ( | |||
p448_t *a, | |||
uint32_t x | |||
) { | |||
a->limb[0] += x; | |||
} | |||
void | |||
p448_subw ( | |||
p448_t *a, | |||
uint32_t x | |||
) { | |||
a->limb[0] -= x; | |||
} | |||
void | |||
p448_copy ( | |||
p448_t *out, | |||
const p448_t *a | |||
) { | |||
*out = *a; | |||
} | |||
void | |||
p448_bias ( | |||
p448_t *a, | |||
int amt | |||
) { | |||
uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt; | |||
uint32x4_t lo = {co1,co2,co1,co1}, hi = {co1,co1,co1,co1}; | |||
uint32x4_t *aa = (uint32x4_t*) a; | |||
aa[0] += lo; | |||
aa[1] += hi; | |||
aa[2] += hi; | |||
aa[3] += hi; | |||
} | |||
/**
 * Perform one carry-propagation pass over a field element.
 *
 * The element is viewed as eight 2-lane vectors.  Each vector keeps its low
 * 28 bits per lane and receives the bits above 28 from the vector below it.
 * The carries out of the top vector wrap around into vector 0.
 *
 * @param a  Element to reduce in place.
 */
void | |||
p448_weak_reduce ( | |||
p448_t *a | |||
) { | |||
/* tmp = carries out of the top vector, captured before it is overwritten */
uint32x2_t *aa = (uint32x2_t*) a, vmask = {(1ull<<28)-1, (1ull<<28)-1}, vm2 = {0,-1}, | |||
tmp = vshr_n_u32(aa[7],28); | |||
int i; | |||
/* Descending order: aa[i-1] must still be unmasked when its carry is read */
for (i=7; i>=1; i--) { | |||
aa[i] = vsra_n_u32(aa[i] & vmask, aa[i-1], 28); | |||
} | |||
/*
 * Wrap-around: lane 0 gets the lane-1 carry; lane 1 gets the lane-0 carry
 * plus the lane-1 carry again (vm2 keeps only lane 1 of tmp).
 */
aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp&vm2); | |||
} | |||
void | |||
p448_sqrn ( | |||
p448_t *__restrict__ y, | |||
const p448_t *x, | |||
int n | |||
) { | |||
p448_t tmp; | |||
assert(n>0); | |||
if (n&1) { | |||
p448_sqr(y,x); | |||
n--; | |||
} else { | |||
p448_sqr(&tmp,x); | |||
p448_sqr(y,&tmp); | |||
n-=2; | |||
} | |||
for (; n; n-=2) { | |||
p448_sqr(&tmp,y); | |||
p448_sqr(y,&tmp); | |||
} | |||
} | |||
mask_t | |||
p448_eq ( | |||
const struct p448_t *a, | |||
const struct p448_t *b | |||
) { | |||
struct p448_t ra, rb; | |||
p448_copy(&ra, a); | |||
p448_copy(&rb, b); | |||
p448_weak_reduce(&ra); | |||
p448_weak_reduce(&rb); | |||
p448_sub(&ra, &ra, &rb); | |||
p448_bias(&ra, 2); | |||
return p448_is_zero(&ra); | |||
} | |||
void | |||
p448_mask ( | |||
struct p448_t *a, | |||
const struct p448_t *b, | |||
mask_t mask | |||
) { | |||
unsigned int i; | |||
for (i=0; i<sizeof(*a)/sizeof(a->limb[0]); i++) { | |||
a->limb[i] = b->limb[i] & mask; | |||
} | |||
} | |||
#ifdef __cplusplus | |||
}; /* extern "C" */ | |||
#endif | |||
#endif /* __P448_H__ */ |
@@ -466,7 +466,7 @@ crandom_generate( | |||
unsigned long long copy = (length > state->fill) ? state->fill : length; | |||
state->fill -= copy; | |||
memcpy(output, state->buffer + state->fill, copy); | |||
memset(state->buffer + state->fill, 0, copy); | |||
really_memset(state->buffer + state->fill, 0, copy); | |||
output += copy; length -= copy; | |||
} | |||
@@ -484,5 +484,5 @@ crandom_destroy( | |||
*/ | |||
} | |||
memset(state, 0, sizeof(*state)); | |||
really_memset(state, 0, sizeof(*state)); | |||
} |
@@ -340,7 +340,7 @@ goldilocks_sign ( | |||
word_t skw[GOLDI_FIELD_WORDS]; | |||
mask_t succ = barrett_deserialize(skw,privkey->opaque,&curve_prime_order); | |||
if (!succ) { | |||
memset(skw,0,sizeof(skw)); | |||
really_memset(skw,0,sizeof(skw)); | |||
return GOLDI_ECORRUPT; | |||
} | |||
@@ -389,9 +389,9 @@ goldilocks_sign ( | |||
memcpy(signature_out, signature_tmp, GOLDI_FIELD_BYTES); | |||
barrett_serialize(signature_out+GOLDI_FIELD_BYTES, tk, GOLDI_FIELD_BYTES); | |||
memset((unsigned char *)tk,0,sizeof(tk)); | |||
memset((unsigned char *)skw,0,sizeof(skw)); | |||
memset((unsigned char *)challenge,0,sizeof(challenge)); | |||
really_memset((unsigned char *)tk,0,sizeof(tk)); | |||
really_memset((unsigned char *)skw,0,sizeof(skw)); | |||
really_memset((unsigned char *)challenge,0,sizeof(challenge)); | |||
/* response = 2(nonce_secret - sk*challenge) | |||
* Nonce = 8[nonce_secret]*G | |||
@@ -494,7 +494,7 @@ goldilocks_destroy_precomputed_public_key ( | |||
) { | |||
if (!precom) return; | |||
destroy_fixed_base(&precom->table); | |||
memset(&precom->pub.opaque, 0, sizeof(precom->pub)); | |||
really_memset(&precom->pub.opaque, 0, sizeof(precom->pub)); | |||
free(precom); | |||
} | |||
@@ -146,11 +146,17 @@ typedef word_t vecmask_t __attribute__((vector_size(32))); | |||
} | |||
#endif | |||
#if __AVX2__ || __SSE2__ | |||
#if __AVX2__ | |||
static __inline__ big_register_t | |||
br_is_zero(big_register_t x) { | |||
return (big_register_t)(x == br_set_to_mask(0)); | |||
} | |||
#elif __SSE2__ | |||
static __inline__ big_register_t | |||
br_is_zero(big_register_t x) { | |||
return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128()); | |||
//return (big_register_t)(x == br_set_to_mask(0)); | |||
} | |||
#elif __ARM_NEON__ | |||
static __inline__ big_register_t | |||
br_is_zero(big_register_t x) { | |||
@@ -179,7 +185,25 @@ static inline uint64_t | |||
letoh64 (uint64_t x) { return x; } | |||
#endif | |||
/**
 * Really call memset, in a way that prevents the compiler from optimizing it out.
 *
 * memset_s is used only when the library provides Annex K
 * (__STDC_LIB_EXT1__) AND the translation unit asked for it by defining
 * __STDC_WANT_LIB_EXT1__ to 1 before including <string.h>; without that
 * request memset_s is not declared.  Otherwise the bytes are written
 * through a volatile pointer, which the compiler must not elide.
 *
 * @param p The object to zeroize.
 * @param c The char to set it to (probably zero).
 * @param s The size of the object.
 */
#if defined(__STDC_LIB_EXT1__) && defined(__STDC_WANT_LIB_EXT1__) \
    && (__STDC_WANT_LIB_EXT1__ + 0)
static __inline__ void __attribute__((always_inline,unused))
really_memset(void *p, char c, size_t s) {
    (void)memset_s(p,s,c,s);
}
#else
static __inline__ void __attribute__((always_inline,unused))
really_memset(void *p, char c, size_t s) {
    volatile char *pv = (volatile char *)p;
    size_t i;
    for (i=0; i<s; i++) {
        pv[i] = c;
    }
}
#endif
/** | |||
* Allocate memory which is sufficiently aligned to be used for the | |||
@@ -27,11 +27,17 @@ const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = { | |||
}; | |||
const struct affine_t goldilocks_base_point = { | |||
#ifdef USE_NEON_PERM | |||
{{ 0xaed939f,0xc59d070,0xf0de840,0x5f065c3, 0xf4ba0c7,0xdf73324,0xc170033,0x3a6a26a, | |||
0x4c63d96,0x4609845,0xf3932d9,0x1b4faff, 0x6147eaa,0xa2692ff,0x9cecfa9,0x297ea0e | |||
}}, | |||
#else | |||
{{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7), | |||
U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa), | |||
U58LE(0x5f065c3c59d070), U58LE(0x3a6a26adf73324), | |||
U58LE(0x1b4faff4609845), U58LE(0x297ea0ea2692ff) | |||
}}, | |||
#endif | |||
{{ 19 }} | |||
}; | |||
@@ -50,6 +56,12 @@ const struct barrett_prime_t curve_prime_order = { | |||
const struct field_t | |||
sqrt_d_minus_1 = {{ | |||
#ifdef USE_NEON_PERM | |||
0x6749f46,0x24d9770,0xd2e2183,0xa49f7b4, | |||
0xb4f0179,0x8c5f656,0x888db42,0xdcac462, | |||
0xbdeea38,0x748734a,0x5a189aa,0x49443b8, | |||
0x6f14c06,0x0b25b7a,0x51e65ca,0x12fec0c | |||
#else | |||
U58LE(0xd2e21836749f46), | |||
U58LE(0x888db42b4f0179), | |||
U58LE(0x5a189aabdeea38), | |||
@@ -58,4 +70,5 @@ sqrt_d_minus_1 = {{ | |||
U58LE(0xdcac4628c5f656), | |||
U58LE(0x49443b8748734a), | |||
U58LE(0x12fec0c0b25b7a) | |||
#endif | |||
}}; |
@@ -63,7 +63,15 @@ cond_negate_tw_pniels ( | |||
cond_negate_tw_niels(&n->n, doNegate); | |||
} | |||
static __inline__ void | |||
#if (defined(__GNUC__) && !defined(__clang__) && defined(__x86_64__) && !defined(__AVX2__)) | |||
/* This works around an apparent compiler bug in GCC, thanks Samuel Neves */ | |||
static void __attribute__((optimize("O1"))) | |||
#ifdef __OPTIMIZE_SIZE__ | |||
#warning "There's a bug in here somewhere with GCC -Os on non-AVX2 platforms" | |||
#endif | |||
#else | |||
static __inline__ void | |||
#endif | |||
constant_time_lookup_tw_pniels ( | |||
struct tw_pniels_t *out, | |||
const struct tw_pniels_t *in, | |||
@@ -76,7 +84,7 @@ constant_time_lookup_tw_pniels ( | |||
int j; | |||
unsigned int k; | |||
memset(out, 0, sizeof(*out)); | |||
really_memset(out, 0, sizeof(*out)); | |||
for (j=0; j<nin; j++, big_i-=big_one) { | |||
big_register_t mask = br_is_zero(big_i); | |||
for (k=0; k<sizeof(*out)/sizeof(*o); k++) { | |||
@@ -98,7 +106,7 @@ constant_time_lookup_tw_niels ( | |||
int j; | |||
unsigned int k; | |||
memset(out, 0, sizeof(*out)); | |||
really_memset(out, 0, sizeof(*out)); | |||
for (j=0; j<nin; j++, big_i-=big_one) { | |||
big_register_t mask = br_is_zero(big_i); | |||
for (k=0; k<sizeof(*out)/sizeof(*o); k++) { | |||
@@ -449,7 +457,7 @@ precompute_fixed_base ( | |||
struct tw_niels_t *prealloc | |||
) { | |||
if (s < 1 || t < 1 || n < 1 || n*t*s < SCALAR_BITS) { | |||
memset(out, 0, sizeof(*out)); | |||
really_memset(out, 0, sizeof(*out)); | |||
return 0; | |||
} | |||
@@ -478,8 +486,8 @@ precompute_fixed_base ( | |||
free(doubles); | |||
free(zs); | |||
free(zis); | |||
memset(out, 0, sizeof(*out)); | |||
memset(table, 0, sizeof(*table) * (n<<(t-1))); | |||
really_memset(out, 0, sizeof(*out)); | |||
really_memset(table, 0, sizeof(*table) * (n<<(t-1))); | |||
if (!prealloc) free(table); | |||
return 0; | |||
} | |||
@@ -593,9 +601,9 @@ precompute_fixed_base ( | |||
free(zis); | |||
if (unlikely(!ret)) { | |||
memset(table, 0, sizeof(*table) * (n<<(t-1))); | |||
really_memset(table, 0, sizeof(*table) * (n<<(t-1))); | |||
if (!prealloc) free(table); | |||
memset(out, 0, sizeof(*out)); | |||
really_memset(out, 0, sizeof(*out)); | |||
return 0; | |||
} | |||
@@ -607,12 +615,12 @@ destroy_fixed_base ( | |||
struct fixed_base_table_t *table | |||
) { | |||
if (table->table) { | |||
memset(table->table,0,sizeof(*table->table)*(table->n<<(table->t-1))); | |||
really_memset(table->table,0,sizeof(*table->table)*(table->n<<(table->t-1))); | |||
} | |||
if (table->own_table) { | |||
free(table->table); | |||
} | |||
memset(table,0,sizeof(*table)); | |||
really_memset(table,0,sizeof(*table)); | |||
} | |||
mask_t | |||
@@ -108,33 +108,33 @@ int main(int argc, char **argv) { | |||
q448_randomize(&crand, sk); | |||
when = now(); | |||
for (i=0; i<nbase*1000; i++) { | |||
for (i=0; i<nbase*5000; i++) { | |||
p448_mul(&c, &b, &a); | |||
} | |||
when = now() - when; | |||
printf("mul: %5.1fns\n", when * 1e9 / i); | |||
when = now(); | |||
for (i=0; i<nbase*1000; i++) { | |||
for (i=0; i<nbase*5000; i++) { | |||
p448_sqr(&c, &a); | |||
} | |||
when = now() - when; | |||
printf("sqr: %5.1fns\n", when * 1e9 / i); | |||
when = now(); | |||
for (i=0; i<nbase*500; i++) { | |||
p448_mul(&c, &b, &a); | |||
p448_mul(&a, &b, &c); | |||
for (i=0; i<nbase*5000; i++) { | |||
p448_mulw(&c, &b, 1234562); | |||
} | |||
when = now() - when; | |||
printf("mul dep: %5.1fns\n", when * 1e9 / i / 2); | |||
printf("mulw: %5.1fns\n", when * 1e9 / i); | |||
when = now(); | |||
for (i=0; i<nbase*1000; i++) { | |||
p448_mulw(&c, &b, 1234562); | |||
for (i=0; i<nbase*500; i++) { | |||
p448_mul(&c, &b, &a); | |||
p448_mul(&a, &b, &c); | |||
} | |||
when = now() - when; | |||
printf("mulw: %5.1fns\n", when * 1e9 / i); | |||
printf("mul dep: %5.1fns\n", when * 1e9 / i / 2); | |||
when = now(); | |||
for (i=0; i<nbase*10; i++) { | |||
@@ -3,6 +3,9 @@ | |||
#include <stdio.h> | |||
#include <string.h> | |||
#ifndef LIMBPERM | |||
#define LIMBPERM(x) (x) | |||
#endif | |||
int failed_tests, n_tests, failed_this_test, running_a_test; | |||
@@ -87,7 +90,7 @@ void p448_print ( | |||
int j; | |||
printf("%s = 0x", descr); | |||
for (j=sizeof(*a)/sizeof(word_t)-1; j>=0; j--) { | |||
printf(PRIxWORD58, b.limb[j]); | |||
printf(PRIxWORD58, b.limb[LIMBPERM(j)]); | |||
} | |||
printf("\n"); | |||
} | |||
@@ -170,7 +170,12 @@ int test_arithmetic () { | |||
int bits = sizeof(word_t) * 448 / sizeof(p448_t); | |||
for (j=0; j<ntests; j++) { | |||
if (j&1) { | |||
if (j<256) { | |||
mpz_set_ui(x,0); | |||
mpz_set_ui(y,0); | |||
mpz_setbit(x,(j%16)*28); | |||
mpz_setbit(y,(j/16)*28); | |||
} else if (j&1) { | |||
mpz_rrandomb(x, state, 448); | |||
mpz_rrandomb(y, state, 448); | |||
} else { | |||