Browse Source

Added really_memset, thanks David Leon Gil.

Trying to work around an apparent GCC bug on SSE2, thanks Samuel
Neves.

Added an experimental NEON arch.  It's fast.  It's not yet GCC clean.
It needs some more work on general cleanliness too.
master
Mike Hamburg 10 years ago
parent
commit
04b955eabe
15 changed files with 2741 additions and 107 deletions
  1. +20
    -0
      HISTORY.txt
  2. +3
    -3
      Makefile
  3. +1
    -1
      README.txt
  4. +89
    -73
      src/arch_neon/p448.c
  5. +962
    -0
      src/arch_neon_experimental/ec_point.c
  6. +1207
    -0
      src/arch_neon_experimental/p448.c
  7. +376
    -0
      src/arch_neon_experimental/p448.h
  8. +2
    -2
      src/crandom.c
  9. +5
    -5
      src/goldilocks.c
  10. +26
    -2
      src/include/word.h
  11. +13
    -0
      src/magic.c
  12. +18
    -10
      src/scalarmul.c
  13. +9
    -9
      test/bench.c
  14. +4
    -1
      test/test.c
  15. +6
    -1
      test/test_arithmetic.c

+ 20
- 0
HISTORY.txt View File

@@ -1,3 +1,23 @@
August 4, 2014:
Experiments and bug fixes.

Add really_memset, meant to be memset_s (except it isn't, because I'm setting
-std=c99), thanks David Leon Gil. I think I put it in the right places.

Try to work around what I think is a compiler bug in GCC -O3 on non-AVX
platforms. I can't seem to work around it at -Os, so I'm just flagging
a warning (-Werror makes it an error) for now. This will take more
investigation. Thanks Samuel Neves.

Added an experimental (not ready yet!) ARM NEON implementation in
arch_neon_experimental. This implementation seems to work, but needs
more testing. It is currently asm-heavy and not GCC clean. I am
planning to have a flag for it to use intrinsics instead of asm;
currently the intrinsics are commented out. On clang this does ECDH
in 1850kcy on my BeagleBone Black, comparable to Curve41417. Once this
is ready, I will probably move it to arch_neon proper, since arch_neon
isn't particularly tuned.

July 11, 2014: July 11, 2014:
This is mostly a cleanup release. This is mostly a cleanup release.




+ 3
- 3
Makefile View File

@@ -22,7 +22,7 @@ endif




WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
-Wmissing-declarations -Wunused-function $(EXWARN)
-Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)
INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH) INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH)
@@ -36,8 +36,8 @@ ARCHFLAGS += -mfpu=neon
else else
ARCHFLAGS += -mfpu=vfpv3-d16 ARCHFLAGS += -mfpu=vfpv3-d16
endif endif
ARCHFLAGS += -mcpu=cortex-a9 # FIXME
GENFLAGS = -DN_TESTS_BASE=1000 # sooooo sloooooow
ARCHFLAGS += -mcpu=cortex-a8 # FIXME
GENFLAGS += -DN_TESTS_BASE=1000 # sooooo sloooooow
else else
ARCHFLAGS += -maes -mavx2 -mbmi2 #TODO ARCHFLAGS += -maes -mavx2 -mbmi2 #TODO
endif endif


+ 1
- 1
README.txt View File

@@ -13,7 +13,7 @@ game protection system out of Stanford, and are (c) 2011 Stanford
University. All of these files are usable under the MIT license contained in University. All of these files are usable under the MIT license contained in
LICENSE.txt. LICENSE.txt.


The Makefile is set for my 2013 MacBook Air. You can `make runbench` to run
The Makefile is set for my 2013 MacBook Air. You can `make bench` to run
a completely arbitrary set of benchmarks and tests, or `make a completely arbitrary set of benchmarks and tests, or `make
build/goldilocks.so` to build a stripped-down version of the library. For build/goldilocks.so` to build a stripped-down version of the library. For
non-Haswell platforms, you need to replace -mavx2 -mbmi2 by an appropriate non-Haswell platforms, you need to replace -mavx2 -mbmi2 by an appropriate


+ 89
- 73
src/arch_neon/p448.c View File

@@ -39,7 +39,7 @@ xx_vaddup_s64(int64x2_t x) {
#include "neon_emulation.h" #include "neon_emulation.h"
#endif /* ARM_NEON */ #endif /* ARM_NEON */


static inline void __attribute__((gnu_inline,always_inline))
static inline void __attribute__((gnu_inline,always_inline,unused))
smlal ( smlal (
uint64_t *acc, uint64_t *acc,
const uint32_t a, const uint32_t a,
@@ -48,7 +48,7 @@ smlal (
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b; *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
} }


static inline void __attribute__((gnu_inline,always_inline))
static inline void __attribute__((gnu_inline,always_inline,unused))
smlal2 ( smlal2 (
uint64_t *acc, uint64_t *acc,
const uint32_t a, const uint32_t a,
@@ -57,7 +57,7 @@ smlal2 (
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
} }


static inline void __attribute__((gnu_inline,always_inline))
static inline void __attribute__((gnu_inline,always_inline,unused))
smull ( smull (
uint64_t *acc, uint64_t *acc,
const uint32_t a, const uint32_t a,
@@ -66,7 +66,7 @@ smull (
*acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b; *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
} }


static inline void __attribute__((gnu_inline,always_inline))
static inline void __attribute__((gnu_inline,always_inline,unused))
smull2 ( smull2 (
uint64_t *acc, uint64_t *acc,
const uint32_t a, const uint32_t a,
@@ -84,6 +84,7 @@ p448_mul (
const uint32_t *a = as->limb, *b = bs->limb; const uint32_t *a = as->limb, *b = bs->limb;
uint32_t *c = cs->limb; uint32_t *c = cs->limb;
const int32x2_t const int32x2_t
*val = (const int32x2_t *)a, *val = (const int32x2_t *)a,
*vbl = (const int32x2_t *)b, *vbl = (const int32x2_t *)b,
@@ -109,155 +110,170 @@ p448_mul (
accumx0a = vmull_lane_s32( delta = val[1] + vah[1], vbh[3], 0); accumx0a = vmull_lane_s32( delta = val[1] + vah[1], vbh[3], 0);
accumx1a = vmull_lane_s32( delta, vbh[3], 1); accumx1a = vmull_lane_s32( delta, vbh[3], 1);
accumx2a = vmull_lane_s32( delta = val[2] + vah[2], vbh[3], 0);
accumx3a = vmull_lane_s32( delta, vbh[3], 1);
accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[2], 0);
accumx0a = vmlal_lane_s32(accumx0a, delta = val[2] + vah[2], vbh[2], 0);
accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[2], 1); accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[2], 1);
accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0);
accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1);
accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[1], 0);
accumx0a = vmlal_lane_s32(accumx0a, delta = val[3] + vah[3], vbh[1], 0);
accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[1], 1); accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[1], 1);
accumx2b = vmull_lane_s32( delta = val[0] + vah[0], vbh[1], 0);
accumx3b = vmull_lane_s32( delta, vbh[1], 1);
accumx0b = vmull_lane_s32( delta, vbh[0], 0);
accumx0b = vmull_lane_s32( delta = val[0] + vah[0], vbh[0], 0);
accumx1b = vmull_lane_s32( delta, vbh[0], 1); accumx1b = vmull_lane_s32( delta, vbh[0], 1);
accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0);
accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1);
accumx0b = vmlal_lane_s32(accumx0b, vah[1], vbl[3], 0); accumx0b = vmlal_lane_s32(accumx0b, vah[1], vbl[3], 0);
accumx1b = vmlal_lane_s32(accumx1b, vah[1], vbl[3], 1); accumx1b = vmlal_lane_s32(accumx1b, vah[1], vbl[3], 1);
accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0);
accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1);
accumx0b = vmlal_lane_s32(accumx0b, vah[2], vbl[2], 0); accumx0b = vmlal_lane_s32(accumx0b, vah[2], vbl[2], 0);
accumx1b = vmlal_lane_s32(accumx1b, vah[2], vbl[2], 1); accumx1b = vmlal_lane_s32(accumx1b, vah[2], vbl[2], 1);
accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0);
accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1);
accumx0b = vmlal_lane_s32(accumx0b, vah[3], vbl[1], 0); accumx0b = vmlal_lane_s32(accumx0b, vah[3], vbl[1], 0);
accumx1b = vmlal_lane_s32(accumx1b, vah[3], vbl[1], 1); accumx1b = vmlal_lane_s32(accumx1b, vah[3], vbl[1], 1);
accumx2b += accumx2a;
accumx3b += accumx3a;
accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0);
accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1);
accumx0b += accumx0a; accumx0b += accumx0a;
accumx1b += accumx1a; accumx1b += accumx1a;
accumx0a = vmlal_lane_s32(accumx0a, vah[0], vbl[0], 0); accumx0a = vmlal_lane_s32(accumx0a, vah[0], vbl[0], 0);
accumx1a = vmlal_lane_s32(accumx1a, vah[0], vbl[0], 1); accumx1a = vmlal_lane_s32(accumx1a, vah[0], vbl[0], 1);
accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0);
accumx3a = vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1);
accumx0a = vmlal_lane_s32(accumx0a, val[1], delta = vbl[3] - vbh[3], 0); accumx0a = vmlal_lane_s32(accumx0a, val[1], delta = vbl[3] - vbh[3], 0);
accumx1a = vmlal_lane_s32(accumx1a, val[1], delta, 1); accumx1a = vmlal_lane_s32(accumx1a, val[1], delta, 1);
accumx2a = vmlal_lane_s32(accumx2a, val[2], delta, 0);
accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1);
accumx0a = vmlal_lane_s32(accumx0a, val[2], delta = vbl[2] - vbh[2], 0); accumx0a = vmlal_lane_s32(accumx0a, val[2], delta = vbl[2] - vbh[2], 0);
accumx1a = vmlal_lane_s32(accumx1a, val[2], delta, 1); accumx1a = vmlal_lane_s32(accumx1a, val[2], delta, 1);
accumx2a = vmlal_lane_s32(accumx2a, val[3], delta, 0);
accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1);
accumx0a = vmlal_lane_s32(accumx0a, val[3], delta = vbl[1] - vbh[1], 0); accumx0a = vmlal_lane_s32(accumx0a, val[3], delta = vbl[1] - vbh[1], 0);
accumx1a = vmlal_lane_s32(accumx1a, val[3], delta, 1); accumx1a = vmlal_lane_s32(accumx1a, val[3], delta, 1);
accumx2a += accumx2b;
accumx3a += accumx3b;
accumx2b = vmlal_lane_s32(accumx2b, val[0], delta, 0);
accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1);
accumx0a += accumx0b; accumx0a += accumx0b;
accumx1a += accumx1b; accumx1a += accumx1b;
accumx0b = vmlal_lane_s32(accumx0b, val[0], delta = vbl[0] - vbh[0], 0); accumx0b = vmlal_lane_s32(accumx0b, val[0], delta = vbl[0] - vbh[0], 0);
accumx1b = vmlal_lane_s32(accumx1b, val[0], delta, 1); accumx1b = vmlal_lane_s32(accumx1b, val[0], delta, 1);
accumx2b = vmlal_lane_s32(accumx2b, val[1], delta, 0);
accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1);
xx_vtrnq_s64(&accumx0a, &accumx0b); xx_vtrnq_s64(&accumx0a, &accumx0b);
xx_vtrnq_s64(&accumx1a, &accumx1b); xx_vtrnq_s64(&accumx1a, &accumx1b);
xx_vtrnq_s64(&accumx2a, &accumx2b);
xx_vtrnq_s64(&accumx3a, &accumx3b);
accumx0b += accumx1a; accumx0b += accumx1a;
accumx0b = vsraq_n_s64(accumx0b,accumx0a,28); accumx0b = vsraq_n_s64(accumx0b,accumx0a,28);
accumx1b = vsraq_n_s64(accumx1b,accumx0b,28); accumx1b = vsraq_n_s64(accumx1b,accumx0b,28);
trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b));
vcl[0] = trn_res.val[1] & vmask;
vch[0] = trn_res.val[0] & vmask;
accumx2a = vmull_lane_s32( delta = val[2] + vah[2], vbh[3], 0);
accumx3a = vmull_lane_s32( delta, vbh[3], 1);
accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0);
accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1);
accumx2b = vmull_lane_s32( delta = val[0] + vah[0], vbh[1], 0);
accumx3b = vmull_lane_s32( delta, vbh[1], 1);
accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0);
accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1);
accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0);
accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1);
accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0);
accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1);
accumx2b += accumx2a;
accumx3b += accumx3a;
accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0);
accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1);
accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0);
accumx3a = vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1);
accumx2a = vmlal_lane_s32(accumx2a, val[2], delta = vbl[3] - vbh[3], 0);
accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1);
accumx2a = vmlal_lane_s32(accumx2a, val[3], delta = vbl[2] - vbh[2], 0);
accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1);
accumx2a += accumx2b;
accumx3a += accumx3b;
accumx2b = vmlal_lane_s32(accumx2b, val[0], delta = vbl[1] - vbh[1], 0);
accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1);
accumx2b = vmlal_lane_s32(accumx2b, val[1], delta = vbl[0] - vbh[0], 0);
accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1);
xx_vtrnq_s64(&accumx2a, &accumx2b);
xx_vtrnq_s64(&accumx3a, &accumx3b);
accumx2a += accumx1b; accumx2a += accumx1b;
accumx2b += accumx3a; accumx2b += accumx3a;
accumx2b = vsraq_n_s64(accumx2b,accumx2a,28); accumx2b = vsraq_n_s64(accumx2b,accumx2a,28);
accumx3b = vsraq_n_s64(accumx3b,accumx2b,28); accumx3b = vsraq_n_s64(accumx3b,accumx2b,28);
trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b));
vcl[0] = trn_res.val[1] & vmask;
vch[0] = trn_res.val[0] & vmask;
trn_res = vtrn_s32(vmovn_s64(accumx2a), vmovn_s64(accumx2b)); trn_res = vtrn_s32(vmovn_s64(accumx2a), vmovn_s64(accumx2b));
vcl[1] = trn_res.val[1] & vmask; vcl[1] = trn_res.val[1] & vmask;
vch[1] = trn_res.val[0] & vmask; vch[1] = trn_res.val[0] & vmask;
carry = accumx3b; carry = accumx3b;
accumx4a = vmull_lane_s32( delta = val[3] + vah[3], vbh[3], 0); accumx4a = vmull_lane_s32( delta = val[3] + vah[3], vbh[3], 0);
accumx5a = vmull_lane_s32( delta, vbh[3], 1); accumx5a = vmull_lane_s32( delta, vbh[3], 1);
accumx6b = vmull_lane_s32( delta = val[0] + vah[0], vbh[3], 0);
accumx7b = vmull_lane_s32( delta, vbh[3], 1);
accumx4b = accumx4a; accumx4b = accumx4a;
accumx5b = accumx5a; accumx5b = accumx5a;
accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[2], 0);
accumx4b = vmlal_lane_s32(accumx4b, delta = val[0] + vah[0], vbh[2], 0);
accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[2], 1); accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[2], 1);
accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0);
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1);
accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[1], 0);
accumx4b = vmlal_lane_s32(accumx4b, delta = val[1] + vah[1], vbh[1], 0);
accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[1], 1); accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[1], 1);
accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0);
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1);
accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[0], 0);
accumx4b = vmlal_lane_s32(accumx4b, delta = val[2] + vah[2], vbh[0], 0);
accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[0], 1); accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[0], 1);
accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0);
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[0], 1);
accumx4b = vmlal_lane_s32(accumx4b, vah[3], vbl[3], 0); accumx4b = vmlal_lane_s32(accumx4b, vah[3], vbl[3], 0);
accumx5b = vmlal_lane_s32(accumx5b, vah[3], vbl[3], 1); accumx5b = vmlal_lane_s32(accumx5b, vah[3], vbl[3], 1);
accumx6a = accumx6b;
accumx7a = accumx7b;
accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1);
accumx4a += accumx4b; accumx4a += accumx4b;
accumx5a += accumx5b; accumx5a += accumx5b;
accumx4a = vmlal_lane_s32(accumx4a, vah[0], vbl[2], 0); accumx4a = vmlal_lane_s32(accumx4a, vah[0], vbl[2], 0);
accumx5a = vmlal_lane_s32(accumx5a, vah[0], vbl[2], 1); accumx5a = vmlal_lane_s32(accumx5a, vah[0], vbl[2], 1);
accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1);
accumx4a = vmlal_lane_s32(accumx4a, vah[1], vbl[1], 0); accumx4a = vmlal_lane_s32(accumx4a, vah[1], vbl[1], 0);
accumx5a = vmlal_lane_s32(accumx5a, vah[1], vbl[1], 1); accumx5a = vmlal_lane_s32(accumx5a, vah[1], vbl[1], 1);
accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1);
accumx4a = vmlal_lane_s32(accumx4a, vah[2], vbl[0], 0); accumx4a = vmlal_lane_s32(accumx4a, vah[2], vbl[0], 0);
accumx5a = vmlal_lane_s32(accumx5a, vah[2], vbl[0], 1); accumx5a = vmlal_lane_s32(accumx5a, vah[2], vbl[0], 1);
accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1);
accumx4a = vmlal_lane_s32(accumx4a, val[3], delta = vbl[3] - vbh[3], 0); accumx4a = vmlal_lane_s32(accumx4a, val[3], delta = vbl[3] - vbh[3], 0);
accumx5a = vmlal_lane_s32(accumx5a, val[3], delta, 1); accumx5a = vmlal_lane_s32(accumx5a, val[3], delta, 1);
/**/ /**/
accumx6b = vmlal_lane_s32(accumx6b, val[0], delta, 0);
accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1);
accumx4b = vmlal_lane_s32(accumx4b, val[0], delta = vbl[2] - vbh[2], 0); accumx4b = vmlal_lane_s32(accumx4b, val[0], delta = vbl[2] - vbh[2], 0);
accumx5b = vmlal_lane_s32(accumx5b, val[0], delta, 1); accumx5b = vmlal_lane_s32(accumx5b, val[0], delta, 1);
accumx6b = vmlal_lane_s32(accumx6b, val[1], delta, 0);
accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1);
accumx4b = vmlal_lane_s32(accumx4b, val[1], delta = vbl[1] - vbh[1], 0); accumx4b = vmlal_lane_s32(accumx4b, val[1], delta = vbl[1] - vbh[1], 0);
accumx5b = vmlal_lane_s32(accumx5b, val[1], delta, 1); accumx5b = vmlal_lane_s32(accumx5b, val[1], delta, 1);
accumx6b = vmlal_lane_s32(accumx6b, val[2], delta, 0);
accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1);
accumx4b = vmlal_lane_s32(accumx4b, val[2], delta = vbl[0] - vbh[0], 0); accumx4b = vmlal_lane_s32(accumx4b, val[2], delta = vbl[0] - vbh[0], 0);
accumx5b = vmlal_lane_s32(accumx5b, val[2], delta, 1); accumx5b = vmlal_lane_s32(accumx5b, val[2], delta, 1);
accumx6b = vmlal_lane_s32(accumx6b, val[3], delta, 0);
accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1);
xx_vtrnq_s64(&accumx4a, &accumx4b); xx_vtrnq_s64(&accumx4a, &accumx4b);
xx_vtrnq_s64(&accumx5a, &accumx5b); xx_vtrnq_s64(&accumx5a, &accumx5b);
xx_vtrnq_s64(&accumx6a, &accumx6b);
xx_vtrnq_s64(&accumx7a, &accumx7b);
accumx4a += carry; accumx4a += carry;
accumx4b += accumx5a; accumx4b += accumx5a;
accumx4b = vsraq_n_s64(accumx4b,accumx4a,28); accumx4b = vsraq_n_s64(accumx4b,accumx4a,28);
accumx5b = vsraq_n_s64(accumx5b,accumx4b,28); accumx5b = vsraq_n_s64(accumx5b,accumx4b,28);
accumx6a += accumx5b;
accumx6b += accumx7a;
trn_res = vtrn_s32(vmovn_s64(accumx4a), vmovn_s64(accumx4b)); trn_res = vtrn_s32(vmovn_s64(accumx4a), vmovn_s64(accumx4b));
vcl[2] = trn_res.val[1] & vmask; vcl[2] = trn_res.val[1] & vmask;
vch[2] = trn_res.val[0] & vmask; vch[2] = trn_res.val[0] & vmask;
accumx6b = vmull_lane_s32( delta = val[0] + vah[0], vbh[3], 0);
accumx7b = vmull_lane_s32( delta, vbh[3], 1);
accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0);
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1);
accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0);
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1);
accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0);
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[0], 1);
accumx6a = accumx6b;
accumx7a = accumx7b;
accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1);
accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1);
accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1);
accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1);
/**/
accumx6b = vmlal_lane_s32(accumx6b, val[0], delta = vbl[3] - vbh[3], 0);
accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1);
accumx6b = vmlal_lane_s32(accumx6b, val[1], delta = vbl[2] - vbh[2], 0);
accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1);
accumx6b = vmlal_lane_s32(accumx6b, val[2], delta = vbl[1] - vbh[1], 0);
accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1);
accumx6b = vmlal_lane_s32(accumx6b, val[3], delta = vbl[0] - vbh[0], 0);
accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1);

xx_vtrnq_s64(&accumx6a, &accumx6b);
xx_vtrnq_s64(&accumx7a, &accumx7b);
accumx6a += accumx5b;
accumx6b += accumx7a;
accumx6b = vsraq_n_s64(accumx6b,accumx6a,28); accumx6b = vsraq_n_s64(accumx6b,accumx6a,28);
accumx7b = vsraq_n_s64(accumx7b,accumx6b,28); accumx7b = vsraq_n_s64(accumx7b,accumx6b,28);
trn_res = vtrn_s32(vmovn_s64(accumx6a), vmovn_s64(accumx6b)); trn_res = vtrn_s32(vmovn_s64(accumx6a), vmovn_s64(accumx6b));
vcl[3] = trn_res.val[1] & vmask; vcl[3] = trn_res.val[1] & vmask;
vch[3] = trn_res.val[0] & vmask; vch[3] = trn_res.val[0] & vmask;
accumx7b = xx_vaddup_s64(accumx7b); accumx7b = xx_vaddup_s64(accumx7b);


int32x2_t t0 = vcl[0], t1 = vch[0]; int32x2_t t0 = vcl[0], t1 = vch[0];


+ 962
- 0
src/arch_neon_experimental/ec_point.c View File

@@ -0,0 +1,962 @@
/**
* @cond internal
* @file ec_point.c
* @copyright
* Copyright (c) 2014 Cryptography Research, Inc. \n
* Released under the MIT License. See LICENSE.txt for license information.
* @author Mike Hamburg
* @warning This file was automatically generated.
*/

#include "ec_point.h"


/**
 * Run a fixed chain of p448_sqr / p448_sqrn / p448_mul calls on x and store
 * the final product in a.
 *
 * NOTE(review): the exponent annotations below assume that p448_sqr squares,
 * p448_sqrn(out, in, n) squares n times in a row and p448_mul multiplies
 * (none of them is defined in this file).  Under that assumption the chain
 * yields x^(2^446 - 2^222 - 1) -- confirm against p448.h.
 *
 * @param a  Output element; written by the last call only.
 * @param x  Input element; only read.
 */
void p448_isr(struct p448_t *a, const struct p448_t *x)
{
    struct p448_t L0;
    struct p448_t L1;
    struct p448_t L2;

    p448_sqr(&L1, x);             /* L1 = x^2                      */
    p448_mul(&L2, x, &L1);        /* L2 = x^(2^2 - 1)              */
    p448_sqr(&L1, &L2);           /* L1 = L2^2                     */
    p448_mul(&L2, x, &L1);        /* L2 = x^(2^3 - 1)              */
    p448_sqrn(&L1, &L2, 3);       /* L1 = L2^(2^3)                 */
    p448_mul(&L0, &L2, &L1);      /* L0 = x^(2^6 - 1)              */
    p448_sqrn(&L1, &L0, 3);       /* L1 = L0^(2^3)                 */
    p448_mul(&L0, &L2, &L1);      /* L0 = x^(2^9 - 1)              */
    p448_sqrn(&L2, &L0, 9);       /* L2 = L0^(2^9)                 */
    p448_mul(&L1, &L0, &L2);      /* L1 = x^(2^18 - 1)             */
    p448_sqr(&L0, &L1);           /* L0 = L1^2                     */
    p448_mul(&L2, x, &L0);        /* L2 = x^(2^19 - 1)             */
    p448_sqrn(&L0, &L2, 18);      /* L0 = L2^(2^18)                */
    p448_mul(&L2, &L1, &L0);      /* L2 = x^(2^37 - 1)             */
    p448_sqrn(&L0, &L2, 37);      /* L0 = L2^(2^37)                */
    p448_mul(&L1, &L2, &L0);      /* L1 = x^(2^74 - 1)             */
    p448_sqrn(&L0, &L1, 37);      /* L0 = L1^(2^37)                */
    p448_mul(&L1, &L2, &L0);      /* L1 = x^(2^111 - 1)            */
    p448_sqrn(&L0, &L1, 111);     /* L0 = L1^(2^111)               */
    p448_mul(&L2, &L1, &L0);      /* L2 = x^(2^222 - 1)            */
    p448_sqr(&L0, &L2);           /* L0 = L2^2                     */
    p448_mul(&L1, x, &L0);        /* L1 = x^(2^223 - 1)            */
    p448_sqrn(&L0, &L1, 223);     /* L0 = L1^(2^223)               */
    p448_mul(a, &L2, &L0);        /* a  = x^(2^446 - 2^222 - 1)    */
}

/**
 * Store x * (p448_isr(x))^4 in a: the p448_isr result is squared twice and
 * then multiplied by x.
 *
 * NOTE(review): if p448_isr yields x^(2^446 - 2^222 - 1), the result is
 * x^(2^448 - 2^224 - 3); that is x^(p - 2), i.e. the inverse, provided the
 * field prime is p = 2^448 - 2^224 - 1 (not stated in this file -- confirm).
 *
 * @param a  Output element.
 * @param x  Input element; only read.
 */
void p448_inverse(struct p448_t *a, const struct p448_t *x)
{
    struct p448_t L0;
    struct p448_t L1;

    p448_isr(&L0, x);       /* L0 = isr(x)       */
    p448_sqr(&L1, &L0);     /* L1 = isr(x)^2     */
    p448_sqr(&L0, &L1);     /* L0 = isr(x)^4     */
    p448_mul(a, x, &L0);    /* a  = x * isr(x)^4 */
}

/**
 * Update the point *d in place from the a, b and c fields of *e; all of
 * d->x, d->y, d->z, d->t and d->u are overwritten.
 *
 * Step comments assume p448_add / p448_sub / p448_mul (out, l, r) store
 * l+r / l-r / l*r in out; each subtraction is followed by the generator's
 * p448_bias(.., 2) and p448_weak_reduce calls (defined elsewhere).
 *
 * @param d  Point updated in place.
 * @param e  Second operand; only read.
 */
void add_tw_niels_to_tw_extensible(struct tw_extensible_t *d,
                                   const struct tw_niels_t *e)
{
    struct p448_t L0;
    struct p448_t L1;

    /* L0 = e->a * (d->y - d->x) */
    p448_sub(&L1, &d->y, &d->x);
    p448_bias(&L1, 2);
    p448_weak_reduce(&L1);
    p448_mul(&L0, &e->a, &L1);

    /* d->y = e->b * (d->x + d->y) */
    p448_add(&L1, &d->x, &d->y);
    p448_mul(&d->y, &e->b, &L1);

    /* d->x = e->c * (d->u * d->t) */
    p448_mul(&L1, &d->u, &d->t);
    p448_mul(&d->x, &e->c, &L1);

    /* d->u / d->t = sum / difference of the two products above */
    p448_add(&d->u, &L0, &d->y);
    p448_sub(&d->t, &d->y, &L0);
    p448_bias(&d->t, 2);
    p448_weak_reduce(&d->t);

    /* d->y = d->z - d->x,  L0 = d->x + d->z */
    p448_sub(&d->y, &d->z, &d->x);
    p448_bias(&d->y, 2);
    p448_weak_reduce(&d->y);
    p448_add(&L0, &d->x, &d->z);

    /* Final products; the order matters because d->y is both read and written. */
    p448_mul(&d->z, &L0, &d->y);
    p448_mul(&d->x, &d->y, &d->t);
    p448_mul(&d->y, &L0, &d->u);
}

/**
 * Update the point *d in place from the a, b and c fields of *e.  Compared
 * with add_tw_niels_to_tw_extensible, e->a and e->b swap roles and the
 * (d->z - d->x) / (d->x + d->z) terms swap destinations.  All of d->x, d->y,
 * d->z, d->t and d->u are overwritten.
 *
 * Step comments assume p448_add / p448_sub / p448_mul (out, l, r) store
 * l+r / l-r / l*r in out.
 *
 * @param d  Point updated in place.
 * @param e  Second operand; only read.
 */
void sub_tw_niels_from_tw_extensible(struct tw_extensible_t *d,
                                     const struct tw_niels_t *e)
{
    struct p448_t L0;
    struct p448_t L1;

    /* L0 = e->b * (d->y - d->x) */
    p448_sub(&L1, &d->y, &d->x);
    p448_bias(&L1, 2);
    p448_weak_reduce(&L1);
    p448_mul(&L0, &e->b, &L1);

    /* d->y = e->a * (d->x + d->y) */
    p448_add(&L1, &d->x, &d->y);
    p448_mul(&d->y, &e->a, &L1);

    /* d->x = e->c * (d->u * d->t) */
    p448_mul(&L1, &d->u, &d->t);
    p448_mul(&d->x, &e->c, &L1);

    /* d->u / d->t = sum / difference of the two products above */
    p448_add(&d->u, &L0, &d->y);
    p448_sub(&d->t, &d->y, &L0);
    p448_bias(&d->t, 2);
    p448_weak_reduce(&d->t);

    /* d->y = d->x + d->z,  L0 = d->z - d->x */
    p448_add(&d->y, &d->x, &d->z);
    p448_sub(&L0, &d->z, &d->x);
    p448_bias(&L0, 2);
    p448_weak_reduce(&L0);

    /* Final products; the order matters because d->y is both read and written. */
    p448_mul(&d->z, &L0, &d->y);
    p448_mul(&d->x, &d->y, &d->t);
    p448_mul(&d->y, &L0, &d->u);
}

/**
 * Multiply e->z by a->z (through a temporary, then copied back into e->z)
 * and hand the result to add_tw_niels_to_tw_extensible together with the
 * embedded a->n.
 *
 * @param e  Point updated in place.
 * @param a  Second operand; only read.
 */
void add_tw_pniels_to_tw_extensible(struct tw_extensible_t *e,
                                    const struct tw_pniels_t *a)
{
    struct p448_t L0;

    p448_mul(&L0, &e->z, &a->z);
    p448_copy(&e->z, &L0);
    add_tw_niels_to_tw_extensible(e, &a->n);
}

/**
 * Multiply e->z by a->z (through a temporary, then copied back into e->z)
 * and hand the result to sub_tw_niels_from_tw_extensible together with the
 * embedded a->n.
 *
 * @param e  Point updated in place.
 * @param a  Second operand; only read.
 */
void sub_tw_pniels_from_tw_extensible(struct tw_extensible_t *e,
                                      const struct tw_pniels_t *a)
{
    struct p448_t L0;

    p448_mul(&L0, &e->z, &a->z);
    p448_copy(&e->z, &L0);
    sub_tw_niels_from_tw_extensible(e, &a->n);
}

/**
 * Replace the point *a by the output of a fixed doubling sequence; all of
 * a->x, a->y, a->z, a->t and a->u are overwritten.
 *
 * Step comments assume p448_add / p448_sub / p448_mul / p448_sqr store
 * l+r / l-r / l*r / l^2 in their first argument.  The p448_bias amounts
 * (3, 2, 1) are the generator's and are kept exactly as emitted.
 *
 * @param a  Point doubled in place.
 */
void double_tw_extensible(struct tw_extensible_t *a)
{
    struct p448_t L0;
    struct p448_t L1;
    struct p448_t L2;

    p448_sqr(&L2, &a->x);               /* L2 = x^2             */
    p448_sqr(&L0, &a->y);               /* L0 = y^2             */
    p448_add(&a->u, &L2, &L0);          /* u  = x^2 + y^2       */

    /* t = (x + y)^2 - u */
    p448_add(&a->t, &a->y, &a->x);
    p448_sqr(&L1, &a->t);
    p448_sub(&a->t, &L1, &a->u);
    p448_bias(&a->t, 3);
    p448_weak_reduce(&a->t);

    /* L1 = y^2 - x^2 */
    p448_sub(&L1, &L0, &L2);
    p448_bias(&L1, 2);
    p448_weak_reduce(&L1);

    /* L0 = 2*z^2 - L1 (bias applied to z^2 before doubling) */
    p448_sqr(&a->x, &a->z);
    p448_bias(&a->x, 1);
    p448_add(&a->z, &a->x, &a->x);
    p448_sub(&L0, &a->z, &L1);
    p448_weak_reduce(&L0);

    p448_mul(&a->z, &L1, &L0);
    p448_mul(&a->x, &L0, &a->t);
    p448_mul(&a->y, &L1, &a->u);
}

/**
 * Replace the point *a by the output of a fixed doubling sequence; all of
 * a->x, a->y, a->z, a->t and a->u are overwritten.
 *
 * Step comments assume p448_add / p448_sub / p448_mul / p448_sqr store
 * l+r / l-r / l*r / l^2 in their first argument.  The p448_bias amounts
 * (3, 2, 2) are the generator's and are kept exactly as emitted.
 *
 * @param a  Point doubled in place.
 */
void double_extensible(struct extensible_t *a)
{
    struct p448_t L0;
    struct p448_t L1;
    struct p448_t L2;

    p448_sqr(&L2, &a->x);               /* L2 = x^2             */
    p448_sqr(&L0, &a->y);               /* L0 = y^2             */
    p448_add(&L1, &L2, &L0);            /* L1 = x^2 + y^2       */

    /* t = (x + y)^2 - L1 */
    p448_add(&a->t, &a->y, &a->x);
    p448_sqr(&a->u, &a->t);
    p448_sub(&a->t, &a->u, &L1);
    p448_bias(&a->t, 3);
    p448_weak_reduce(&a->t);

    /* u = y^2 - x^2 */
    p448_sub(&a->u, &L0, &L2);
    p448_bias(&a->u, 2);
    p448_weak_reduce(&a->u);

    /* L0 = 2*z^2 - L1 (bias applied to z^2 before doubling) */
    p448_sqr(&a->x, &a->z);
    p448_bias(&a->x, 2);
    p448_add(&a->z, &a->x, &a->x);
    p448_sub(&L0, &a->z, &L1);
    p448_weak_reduce(&L0);

    p448_mul(&a->z, &L1, &L0);
    p448_mul(&a->x, &L0, &a->t);
    p448_mul(&a->y, &L1, &a->u);
}

/**
 * Fill *b from the x, y and z fields of *a with a fixed doubling-style
 * sequence; all of b->x, b->y, b->z, b->t and b->u are written.
 *
 * Step comments assume p448_add / p448_sub / p448_mul / p448_sqr store
 * l+r / l-r / l*r / l^2 in their first argument.
 *
 * @param b  Output point.
 * @param a  Input point; only read.
 */
void twist_and_double(struct tw_extensible_t *b,
                      const struct extensible_t *a)
{
    struct p448_t L0;

    p448_sqr(&b->x, &a->x);             /* b->x = a->x^2          */
    p448_sqr(&b->z, &a->y);             /* b->z = a->y^2          */
    p448_add(&b->u, &b->x, &b->z);      /* b->u = sum of squares  */

    /* b->t = (a->x + a->y)^2 - b->u */
    p448_add(&b->t, &a->y, &a->x);
    p448_sqr(&L0, &b->t);
    p448_sub(&b->t, &L0, &b->u);
    p448_bias(&b->t, 3);
    p448_weak_reduce(&b->t);

    /* L0 = a->y^2 - a->x^2 */
    p448_sub(&L0, &b->z, &b->x);
    p448_bias(&L0, 2);
    p448_weak_reduce(&L0);

    /* b->y = 2*a->z^2 - b->u (bias applied to the square before doubling) */
    p448_sqr(&b->x, &a->z);
    p448_bias(&b->x, 2);
    p448_add(&b->z, &b->x, &b->x);
    p448_sub(&b->y, &b->z, &b->u);
    p448_weak_reduce(&b->y);

    /* Final products; b->y is read twice before it is overwritten. */
    p448_mul(&b->z, &L0, &b->y);
    p448_mul(&b->x, &b->y, &b->t);
    p448_mul(&b->y, &L0, &b->u);
}

/**
 * Fill *b from the x, y and z fields of *a with a fixed doubling-style
 * sequence; all of b->x, b->y, b->z, b->t and b->u are written.
 *
 * Step comments assume p448_add / p448_sub / p448_mul / p448_sqr store
 * l+r / l-r / l*r / l^2 in their first argument.
 *
 * @param b  Output point.
 * @param a  Input point; only read.
 */
void untwist_and_double(struct extensible_t *b,
                        const struct tw_extensible_t *a)
{
    struct p448_t L0;

    p448_sqr(&b->x, &a->x);             /* b->x = a->x^2          */
    p448_sqr(&b->z, &a->y);             /* b->z = a->y^2          */
    p448_add(&L0, &b->x, &b->z);        /* L0   = sum of squares  */

    /* b->t = (a->x + a->y)^2 - L0 */
    p448_add(&b->t, &a->y, &a->x);
    p448_sqr(&b->u, &b->t);
    p448_sub(&b->t, &b->u, &L0);
    p448_bias(&b->t, 3);
    p448_weak_reduce(&b->t);

    /* b->u = a->y^2 - a->x^2 */
    p448_sub(&b->u, &b->z, &b->x);
    p448_bias(&b->u, 2);
    p448_weak_reduce(&b->u);

    /* b->y = 2*a->z^2 - b->u (bias applied to the square before doubling) */
    p448_sqr(&b->x, &a->z);
    p448_bias(&b->x, 1);
    p448_add(&b->z, &b->x, &b->x);
    p448_sub(&b->y, &b->z, &b->u);
    p448_weak_reduce(&b->y);

    /* Final products; b->y is read twice before it is overwritten. */
    p448_mul(&b->z, &L0, &b->y);
    p448_mul(&b->x, &b->y, &b->t);
    p448_mul(&b->y, &L0, &b->u);
}

/**
 * Fill *b from the x and y fields of *a:
 *   b->n.a = a->y - a->x
 *   b->n.b = a->x + a->y
 *   b->n.c = negation of (a->y * a->x) scaled by the word constant 78164
 *   b->z   = 2
 * (assuming the p448_* helpers compute what their names say; they are
 * defined elsewhere).  b->z doubles as scratch space for the scaled product
 * before it receives its final value.
 *
 * @param b  Output.
 * @param a  Input; only read.
 */
void convert_tw_affine_to_tw_pniels(struct tw_pniels_t *b,
                                    const struct tw_affine_t *a)
{
    p448_sub(&b->n.a, &a->y, &a->x);
    p448_bias(&b->n.a, 2);
    p448_weak_reduce(&b->n.a);

    p448_add(&b->n.b, &a->x, &a->y);
    p448_weak_reduce(&b->n.b);

    p448_mul(&b->n.c, &a->y, &a->x);
    p448_mulw(&b->z, &b->n.c, 78164);   /* scratch use of b->z */
    p448_neg(&b->n.c, &b->z);
    p448_bias(&b->n.c, 2);
    p448_weak_reduce(&b->n.c);

    p448_set_ui(&b->z, 2);
}

/**
 * Fill *b from *a: x and y are copied, z is set to 1, and t / u receive
 * further copies of a->x / a->y respectively.
 *
 * @param b  Output.
 * @param a  Input; only read.
 */
void convert_tw_affine_to_tw_extensible(struct tw_extensible_t *b,
                                        const struct tw_affine_t *a)
{
    p448_copy(&b->x, &a->x);
    p448_copy(&b->y, &a->y);
    p448_set_ui(&b->z, 1);
    p448_copy(&b->t, &a->x);
    p448_copy(&b->u, &a->y);
}

/**
 * Fill *b from *a: x and y are copied, z is set to 1, and t / u receive
 * further copies of a->x / a->y respectively.
 *
 * @param b  Output.
 * @param a  Input; only read.
 */
void convert_affine_to_extensible(struct extensible_t *b,
                                  const struct affine_t *a)
{
    p448_copy(&b->x, &a->x);
    p448_copy(&b->y, &a->y);
    p448_set_ui(&b->z, 1);
    p448_copy(&b->t, &a->x);
    p448_copy(&b->u, &a->y);
}

/**
 * Fill *b from *a:
 *   b->n.a = a->y - a->x
 *   b->n.b = a->x + a->y
 *   b->n.c = negation of (a->u * a->t) scaled by the word constant 78164
 *   b->z   = a->z + a->z
 * (assuming the p448_* helpers compute what their names say; they are
 * defined elsewhere).  b->z doubles as scratch space for the scaled product
 * before it receives its final value.
 *
 * @param b  Output.
 * @param a  Input; only read.
 */
void convert_tw_extensible_to_tw_pniels(struct tw_pniels_t *b,
                                        const struct tw_extensible_t *a)
{
    p448_sub(&b->n.a, &a->y, &a->x);
    p448_bias(&b->n.a, 2);
    p448_weak_reduce(&b->n.a);

    p448_add(&b->n.b, &a->x, &a->y);
    p448_weak_reduce(&b->n.b);

    p448_mul(&b->n.c, &a->u, &a->t);
    p448_mulw(&b->z, &b->n.c, 78164);   /* scratch use of b->z */
    p448_neg(&b->n.c, &b->z);
    p448_bias(&b->n.c, 2);
    p448_weak_reduce(&b->n.c);

    p448_add(&b->z, &a->z, &a->z);
    p448_weak_reduce(&b->z);
}

/**
 * Fill *e from *d:
 *   e->u = d->n.b + d->n.a
 *   e->t = d->n.b - d->n.a
 *   e->x = d->z * e->t
 *   e->y = d->z * e->u
 *   e->z = d->z squared
 * (assuming the p448_* helpers compute what their names say).
 *
 * @param e  Output.
 * @param d  Input; only read.
 */
void convert_tw_pniels_to_tw_extensible(struct tw_extensible_t *e,
                                        const struct tw_pniels_t *d)
{
    p448_add(&e->u, &d->n.b, &d->n.a);

    p448_sub(&e->t, &d->n.b, &d->n.a);
    p448_bias(&e->t, 2);
    p448_weak_reduce(&e->t);

    p448_mul(&e->x, &d->z, &e->t);
    p448_mul(&e->y, &d->z, &e->u);
    p448_sqr(&e->z, &d->z);
}

/**
 * Fill *e from *d:
 *   e->y = d->b + d->a
 *   e->x = d->b - d->a
 *   e->z = 1
 *   e->t = copy of e->x,  e->u = copy of e->y
 * (assuming the p448_* helpers compute what their names say).
 *
 * @param e  Output.
 * @param d  Input; only read.
 */
void convert_tw_niels_to_tw_extensible(struct tw_extensible_t *e,
                                       const struct tw_niels_t *d)
{
    p448_add(&e->y, &d->b, &d->a);
    p448_weak_reduce(&e->y);

    p448_sub(&e->x, &d->b, &d->a);
    p448_bias(&e->x, 2);
    p448_weak_reduce(&e->x);

    p448_set_ui(&e->z, 1);
    p448_copy(&e->t, &e->x);
    p448_copy(&e->u, &e->y);
}

/**
 * Update the montgomery_t state *a in place with one fixed sequence of field
 * operations.  a->xa, a->za, a->xd and a->zd are overwritten; a->z0 is only
 * read.
 *
 * Step comments assume p448_add / p448_sub / p448_mul / p448_sqr store
 * l+r / l-r / l*r / l^2 in their first argument (defined elsewhere).
 * NOTE(review): presumably one combined differential-add-and-double ladder
 * step -- confirm against the callers.
 *
 * @param a  State updated in place.
 */
void
montgomery_step (
struct montgomery_t* a
) {
struct p448_t L0, L1;
/* L0 = zd + xd and L1 = xd - zd, taken from the incoming xd / zd. */
p448_add ( &L0, &a->zd, &a->xd );
p448_sub ( &L1, &a->xd, &a->zd );
p448_bias ( &L1, 2 );
p448_weak_reduce( &L1 );
/* zd is reused as scratch from here on: xd = L0 * (xa - za). */
p448_sub ( &a->zd, &a->xa, &a->za );
p448_bias ( &a->zd, 2 );
p448_weak_reduce( &a->zd );
p448_mul ( &a->xd, &L0, &a->zd );
/* za = L1 * (za + xa). */
p448_add ( &a->zd, &a->za, &a->xa );
p448_mul ( &a->za, &L1, &a->zd );
/* xa = z0 * (za + xd)^2. */
p448_add ( &a->xa, &a->za, &a->xd );
p448_sqr ( &a->zd, &a->xa );
p448_mul ( &a->xa, &a->z0, &a->zd );
/* za = (xd - za)^2. */
p448_sub ( &a->zd, &a->xd, &a->za );
p448_bias ( &a->zd, 2 );
p448_weak_reduce( &a->zd );
p448_sqr ( &a->za, &a->zd );
/* xd = L0^2 and L0 = L1^2 (L0 / L1 still hold the initial sum / difference). */
p448_sqr ( &a->xd, &L0 );
p448_sqr ( &L0, &L1 );
/* zd = xd scaled by the word constant 39082; L1 = xd - L0. */
p448_mulw ( &a->zd, &a->xd, 39082 );
p448_sub ( &L1, &a->xd, &L0 );
p448_bias ( &L1, 2 );
p448_weak_reduce( &L1 );
/* New xd = L0 * zd; new zd = (zd - L1) * L1. */
p448_mul ( &a->xd, &L0, &a->zd );
p448_sub ( &L0, &a->zd, &L1 );
p448_bias ( &L0, 2 );
p448_weak_reduce( &L0 );
p448_mul ( &a->zd, &L0, &L1 );
}

/**
 * Initialise the state *a from sbz: z0 receives sbz squared, xd = 1,
 * zd = 0, xa = 1 and za is a copy of z0.
 *
 * @param a    State to initialise; every field named above is written.
 * @param sbz  Input element; only read.
 */
void deserialize_montgomery(struct montgomery_t *a,
                            const struct p448_t *sbz)
{
    p448_sqr(&a->z0, sbz);
    p448_set_ui(&a->xd, 1);
    p448_set_ui(&a->zd, 0);
    p448_set_ui(&a->xa, 1);
    p448_copy(&a->za, &a->z0);
}

/**
 * Compute the output element b from the state *a and the element sbz, and
 * return a mask.
 *
 * The returned value is the OR of two p448_is_zero results: one taken on
 * (L3 * isr(L3)^2 - 1), where L3 is the value passed to p448_isr, and one
 * taken on sbz itself.
 * NOTE(review): presumably a success flag for the caller -- confirm the
 * mask_t convention (all-ones vs. zero) in word.h.
 *
 * @param b    Output element.
 * @param a    State; only read.
 * @param sbz  Input element; only read.
 * @return     p448_is_zero(check value) | p448_is_zero(sbz).
 */
mask_t
serialize_montgomery (
struct p448_t* b,
const struct montgomery_t* a,
const struct p448_t* sbz
) {
mask_t L4, L5, L6;
struct p448_t L0, L1, L2, L3;
/* L3 = za * (z0*zd - xd) */
p448_mul ( &L3, &a->z0, &a->zd );
p448_sub ( &L1, &L3, &a->xd );
p448_bias ( &L1, 2 );
p448_weak_reduce( &L1 );
p448_mul ( &L3, &a->za, &L1 );
/* L0 = xa * (z0*xd - zd) */
p448_mul ( &L2, &a->z0, &a->xd );
p448_sub ( &L1, &L2, &a->zd );
p448_bias ( &L1, 2 );
p448_weak_reduce( &L1 );
p448_mul ( &L0, &a->xa, &L1 );
/* L3 = (L3 - L0) * (L0 + L3) */
p448_add ( &L2, &L0, &L3 );
p448_sub ( &L1, &L3, &L0 );
p448_bias ( &L1, 2 );
p448_weak_reduce( &L1 );
p448_mul ( &L3, &L1, &L2 );
/* L1 = negation of (z0 + 1)^2 scaled by the word constant 39082 */
p448_copy ( &L2, &a->z0 );
p448_addw ( &L2, 1 );
p448_sqr ( &L1, &L2 );
p448_mulw ( &L2, &L1, 39082 );
p448_neg ( &L1, &L2 );
/* L2 = 4*z0 + L1 (bias applied after the first doubling), then L0 = xd * L2 */
p448_add ( &L2, &a->z0, &a->z0 );
p448_bias ( &L2, 1 );
p448_add ( &L0, &L2, &L2 );
p448_add ( &L2, &L0, &L1 );
p448_weak_reduce( &L2 );
p448_mul ( &L0, &a->xd, &L2 );
/* L5 flags zd == 0; L6 is its arithmetic negation, L4 its bitwise complement. */
L5 = p448_is_zero( &a->zd );
L6 = - L5;
/* L2 = zd + (L0 masked by L5) */
p448_mask ( &L1, &L0, L5 );
p448_add ( &L2, &L1, &a->zd );
L4 = ~ L5;
/* L1 = sbz * L3, plus the word L6 */
p448_mul ( &L1, sbz, &L3 );
p448_addw ( &L1, L6 );
/* Build the p448_isr argument L3 and the companion factor L1. */
p448_mul ( &L3, &L2, &L1 );
p448_mul ( &L1, &L3, &L2 );
p448_mul ( &L2, &L3, &a->xd );
p448_mul ( &L3, &L1, &L2 );
p448_isr ( &L0, &L3 );
p448_mul ( &L2, &L1, &L0 );
/* L0 = L3 * isr(L3)^2, the value checked against 1 below. */
p448_sqr ( &L1, &L0 );
p448_mul ( &L0, &L3, &L1 );
/* Output: L2 masked by L4 (the complement of the zd == 0 flag). */
p448_mask ( b, &L2, L4 );
p448_subw ( &L0, 1 );
p448_bias ( &L0, 1 );
L5 = p448_is_zero( &L0 );
L4 = p448_is_zero( sbz );
return L5 | L4;
}

/**
 * Compute the output element b from the x, y and z fields of *a.
 *
 * With p448_add / p448_sub / p448_mul assumed to store l+r / l-r / l*r in
 * their first argument (they are defined elsewhere), the sequence is:
 *   L0 = y - z,  b = z + y,  L1 = z * x
 *   L2 = L0 * L1,  L1 = L2 * L0,  L0 = L2 * b,  L2 = L1 * L0
 *   b  = L1 * isr(L2)
 *
 * The generated code used to finish by squaring the p448_isr result and
 * multiplying it by L2.  That product was stored in a local and never read
 * (this function returns void), so the two calls are omitted; the value
 * written to b is unchanged.
 *
 * @param b  Output element.
 * @param a  Input point; only read.
 */
void
serialize_extensible (
    struct p448_t*             b,
    const struct extensible_t* a
) {
    struct p448_t L0, L1, L2;

    p448_sub(&L0, &a->y, &a->z);
    p448_bias(&L0, 2);
    p448_weak_reduce(&L0);
    p448_add(b, &a->z, &a->y);
    p448_mul(&L1, &a->z, &a->x);
    p448_mul(&L2, &L0, &L1);
    p448_mul(&L1, &L2, &L0);
    p448_mul(&L0, &L2, b);
    p448_mul(&L2, &L1, &L0);
    p448_isr(&L0, &L2);
    /* b is rewritten here; its earlier value (z + y) was consumed above. */
    p448_mul(b, &L1, &L0);
}

/**
 * Compute the output element b from the x, y and z fields of *a.  b is also
 * used as scratch space throughout, so it only holds the result after the
 * final p448_mul.
 *
 * Step comments assume p448_add / p448_sub / p448_mul / p448_sqr store
 * l+r / l-r / l*r / l^2 in their first argument (defined elsewhere).
 *
 * @param b  Output element.
 * @param a  Input point; only read.
 */
void
untwist_and_double_and_serialize (
struct p448_t* b,
const struct tw_extensible_t* a
) {
struct p448_t L0, L1, L2, L3;
/* L3 = y * x; it is kept until the final multiplication. */
p448_mul ( &L3, &a->y, &a->x );
/* b = (x + y)^2 - 2*L3 */
p448_add ( b, &a->y, &a->x );
p448_sqr ( &L1, b );
p448_add ( &L2, &L3, &L3 );
p448_sub ( b, &L1, &L2 );
p448_bias ( b, 3 );
p448_weak_reduce( b );
/* L1 = z^4 */
p448_sqr ( &L2, &a->z );
p448_sqr ( &L1, &L2 );
/* L2 = negation of (2*b scaled by 39082); b = negation of (L2 scaled by 39082). */
p448_add ( &L2, b, b );
p448_mulw ( b, &L2, 39082 );
p448_neg ( &L2, b );
p448_bias ( &L2, 2 );
p448_mulw ( &L0, &L2, 39082 );
p448_neg ( b, &L0 );
p448_bias ( b, 2 );
/* L2 = b * (L2 * L1) is the p448_isr argument; L1 = b * isr(L2). */
p448_mul ( &L0, &L2, &L1 );
p448_mul ( &L2, b, &L0 );
p448_isr ( &L0, &L2 );
p448_mul ( &L1, b, &L0 );
/* NOTE(review): the next two results are never read -- L0 is not used again
 * and b is overwritten by the final p448_mul.  Looks like a leftover check
 * of the p448_isr output; confirm before removing. */
p448_sqr ( b, &L0 );
p448_mul ( &L0, &L2, b );
/* Result: b = L1 * L3. */
p448_mul ( b, &L1, &L3 );
}

/**
 * Fill *b from the x, y and z fields of *a.  The fields of b are used as
 * scratch space during the computation; on return b->z is 1 and b->t / b->u
 * are copies of b->x / b->y.
 *
 * Step comments assume p448_add / p448_sub / p448_mul / p448_sqr store
 * l+r / l-r / l*r / l^2 in their first argument (defined elsewhere).
 *
 * @param b  Output point.
 * @param a  Input point; only read.
 */
void
twist_even (
struct tw_extensible_t* b,
const struct extensible_t* a
) {
mask_t L0, L1;
/* b->u = a->z^2 - a->x^2 */
p448_sqr ( &b->y, &a->z );
p448_sqr ( &b->z, &a->x );
p448_sub ( &b->u, &b->y, &b->z );
p448_bias ( &b->u, 2 );
p448_weak_reduce( &b->u );
/* b->y = (a->z - a->x) * a->y */
p448_sub ( &b->z, &a->z, &a->x );
p448_bias ( &b->z, 2 );
p448_weak_reduce( &b->z );
p448_mul ( &b->y, &b->z, &a->y );
/* b->z = a->z - a->y; this value is tested for zero further down. */
p448_sub ( &b->z, &a->z, &a->y );
p448_bias ( &b->z, 2 );
p448_weak_reduce( &b->z );
/* b->y = b->x * (b->x * b->u) is the p448_isr argument; b->u = b->x * isr(b->y). */
p448_mul ( &b->x, &b->z, &b->y );
p448_mul ( &b->t, &b->x, &b->u );
p448_mul ( &b->y, &b->x, &b->t );
p448_isr ( &b->t, &b->y );
p448_mul ( &b->u, &b->x, &b->t );
/* NOTE(review): the next two results are never read -- b->x is overwritten
 * right below and b->t by the final p448_copy.  Looks like a leftover check
 * of the p448_isr output; confirm before removing. */
p448_sqr ( &b->x, &b->t );
p448_mul ( &b->t, &b->y, &b->x );
/* Scale the input coordinates by b->u. */
p448_mul ( &b->x, &a->x, &b->u );
p448_mul ( &b->y, &a->y, &b->u );
/* Add the arithmetic negation of the (a->z - a->y == 0) mask to b->y. */
L1 = p448_is_zero( &b->z );
L0 = - L1;
p448_addw ( &b->y, L0 );
p448_weak_reduce( &b->y );
p448_set_ui( &b->z, 1 );
p448_copy ( &b->t, &b->x );
p448_copy ( &b->u, &b->y );
}

/**
 * Map an extensible point to a twisted extensible point (test-only variant,
 * per its name).
 *
 * The fields of b and the locals L0/L1 are reused as scratch registers, so
 * statement order matters.  The output has t = x and u = y; z is set from a
 * zero test on a->y (see the end of the function).
 *
 * @param b Output point; every field is overwritten.
 * @param a Input point; only a->x, a->y and a->z are read.
 */
void
test_only_twist (
struct tw_extensible_t* b,
const struct extensible_t* a
) {
mask_t L2, L3;
struct p448_t L0, L1;
p448_sqr ( &b->u, &a->z );
p448_sqr ( &b->y, &a->x );
p448_sub ( &b->z, &b->u, &b->y ); /* z^2 - x^2 */
p448_bias ( &b->z, 2 );
p448_add ( &b->y, &b->z, &b->z );
p448_add ( &b->u, &b->y, &b->y ); /* u = 4*(z^2 - x^2) */
p448_weak_reduce( &b->u );
p448_sub ( &b->y, &a->z, &a->x ); /* b->y = z - x; tested for zero below */
p448_bias ( &b->y, 2 );
p448_weak_reduce( &b->y );
p448_mul ( &b->x, &b->y, &a->y );
p448_sub ( &b->z, &a->z, &a->y ); /* b->z = z - y; tested for zero below */
p448_bias ( &b->z, 2 );
p448_weak_reduce( &b->z );
p448_mul ( &b->t, &b->z, &b->x ); /* (z-y)*(z-x)*y */
p448_mul ( &L1, &b->t, &b->u );
p448_mul ( &b->x, &b->t, &L1 ); /* argument of the isr */
p448_isr ( &L0, &b->x ); /* presumably 1/sqrt -- confirm against p448_isr */
p448_mul ( &b->u, &b->t, &L0 );
p448_sqr ( &L1, &L0 );
p448_mul ( &b->t, &b->x, &L1 ); /* isr argument times isr^2; used below */
p448_add ( &L1, &a->y, &a->x ); /* L1 = y + x */
p448_weak_reduce( &L1 );
p448_sub ( &L0, &a->x, &a->y ); /* L0 = x - y */
p448_bias ( &L0, 2 );
p448_weak_reduce( &L0 );
p448_mul ( &b->x, &b->t, &L0 );
p448_add ( &L0, &b->x, &L1 );
p448_sub ( &b->t, &L1, &b->x );
p448_bias ( &b->t, 2 );
p448_weak_reduce( &b->t );
p448_mul ( &b->x, &L0, &b->u );
L2 = p448_is_zero( &b->y ); /* mask: a->z - a->x == 0 */
L3 = - L2; /* assumes mask_t is all-ones/zero, so this is 1 or 0 -- TODO confirm */
p448_addw ( &b->x, L3 );
p448_weak_reduce( &b->x );
p448_mul ( &b->y, &b->t, &b->u );
L2 = p448_is_zero( &b->z ); /* mask: a->z - a->y == 0 */
L3 = - L2;
p448_addw ( &b->y, L3 );
p448_weak_reduce( &b->y );
L3 = p448_is_zero( &a->y );
L2 = L3 + 1; /* with an all-ones/zero mask this is 0 when a->y == 0, else 1 */
p448_set_ui( &b->z, L2 );
p448_copy ( &b->t, &b->x );
p448_copy ( &b->u, &b->y );
}

mask_t
is_square (
const struct p448_t* x
) {
mask_t L2, L3;
struct p448_t L0, L1;
p448_isr ( &L0, x );
p448_sqr ( &L1, &L0 );
p448_mul ( &L0, x, &L1 );
p448_subw ( &L0, 1 );
p448_bias ( &L0, 1 );
L3 = p448_is_zero( &L0 );
L2 = p448_is_zero( x );
return L3 | L2;
}

mask_t
is_even_pt (
const struct extensible_t* a
) {
struct p448_t L0, L1, L2;
p448_sqr ( &L2, &a->z );
p448_sqr ( &L1, &a->x );
p448_sub ( &L0, &L2, &L1 );
p448_bias ( &L0, 2 );
p448_weak_reduce( &L0 );
return is_square ( &L0 );
}

mask_t
is_even_tw (
const struct tw_extensible_t* a
) {
struct p448_t L0, L1, L2;
p448_sqr ( &L2, &a->z );
p448_sqr ( &L1, &a->x );
p448_add ( &L0, &L1, &L2 );
p448_weak_reduce( &L0 );
return is_square ( &L0 );
}

/**
 * Deserialize a field element into an affine point.
 *
 * Straight-line field arithmetic with one p448_isr call.  a->x, a->y and
 * L0..L3 are reused as scratch registers, so statement order matters.
 *
 * @param a  Output point; a->x and a->y are overwritten.
 * @param sz Serialized field element (called s in the comments below).
 * @return The p448_is_zero mask of (check value - 1), i.e. whether the
 *         value multiplied out after the isr came back as 1.
 */
mask_t
deserialize_affine (
struct affine_t* a,
const struct p448_t* sz
) {
struct p448_t L0, L1, L2, L3;
p448_sqr ( &L1, sz ); /* L1 = s^2, kept until near the end */
p448_copy ( &L3, &L1 );
p448_addw ( &L3, 1 );
p448_sqr ( &a->x, &L3 ); /* (s^2+1)^2 */
p448_mulw ( &L3, &a->x, 39082 );
p448_neg ( &a->x, &L3 ); /* -39082*(s^2+1)^2 */
p448_add ( &L3, &L1, &L1 );
p448_bias ( &L3, 1 );
p448_add ( &a->y, &L3, &L3 ); /* 4*s^2 (plus bias) */
p448_add ( &L3, &a->y, &a->x ); /* L3 = 4*s^2 - 39082*(s^2+1)^2 */
p448_weak_reduce( &L3 );
p448_copy ( &a->y, &L1 );
p448_subw ( &a->y, 1 );
p448_neg ( &a->x, &a->y ); /* a->x = 1 - s^2 */
p448_bias ( &a->x, 2 );
p448_weak_reduce( &a->x );
p448_mul ( &a->y, &a->x, &L3 );
p448_sqr ( &L2, &a->x ); /* L2 = (1-s^2)^2 */
p448_mul ( &L0, &L2, &a->y );
p448_mul ( &a->y, &a->x, &L0 ); /* argument of the isr */
p448_isr ( &L3, &a->y ); /* presumably 1/sqrt -- confirm against p448_isr */
p448_mul ( &a->y, &L2, &L3 );
p448_sqr ( &L2, &L3 );
p448_mul ( &L3, &L0, &L2 );
p448_mul ( &L0, &a->x, &L3 ); /* L0 = check value, compared with 1 below */
p448_add ( &L2, &a->y, &a->y );
p448_mul ( &a->x, sz, &L2 ); /* final x */
p448_addw ( &L1, 1 ); /* L1 = s^2 + 1 */
p448_mul ( &a->y, &L1, &L3 ); /* final y */
p448_subw ( &L0, 1 );
p448_bias ( &L0, 1 );
return p448_is_zero( &L0 );
}

/**
 * Deserialize a field element directly into a twisted extensible point.
 *
 * Straight-line field arithmetic with one p448_isr call.  The fields of a
 * and the locals L0/L1 are reused as scratch registers, so statement order
 * matters.  On exit z = 1, t = x and u = y.
 *
 * @param a    Output point; every field is overwritten.
 * @param sdm1 Field constant multiplied into the computation (presumably
 *             sqrt(d-1), going by the name -- confirm with the caller).
 * @param sz   Serialized field element (called s in the comments below).
 * @return The p448_is_zero mask of (check value - 1), captured before the
 *         scratch field a->t is overwritten with the final t.
 */
mask_t
deserialize_and_twist_approx (
struct tw_extensible_t* a,
const struct p448_t* sdm1,
const struct p448_t* sz
) {
struct p448_t L0, L1;
p448_sqr ( &a->z, sz ); /* a->z = s^2, kept until near the end */
p448_copy ( &a->y, &a->z );
p448_addw ( &a->y, 1 );
p448_sqr ( &a->x, &a->y ); /* (s^2+1)^2 */
p448_mulw ( &a->y, &a->x, 39082 );
p448_neg ( &a->x, &a->y ); /* -39082*(s^2+1)^2 */
p448_add ( &a->y, &a->z, &a->z );
p448_bias ( &a->y, 1 );
p448_add ( &a->u, &a->y, &a->y ); /* 4*s^2 (plus bias) */
p448_add ( &a->y, &a->u, &a->x ); /* a->y = 4*s^2 - 39082*(s^2+1)^2 */
p448_weak_reduce( &a->y );
p448_sqr ( &a->x, &a->z ); /* s^4 */
p448_subw ( &a->x, 1 );
p448_neg ( &a->u, &a->x ); /* a->u = 1 - s^4 */
p448_bias ( &a->u, 2 );
p448_weak_reduce( &a->u );
p448_mul ( &a->x, sdm1, &a->u );
p448_mul ( &L0, &a->x, &a->y );
p448_mul ( &a->t, &L0, &a->y );
p448_mul ( &a->u, &a->x, &a->t );
p448_mul ( &a->t, &a->u, &L0 );
p448_mul ( &a->y, &a->x, &a->t ); /* argument of the isr */
p448_isr ( &L0, &a->y ); /* presumably 1/sqrt -- confirm against p448_isr */
p448_mul ( &a->y, &a->u, &L0 );
p448_sqr ( &L1, &L0 );
p448_mul ( &a->u, &a->t, &L1 );
p448_mul ( &a->t, &a->x, &a->u ); /* a->t = check value, compared with 1 below */
p448_add ( &a->x, sz, sz ); /* 2*s */
p448_mul ( &L0, &a->u, &a->x );
p448_copy ( &a->x, &a->z );
p448_subw ( &a->x, 1 );
p448_neg ( &L1, &a->x ); /* L1 = 1 - s^2 */
p448_bias ( &L1, 2 );
p448_weak_reduce( &L1 );
p448_mul ( &a->x, &L1, &L0 ); /* final x */
p448_mul ( &L0, &a->u, &a->y );
p448_addw ( &a->z, 1 ); /* s^2 + 1 */
p448_mul ( &a->y, &a->z, &L0 ); /* final y */
p448_subw ( &a->t, 1 );
p448_bias ( &a->t, 1 );
mask_t ret = p448_is_zero( &a->t );
p448_set_ui( &a->z, 1 );
p448_copy ( &a->t, &a->x );
p448_copy ( &a->u, &a->y );
return ret;
}

/**
 * Set an extensible point to (x, y, z, t, u) = (0, 1, 1, 0, 0).
 *
 * @param a Point to overwrite.
 */
void
set_identity_extensible (
    struct extensible_t* a
) {
    /* Coordinates that are one. */
    p448_set_ui( &a->y, 1 );
    p448_set_ui( &a->z, 1 );

    /* Coordinates that are zero. */
    p448_set_ui( &a->x, 0 );
    p448_set_ui( &a->t, 0 );
    p448_set_ui( &a->u, 0 );
}

/**
 * Set a twisted extensible point to (x, y, z, t, u) = (0, 1, 1, 0, 0).
 *
 * @param a Point to overwrite.
 */
void
set_identity_tw_extensible (
    struct tw_extensible_t* a
) {
    /* Coordinates that are one. */
    p448_set_ui( &a->y, 1 );
    p448_set_ui( &a->z, 1 );

    /* Coordinates that are zero. */
    p448_set_ui( &a->x, 0 );
    p448_set_ui( &a->t, 0 );
    p448_set_ui( &a->u, 0 );
}

/**
 * Set an affine point to (x, y) = (0, 1).
 *
 * @param a Point to overwrite.
 */
void
set_identity_affine (
    struct affine_t* a
) {
    p448_set_ui( &a->y, 1 );
    p448_set_ui( &a->x, 0 );
}

mask_t
eq_affine (
const struct affine_t* a,
const struct affine_t* b
) {
mask_t L1, L2;
struct p448_t L0;
p448_sub ( &L0, &a->x, &b->x );
p448_bias ( &L0, 2 );
L2 = p448_is_zero( &L0 );
p448_sub ( &L0, &a->y, &b->y );
p448_bias ( &L0, 2 );
L1 = p448_is_zero( &L0 );
return L2 & L1;
}

mask_t
eq_extensible (
const struct extensible_t* a,
const struct extensible_t* b
) {
mask_t L3, L4;
struct p448_t L0, L1, L2;
p448_mul ( &L2, &b->z, &a->x );
p448_mul ( &L1, &a->z, &b->x );
p448_sub ( &L0, &L2, &L1 );
p448_bias ( &L0, 2 );
L4 = p448_is_zero( &L0 );
p448_mul ( &L2, &b->z, &a->y );
p448_mul ( &L1, &a->z, &b->y );
p448_sub ( &L0, &L2, &L1 );
p448_bias ( &L0, 2 );
L3 = p448_is_zero( &L0 );
return L4 & L3;
}

mask_t
eq_tw_extensible (
const struct tw_extensible_t* a,
const struct tw_extensible_t* b
) {
mask_t L3, L4;
struct p448_t L0, L1, L2;
p448_mul ( &L2, &b->z, &a->x );
p448_mul ( &L1, &a->z, &b->x );
p448_sub ( &L0, &L2, &L1 );
p448_bias ( &L0, 2 );
L4 = p448_is_zero( &L0 );
p448_mul ( &L2, &b->z, &a->y );
p448_mul ( &L1, &a->z, &b->y );
p448_sub ( &L0, &L2, &L1 );
p448_bias ( &L0, 2 );
L3 = p448_is_zero( &L0 );
return L4 & L3;
}

/**
 * Map a field element to an affine point (an Elligator-2-style injection,
 * per the function name).
 *
 * Long straight-line field arithmetic with one p448_isr call.  a->x, a->y
 * and L2..L8 are reused as scratch registers, so statement order matters.
 *
 * @param a Output point; a->x and a->y are overwritten.
 * @param r Input field element.
 */
void
elligator_2s_inject (
struct affine_t* a,
const struct p448_t* r
) {
mask_t L0, L1;
struct p448_t L2, L3, L4, L5, L6, L7, L8;
p448_sqr ( &a->x, r ); /* a->x = r^2, kept until L6 copies it below */
p448_sqr ( &L3, &a->x ); /* L3 = r^4 */
p448_copy ( &a->y, &L3 );
p448_subw ( &a->y, 1 );
p448_neg ( &L4, &a->y ); /* L4 = 1 - r^4 */
p448_bias ( &L4, 2 );
p448_weak_reduce( &L4 );
p448_sqr ( &L2, &L4 ); /* L2 = (1 - r^4)^2 */
p448_mulw ( &L7, &L2, 1527402724 );
p448_mulw ( &L8, &L3, 6108985600 ); /* constant exceeds 32 bits; p448_mulw takes uint64_t */
p448_add ( &a->y, &L8, &L7 );
p448_weak_reduce( &a->y );
p448_mulw ( &L8, &L2, 6109454568 );
p448_sub ( &L7, &a->y, &L8 );
p448_bias ( &L7, 2 );
p448_weak_reduce( &L7 );
p448_mulw ( &L6, &a->y, 78160 );
p448_mul ( &L5, &L7, &L6 );
p448_mul ( &L8, &L5, &L4 );
p448_mul ( &L4, &L5, &L6 );
p448_mul ( &L5, &L7, &L8 );
p448_mul ( &L8, &L5, &L4 );
p448_mul ( &L4, &L7, &L8 ); /* argument of the isr */
p448_isr ( &L6, &L4 ); /* presumably 1/sqrt -- confirm against p448_isr */
p448_mul ( &L4, &L5, &L6 );
p448_sqr ( &L5, &L6 );
p448_mul ( &L6, &L8, &L5 );
p448_mul ( &L8, &L7, &L6 ); /* L8 is tested for zero at the end */
p448_mul ( &L7, &L8, &L6 );
p448_copy ( &L6, &a->x );
p448_subw ( &L6, 1 ); /* L6 = r^2 - 1 */
p448_addw ( &a->x, 1 ); /* a->x = r^2 + 1 */
p448_mul ( &L5, &a->x, &L8 );
p448_sub ( &a->x, &L6, &L5 );
p448_bias ( &a->x, 3 );
p448_weak_reduce( &a->x );
p448_mul ( &L5, &L4, &a->x );
p448_mulw ( &L4, &L5, 78160 );
p448_neg ( &a->x, &L4 ); /* final x */
p448_bias ( &a->x, 2 );
p448_weak_reduce( &a->x );
p448_add ( &L4, &L3, &L3 );
p448_add ( &L3, &L4, &L2 );
p448_subw ( &L3, 2 );
p448_bias ( &L3, 1 );
p448_weak_reduce( &L3 );
p448_mul ( &L2, &L3, &L8 );
p448_mulw ( &L3, &L2, 3054649120 );
p448_add ( &L2, &L3, &a->y );
p448_mul ( &a->y, &L7, &L2 ); /* final y, before the zero-case adjustment */
L1 = p448_is_zero( &L8 );
L0 = - L1; /* assumes mask_t is all-ones/zero, so this is 1 or 0 -- TODO confirm */
p448_addw ( &a->y, L0 ); /* adds L0 to the low limb of y when L8 == 0 */
p448_weak_reduce( &a->y );
}

/**
 * Check that an affine point satisfies the curve equation.
 *
 * Evaluates x^2 + y^2 - 1 + 39081*x^2*y^2 and tests it for zero.
 *
 * @param a Point to validate.
 * @return The p448_is_zero mask of that expression.
 */
mask_t
validate_affine (
const struct affine_t* a
) {
struct p448_t L0, L1, L2, L3;
p448_sqr ( &L0, &a->y ); /* L0 = y^2 */
p448_sqr ( &L2, &a->x ); /* L2 = x^2 */
p448_add ( &L3, &L2, &L0 );
p448_subw ( &L3, 1 ); /* L3 = x^2 + y^2 - 1 */
p448_mulw ( &L1, &L2, 39081 );
p448_neg ( &L2, &L1 ); /* L2 = -39081*x^2 */
p448_bias ( &L2, 2 );
p448_mul ( &L1, &L0, &L2 ); /* L1 = -39081*x^2*y^2 */
p448_sub ( &L0, &L3, &L1 ); /* L0 = x^2 + y^2 - 1 + 39081*x^2*y^2 */
p448_bias ( &L0, 3 );
return p448_is_zero( &L0 );
}

/**
 * Check the two invariants of a twisted extensible point.
 *
 * In the code below the curve constant d enters as -39081: the mulw by
 * 39081 is followed by a negation.
 *
 * @param ext Point to validate.
 * @return The AND of the p448_is_zero masks of the two invariants.
 */
mask_t
validate_tw_extensible (
const struct tw_extensible_t* ext
) {
mask_t L4, L5;
struct p448_t L0, L1, L2, L3;
/*
* Check invariant:
* 0 = -x*y + z*t*u
*/
p448_mul ( &L1, &ext->t, &ext->u );
p448_mul ( &L2, &ext->z, &L1 ); /* L2 = z*t*u */
p448_addw ( &L2, 0 ); /* adds zero: no effect on the value */
p448_mul ( &L0, &ext->x, &ext->y );
p448_neg ( &L1, &L0 ); /* L1 = -x*y */
p448_add ( &L0, &L1, &L2 );
p448_bias ( &L0, 2 );
L5 = p448_is_zero( &L0 );
/*
* Check invariant:
* 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2
*/
p448_sqr ( &L2, &ext->y );
p448_neg ( &L1, &L2 ); /* L1 = -y^2 */
p448_addw ( &L1, 0 ); /* adds zero: no effect on the value */
p448_sqr ( &L0, &ext->x );
p448_add ( &L2, &L0, &L1 ); /* L2 = x^2 - y^2 */
p448_sqr ( &L3, &ext->u );
p448_sqr ( &L0, &ext->t );
p448_mul ( &L1, &L0, &L3 ); /* L1 = t^2*u^2 */
p448_mulw ( &L0, &L1, 39081 );
p448_neg ( &L3, &L0 ); /* L3 = -39081*t^2*u^2 */
p448_add ( &L0, &L3, &L2 );
p448_neg ( &L3, &L1 ); /* L3 = -t^2*u^2 */
p448_add ( &L2, &L3, &L0 );
p448_sqr ( &L1, &ext->z );
p448_add ( &L0, &L1, &L2 ); /* full left-hand side of the invariant */
p448_bias ( &L0, 4 );
L4 = p448_is_zero( &L0 );
return L5 & L4;
}

/**
 * Check the two invariants of an extensible point.
 *
 * In the code below the curve constant d enters as -39081: the mulw by
 * 39081 is followed by a negation.
 *
 * @param ext Point to validate.
 * @return The AND of the p448_is_zero masks of the two invariants.
 */
mask_t
validate_extensible (
const struct extensible_t* ext
) {
mask_t L4, L5;
struct p448_t L0, L1, L2, L3;
/*
* Check invariant:
* 0 = d*t^2*u^2 - x^2 - y^2 + z^2
*/
p448_sqr ( &L2, &ext->y );
p448_neg ( &L1, &L2 ); /* L1 = -y^2 */
p448_addw ( &L1, 0 ); /* adds zero: no effect on the value */
p448_sqr ( &L0, &ext->z );
p448_add ( &L2, &L0, &L1 ); /* L2 = z^2 - y^2 */
p448_sqr ( &L3, &ext->u );
p448_sqr ( &L0, &ext->t );
p448_mul ( &L1, &L0, &L3 ); /* L1 = t^2*u^2 */
p448_mulw ( &L3, &L1, 39081 );
p448_neg ( &L0, &L3 ); /* L0 = -39081*t^2*u^2 */
p448_add ( &L1, &L0, &L2 );
p448_sqr ( &L0, &ext->x );
p448_neg ( &L2, &L0 ); /* L2 = -x^2 */
p448_add ( &L0, &L2, &L1 ); /* full left-hand side of the invariant */
p448_bias ( &L0, 4 );
L5 = p448_is_zero( &L0 );
/*
* Check invariant:
* 0 = -x*y + z*t*u
*/
p448_mul ( &L1, &ext->t, &ext->u );
p448_mul ( &L2, &ext->z, &L1 ); /* L2 = z*t*u */
p448_addw ( &L2, 0 ); /* adds zero: no effect on the value */
p448_mul ( &L0, &ext->x, &ext->y );
p448_neg ( &L1, &L0 ); /* L1 = -x*y */
p448_add ( &L0, &L1, &L2 );
p448_bias ( &L0, 2 );
L4 = p448_is_zero( &L0 );
return L5 & L4;
}



+ 1207
- 0
src/arch_neon_experimental/p448.c
File diff suppressed because it is too large
View File


+ 376
- 0
src/arch_neon_experimental/p448.h View File

@@ -0,0 +1,376 @@
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#ifndef __P448_H__
#define __P448_H__ 1

#include "word.h"

#include <stdint.h>
#include <assert.h>

/*
 * Field element: sixteen 32-bit limbs, 32-byte aligned so the struct can be
 * accessed through vector-typed pointers.  The inline functions in this
 * header treat each limb as a 28-bit digit (see the (1<<28)-1 masks and the
 * shifts by 28), leaving the upper bits as carry headroom.
 */
typedef struct p448_t {
uint32_t limb[16];
} __attribute__((aligned(32))) p448_t;

/*
 * LIMBPERM(x) maps an index x in 0..15 to an array slot: 0..7 land on the
 * even slots 0,2,...,14 and 8..15 on the odd slots 1,3,...,15, so slots 2i
 * and 2i+1 hold indices i and i+8.  USE_NEON_PERM is a flag that other
 * sources test with #ifdef to select constants written in this limb order.
 */
#define LIMBPERM(x) (((x)<<1 | (x)>>3) & 15)
#define USE_NEON_PERM 1

#ifdef __cplusplus
extern "C" {
#endif

static __inline__ void
p448_set_ui (
p448_t *out,
uint64_t x
) __attribute__((unused,always_inline));
static __inline__ void
p448_cond_swap (
p448_t *a,
p448_t *b,
mask_t do_swap
) __attribute__((unused,always_inline));

static __inline__ void
p448_add (
p448_t *out,
const p448_t *a,
const p448_t *b
) __attribute__((unused,always_inline));
static __inline__ void
p448_sub (
p448_t *out,
const p448_t *a,
const p448_t *b
) __attribute__((unused,always_inline));
static __inline__ void
p448_neg (
p448_t *out,
const p448_t *a
) __attribute__((unused,always_inline));
static __inline__ void
p448_cond_neg (
p448_t *a,
mask_t doNegate
) __attribute__((unused,always_inline));

static __inline__ void
p448_addw (
p448_t *a,
uint32_t x
) __attribute__((unused,always_inline));
static __inline__ void
p448_subw (
p448_t *a,
uint32_t x
) __attribute__((unused,always_inline));
static __inline__ void
p448_copy (
p448_t *out,
const p448_t *a
) __attribute__((unused,always_inline));
static __inline__ void
p448_weak_reduce (
p448_t *inout
) __attribute__((unused,always_inline));
void
p448_strong_reduce (
p448_t *inout
);

mask_t
p448_is_zero (
const p448_t *in
);
static __inline__ void
p448_bias (
p448_t *inout,
int amount
) __attribute__((unused,always_inline));

void
p448_mul (
p448_t *__restrict__ out,
const p448_t *a,
const p448_t *b
);

void
p448_mulw (
p448_t *__restrict__ out,
const p448_t *a,
uint64_t b
);

void
p448_sqr (
p448_t *__restrict__ out,
const p448_t *a
);
static __inline__ void
p448_sqrn (
p448_t *__restrict__ y,
const p448_t *x,
int n
) __attribute__((unused,always_inline));

void
p448_serialize (
uint8_t *serial,
const struct p448_t *x
);

mask_t
p448_deserialize (
p448_t *x,
const uint8_t serial[56]
);
static __inline__ void
p448_mask(
struct p448_t *a,
const struct p448_t *b,
mask_t mask
) __attribute__((unused,always_inline));

/**
* Returns 1/x.
*
* If x=0, returns 0.
*/
void
p448_inverse (
struct p448_t* a,
const struct p448_t* x
);
void
simultaneous_invert_p448 (
struct p448_t *__restrict__ out,
const struct p448_t *in,
unsigned int n
);

static inline mask_t
p448_eq (
const struct p448_t *a,
const struct p448_t *b
) __attribute__((always_inline,unused));

/* -------------- Inline functions begin here -------------- */

/**
 * Set a field element to a small unsigned integer.
 *
 * The value is split into 28-bit digits so that the whole uint64_t range is
 * represented exactly.  Storing x>>28 in a single 32-bit limb would silently
 * drop the top bits of any x >= 2^60.
 *
 * @param out Field element to overwrite.
 * @param x   Value to store.
 */
void
p448_set_ui (
    p448_t *out,
    uint64_t x
) {
    const uint32_t digit_mask = (1u<<28)-1;
    int i;
    for (i=0; i<16; i++) {
        out->limb[i] = 0;
    }
    /* Slots 0, 2 and 4 are LIMBPERM(0), LIMBPERM(1) and LIMBPERM(2): the
     * three least significant digits in this header's permuted limb order. */
    out->limb[0] = (uint32_t)x & digit_mask;
    out->limb[2] = (uint32_t)(x>>28) & digit_mask;
    out->limb[4] = (uint32_t)(x>>56);
}
/**
 * Conditionally swap two field elements without branching on the condition.
 *
 * @param a      First element.
 * @param b      Second element.
 * @param doswap Mask expanded by br_set_to_mask(); wherever its bits are set
 *               the contents of a and b are exchanged.
 */
void
p448_cond_swap (
    p448_t *a,
    p448_t *b,
    mask_t doswap
) {
    big_register_t *lhs = (big_register_t*)a;
    big_register_t *rhs = (big_register_t*)b;
    const big_register_t select = br_set_to_mask(doswap);
    unsigned int k;

    for (k=0; k<sizeof(*a)/sizeof(*lhs); k++) {
        /* XOR-swap trick: delta is a^b where selected, zero elsewhere. */
        const big_register_t delta = (lhs[k]^rhs[k]) & select;
        lhs[k] ^= delta;
        rhs[k] ^= delta;
    }
}

/**
 * Limb-wise addition, out = a + b, with no carry propagation.
 *
 * @param out Result; may be the same object as a or b.
 * @param a   First addend.
 * @param b   Second addend.
 */
void
p448_add (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
) {
    uint32xn_t *dst = (uint32xn_t*)out;
    const uint32xn_t *lhs = (const uint32xn_t*)a;
    const uint32xn_t *rhs = (const uint32xn_t*)b;
    unsigned int k;

    /* One vector of limbs per iteration. */
    for (k=0; k<sizeof(*out)/sizeof(uint32xn_t); k++) {
        dst[k] = lhs[k] + rhs[k];
    }
}

/**
 * Limb-wise subtraction, out = a - b, with no borrow propagation.
 *
 * Each 32-bit limb wraps independently; callers in this file follow the
 * call with p448_bias().
 *
 * @param out Result; may be the same object as a or b.
 * @param a   Minuend.
 * @param b   Subtrahend.
 */
void
p448_sub (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
) {
    uint32xn_t *dst = (uint32xn_t*)out;
    const uint32xn_t *lhs = (const uint32xn_t*)a;
    const uint32xn_t *rhs = (const uint32xn_t*)b;
    unsigned int k;

    /* One vector of limbs per iteration. */
    for (k=0; k<sizeof(*out)/sizeof(uint32xn_t); k++) {
        dst[k] = lhs[k] - rhs[k];
    }
}

/**
 * Limb-wise negation, out = -a, with no borrow propagation.
 *
 * Each 32-bit limb is negated independently (wrapping).
 *
 * @param out Result; may be the same object as a.
 * @param a   Element to negate.
 */
void
p448_neg (
    p448_t *out,
    const p448_t *a
) {
    uint32xn_t *dst = (uint32xn_t*)out;
    const uint32xn_t *src = (const uint32xn_t*)a;
    unsigned int k;

    /* One vector of limbs per iteration. */
    for (k=0; k<sizeof(*out)/sizeof(uint32xn_t); k++) {
        dst[k] = -src[k];
    }
}

/**
 * Conditionally negate a field element without branching on the condition.
 *
 * The negated, re-biased copy is always computed; the mask only chooses
 * which bits end up in a.
 *
 * @param a        Element to update in place.
 * @param doNegate Mask expanded by br_set_to_mask(); wherever its bits are
 *                 set a takes the negated value, elsewhere it is unchanged.
 */
void
p448_cond_neg(
    p448_t *a,
    mask_t doNegate
) {
    struct p448_t negated;
    big_register_t *dst = (big_register_t *)a;
    big_register_t *neg = (big_register_t*)&negated;
    const big_register_t select = br_set_to_mask(doNegate);
    unsigned int k;

    p448_neg(&negated, a);
    p448_bias(&negated, 2);

    for (k=0; k<sizeof(*a)/sizeof(*dst); k++) {
        dst[k] = (neg[k] & select) | (dst[k] & ~select);
    }
}

/**
 * Add a word to slot 0 of a field element; no carry is propagated.
 *
 * @param a Element to update in place.
 * @param x Value added to a->limb[0].
 */
void
p448_addw (
    p448_t *a,
    uint32_t x
) {
    uint32_t *low = &a->limb[0];
    *low = *low + x;
}
/**
 * Subtract a word from slot 0 of a field element; no borrow is propagated.
 *
 * @param a Element to update in place.
 * @param x Value subtracted from a->limb[0].
 */
void
p448_subw (
    p448_t *a,
    uint32_t x
) {
    uint32_t *low = &a->limb[0];
    *low = *low - x;
}

/**
 * Copy a field element.
 *
 * @param out Destination.
 * @param a   Source.
 */
void
p448_copy (
p448_t *out,
const p448_t *a
) {
*out = *a; /* plain struct assignment copies all limbs */
}

/**
 * Add amt times a fixed per-limb pattern to a field element.
 *
 * Every slot receives amt*(2^28-1), except slot 1, which receives
 * amt*(2^28-1) - amt.  Callers in this file invoke it after subtractions
 * and negations.
 *
 * @param a   Element to update in place.
 * @param amt Multiplier for the pattern.
 */
void
p448_bias (
    p448_t *a,
    int amt
) {
    uint32_t full = ((1ull<<28)-1)*amt, reduced = full-amt;
    uint32x4_t first = {full,reduced,full,full}, rest = {full,full,full,full};
    uint32x4_t *vec = (uint32x4_t*) a;
    int k;

    /* Only the first vector contains the special slot. */
    vec[0] += first;
    for (k=1; k<4; k++) {
        vec[k] += rest;
    }
}

/**
 * Carry pass: move the bits above bit 27 of every limb into its successor.
 *
 * The element is viewed as eight 2-lane vectors.  With the slot order given
 * by LIMBPERM, vector i holds digits i (lane 0) and i+8 (lane 1), so both
 * halves of the number are carried in parallel.
 *
 * @param a Element to update in place.
 */
void
p448_weak_reduce (
p448_t *a
) {

/* vmask keeps the low 28 bits of each lane; vm2 selects lane 1 only;
 * tmp holds the carries out of the top vector (digits 7 and 15). */
uint32x2_t *aa = (uint32x2_t*) a, vmask = {(1ull<<28)-1, (1ull<<28)-1}, vm2 = {0,-1},
tmp = vshr_n_u32(aa[7],28);
int i;
/* Walk from the top down so aa[i-1] is still unmasked when its carry is
 * read: aa[i] = low 28 bits of aa[i] + (aa[i-1] >> 28). */
for (i=7; i>=1; i--) {
aa[i] = vsra_n_u32(aa[i] & vmask, aa[i-1], 28);
}
/* Wrap-around: vrev64 swaps the lanes of tmp, so lane 0 (digit 0) receives
 * the carry from digit 15 and lane 1 (digit 8) receives the carry from
 * digit 7; tmp&vm2 additionally adds the digit-15 carry into lane 1. */
aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp&vm2);
}

/**
 * Square a field element n times: y = x^(2^n).
 *
 * A scratch element is used so that no p448_sqr call has the same object as
 * input and output.
 *
 * @param y Result; must not alias x.
 * @param x Input element.
 * @param n Number of squarings; asserted to be positive.
 */
void
p448_sqrn (
    p448_t *__restrict__ y,
    const p448_t *x,
    int n
) {
    p448_t scratch;
    assert(n>0);

    /* Peel off one or two squarings so an even count remains. */
    if (n&1) {
        p448_sqr(y,x);
        n -= 1;
    } else {
        p448_sqr(&scratch,x);
        p448_sqr(y,&scratch);
        n -= 2;
    }

    /* Remaining squarings in pairs, bouncing through the scratch element. */
    while (n) {
        p448_sqr(&scratch,y);
        p448_sqr(y,&scratch);
        n -= 2;
    }
}

mask_t
p448_eq (
const struct p448_t *a,
const struct p448_t *b
) {
struct p448_t ra, rb;
p448_copy(&ra, a);
p448_copy(&rb, b);
p448_weak_reduce(&ra);
p448_weak_reduce(&rb);
p448_sub(&ra, &ra, &rb);
p448_bias(&ra, 2);
return p448_is_zero(&ra);
}

/**
 * Masked copy: a = b AND mask, limb by limb.
 *
 * @param a    Destination.
 * @param b    Source.
 * @param mask Value ANDed into every limb of b.
 */
void
p448_mask (
    struct p448_t *a,
    const struct p448_t *b,
    mask_t mask
) {
    const unsigned int nlimbs = sizeof(*a)/sizeof(a->limb[0]);
    unsigned int k = 0;

    while (k < nlimbs) {
        a->limb[k] = b->limb[k] & mask;
        k++;
    }
}

#ifdef __cplusplus
}; /* extern "C" */
#endif

#endif /* __P448_H__ */

+ 2
- 2
src/crandom.c View File

@@ -466,7 +466,7 @@ crandom_generate(
unsigned long long copy = (length > state->fill) ? state->fill : length; unsigned long long copy = (length > state->fill) ? state->fill : length;
state->fill -= copy; state->fill -= copy;
memcpy(output, state->buffer + state->fill, copy); memcpy(output, state->buffer + state->fill, copy);
memset(state->buffer + state->fill, 0, copy);
really_memset(state->buffer + state->fill, 0, copy);
output += copy; length -= copy; output += copy; length -= copy;
} }


@@ -484,5 +484,5 @@ crandom_destroy(
*/ */
} }


memset(state, 0, sizeof(*state));
really_memset(state, 0, sizeof(*state));
} }

+ 5
- 5
src/goldilocks.c View File

@@ -340,7 +340,7 @@ goldilocks_sign (
word_t skw[GOLDI_FIELD_WORDS]; word_t skw[GOLDI_FIELD_WORDS];
mask_t succ = barrett_deserialize(skw,privkey->opaque,&curve_prime_order); mask_t succ = barrett_deserialize(skw,privkey->opaque,&curve_prime_order);
if (!succ) { if (!succ) {
memset(skw,0,sizeof(skw));
really_memset(skw,0,sizeof(skw));
return GOLDI_ECORRUPT; return GOLDI_ECORRUPT;
} }
@@ -389,9 +389,9 @@ goldilocks_sign (
memcpy(signature_out, signature_tmp, GOLDI_FIELD_BYTES); memcpy(signature_out, signature_tmp, GOLDI_FIELD_BYTES);
barrett_serialize(signature_out+GOLDI_FIELD_BYTES, tk, GOLDI_FIELD_BYTES); barrett_serialize(signature_out+GOLDI_FIELD_BYTES, tk, GOLDI_FIELD_BYTES);
memset((unsigned char *)tk,0,sizeof(tk));
memset((unsigned char *)skw,0,sizeof(skw));
memset((unsigned char *)challenge,0,sizeof(challenge));
really_memset((unsigned char *)tk,0,sizeof(tk));
really_memset((unsigned char *)skw,0,sizeof(skw));
really_memset((unsigned char *)challenge,0,sizeof(challenge));
/* response = 2(nonce_secret - sk*challenge) /* response = 2(nonce_secret - sk*challenge)
* Nonce = 8[nonce_secret]*G * Nonce = 8[nonce_secret]*G
@@ -494,7 +494,7 @@ goldilocks_destroy_precomputed_public_key (
) { ) {
if (!precom) return; if (!precom) return;
destroy_fixed_base(&precom->table); destroy_fixed_base(&precom->table);
memset(&precom->pub.opaque, 0, sizeof(precom->pub));
really_memset(&precom->pub.opaque, 0, sizeof(precom->pub));
free(precom); free(precom);
} }




+ 26
- 2
src/include/word.h View File

@@ -146,11 +146,17 @@ typedef word_t vecmask_t __attribute__((vector_size(32)));
} }
#endif #endif


#if __AVX2__ || __SSE2__
#if __AVX2__
static __inline__ big_register_t static __inline__ big_register_t
br_is_zero(big_register_t x) { br_is_zero(big_register_t x) {
return (big_register_t)(x == br_set_to_mask(0)); return (big_register_t)(x == br_set_to_mask(0));
} }
#elif __SSE2__
static __inline__ big_register_t
br_is_zero(big_register_t x) {
return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128());
//return (big_register_t)(x == br_set_to_mask(0));
}
#elif __ARM_NEON__ #elif __ARM_NEON__
static __inline__ big_register_t static __inline__ big_register_t
br_is_zero(big_register_t x) { br_is_zero(big_register_t x) {
@@ -179,7 +185,25 @@ static inline uint64_t
letoh64 (uint64_t x) { return x; } letoh64 (uint64_t x) { return x; }
#endif #endif



/**
 * Really call memset, in a way that prevents the compiler from optimizing it out.
 *
 * memset_s is used only when the library advertises Annex K
 * (__STDC_LIB_EXT1__) AND the including translation unit asked for it by
 * defining __STDC_WANT_LIB_EXT1__ to a nonzero value before <string.h>;
 * without that request memset_s is not declared.  Otherwise the bytes are
 * written through a volatile pointer, which the compiler may not elide.
 *
 * @param p The object to zeroize.
 * @param c The char to set it to (probably zero).
 * @param s The size of the object.
 */
#if defined(__STDC_LIB_EXT1__) && defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__
static __inline__ void __attribute__((always_inline,unused))
really_memset(void *p, char c, size_t s) {
    /* The return value only reports constraint violations; nothing useful
     * can be done about one here. */
    (void)memset_s(p,s,c,s);
}
#else
static __inline__ void __attribute__((always_inline,unused))
really_memset(void *p, char c, size_t s) {
    volatile char *pv = (volatile char *)p;
    size_t i;
    for (i=0; i<s; i++) pv[i] = c;
}
#endif


/** /**
* Allocate memory which is sufficiently aligned to be used for the * Allocate memory which is sufficiently aligned to be used for the


+ 13
- 0
src/magic.c View File

@@ -27,11 +27,17 @@ const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = {
}; };


const struct affine_t goldilocks_base_point = { const struct affine_t goldilocks_base_point = {
#ifdef USE_NEON_PERM
{{ 0xaed939f,0xc59d070,0xf0de840,0x5f065c3, 0xf4ba0c7,0xdf73324,0xc170033,0x3a6a26a,
0x4c63d96,0x4609845,0xf3932d9,0x1b4faff, 0x6147eaa,0xa2692ff,0x9cecfa9,0x297ea0e
}},
#else
{{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7), {{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7),
U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa), U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa),
U58LE(0x5f065c3c59d070), U58LE(0x3a6a26adf73324), U58LE(0x5f065c3c59d070), U58LE(0x3a6a26adf73324),
U58LE(0x1b4faff4609845), U58LE(0x297ea0ea2692ff) U58LE(0x1b4faff4609845), U58LE(0x297ea0ea2692ff)
}}, }},
#endif
{{ 19 }} {{ 19 }}
}; };


@@ -50,6 +56,12 @@ const struct barrett_prime_t curve_prime_order = {


const struct field_t const struct field_t
sqrt_d_minus_1 = {{ sqrt_d_minus_1 = {{
#ifdef USE_NEON_PERM
0x6749f46,0x24d9770,0xd2e2183,0xa49f7b4,
0xb4f0179,0x8c5f656,0x888db42,0xdcac462,
0xbdeea38,0x748734a,0x5a189aa,0x49443b8,
0x6f14c06,0x0b25b7a,0x51e65ca,0x12fec0c
#else
U58LE(0xd2e21836749f46), U58LE(0xd2e21836749f46),
U58LE(0x888db42b4f0179), U58LE(0x888db42b4f0179),
U58LE(0x5a189aabdeea38), U58LE(0x5a189aabdeea38),
@@ -58,4 +70,5 @@ sqrt_d_minus_1 = {{
U58LE(0xdcac4628c5f656), U58LE(0xdcac4628c5f656),
U58LE(0x49443b8748734a), U58LE(0x49443b8748734a),
U58LE(0x12fec0c0b25b7a) U58LE(0x12fec0c0b25b7a)
#endif
}}; }};

+ 18
- 10
src/scalarmul.c View File

@@ -63,7 +63,15 @@ cond_negate_tw_pniels (
cond_negate_tw_niels(&n->n, doNegate); cond_negate_tw_niels(&n->n, doNegate);
} }


static __inline__ void
#if (defined(__GNUC__) && !defined(__clang__) && defined(__x86_64__) && !defined(__AVX2__))
/* This works around an apparent compiler bug in GCC, thanks Samuel Neves */
static void __attribute__((optimize("O1")))
#ifdef __OPTIMIZE_SIZE__
#warning "There's a bug in here somewhere with GCC -Os on non-AVX2 platforms"
#endif
#else
static __inline__ void
#endif
constant_time_lookup_tw_pniels ( constant_time_lookup_tw_pniels (
struct tw_pniels_t *out, struct tw_pniels_t *out,
const struct tw_pniels_t *in, const struct tw_pniels_t *in,
@@ -76,7 +84,7 @@ constant_time_lookup_tw_pniels (
int j; int j;
unsigned int k; unsigned int k;
memset(out, 0, sizeof(*out));
really_memset(out, 0, sizeof(*out));
for (j=0; j<nin; j++, big_i-=big_one) { for (j=0; j<nin; j++, big_i-=big_one) {
big_register_t mask = br_is_zero(big_i); big_register_t mask = br_is_zero(big_i);
for (k=0; k<sizeof(*out)/sizeof(*o); k++) { for (k=0; k<sizeof(*out)/sizeof(*o); k++) {
@@ -98,7 +106,7 @@ constant_time_lookup_tw_niels (
int j; int j;
unsigned int k; unsigned int k;
memset(out, 0, sizeof(*out));
really_memset(out, 0, sizeof(*out));
for (j=0; j<nin; j++, big_i-=big_one) { for (j=0; j<nin; j++, big_i-=big_one) {
big_register_t mask = br_is_zero(big_i); big_register_t mask = br_is_zero(big_i);
for (k=0; k<sizeof(*out)/sizeof(*o); k++) { for (k=0; k<sizeof(*out)/sizeof(*o); k++) {
@@ -449,7 +457,7 @@ precompute_fixed_base (
struct tw_niels_t *prealloc struct tw_niels_t *prealloc
) { ) {
if (s < 1 || t < 1 || n < 1 || n*t*s < SCALAR_BITS) { if (s < 1 || t < 1 || n < 1 || n*t*s < SCALAR_BITS) {
memset(out, 0, sizeof(*out));
really_memset(out, 0, sizeof(*out));
return 0; return 0;
} }
@@ -478,8 +486,8 @@ precompute_fixed_base (
free(doubles); free(doubles);
free(zs); free(zs);
free(zis); free(zis);
memset(out, 0, sizeof(*out));
memset(table, 0, sizeof(*table) * (n<<(t-1)));
really_memset(out, 0, sizeof(*out));
really_memset(table, 0, sizeof(*table) * (n<<(t-1)));
if (!prealloc) free(table); if (!prealloc) free(table);
return 0; return 0;
} }
@@ -593,9 +601,9 @@ precompute_fixed_base (
free(zis); free(zis);


if (unlikely(!ret)) { if (unlikely(!ret)) {
memset(table, 0, sizeof(*table) * (n<<(t-1)));
really_memset(table, 0, sizeof(*table) * (n<<(t-1)));
if (!prealloc) free(table); if (!prealloc) free(table);
memset(out, 0, sizeof(*out));
really_memset(out, 0, sizeof(*out));
return 0; return 0;
} }


@@ -607,12 +615,12 @@ destroy_fixed_base (
struct fixed_base_table_t *table struct fixed_base_table_t *table
) { ) {
if (table->table) { if (table->table) {
memset(table->table,0,sizeof(*table->table)*(table->n<<(table->t-1)));
really_memset(table->table,0,sizeof(*table->table)*(table->n<<(table->t-1)));
} }
if (table->own_table) { if (table->own_table) {
free(table->table); free(table->table);
} }
memset(table,0,sizeof(*table));
really_memset(table,0,sizeof(*table));
} }


mask_t mask_t


+ 9
- 9
test/bench.c View File

@@ -108,33 +108,33 @@ int main(int argc, char **argv) {
q448_randomize(&crand, sk); q448_randomize(&crand, sk);
when = now(); when = now();
for (i=0; i<nbase*1000; i++) {
for (i=0; i<nbase*5000; i++) {
p448_mul(&c, &b, &a); p448_mul(&c, &b, &a);
} }
when = now() - when; when = now() - when;
printf("mul: %5.1fns\n", when * 1e9 / i); printf("mul: %5.1fns\n", when * 1e9 / i);
when = now(); when = now();
for (i=0; i<nbase*1000; i++) {
for (i=0; i<nbase*5000; i++) {
p448_sqr(&c, &a); p448_sqr(&c, &a);
} }
when = now() - when; when = now() - when;
printf("sqr: %5.1fns\n", when * 1e9 / i); printf("sqr: %5.1fns\n", when * 1e9 / i);
when = now(); when = now();
for (i=0; i<nbase*500; i++) {
p448_mul(&c, &b, &a);
p448_mul(&a, &b, &c);
for (i=0; i<nbase*5000; i++) {
p448_mulw(&c, &b, 1234562);
} }
when = now() - when; when = now() - when;
printf("mul dep: %5.1fns\n", when * 1e9 / i / 2);
printf("mulw: %5.1fns\n", when * 1e9 / i);
when = now(); when = now();
for (i=0; i<nbase*1000; i++) {
p448_mulw(&c, &b, 1234562);
for (i=0; i<nbase*500; i++) {
p448_mul(&c, &b, &a);
p448_mul(&a, &b, &c);
} }
when = now() - when; when = now() - when;
printf("mulw: %5.1fns\n", when * 1e9 / i);
printf("mul dep: %5.1fns\n", when * 1e9 / i / 2);
when = now(); when = now();
for (i=0; i<nbase*10; i++) { for (i=0; i<nbase*10; i++) {


+ 4
- 1
test/test.c View File

@@ -3,6 +3,9 @@
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>


#ifndef LIMBPERM
#define LIMBPERM(x) (x)
#endif


int failed_tests, n_tests, failed_this_test, running_a_test; int failed_tests, n_tests, failed_this_test, running_a_test;


@@ -87,7 +90,7 @@ void p448_print (
int j; int j;
printf("%s = 0x", descr); printf("%s = 0x", descr);
for (j=sizeof(*a)/sizeof(word_t)-1; j>=0; j--) { for (j=sizeof(*a)/sizeof(word_t)-1; j>=0; j--) {
printf(PRIxWORD58, b.limb[j]);
printf(PRIxWORD58, b.limb[LIMBPERM(j)]);
} }
printf("\n"); printf("\n");
} }


+ 6
- 1
test/test_arithmetic.c View File

@@ -170,7 +170,12 @@ int test_arithmetic () {
int bits = sizeof(word_t) * 448 / sizeof(p448_t); int bits = sizeof(word_t) * 448 / sizeof(p448_t);
for (j=0; j<ntests; j++) { for (j=0; j<ntests; j++) {
if (j&1) {
if (j<256) {
mpz_set_ui(x,0);
mpz_set_ui(y,0);
mpz_setbit(x,(j%16)*28);
mpz_setbit(y,(j/16)*28);
} else if (j&1) {
mpz_rrandomb(x, state, 448); mpz_rrandomb(x, state, 448);
mpz_rrandomb(y, state, 448); mpz_rrandomb(y, state, 448);
} else { } else {


Loading…
Cancel
Save