diff --git a/HISTORY.txt b/HISTORY.txt index 702513e..1f301e9 100644 --- a/HISTORY.txt +++ b/HISTORY.txt @@ -1,4 +1,52 @@ -May 3, 2104: +July 11, 2014: + This is mostly a cleanup release. + + Added CRANDOM_MIGHT_IS_MUST config flag (default: 1). When set, this + causes crandom to assume that all features in the target arch will + be available, instead of detecting them. This makes sense because + the rest of the Goldilocks code is not (yet?) able to detect features. + Also, I'd like to submit this to SUPERCOP eventually, and SUPERCOP won't + pass -DMUST_HAVE_XXX on the command line the way the Makefile here did. + + Flag EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES to disable the crandom + output buffer. This buffer improves performance (very marginally at + Goldilocks sizes), but can cause problems with forking and VM + snapshotting. By default, the buffer is now disabled. + + I've slightly tweaked the Elligator implementation (which is still + unused) to make it easier to invert. This makes anything using Elligator + (i.e. nothing) incompatible with previous releases. + + I've been factoring "magic" constants such as curve orders, window sizes, + etc into a few headers, to reduce the effort to port the code to other + primes, curves, etc. For example, I could test the Microsoft curves, and + something like: + x^2 + y^2 = 1 +- 5382[45] x^2 y^2 mod 2^480-2^240-1 + ("Goldeneye"? "Ridinghood"?) might be a reasonable thing to try for + 64-bit CPUs. + + In a similar vein, most of the internal code has been changed to say + "field" instead of p448, so that a future version of magic.h can decide + which field header to include. + + You can now `make bat` to create an eBAT in build/ed448-goldilocks. This + is only minimally tested, though, because SUPERCOP doesn't work on my + machine and I'm too lazy to reverse engineer it. 
It sets a new macro, + SUPERCOP_WONT_LET_ME_OPEN_FILES, which causes goldilocks_init() to fall + back to something horribly insecure if crandom_init_from_file raises + EMFILE. + + Slightly improved documentation. + + Removed some old commented-out code; restored the /* C-style */ comment + discipline. + + The AMD-64 version should now be GCC clean, at least for reasonably + recent GCC (tested on OS X.9.3, Haswell, gcc-4.9). + + History no longer says "2104". + +May 3, 2014: Minor changes to internal routines mean that this version is not compatible with the previous one. diff --git a/Makefile b/Makefile index 3efda9d..7050e90 100644 --- a/Makefile +++ b/Makefile @@ -39,7 +39,7 @@ endif ARCHFLAGS += -mcpu=cortex-a9 # FIXME GENFLAGS = -DN_TESTS_BASE=1000 # sooooo sloooooow else -ARCHFLAGS += -mssse3 -maes -mavx -mavx2 -DMUST_HAVE_AVX2 -mbmi2 #TODO +ARCHFLAGS += -maes -mavx2 -mbmi2 #TODO endif ifeq ($(CC),clang) @@ -48,26 +48,28 @@ endif ifeq (,$(findstring 64,$(ARCH))$(findstring gcc,$(CC))) # ARCHFLAGS += -m32 -ARCHFLAGS += -DGOLDI_FORCE_32_BIT=1 +XCFLAGS += -DGOLDI_FORCE_32_BIT=1 endif CFLAGS = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XCFLAGS) LDFLAGS = $(ARCHFLAGS) $(XLDFLAGS) ASFLAGS = $(ARCHFLAGS) -.PHONY: clean all test bench todo doc lib +.PHONY: clean all test bench todo doc lib bat .PRECIOUS: build/%.s HEADERS= Makefile $(shell find . 
-name "*.h") build/timestamp LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \ - build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o + build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o build/magic.o TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \ - build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o + build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o build/magic.o BENCHCOMPONENTS=build/bench.o +BATNAME=build/ed448-goldilocks + all: lib build/test build/bench scan: clean @@ -118,6 +120,19 @@ doc/timestamp: doc: Doxyfile doc/timestamp src/*.c src/include/*.h src/$(ARCH)/*.c src/$(ARCH)/*.h doxygen +bat: $(BATNAME) + +$(BATNAME): include/* src/* src/*/* + rm -fr $@ + for arch in src/arch*; do \ + mkdir -p $@/`basename $$arch`; \ + cp include/* src/*.c src/include/* $$arch/* $@/`basename $$arch`; \ + perl -p -i -e 's/.*endif.*GOLDILOCKS_CONFIG_H/#define SUPERCOP_WONT_LET_ME_OPEN_FILES 1\n\n$$&/' $@/`basename $$arch`/config.h; \ + done + echo 'Mike Hamburg' > $@/designers + echo 'Ed448-Goldilocks sign and dh' > $@/description + + todo:: @(find * -name '*.h'; find * -name '*.c') | xargs egrep --color=auto -w \ 'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE|MAGIC' @@ -139,4 +154,4 @@ test: build/test ./$< clean: - rm -fr build doc + rm -fr build doc $(BATNAME) diff --git a/TODO.txt b/TODO.txt index e1d05f2..df1a782 100644 --- a/TODO.txt +++ b/TODO.txt @@ -25,8 +25,8 @@ Important work items for Ed448-Goldilocks: * [DONE] Bugfix: make sure that init() and randomization are thread-safe. -* Security: check on deserialization that points are < p. - * Check also that they're nonzero or otherwise non-pathological? +* [DONE] Security: check on deserialization that points are < p. + * [NEEDS TESTING] Check also that they're nonzero or otherwise non-pathological? 
* Testing: * Corner-case testing @@ -39,16 +39,16 @@ Important work items for Ed448-Goldilocks: * Most functions now have warn on ignored return. * Safety: - * Check for init() if it's still required once we've done the above + * [DONE] Check for init() if it's still required once we've done the above * Decide what to do about RNG failures * abort * return error and zeroize * return error but continue if RNG is kind of mostly OK * Flexibility: decide which API options are good. - * Eg, should functions take nbits and table sizes? + * [DONE?] Eg, should functions take nbits and table sizes? - * Remove hardcoded adjustments from comb control. + * [DONE] Remove hardcoded adjustments from comb control. * These adjustments make the output wrong when it's not 450 bits. * Other slow Barrett fields? Montgomery fields? @@ -71,6 +71,7 @@ Important work items for Ed448-Goldilocks: * Portability: test and make clean with other compilers * Using a fair amount of __attribute__ code. + * [DONE] Should work for GCC now. * Portability: try to make the vector code as portable as possible * Currently using clang ext_vector_length. @@ -79,15 +80,15 @@ Important work items for Ed448-Goldilocks: * Portability: make the inner layers of the code 32-bit clean. * Write new versions of the field code. - * 28-bit limbs give less headroom for carries. - * Now have a vectorless ARM version; need NEON. + * [DONE] 28-bit limbs give less headroom for carries. + * [DONE] Now have a vectorless ARM version; need NEON. * Improve speed of 32-bit field code. - * Run through the SAGE tool to generate new bias & bound. + * [DONE] Run through the SAGE tool to generate new bias & bound. * [DONE] Portability: make the outer layers of the code 32-bit clean. -* Performance/flexibility: decide which parameters should be hard-coded. +* [DONE] Performance/flexibility: decide which parameters should be hard-coded. * Perhaps useful for comb precomputation. * Performance: Improve SHA512. 
@@ -120,4 +121,4 @@ Important work items for Ed448-Goldilocks: * Clear other TODO/FIXME/HACK/PERF items in the code -* Submit to SUPERCOP +* [DONE?] Submit to SUPERCOP diff --git a/src/arch_32/ec_point.c b/src/arch_32/ec_point.c index 823e43d..47c325c 100644 --- a/src/arch_32/ec_point.c +++ b/src/arch_32/ec_point.c @@ -380,55 +380,55 @@ serialize_montgomery ( const struct montgomery_t* a, const struct p448_t* sbz ) { - mask_t L0, L1, L2; - struct p448_t L3, L4, L5, L6; - p448_mul ( &L6, &a->z0, &a->zd ); - p448_sub ( &L4, &L6, &a->xd ); - p448_bias ( &L4, 2 ); - p448_weak_reduce( &L4 ); - p448_mul ( &L6, &a->za, &L4 ); - p448_mul ( &L5, &a->z0, &a->xd ); - p448_sub ( &L4, &L5, &a->zd ); - p448_bias ( &L4, 2 ); - p448_weak_reduce( &L4 ); - p448_mul ( &L3, &a->xa, &L4 ); - p448_add ( &L5, &L3, &L6 ); - p448_sub ( &L4, &L6, &L3 ); - p448_bias ( &L4, 2 ); - p448_weak_reduce( &L4 ); - p448_mul ( &L6, &L4, &L5 ); - p448_copy ( &L5, &a->z0 ); - p448_addw ( &L5, 1 ); - p448_sqr ( &L4, &L5 ); - p448_mulw ( &L5, &L4, 39082 ); - p448_neg ( &L4, &L5 ); - p448_add ( &L5, &a->z0, &a->z0 ); - p448_bias ( &L5, 1 ); - p448_add ( &L3, &L5, &L5 ); - p448_add ( &L5, &L3, &L4 ); - p448_weak_reduce( &L5 ); - p448_mul ( &L3, &a->xd, &L5 ); - L1 = p448_is_zero( &a->zd ); - L2 = - L1; - p448_mask ( &L4, &L3, L1 ); - p448_add ( &L5, &L4, &a->zd ); - L0 = ~ L1; - p448_mul ( &L4, sbz, &L6 ); - p448_addw ( &L4, L2 ); - p448_mul ( &L6, &L5, &L4 ); - p448_mul ( &L4, &L6, &L5 ); - p448_mul ( &L5, &L6, &a->xd ); - p448_mul ( &L6, &L4, &L5 ); - p448_isr ( &L3, &L6 ); - p448_mul ( &L5, &L4, &L3 ); - p448_sqr ( &L4, &L3 ); - p448_mul ( &L3, &L6, &L4 ); - p448_mask ( b, &L5, L0 ); - p448_subw ( &L3, 1 ); - p448_bias ( &L3, 1 ); - L1 = p448_is_zero( &L3 ); - L0 = p448_is_zero( sbz ); - return L1 | L0; + mask_t L4, L5, L6; + struct p448_t L0, L1, L2, L3; + p448_mul ( &L3, &a->z0, &a->zd ); + p448_sub ( &L1, &L3, &a->xd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L3, &a->za, &L1 
); + p448_mul ( &L2, &a->z0, &a->xd ); + p448_sub ( &L1, &L2, &a->zd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &a->xa, &L1 ); + p448_add ( &L2, &L0, &L3 ); + p448_sub ( &L1, &L3, &L0 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L3, &L1, &L2 ); + p448_copy ( &L2, &a->z0 ); + p448_addw ( &L2, 1 ); + p448_sqr ( &L1, &L2 ); + p448_mulw ( &L2, &L1, 39082 ); + p448_neg ( &L1, &L2 ); + p448_add ( &L2, &a->z0, &a->z0 ); + p448_bias ( &L2, 1 ); + p448_add ( &L0, &L2, &L2 ); + p448_add ( &L2, &L0, &L1 ); + p448_weak_reduce( &L2 ); + p448_mul ( &L0, &a->xd, &L2 ); + L5 = p448_is_zero( &a->zd ); + L6 = - L5; + p448_mask ( &L1, &L0, L5 ); + p448_add ( &L2, &L1, &a->zd ); + L4 = ~ L5; + p448_mul ( &L1, sbz, &L3 ); + p448_addw ( &L1, L6 ); + p448_mul ( &L3, &L2, &L1 ); + p448_mul ( &L1, &L3, &L2 ); + p448_mul ( &L2, &L3, &a->xd ); + p448_mul ( &L3, &L1, &L2 ); + p448_isr ( &L0, &L3 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, &L3, &L1 ); + p448_mask ( b, &L2, L4 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + L5 = p448_is_zero( &L0 ); + L4 = p448_is_zero( sbz ); + return L5 | L4; } void @@ -524,8 +524,8 @@ test_only_twist ( struct tw_extensible_t* b, const struct extensible_t* a ) { - mask_t L0, L1; - struct p448_t L2, L3; + mask_t L2, L3; + struct p448_t L0, L1; p448_sqr ( &b->u, &a->z ); p448_sqr ( &b->y, &a->x ); p448_sub ( &b->z, &b->u, &b->y ); @@ -541,35 +541,35 @@ test_only_twist ( p448_bias ( &b->z, 2 ); p448_weak_reduce( &b->z ); p448_mul ( &b->t, &b->z, &b->x ); - p448_mul ( &L3, &b->t, &b->u ); - p448_mul ( &b->x, &b->t, &L3 ); - p448_isr ( &L2, &b->x ); - p448_mul ( &b->u, &b->t, &L2 ); - p448_sqr ( &L3, &L2 ); - p448_mul ( &b->t, &b->x, &L3 ); - p448_add ( &b->x, &a->y, &a->x ); - p448_weak_reduce( &b->x ); - p448_sub ( &L2, &a->x, &a->y ); - p448_bias ( &L2, 2 ); - p448_weak_reduce( &L2 ); - p448_mul ( &L3, &b->t, &L2 ); - p448_add ( &L2, &L3, &b->x ); - p448_sub ( &b->t, &b->x, 
&L3 ); + p448_mul ( &L1, &b->t, &b->u ); + p448_mul ( &b->x, &b->t, &L1 ); + p448_isr ( &L0, &b->x ); + p448_mul ( &b->u, &b->t, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &b->t, &b->x, &L1 ); + p448_add ( &L1, &a->y, &a->x ); + p448_weak_reduce( &L1 ); + p448_sub ( &L0, &a->x, &a->y ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &b->x, &b->t, &L0 ); + p448_add ( &L0, &b->x, &L1 ); + p448_sub ( &b->t, &L1, &b->x ); p448_bias ( &b->t, 2 ); p448_weak_reduce( &b->t ); - p448_mul ( &b->x, &L2, &b->u ); - L0 = p448_is_zero( &b->y ); - L1 = - L0; - p448_addw ( &b->x, L1 ); + p448_mul ( &b->x, &L0, &b->u ); + L2 = p448_is_zero( &b->y ); + L3 = - L2; + p448_addw ( &b->x, L3 ); p448_weak_reduce( &b->x ); p448_mul ( &b->y, &b->t, &b->u ); - L0 = p448_is_zero( &b->z ); - L1 = - L0; - p448_addw ( &b->y, L1 ); + L2 = p448_is_zero( &b->z ); + L3 = - L2; + p448_addw ( &b->y, L3 ); p448_weak_reduce( &b->y ); - L1 = p448_is_zero( &a->y ); - L0 = L1 + 1; - p448_set_ui( &b->z, L0 ); + L3 = p448_is_zero( &a->y ); + L2 = L3 + 1; + p448_set_ui( &b->z, L2 ); p448_copy ( &b->t, &b->x ); p448_copy ( &b->u, &b->y ); } @@ -578,16 +578,16 @@ mask_t is_square ( const struct p448_t* x ) { - mask_t L0, L1; - struct p448_t L2, L3; - p448_isr ( &L2, x ); - p448_sqr ( &L3, &L2 ); - p448_mul ( &L2, x, &L3 ); - p448_subw ( &L2, 1 ); - p448_bias ( &L2, 1 ); - L1 = p448_is_zero( &L2 ); - L0 = p448_is_zero( x ); - return L1 | L0; + mask_t L2, L3; + struct p448_t L0, L1; + p448_isr ( &L0, x ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, x, &L1 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + L3 = p448_is_zero( &L0 ); + L2 = p448_is_zero( x ); + return L3 | L2; } mask_t @@ -744,15 +744,15 @@ eq_affine ( const struct affine_t* a, const struct affine_t* b ) { - mask_t L0, L1; - struct p448_t L2; - p448_sub ( &L2, &a->x, &b->x ); - p448_bias ( &L2, 2 ); - L1 = p448_is_zero( &L2 ); - p448_sub ( &L2, &a->y, &b->y ); - p448_bias ( &L2, 2 ); - L0 = p448_is_zero( &L2 ); - return L1 & L0; 
+ mask_t L1, L2; + struct p448_t L0; + p448_sub ( &L0, &a->x, &b->x ); + p448_bias ( &L0, 2 ); + L2 = p448_is_zero( &L0 ); + p448_sub ( &L0, &a->y, &b->y ); + p448_bias ( &L0, 2 ); + L1 = p448_is_zero( &L0 ); + return L2 & L1; } mask_t @@ -760,19 +760,19 @@ eq_extensible ( const struct extensible_t* a, const struct extensible_t* b ) { - mask_t L0, L1; - struct p448_t L2, L3, L4; - p448_mul ( &L4, &b->z, &a->x ); - p448_mul ( &L3, &a->z, &b->x ); - p448_sub ( &L2, &L4, &L3 ); - p448_bias ( &L2, 2 ); - L1 = p448_is_zero( &L2 ); - p448_mul ( &L4, &b->z, &a->y ); - p448_mul ( &L3, &a->z, &b->y ); - p448_sub ( &L2, &L4, &L3 ); - p448_bias ( &L2, 2 ); - L0 = p448_is_zero( &L2 ); - return L1 & L0; + mask_t L3, L4; + struct p448_t L0, L1, L2; + p448_mul ( &L2, &b->z, &a->x ); + p448_mul ( &L1, &a->z, &b->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + p448_mul ( &L2, &b->z, &a->y ); + p448_mul ( &L1, &a->z, &b->y ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L3 = p448_is_zero( &L0 ); + return L4 & L3; } mask_t @@ -780,19 +780,19 @@ eq_tw_extensible ( const struct tw_extensible_t* a, const struct tw_extensible_t* b ) { - mask_t L0, L1; - struct p448_t L2, L3, L4; - p448_mul ( &L4, &b->z, &a->x ); - p448_mul ( &L3, &a->z, &b->x ); - p448_sub ( &L2, &L4, &L3 ); - p448_bias ( &L2, 2 ); - L1 = p448_is_zero( &L2 ); - p448_mul ( &L4, &b->z, &a->y ); - p448_mul ( &L3, &a->z, &b->y ); - p448_sub ( &L2, &L4, &L3 ); - p448_bias ( &L2, 2 ); - L0 = p448_is_zero( &L2 ); - return L1 & L0; + mask_t L3, L4; + struct p448_t L0, L1, L2; + p448_mul ( &L2, &b->z, &a->x ); + p448_mul ( &L1, &a->z, &b->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + p448_mul ( &L2, &b->z, &a->y ); + p448_mul ( &L1, &a->z, &b->y ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L3 = p448_is_zero( &L0 ); + return L4 & L3; } void @@ -801,38 +801,41 @@ elligator_2s_inject ( const struct p448_t* r ) { mask_t L0, 
L1; - struct p448_t L2, L3, L4, L5, L6, L7, L8, L9; + struct p448_t L2, L3, L4, L5, L6, L7, L8; p448_sqr ( &a->x, r ); p448_sqr ( &L3, &a->x ); p448_copy ( &a->y, &L3 ); p448_subw ( &a->y, 1 ); - p448_neg ( &L9, &a->y ); - p448_bias ( &L9, 2 ); - p448_weak_reduce( &L9 ); - p448_sqr ( &L2, &L9 ); - p448_mulw ( &L8, &L2, 1527402724 ); - p448_mulw ( &L7, &L3, 6108985600 ); - p448_add ( &a->y, &L7, &L8 ); + p448_neg ( &L4, &a->y ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_sqr ( &L2, &L4 ); + p448_mulw ( &L7, &L2, 1527402724 ); + p448_mulw ( &L8, &L3, 6108985600 ); + p448_add ( &a->y, &L8, &L7 ); p448_weak_reduce( &a->y ); p448_mulw ( &L8, &L2, 6109454568 ); p448_sub ( &L7, &a->y, &L8 ); p448_bias ( &L7, 2 ); p448_weak_reduce( &L7 ); - p448_mulw ( &L4, &a->y, 78160 ); - p448_mul ( &L6, &L7, &L9 ); - p448_mul ( &L8, &L6, &L4 ); + p448_mulw ( &L6, &a->y, 78160 ); + p448_mul ( &L5, &L7, &L6 ); + p448_mul ( &L8, &L5, &L4 ); + p448_mul ( &L4, &L5, &L6 ); + p448_mul ( &L5, &L7, &L8 ); + p448_mul ( &L8, &L5, &L4 ); p448_mul ( &L4, &L7, &L8 ); - p448_isr ( &L5, &L4 ); - p448_mul ( &L4, &L6, &L5 ); - p448_sqr ( &L6, &L5 ); - p448_mul ( &L5, &L8, &L6 ); - p448_mul ( &L8, &L7, &L5 ); - p448_mul ( &L7, &L8, &L5 ); - p448_copy ( &L5, &a->x ); - p448_subw ( &L5, 1 ); + p448_isr ( &L6, &L4 ); + p448_mul ( &L4, &L5, &L6 ); + p448_sqr ( &L5, &L6 ); + p448_mul ( &L6, &L8, &L5 ); + p448_mul ( &L8, &L7, &L6 ); + p448_mul ( &L7, &L8, &L6 ); + p448_copy ( &L6, &a->x ); + p448_subw ( &L6, 1 ); p448_addw ( &a->x, 1 ); - p448_mul ( &L6, &a->x, &L8 ); - p448_sub ( &a->x, &L5, &L6 ); + p448_mul ( &L5, &a->x, &L8 ); + p448_sub ( &a->x, &L6, &L5 ); p448_bias ( &a->x, 3 ); p448_weak_reduce( &a->x ); p448_mul ( &L5, &L4, &a->x ); @@ -849,7 +852,7 @@ elligator_2s_inject ( p448_mulw ( &L3, &L2, 3054649120 ); p448_add ( &L2, &L3, &a->y ); p448_mul ( &a->y, &L7, &L2 ); - L1 = p448_is_zero( &L9 ); + L1 = p448_is_zero( &L8 ); L0 = - L1; p448_addw ( &a->y, L0 ); p448_weak_reduce( &a->y ); 
@@ -877,83 +880,83 @@ mask_t validate_tw_extensible ( const struct tw_extensible_t* ext ) { - mask_t L0, L1; - struct p448_t L2, L3, L4, L5; + mask_t L4, L5; + struct p448_t L0, L1, L2, L3; /* * Check invariant: * 0 = -x*y + z*t*u */ - p448_mul ( &L2, &ext->t, &ext->u ); - p448_mul ( &L4, &ext->z, &L2 ); - p448_addw ( &L4, 0 ); - p448_mul ( &L3, &ext->x, &ext->y ); - p448_neg ( &L2, &L3 ); - p448_add ( &L3, &L2, &L4 ); - p448_bias ( &L3, 2 ); - L1 = p448_is_zero( &L3 ); + p448_mul ( &L1, &ext->t, &ext->u ); + p448_mul ( &L2, &ext->z, &L1 ); + p448_addw ( &L2, 0 ); + p448_mul ( &L0, &ext->x, &ext->y ); + p448_neg ( &L1, &L0 ); + p448_add ( &L0, &L1, &L2 ); + p448_bias ( &L0, 2 ); + L5 = p448_is_zero( &L0 ); /* * Check invariant: * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 */ - p448_sqr ( &L4, &ext->y ); - p448_neg ( &L2, &L4 ); - p448_addw ( &L2, 0 ); - p448_sqr ( &L3, &ext->x ); - p448_add ( &L4, &L3, &L2 ); - p448_sqr ( &L5, &ext->u ); - p448_sqr ( &L3, &ext->t ); - p448_mul ( &L2, &L3, &L5 ); - p448_mulw ( &L3, &L2, 39081 ); - p448_neg ( &L5, &L3 ); - p448_add ( &L3, &L5, &L4 ); - p448_neg ( &L5, &L2 ); - p448_add ( &L4, &L5, &L3 ); - p448_sqr ( &L3, &ext->z ); - p448_add ( &L2, &L3, &L4 ); - p448_bias ( &L2, 4 ); - L0 = p448_is_zero( &L2 ); - return L1 & L0; + p448_sqr ( &L2, &ext->y ); + p448_neg ( &L1, &L2 ); + p448_addw ( &L1, 0 ); + p448_sqr ( &L0, &ext->x ); + p448_add ( &L2, &L0, &L1 ); + p448_sqr ( &L3, &ext->u ); + p448_sqr ( &L0, &ext->t ); + p448_mul ( &L1, &L0, &L3 ); + p448_mulw ( &L0, &L1, 39081 ); + p448_neg ( &L3, &L0 ); + p448_add ( &L0, &L3, &L2 ); + p448_neg ( &L3, &L1 ); + p448_add ( &L2, &L3, &L0 ); + p448_sqr ( &L1, &ext->z ); + p448_add ( &L0, &L1, &L2 ); + p448_bias ( &L0, 4 ); + L4 = p448_is_zero( &L0 ); + return L5 & L4; } mask_t validate_extensible ( const struct extensible_t* ext ) { - mask_t L0, L1; - struct p448_t L2, L3, L4, L5; + mask_t L4, L5; + struct p448_t L0, L1, L2, L3; /* * Check invariant: * 0 = d*t^2*u^2 - x^2 - y^2 + z^2 
*/ - p448_sqr ( &L4, &ext->y ); - p448_neg ( &L3, &L4 ); - p448_addw ( &L3, 0 ); - p448_sqr ( &L2, &ext->z ); - p448_add ( &L4, &L2, &L3 ); - p448_sqr ( &L5, &ext->u ); - p448_sqr ( &L2, &ext->t ); - p448_mul ( &L3, &L2, &L5 ); - p448_mulw ( &L5, &L3, 39081 ); - p448_neg ( &L2, &L5 ); - p448_add ( &L3, &L2, &L4 ); - p448_sqr ( &L2, &ext->x ); - p448_neg ( &L4, &L2 ); - p448_add ( &L2, &L4, &L3 ); - p448_bias ( &L2, 4 ); - L1 = p448_is_zero( &L2 ); + p448_sqr ( &L2, &ext->y ); + p448_neg ( &L1, &L2 ); + p448_addw ( &L1, 0 ); + p448_sqr ( &L0, &ext->z ); + p448_add ( &L2, &L0, &L1 ); + p448_sqr ( &L3, &ext->u ); + p448_sqr ( &L0, &ext->t ); + p448_mul ( &L1, &L0, &L3 ); + p448_mulw ( &L3, &L1, 39081 ); + p448_neg ( &L0, &L3 ); + p448_add ( &L1, &L0, &L2 ); + p448_sqr ( &L0, &ext->x ); + p448_neg ( &L2, &L0 ); + p448_add ( &L0, &L2, &L1 ); + p448_bias ( &L0, 4 ); + L5 = p448_is_zero( &L0 ); /* * Check invariant: * 0 = -x*y + z*t*u */ - p448_mul ( &L3, &ext->t, &ext->u ); - p448_mul ( &L4, &ext->z, &L3 ); - p448_addw ( &L4, 0 ); - p448_mul ( &L2, &ext->x, &ext->y ); - p448_neg ( &L3, &L2 ); - p448_add ( &L2, &L3, &L4 ); - p448_bias ( &L2, 2 ); - L0 = p448_is_zero( &L2 ); - return L1 & L0; + p448_mul ( &L1, &ext->t, &ext->u ); + p448_mul ( &L2, &ext->z, &L1 ); + p448_addw ( &L2, 0 ); + p448_mul ( &L0, &ext->x, &ext->y ); + p448_neg ( &L1, &L0 ); + p448_add ( &L0, &L1, &L2 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + return L5 & L4; } diff --git a/src/arch_32/p448.c b/src/arch_32/p448.c index d3b2956..e45778a 100644 --- a/src/arch_32/p448.c +++ b/src/arch_32/p448.c @@ -4,7 +4,6 @@ #include "word.h" #include "p448.h" -//#include "x86-64-arith.h" static inline mask_t __attribute__((always_inline)) is_zero ( @@ -27,13 +26,7 @@ p448_mul ( p448_t *__restrict__ cs, const p448_t *as, const p448_t *bs -) { - // p448_t ar, br; -// p448_copy(&ar,as); -// p448_copy(&br,bs); -// p448_weak_reduce(&ar); -// p448_weak_reduce(&br); - +) { const uint32_t *a = as->limb, *b = 
bs->limb; uint32_t *c = cs->limb; @@ -41,13 +34,7 @@ p448_mul ( uint32_t mask = (1<<28) - 1; uint32_t aa[8], bb[8]; - - /* For some reason clang doesn't vectorize this without prompting? */ - // unsigned int i; - // for (i=0; iz0, &a->zd ); - p448_sub ( &L4, &L6, &a->xd ); - p448_bias ( &L4, 2 ); - p448_weak_reduce( &L4 ); - p448_mul ( &L6, &a->za, &L4 ); - p448_mul ( &L5, &a->z0, &a->xd ); - p448_sub ( &L4, &L5, &a->zd ); - p448_bias ( &L4, 2 ); - p448_weak_reduce( &L4 ); - p448_mul ( &L3, &a->xa, &L4 ); - p448_add ( &L5, &L3, &L6 ); - p448_sub ( &L4, &L6, &L3 ); - p448_bias ( &L4, 2 ); - p448_weak_reduce( &L4 ); - p448_mul ( &L6, &L4, &L5 ); - p448_copy ( &L5, &a->z0 ); - p448_addw ( &L5, 1 ); - p448_sqr ( &L4, &L5 ); - p448_mulw ( &L5, &L4, 39082 ); - p448_neg ( &L4, &L5 ); - p448_add ( &L5, &a->z0, &a->z0 ); - p448_bias ( &L5, 1 ); - p448_add ( &L3, &L5, &L5 ); - p448_add ( &L5, &L3, &L4 ); - p448_weak_reduce( &L5 ); - p448_mul ( &L3, &a->xd, &L5 ); - L1 = p448_is_zero( &a->zd ); - L2 = - L1; - p448_mask ( &L4, &L3, L1 ); - p448_add ( &L5, &L4, &a->zd ); - L0 = ~ L1; - p448_mul ( &L4, sbz, &L6 ); - p448_addw ( &L4, L2 ); - p448_mul ( &L6, &L5, &L4 ); - p448_mul ( &L4, &L6, &L5 ); - p448_mul ( &L5, &L6, &a->xd ); - p448_mul ( &L6, &L4, &L5 ); - p448_isr ( &L3, &L6 ); - p448_mul ( &L5, &L4, &L3 ); - p448_sqr ( &L4, &L3 ); - p448_mul ( &L3, &L6, &L4 ); - p448_mask ( b, &L5, L0 ); - p448_subw ( &L3, 1 ); - p448_bias ( &L3, 1 ); - L1 = p448_is_zero( &L3 ); - L0 = p448_is_zero( sbz ); - return L1 | L0; + mask_t L4, L5, L6; + struct p448_t L0, L1, L2, L3; + p448_mul ( &L3, &a->z0, &a->zd ); + p448_sub ( &L1, &L3, &a->xd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L3, &a->za, &L1 ); + p448_mul ( &L2, &a->z0, &a->xd ); + p448_sub ( &L1, &L2, &a->zd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &a->xa, &L1 ); + p448_add ( &L2, &L0, &L3 ); + p448_sub ( &L1, &L3, &L0 ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + 
p448_mul ( &L3, &L1, &L2 ); + p448_copy ( &L2, &a->z0 ); + p448_addw ( &L2, 1 ); + p448_sqr ( &L1, &L2 ); + p448_mulw ( &L2, &L1, 39082 ); + p448_neg ( &L1, &L2 ); + p448_add ( &L2, &a->z0, &a->z0 ); + p448_bias ( &L2, 1 ); + p448_add ( &L0, &L2, &L2 ); + p448_add ( &L2, &L0, &L1 ); + p448_weak_reduce( &L2 ); + p448_mul ( &L0, &a->xd, &L2 ); + L5 = p448_is_zero( &a->zd ); + L6 = - L5; + p448_mask ( &L1, &L0, L5 ); + p448_add ( &L2, &L1, &a->zd ); + L4 = ~ L5; + p448_mul ( &L1, sbz, &L3 ); + p448_addw ( &L1, L6 ); + p448_mul ( &L3, &L2, &L1 ); + p448_mul ( &L1, &L3, &L2 ); + p448_mul ( &L2, &L3, &a->xd ); + p448_mul ( &L3, &L1, &L2 ); + p448_isr ( &L0, &L3 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, &L3, &L1 ); + p448_mask ( b, &L2, L4 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + L5 = p448_is_zero( &L0 ); + L4 = p448_is_zero( sbz ); + return L5 | L4; } void @@ -524,8 +524,8 @@ test_only_twist ( struct tw_extensible_t* b, const struct extensible_t* a ) { - mask_t L0, L1; - struct p448_t L2, L3; + mask_t L2, L3; + struct p448_t L0, L1; p448_sqr ( &b->u, &a->z ); p448_sqr ( &b->y, &a->x ); p448_sub ( &b->z, &b->u, &b->y ); @@ -541,35 +541,35 @@ test_only_twist ( p448_bias ( &b->z, 2 ); p448_weak_reduce( &b->z ); p448_mul ( &b->t, &b->z, &b->x ); - p448_mul ( &L3, &b->t, &b->u ); - p448_mul ( &b->x, &b->t, &L3 ); - p448_isr ( &L2, &b->x ); - p448_mul ( &b->u, &b->t, &L2 ); - p448_sqr ( &L3, &L2 ); - p448_mul ( &b->t, &b->x, &L3 ); - p448_add ( &b->x, &a->y, &a->x ); - p448_weak_reduce( &b->x ); - p448_sub ( &L2, &a->x, &a->y ); - p448_bias ( &L2, 2 ); - p448_weak_reduce( &L2 ); - p448_mul ( &L3, &b->t, &L2 ); - p448_add ( &L2, &L3, &b->x ); - p448_sub ( &b->t, &b->x, &L3 ); + p448_mul ( &L1, &b->t, &b->u ); + p448_mul ( &b->x, &b->t, &L1 ); + p448_isr ( &L0, &b->x ); + p448_mul ( &b->u, &b->t, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &b->t, &b->x, &L1 ); + p448_add ( &L1, &a->y, &a->x ); + p448_weak_reduce( &L1 ); + p448_sub ( 
&L0, &a->x, &a->y ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &b->x, &b->t, &L0 ); + p448_add ( &L0, &b->x, &L1 ); + p448_sub ( &b->t, &L1, &b->x ); p448_bias ( &b->t, 2 ); p448_weak_reduce( &b->t ); - p448_mul ( &b->x, &L2, &b->u ); - L0 = p448_is_zero( &b->y ); - L1 = - L0; - p448_addw ( &b->x, L1 ); + p448_mul ( &b->x, &L0, &b->u ); + L2 = p448_is_zero( &b->y ); + L3 = - L2; + p448_addw ( &b->x, L3 ); p448_weak_reduce( &b->x ); p448_mul ( &b->y, &b->t, &b->u ); - L0 = p448_is_zero( &b->z ); - L1 = - L0; - p448_addw ( &b->y, L1 ); + L2 = p448_is_zero( &b->z ); + L3 = - L2; + p448_addw ( &b->y, L3 ); p448_weak_reduce( &b->y ); - L1 = p448_is_zero( &a->y ); - L0 = L1 + 1; - p448_set_ui( &b->z, L0 ); + L3 = p448_is_zero( &a->y ); + L2 = L3 + 1; + p448_set_ui( &b->z, L2 ); p448_copy ( &b->t, &b->x ); p448_copy ( &b->u, &b->y ); } @@ -578,16 +578,16 @@ mask_t is_square ( const struct p448_t* x ) { - mask_t L0, L1; - struct p448_t L2, L3; - p448_isr ( &L2, x ); - p448_sqr ( &L3, &L2 ); - p448_mul ( &L2, x, &L3 ); - p448_subw ( &L2, 1 ); - p448_bias ( &L2, 1 ); - L1 = p448_is_zero( &L2 ); - L0 = p448_is_zero( x ); - return L1 | L0; + mask_t L2, L3; + struct p448_t L0, L1; + p448_isr ( &L0, x ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, x, &L1 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + L3 = p448_is_zero( &L0 ); + L2 = p448_is_zero( x ); + return L3 | L2; } mask_t @@ -744,15 +744,15 @@ eq_affine ( const struct affine_t* a, const struct affine_t* b ) { - mask_t L0, L1; - struct p448_t L2; - p448_sub ( &L2, &a->x, &b->x ); - p448_bias ( &L2, 2 ); - L1 = p448_is_zero( &L2 ); - p448_sub ( &L2, &a->y, &b->y ); - p448_bias ( &L2, 2 ); - L0 = p448_is_zero( &L2 ); - return L1 & L0; + mask_t L1, L2; + struct p448_t L0; + p448_sub ( &L0, &a->x, &b->x ); + p448_bias ( &L0, 2 ); + L2 = p448_is_zero( &L0 ); + p448_sub ( &L0, &a->y, &b->y ); + p448_bias ( &L0, 2 ); + L1 = p448_is_zero( &L0 ); + return L2 & L1; } mask_t @@ -760,19 +760,19 @@ 
eq_extensible ( const struct extensible_t* a, const struct extensible_t* b ) { - mask_t L0, L1; - struct p448_t L2, L3, L4; - p448_mul ( &L4, &b->z, &a->x ); - p448_mul ( &L3, &a->z, &b->x ); - p448_sub ( &L2, &L4, &L3 ); - p448_bias ( &L2, 2 ); - L1 = p448_is_zero( &L2 ); - p448_mul ( &L4, &b->z, &a->y ); - p448_mul ( &L3, &a->z, &b->y ); - p448_sub ( &L2, &L4, &L3 ); - p448_bias ( &L2, 2 ); - L0 = p448_is_zero( &L2 ); - return L1 & L0; + mask_t L3, L4; + struct p448_t L0, L1, L2; + p448_mul ( &L2, &b->z, &a->x ); + p448_mul ( &L1, &a->z, &b->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + p448_mul ( &L2, &b->z, &a->y ); + p448_mul ( &L1, &a->z, &b->y ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L3 = p448_is_zero( &L0 ); + return L4 & L3; } mask_t @@ -780,19 +780,19 @@ eq_tw_extensible ( const struct tw_extensible_t* a, const struct tw_extensible_t* b ) { - mask_t L0, L1; - struct p448_t L2, L3, L4; - p448_mul ( &L4, &b->z, &a->x ); - p448_mul ( &L3, &a->z, &b->x ); - p448_sub ( &L2, &L4, &L3 ); - p448_bias ( &L2, 2 ); - L1 = p448_is_zero( &L2 ); - p448_mul ( &L4, &b->z, &a->y ); - p448_mul ( &L3, &a->z, &b->y ); - p448_sub ( &L2, &L4, &L3 ); - p448_bias ( &L2, 2 ); - L0 = p448_is_zero( &L2 ); - return L1 & L0; + mask_t L3, L4; + struct p448_t L0, L1, L2; + p448_mul ( &L2, &b->z, &a->x ); + p448_mul ( &L1, &a->z, &b->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + p448_mul ( &L2, &b->z, &a->y ); + p448_mul ( &L1, &a->z, &b->y ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L3 = p448_is_zero( &L0 ); + return L4 & L3; } void @@ -801,38 +801,41 @@ elligator_2s_inject ( const struct p448_t* r ) { mask_t L0, L1; - struct p448_t L2, L3, L4, L5, L6, L7, L8, L9; + struct p448_t L2, L3, L4, L5, L6, L7, L8; p448_sqr ( &a->x, r ); p448_sqr ( &L3, &a->x ); p448_copy ( &a->y, &L3 ); p448_subw ( &a->y, 1 ); - p448_neg ( &L9, &a->y ); - p448_bias ( &L9, 2 ); - 
p448_weak_reduce( &L9 ); - p448_sqr ( &L2, &L9 ); - p448_mulw ( &L8, &L2, 1527402724 ); - p448_mulw ( &L7, &L3, 6108985600 ); - p448_add ( &a->y, &L7, &L8 ); + p448_neg ( &L4, &a->y ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_sqr ( &L2, &L4 ); + p448_mulw ( &L7, &L2, 1527402724 ); + p448_mulw ( &L8, &L3, 6108985600 ); + p448_add ( &a->y, &L8, &L7 ); p448_weak_reduce( &a->y ); p448_mulw ( &L8, &L2, 6109454568 ); p448_sub ( &L7, &a->y, &L8 ); p448_bias ( &L7, 2 ); p448_weak_reduce( &L7 ); - p448_mulw ( &L4, &a->y, 78160 ); - p448_mul ( &L6, &L7, &L9 ); - p448_mul ( &L8, &L6, &L4 ); + p448_mulw ( &L6, &a->y, 78160 ); + p448_mul ( &L5, &L7, &L6 ); + p448_mul ( &L8, &L5, &L4 ); + p448_mul ( &L4, &L5, &L6 ); + p448_mul ( &L5, &L7, &L8 ); + p448_mul ( &L8, &L5, &L4 ); p448_mul ( &L4, &L7, &L8 ); - p448_isr ( &L5, &L4 ); - p448_mul ( &L4, &L6, &L5 ); - p448_sqr ( &L6, &L5 ); - p448_mul ( &L5, &L8, &L6 ); - p448_mul ( &L8, &L7, &L5 ); - p448_mul ( &L7, &L8, &L5 ); - p448_copy ( &L5, &a->x ); - p448_subw ( &L5, 1 ); + p448_isr ( &L6, &L4 ); + p448_mul ( &L4, &L5, &L6 ); + p448_sqr ( &L5, &L6 ); + p448_mul ( &L6, &L8, &L5 ); + p448_mul ( &L8, &L7, &L6 ); + p448_mul ( &L7, &L8, &L6 ); + p448_copy ( &L6, &a->x ); + p448_subw ( &L6, 1 ); p448_addw ( &a->x, 1 ); - p448_mul ( &L6, &a->x, &L8 ); - p448_sub ( &a->x, &L5, &L6 ); + p448_mul ( &L5, &a->x, &L8 ); + p448_sub ( &a->x, &L6, &L5 ); p448_bias ( &a->x, 3 ); p448_weak_reduce( &a->x ); p448_mul ( &L5, &L4, &a->x ); @@ -849,7 +852,7 @@ elligator_2s_inject ( p448_mulw ( &L3, &L2, 3054649120 ); p448_add ( &L2, &L3, &a->y ); p448_mul ( &a->y, &L7, &L2 ); - L1 = p448_is_zero( &L9 ); + L1 = p448_is_zero( &L8 ); L0 = - L1; p448_addw ( &a->y, L0 ); p448_weak_reduce( &a->y ); @@ -877,83 +880,83 @@ mask_t validate_tw_extensible ( const struct tw_extensible_t* ext ) { - mask_t L0, L1; - struct p448_t L2, L3, L4, L5; + mask_t L4, L5; + struct p448_t L0, L1, L2, L3; /* * Check invariant: * 0 = -x*y + z*t*u */ - p448_mul ( 
&L2, &ext->t, &ext->u ); - p448_mul ( &L4, &ext->z, &L2 ); - p448_addw ( &L4, 0 ); - p448_mul ( &L3, &ext->x, &ext->y ); - p448_neg ( &L2, &L3 ); - p448_add ( &L3, &L2, &L4 ); - p448_bias ( &L3, 2 ); - L1 = p448_is_zero( &L3 ); + p448_mul ( &L1, &ext->t, &ext->u ); + p448_mul ( &L2, &ext->z, &L1 ); + p448_addw ( &L2, 0 ); + p448_mul ( &L0, &ext->x, &ext->y ); + p448_neg ( &L1, &L0 ); + p448_add ( &L0, &L1, &L2 ); + p448_bias ( &L0, 2 ); + L5 = p448_is_zero( &L0 ); /* * Check invariant: * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 */ - p448_sqr ( &L4, &ext->y ); - p448_neg ( &L2, &L4 ); - p448_addw ( &L2, 0 ); - p448_sqr ( &L3, &ext->x ); - p448_add ( &L4, &L3, &L2 ); - p448_sqr ( &L5, &ext->u ); - p448_sqr ( &L3, &ext->t ); - p448_mul ( &L2, &L3, &L5 ); - p448_mulw ( &L3, &L2, 39081 ); - p448_neg ( &L5, &L3 ); - p448_add ( &L3, &L5, &L4 ); - p448_neg ( &L5, &L2 ); - p448_add ( &L4, &L5, &L3 ); - p448_sqr ( &L3, &ext->z ); - p448_add ( &L2, &L3, &L4 ); - p448_bias ( &L2, 4 ); - L0 = p448_is_zero( &L2 ); - return L1 & L0; + p448_sqr ( &L2, &ext->y ); + p448_neg ( &L1, &L2 ); + p448_addw ( &L1, 0 ); + p448_sqr ( &L0, &ext->x ); + p448_add ( &L2, &L0, &L1 ); + p448_sqr ( &L3, &ext->u ); + p448_sqr ( &L0, &ext->t ); + p448_mul ( &L1, &L0, &L3 ); + p448_mulw ( &L0, &L1, 39081 ); + p448_neg ( &L3, &L0 ); + p448_add ( &L0, &L3, &L2 ); + p448_neg ( &L3, &L1 ); + p448_add ( &L2, &L3, &L0 ); + p448_sqr ( &L1, &ext->z ); + p448_add ( &L0, &L1, &L2 ); + p448_bias ( &L0, 4 ); + L4 = p448_is_zero( &L0 ); + return L5 & L4; } mask_t validate_extensible ( const struct extensible_t* ext ) { - mask_t L0, L1; - struct p448_t L2, L3, L4, L5; + mask_t L4, L5; + struct p448_t L0, L1, L2, L3; /* * Check invariant: * 0 = d*t^2*u^2 - x^2 - y^2 + z^2 */ - p448_sqr ( &L4, &ext->y ); - p448_neg ( &L3, &L4 ); - p448_addw ( &L3, 0 ); - p448_sqr ( &L2, &ext->z ); - p448_add ( &L4, &L2, &L3 ); - p448_sqr ( &L5, &ext->u ); - p448_sqr ( &L2, &ext->t ); - p448_mul ( &L3, &L2, &L5 ); - p448_mulw ( &L5, 
&L3, 39081 ); - p448_neg ( &L2, &L5 ); - p448_add ( &L3, &L2, &L4 ); - p448_sqr ( &L2, &ext->x ); - p448_neg ( &L4, &L2 ); - p448_add ( &L2, &L4, &L3 ); - p448_bias ( &L2, 4 ); - L1 = p448_is_zero( &L2 ); + p448_sqr ( &L2, &ext->y ); + p448_neg ( &L1, &L2 ); + p448_addw ( &L1, 0 ); + p448_sqr ( &L0, &ext->z ); + p448_add ( &L2, &L0, &L1 ); + p448_sqr ( &L3, &ext->u ); + p448_sqr ( &L0, &ext->t ); + p448_mul ( &L1, &L0, &L3 ); + p448_mulw ( &L3, &L1, 39081 ); + p448_neg ( &L0, &L3 ); + p448_add ( &L1, &L0, &L2 ); + p448_sqr ( &L0, &ext->x ); + p448_neg ( &L2, &L0 ); + p448_add ( &L0, &L2, &L1 ); + p448_bias ( &L0, 4 ); + L5 = p448_is_zero( &L0 ); /* * Check invariant: * 0 = -x*y + z*t*u */ - p448_mul ( &L3, &ext->t, &ext->u ); - p448_mul ( &L4, &ext->z, &L3 ); - p448_addw ( &L4, 0 ); - p448_mul ( &L2, &ext->x, &ext->y ); - p448_neg ( &L3, &L2 ); - p448_add ( &L2, &L3, &L4 ); - p448_bias ( &L2, 2 ); - L0 = p448_is_zero( &L2 ); - return L1 & L0; + p448_mul ( &L1, &ext->t, &ext->u ); + p448_mul ( &L2, &ext->z, &L1 ); + p448_addw ( &L2, 0 ); + p448_mul ( &L0, &ext->x, &ext->y ); + p448_neg ( &L1, &L0 ); + p448_add ( &L0, &L1, &L2 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + return L5 & L4; } diff --git a/src/arch_arm_32/p448.c b/src/arch_arm_32/p448.c index fa3c583..ec08fb8 100644 --- a/src/arch_arm_32/p448.c +++ b/src/arch_arm_32/p448.c @@ -4,7 +4,6 @@ #include "word.h" #include "p448.h" -//#include "x86-64-arith.h" static inline mask_t __attribute__((always_inline)) is_zero ( @@ -105,11 +104,6 @@ p448_mul ( const p448_t *as, const p448_t *bs ) { - // p448_t ar, br; -// p448_copy(&ar,as); -// p448_copy(&br,bs); -// p448_weak_reduce(&ar); -// p448_weak_reduce(&br); const uint32_t *a = as->limb, *b = bs->limb; uint32_t *c = cs->limb; @@ -119,12 +113,6 @@ p448_mul ( uint32_t aa[8], bm[8]; - /* For some reason clang doesn't vectorize this without prompting? 
*/ - // unsigned int i; - // for (i=0; ilimb; uint32_t *c = cs->limb; @@ -479,13 +461,7 @@ p448_sqr ( uint32_t mask = (1<<28) - 1; uint32_t bm[8]; - - /* For some reason clang doesn't vectorize this without prompting? */ - // unsigned int i; - // for (i=0; iz0, &a->zd ); - p448_sub ( &L4, &L6, &a->xd ); - p448_bias ( &L4, 2 ); - p448_weak_reduce( &L4 ); - p448_mul ( &L6, &a->za, &L4 ); - p448_mul ( &L5, &a->z0, &a->xd ); - p448_sub ( &L4, &L5, &a->zd ); - p448_bias ( &L4, 2 ); - p448_weak_reduce( &L4 ); - p448_mul ( &L3, &a->xa, &L4 ); - p448_add ( &L5, &L3, &L6 ); - p448_sub ( &L4, &L6, &L3 ); - p448_bias ( &L4, 2 ); - p448_weak_reduce( &L4 ); - p448_mul ( &L6, &L4, &L5 ); - p448_copy ( &L5, &a->z0 ); - p448_addw ( &L5, 1 ); - p448_sqr ( &L4, &L5 ); - p448_mulw ( &L5, &L4, 39082 ); - p448_neg ( &L4, &L5 ); - p448_add ( &L5, &a->z0, &a->z0 ); - p448_bias ( &L5, 1 ); - p448_add ( &L3, &L5, &L5 ); - p448_add ( &L5, &L3, &L4 ); - p448_weak_reduce( &L5 ); - p448_mul ( &L3, &a->xd, &L5 ); - L1 = p448_is_zero( &a->zd ); - L2 = - L1; - p448_mask ( &L4, &L3, L1 ); - p448_add ( &L5, &L4, &a->zd ); - L0 = ~ L1; - p448_mul ( &L4, sbz, &L6 ); - p448_addw ( &L4, L2 ); - p448_mul ( &L6, &L5, &L4 ); - p448_mul ( &L4, &L6, &L5 ); - p448_mul ( &L5, &L6, &a->xd ); - p448_mul ( &L6, &L4, &L5 ); - p448_isr ( &L3, &L6 ); - p448_mul ( &L5, &L4, &L3 ); - p448_sqr ( &L4, &L3 ); - p448_mul ( &L3, &L6, &L4 ); - p448_mask ( b, &L5, L0 ); - p448_subw ( &L3, 1 ); - p448_bias ( &L3, 1 ); - L1 = p448_is_zero( &L3 ); - L0 = p448_is_zero( sbz ); - return L1 | L0; + mask_t L4, L5, L6; + struct p448_t L0, L1, L2, L3; + p448_mul ( &L3, &a->z0, &a->zd ); + p448_sub ( &L1, &L3, &a->xd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L3, &a->za, &L1 ); + p448_mul ( &L2, &a->z0, &a->xd ); + p448_sub ( &L1, &L2, &a->zd ); + p448_bias ( &L1, 2 ); + p448_weak_reduce( &L1 ); + p448_mul ( &L0, &a->xa, &L1 ); + p448_add ( &L2, &L0, &L3 ); + p448_sub ( &L1, &L3, &L0 ); + p448_bias ( &L1, 2 ); 
+ p448_weak_reduce( &L1 ); + p448_mul ( &L3, &L1, &L2 ); + p448_copy ( &L2, &a->z0 ); + p448_addw ( &L2, 1 ); + p448_sqr ( &L1, &L2 ); + p448_mulw ( &L2, &L1, 39082 ); + p448_neg ( &L1, &L2 ); + p448_add ( &L2, &a->z0, &a->z0 ); + p448_bias ( &L2, 1 ); + p448_add ( &L0, &L2, &L2 ); + p448_add ( &L2, &L0, &L1 ); + p448_weak_reduce( &L2 ); + p448_mul ( &L0, &a->xd, &L2 ); + L5 = p448_is_zero( &a->zd ); + L6 = - L5; + p448_mask ( &L1, &L0, L5 ); + p448_add ( &L2, &L1, &a->zd ); + L4 = ~ L5; + p448_mul ( &L1, sbz, &L3 ); + p448_addw ( &L1, L6 ); + p448_mul ( &L3, &L2, &L1 ); + p448_mul ( &L1, &L3, &L2 ); + p448_mul ( &L2, &L3, &a->xd ); + p448_mul ( &L3, &L1, &L2 ); + p448_isr ( &L0, &L3 ); + p448_mul ( &L2, &L1, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, &L3, &L1 ); + p448_mask ( b, &L2, L4 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + L5 = p448_is_zero( &L0 ); + L4 = p448_is_zero( sbz ); + return L5 | L4; } void @@ -524,8 +524,8 @@ test_only_twist ( struct tw_extensible_t* b, const struct extensible_t* a ) { - mask_t L0, L1; - struct p448_t L2, L3; + mask_t L2, L3; + struct p448_t L0, L1; p448_sqr ( &b->u, &a->z ); p448_sqr ( &b->y, &a->x ); p448_sub ( &b->z, &b->u, &b->y ); @@ -541,35 +541,35 @@ test_only_twist ( p448_bias ( &b->z, 2 ); p448_weak_reduce( &b->z ); p448_mul ( &b->t, &b->z, &b->x ); - p448_mul ( &L3, &b->t, &b->u ); - p448_mul ( &b->x, &b->t, &L3 ); - p448_isr ( &L2, &b->x ); - p448_mul ( &b->u, &b->t, &L2 ); - p448_sqr ( &L3, &L2 ); - p448_mul ( &b->t, &b->x, &L3 ); - p448_add ( &b->x, &a->y, &a->x ); - p448_weak_reduce( &b->x ); - p448_sub ( &L2, &a->x, &a->y ); - p448_bias ( &L2, 2 ); - p448_weak_reduce( &L2 ); - p448_mul ( &L3, &b->t, &L2 ); - p448_add ( &L2, &L3, &b->x ); - p448_sub ( &b->t, &b->x, &L3 ); + p448_mul ( &L1, &b->t, &b->u ); + p448_mul ( &b->x, &b->t, &L1 ); + p448_isr ( &L0, &b->x ); + p448_mul ( &b->u, &b->t, &L0 ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &b->t, &b->x, &L1 ); + p448_add ( &L1, &a->y, &a->x ); + 
p448_weak_reduce( &L1 ); + p448_sub ( &L0, &a->x, &a->y ); + p448_bias ( &L0, 2 ); + p448_weak_reduce( &L0 ); + p448_mul ( &b->x, &b->t, &L0 ); + p448_add ( &L0, &b->x, &L1 ); + p448_sub ( &b->t, &L1, &b->x ); p448_bias ( &b->t, 2 ); p448_weak_reduce( &b->t ); - p448_mul ( &b->x, &L2, &b->u ); - L0 = p448_is_zero( &b->y ); - L1 = - L0; - p448_addw ( &b->x, L1 ); + p448_mul ( &b->x, &L0, &b->u ); + L2 = p448_is_zero( &b->y ); + L3 = - L2; + p448_addw ( &b->x, L3 ); p448_weak_reduce( &b->x ); p448_mul ( &b->y, &b->t, &b->u ); - L0 = p448_is_zero( &b->z ); - L1 = - L0; - p448_addw ( &b->y, L1 ); + L2 = p448_is_zero( &b->z ); + L3 = - L2; + p448_addw ( &b->y, L3 ); p448_weak_reduce( &b->y ); - L1 = p448_is_zero( &a->y ); - L0 = L1 + 1; - p448_set_ui( &b->z, L0 ); + L3 = p448_is_zero( &a->y ); + L2 = L3 + 1; + p448_set_ui( &b->z, L2 ); p448_copy ( &b->t, &b->x ); p448_copy ( &b->u, &b->y ); } @@ -578,16 +578,16 @@ mask_t is_square ( const struct p448_t* x ) { - mask_t L0, L1; - struct p448_t L2, L3; - p448_isr ( &L2, x ); - p448_sqr ( &L3, &L2 ); - p448_mul ( &L2, x, &L3 ); - p448_subw ( &L2, 1 ); - p448_bias ( &L2, 1 ); - L1 = p448_is_zero( &L2 ); - L0 = p448_is_zero( x ); - return L1 | L0; + mask_t L2, L3; + struct p448_t L0, L1; + p448_isr ( &L0, x ); + p448_sqr ( &L1, &L0 ); + p448_mul ( &L0, x, &L1 ); + p448_subw ( &L0, 1 ); + p448_bias ( &L0, 1 ); + L3 = p448_is_zero( &L0 ); + L2 = p448_is_zero( x ); + return L3 | L2; } mask_t @@ -744,15 +744,15 @@ eq_affine ( const struct affine_t* a, const struct affine_t* b ) { - mask_t L0, L1; - struct p448_t L2; - p448_sub ( &L2, &a->x, &b->x ); - p448_bias ( &L2, 2 ); - L1 = p448_is_zero( &L2 ); - p448_sub ( &L2, &a->y, &b->y ); - p448_bias ( &L2, 2 ); - L0 = p448_is_zero( &L2 ); - return L1 & L0; + mask_t L1, L2; + struct p448_t L0; + p448_sub ( &L0, &a->x, &b->x ); + p448_bias ( &L0, 2 ); + L2 = p448_is_zero( &L0 ); + p448_sub ( &L0, &a->y, &b->y ); + p448_bias ( &L0, 2 ); + L1 = p448_is_zero( &L0 ); + return L2 & L1; } 
mask_t @@ -760,19 +760,19 @@ eq_extensible ( const struct extensible_t* a, const struct extensible_t* b ) { - mask_t L0, L1; - struct p448_t L2, L3, L4; - p448_mul ( &L4, &b->z, &a->x ); - p448_mul ( &L3, &a->z, &b->x ); - p448_sub ( &L2, &L4, &L3 ); - p448_bias ( &L2, 2 ); - L1 = p448_is_zero( &L2 ); - p448_mul ( &L4, &b->z, &a->y ); - p448_mul ( &L3, &a->z, &b->y ); - p448_sub ( &L2, &L4, &L3 ); - p448_bias ( &L2, 2 ); - L0 = p448_is_zero( &L2 ); - return L1 & L0; + mask_t L3, L4; + struct p448_t L0, L1, L2; + p448_mul ( &L2, &b->z, &a->x ); + p448_mul ( &L1, &a->z, &b->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + p448_mul ( &L2, &b->z, &a->y ); + p448_mul ( &L1, &a->z, &b->y ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L3 = p448_is_zero( &L0 ); + return L4 & L3; } mask_t @@ -780,19 +780,19 @@ eq_tw_extensible ( const struct tw_extensible_t* a, const struct tw_extensible_t* b ) { - mask_t L0, L1; - struct p448_t L2, L3, L4; - p448_mul ( &L4, &b->z, &a->x ); - p448_mul ( &L3, &a->z, &b->x ); - p448_sub ( &L2, &L4, &L3 ); - p448_bias ( &L2, 2 ); - L1 = p448_is_zero( &L2 ); - p448_mul ( &L4, &b->z, &a->y ); - p448_mul ( &L3, &a->z, &b->y ); - p448_sub ( &L2, &L4, &L3 ); - p448_bias ( &L2, 2 ); - L0 = p448_is_zero( &L2 ); - return L1 & L0; + mask_t L3, L4; + struct p448_t L0, L1, L2; + p448_mul ( &L2, &b->z, &a->x ); + p448_mul ( &L1, &a->z, &b->x ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + p448_mul ( &L2, &b->z, &a->y ); + p448_mul ( &L1, &a->z, &b->y ); + p448_sub ( &L0, &L2, &L1 ); + p448_bias ( &L0, 2 ); + L3 = p448_is_zero( &L0 ); + return L4 & L3; } void @@ -801,38 +801,41 @@ elligator_2s_inject ( const struct p448_t* r ) { mask_t L0, L1; - struct p448_t L2, L3, L4, L5, L6, L7, L8, L9; + struct p448_t L2, L3, L4, L5, L6, L7, L8; p448_sqr ( &a->x, r ); p448_sqr ( &L3, &a->x ); p448_copy ( &a->y, &L3 ); p448_subw ( &a->y, 1 ); - p448_neg ( &L9, &a->y ); - 
p448_bias ( &L9, 2 ); - p448_weak_reduce( &L9 ); - p448_sqr ( &L2, &L9 ); - p448_mulw ( &L8, &L2, 1527402724 ); - p448_mulw ( &L7, &L3, 6108985600 ); - p448_add ( &a->y, &L7, &L8 ); + p448_neg ( &L4, &a->y ); + p448_bias ( &L4, 2 ); + p448_weak_reduce( &L4 ); + p448_sqr ( &L2, &L4 ); + p448_mulw ( &L7, &L2, 1527402724 ); + p448_mulw ( &L8, &L3, 6108985600 ); + p448_add ( &a->y, &L8, &L7 ); p448_weak_reduce( &a->y ); p448_mulw ( &L8, &L2, 6109454568 ); p448_sub ( &L7, &a->y, &L8 ); p448_bias ( &L7, 2 ); p448_weak_reduce( &L7 ); - p448_mulw ( &L4, &a->y, 78160 ); - p448_mul ( &L6, &L7, &L9 ); - p448_mul ( &L8, &L6, &L4 ); + p448_mulw ( &L6, &a->y, 78160 ); + p448_mul ( &L5, &L7, &L6 ); + p448_mul ( &L8, &L5, &L4 ); + p448_mul ( &L4, &L5, &L6 ); + p448_mul ( &L5, &L7, &L8 ); + p448_mul ( &L8, &L5, &L4 ); p448_mul ( &L4, &L7, &L8 ); - p448_isr ( &L5, &L4 ); - p448_mul ( &L4, &L6, &L5 ); - p448_sqr ( &L6, &L5 ); - p448_mul ( &L5, &L8, &L6 ); - p448_mul ( &L8, &L7, &L5 ); - p448_mul ( &L7, &L8, &L5 ); - p448_copy ( &L5, &a->x ); - p448_subw ( &L5, 1 ); + p448_isr ( &L6, &L4 ); + p448_mul ( &L4, &L5, &L6 ); + p448_sqr ( &L5, &L6 ); + p448_mul ( &L6, &L8, &L5 ); + p448_mul ( &L8, &L7, &L6 ); + p448_mul ( &L7, &L8, &L6 ); + p448_copy ( &L6, &a->x ); + p448_subw ( &L6, 1 ); p448_addw ( &a->x, 1 ); - p448_mul ( &L6, &a->x, &L8 ); - p448_sub ( &a->x, &L5, &L6 ); + p448_mul ( &L5, &a->x, &L8 ); + p448_sub ( &a->x, &L6, &L5 ); p448_bias ( &a->x, 3 ); p448_weak_reduce( &a->x ); p448_mul ( &L5, &L4, &a->x ); @@ -849,7 +852,7 @@ elligator_2s_inject ( p448_mulw ( &L3, &L2, 3054649120 ); p448_add ( &L2, &L3, &a->y ); p448_mul ( &a->y, &L7, &L2 ); - L1 = p448_is_zero( &L9 ); + L1 = p448_is_zero( &L8 ); L0 = - L1; p448_addw ( &a->y, L0 ); p448_weak_reduce( &a->y ); @@ -877,83 +880,83 @@ mask_t validate_tw_extensible ( const struct tw_extensible_t* ext ) { - mask_t L0, L1; - struct p448_t L2, L3, L4, L5; + mask_t L4, L5; + struct p448_t L0, L1, L2, L3; /* * Check invariant: * 0 = -x*y + 
z*t*u */ - p448_mul ( &L2, &ext->t, &ext->u ); - p448_mul ( &L4, &ext->z, &L2 ); - p448_addw ( &L4, 0 ); - p448_mul ( &L3, &ext->x, &ext->y ); - p448_neg ( &L2, &L3 ); - p448_add ( &L3, &L2, &L4 ); - p448_bias ( &L3, 2 ); - L1 = p448_is_zero( &L3 ); + p448_mul ( &L1, &ext->t, &ext->u ); + p448_mul ( &L2, &ext->z, &L1 ); + p448_addw ( &L2, 0 ); + p448_mul ( &L0, &ext->x, &ext->y ); + p448_neg ( &L1, &L0 ); + p448_add ( &L0, &L1, &L2 ); + p448_bias ( &L0, 2 ); + L5 = p448_is_zero( &L0 ); /* * Check invariant: * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 */ - p448_sqr ( &L4, &ext->y ); - p448_neg ( &L2, &L4 ); - p448_addw ( &L2, 0 ); - p448_sqr ( &L3, &ext->x ); - p448_add ( &L4, &L3, &L2 ); - p448_sqr ( &L5, &ext->u ); - p448_sqr ( &L3, &ext->t ); - p448_mul ( &L2, &L3, &L5 ); - p448_mulw ( &L3, &L2, 39081 ); - p448_neg ( &L5, &L3 ); - p448_add ( &L3, &L5, &L4 ); - p448_neg ( &L5, &L2 ); - p448_add ( &L4, &L5, &L3 ); - p448_sqr ( &L3, &ext->z ); - p448_add ( &L2, &L3, &L4 ); - p448_bias ( &L2, 4 ); - L0 = p448_is_zero( &L2 ); - return L1 & L0; + p448_sqr ( &L2, &ext->y ); + p448_neg ( &L1, &L2 ); + p448_addw ( &L1, 0 ); + p448_sqr ( &L0, &ext->x ); + p448_add ( &L2, &L0, &L1 ); + p448_sqr ( &L3, &ext->u ); + p448_sqr ( &L0, &ext->t ); + p448_mul ( &L1, &L0, &L3 ); + p448_mulw ( &L0, &L1, 39081 ); + p448_neg ( &L3, &L0 ); + p448_add ( &L0, &L3, &L2 ); + p448_neg ( &L3, &L1 ); + p448_add ( &L2, &L3, &L0 ); + p448_sqr ( &L1, &ext->z ); + p448_add ( &L0, &L1, &L2 ); + p448_bias ( &L0, 4 ); + L4 = p448_is_zero( &L0 ); + return L5 & L4; } mask_t validate_extensible ( const struct extensible_t* ext ) { - mask_t L0, L1; - struct p448_t L2, L3, L4, L5; + mask_t L4, L5; + struct p448_t L0, L1, L2, L3; /* * Check invariant: * 0 = d*t^2*u^2 - x^2 - y^2 + z^2 */ - p448_sqr ( &L4, &ext->y ); - p448_neg ( &L3, &L4 ); - p448_addw ( &L3, 0 ); - p448_sqr ( &L2, &ext->z ); - p448_add ( &L4, &L2, &L3 ); - p448_sqr ( &L5, &ext->u ); - p448_sqr ( &L2, &ext->t ); - p448_mul ( &L3, &L2, &L5 
); - p448_mulw ( &L5, &L3, 39081 ); - p448_neg ( &L2, &L5 ); - p448_add ( &L3, &L2, &L4 ); - p448_sqr ( &L2, &ext->x ); - p448_neg ( &L4, &L2 ); - p448_add ( &L2, &L4, &L3 ); - p448_bias ( &L2, 4 ); - L1 = p448_is_zero( &L2 ); + p448_sqr ( &L2, &ext->y ); + p448_neg ( &L1, &L2 ); + p448_addw ( &L1, 0 ); + p448_sqr ( &L0, &ext->z ); + p448_add ( &L2, &L0, &L1 ); + p448_sqr ( &L3, &ext->u ); + p448_sqr ( &L0, &ext->t ); + p448_mul ( &L1, &L0, &L3 ); + p448_mulw ( &L3, &L1, 39081 ); + p448_neg ( &L0, &L3 ); + p448_add ( &L1, &L0, &L2 ); + p448_sqr ( &L0, &ext->x ); + p448_neg ( &L2, &L0 ); + p448_add ( &L0, &L2, &L1 ); + p448_bias ( &L0, 4 ); + L5 = p448_is_zero( &L0 ); /* * Check invariant: * 0 = -x*y + z*t*u */ - p448_mul ( &L3, &ext->t, &ext->u ); - p448_mul ( &L4, &ext->z, &L3 ); - p448_addw ( &L4, 0 ); - p448_mul ( &L2, &ext->x, &ext->y ); - p448_neg ( &L3, &L2 ); - p448_add ( &L2, &L3, &L4 ); - p448_bias ( &L2, 2 ); - L0 = p448_is_zero( &L2 ); - return L1 & L0; + p448_mul ( &L1, &ext->t, &ext->u ); + p448_mul ( &L2, &ext->z, &L1 ); + p448_addw ( &L2, 0 ); + p448_mul ( &L0, &ext->x, &ext->y ); + p448_neg ( &L1, &L0 ); + p448_add ( &L0, &L1, &L2 ); + p448_bias ( &L0, 2 ); + L4 = p448_is_zero( &L0 ); + return L5 & L4; } diff --git a/src/arch_neon/neon_emulation.h b/src/arch_neon/neon_emulation.h index 6fecbc7..a97978c 100644 --- a/src/arch_neon/neon_emulation.h +++ b/src/arch_neon/neon_emulation.h @@ -8,9 +8,12 @@ * * This lets you test and debug NEON code on x86. 
*/ + #ifndef __NEON_EMULATION_H__ #define __NEON_EMULATION_H__ 1 +/** @cond internal */ + #include "word.h" #include @@ -147,4 +150,6 @@ static inline int64x2_t vmull_lane_s32 ( return xx*(lane?yy.yy:yy.xx); } +/** @endcond */ + #endif /* __NEON_EMULATION_H__ */ diff --git a/src/arch_neon/p448.c b/src/arch_neon/p448.c index 6cd78aa..fe69639 100644 --- a/src/arch_neon/p448.c +++ b/src/arch_neon/p448.c @@ -37,7 +37,7 @@ xx_vaddup_s64(int64x2_t x) { } #else #include "neon_emulation.h" -#endif // ARM_NEON +#endif /* ARM_NEON */ static inline void __attribute__((gnu_inline,always_inline)) smlal ( @@ -75,12 +75,6 @@ smull2 ( *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; } -// static inline int64x2_t copy_now(int64x2_t x) { -// int64x2_t y; -// __asm__ ("vmov %0, %1" : "=w"(y) : "w"(x)); -// return y; -// } - void p448_mul ( p448_t *__restrict__ cs, diff --git a/src/arch_x86_64/ec_point.c b/src/arch_x86_64/ec_point.c index 87df79f..1fba091 100644 --- a/src/arch_x86_64/ec_point.c +++ b/src/arch_x86_64/ec_point.c @@ -356,51 +356,51 @@ serialize_montgomery ( const struct montgomery_t* a, const struct p448_t* sbz ) { - struct p448_t L0, L1, L2, L3; - mask_t L4, L5, L6; - p448_mul ( &L3, &a->z0, &a->zd ); - p448_sub ( &L1, &L3, &a->xd ); - p448_bias ( &L1, 2 ); - p448_mul ( &L3, &a->za, &L1 ); - p448_mul ( &L2, &a->z0, &a->xd ); - p448_sub ( &L1, &L2, &a->zd ); - p448_bias ( &L1, 2 ); - p448_mul ( &L2, &a->xa, &L1 ); - p448_add ( &L1, &L2, &L3 ); - p448_sub ( &L0, &L3, &L2 ); - p448_bias ( &L0, 2 ); - p448_mul ( &L3, &L0, &L1 ); - p448_copy ( &L2, &a->z0 ); - p448_addw ( &L2, 1 ); - p448_sqr ( &L1, &L2 ); - p448_mulw ( &L2, &L1, 39082 ); - p448_neg ( &L1, &L2 ); - p448_add ( &L0, &a->z0, &a->z0 ); - p448_bias ( &L0, 1 ); - p448_add ( &L2, &L0, &L0 ); - p448_add ( &L0, &L2, &L1 ); - p448_mul ( &L2, &a->xd, &L0 ); - L5 = p448_is_zero( &a->zd ); - L6 = - L5; - p448_mask ( &L1, &L2, L5 ); - p448_add ( &L2, &L1, &a->zd ); - L4 = ~ L5; - p448_mul ( &L1, sbz, &L3 ); - 
p448_addw ( &L1, L6 ); - p448_mul ( &L3, &L2, &L1 ); - p448_mul ( &L1, &L3, &L2 ); - p448_mul ( &L2, &L3, &a->xd ); - p448_mul ( &L3, &L1, &L2 ); - p448_isr ( &L0, &L3 ); - p448_mul ( &L2, &L1, &L0 ); - p448_sqr ( &L1, &L0 ); - p448_mul ( &L0, &L3, &L1 ); - p448_mask ( b, &L2, L4 ); - p448_subw ( &L0, 1 ); - p448_bias ( &L0, 1 ); - L5 = p448_is_zero( &L0 ); - L4 = p448_is_zero( sbz ); - return L5 | L4; + mask_t L0, L1, L2; + struct p448_t L3, L4, L5, L6; + p448_mul ( &L6, &a->z0, &a->zd ); + p448_sub ( &L4, &L6, &a->xd ); + p448_bias ( &L4, 2 ); + p448_mul ( &L6, &a->za, &L4 ); + p448_mul ( &L5, &a->z0, &a->xd ); + p448_sub ( &L4, &L5, &a->zd ); + p448_bias ( &L4, 2 ); + p448_mul ( &L3, &a->xa, &L4 ); + p448_add ( &L5, &L3, &L6 ); + p448_sub ( &L4, &L6, &L3 ); + p448_bias ( &L4, 2 ); + p448_mul ( &L6, &L4, &L5 ); + p448_copy ( &L5, &a->z0 ); + p448_addw ( &L5, 1 ); + p448_sqr ( &L4, &L5 ); + p448_mulw ( &L5, &L4, 39082 ); + p448_neg ( &L4, &L5 ); + p448_add ( &L3, &a->z0, &a->z0 ); + p448_bias ( &L3, 1 ); + p448_add ( &L5, &L3, &L3 ); + p448_add ( &L3, &L5, &L4 ); + p448_mul ( &L5, &a->xd, &L3 ); + L1 = p448_is_zero( &a->zd ); + L2 = - L1; + p448_mask ( &L4, &L5, L1 ); + p448_add ( &L5, &L4, &a->zd ); + L0 = ~ L1; + p448_mul ( &L4, sbz, &L6 ); + p448_addw ( &L4, L2 ); + p448_mul ( &L6, &L5, &L4 ); + p448_mul ( &L4, &L6, &L5 ); + p448_mul ( &L5, &L6, &a->xd ); + p448_mul ( &L6, &L4, &L5 ); + p448_isr ( &L3, &L6 ); + p448_mul ( &L5, &L4, &L3 ); + p448_sqr ( &L4, &L3 ); + p448_mul ( &L3, &L6, &L4 ); + p448_mask ( b, &L5, L0 ); + p448_subw ( &L3, 1 ); + p448_bias ( &L3, 1 ); + L1 = p448_is_zero( &L3 ); + L0 = p448_is_zero( sbz ); + return L1 | L0; } void @@ -491,8 +491,8 @@ test_only_twist ( struct tw_extensible_t* b, const struct extensible_t* a ) { - struct p448_t L0, L1; - mask_t L2, L3; + mask_t L0, L1; + struct p448_t L2, L3; p448_sqr ( &b->u, &a->z ); p448_sqr ( &b->y, &a->x ); p448_sub ( &b->z, &b->u, &b->y ); @@ -501,36 +501,36 @@ test_only_twist ( p448_add ( 
&b->u, &b->y, &b->y ); p448_sub ( &b->y, &a->z, &a->x ); p448_bias ( &b->y, 2 ); - p448_mul ( &b->t, &b->y, &a->y ); + p448_mul ( &b->x, &b->y, &a->y ); p448_sub ( &b->z, &a->z, &a->y ); p448_bias ( &b->z, 2 ); - p448_mul ( &b->x, &b->z, &b->t ); - p448_mul ( &b->t, &b->x, &b->u ); - p448_mul ( &L1, &b->x, &b->t ); - p448_isr ( &b->t, &L1 ); - p448_mul ( &b->u, &b->x, &b->t ); - p448_sqr ( &b->x, &b->t ); - p448_mul ( &b->t, &L1, &b->x ); - p448_add ( &L1, &a->y, &a->x ); - p448_sub ( &L0, &a->x, &a->y ); - p448_bias ( &L0, 2 ); - p448_mul ( &b->x, &b->t, &L0 ); - p448_add ( &L0, &b->x, &L1 ); - p448_sub ( &b->t, &L1, &b->x ); + p448_mul ( &b->t, &b->z, &b->x ); + p448_mul ( &L3, &b->t, &b->u ); + p448_mul ( &b->x, &b->t, &L3 ); + p448_isr ( &L2, &b->x ); + p448_mul ( &b->u, &b->t, &L2 ); + p448_sqr ( &L3, &L2 ); + p448_mul ( &b->t, &b->x, &L3 ); + p448_add ( &L3, &a->y, &a->x ); + p448_sub ( &L2, &a->x, &a->y ); + p448_bias ( &L2, 2 ); + p448_mul ( &b->x, &b->t, &L2 ); + p448_add ( &L2, &b->x, &L3 ); + p448_sub ( &b->t, &L3, &b->x ); p448_bias ( &b->t, 2 ); - p448_mul ( &b->x, &L0, &b->u ); - L2 = p448_is_zero( &b->y ); - L3 = - L2; - p448_addw ( &b->x, L3 ); + p448_mul ( &b->x, &L2, &b->u ); + L0 = p448_is_zero( &b->y ); + L1 = - L0; + p448_addw ( &b->x, L1 ); p448_weak_reduce( &b->x ); p448_mul ( &b->y, &b->t, &b->u ); - L2 = p448_is_zero( &b->z ); - L3 = - L2; - p448_addw ( &b->y, L3 ); + L0 = p448_is_zero( &b->z ); + L1 = - L0; + p448_addw ( &b->y, L1 ); p448_weak_reduce( &b->y ); - L3 = p448_is_zero( &a->y ); - L2 = L3 + 1; - p448_set_ui( &b->z, L2 ); + L1 = p448_is_zero( &a->y ); + L0 = L1 + 1; + p448_set_ui( &b->z, L0 ); p448_copy ( &b->t, &b->x ); p448_copy ( &b->u, &b->y ); } @@ -539,16 +539,16 @@ mask_t is_square ( const struct p448_t* x ) { - struct p448_t L0, L1; - mask_t L2, L3; - p448_isr ( &L0, x ); - p448_sqr ( &L1, &L0 ); - p448_mul ( &L0, x, &L1 ); - p448_subw ( &L0, 1 ); - p448_bias ( &L0, 1 ); - L3 = p448_is_zero( &L0 ); - L2 = p448_is_zero( x 
); - return L3 | L2; + mask_t L0, L1; + struct p448_t L2, L3; + p448_isr ( &L2, x ); + p448_sqr ( &L3, &L2 ); + p448_mul ( &L2, x, &L3 ); + p448_subw ( &L2, 1 ); + p448_bias ( &L2, 1 ); + L1 = p448_is_zero( &L2 ); + L0 = p448_is_zero( x ); + return L1 | L0; } mask_t @@ -700,15 +700,15 @@ eq_affine ( const struct affine_t* a, const struct affine_t* b ) { - struct p448_t L0; - mask_t L1, L2; - p448_sub ( &L0, &a->x, &b->x ); - p448_bias ( &L0, 2 ); - L2 = p448_is_zero( &L0 ); - p448_sub ( &L0, &a->y, &b->y ); - p448_bias ( &L0, 2 ); - L1 = p448_is_zero( &L0 ); - return L2 & L1; + mask_t L0, L1; + struct p448_t L2; + p448_sub ( &L2, &a->x, &b->x ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_sub ( &L2, &a->y, &b->y ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; } mask_t @@ -716,19 +716,19 @@ eq_extensible ( const struct extensible_t* a, const struct extensible_t* b ) { - struct p448_t L0, L1, L2; - mask_t L3, L4; - p448_mul ( &L2, &b->z, &a->x ); - p448_mul ( &L1, &a->z, &b->x ); - p448_sub ( &L0, &L2, &L1 ); - p448_bias ( &L0, 2 ); - L4 = p448_is_zero( &L0 ); - p448_mul ( &L2, &b->z, &a->y ); - p448_mul ( &L1, &a->z, &b->y ); - p448_sub ( &L0, &L2, &L1 ); - p448_bias ( &L0, 2 ); - L3 = p448_is_zero( &L0 ); - return L4 & L3; + mask_t L0, L1; + struct p448_t L2, L3, L4; + p448_mul ( &L4, &b->z, &a->x ); + p448_mul ( &L3, &a->z, &b->x ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_mul ( &L4, &b->z, &a->y ); + p448_mul ( &L3, &a->z, &b->y ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; } mask_t @@ -736,19 +736,19 @@ eq_tw_extensible ( const struct tw_extensible_t* a, const struct tw_extensible_t* b ) { - struct p448_t L0, L1, L2; - mask_t L3, L4; - p448_mul ( &L2, &b->z, &a->x ); - p448_mul ( &L1, &a->z, &b->x ); - p448_sub ( &L0, &L2, &L1 ); - p448_bias ( &L0, 2 ); - L4 = p448_is_zero( &L0 ); - p448_mul ( &L2, &b->z, &a->y ); - 
p448_mul ( &L1, &a->z, &b->y ); - p448_sub ( &L0, &L2, &L1 ); - p448_bias ( &L0, 2 ); - L3 = p448_is_zero( &L0 ); - return L4 & L3; + mask_t L0, L1; + struct p448_t L2, L3, L4; + p448_mul ( &L4, &b->z, &a->x ); + p448_mul ( &L3, &a->z, &b->x ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L1 = p448_is_zero( &L2 ); + p448_mul ( &L4, &b->z, &a->y ); + p448_mul ( &L3, &a->z, &b->y ); + p448_sub ( &L2, &L4, &L3 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; } void @@ -756,53 +756,56 @@ elligator_2s_inject ( struct affine_t* a, const struct p448_t* r ) { - struct p448_t L0, L1, L2, L3, L4, L5, L6, L7; - mask_t L8, L9; + mask_t L0, L1; + struct p448_t L2, L3, L4, L5, L6, L7, L8; p448_sqr ( &a->x, r ); - p448_sqr ( &L1, &a->x ); - p448_copy ( &a->y, &L1 ); + p448_sqr ( &L3, &a->x ); + p448_copy ( &a->y, &L3 ); p448_subw ( &a->y, 1 ); - p448_neg ( &L7, &a->y ); + p448_neg ( &L4, &a->y ); + p448_bias ( &L4, 2 ); + p448_sqr ( &L2, &L4 ); + p448_mulw ( &L7, &L2, 1527402724 ); + p448_mulw ( &L8, &L3, 6108985600 ); + p448_add ( &a->y, &L8, &L7 ); + p448_mulw ( &L8, &L2, 6109454568 ); + p448_sub ( &L7, &a->y, &L8 ); p448_bias ( &L7, 2 ); - p448_sqr ( &L0, &L7 ); - p448_mulw ( &L6, &L0, 1527402724 ); - p448_mulw ( &L5, &L1, 6108985600 ); - p448_add ( &a->y, &L5, &L6 ); - p448_mulw ( &L6, &L0, 6109454568 ); - p448_sub ( &L5, &a->y, &L6 ); - p448_bias ( &L5, 2 ); - p448_mulw ( &L2, &a->y, 78160 ); - p448_mul ( &L4, &L5, &L7 ); - p448_mul ( &L6, &L4, &L2 ); - p448_mul ( &L2, &L5, &L6 ); - p448_isr ( &L3, &L2 ); - p448_mul ( &L2, &L4, &L3 ); - p448_sqr ( &L4, &L3 ); - p448_mul ( &L3, &L6, &L4 ); - p448_mul ( &L6, &L5, &L3 ); - p448_mul ( &L5, &L6, &L3 ); - p448_copy ( &L4, &a->x ); - p448_subw ( &L4, 1 ); + p448_mulw ( &L6, &a->y, 78160 ); + p448_mul ( &L5, &L7, &L6 ); + p448_mul ( &L8, &L5, &L4 ); + p448_mul ( &L4, &L5, &L6 ); + p448_mul ( &L5, &L7, &L8 ); + p448_mul ( &L8, &L5, &L4 ); + p448_mul ( &L4, &L7, &L8 ); + p448_isr ( &L6, &L4 ); + 
p448_mul ( &L4, &L5, &L6 ); + p448_sqr ( &L5, &L6 ); + p448_mul ( &L6, &L8, &L5 ); + p448_mul ( &L8, &L7, &L6 ); + p448_mul ( &L7, &L8, &L6 ); + p448_copy ( &L6, &a->x ); + p448_subw ( &L6, 1 ); p448_addw ( &a->x, 1 ); - p448_mul ( &L3, &a->x, &L6 ); - p448_sub ( &a->x, &L4, &L3 ); + p448_mul ( &L5, &a->x, &L8 ); + p448_sub ( &a->x, &L6, &L5 ); p448_bias ( &a->x, 3 ); - p448_mul ( &L3, &L2, &a->x ); - p448_mulw ( &L2, &L3, 78160 ); - p448_neg ( &a->x, &L2 ); + p448_mul ( &L5, &L4, &a->x ); + p448_mulw ( &L4, &L5, 78160 ); + p448_neg ( &a->x, &L4 ); p448_bias ( &a->x, 2 ); p448_weak_reduce( &a->x ); - p448_add ( &L2, &L1, &L1 ); - p448_add ( &L1, &L2, &L0 ); - p448_subw ( &L1, 2 ); - p448_bias ( &L1, 1 ); - p448_mul ( &L0, &L1, &L6 ); - p448_mulw ( &L1, &L0, 3054649120 ); - p448_add ( &L0, &L1, &a->y ); - p448_mul ( &a->y, &L5, &L0 ); - L9 = p448_is_zero( &L7 ); - L8 = - L9; - p448_addw ( &a->y, L8 ); + p448_add ( &L4, &L3, &L3 ); + p448_add ( &L3, &L4, &L2 ); + p448_subw ( &L3, 2 ); + p448_bias ( &L3, 1 ); + p448_mul ( &L2, &L3, &L8 ); + p448_mulw ( &L3, &L2, 3054649120 ); + p448_add ( &L2, &L3, &a->y ); + p448_mul ( &a->y, &L7, &L2 ); + L1 = p448_is_zero( &L8 ); + L0 = - L1; + p448_addw ( &a->y, L0 ); p448_weak_reduce( &a->y ); } @@ -828,83 +831,83 @@ mask_t validate_tw_extensible ( const struct tw_extensible_t* ext ) { - struct p448_t L0, L1, L2, L3; - mask_t L4, L5; + mask_t L0, L1; + struct p448_t L2, L3, L4, L5; /* * Check invariant: * 0 = -x*y + z*t*u */ - p448_mul ( &L0, &ext->t, &ext->u ); - p448_mul ( &L2, &ext->z, &L0 ); - p448_addw ( &L2, 0 ); - p448_mul ( &L1, &ext->x, &ext->y ); - p448_neg ( &L0, &L1 ); - p448_add ( &L1, &L0, &L2 ); - p448_bias ( &L1, 2 ); - L5 = p448_is_zero( &L1 ); + p448_mul ( &L2, &ext->t, &ext->u ); + p448_mul ( &L4, &ext->z, &L2 ); + p448_addw ( &L4, 0 ); + p448_mul ( &L3, &ext->x, &ext->y ); + p448_neg ( &L2, &L3 ); + p448_add ( &L3, &L2, &L4 ); + p448_bias ( &L3, 2 ); + L1 = p448_is_zero( &L3 ); /* * Check invariant: * 0 = 
d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 */ - p448_sqr ( &L2, &ext->y ); - p448_neg ( &L0, &L2 ); - p448_addw ( &L0, 0 ); - p448_sqr ( &L1, &ext->x ); - p448_add ( &L2, &L1, &L0 ); - p448_sqr ( &L3, &ext->u ); - p448_sqr ( &L1, &ext->t ); - p448_mul ( &L0, &L1, &L3 ); - p448_mulw ( &L1, &L0, 39081 ); - p448_neg ( &L3, &L1 ); - p448_add ( &L1, &L3, &L2 ); - p448_neg ( &L3, &L0 ); - p448_add ( &L2, &L3, &L1 ); - p448_sqr ( &L1, &ext->z ); - p448_add ( &L0, &L1, &L2 ); - p448_bias ( &L0, 4 ); - L4 = p448_is_zero( &L0 ); - return L5 & L4; + p448_sqr ( &L4, &ext->y ); + p448_neg ( &L2, &L4 ); + p448_addw ( &L2, 0 ); + p448_sqr ( &L3, &ext->x ); + p448_add ( &L4, &L3, &L2 ); + p448_sqr ( &L5, &ext->u ); + p448_sqr ( &L3, &ext->t ); + p448_mul ( &L2, &L3, &L5 ); + p448_mulw ( &L3, &L2, 39081 ); + p448_neg ( &L5, &L3 ); + p448_add ( &L3, &L5, &L4 ); + p448_neg ( &L5, &L2 ); + p448_add ( &L4, &L5, &L3 ); + p448_sqr ( &L3, &ext->z ); + p448_add ( &L2, &L3, &L4 ); + p448_bias ( &L2, 4 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; } mask_t validate_extensible ( const struct extensible_t* ext ) { - struct p448_t L0, L1, L2, L3; - mask_t L4, L5; + mask_t L0, L1; + struct p448_t L2, L3, L4, L5; /* * Check invariant: * 0 = d*t^2*u^2 - x^2 - y^2 + z^2 */ - p448_sqr ( &L2, &ext->y ); - p448_neg ( &L1, &L2 ); - p448_addw ( &L1, 0 ); - p448_sqr ( &L0, &ext->z ); - p448_add ( &L2, &L0, &L1 ); - p448_sqr ( &L3, &ext->u ); - p448_sqr ( &L0, &ext->t ); - p448_mul ( &L1, &L0, &L3 ); - p448_mulw ( &L3, &L1, 39081 ); - p448_neg ( &L0, &L3 ); - p448_add ( &L1, &L0, &L2 ); - p448_sqr ( &L0, &ext->x ); - p448_neg ( &L2, &L0 ); - p448_add ( &L0, &L2, &L1 ); - p448_bias ( &L0, 4 ); - L5 = p448_is_zero( &L0 ); + p448_sqr ( &L4, &ext->y ); + p448_neg ( &L3, &L4 ); + p448_addw ( &L3, 0 ); + p448_sqr ( &L2, &ext->z ); + p448_add ( &L4, &L2, &L3 ); + p448_sqr ( &L5, &ext->u ); + p448_sqr ( &L2, &ext->t ); + p448_mul ( &L3, &L2, &L5 ); + p448_mulw ( &L5, &L3, 39081 ); + p448_neg ( &L2, &L5 ); + 
p448_add ( &L3, &L2, &L4 ); + p448_sqr ( &L2, &ext->x ); + p448_neg ( &L4, &L2 ); + p448_add ( &L2, &L4, &L3 ); + p448_bias ( &L2, 4 ); + L1 = p448_is_zero( &L2 ); /* * Check invariant: * 0 = -x*y + z*t*u */ - p448_mul ( &L1, &ext->t, &ext->u ); - p448_mul ( &L2, &ext->z, &L1 ); - p448_addw ( &L2, 0 ); - p448_mul ( &L0, &ext->x, &ext->y ); - p448_neg ( &L1, &L0 ); - p448_add ( &L0, &L1, &L2 ); - p448_bias ( &L0, 2 ); - L4 = p448_is_zero( &L0 ); - return L5 & L4; + p448_mul ( &L3, &ext->t, &ext->u ); + p448_mul ( &L4, &ext->z, &L3 ); + p448_addw ( &L4, 0 ); + p448_mul ( &L2, &ext->x, &ext->y ); + p448_neg ( &L3, &L2 ); + p448_add ( &L2, &L3, &L4 ); + p448_bias ( &L2, 2 ); + L0 = p448_is_zero( &L2 ); + return L1 & L0; } diff --git a/src/arch_x86_64/p448.c b/src/arch_x86_64/p448.c index 4abc788..5e97812 100644 --- a/src/arch_x86_64/p448.c +++ b/src/arch_x86_64/p448.c @@ -180,9 +180,6 @@ p448_mulw ( c[3] = accum0 & mask; accum0 >>= 56; c[7] = accum4 & mask; accum4 >>= 56; - - // c[4] += accum0 + accum4; - // c[0] += accum4; accum0 += accum4 + c[4]; c[4] = accum0 & mask; diff --git a/src/crandom.c b/src/crandom.c index e4a71d0..4b75f66 100644 --- a/src/crandom.c +++ b/src/crandom.c @@ -5,8 +5,11 @@ /* Chacha random number generator code copied from crandom */ -#include "intrinsics.h" #include "crandom.h" +#include "intrinsics.h" +#include "config.h" +#include "magic.h" + #include volatile unsigned int crandom_features = 0; @@ -67,7 +70,7 @@ INTRINSIC u_int64_t rdrand(int abort_on_fail) { out = out << 32 | reg; return out; # else - abort(); // whut + abort(); /* whut */ # endif } else { tries = 0; @@ -296,9 +299,6 @@ crandom_chacha_expand(u_int64_t iv, #endif /* NEED_CONV */ } -/* "return 4", cf xkcd #221 */ -#define CRANDOM_MAGIC 0x72657475726e2034ull - int crandom_init_from_file( struct crandom_state_t *state, @@ -361,6 +361,52 @@ crandom_generate( int ret = 0; + /* + * Addition 5/21/2014. 
+ * + * If this is used in an application inside a VM, and the VM + * is snapshotted and restored, then crandom_generate() would + * produce the same output. + * + * Of course, the real defense against this is "don't do that", + * but we mitigate it by the RDRAND and/or rdtsc() in the refilling + * code. Since chacha is pseudorandom, when the attacker doesn't + * know the state, it's good enough if RDRAND/rdtsc() return + * different results. However, if (part of) the request is filled + * from the buffer, this won't help. + * + * So, add a flag EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES which + * disables the buffer for requests of at least this size. + * + * Suggest EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES = 0, which + * disables the buffer. But instead you can set it to say 16, + * so that pulls of at least 128 bits will be stirred. This + * could still be a problem for e.g. 64-bit nonces, but those + * aren't entirely collision-resistant anyway. + * + * Heuristic: large requests are more likely to be + * cryptographically important, and the buffer doesn't impact + * their performance as much. So if the request is bigger + * than a certain size, just drop the buffer on the floor. + * + * This code isn't activated if state->reseed_interval == 0, + * because then the PRNG is deterministic anyway. + * + * TODO: sample 128 bits out of RDRAND() instead of 64 bits. + * TODO: option to completely remove the buffer and fill? + * FUTURE: come up with a less band-aid-y solution to this problem. 
+ */ +#ifdef EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES + if (state->reseed_interval +#if EXPERIMENT_CRANDOM_CUTOFF_BYTES > 0 + /* #if'd to a warning from -Wtype-limits in GCC when it's zero */ + && length >= EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES +#endif + ) { + state->fill = 0; + } +#endif + while (length) { if (unlikely(state->fill <= 0)) { uint64_t iv = 0; diff --git a/src/goldilocks.c b/src/goldilocks.c index 4314e46..440c8bd 100644 --- a/src/goldilocks.c +++ b/src/goldilocks.c @@ -32,73 +32,27 @@ #define GOLDILOCKS_RANDOM_RESEEDS_MANDATORY 0 #endif -#define GOLDI_FIELD_WORDS ((GOLDI_FIELD_BITS+WORD_BITS-1)/(WORD_BITS)) #define GOLDI_DIVERSIFY_BYTES 8 -/* FUTURE: auto. MAGIC */ -const struct affine_t goldilocks_base_point = { - {{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7), - U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa), - U58LE(0x5f065c3c59d070), U58LE(0x3a6a26adf73324), - U58LE(0x1b4faff4609845), U58LE(0x297ea0ea2692ff) - }}, - {{ 19 }} -}; - /* These are just unique identifiers */ static const char *G_INITING = "initializing"; static const char *G_INITED = "initialized"; static const char *G_FAILED = "failed to initialize"; -/* FUTURE: auto. 
MAGIC */ -static const word_t goldi_q448_lo[(224+WORD_BITS-1)/WORD_BITS] = { - U64LE(0xdc873d6d54a7bb0d), - U64LE(0xde933d8d723a70aa), - U64LE(0x3bb124b65129c96f), - 0x8335dc16 -}; -const struct barrett_prime_t goldi_q448 = { - GOLDI_FIELD_WORDS, - 62 % WORD_BITS, - sizeof(goldi_q448_lo)/sizeof(goldi_q448_lo[0]), - goldi_q448_lo -}; - -/* MAGIC */ -static const struct p448_t -sqrt_d_minus_1 = {{ - U58LE(0xd2e21836749f46), - U58LE(0x888db42b4f0179), - U58LE(0x5a189aabdeea38), - U58LE(0x51e65ca6f14c06), - U58LE(0xa49f7b424d9770), - U58LE(0xdcac4628c5f656), - U58LE(0x49443b8748734a), - U58LE(0x12fec0c0b25b7a) -}}; - struct goldilocks_precomputed_public_key_t { struct goldilocks_public_key_t pub; struct fixed_base_table_t table; }; -#ifndef USE_BIG_TABLES -#if __ARM_NEON__ -#define USE_BIG_TABLES 1 -#else -#define USE_BIG_TABLES (WORD_BITS==64) -#endif -#endif - -/* FUTURE: auto. MAGIC */ -struct { +/* FUTURE: auto. */ +static struct { const char * volatile state; #if GOLDILOCKS_USE_PTHREAD pthread_mutex_t mutex; #endif - struct tw_niels_t combs[USE_BIG_TABLES ? 
80 : 64]; + struct tw_niels_t combs[COMB_N << (COMB_T-1)]; struct fixed_base_table_t fixed_base; - struct tw_niels_t wnafs[32]; + struct tw_niels_t wnafs[1<opaque, sk, GOLDI_FIELD_BYTES); scalarmul_fixed_base(&exta, sk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base); untwist_and_double_and_serialize(&pk, &exta); - p448_serialize(&privkey->opaque[GOLDI_FIELD_BYTES], &pk); + field_serialize(&privkey->opaque[GOLDI_FIELD_BYTES], &pk); return GOLDI_EOK; } @@ -245,11 +204,11 @@ goldilocks_private_to_public ( struct goldilocks_public_key_t *pubkey, const struct goldilocks_private_key_t *privkey ) { - struct p448_t pk; - mask_t msucc = p448_deserialize(&pk,&privkey->opaque[GOLDI_FIELD_BYTES]); + struct field_t pk; + mask_t msucc = field_deserialize(&pk,&privkey->opaque[GOLDI_FIELD_BYTES]); if (msucc) { - p448_serialize(pubkey->opaque, &pk); + field_serialize(pubkey->opaque, &pk); return GOLDI_EOK; } else { return GOLDI_ECORRUPT; @@ -270,18 +229,18 @@ goldilocks_shared_secret_core ( assert(GOLDI_SHARED_SECRET_BYTES == SHA512_OUTPUT_BYTES); word_t sk[GOLDI_FIELD_WORDS]; - struct p448_t pk; + struct field_t pk; - mask_t succ = p448_deserialize(&pk,your_pubkey->opaque), msucc = -1; + mask_t succ = field_deserialize(&pk,your_pubkey->opaque), msucc = -1; #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS - struct p448_t sum, prod; - msucc &= p448_deserialize(&sum,&my_privkey->opaque[GOLDI_FIELD_BYTES]); - p448_mul(&prod,&pk,&sum); - p448_add(&sum,&pk,&sum); + struct field_t sum, prod; + msucc &= field_deserialize(&sum,&my_privkey->opaque[GOLDI_FIELD_BYTES]); + field_mul(&prod,&pk,&sum); + field_add(&sum,&pk,&sum); #endif - msucc &= barrett_deserialize(sk,my_privkey->opaque,&goldi_q448); + msucc &= barrett_deserialize(sk,my_privkey->opaque,&curve_prime_order); #if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS if (pre) { @@ -297,7 +256,7 @@ goldilocks_shared_secret_core ( #endif - p448_serialize(shared,&pk); + field_serialize(shared,&pk); /* obliterate records of our failure by adjusting with 
obliteration key */ struct sha512_ctx_t ctx; @@ -318,9 +277,9 @@ goldilocks_shared_secret_core ( #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS /* stir in the sum and product of the pubkeys. */ uint8_t a_pk[GOLDI_FIELD_BYTES]; - p448_serialize(a_pk, &sum); + field_serialize(a_pk, &sum); sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES); - p448_serialize(a_pk, &prod); + field_serialize(a_pk, &prod); sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES); #endif @@ -363,7 +322,7 @@ goldilocks_derive_challenge( sha512_update(&ctx, gnonce, GOLDI_FIELD_BYTES); sha512_update(&ctx, message, message_len); sha512_final(&ctx, sha_out); - barrett_deserialize_and_reduce(challenge, sha_out, sizeof(sha_out), &goldi_q448); + barrett_deserialize_and_reduce(challenge, sha_out, sizeof(sha_out), &curve_prime_order); } int @@ -379,7 +338,7 @@ goldilocks_sign ( /* challenge = H(pk, [nonceG], message). */ word_t skw[GOLDI_FIELD_WORDS]; - mask_t succ = barrett_deserialize(skw,privkey->opaque,&goldi_q448); + mask_t succ = barrett_deserialize(skw,privkey->opaque,&curve_prime_order); if (!succ) { memset(skw,0,sizeof(skw)); return GOLDI_ECORRUPT; @@ -395,16 +354,16 @@ goldilocks_sign ( sha512_update(&ctx, message, message_len); sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES); sha512_final(&ctx, sha_out); - barrett_deserialize_and_reduce(tk, sha_out, SHA512_OUTPUT_BYTES, &goldi_q448); + barrett_deserialize_and_reduce(tk, sha_out, SHA512_OUTPUT_BYTES, &curve_prime_order); /* 4[nonce]G */ uint8_t signature_tmp[GOLDI_FIELD_BYTES]; struct tw_extensible_t exta; - struct p448_t gsk; + struct field_t gsk; scalarmul_fixed_base(&exta, tk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base); double_tw_extensible(&exta); untwist_and_double_and_serialize(&gsk, &exta); - p448_serialize(signature_tmp, &gsk); + field_serialize(signature_tmp, &gsk); word_t challenge[GOLDI_FIELD_WORDS]; goldilocks_derive_challenge ( @@ -415,18 +374,18 @@ goldilocks_sign ( message_len ); - // reduce challenge and sub. 
- barrett_negate(challenge,GOLDI_FIELD_WORDS,&goldi_q448); + /* reduce challenge and sub. */ + barrett_negate(challenge,GOLDI_FIELD_WORDS,&curve_prime_order); barrett_mac( tk,GOLDI_FIELD_WORDS, challenge,GOLDI_FIELD_WORDS, skw,GOLDI_FIELD_WORDS, - &goldi_q448 + &curve_prime_order ); word_t carry = add_nr_ext_packed(tk,tk,GOLDI_FIELD_WORDS,tk,GOLDI_FIELD_WORDS,-1); - barrett_reduce(tk,GOLDI_FIELD_WORDS,carry,&goldi_q448); + barrett_reduce(tk,GOLDI_FIELD_WORDS,carry,&curve_prime_order); memcpy(signature_out, signature_tmp, GOLDI_FIELD_BYTES); barrett_serialize(signature_out+GOLDI_FIELD_BYTES, tk, GOLDI_FIELD_BYTES); @@ -454,23 +413,23 @@ goldilocks_verify ( return GOLDI_EUNINIT; } - struct p448_t pk; + struct field_t pk; word_t s[GOLDI_FIELD_WORDS]; - mask_t succ = p448_deserialize(&pk,pubkey->opaque); + mask_t succ = field_deserialize(&pk,pubkey->opaque); if (!succ) return GOLDI_EINVAL; - succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &goldi_q448); + succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &curve_prime_order); if (!succ) return GOLDI_EINVAL; word_t challenge[GOLDI_FIELD_WORDS]; goldilocks_derive_challenge(challenge, pubkey->opaque, signature, message, message_len); - struct p448_t eph; + struct field_t eph; struct tw_extensible_t pk_text; /* deserialize [nonce]G */ - succ = p448_deserialize(&eph, signature); + succ = field_deserialize(&eph, signature); if (!succ) return GOLDI_EINVAL; succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk); @@ -479,13 +438,13 @@ goldilocks_verify ( linear_combo_var_fixed_vt( &pk_text, challenge, GOLDI_SCALAR_BITS, s, GOLDI_SCALAR_BITS, - goldilocks_global.wnafs, 5 ); + goldilocks_global.wnafs, WNAF_PRECMP_BITS ); untwist_and_double_and_serialize( &pk, &pk_text ); - p448_sub(&eph, &eph, &pk); - p448_bias(&eph, 2); + field_sub(&eph, &eph, &pk); + field_bias(&eph, 2); - succ = p448_is_zero(&eph); + succ = field_is_zero(&eph); return succ ? 
0 : GOLDI_EINVAL; } @@ -504,8 +463,8 @@ goldilocks_precompute_public_key ( struct tw_extensible_t pk_text; - struct p448_t pk; - mask_t succ = p448_deserialize(&pk, pub->opaque); + struct field_t pk; + mask_t succ = field_deserialize(&pk, pub->opaque); if (!succ) { free(precom); return NULL; @@ -516,11 +475,9 @@ goldilocks_precompute_public_key ( free(precom); return NULL; } - - int big = USE_BIG_TABLES; - uint64_t n = big ? 5 : 8, t = big ? 5 : 4, s = big ? 18 : 14; - succ = precompute_fixed_base(&precom->table, &pk_text, n, t, s, NULL); + succ = precompute_fixed_base(&precom->table, &pk_text, + COMB_N, COMB_T, COMB_S, NULL); if (!succ) { free(precom); return NULL; @@ -553,17 +510,17 @@ goldilocks_verify_precomputed ( } word_t s[GOLDI_FIELD_WORDS]; - mask_t succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &goldi_q448); + mask_t succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &curve_prime_order); if (!succ) return GOLDI_EINVAL; word_t challenge[GOLDI_FIELD_WORDS]; goldilocks_derive_challenge(challenge, pubkey->pub.opaque, signature, message, message_len); - struct p448_t eph, pk; + struct field_t eph, pk; struct tw_extensible_t pk_text; /* deserialize [nonce]G */ - succ = p448_deserialize(&eph, signature); + succ = field_deserialize(&eph, signature); if (!succ) return GOLDI_EINVAL; succ = linear_combo_combs_vt ( @@ -574,10 +531,10 @@ goldilocks_verify_precomputed ( if (!succ) return GOLDI_EINVAL; untwist_and_double_and_serialize( &pk, &pk_text ); - p448_sub(&eph, &eph, &pk); - p448_bias(&eph, 2); + field_sub(&eph, &eph, &pk); + field_bias(&eph, 2); - succ = p448_is_zero(&eph); + succ = field_is_zero(&eph); return succ ? 
0 : GOLDI_EINVAL; } @@ -596,5 +553,5 @@ goldilocks_shared_secret_precomputed ( ); } -#endif // GOLDI_IMPLEMENT_PRECOMPUTED_KEYS +#endif /* GOLDI_IMPLEMENT_PRECOMPUTED_KEYS */ diff --git a/src/include/api.h b/src/include/api.h new file mode 100644 index 0000000..cc20246 --- /dev/null +++ b/src/include/api.h @@ -0,0 +1,190 @@ +/** + * @file api.h + * @copyright + * Copyright (c) 2014 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. + * @author Mike Hamburg + * @brief BATMAN / SUPERCOP glue for benchmarking. + */ + +#include +#include "goldilocks.h" + +#define PUBLICKEY_BYTES GOLDI_PUBLIC_KEY_BYTES +#define SECRETKEY_BYTES GOLDI_PRIVATE_KEY_BYTES +#define SIGNATURE_BYTES GOLDI_SIGNATURE_BYTES +#define SHAREDSECRET_BYTES GOLDI_SHARED_SECRET_BYTES + +#define crypto_dh_PUBLICKEYBYTES PUBLICKEY_BYTES +#define crypto_dh_SECRETKEYBYTES SECRETKEY_BYTES +#define PRIVATEKEY_BYTES SECRETKEY_BYTES +#define crypto_dh_BYTES SHAREDSECRET_BYTES +#define crypto_dh_IMPLEMENTATION "AMD64" +#define crypto_dh_VERSION "2014-07-11" + +#define crypto_sign_PUBLICKEYBYTES PUBLICKEY_BYTES +#define crypto_sign_SECRETKEYBYTES SECRETKEY_BYTES +#define crypto_sign_IMPLEMENTATION "AMD64" +#define crypto_sign_VERSION "2014-07-11" +#define crypto_sign_BYTES SIGNATURE_BYTES + +#define CRYPTO_DETERMINISTIC 1 + +/* +#ifndef LOOPS +#define LOOPS 512 +#endif +*/ + +static inline int timingattacks() { return 0; } +static inline int copyrightclaims() { return 0; } +static inline int patentclaims() { + /* Until the end of July 2014, point compression + * is patented. 
*/ + return 20; +} + +#define crypto_sign_keypair crypto_dh_keypair +static inline int crypto_dh_keypair ( + unsigned char pk[SECRETKEY_BYTES], + unsigned char sk[PUBLICKEY_BYTES] +) { + int ret; + ret = goldilocks_init(); + if (ret && ret != GOLDI_EALREADYINIT) + return ret; + if ((ret = goldilocks_keygen( + (struct goldilocks_private_key_t *)sk, + (struct goldilocks_public_key_t *)pk + ))) abort(); + return ret; +} + +static inline void keypair ( + unsigned char sk[SECRETKEY_BYTES], + unsigned long long *sklen, + unsigned char pk[PUBLICKEY_BYTES], + unsigned long long *pklen +) { + int ret = goldilocks_init(); + if (ret) abort(); + + ret = goldilocks_keygen( + (struct goldilocks_private_key_t *)sk, + (struct goldilocks_public_key_t *)pk + ); + if (ret) abort(); + + *sklen = SECRETKEY_BYTES; + *pklen = PUBLICKEY_BYTES; +} + +static inline int crypto_sign ( + unsigned char *sm, + unsigned long long *smlen, + const unsigned char *m, + unsigned long long mlen, + const unsigned char sk[SECRETKEY_BYTES] +) { + int ret = goldilocks_sign( + sm, m, mlen, + (const struct goldilocks_private_key_t *)sk + ); + if (ret) abort(); + + memcpy(sm + SIGNATURE_BYTES, m, mlen); + + *smlen = mlen + SIGNATURE_BYTES; + return 0; +} + +static inline void signmessage ( + unsigned char *sm, + unsigned long long *smlen, + const unsigned char *m, + unsigned long long mlen, + const unsigned char sk[SECRETKEY_BYTES], + unsigned long long sklen +) { + if (sklen != PRIVATEKEY_BYTES) abort(); + + int ret = goldilocks_sign( + sm, m, mlen, + (const struct goldilocks_private_key_t *)sk + ); + if (ret) abort(); + + memcpy(sm + SIGNATURE_BYTES, m, mlen); + + *smlen = mlen + SIGNATURE_BYTES; +} + +static inline int crypto_sign_open ( + unsigned char *m, + unsigned long long *mlen, + const unsigned char *sm, + unsigned long long smlen, + const unsigned char pk[PUBLICKEY_BYTES] +) { + int ret = goldilocks_verify( + sm, sm + SIGNATURE_BYTES, smlen - SIGNATURE_BYTES, + (const struct goldilocks_public_key_t 
*)pk + ); + if (!ret) { + *mlen = smlen - SIGNATURE_BYTES; + memcpy(m, sm + SIGNATURE_BYTES, *mlen); + } + return ret ? -1 : 0; +} + +static inline int verification ( + const unsigned char *m, + unsigned long long mlen, + const unsigned char *sm, + unsigned long long smlen, + const unsigned char pk[PUBLICKEY_BYTES], + unsigned long long pklen +) { + if (pklen != PUBLICKEY_BYTES) abort(); + + int ret = goldilocks_verify( + sm, m, mlen, + (const struct goldilocks_public_key_t *)pk + ); + return ret ? -1 : 0; +} + + +static inline int crypto_dh ( + unsigned char s[SHAREDSECRET_BYTES], + const unsigned char sk[SECRETKEY_BYTES], + const unsigned char pk[PUBLICKEY_BYTES] +) { + return goldilocks_shared_secret ( + s, + (const struct goldilocks_private_key_t *)sk, + (const struct goldilocks_public_key_t *)pk + ); +} + +static inline int sharedsecret ( + unsigned char s[SHAREDSECRET_BYTES], + unsigned long long *slen, + const unsigned char sk[SECRETKEY_BYTES], + unsigned long long sklen, + const unsigned char pk[PUBLICKEY_BYTES], + unsigned long long pklen +) { + if (pklen != PUBLICKEY_BYTES) abort(); + if (sklen != SECRETKEY_BYTES) abort(); + + int ret = goldilocks_shared_secret ( + s, + (const struct goldilocks_private_key_t *)sk, + (const struct goldilocks_public_key_t *)pk + ); + if (ret) return -1; + *slen = SHAREDSECRET_BYTES; + return 0; +} + diff --git a/src/include/barrett_field.h b/src/include/barrett_field.h index 9d8f930..1187138 100644 --- a/src/include/barrett_field.h +++ b/src/include/barrett_field.h @@ -32,7 +32,7 @@ struct barrett_prime_t { /** * The Goldilocks prime. I'm not sure this is the right place for it, but oh well. */ -extern const struct barrett_prime_t goldi_q448; +extern const struct barrett_prime_t curve_prime_order; /** * Reduce a number (with optional high carry word) mod p. 
diff --git a/src/include/config.h b/src/include/config.h index dbd785d..ca6da24 100644 --- a/src/include/config.h +++ b/src/include/config.h @@ -1,8 +1,64 @@ +/** + * @file config.h + * @copyright + * Copyright (c) 2014 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. + * @author Mike Hamburg + * @brief Goldilocks top-level configuration flags. + */ + #ifndef __GOLDILOCKS_CONFIG_H__ #define __GOLDILOCKS_CONFIG_H__ 1 +/** @brief crandom architecture detection. + * With this flag set to 1, crandom will assume that any flag + * supported by -march and friends (MIGHT_HAVE) will actually + * be available on the target machine (MUST_HAVE), instead of + * trying to detect it. + * + * Without this flag, crandom can detect, eg, that while -mavx + * was passed, the current machine doesn't support AVX, and can + * fall back to SSE2 or whatever. But the rest of the + * Goldilocks code doesn't support this, so it'll still crash + * with an illegal instruction error. + * + * Setting this flag will make the library smaller. + */ +#define CRANDOM_MIGHT_IS_MUST 1 + +/** + * @brief Causes crandom to refuse to buffer requests bigger + * than this size. Setting 0 disables buffering for all + * requests, which hurts performance. + * + * The advantage is that if a user process forks or is VM- + * snapshotted, the buffer is not adjusted (FUTURE). However, + * with the buffer disabled, the refresh routines will stir + * in entropy from RDTSC and/or RDRAND, making this operation + * mostly-safe. + */ +#define EXPERIMENT_CRANDOM_BUFFER_CUTOFF_BYTES 0 + +/** + * @brief Goldilocks uses libpthread mutexes to provide + * thread-safety. If you disable this flag, it won't link + * libpthread, but it won't be thread-safe either. 
+ */ #define GOLDILOCKS_USE_PTHREAD 1 + +/** + * @brief Experiment to change the hash inputs for ECDH, + * in a way that obliterates the result -- overwriting it with + * a safe pseudorandom value -- if the public key is invalid. + * That way users who ignore the status result won't be + * exposed to invalid key attacks. + */ #define EXPERIMENT_ECDH_OBLITERATE_CT 1 + +/** + * @brief ECDH adds public keys into the hash, to prevent + * esoteric attacks. + */ #define EXPERIMENT_ECDH_STIR_IN_PUBKEYS 1 -#endif // __GOLDILOCKS_CONFIG_H__ +#endif /* __GOLDILOCKS_CONFIG_H__ */ diff --git a/src/include/crandom.h b/src/include/crandom.h index f603f13..90cc374 100644 --- a/src/include/crandom.h +++ b/src/include/crandom.h @@ -12,6 +12,7 @@ #ifndef __GOLDI_CRANDOM_H__ #define __GOLDI_CRANDOM_H__ 1 +#define _XOPEN_SOURCE 600 #include /* for uint64_t */ #include /* for open */ #include /* for returning errors after open */ diff --git a/src/include/field.h b/src/include/field.h new file mode 100644 index 0000000..6231aba --- /dev/null +++ b/src/include/field.h @@ -0,0 +1,30 @@ +/** + * @file field.h + * @brief Field switch code. + * @copyright + * Copyright (c) 2014 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. 
+ * @author Mike Hamburg + */ +#ifndef __FIELD_H__ +#define __FIELD_H__ +#include "magic.h" + +#include "p448.h" + +#define field_t p448_t +#define field_mul p448_mul +#define field_add p448_add +#define field_sub p448_sub +#define field_bias p448_bias +#define field_copy p448_copy +#define field_weak_reduce p448_weak_reduce +#define field_strong_reduce p448_strong_reduce +#define field_cond_swap p448_cond_swap +#define field_cond_neg p448_cond_neg +#define field_serialize p448_serialize +#define field_deserialize p448_deserialize +#define field_is_zero p448_is_zero +#define simultaneous_invert simultaneous_invert_p448 /* FUTURE: consistency */ + +#endif /* __FIELD_H__ */ diff --git a/src/include/intrinsics.h b/src/include/intrinsics.h index 1dac686..1b39eb5 100644 --- a/src/include/intrinsics.h +++ b/src/include/intrinsics.h @@ -11,25 +11,27 @@ #define __CRANDOM_INTRINSICS_H__ 1 #include +#include "config.h" #if __i386__ || __x86_64__ #include #endif +/** @brief Macro to make a function static, forcibly inlined and possibly unused. */ #define INTRINSIC \ static __inline__ __attribute__((__gnu_inline__, __always_inline__, unused)) -#define GEN 1 -#define SSE2 2 -#define SSSE3 4 -#define AESNI 8 -#define XOP 16 -#define AVX 32 -#define AVX2 64 -#define RDRAND 128 +#define GEN 1 /**< @brief Intrinsics field has been generated. */ +#define SSE2 2 /**< @brief Machine supports SSE2 */ +#define SSSE3 4 /**< @brief Machine supports SSSE3 (for shuffles) */ +#define AESNI 8 /**< @brief Machine supports Intel AES-NI */ +#define XOP 16 /**< @brief Machine supports AMD XOP */ +#define AVX 32 /**< @brief Machine supports Intel AVX (for masking) */ +#define AVX2 64 /**< @brief Machine supports Intel AVX2 (for bignums) */ +#define RDRAND 128 /**< @brief Machine supports Intel RDRAND */ /** - * If on x86, read the timestamp counter. Otherwise, return 0. + * @brief If on x86, read the timestamp counter. Otherwise, return 0. 
*/ INTRINSIC u_int64_t rdtsc() { u_int64_t out = 0; @@ -53,6 +55,8 @@ INTRINSIC u_int64_t opacify(u_int64_t x) { return x; } + +/** @cond internal */ #ifdef __AVX2__ # define MIGHT_HAVE_AVX2 1 # ifndef MUST_HAVE_AVX2 @@ -92,10 +96,6 @@ INTRINSIC u_int64_t opacify(u_int64_t x) { # define pslldq _mm_slli_epi32 # define pshufd _mm_shuffle_epi32 -INTRINSIC ssereg sse2_rotate(int r, ssereg a) { - return _mm_slli_epi32(a, r) ^ _mm_srli_epi32(a, 32-r); -} - #else # define MIGHT_HAVE_SSE2 0 # define MUST_HAVE_SSE2 0 @@ -127,11 +127,6 @@ INTRINSIC ssereg sse2_rotate(int r, ssereg a) { # ifndef MUST_HAVE_XOP # define MUST_HAVE_XOP 0 # endif -INTRINSIC ssereg xop_rotate(int amount, ssereg x) { - ssereg out; - __asm__ ("vprotd %1, %2, %0" : "=x"(out) : "x"(x), "g"(amount)); - return out; -} #else # define MIGHT_HAVE_XOP 0 # define MUST_HAVE_XOP 0 @@ -146,6 +141,9 @@ INTRINSIC ssereg xop_rotate(int amount, ssereg x) { | RDRAND * MIGHT_HAVE_RDRAND \ | AVX2 * MIGHT_HAVE_AVX2) +#if CRANDOM_MIGHT_IS_MUST +#define MUST_MASK MIGHT_MASK +#else #define MUST_MASK \ ( SSE2 * MUST_HAVE_SSE2 \ | SSSE3 * MUST_HAVE_SSSE3 \ @@ -154,22 +152,58 @@ INTRINSIC ssereg xop_rotate(int amount, ssereg x) { | AVX * MUST_HAVE_AVX \ | RDRAND * MUST_HAVE_RDRAND \ | AVX2 * MUST_HAVE_AVX2 ) +#endif +/** @endcond */ + +#ifdef __SSE2__ +/** Rotate a register by some amount using SSE2. */ +INTRINSIC ssereg sse2_rotate(int r, ssereg a) { + return _mm_slli_epi32(a, r) ^ _mm_srli_epi32(a, 32-r); +} +#endif + +#ifdef __XOP__ +/** Rotate a register by some amount using AMD XOP. */ +INTRINSIC ssereg xop_rotate(int amount, ssereg x) { + ssereg out; + __asm__ ("vprotd %1, %2, %0" : "=x"(out) : "x"(x), "g"(amount)); + return out; +} +#endif +/** + * @brief Macro which detects that targets might support this feature, + * so that we can include code for it. 
+ */ #define MIGHT_HAVE(feature) ((MIGHT_MASK & feature) == feature) + +/** + * @brief Macro which detects that targets must support this feature, + * so we can omit fallback code. + */ #define MUST_HAVE(feature) ((MUST_MASK & feature) == feature) +/** + * @brief Make a function available via the C API. + */ #ifdef __cplusplus # define extern_c extern "C" #else # define extern_c #endif +/** @cond internal + * @brief Detect platform features and return them as a flagfield int. + */ extern_c unsigned int crandom_detect_features(); +/** @endcond */ #ifndef likely -# define likely(x) __builtin_expect((x),1) -# define unlikely(x) __builtin_expect((x),0) +# define likely(x) __builtin_expect((x),1) \ + /**< @brief Tell the compiler that a branch is likely, for optimization. */ +# define unlikely(x) __builtin_expect((x),0) \ + /**< @brief Tell the compiler that a branch is unlikely, for optimization. */ #endif /** @@ -187,12 +221,6 @@ compare_and_swap ( const char *volatile* target, const char *old, const char *new -); - -const char *compare_and_swap ( - const char *volatile* target, - const char *old, - const char *new ) { return __sync_val_compare_and_swap(target,old,new); } @@ -208,13 +236,6 @@ const char *compare_and_swap ( * @param [in] new A value to replace the target on success. */ INTRINSIC int -bool_compare_and_swap ( - const char *volatile* target, - const char *old, - const char *new -); - -int bool_compare_and_swap ( const char *volatile* target, const char *old, @@ -231,6 +252,8 @@ bool_compare_and_swap ( * MIGHT_HAVE(feature) is set, but MUST_HAVE(feature) is not. */ extern volatile unsigned int crandom_features; + +/** @brief Determine if a given CPU feature is available. 
*/ INTRINSIC int HAVE(unsigned int feature); int HAVE(unsigned int feature) { diff --git a/src/include/magic.h b/src/include/magic.h new file mode 100644 index 0000000..1aac4ce --- /dev/null +++ b/src/include/magic.h @@ -0,0 +1,105 @@ +/** + * @file magic.h + * @copyright + * Copyright (c) 2014 Cryptography Research, Inc. \n + * Released under the MIT License. See LICENSE.txt for license information. + * @author Mike Hamburg + * @brief Goldilocks magic numbers (group orders, coefficients, algo params etc). + */ + + +#ifndef __GOLDI_MAGIC_H__ +#define __GOLDI_MAGIC_H__ 1 + +#include "word.h" +#include "p448.h" +#include "ec_point.h" + +/* TODO: standardize notation */ + + +/** @brief The number of bits in the Goldilocks field. */ +#define GOLDI_FIELD_BITS 448 + +/** @brief The number of words in the Goldilocks field. */ +#define GOLDI_FIELD_WORDS DIV_CEIL(GOLDI_FIELD_BITS,WORD_BITS) + +/** @brief The number of bits in the Goldilocks curve's cofactor (cofactor=4). */ +#define COFACTOR_BITS 2 + +/** @brief The number of bits in a Goldilocks scalar. */ +#define SCALAR_BITS (GOLDI_FIELD_BITS - COFACTOR_BITS) + +/** @brief The number of words in a Goldilocks scalar. */ +#define SCALAR_WORDS WORDS_FOR_BITS(SCALAR_BITS) + +/** + * @brief sqrt(d-1), used for point formats and twisting. + */ +extern const struct p448_t sqrt_d_minus_1; + +/** + * @brief The base point for Goldilocks. + */ +extern const struct affine_t goldilocks_base_point; + +/** + * @brief The Goldilocks prime subgroup order. + */ +extern const struct barrett_prime_t curve_prime_order; + +/** + * @brief Window size for fixed-window signed binary scalarmul. + * Table size is 2^(this - 1). + */ +#define SCALARMUL_FIXED_WINDOW_SIZE 5 + +/** + * @brief Even/odd adjustments for fixed window with + * ROUNDUP(SCALAR_BITS,SCALARMUL_FIXED_WINDOW_SIZE). + */ +extern const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS]; + +/** + * @brief Table size for wNAF signed binary (variable-time) scalarmul. 
+ * Table size is 2^this. + */ +#define SCALARMUL_WNAF_TABLE_BITS 3 + +/** + * @brief Table size for wNAF signed binary (variable-time) linear combo. + * Table size is 2^this. + */ +#define SCALARMUL_WNAF_COMBO_TABLE_BITS 4 + +/** + * @brief If true, use wider tables for the precomputed combs. + */ +#ifndef USE_BIG_COMBS +#if __ARM_NEON__ +#define USE_BIG_COMBS 1 +#else +#define USE_BIG_COMBS (WORD_BITS==64) +#endif +#endif + +/** @brief The number of combs to use for signed comb algo */ +#define COMB_N (USE_BIG_COMBS ? 5 : 8) + +/** @brief The number of teeth of the combs for signed comb algo */ +#define COMB_T (USE_BIG_COMBS ? 5 : 4) + +/** @brief The spacing of the combs for signed comb algo */ +#define COMB_S (USE_BIG_COMBS ? 18 : 14) + +/** + * @brief The bit width of the precomputed WNAF tables. Size is 2^this elements. + */ +#define WNAF_PRECMP_BITS 5 + +/** + * @brief crandom magic structure guard constant = "return 4", cf xkcd #221 + */ +#define CRANDOM_MAGIC 0x72657475726e2034ull + +#endif /* __GOLDI_MAGIC_H__ */ diff --git a/src/include/scalarmul.h b/src/include/scalarmul.h index 8b42fd7..bd97cc9 100644 --- a/src/include/scalarmul.h +++ b/src/include/scalarmul.h @@ -10,12 +10,19 @@ #define __P448_ALGO_H__ 1 #include "ec_point.h" +#include "field.h" #include "intrinsics.h" +#include "magic.h" #ifdef __cplusplus extern "C" { #endif +/** + * A word array containing a scalar + */ +typedef word_t scalar_t[SCALAR_WORDS]; + /** * A precomputed table for fixed-base scalar multiplication. * @@ -26,7 +33,7 @@ struct fixed_base_table_t { struct tw_niels_t *table; /** Adjustments to the scalar in even and odd cases, respectively. */ - word_t scalar_adjustments[2*(448/WORD_BITS)]; /* MAGIC */ + word_t scalar_adjustments[2*SCALAR_WORDS]; /** The number of combs in the table. 
*/ unsigned int n; @@ -83,8 +90,8 @@ struct fixed_base_table_t { */ mask_t montgomery_ladder ( - struct p448_t *out, - const struct p448_t *in, + struct field_t *out, + const struct field_t *in, const word_t *scalar, unsigned int nbits, unsigned int n_extra_doubles @@ -103,7 +110,7 @@ montgomery_ladder ( void scalarmul ( struct tw_extensible_t *working, - const word_t scalar[448/WORD_BITS] /* MAGIC */ + const word_t scalar[SCALAR_WORDS] /* TODO? int nbits */ ); @@ -124,8 +131,7 @@ scalarmul ( void scalarmul_vlook ( struct tw_extensible_t *working, - const word_t scalar[448/WORD_BITS] /* MAGIC */ - /* TODO? int nbits */ + const word_t scalar[SCALAR_WORDS] ); /** @@ -134,7 +140,7 @@ scalarmul_vlook ( * * This function computes $n$ "comb" tables, each containing * 2^(t-1) points in tw_niels_t format. You must have - * n * t * s >= 446 for complete coverage. + * n * t * s >= SCALAR_BITS = 446 for complete coverage. * * The scalar multiplication algorithm may adjust the scalar by * a multiple of q. Therefore, we strongly recommend to use base @@ -205,11 +211,13 @@ scalarmul_fixed_base ( * * @param [inout] working The input and output point. * @param [in] scalar The scalar. 
+ * @param [in] nbits The number of bits in the scalar */ void scalarmul_vt ( struct tw_extensible_t *working, - const word_t scalar[448/WORD_BITS] /* MAGIC */ + const word_t *scalar, + unsigned int nbits ); @@ -274,9 +282,9 @@ scalarmul_fixed_base_wnaf_vt ( void linear_combo_var_fixed_vt ( struct tw_extensible_t *working, - const word_t scalar_var[448/WORD_BITS], /* MAGIC */ + const word_t scalar_var[SCALAR_WORDS], unsigned int nbits_var, - const word_t scalar_pre[448/WORD_BITS], /* MAGIC */ + const word_t scalar_pre[SCALAR_WORDS], unsigned int nbits_pre, const struct tw_niels_t *precmp, unsigned int table_bits_pre @@ -302,10 +310,10 @@ linear_combo_var_fixed_vt ( mask_t linear_combo_combs_vt ( struct tw_extensible_t *out, - const word_t scalar1[448/WORD_BITS], + const word_t scalar1[SCALAR_WORDS], unsigned int nbits1, const struct fixed_base_table_t *table1, - const word_t scalar2[448/WORD_BITS], + const word_t scalar2[SCALAR_WORDS], unsigned int nbits2, const struct fixed_base_table_t *table2 ); diff --git a/src/include/word.h b/src/include/word.h index 0f6c6e6..d165647 100644 --- a/src/include/word.h +++ b/src/include/word.h @@ -26,7 +26,6 @@ #if (__SIZEOF_INT128__ == 16 && __SIZEOF_SIZE_T__ == 8 && (__SIZEOF_LONG__==8 || __POINTER_WIDTH__==64) && !GOLDI_FORCE_32_BIT) /* It's a 64-bit machine if: - * // limits.h thinks so * __uint128_t exists * size_t is 64 bits * Either longs are 64-bits (doesn't happen on Windows) @@ -61,6 +60,9 @@ typedef int64_t dsword_t; #endif #define WORD_BITS (sizeof(word_t) * 8) +#define DIV_CEIL(_x,_y) (((_x) + (_y) - 1)/(_y)) +#define ROUND_UP(_x,_y) (DIV_CEIL((_x),(_y))*(_y)) +#define WORDS_FOR_BITS(_x) (DIV_CEIL((_x),WORD_BITS)) typedef word_t mask_t; static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -1; @@ -69,51 +71,80 @@ static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -1; #ifdef __ARM_NEON__ typedef uint32x4_t vecmask_t; -#else -/* FIXME this only works on clang */ +#elif __clang__ +typedef uint64_t uint64x2_t 
__attribute__((ext_vector_type(2))); +typedef int64_t int64x2_t __attribute__((ext_vector_type(2))); +typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4))); +typedef int64_t int64x4_t __attribute__((ext_vector_type(4))); +typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4))); +typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); +typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2))); +typedef int32_t int32x2_t __attribute__((ext_vector_type(2))); +typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8))); +typedef int32_t int32x8_t __attribute__((ext_vector_type(8))); +typedef word_t vecmask_t __attribute__((ext_vector_type(4))); +#else /* GCC-cleanliness */ typedef uint64_t uint64x2_t __attribute__((vector_size(16))); typedef int64_t int64x2_t __attribute__((vector_size(16))); typedef uint64_t uint64x4_t __attribute__((vector_size(32))); typedef int64_t int64x4_t __attribute__((vector_size(32))); -typedef uint32_t uint32x2_t __attribute__((vector_size(8))); -typedef int32_t int32x2_t __attribute__((vector_size(8))); typedef uint32_t uint32x4_t __attribute__((vector_size(16))); typedef int32_t int32x4_t __attribute__((vector_size(16))); +typedef uint32_t uint32x2_t __attribute__((vector_size(8))); +typedef int32_t int32x2_t __attribute__((vector_size(8))); typedef uint32_t uint32x8_t __attribute__((vector_size(32))); typedef int32_t int32x8_t __attribute__((vector_size(32))); -/* TODO: vector width for procs like ARM; gcc support */ typedef word_t vecmask_t __attribute__((vector_size(32))); #endif #if __AVX2__ -typedef uint32x8_t big_register_t; -typedef uint64x4_t uint64xn_t; -typedef uint32x8_t uint32xn_t; -#elif __SSE2__ || __ARM_NEON__ -typedef uint32x4_t big_register_t; -typedef uint64x2_t uint64xn_t; -typedef uint32x4_t uint32xn_t; + typedef uint32x8_t big_register_t; + typedef uint64x4_t uint64xn_t; + typedef uint32x8_t uint32xn_t; + + static __inline__ big_register_t + br_set_to_mask(mask_t x) { + uint32_t y = 
x; + big_register_t ret = {y,y,y,y,y,y,y,y}; + return ret; + } +#elif __SSE2__ + typedef uint32x4_t big_register_t; + typedef uint64x2_t uint64xn_t; + typedef uint32x4_t uint32xn_t; + typedef uint32_t uint32xn_t; + + static __inline__ big_register_t + br_set_to_mask(mask_t x) { + uint32_t y = x; + big_register_t ret = {y,y,y,y}; + return ret; + } +#elif __ARM_NEON__ + typedef uint32x4_t big_register_t; + typedef uint64x2_t uint64xn_t; + typedef uint32x4_t uint32xn_t; + static __inline__ big_register_t + br_set_to_mask(mask_t x) { + return vdupq_n_u32(x); + } #elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__ -typedef uint64_t big_register_t, uint64xn_t; -typedef uint32_t uint32xn_t; -#else -typedef uint64_t uint64xn_t; -typedef uint32_t uint32xn_t; -typedef uint32_t big_register_t; -#endif - + typedef uint64_t big_register_t, uint64xn_t; -#ifdef __ARM_NEON__ -static __inline__ big_register_t -br_set_to_mask(mask_t x) { - return vdupq_n_u32(x); -} + typedef uint32_t uint32xn_t; + static __inline__ big_register_t + br_set_to_mask(mask_t x) { + return (big_register_t)x; + } #else -static __inline__ big_register_t -br_set_to_mask(mask_t x) { - big_register_t out = {x,x,x,x,x,x,x,x}; - return out; -} + typedef uint64_t uint64xn_t; + typedef uint32_t uint32xn_t; + typedef uint32_t big_register_t; + + static __inline__ big_register_t + br_set_to_mask(mask_t x) { + return (big_register_t)x; + } #endif #if __AVX2__ || __SSE2__ diff --git a/src/magic.c b/src/magic.c new file mode 100644 index 0000000..a6336f7 --- /dev/null +++ b/src/magic.c @@ -0,0 +1,61 @@ +/* Copyright (c) 2014 Cryptography Research, Inc. + * Released under the MIT License. See LICENSE.txt for license information. + */ + +#include "field.h" +#include "magic.h" +#include "barrett_field.h" + +/* FUTURE: automatically generate this file. 
*/ + +const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = { + U64LE(0xebec9967f5d3f5c2), + U64LE(0x0aa09b49b16c9a02), + U64LE(0x7f6126aec172cd8e), + U64LE(0x00000007b027e54d), + U64LE(0x0000000000000000), + U64LE(0x0000000000000000), + U64LE(0x4000000000000000), + + U64LE(0xc873d6d54a7bb0cf), + U64LE(0xe933d8d723a70aad), + U64LE(0xbb124b65129c96fd), + U64LE(0x00000008335dc163), + U64LE(0x0000000000000000), + U64LE(0x0000000000000000), + U64LE(0x0000000000000000) +}; + +const struct affine_t goldilocks_base_point = { + {{ U58LE(0xf0de840aed939f), U58LE(0xc170033f4ba0c7), + U58LE(0xf3932d94c63d96), U58LE(0x9cecfa96147eaa), + U58LE(0x5f065c3c59d070), U58LE(0x3a6a26adf73324), + U58LE(0x1b4faff4609845), U58LE(0x297ea0ea2692ff) + }}, + {{ 19 }} +}; + +static const word_t curve_prime_order_lo[(224+WORD_BITS-1)/WORD_BITS] = { + U64LE(0xdc873d6d54a7bb0d), + U64LE(0xde933d8d723a70aa), + U64LE(0x3bb124b65129c96f), + 0x8335dc16 +}; +const struct barrett_prime_t curve_prime_order = { + GOLDI_FIELD_WORDS, + 62 % WORD_BITS, + sizeof(curve_prime_order_lo)/sizeof(curve_prime_order_lo[0]), + curve_prime_order_lo +}; + +const struct field_t +sqrt_d_minus_1 = {{ + U58LE(0xd2e21836749f46), + U58LE(0x888db42b4f0179), + U58LE(0x5a189aabdeea38), + U58LE(0x51e65ca6f14c06), + U58LE(0xa49f7b424d9770), + U58LE(0xdcac4628c5f656), + U58LE(0x49443b8748734a), + U58LE(0x12fec0c0b25b7a) +}}; diff --git a/src/scalarmul.c b/src/scalarmul.c index 89891db..e3e5850 100644 --- a/src/scalarmul.c +++ b/src/scalarmul.c @@ -13,8 +13,8 @@ mask_t montgomery_ladder ( - struct p448_t *out, - const struct p448_t *in, + struct field_t *out, + const struct field_t *in, const word_t *scalar, unsigned int nbits, unsigned int n_extra_doubles @@ -28,15 +28,15 @@ montgomery_ladder ( word_t w = scalar[j]; for (i=n; i>=0; i--) { mask_t flip = -((w>>i)&1); - p448_cond_swap(&mont.xa,&mont.xd,flip^pflip); - p448_cond_swap(&mont.za,&mont.zd,flip^pflip); + field_cond_swap(&mont.xa,&mont.xd,flip^pflip); + 
field_cond_swap(&mont.za,&mont.zd,flip^pflip); montgomery_step(&mont); pflip = flip; } n = WORD_BITS-1; } - p448_cond_swap(&mont.xa,&mont.xd,pflip); - p448_cond_swap(&mont.za,&mont.zd,pflip); + field_cond_swap(&mont.xa,&mont.xd,pflip); + field_cond_swap(&mont.za,&mont.zd,pflip); assert(n_extra_doubles < INT_MAX); for (j=0; j<(int)n_extra_doubles; j++) { @@ -51,8 +51,8 @@ cond_negate_tw_niels ( struct tw_niels_t *n, mask_t doNegate ) { - p448_cond_swap(&n->a, &n->b, doNegate); - p448_cond_neg(&n->c, doNegate); + field_cond_swap(&n->a, &n->b, doNegate); + field_cond_neg(&n->c, doNegate); } static __inline__ void @@ -137,34 +137,18 @@ convert_to_signed_window_form ( void scalarmul ( struct tw_extensible_t *working, - const word_t scalar[448/WORD_BITS] + const word_t scalar[SCALAR_WORDS] ) { - const int nbits=450; /* MAGIC */ - word_t prepared_data[448*2/WORD_BITS] = { - - U64LE(0xebec9967f5d3f5c2), - U64LE(0x0aa09b49b16c9a02), - U64LE(0x7f6126aec172cd8e), - U64LE(0x00000007b027e54d), - U64LE(0x0000000000000000), - U64LE(0x0000000000000000), - U64LE(0x4000000000000000), - - U64LE(0xc873d6d54a7bb0cf), - U64LE(0xe933d8d723a70aad), - U64LE(0xbb124b65129c96fd), - U64LE(0x00000008335dc163), - U64LE(0x0000000000000000), - U64LE(0x0000000000000000), - U64LE(0x0000000000000000) - }; /* MAGIC */ - - word_t scalar2[448/WORD_BITS]; - convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS); - - const int WINDOW = 5, /* MAGIC */ + const int WINDOW = SCALARMUL_FIXED_WINDOW_SIZE, WINDOW_MASK = (1<> 1, - NTABLE = 1<<(WINDOW-1); + NTABLE = 1<<(WINDOW-1), + nbits = ROUND_UP(SCALAR_BITS,WINDOW); + + word_t scalar2[SCALAR_WORDS]; + convert_to_signed_window_form ( + scalar2, scalar, SCALAR_WORDS, + SCALARMUL_FIXED_WINDOW_ADJUSTMENT, SCALAR_WORDS + ); struct tw_extensible_t tabulator; copy_tw_extensible(&tabulator, working); @@ -197,7 +181,7 @@ scalarmul ( bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS); - if (i/WORD_BITS < 448/WORD_BITS-1 && i%WORD_BITS >= 
WORD_BITS-WINDOW) { + if (i/WORD_BITS < SCALAR_WORDS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) { bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS)); } @@ -214,34 +198,19 @@ scalarmul ( void scalarmul_vlook ( struct tw_extensible_t *working, - const word_t scalar[448/WORD_BITS] -) { - const int nbits=450; /* HACK? */ - word_t prepared_data[448*2/WORD_BITS] = { - - U64LE(0xebec9967f5d3f5c2), - U64LE(0x0aa09b49b16c9a02), - U64LE(0x7f6126aec172cd8e), - U64LE(0x00000007b027e54d), - U64LE(0x0000000000000000), - U64LE(0x0000000000000000), - U64LE(0x4000000000000000), - - U64LE(0xc873d6d54a7bb0cf), - U64LE(0xe933d8d723a70aad), - U64LE(0xbb124b65129c96fd), - U64LE(0x00000008335dc163), - U64LE(0x0000000000000000), - U64LE(0x0000000000000000), - U64LE(0x0000000000000000) - }; /* MAGIC: split off */ - - word_t scalar2[448/WORD_BITS]; - convert_to_signed_window_form(scalar2,scalar,448/WORD_BITS,prepared_data,448/WORD_BITS); - - const int WINDOW = 5, /* MAGIC */ + const word_t scalar[SCALAR_WORDS] +) { + const int WINDOW = SCALARMUL_FIXED_WINDOW_SIZE, WINDOW_MASK = (1<> 1, - NTABLE = 1<<(WINDOW-1); + NTABLE = 1<<(WINDOW-1), + nbits = ROUND_UP(SCALAR_BITS,WINDOW); + + word_t scalar2[SCALAR_WORDS]; + convert_to_signed_window_form( + scalar2, scalar, SCALAR_WORDS, + SCALARMUL_FIXED_WINDOW_ADJUSTMENT, SCALAR_WORDS + ); + struct tw_extensible_t tabulator; copy_tw_extensible(&tabulator, working); @@ -274,7 +243,7 @@ scalarmul_vlook ( bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS); - if (i/WORD_BITS < 448/WORD_BITS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) { + if (i/WORD_BITS < SCALAR_WORDS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) { bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS)); } @@ -304,8 +273,8 @@ schedule_scalar_for_combs ( unsigned int scalar_words = (nbits + WORD_BITS - 1)/WORD_BITS, scalar2_words = scalar_words; - if (scalar2_words < 448 / WORD_BITS) - scalar2_words = 448 / WORD_BITS; + if (scalar2_words < SCALAR_WORDS) + scalar2_words = SCALAR_WORDS; word_t 
scalar3[scalar2_words]; /* Copy scalar to scalar3, but clear its high bits (if there are any) */ @@ -322,7 +291,7 @@ schedule_scalar_for_combs ( convert_to_signed_window_form ( scalar2, scalar3, scalar2_words, - table->scalar_adjustments , 448 / WORD_BITS + table->scalar_adjustments , SCALAR_WORDS ); return MASK_SUCCESS; @@ -331,7 +300,7 @@ schedule_scalar_for_combs ( mask_t scalarmul_fixed_base ( struct tw_extensible_t *out, - const word_t scalar[448/WORD_BITS], + const word_t scalar[SCALAR_WORDS], unsigned int nbits, const struct fixed_base_table_t *table ) { @@ -339,7 +308,7 @@ scalarmul_fixed_base ( unsigned int n = table->n, t = table->t, s = table->s; unsigned int scalar2_words = (nbits + WORD_BITS - 1)/WORD_BITS; - if (scalar2_words < 448 / WORD_BITS) scalar2_words = 448 / WORD_BITS; + if (scalar2_words < SCALAR_WORDS) scalar2_words = SCALAR_WORDS; word_t scalar2[scalar2_words]; @@ -389,10 +358,10 @@ scalarmul_fixed_base ( mask_t linear_combo_combs_vt ( struct tw_extensible_t *out, - const word_t scalar1[448/WORD_BITS], + const word_t scalar1[SCALAR_WORDS], unsigned int nbits1, const struct fixed_base_table_t *table1, - const word_t scalar2[448/WORD_BITS], + const word_t scalar2[SCALAR_WORDS], unsigned int nbits2, const struct fixed_base_table_t *table2 ) { @@ -400,10 +369,10 @@ linear_combo_combs_vt ( unsigned int s1 = table1->s, s2 = table2->s, smax = (s1 > s2) ? 
s1 : s2; unsigned int scalar1b_words = (nbits1 + WORD_BITS - 1)/WORD_BITS; - if (scalar1b_words < 448 / WORD_BITS) scalar1b_words = 448 / WORD_BITS; + if (scalar1b_words < SCALAR_WORDS) scalar1b_words = SCALAR_WORDS; unsigned int scalar2b_words = (nbits2 + WORD_BITS - 1)/WORD_BITS; - if (scalar2b_words < 448 / WORD_BITS) scalar2b_words = 448 / WORD_BITS; + if (scalar2b_words < SCALAR_WORDS) scalar2b_words = SCALAR_WORDS; word_t scalar1b[scalar1b_words], scalar2b[scalar2b_words]; @@ -479,7 +448,7 @@ precompute_fixed_base ( unsigned int s, struct tw_niels_t *prealloc ) { - if (s < 1 || t < 1 || n < 1 || n*t*s < 446) { /* MAGIC */ + if (s < 1 || t < 1 || n < 1 || n*t*s < SCALAR_BITS) { memset(out, 0, sizeof(*out)); return 0; } @@ -493,8 +462,8 @@ precompute_fixed_base ( struct tw_pniels_t pn_tmp; struct tw_pniels_t *doubles = (struct tw_pniels_t *) malloc_vector(sizeof(*doubles) * (t-1)); - struct p448_t *zs = (struct p448_t *) malloc_vector(sizeof(*zs) * (n<<(t-1))); - struct p448_t *zis = (struct p448_t *) malloc_vector(sizeof(*zis) * (n<<(t-1))); + struct field_t *zs = (struct field_t *) malloc_vector(sizeof(*zs) * (n<<(t-1))); + struct field_t *zis = (struct field_t *) malloc_vector(sizeof(*zis) * (n<<(t-1))); struct tw_niels_t *table = prealloc; if (prealloc) { @@ -519,30 +488,19 @@ precompute_fixed_base ( /* Compute the scalar adjustments, equal to 2^nbits-1 mod q */ unsigned int adjustment_size = (n*t*s)/WORD_BITS + 1; - assert(adjustment_size >= 448/WORD_BITS); + assert(adjustment_size >= SCALAR_WORDS); word_t adjustment[adjustment_size]; for (i=0; iscalar_adjustments[(448/WORD_BITS)*(adjustment[0] & 1)], - *high_adjustment = &out->scalar_adjustments[(448/WORD_BITS)*((~adjustment[0]) & 1)]; - for (i=0; i<448/WORD_BITS; i++) { + barrett_reduce(adjustment, adjustment_size, 0, &curve_prime_order); + word_t *low_adjustment = &out->scalar_adjustments[(SCALAR_WORDS)*(adjustment[0] & 1)], + *high_adjustment = &out->scalar_adjustments[(SCALAR_WORDS)*((~adjustment[0]) 
& 1)]; + for (i=0; i= (1u<<(t-1)) - 1) break; int delta = (j+1) ^ ((j+1)>>1) ^ gray; @@ -611,24 +569,24 @@ precompute_fixed_base ( } } - simultaneous_invert_p448(zis, zs, n<<(t-1)); + simultaneous_invert(zis, zs, n<<(t-1)); - p448_t product; + field_t product; for (i=0; i 0) { @@ -688,32 +646,32 @@ precompute_fixed_base_wnaf ( add_tw_pniels_to_tw_extensible(&base, &tmp); convert_tw_extensible_to_tw_pniels(&tmp, &base); - p448_copy(&zs[1], &tmp.z); + field_copy(&zs[1], &tmp.z); copy_tw_niels(&out[1], &tmp.n); for (i=2; i < 1<= (2<> 1; // |delta| < 2^tablebits + int delta = (current + 1) >> 1; /* |delta| < 2^tablebits */ current = -(current & 1); for (j=i; (delta & 1) == 0; j++) { @@ -813,10 +771,10 @@ prepare_wnaf_table( void scalarmul_vt ( struct tw_extensible_t *working, - const word_t scalar[448/WORD_BITS] + const word_t scalar[SCALAR_WORDS], + unsigned int nbits ) { - /* HACK: not 448? */ - const int nbits=448, table_bits = 3; + const int table_bits = SCALARMUL_WNAF_TABLE_BITS; struct smvt_control control[nbits/(table_bits+1)+3]; int control_bits = recode_wnaf(control, scalar, nbits, table_bits); @@ -854,7 +812,7 @@ scalarmul_vt ( void scalarmul_fixed_base_wnaf_vt ( struct tw_extensible_t *working, - const word_t scalar[448/WORD_BITS], + const word_t scalar[SCALAR_WORDS], unsigned int nbits, const struct tw_niels_t *precmp, unsigned int table_bits @@ -895,14 +853,14 @@ scalarmul_fixed_base_wnaf_vt ( void linear_combo_var_fixed_vt( struct tw_extensible_t *working, - const word_t scalar_var[448/WORD_BITS], + const word_t scalar_var[SCALAR_WORDS], unsigned int nbits_var, - const word_t scalar_pre[448/WORD_BITS], + const word_t scalar_pre[SCALAR_WORDS], unsigned int nbits_pre, const struct tw_niels_t *precmp, unsigned int table_bits_pre ) { - const int table_bits_var = 4; + const int table_bits_var = SCALARMUL_WNAF_COMBO_TABLE_BITS; struct smvt_control control_var[nbits_var/(table_bits_var+1)+3]; struct smvt_control control_pre[nbits_pre/(table_bits_pre+1)+3]; diff 
--git a/src/sha512.c b/src/sha512.c index 09d2e4c..82f81ad 100644 --- a/src/sha512.c +++ b/src/sha512.c @@ -2,8 +2,8 @@ * Copyright (c) 2014 Cryptography Research, Inc. * Released under the MIT License. See LICENSE.txt for license information. */ -#include "sha512.h" #include "word.h" +#include "sha512.h" #include #include @@ -163,9 +163,11 @@ sha512_final ( sha512_process_block(ctx); fill = 0; } - memset(ctx->block + fill, 0, 120-fill); - uint64_t size = htobe64((ctx->nbytes * 8)); - memcpy(&ctx->block[120], &size, sizeof(size)); + memset(ctx->block + fill, 0, 112-fill); + + uint64_t highCount = 0, lowCount = htobe64((ctx->nbytes * 8)); + memcpy(&ctx->block[112],&highCount,8); + memcpy(&ctx->block[120],&lowCount,8); sha512_process_block(ctx); for (i=0; i<8; i++) { ctx->chain[i] = htobe64(ctx->chain[i]); diff --git a/test/bench.c b/test/bench.c index 2e90e9a..b80be14 100644 --- a/test/bench.c +++ b/test/bench.c @@ -100,6 +100,9 @@ int main(int argc, char **argv) { for (i=0; i<32; i++) initial_seed[i] = i; struct crandom_state_t crand; crandom_init_from_buffer(&crand, initial_seed); + /* For testing the performance drop from the crandom debuffering change. + ignore_result(crandom_init_from_file(&crand, "/dev/urandom", 10000, 1)); + */ word_t sk[448/WORD_BITS],tk[448/WORD_BITS]; q448_randomize(&crand, sk); @@ -248,14 +251,14 @@ int main(int argc, char **argv) { when = now(); for (i=0; i