/** * @cond internal * @file x25519.c * @copyright * Copyright (c) 2015-2016 Cryptography Research, Inc. \n * Released under the MIT License. See LICENSE.txt for license information. * @author Mike Hamburg * @brief Key exchange and signatures based on X25519. */ #include #include "x25519.h" #include "strobe.h" #include "strobe_config.h" #if X25519_WBITS == 64 typedef uint64_t limb_t; typedef __uint128_t dlimb_t; typedef __int128_t sdlimb_t; #define eswap_limb eswap_letoh_64 #define LIMB(x) x##ull #elif X25519_WBITS == 32 typedef uint32_t limb_t; typedef uint64_t dlimb_t; typedef int64_t sdlimb_t; #define eswap_limb eswap_letoh_32 #define LIMB(x) (uint32_t)(x##ull),(uint32_t)((x##ull)>>32) #else #error "Need to know X25519_WBITS" #endif #define NLIMBS (256/X25519_WBITS) typedef limb_t fe[NLIMBS]; #if X25519_SUPPORT_SIGN typedef limb_t scalar_t[NLIMBS]; static const limb_t MONTGOMERY_FACTOR = (limb_t)0xd2b51da312547e1bull; static const scalar_t sc_p = { LIMB(0x5812631a5cf5d3ed), LIMB(0x14def9dea2f79cd6), LIMB(0x0000000000000000), LIMB(0x1000000000000000) }, sc_r2 = { LIMB(0xa40611e3449c0f01), LIMB(0xd00e1ba768859347), LIMB(0xceec73d217f5be65), LIMB(0x0399411b7c309a3d) }; #endif static inline limb_t umaal( limb_t *carry, limb_t acc, limb_t mand, limb_t mier ) { dlimb_t tmp = (dlimb_t) mand * mier + acc + *carry; *carry = tmp >> X25519_WBITS; return tmp; } /* These functions are implemented in terms of umaal on ARM */ static inline limb_t adc(limb_t *carry, limb_t acc, limb_t mand) { dlimb_t total = (dlimb_t)*carry + acc + mand; *carry = total>>X25519_WBITS; return total; } static inline limb_t adc0(limb_t *carry, limb_t acc) { dlimb_t total = (dlimb_t)*carry + acc; *carry = total>>X25519_WBITS; return total; } /* Precondition: carry is small. * Invariant: result of propagate is < 2^255 + 1 word * In particular, always less than 2p. * Also, output x >= min(x,19) */ static void propagate(fe x, limb_t over) { unsigned i; over = x[NLIMBS-1]>>(X25519_WBITS-1) | over<<1; x[NLIMBS-1] &= ~((limb_t)1<<(X25519_WBITS-1)); limb_t carry = over * 19; for (i=0; i>= X25519_WBITS; } propagate(out,2+carry); } static void __attribute__((unused)) swapin(limb_t *x, const uint8_t *in) { memcpy(x,in,sizeof(fe)); unsigned i; for (i=0; i>= X25519_WBITS; } return ((dlimb_t)res - 1) >> X25519_WBITS; } static const limb_t a24[1] = { 121665 }; static void ladder_part1(fe xs[]) { limb_t *x2 = xs[0], *z2=xs[1],*x3=xs[2],*z3=xs[3],*t1=xs[4]; add(t1,x2,z2); // t1 = A sub(z2,x2,z2); // z2 = B add(x2,x3,z3); // x2 = C sub(z3,x3,z3); // z3 = D mul1(z3,t1); // z3 = DA mul1(x2,z2); // x3 = BC add(x3,z3,x2); // x3 = DA+CB sub(z3,z3,x2); // z3 = DA-CB sqr1(t1); // t1 = AA sqr1(z2); // z2 = BB sub(x2,t1,z2); // x2 = E = AA-BB mul(z2,x2,a24,sizeof(a24)/sizeof(a24[0])); // z2 = E*a24 add(z2,z2,t1); // z2 = E*a24 + AA } static void ladder_part2(fe xs[5], const fe x1) { limb_t *x2 = xs[0], *z2=xs[1],*x3=xs[2],*z3=xs[3],*t1=xs[4]; sqr1(z3); // z3 = (DA-CB)^2 mul1(z3,x1); // z3 = x1 * (DA-CB)^2 sqr1(x3); // x3 = (DA+CB)^2 mul1(z2,x2); // z2 = AA*(E*a24+AA) sub(x2,t1,x2); // x2 = BB again mul1(x2,t1); // x2 = AA*BB } static void x25519_core(fe xs[], const uint8_t scalar[X25519_BYTES], const uint8_t *x1, int clamp) { int i; #if X25519_MEMCPY_PARAMS fe x1i; swapin(x1i,x1); x1 = (const uint8_t *)x1; #endif limb_t swap = 0; limb_t *x2 = xs[0],*x3=xs[2],*z3=xs[3]; memset(xs,0,4*sizeof(fe)); x2[0] = z3[0] = 1; memcpy(x3,x1,sizeof(fe)); for (i=255; i>=0; i--) { uint8_t bytei = scalar[i/8]; if (clamp) { if (i/8 == 0) { bytei &= ~7; } else if (i/8 == X25519_BYTES-1) { bytei &= 0x7F; bytei |= 0x40; } } limb_t doswap = -(limb_t)((bytei>>(i%8)) & 1); condswap(x2,x3,swap^doswap); swap = doswap; ladder_part1(xs); ladder_part2(xs,(const limb_t *)x1); } condswap(x2,x3,swap); } int x25519(uint8_t out[X25519_BYTES], const uint8_t scalar[X25519_BYTES], const uint8_t x1[X25519_BYTES], int clamp) { fe xs[5]; x25519_core(xs,scalar,x1,clamp); /* Precomputed inversion chain */ limb_t *x2 = xs[0], *z2=xs[1], *z3=xs[3]; int i; limb_t *prev = z2; #if X25519_USE_POWER_CHAIN static const struct { uint8_t a,c,n; } steps[13] = { {2,1,1 }, {2,1,1 }, {4,2,3 }, {2,4,6 }, {3,1,1 }, {3,2,12 }, {4,3,25 }, {2,3,25 }, {2,4,50 }, {3,2,125}, {3,1,2 }, {3,1,2 }, {3,1,1 } }; for (i=0; i<13; i++) { int j; limb_t *a = xs[steps[i].a]; for (j=steps[i].n; j>0; j--) { sqr(a, prev); prev = a; } mul1(a,xs[steps[i].c]); } #else /* Raise to the p-2 = 0x7f..ffeb */ for (i=253; i>=0; i--) { sqr(z3,prev); prev = z3; if (i>=8 || (0xeb>>i & 1)) { mul1(z3,z2); } } #endif /* Here prev = z3 */ /* x2 /= z2 */ #if X25519_MEMCPY_PARAMS mul1(x2,z3); int ret = canon(x2); swapout(out,x2); #else mul((limb_t *)out, x2, z3, NLIMBS); int ret = canon((limb_t*)out); #endif if (clamp) return ret; else return 0; } const uint8_t X25519_BASE_POINT[X25519_BYTES] = {9}; #if X25519_SUPPORT_VERIFY static limb_t x25519_verify_core( fe xs[], const limb_t *other1, const uint8_t other2[X25519_BYTES] ) { limb_t *z2=xs[1],*x3=xs[2],*z3=xs[3]; #if X25519_MEMCPY_PARAMS fe xo2; swapin(xo2,other2); #else const limb_t *xo2 = (const limb_t *)other2; #endif memcpy(x3, other1, 2*sizeof(fe)); ladder_part1(xs); /* Here z2 = t2^2 */ mul1(z2,other1); mul1(z2,other1+NLIMBS); mul1(z2,xo2); const limb_t sixteen = 16; mul (z2,z2,&sixteen,1); mul1(z3,xo2); sub(z3,z3,x3); sqr1(z3); /* check equality */ sub(z3,z3,z2); /* If canon(z2) then both sides are zero. * If canon(z3) then the two sides are equal. * * Reject sigs where both sides are zero, because * that can happen if an input causes the ladder to * return 0/0. */ return canon(z2) | ~canon(z3); } int x25519_verify_p2 ( const uint8_t response[X25519_BYTES], const uint8_t challenge[X25519_BYTES], const uint8_t eph[X25519_BYTES], const uint8_t pub[X25519_BYTES] ) { fe xs[7]; x25519_core(&xs[0],challenge,pub,0); x25519_core(&xs[2],response,X25519_BASE_POINT,0); return x25519_verify_core(&xs[2],xs[0],eph); } #endif // X25519_SUPPORT_VERIFY #if X25519_SUPPORT_SIGN static void sc_montmul ( scalar_t out, const scalar_t a, const scalar_t b ) { /** * OK, so carry bounding. We're using a high carry, so that the * inputs don't have to be reduced. * * First montmul: output < (M^2 + Mp)/M = M+p, subtract p, < M. This gets rid of high carry. * Second montmul, by r^2 mod p < p: output < (Mp + Mp)/M = 2p, subtract p, < p, done. */ unsigned i,j; limb_t hic = 0; for (i=0; i0) out[j-1] = acc; } /* Add two carry registers and high carry */ out[NLIMBS-1] = adc(&hic, carry, carry2); } /* Reduce */ sdlimb_t scarry = 0; for (i=0; i>= X25519_WBITS; } limb_t need_add = -(scarry + hic); limb_t carry = 0; for (i=0; i