You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

303 lines
7.5 KiB

  1. /* Copyright (c) 2014 Cryptography Research, Inc.
  2. * Released under the MIT License. See LICENSE.txt for license information.
  3. */
  4. #include "f_field.h"
  5. void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
  6. const uint64_t *a = as->limb, *b = bs->limb;
  7. uint64_t *c = cs->limb;
  8. __uint128_t accum0 = 0, accum1 = 0, accum2;
  9. uint64_t mask = (1ull<<56) - 1;
  10. uint64_t aa[4], bb[4], bbb[4];
  11. unsigned int i;
  12. for (i=0; i<4; i++) {
  13. aa[i] = a[i] + a[i+4];
  14. bb[i] = b[i] + b[i+4];
  15. bbb[i] = bb[i] + b[i+4];
  16. }
  17. int I_HATE_UNROLLED_LOOPS = 0;
  18. if (I_HATE_UNROLLED_LOOPS) {
  19. /* The compiler probably won't unroll this,
  20. * so it's like 80% slower.
  21. */
  22. for (i=0; i<4; i++) {
  23. accum2 = 0;
  24. unsigned int j;
  25. for (j=0; j<=i; j++) {
  26. accum2 += widemul(a[j], b[i-j]);
  27. accum1 += widemul(aa[j], bb[i-j]);
  28. accum0 += widemul(a[j+4], b[i-j+4]);
  29. }
  30. for (; j<4; j++) {
  31. accum2 += widemul(a[j], b[i-j+8]);
  32. accum1 += widemul(aa[j], bbb[i-j+4]);
  33. accum0 += widemul(a[j+4], bb[i-j+4]);
  34. }
  35. accum1 -= accum2;
  36. accum0 += accum2;
  37. c[i] = ((uint64_t)(accum0)) & mask;
  38. c[i+4] = ((uint64_t)(accum1)) & mask;
  39. accum0 >>= 56;
  40. accum1 >>= 56;
  41. }
  42. } else {
  43. accum2 = widemul(a[0], b[0]);
  44. accum1 += widemul(aa[0], bb[0]);
  45. accum0 += widemul(a[4], b[4]);
  46. accum2 += widemul(a[1], b[7]);
  47. accum1 += widemul(aa[1], bbb[3]);
  48. accum0 += widemul(a[5], bb[3]);
  49. accum2 += widemul(a[2], b[6]);
  50. accum1 += widemul(aa[2], bbb[2]);
  51. accum0 += widemul(a[6], bb[2]);
  52. accum2 += widemul(a[3], b[5]);
  53. accum1 += widemul(aa[3], bbb[1]);
  54. accum0 += widemul(a[7], bb[1]);
  55. accum1 -= accum2;
  56. accum0 += accum2;
  57. c[0] = ((uint64_t)(accum0)) & mask;
  58. c[4] = ((uint64_t)(accum1)) & mask;
  59. accum0 >>= 56;
  60. accum1 >>= 56;
  61. accum2 = widemul(a[0], b[1]);
  62. accum1 += widemul(aa[0], bb[1]);
  63. accum0 += widemul(a[4], b[5]);
  64. accum2 += widemul(a[1], b[0]);
  65. accum1 += widemul(aa[1], bb[0]);
  66. accum0 += widemul(a[5], b[4]);
  67. accum2 += widemul(a[2], b[7]);
  68. accum1 += widemul(aa[2], bbb[3]);
  69. accum0 += widemul(a[6], bb[3]);
  70. accum2 += widemul(a[3], b[6]);
  71. accum1 += widemul(aa[3], bbb[2]);
  72. accum0 += widemul(a[7], bb[2]);
  73. accum1 -= accum2;
  74. accum0 += accum2;
  75. c[1] = ((uint64_t)(accum0)) & mask;
  76. c[5] = ((uint64_t)(accum1)) & mask;
  77. accum0 >>= 56;
  78. accum1 >>= 56;
  79. accum2 = widemul(a[0], b[2]);
  80. accum1 += widemul(aa[0], bb[2]);
  81. accum0 += widemul(a[4], b[6]);
  82. accum2 += widemul(a[1], b[1]);
  83. accum1 += widemul(aa[1], bb[1]);
  84. accum0 += widemul(a[5], b[5]);
  85. accum2 += widemul(a[2], b[0]);
  86. accum1 += widemul(aa[2], bb[0]);
  87. accum0 += widemul(a[6], b[4]);
  88. accum2 += widemul(a[3], b[7]);
  89. accum1 += widemul(aa[3], bbb[3]);
  90. accum0 += widemul(a[7], bb[3]);
  91. accum1 -= accum2;
  92. accum0 += accum2;
  93. c[2] = ((uint64_t)(accum0)) & mask;
  94. c[6] = ((uint64_t)(accum1)) & mask;
  95. accum0 >>= 56;
  96. accum1 >>= 56;
  97. accum2 = widemul(a[0], b[3]);
  98. accum1 += widemul(aa[0], bb[3]);
  99. accum0 += widemul(a[4], b[7]);
  100. accum2 += widemul(a[1], b[2]);
  101. accum1 += widemul(aa[1], bb[2]);
  102. accum0 += widemul(a[5], b[6]);
  103. accum2 += widemul(a[2], b[1]);
  104. accum1 += widemul(aa[2], bb[1]);
  105. accum0 += widemul(a[6], b[5]);
  106. accum2 += widemul(a[3], b[0]);
  107. accum1 += widemul(aa[3], bb[0]);
  108. accum0 += widemul(a[7], b[4]);
  109. accum1 -= accum2;
  110. accum0 += accum2;
  111. c[3] = ((uint64_t)(accum0)) & mask;
  112. c[7] = ((uint64_t)(accum1)) & mask;
  113. accum0 >>= 56;
  114. accum1 >>= 56;
  115. } /* !I_HATE_UNROLLED_LOOPS */
  116. accum0 += accum1;
  117. accum0 += c[4];
  118. accum1 += c[0];
  119. c[4] = ((uint64_t)(accum0)) & mask;
  120. c[0] = ((uint64_t)(accum1)) & mask;
  121. accum0 >>= 56;
  122. accum1 >>= 56;
  123. c[5] += ((uint64_t)(accum0));
  124. c[1] += ((uint64_t)(accum1));
  125. }
  126. void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
  127. const uint64_t *a = as->limb;
  128. uint64_t *c = cs->limb;
  129. __uint128_t accum0 = 0, accum4 = 0;
  130. uint64_t mask = (1ull<<56) - 1;
  131. int i;
  132. for (i=0; i<4; i++) {
  133. accum0 += widemul(b, a[i]);
  134. accum4 += widemul(b, a[i+4]);
  135. c[i] = accum0 & mask; accum0 >>= 56;
  136. c[i+4] = accum4 & mask; accum4 >>= 56;
  137. }
  138. accum0 += accum4 + c[4];
  139. c[4] = accum0 & mask;
  140. c[5] += accum0 >> 56;
  141. accum4 += c[0];
  142. c[0] = accum4 & mask;
  143. c[1] += accum4 >> 56;
  144. }
  145. void gf_sqr (gf_s *__restrict__ cs, const gf as) {
  146. const uint64_t *a = as->limb;
  147. uint64_t *c = cs->limb;
  148. __uint128_t accum0 = 0, accum1 = 0, accum2;
  149. uint64_t mask = (1ull<<56) - 1;
  150. uint64_t aa[4];
  151. /* For some reason clang doesn't vectorize this without prompting? */
  152. unsigned int i;
  153. for (i=0; i<4; i++) {
  154. aa[i] = a[i] + a[i+4];
  155. }
  156. accum2 = widemul(a[0],a[3]);
  157. accum0 = widemul(aa[0],aa[3]);
  158. accum1 = widemul(a[4],a[7]);
  159. accum2 += widemul(a[1], a[2]);
  160. accum0 += widemul(aa[1], aa[2]);
  161. accum1 += widemul(a[5], a[6]);
  162. accum0 -= accum2;
  163. accum1 += accum2;
  164. c[3] = ((uint64_t)(accum1))<<1 & mask;
  165. c[7] = ((uint64_t)(accum0))<<1 & mask;
  166. accum0 >>= 55;
  167. accum1 >>= 55;
  168. accum0 += widemul(2*aa[1],aa[3]);
  169. accum1 += widemul(2*a[5], a[7]);
  170. accum0 += widemul(aa[2], aa[2]);
  171. accum1 += accum0;
  172. accum0 -= widemul(2*a[1], a[3]);
  173. accum1 += widemul(a[6], a[6]);
  174. accum2 = widemul(a[0],a[0]);
  175. accum1 -= accum2;
  176. accum0 += accum2;
  177. accum0 -= widemul(a[2], a[2]);
  178. accum1 += widemul(aa[0], aa[0]);
  179. accum0 += widemul(a[4], a[4]);
  180. c[0] = ((uint64_t)(accum0)) & mask;
  181. c[4] = ((uint64_t)(accum1)) & mask;
  182. accum0 >>= 56;
  183. accum1 >>= 56;
  184. accum2 = widemul(2*aa[2],aa[3]);
  185. accum0 -= widemul(2*a[2], a[3]);
  186. accum1 += widemul(2*a[6], a[7]);
  187. accum1 += accum2;
  188. accum0 += accum2;
  189. accum2 = widemul(2*a[0],a[1]);
  190. accum1 += widemul(2*aa[0], aa[1]);
  191. accum0 += widemul(2*a[4], a[5]);
  192. accum1 -= accum2;
  193. accum0 += accum2;
  194. c[1] = ((uint64_t)(accum0)) & mask;
  195. c[5] = ((uint64_t)(accum1)) & mask;
  196. accum0 >>= 56;
  197. accum1 >>= 56;
  198. accum2 = widemul(aa[3],aa[3]);
  199. accum0 -= widemul(a[3], a[3]);
  200. accum1 += widemul(a[7], a[7]);
  201. accum1 += accum2;
  202. accum0 += accum2;
  203. accum2 = widemul(2*a[0],a[2]);
  204. accum1 += widemul(2*aa[0], aa[2]);
  205. accum0 += widemul(2*a[4], a[6]);
  206. accum2 += widemul(a[1], a[1]);
  207. accum1 += widemul(aa[1], aa[1]);
  208. accum0 += widemul(a[5], a[5]);
  209. accum1 -= accum2;
  210. accum0 += accum2;
  211. c[2] = ((uint64_t)(accum0)) & mask;
  212. c[6] = ((uint64_t)(accum1)) & mask;
  213. accum0 >>= 56;
  214. accum1 >>= 56;
  215. accum0 += c[3];
  216. accum1 += c[7];
  217. c[3] = ((uint64_t)(accum0)) & mask;
  218. c[7] = ((uint64_t)(accum1)) & mask;
  219. /* we could almost stop here, but it wouldn't be stable, so... */
  220. accum0 >>= 56;
  221. accum1 >>= 56;
  222. c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
  223. c[0] += ((uint64_t)(accum1));
  224. }