You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

292 lines
7.1 KiB

  1. /* Copyright (c) 2014 Cryptography Research, Inc.
  2. * Released under the MIT License. See LICENSE.txt for license information.
  3. */
  4. #include "f_field.h"
  /*
   * gf_mul: multiply the field elements *as and *bs and store the product
   * limbs in *cs.
   *
   * cs      output; all eight limbs c[0..7] are written. It is declared
   *         __restrict__, so it must not alias as or bs.
   * as, bs  inputs; each is read through ->limb as eight uint64_t limbs.
   *
   * Limbs are handled as radix-2^60 digits: every column result is masked to
   * 60 bits and the 128-bit accumulator is shifted right by 60 to form the
   * carry into the next column.  The last two statements add the remaining
   * carries into c[4] and c[0] without re-masking, so those two limbs may end
   * up a little above 60 bits.
   *
   * The carry out of limb 7 is added to both c[0] and c[4], which is what a
   * reduction modulo 2^480 - 2^240 - 1 would do.
   * NOTE(review): confirm the modulus against f_field.h.
   *
   * widemul, mac and msb are defined outside this block; presumably a
   * 64x64->128 multiply, a multiply-accumulate and a multiply-subtract.
   * NOTE(review): the split into low halves, high halves and summed halves
   * looks like a Karatsuba-style multiplication -- confirm.
   */
  5. void gf_mul (gf *__restrict__ cs, const gf *as, const gf *bs) {
  6. const uint64_t *a = as->limb, *b = bs->limb;
  7. uint64_t *c = cs->limb;
  /* The zero initialisers of accum0/accum1 are overwritten by widemul below before being read. */
  8. __uint128_t accum0 = 0, accum1 = 0, accum2;
  /* Keeps the low 60 bits of an accumulator. */
  9. uint64_t mask = (1ull<<60) - 1;
  /* aa = a[0..3] + a[4..7], bb = b[0..3] + b[4..7], bbb = bb + b[4..7]; filled in by the loop below. */
  10. uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))), bbb[4] __attribute__((aligned(32)));
  11. /* For some reason clang doesn't vectorize this without prompting? */
  /* uint64xn_t is declared elsewhere; presumably a SIMD vector of uint64_t lanes, so each iteration adds several limbs at once. */
  12. unsigned int i;
  13. for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
  14. ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
  15. ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];
  16. ((uint64xn_t*)bbb)[i] = ((const uint64xn_t*)bb)[i] + ((const uint64xn_t*)(&b[4]))[i];
  17. }
  /* Scalar form of the aa/bb sums above, kept for reference (it does not compute bbb). */
  18. /*
  19. for (int i=0; i<4; i++) {
  20. aa[i] = a[i] + a[i+4];
  21. bb[i] = b[i] + b[i+4];
  22. }
  23. */
  /*
   * First column: products whose limb offsets sum to 3, taken three ways --
   * low halves (accum2), summed halves (accum0) and high halves (accum1).
   */
  24. accum2 = widemul(&a[0],&b[3]);
  25. accum0 = widemul(&aa[0],&bb[3]);
  26. accum1 = widemul(&a[4],&b[7]);
  27. mac(&accum2, &a[1], &b[2]);
  28. mac(&accum0, &aa[1], &bb[2]);
  29. mac(&accum1, &a[5], &b[6]);
  30. mac(&accum2, &a[2], &b[1]);
  31. mac(&accum0, &aa[2], &bb[1]);
  32. mac(&accum1, &a[6], &b[5]);
  33. mac(&accum2, &a[3], &b[0]);
  34. mac(&accum0, &aa[3], &bb[0]);
  35. mac(&accum1, &a[7], &b[4]);
  /* Fold the low-half sum into the other two accumulators. */
  36. accum0 -= accum2;
  37. accum1 += accum2;
  /* Provisional c[3] and c[7]; both are revisited near the end once the carries from limbs 2 and 6 are known. */
  38. c[3] = ((uint64_t)(accum1)) & mask;
  39. c[7] = ((uint64_t)(accum0)) & mask;
  40. accum0 >>= 60;
  41. accum1 >>= 60;
  /* Next column, continuing from the carries just produced; it yields c[0] (from accum0) and c[4] (from accum1). */
  42. mac(&accum0, &aa[1],&bb[3]);
  43. mac(&accum1, &a[5], &b[7]);
  44. mac(&accum0, &aa[2], &bb[2]);
  45. mac(&accum1, &a[6], &b[6]);
  46. mac(&accum0, &aa[3], &bb[1]);
  47. accum1 += accum0;
  48. accum2 = widemul(&a[0],&b[0]);
  49. accum1 -= accum2;
  50. accum0 += accum2;
  51. msb(&accum0, &a[1], &b[3]);
  52. msb(&accum0, &a[2], &b[2]);
  53. mac(&accum1, &a[7], &b[5]);
  54. msb(&accum0, &a[3], &b[1]);
  55. mac(&accum1, &aa[0], &bb[0]);
  56. mac(&accum0, &a[4], &b[4]);
  57. c[0] = ((uint64_t)(accum0)) & mask;
  58. c[4] = ((uint64_t)(accum1)) & mask;
  59. accum0 >>= 60;
  60. accum1 >>= 60;
  /* Column yielding c[1] and c[5]; accum2 is restarted as a separate partial sum and folded in at the end. */
  61. accum2 = widemul(&a[2],&b[7]);
  62. mac(&accum0, &a[6], &bb[3]);
  63. mac(&accum1, &aa[2], &bbb[3]);
  64. mac(&accum2, &a[3], &b[6]);
  65. mac(&accum0, &a[7], &bb[2]);
  66. mac(&accum1, &aa[3], &bbb[2]);
  67. mac(&accum2, &a[0],&b[1]);
  68. mac(&accum1, &aa[0], &bb[1]);
  69. mac(&accum0, &a[4], &b[5]);
  70. mac(&accum2, &a[1], &b[0]);
  71. mac(&accum1, &aa[1], &bb[0]);
  72. mac(&accum0, &a[5], &b[4]);
  73. accum1 -= accum2;
  74. accum0 += accum2;
  75. c[1] = ((uint64_t)(accum0)) & mask;
  76. c[5] = ((uint64_t)(accum1)) & mask;
  77. accum0 >>= 60;
  78. accum1 >>= 60;
  /* Column yielding c[2] and c[6], built the same way as the previous one. */
  79. accum2 = widemul(&a[3],&b[7]);
  80. mac(&accum0, &a[7], &bb[3]);
  81. mac(&accum1, &aa[3], &bbb[3]);
  82. mac(&accum2, &a[0],&b[2]);
  83. mac(&accum1, &aa[0], &bb[2]);
  84. mac(&accum0, &a[4], &b[6]);
  85. mac(&accum2, &a[1], &b[1]);
  86. mac(&accum1, &aa[1], &bb[1]);
  87. mac(&accum0, &a[5], &b[5]);
  88. mac(&accum2, &a[2], &b[0]);
  89. mac(&accum1, &aa[2], &bb[0]);
  90. mac(&accum0, &a[6], &b[4]);
  91. accum1 -= accum2;
  92. accum0 += accum2;
  93. c[2] = ((uint64_t)(accum0)) & mask;
  94. c[6] = ((uint64_t)(accum1)) & mask;
  95. accum0 >>= 60;
  96. accum1 >>= 60;
  /* Add the carries out of limbs 2 and 6 to the provisional c[3] and c[7], then mask again. */
  97. accum0 += c[3];
  98. accum1 += c[7];
  99. c[3] = ((uint64_t)(accum0)) & mask;
  100. c[7] = ((uint64_t)(accum1)) & mask;
  101. /* we could almost stop here, but it wouldn't be stable, so... */
  /* Final carries: the one out of limb 3 goes to c[4]; the one out of limb 7 goes to both c[4] and c[0]. Neither limb is re-masked. */
  102. accum0 >>= 60;
  103. accum1 >>= 60;
  104. c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
  105. c[0] += ((uint64_t)(accum1));
  106. }
  107. void gf_mulw (gf *__restrict__ cs, const gf *as, uint64_t b) {
  108. const uint64_t *a = as->limb;
  109. uint64_t *c = cs->limb;
  110. __uint128_t accum0, accum4;
  111. uint64_t mask = (1ull<<60) - 1;
  112. accum0 = widemul_rm(b, &a[0]);
  113. accum4 = widemul_rm(b, &a[4]);
  114. c[0] = accum0 & mask; accum0 >>= 60;
  115. c[4] = accum4 & mask; accum4 >>= 60;
  116. mac_rm(&accum0, b, &a[1]);
  117. mac_rm(&accum4, b, &a[5]);
  118. c[1] = accum0 & mask; accum0 >>= 60;
  119. c[5] = accum4 & mask; accum4 >>= 60;
  120. mac_rm(&accum0, b, &a[2]);
  121. mac_rm(&accum4, b, &a[6]);
  122. c[2] = accum0 & mask; accum0 >>= 60;
  123. c[6] = accum4 & mask; accum4 >>= 60;
  124. mac_rm(&accum0, b, &a[3]);
  125. mac_rm(&accum4, b, &a[7]);
  126. c[3] = accum0 & mask; accum0 >>= 60;
  127. c[7] = accum4 & mask; accum4 >>= 60;
  128. accum0 += accum4 + c[4];
  129. c[4] = accum0 & mask;
  130. c[5] += accum0 >> 60;
  131. accum4 += c[0];
  132. c[0] = accum4 & mask;
  133. c[1] += accum4 >> 60;
  134. }
  /*
   * gf_sqr: square the field element *as and store the result limbs in *cs.
   *
   * cs  output; all eight limbs c[0..7] are written. It is declared
   *     __restrict__, so it must not alias as.
   * as  input; read through ->limb as eight uint64_t limbs.
   *
   * Limbs are handled as radix-2^60 digits, masked to 60 bits with carries
   * taken by shifting the 128-bit accumulators.  As in the last two
   * statements, the final carries are added into c[4] and c[0] without
   * re-masking.
   *
   * widemul, widemul2, mac, mac2, msb and msb2 are defined outside this
   * block; presumably the "2" variants use a doubled product, so that each
   * cross term a[i]*a[j] (i != j) is computed once -- confirm.
   */
  135. void gf_sqr (gf *__restrict__ cs, const gf *as) {
  136. const uint64_t *a = as->limb;
  137. uint64_t *c = cs->limb;
  /* The zero initialisers of accum0/accum1 are overwritten by widemul below before being read. */
  138. __uint128_t accum0 = 0, accum1 = 0, accum2;
  /* Keeps the low 60 bits of an accumulator. */
  139. uint64_t mask = (1ull<<60) - 1;
  /* aa = a[0..3] + a[4..7]; filled in by the loop below. */
  140. uint64_t aa[4] __attribute__((aligned(32)));
  141. /* For some reason clang doesn't vectorize this without prompting? */
  /* uint64xn_t is declared elsewhere; presumably a SIMD vector of uint64_t lanes. */
  142. unsigned int i;
  143. for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
  144. ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
  145. }
  /*
   * First column: only one of each symmetric pair of products with limb
   * offsets summing to 3 is accumulated here (0*3 and 1*2); the doubling is
   * applied below when the result is stored.
   */
  146. accum2 = widemul(&a[0],&a[3]);
  147. accum0 = widemul(&aa[0],&aa[3]);
  148. accum1 = widemul(&a[4],&a[7]);
  149. mac(&accum2, &a[1], &a[2]);
  150. mac(&accum0, &aa[1], &aa[2]);
  151. mac(&accum1, &a[5], &a[6]);
  152. accum0 -= accum2;
  153. accum1 += accum2;
  /*
   * Double the half-sums: the stored limb is (accum << 1) masked to 60 bits,
   * and shifting right by 59 instead of 60 gives the carry of the doubled
   * value.  c[3] and c[7] are provisional and are revisited near the end.
   */
  154. c[3] = ((uint64_t)(accum1))<<1 & mask;
  155. c[7] = ((uint64_t)(accum0))<<1 & mask;
  156. accum0 >>= 59;
  157. accum1 >>= 59;
  /* Next column, continuing from the carries just produced; it yields c[0] (from accum0) and c[4] (from accum1). */
  158. mac2(&accum0, &aa[1],&aa[3]);
  159. mac2(&accum1, &a[5], &a[7]);
  160. mac(&accum0, &aa[2], &aa[2]);
  161. accum1 += accum0;
  162. msb2(&accum0, &a[1], &a[3]);
  163. mac(&accum1, &a[6], &a[6]);
  164. accum2 = widemul(&a[0],&a[0]);
  165. accum1 -= accum2;
  166. accum0 += accum2;
  167. msb(&accum0, &a[2], &a[2]);
  168. mac(&accum1, &aa[0], &aa[0]);
  169. mac(&accum0, &a[4], &a[4]);
  170. c[0] = ((uint64_t)(accum0)) & mask;
  171. c[4] = ((uint64_t)(accum1)) & mask;
  172. accum0 >>= 60;
  173. accum1 >>= 60;
  /* Column yielding c[1] and c[5]; accum2 is reused for two separate partial sums. */
  174. accum2 = widemul2(&aa[2],&aa[3]);
  175. msb2(&accum0, &a[2], &a[3]);
  176. mac2(&accum1, &a[6], &a[7]);
  177. accum1 += accum2;
  178. accum0 += accum2;
  179. accum2 = widemul2(&a[0],&a[1]);
  180. mac2(&accum1, &aa[0], &aa[1]);
  181. mac2(&accum0, &a[4], &a[5]);
  182. accum1 -= accum2;
  183. accum0 += accum2;
  184. c[1] = ((uint64_t)(accum0)) & mask;
  185. c[5] = ((uint64_t)(accum1)) & mask;
  186. accum0 >>= 60;
  187. accum1 >>= 60;
  /* Column yielding c[2] and c[6], built the same way as the previous one. */
  188. accum2 = widemul(&aa[3],&aa[3]);
  189. msb(&accum0, &a[3], &a[3]);
  190. mac(&accum1, &a[7], &a[7]);
  191. accum1 += accum2;
  192. accum0 += accum2;
  193. accum2 = widemul2(&a[0],&a[2]);
  194. mac2(&accum1, &aa[0], &aa[2]);
  195. mac2(&accum0, &a[4], &a[6]);
  196. mac(&accum2, &a[1], &a[1]);
  197. mac(&accum1, &aa[1], &aa[1]);
  198. mac(&accum0, &a[5], &a[5]);
  199. accum1 -= accum2;
  200. accum0 += accum2;
  201. c[2] = ((uint64_t)(accum0)) & mask;
  202. c[6] = ((uint64_t)(accum1)) & mask;
  203. accum0 >>= 60;
  204. accum1 >>= 60;
  /* Add the carries out of limbs 2 and 6 to the provisional c[3] and c[7], then mask again. */
  205. accum0 += c[3];
  206. accum1 += c[7];
  207. c[3] = ((uint64_t)(accum0)) & mask;
  208. c[7] = ((uint64_t)(accum1)) & mask;
  209. /* we could almost stop here, but it wouldn't be stable, so... */
  /* Final carries: the one out of limb 3 goes to c[4]; the one out of limb 7 goes to both c[4] and c[0]. Neither limb is re-masked. */
  210. accum0 >>= 60;
  211. accum1 >>= 60;
  212. c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
  213. c[0] += ((uint64_t)(accum1));
  214. }