/* modm_donna_32bit.c */
  1. /*
  2. Public domain by Andrew M. <liquidsun@gmail.com>
  3. */
  4. #include "ed25519_donna.h"
  5. /*
  6. Arithmetic modulo the group order n = 2^252 + 27742317777372353535851937790883648493 = 7237005577332262213973186563042994240857116359379907606001950938285454250989
  7. k = 32
  8. b = 1 << 8 = 256
  9. m = 2^252 + 27742317777372353535851937790883648493 = 0x1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed
  10. mu = floor( b^(k*2) / m ) = 0xfffffffffffffffffffffffffffffffeb2106215d086329a7ed9ce5a30a2c131b
  11. */
/* the group order m = 2^252 + 27742317777372353535851937790883648493,
   stored little-endian as 9 limbs of 30 bits each (top limb only 16 bits) */
static const bignum256modm modm_m = {
0x1cf5d3ed,
0x20498c69,
0x2f79cd65,
0x37be77a8,
0x00000014,
0x00000000,
0x00000000,
0x00000000,
0x00001000};
/* Barrett constant mu = floor(b^(2k) / m) with b = 2^8, k = 33 (see header
   comment above), stored in the same 9-limb 30-bit representation */
static const bignum256modm modm_mu = {
0x0a2c131b,
0x3673968c,
0x06329a7e,
0x01885742,
0x3fffeb21,
0x3fffffff,
0x3fffffff,
0x3fffffff,
0x000fffff};
/* Constant-time "a < b": returns 1 when a < b, else 0.
   NOTE(review): relies on bignum256modm_element_t being an unsigned 32-bit
   type so that (a - b) wraps and bit 31 carries the borrow — confirm the
   typedef in ed25519_donna.h. */
static bignum256modm_element_t lt_modm(bignum256modm_element_t a, bignum256modm_element_t b) {
return (a - b) >> 31;
}
  35. /* see HAC, Alg. 14.42 Step 4 */
/* Conditionally subtract the group order once so that r < m (for r < 2*m).
   Constant time: t = r - m is computed unconditionally and a mask derived
   from the final borrow selects r or t, so timing does not depend on r. */
void reduce256_modm(bignum256modm r) {
bignum256modm t = {0};
bignum256modm_element_t b = 0, pb = 0, mask = 0;
/* t = r - m: schoolbook limb-wise subtraction, b carries the borrow */
pb = 0;
pb += modm_m[0];
b = lt_modm(r[0], pb);
t[0] = (r[0] - pb + (b << 30));
pb = b;
pb += modm_m[1];
b = lt_modm(r[1], pb);
t[1] = (r[1] - pb + (b << 30));
pb = b;
pb += modm_m[2];
b = lt_modm(r[2], pb);
t[2] = (r[2] - pb + (b << 30));
pb = b;
pb += modm_m[3];
b = lt_modm(r[3], pb);
t[3] = (r[3] - pb + (b << 30));
pb = b;
pb += modm_m[4];
b = lt_modm(r[4], pb);
t[4] = (r[4] - pb + (b << 30));
pb = b;
pb += modm_m[5];
b = lt_modm(r[5], pb);
t[5] = (r[5] - pb + (b << 30));
pb = b;
pb += modm_m[6];
b = lt_modm(r[6], pb);
t[6] = (r[6] - pb + (b << 30));
pb = b;
pb += modm_m[7];
b = lt_modm(r[7], pb);
t[7] = (r[7] - pb + (b << 30));
pb = b;
pb += modm_m[8];
b = lt_modm(r[8], pb);
t[8] = (r[8] - pb + (b << 16)); /* top limb holds only 16 bits */
/* keep r if r was smaller than m: final borrow b == 1 means r < m, so
   mask = b - 1 is all-zeros (keep r); b == 0 gives all-ones (take t) */
mask = b - 1;
r[0] ^= mask & (r[0] ^ t[0]);
r[1] ^= mask & (r[1] ^ t[1]);
r[2] ^= mask & (r[2] ^ t[2]);
r[3] ^= mask & (r[3] ^ t[3]);
r[4] ^= mask & (r[4] ^ t[4]);
r[5] ^= mask & (r[5] ^ t[5]);
r[6] ^= mask & (r[6] ^ t[6]);
r[7] ^= mask & (r[7] ^ t[7]);
r[8] ^= mask & (r[8] ^ t[8]);
}
  88. /*
  89. Barrett reduction, see HAC, Alg. 14.42
  90. Instead of passing in x, pre-process in to q1 and r1 for efficiency
  91. */
/* Barrett reduction r = x mod m (HAC Alg. 14.42). The caller pre-splits x
   into q1 = x >> 248 and r1 = x mod 2^264, both in 9x30-bit limbs.
   Phase 1 computes q3 = floor(mu * q1 / 2^264); phase 2 computes
   r2 = (q3 * m) mod 2^264; phase 3 forms r = r1 - r2 and performs at most
   two conditional subtractions of m. All carry chains are fixed-length,
   so the function runs in constant time. */
void barrett_reduce256_modm(bignum256modm r, const bignum256modm q1, const bignum256modm r1) {
bignum256modm q3 = {0}, r2 = {0};
uint64_t c = 0;
bignum256modm_element_t f = 0, b = 0, pb = 0;
/* q1 = x >> 248 = 264 bits = 9 30 bit elements
q2 = mu * q1
q3 = (q2 / 256(32+1)) = q2 / (2^8)^(32+1) = q2 >> 264 */
c = mul32x32_64(modm_mu[0], q1[7]) + mul32x32_64(modm_mu[1], q1[6]) +
mul32x32_64(modm_mu[2], q1[5]) + mul32x32_64(modm_mu[3], q1[4]) +
mul32x32_64(modm_mu[4], q1[3]) + mul32x32_64(modm_mu[5], q1[2]) +
mul32x32_64(modm_mu[6], q1[1]) + mul32x32_64(modm_mu[7], q1[0]);
c >>= 30;
c += mul32x32_64(modm_mu[0], q1[8]) + mul32x32_64(modm_mu[1], q1[7]) +
mul32x32_64(modm_mu[2], q1[6]) + mul32x32_64(modm_mu[3], q1[5]) +
mul32x32_64(modm_mu[4], q1[4]) + mul32x32_64(modm_mu[5], q1[3]) +
mul32x32_64(modm_mu[6], q1[2]) + mul32x32_64(modm_mu[7], q1[1]) +
mul32x32_64(modm_mu[8], q1[0]);
/* each column keeps only the bits that land at or above the 2^264 cut;
   the (f >> 24) / (f << 6) splices re-pack them into 30-bit q3 limbs */
f = (bignum256modm_element_t)c;
q3[0] = (f >> 24) & 0x3f;
c >>= 30;
c += mul32x32_64(modm_mu[1], q1[8]) + mul32x32_64(modm_mu[2], q1[7]) +
mul32x32_64(modm_mu[3], q1[6]) + mul32x32_64(modm_mu[4], q1[5]) +
mul32x32_64(modm_mu[5], q1[4]) + mul32x32_64(modm_mu[6], q1[3]) +
mul32x32_64(modm_mu[7], q1[2]) + mul32x32_64(modm_mu[8], q1[1]);
f = (bignum256modm_element_t)c;
q3[0] |= (f << 6) & 0x3fffffff;
q3[1] = (f >> 24) & 0x3f;
c >>= 30;
c += mul32x32_64(modm_mu[2], q1[8]) + mul32x32_64(modm_mu[3], q1[7]) +
mul32x32_64(modm_mu[4], q1[6]) + mul32x32_64(modm_mu[5], q1[5]) +
mul32x32_64(modm_mu[6], q1[4]) + mul32x32_64(modm_mu[7], q1[3]) +
mul32x32_64(modm_mu[8], q1[2]);
f = (bignum256modm_element_t)c;
q3[1] |= (f << 6) & 0x3fffffff;
q3[2] = (f >> 24) & 0x3f;
c >>= 30;
c += mul32x32_64(modm_mu[3], q1[8]) + mul32x32_64(modm_mu[4], q1[7]) +
mul32x32_64(modm_mu[5], q1[6]) + mul32x32_64(modm_mu[6], q1[5]) +
mul32x32_64(modm_mu[7], q1[4]) + mul32x32_64(modm_mu[8], q1[3]);
f = (bignum256modm_element_t)c;
q3[2] |= (f << 6) & 0x3fffffff;
q3[3] = (f >> 24) & 0x3f;
c >>= 30;
c += mul32x32_64(modm_mu[4], q1[8]) + mul32x32_64(modm_mu[5], q1[7]) +
mul32x32_64(modm_mu[6], q1[6]) + mul32x32_64(modm_mu[7], q1[5]) +
mul32x32_64(modm_mu[8], q1[4]);
f = (bignum256modm_element_t)c;
q3[3] |= (f << 6) & 0x3fffffff;
q3[4] = (f >> 24) & 0x3f;
c >>= 30;
c += mul32x32_64(modm_mu[5], q1[8]) + mul32x32_64(modm_mu[6], q1[7]) +
mul32x32_64(modm_mu[7], q1[6]) + mul32x32_64(modm_mu[8], q1[5]);
f = (bignum256modm_element_t)c;
q3[4] |= (f << 6) & 0x3fffffff;
q3[5] = (f >> 24) & 0x3f;
c >>= 30;
c += mul32x32_64(modm_mu[6], q1[8]) + mul32x32_64(modm_mu[7], q1[7]) +
mul32x32_64(modm_mu[8], q1[6]);
f = (bignum256modm_element_t)c;
q3[5] |= (f << 6) & 0x3fffffff;
q3[6] = (f >> 24) & 0x3f;
c >>= 30;
c += mul32x32_64(modm_mu[7], q1[8]) + mul32x32_64(modm_mu[8], q1[7]);
f = (bignum256modm_element_t)c;
q3[6] |= (f << 6) & 0x3fffffff;
q3[7] = (f >> 24) & 0x3f;
c >>= 30;
c += mul32x32_64(modm_mu[8], q1[8]);
f = (bignum256modm_element_t)c;
q3[7] |= (f << 6) & 0x3fffffff;
q3[8] = (bignum256modm_element_t)(c >> 24);
/* r1 = (x mod 256^(32+1)) = x mod (2^8)(32+1) = x & ((1 << 264) - 1)
r2 = (q3 * m) mod (256^(32+1)) = (q3 * m) & ((1 << 264) - 1) */
c = mul32x32_64(modm_m[0], q3[0]);
r2[0] = (bignum256modm_element_t)(c & 0x3fffffff);
c >>= 30;
c += mul32x32_64(modm_m[0], q3[1]) + mul32x32_64(modm_m[1], q3[0]);
r2[1] = (bignum256modm_element_t)(c & 0x3fffffff);
c >>= 30;
c += mul32x32_64(modm_m[0], q3[2]) + mul32x32_64(modm_m[1], q3[1]) +
mul32x32_64(modm_m[2], q3[0]);
r2[2] = (bignum256modm_element_t)(c & 0x3fffffff);
c >>= 30;
c += mul32x32_64(modm_m[0], q3[3]) + mul32x32_64(modm_m[1], q3[2]) +
mul32x32_64(modm_m[2], q3[1]) + mul32x32_64(modm_m[3], q3[0]);
r2[3] = (bignum256modm_element_t)(c & 0x3fffffff);
c >>= 30;
c += mul32x32_64(modm_m[0], q3[4]) + mul32x32_64(modm_m[1], q3[3]) +
mul32x32_64(modm_m[2], q3[2]) + mul32x32_64(modm_m[3], q3[1]) +
mul32x32_64(modm_m[4], q3[0]);
r2[4] = (bignum256modm_element_t)(c & 0x3fffffff);
c >>= 30;
c += mul32x32_64(modm_m[0], q3[5]) + mul32x32_64(modm_m[1], q3[4]) +
mul32x32_64(modm_m[2], q3[3]) + mul32x32_64(modm_m[3], q3[2]) +
mul32x32_64(modm_m[4], q3[1]) + mul32x32_64(modm_m[5], q3[0]);
r2[5] = (bignum256modm_element_t)(c & 0x3fffffff);
c >>= 30;
c += mul32x32_64(modm_m[0], q3[6]) + mul32x32_64(modm_m[1], q3[5]) +
mul32x32_64(modm_m[2], q3[4]) + mul32x32_64(modm_m[3], q3[3]) +
mul32x32_64(modm_m[4], q3[2]) + mul32x32_64(modm_m[5], q3[1]) +
mul32x32_64(modm_m[6], q3[0]);
r2[6] = (bignum256modm_element_t)(c & 0x3fffffff);
c >>= 30;
c += mul32x32_64(modm_m[0], q3[7]) + mul32x32_64(modm_m[1], q3[6]) +
mul32x32_64(modm_m[2], q3[5]) + mul32x32_64(modm_m[3], q3[4]) +
mul32x32_64(modm_m[4], q3[3]) + mul32x32_64(modm_m[5], q3[2]) +
mul32x32_64(modm_m[6], q3[1]) + mul32x32_64(modm_m[7], q3[0]);
r2[7] = (bignum256modm_element_t)(c & 0x3fffffff);
c >>= 30;
c += mul32x32_64(modm_m[0], q3[8]) + mul32x32_64(modm_m[1], q3[7]) +
mul32x32_64(modm_m[2], q3[6]) + mul32x32_64(modm_m[3], q3[5]) +
mul32x32_64(modm_m[4], q3[4]) + mul32x32_64(modm_m[5], q3[3]) +
mul32x32_64(modm_m[6], q3[2]) + mul32x32_64(modm_m[7], q3[1]) +
mul32x32_64(modm_m[8], q3[0]);
r2[8] = (bignum256modm_element_t)(c & 0xffffff); /* top limb: 24 bits of 2^264 cut */
/* r = r1 - r2
if (r < 0) r += (1 << 264) */
pb = 0;
pb += r2[0];
b = lt_modm(r1[0], pb);
r[0] = (r1[0] - pb + (b << 30));
pb = b;
pb += r2[1];
b = lt_modm(r1[1], pb);
r[1] = (r1[1] - pb + (b << 30));
pb = b;
pb += r2[2];
b = lt_modm(r1[2], pb);
r[2] = (r1[2] - pb + (b << 30));
pb = b;
pb += r2[3];
b = lt_modm(r1[3], pb);
r[3] = (r1[3] - pb + (b << 30));
pb = b;
pb += r2[4];
b = lt_modm(r1[4], pb);
r[4] = (r1[4] - pb + (b << 30));
pb = b;
pb += r2[5];
b = lt_modm(r1[5], pb);
r[5] = (r1[5] - pb + (b << 30));
pb = b;
pb += r2[6];
b = lt_modm(r1[6], pb);
r[6] = (r1[6] - pb + (b << 30));
pb = b;
pb += r2[7];
b = lt_modm(r1[7], pb);
r[7] = (r1[7] - pb + (b << 30));
pb = b;
pb += r2[8];
b = lt_modm(r1[8], pb);
r[8] = (r1[8] - pb + (b << 24));
/* HAC 14.42 step 4: the estimate can be off by at most 2*m, so two
   constant-time conditional subtractions always suffice */
reduce256_modm(r);
reduce256_modm(r);
}
  248. /* addition modulo m */
  249. void add256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
  250. bignum256modm_element_t c = 0;
  251. c = x[0] + y[0];
  252. r[0] = c & 0x3fffffff;
  253. c >>= 30;
  254. c += x[1] + y[1];
  255. r[1] = c & 0x3fffffff;
  256. c >>= 30;
  257. c += x[2] + y[2];
  258. r[2] = c & 0x3fffffff;
  259. c >>= 30;
  260. c += x[3] + y[3];
  261. r[3] = c & 0x3fffffff;
  262. c >>= 30;
  263. c += x[4] + y[4];
  264. r[4] = c & 0x3fffffff;
  265. c >>= 30;
  266. c += x[5] + y[5];
  267. r[5] = c & 0x3fffffff;
  268. c >>= 30;
  269. c += x[6] + y[6];
  270. r[6] = c & 0x3fffffff;
  271. c >>= 30;
  272. c += x[7] + y[7];
  273. r[7] = c & 0x3fffffff;
  274. c >>= 30;
  275. c += x[8] + y[8];
  276. r[8] = c;
  277. reduce256_modm(r);
  278. }
  279. /* -x modulo m */
/* r = (m - x), i.e. -x modulo m, via constant-time limb-wise subtraction.
   NOTE(review): appears to assume x is fully reduced (x <= m) so that the
   borrow chain stays consistent — confirm callers. */
void neg256_modm(bignum256modm r, const bignum256modm x) {
bignum256modm_element_t b = 0, pb = 0;
/* r = m - x */
pb = 0;
pb += x[0];
b = lt_modm(modm_m[0], pb);
r[0] = (modm_m[0] - pb + (b << 30));
pb = b;
pb += x[1];
b = lt_modm(modm_m[1], pb);
r[1] = (modm_m[1] - pb + (b << 30));
pb = b;
pb += x[2];
b = lt_modm(modm_m[2], pb);
r[2] = (modm_m[2] - pb + (b << 30));
pb = b;
pb += x[3];
b = lt_modm(modm_m[3], pb);
r[3] = (modm_m[3] - pb + (b << 30));
pb = b;
pb += x[4];
b = lt_modm(modm_m[4], pb);
r[4] = (modm_m[4] - pb + (b << 30));
pb = b;
pb += x[5];
b = lt_modm(modm_m[5], pb);
r[5] = (modm_m[5] - pb + (b << 30));
pb = b;
pb += x[6];
b = lt_modm(modm_m[6], pb);
r[6] = (modm_m[6] - pb + (b << 30));
pb = b;
pb += x[7];
b = lt_modm(modm_m[7], pb);
r[7] = (modm_m[7] - pb + (b << 30));
pb = b;
pb += x[8];
b = lt_modm(modm_m[8], pb);
r[8] = (modm_m[8] - pb + (b << 16)); /* top limb holds only 16 bits */
// if x==0, reduction is required
reduce256_modm(r);
}
/* consts for subtraction, > p */
/* Emilia Kasper trick, https://www.imperialviolet.org/2010/12/04/ecc.html */
/* Each limb is the matching modm_m limb padded with roughly 2^30 (and the
   top limb decremented), so that twoP[i] + x[i] - y[i] can never underflow
   during sub256_modm; the padding telescopes across limbs, leaving a
   multiple of m that reduce256_modm folds away. (Presumably equal to 2*m
   in this redundant form, as the name suggests — verify before editing.) */
static const uint32_t twoP[] = {
0x5cf5d3ed,
0x60498c68,
0x6f79cd64,
0x77be77a7,
0x40000013,
0x3fffffff,
0x3fffffff,
0x3fffffff,
0xfff};
  334. /* subtraction x-y % m */
  335. void sub256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
  336. bignum256modm_element_t c = 0;
  337. c = twoP[0] + x[0] - y[0];
  338. r[0] = c & 0x3fffffff;
  339. c >>= 30;
  340. c += twoP[1] + x[1] - y[1];
  341. r[1] = c & 0x3fffffff;
  342. c >>= 30;
  343. c += twoP[2] + x[2] - y[2];
  344. r[2] = c & 0x3fffffff;
  345. c >>= 30;
  346. c += twoP[3] + x[3] - y[3];
  347. r[3] = c & 0x3fffffff;
  348. c >>= 30;
  349. c += twoP[4] + x[4] - y[4];
  350. r[4] = c & 0x3fffffff;
  351. c >>= 30;
  352. c += twoP[5] + x[5] - y[5];
  353. r[5] = c & 0x3fffffff;
  354. c >>= 30;
  355. c += twoP[6] + x[6] - y[6];
  356. r[6] = c & 0x3fffffff;
  357. c >>= 30;
  358. c += twoP[7] + x[7] - y[7];
  359. r[7] = c & 0x3fffffff;
  360. c >>= 30;
  361. c += twoP[8] + x[8] - y[8];
  362. r[8] = c;
  363. reduce256_modm(r);
  364. }
  365. /* multiplication modulo m */
/* r = (x * y) mod m.
   Schoolbook column-by-column multiply over 9x30-bit limbs; the low 264
   bits of the product go to r1 and the bits from 248 upward are re-packed
   into q1, which is exactly the (q1, r1) pre-split expected by
   barrett_reduce256_modm. Fixed-shape carry chain, constant time. */
void mul256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
bignum256modm r1 = {0}, q1 = {0};
uint64_t c = 0;
bignum256modm_element_t f = 0;
/* r1 = (x mod 256^(32+1)) = x mod (2^8)(31+1) = x & ((1 << 264) - 1)
q1 = x >> 248 = 264 bits = 9 30 bit elements */
c = mul32x32_64(x[0], y[0]);
f = (bignum256modm_element_t)c;
r1[0] = (f & 0x3fffffff);
c >>= 30;
c += mul32x32_64(x[0], y[1]) + mul32x32_64(x[1], y[0]);
f = (bignum256modm_element_t)c;
r1[1] = (f & 0x3fffffff);
c >>= 30;
c += mul32x32_64(x[0], y[2]) + mul32x32_64(x[1], y[1]) + mul32x32_64(x[2], y[0]);
f = (bignum256modm_element_t)c;
r1[2] = (f & 0x3fffffff);
c >>= 30;
c += mul32x32_64(x[0], y[3]) + mul32x32_64(x[1], y[2]) + mul32x32_64(x[2], y[1]) +
mul32x32_64(x[3], y[0]);
f = (bignum256modm_element_t)c;
r1[3] = (f & 0x3fffffff);
c >>= 30;
c += mul32x32_64(x[0], y[4]) + mul32x32_64(x[1], y[3]) + mul32x32_64(x[2], y[2]) +
mul32x32_64(x[3], y[1]) + mul32x32_64(x[4], y[0]);
f = (bignum256modm_element_t)c;
r1[4] = (f & 0x3fffffff);
c >>= 30;
c += mul32x32_64(x[0], y[5]) + mul32x32_64(x[1], y[4]) + mul32x32_64(x[2], y[3]) +
mul32x32_64(x[3], y[2]) + mul32x32_64(x[4], y[1]) + mul32x32_64(x[5], y[0]);
f = (bignum256modm_element_t)c;
r1[5] = (f & 0x3fffffff);
c >>= 30;
c += mul32x32_64(x[0], y[6]) + mul32x32_64(x[1], y[5]) + mul32x32_64(x[2], y[4]) +
mul32x32_64(x[3], y[3]) + mul32x32_64(x[4], y[2]) + mul32x32_64(x[5], y[1]) +
mul32x32_64(x[6], y[0]);
f = (bignum256modm_element_t)c;
r1[6] = (f & 0x3fffffff);
c >>= 30;
c += mul32x32_64(x[0], y[7]) + mul32x32_64(x[1], y[6]) + mul32x32_64(x[2], y[5]) +
mul32x32_64(x[3], y[4]) + mul32x32_64(x[4], y[3]) + mul32x32_64(x[5], y[2]) +
mul32x32_64(x[6], y[1]) + mul32x32_64(x[7], y[0]);
f = (bignum256modm_element_t)c;
r1[7] = (f & 0x3fffffff);
c >>= 30;
c += mul32x32_64(x[0], y[8]) + mul32x32_64(x[1], y[7]) + mul32x32_64(x[2], y[6]) +
mul32x32_64(x[3], y[5]) + mul32x32_64(x[4], y[4]) + mul32x32_64(x[5], y[3]) +
mul32x32_64(x[6], y[2]) + mul32x32_64(x[7], y[1]) + mul32x32_64(x[8], y[0]);
f = (bignum256modm_element_t)c;
/* this column straddles bit 248: low 24 bits finish r1, the rest seeds q1 */
r1[8] = (f & 0x00ffffff);
q1[0] = (f >> 8) & 0x3fffff;
c >>= 30;
c += mul32x32_64(x[1], y[8]) + mul32x32_64(x[2], y[7]) + mul32x32_64(x[3], y[6]) +
mul32x32_64(x[4], y[5]) + mul32x32_64(x[5], y[4]) + mul32x32_64(x[6], y[3]) +
mul32x32_64(x[7], y[2]) + mul32x32_64(x[8], y[1]);
f = (bignum256modm_element_t)c;
q1[0] = (q1[0] | (f << 22)) & 0x3fffffff;
q1[1] = (f >> 8) & 0x3fffff;
c >>= 30;
c += mul32x32_64(x[2], y[8]) + mul32x32_64(x[3], y[7]) + mul32x32_64(x[4], y[6]) +
mul32x32_64(x[5], y[5]) + mul32x32_64(x[6], y[4]) + mul32x32_64(x[7], y[3]) +
mul32x32_64(x[8], y[2]);
f = (bignum256modm_element_t)c;
q1[1] = (q1[1] | (f << 22)) & 0x3fffffff;
q1[2] = (f >> 8) & 0x3fffff;
c >>= 30;
c += mul32x32_64(x[3], y[8]) + mul32x32_64(x[4], y[7]) + mul32x32_64(x[5], y[6]) +
mul32x32_64(x[6], y[5]) + mul32x32_64(x[7], y[4]) + mul32x32_64(x[8], y[3]);
f = (bignum256modm_element_t)c;
q1[2] = (q1[2] | (f << 22)) & 0x3fffffff;
q1[3] = (f >> 8) & 0x3fffff;
c >>= 30;
c += mul32x32_64(x[4], y[8]) + mul32x32_64(x[5], y[7]) + mul32x32_64(x[6], y[6]) +
mul32x32_64(x[7], y[5]) + mul32x32_64(x[8], y[4]);
f = (bignum256modm_element_t)c;
q1[3] = (q1[3] | (f << 22)) & 0x3fffffff;
q1[4] = (f >> 8) & 0x3fffff;
c >>= 30;
c += mul32x32_64(x[5], y[8]) + mul32x32_64(x[6], y[7]) + mul32x32_64(x[7], y[6]) +
mul32x32_64(x[8], y[5]);
f = (bignum256modm_element_t)c;
q1[4] = (q1[4] | (f << 22)) & 0x3fffffff;
q1[5] = (f >> 8) & 0x3fffff;
c >>= 30;
c += mul32x32_64(x[6], y[8]) + mul32x32_64(x[7], y[7]) + mul32x32_64(x[8], y[6]);
f = (bignum256modm_element_t)c;
q1[5] = (q1[5] | (f << 22)) & 0x3fffffff;
q1[6] = (f >> 8) & 0x3fffff;
c >>= 30;
c += mul32x32_64(x[7], y[8]) + mul32x32_64(x[8], y[7]);
f = (bignum256modm_element_t)c;
q1[6] = (q1[6] | (f << 22)) & 0x3fffffff;
q1[7] = (f >> 8) & 0x3fffff;
c >>= 30;
c += mul32x32_64(x[8], y[8]);
f = (bignum256modm_element_t)c;
q1[7] = (q1[7] | (f << 22)) & 0x3fffffff;
q1[8] = (f >> 8) & 0x3fffff;
barrett_reduce256_modm(r, q1, r1);
}
  466. void expand256_modm(bignum256modm out, const unsigned char* in, size_t len) {
  467. unsigned char work[64] = {0};
  468. bignum256modm_element_t x[16] = {0};
  469. bignum256modm q1 = {0};
  470. memcpy(work, in, len);
  471. x[0] = U8TO32_LE(work + 0);
  472. x[1] = U8TO32_LE(work + 4);
  473. x[2] = U8TO32_LE(work + 8);
  474. x[3] = U8TO32_LE(work + 12);
  475. x[4] = U8TO32_LE(work + 16);
  476. x[5] = U8TO32_LE(work + 20);
  477. x[6] = U8TO32_LE(work + 24);
  478. x[7] = U8TO32_LE(work + 28);
  479. x[8] = U8TO32_LE(work + 32);
  480. x[9] = U8TO32_LE(work + 36);
  481. x[10] = U8TO32_LE(work + 40);
  482. x[11] = U8TO32_LE(work + 44);
  483. x[12] = U8TO32_LE(work + 48);
  484. x[13] = U8TO32_LE(work + 52);
  485. x[14] = U8TO32_LE(work + 56);
  486. x[15] = U8TO32_LE(work + 60);
  487. /* r1 = (x mod 256^(32+1)) = x mod (2^8)(31+1) = x & ((1 << 264) - 1) */
  488. out[0] = (x[0]) & 0x3fffffff;
  489. out[1] = ((x[0] >> 30) | (x[1] << 2)) & 0x3fffffff;
  490. out[2] = ((x[1] >> 28) | (x[2] << 4)) & 0x3fffffff;
  491. out[3] = ((x[2] >> 26) | (x[3] << 6)) & 0x3fffffff;
  492. out[4] = ((x[3] >> 24) | (x[4] << 8)) & 0x3fffffff;
  493. out[5] = ((x[4] >> 22) | (x[5] << 10)) & 0x3fffffff;
  494. out[6] = ((x[5] >> 20) | (x[6] << 12)) & 0x3fffffff;
  495. out[7] = ((x[6] >> 18) | (x[7] << 14)) & 0x3fffffff;
  496. out[8] = ((x[7] >> 16) | (x[8] << 16)) & 0x00ffffff;
  497. /* 8*31 = 248 bits, no need to reduce */
  498. if(len < 32) return;
  499. /* q1 = x >> 248 = 264 bits = 9 30 bit elements */
  500. q1[0] = ((x[7] >> 24) | (x[8] << 8)) & 0x3fffffff;
  501. q1[1] = ((x[8] >> 22) | (x[9] << 10)) & 0x3fffffff;
  502. q1[2] = ((x[9] >> 20) | (x[10] << 12)) & 0x3fffffff;
  503. q1[3] = ((x[10] >> 18) | (x[11] << 14)) & 0x3fffffff;
  504. q1[4] = ((x[11] >> 16) | (x[12] << 16)) & 0x3fffffff;
  505. q1[5] = ((x[12] >> 14) | (x[13] << 18)) & 0x3fffffff;
  506. q1[6] = ((x[13] >> 12) | (x[14] << 20)) & 0x3fffffff;
  507. q1[7] = ((x[14] >> 10) | (x[15] << 22)) & 0x3fffffff;
  508. q1[8] = ((x[15] >> 8));
  509. barrett_reduce256_modm(out, q1, out);
  510. }
  511. void expand_raw256_modm(bignum256modm out, const unsigned char in[32]) {
  512. bignum256modm_element_t x[8] = {0};
  513. x[0] = U8TO32_LE(in + 0);
  514. x[1] = U8TO32_LE(in + 4);
  515. x[2] = U8TO32_LE(in + 8);
  516. x[3] = U8TO32_LE(in + 12);
  517. x[4] = U8TO32_LE(in + 16);
  518. x[5] = U8TO32_LE(in + 20);
  519. x[6] = U8TO32_LE(in + 24);
  520. x[7] = U8TO32_LE(in + 28);
  521. out[0] = (x[0]) & 0x3fffffff;
  522. out[1] = ((x[0] >> 30) | (x[1] << 2)) & 0x3fffffff;
  523. out[2] = ((x[1] >> 28) | (x[2] << 4)) & 0x3fffffff;
  524. out[3] = ((x[2] >> 26) | (x[3] << 6)) & 0x3fffffff;
  525. out[4] = ((x[3] >> 24) | (x[4] << 8)) & 0x3fffffff;
  526. out[5] = ((x[4] >> 22) | (x[5] << 10)) & 0x3fffffff;
  527. out[6] = ((x[5] >> 20) | (x[6] << 12)) & 0x3fffffff;
  528. out[7] = ((x[6] >> 18) | (x[7] << 14)) & 0x3fffffff;
  529. out[8] = ((x[7] >> 16)) & 0x0000ffff;
  530. }
/* Returns 1 when in < m (fully reduced / canonical), 0 otherwise.
   Scans limbs least-significant first while shifting, so the most
   significant limb's comparison bit lands in the highest bit position of
   res1/res2; comparing the two bit vectors then decides the big-integer
   comparison by the most significant differing limb. */
int is_reduced256_modm(const bignum256modm in) {
int i = 0;
uint32_t res1 = 0;
uint32_t res2 = 0;
for(i = 8; i >= 0; i--) {
res1 = (res1 << 1) | (in[i] < modm_m[i]);
res2 = (res2 << 1) | (in[i] > modm_m[i]);
}
return res1 > res2;
}
/* Serialize the 9x30-bit limb representation into 32 little-endian bytes,
   re-packing adjacent limbs into 32-bit words. */
void contract256_modm(unsigned char out[32], const bignum256modm in) {
U32TO8_LE(out + 0, (in[0]) | (in[1] << 30));
U32TO8_LE(out + 4, (in[1] >> 2) | (in[2] << 28));
U32TO8_LE(out + 8, (in[2] >> 4) | (in[3] << 26));
U32TO8_LE(out + 12, (in[3] >> 6) | (in[4] << 24));
U32TO8_LE(out + 16, (in[4] >> 8) | (in[5] << 22));
U32TO8_LE(out + 20, (in[5] >> 10) | (in[6] << 20));
U32TO8_LE(out + 24, (in[6] >> 12) | (in[7] << 18));
U32TO8_LE(out + 28, (in[7] >> 14) | (in[8] << 16));
}
/* Recode the scalar into 64 signed radix-16 digits, each in [-8, 7],
   little-endian (for fixed-window scalar multiplication). */
void contract256_window4_modm(signed char r[64], const bignum256modm in) {
char carry = 0;
signed char* quads = r;
bignum256modm_element_t i = 0, j = 0, v = 0;
/* unpack each pair of 30-bit limbs (60 bits = 15 nibbles) */
for(i = 0; i < 8; i += 2) {
v = in[i];
for(j = 0; j < 7; j++) {
*quads++ = (v & 15);
v >>= 4;
}
/* 2 bits of in[i] remain; splice in the next limb shifted up by 2 */
v |= (in[i + 1] << 2);
for(j = 0; j < 8; j++) {
*quads++ = (v & 15);
v >>= 4;
}
}
/* 16-bit top limb -> final 4 nibbles */
v = in[8];
*quads++ = (v & 15);
v >>= 4;
*quads++ = (v & 15);
v >>= 4;
*quads++ = (v & 15);
v >>= 4;
*quads++ = (v & 15);
v >>= 4;
/* making it signed: any digit >= 8 becomes digit-16 with a carry of 1
   propagated into the next digit */
carry = 0;
for(i = 0; i < 63; i++) {
r[i] += carry;
r[i + 1] += (r[i] >> 4);
r[i] &= 15;
carry = (r[i] >> 3);
r[i] -= (carry << 4);
}
r[63] += carry;
}
/* Recode the scalar s into a sliding-window form of 256 signed digits in
   [-m, m] where m = 2^(windowsize-1) - 1, with at least windowsize-1
   zeros between nonzero digits (for sliding-window scalar multiplication). */
void contract256_slidingwindow_modm(signed char r[256], const bignum256modm s, int windowsize) {
int i = 0, j = 0, k = 0, b = 0;
int m = (1 << (windowsize - 1)) - 1, soplen = 256;
signed char* bits = r;
bignum256modm_element_t v = 0;
/* first put the binary expansion into r */
for(i = 0; i < 8; i++) {
v = s[i];
for(j = 0; j < 30; j++, v >>= 1) *bits++ = (v & 1);
}
v = s[8];
for(j = 0; j < 16; j++, v >>= 1) *bits++ = (v & 1);
/* Making it sliding window: greedily merge each nonzero bit with up to 6
   following bits while the combined digit stays within [-m, m] */
for(j = 0; j < soplen; j++) {
if(!r[j]) continue;
for(b = 1; (b < (soplen - j)) && (b <= 6); b++) {
if((r[j] + (r[j + b] << b)) <= m) {
r[j] += r[j + b] << b;
r[j + b] = 0;
} else if((r[j] - (r[j + b] << b)) >= -m) {
/* subtract instead, then add 1 back at the next zero position
   (binary increment of the remaining bits) */
r[j] -= r[j + b] << b;
for(k = j + b; k < soplen; k++) {
if(!r[k]) {
r[k] = 1;
break;
}
r[k] = 0;
}
} else if(r[j + b]) {
break;
}
}
}
}
  621. void set256_modm(bignum256modm r, uint64_t v) {
  622. r[0] = (bignum256modm_element_t)(v & 0x3fffffff);
  623. v >>= 30;
  624. r[1] = (bignum256modm_element_t)(v & 0x3fffffff);
  625. v >>= 30;
  626. r[2] = (bignum256modm_element_t)(v & 0x3fffffff);
  627. r[3] = 0;
  628. r[4] = 0;
  629. r[5] = 0;
  630. r[6] = 0;
  631. r[7] = 0;
  632. r[8] = 0;
  633. }
/* Extract the low 64 bits of r into *v.
   Returns 1 when the full value fits in 64 bits (nothing was lost),
   0 otherwise. The check accumulates "any high bits set" flags without
   branching on the limb values. */
int get256_modm(uint64_t* v, const bignum256modm r) {
*v = 0;
int con1 = 0;
/* NONZ(x) == 1 iff x != 0, branch-free, for 32-bit-ranged x */
#define NONZ(x) ((((((int64_t)(x)) - 1) >> 32) + 1) & 1)
bignum256modm_element_t c = 0;
c = r[0];
*v += (uint64_t)c & 0x3fffffff;
c >>= 30; // 30
c += r[1];
*v += ((uint64_t)c & 0x3fffffff) << 30;
c >>= 30; // 60
c += r[2];
*v += ((uint64_t)c & 0xf) << 60; /* only 4 more bits fit into *v */
con1 |= NONZ(c >> 4); /* any bits of r[2] above bit 63 overflow */
c >>= 30; // 64 bits
c += r[3];
con1 |= NONZ(c);
c >>= 30;
c += r[4];
con1 |= NONZ(c);
c >>= 30;
c += r[5];
con1 |= NONZ(c);
c >>= 30;
c += r[6];
con1 |= NONZ(c);
c >>= 30;
c += r[7];
con1 |= NONZ(c);
c >>= 30;
c += r[8];
con1 |= NONZ(c);
c >>= 30;
con1 |= NONZ(c);
#undef NONZ
return con1 ^ 1; /* 1 = fits in 64 bits, 0 = overflowed */
}
/* Constant-time equality: returns 1 iff x == y (limb-wise). */
int eq256_modm(const bignum256modm x, const bignum256modm y) {
size_t differentbits = 0;
int len = bignum256modm_limb_size;
while(len--) {
differentbits |= (*x++ ^ *y++);
}
/* (differentbits - 1) wraps to all-ones only when no bit differed */
return (int)(1 & ((differentbits - 1) >> bignum256modm_bits_per_limb));
}
/* Constant-time three-way compare: returns a positive value if x > y,
   a negative value if x < y, 0 if equal. Scans 16-bit halves of each limb
   from most significant downward; only the first differing half can set
   its flag, since each flag is masked by the other's accumulated state. */
int cmp256_modm(const bignum256modm x, const bignum256modm y) {
int len = 2 * bignum256modm_limb_size;
uint32_t a_gt = 0;
uint32_t b_gt = 0;
// 16B chunks
while(len--) {
const uint32_t ln = (const uint32_t)len;
const uint32_t a = (x[ln >> 1] >> 16 * (ln & 1)) & 0xffff;
const uint32_t b = (y[ln >> 1] >> 16 * (ln & 1)) & 0xffff;
const uint32_t limb_a_gt = ((b - a) >> 16) & 1; /* borrow: 1 iff a > b */
const uint32_t limb_b_gt = ((a - b) >> 16) & 1; /* borrow: 1 iff b > a */
a_gt |= limb_a_gt & ~b_gt;
b_gt |= limb_b_gt & ~a_gt;
}
/* unsigned wrap-around yields -1 (on two's-complement int) when only
   b_gt is set */
return a_gt - b_gt;
}
/* Constant-time zero test: returns 1 iff every limb of x is zero. */
int iszero256_modm(const bignum256modm x) {
size_t differentbits = 0;
int len = bignum256modm_limb_size;
while(len--) {
differentbits |= (*x++);
}
/* (differentbits - 1) wraps to all-ones only when all limbs were zero */
return (int)(1 & ((differentbits - 1) >> bignum256modm_bits_per_limb));
}
  703. void copy256_modm(bignum256modm r, const bignum256modm x) {
  704. r[0] = x[0];
  705. r[1] = x[1];
  706. r[2] = x[2];
  707. r[3] = x[3];
  708. r[4] = x[4];
  709. r[5] = x[5];
  710. r[6] = x[6];
  711. r[7] = x[7];
  712. r[8] = x[8];
  713. }
/* Returns 1 when x is a valid scalar: nonzero AND fully reduced (x < m). */
int check256_modm(const bignum256modm x) {
int ok = 1;
bignum256modm t = {0}, z = {0};
ok &= iszero256_modm(x) ^ 1;
/* with q1 = z = 0, Barrett reduction acts as t = x mod m; a reduced x
   must round-trip unchanged */
barrett_reduce256_modm(t, z, x);
ok &= eq256_modm(t, x);
return ok;
}
  722. void mulsub256_modm(
  723. bignum256modm r,
  724. const bignum256modm a,
  725. const bignum256modm b,
  726. const bignum256modm c) {
  727. //(cc - aa * bb) % l
  728. bignum256modm t = {0};
  729. mul256_modm(t, a, b);
  730. sub256_modm(r, c, t);
  731. }
  732. void muladd256_modm(
  733. bignum256modm r,
  734. const bignum256modm a,
  735. const bignum256modm b,
  736. const bignum256modm c) {
  737. //(cc + aa * bb) % l
  738. bignum256modm t = {0};
  739. mul256_modm(t, a, b);
  740. add256_modm(r, c, t);
  741. }