modm-donna-32bit.c 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517
  1. /*
  2. Public domain by Andrew M. <liquidsun@gmail.com>
  3. */
  4. #include "ed25519-donna.h"
  5. /*
  6. Arithmetic modulo the group order n = 2^252 + 27742317777372353535851937790883648493 = 7237005577332262213973186563042994240857116359379907606001950938285454250989
  7. k = 32
  8. b = 1 << 8 = 256
  9. m = 2^252 + 27742317777372353535851937790883648493 = 0x1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed
  10. mu = floor( b^(k*2) / m ) = 0xfffffffffffffffffffffffffffffffeb2106215d086329a7ed9ce5a30a2c131b
  11. */
  12. static const bignum256modm modm_m = {
  13. 0x1cf5d3ed, 0x20498c69, 0x2f79cd65, 0x37be77a8,
  14. 0x00000014, 0x00000000, 0x00000000, 0x00000000,
  15. 0x00001000
  16. };
  17. static const bignum256modm modm_mu = {
  18. 0x0a2c131b, 0x3673968c, 0x06329a7e, 0x01885742,
  19. 0x3fffeb21, 0x3fffffff, 0x3fffffff, 0x3fffffff,
  20. 0x000fffff
  21. };
  22. static bignum256modm_element_t
  23. lt_modm(bignum256modm_element_t a, bignum256modm_element_t b) {
  24. return (a - b) >> 31;
  25. }
  26. /* see HAC, Alg. 14.42 Step 4 */
  27. void reduce256_modm(bignum256modm r) {
  28. bignum256modm t = {0};
  29. bignum256modm_element_t b = 0, pb = 0, mask = 0;
  30. /* t = r - m */
  31. pb = 0;
  32. pb += modm_m[0]; b = lt_modm(r[0], pb); t[0] = (r[0] - pb + (b << 30)); pb = b;
  33. pb += modm_m[1]; b = lt_modm(r[1], pb); t[1] = (r[1] - pb + (b << 30)); pb = b;
  34. pb += modm_m[2]; b = lt_modm(r[2], pb); t[2] = (r[2] - pb + (b << 30)); pb = b;
  35. pb += modm_m[3]; b = lt_modm(r[3], pb); t[3] = (r[3] - pb + (b << 30)); pb = b;
  36. pb += modm_m[4]; b = lt_modm(r[4], pb); t[4] = (r[4] - pb + (b << 30)); pb = b;
  37. pb += modm_m[5]; b = lt_modm(r[5], pb); t[5] = (r[5] - pb + (b << 30)); pb = b;
  38. pb += modm_m[6]; b = lt_modm(r[6], pb); t[6] = (r[6] - pb + (b << 30)); pb = b;
  39. pb += modm_m[7]; b = lt_modm(r[7], pb); t[7] = (r[7] - pb + (b << 30)); pb = b;
  40. pb += modm_m[8]; b = lt_modm(r[8], pb); t[8] = (r[8] - pb + (b << 16));
  41. /* keep r if r was smaller than m */
  42. mask = b - 1;
  43. r[0] ^= mask & (r[0] ^ t[0]);
  44. r[1] ^= mask & (r[1] ^ t[1]);
  45. r[2] ^= mask & (r[2] ^ t[2]);
  46. r[3] ^= mask & (r[3] ^ t[3]);
  47. r[4] ^= mask & (r[4] ^ t[4]);
  48. r[5] ^= mask & (r[5] ^ t[5]);
  49. r[6] ^= mask & (r[6] ^ t[6]);
  50. r[7] ^= mask & (r[7] ^ t[7]);
  51. r[8] ^= mask & (r[8] ^ t[8]);
  52. }
  53. /*
  54. Barrett reduction, see HAC, Alg. 14.42
  55. Instead of passing in x, pre-process in to q1 and r1 for efficiency
  56. */
  57. void barrett_reduce256_modm(bignum256modm r, const bignum256modm q1, const bignum256modm r1) {
  58. bignum256modm q3 = {0}, r2 = {0};
  59. uint64_t c = 0;
  60. bignum256modm_element_t f = 0, b = 0, pb = 0;
  61. /* q1 = x >> 248 = 264 bits = 9 30 bit elements
  62. q2 = mu * q1
  63. q3 = (q2 / 256(32+1)) = q2 / (2^8)^(32+1) = q2 >> 264 */
  64. c = mul32x32_64(modm_mu[0], q1[7]) + mul32x32_64(modm_mu[1], q1[6]) + mul32x32_64(modm_mu[2], q1[5]) + mul32x32_64(modm_mu[3], q1[4]) + mul32x32_64(modm_mu[4], q1[3]) + mul32x32_64(modm_mu[5], q1[2]) + mul32x32_64(modm_mu[6], q1[1]) + mul32x32_64(modm_mu[7], q1[0]);
  65. c >>= 30;
  66. c += mul32x32_64(modm_mu[0], q1[8]) + mul32x32_64(modm_mu[1], q1[7]) + mul32x32_64(modm_mu[2], q1[6]) + mul32x32_64(modm_mu[3], q1[5]) + mul32x32_64(modm_mu[4], q1[4]) + mul32x32_64(modm_mu[5], q1[3]) + mul32x32_64(modm_mu[6], q1[2]) + mul32x32_64(modm_mu[7], q1[1]) + mul32x32_64(modm_mu[8], q1[0]);
  67. f = (bignum256modm_element_t)c; q3[0] = (f >> 24) & 0x3f; c >>= 30;
  68. c += mul32x32_64(modm_mu[1], q1[8]) + mul32x32_64(modm_mu[2], q1[7]) + mul32x32_64(modm_mu[3], q1[6]) + mul32x32_64(modm_mu[4], q1[5]) + mul32x32_64(modm_mu[5], q1[4]) + mul32x32_64(modm_mu[6], q1[3]) + mul32x32_64(modm_mu[7], q1[2]) + mul32x32_64(modm_mu[8], q1[1]);
  69. f = (bignum256modm_element_t)c; q3[0] |= (f << 6) & 0x3fffffff; q3[1] = (f >> 24) & 0x3f; c >>= 30;
  70. c += mul32x32_64(modm_mu[2], q1[8]) + mul32x32_64(modm_mu[3], q1[7]) + mul32x32_64(modm_mu[4], q1[6]) + mul32x32_64(modm_mu[5], q1[5]) + mul32x32_64(modm_mu[6], q1[4]) + mul32x32_64(modm_mu[7], q1[3]) + mul32x32_64(modm_mu[8], q1[2]);
  71. f = (bignum256modm_element_t)c; q3[1] |= (f << 6) & 0x3fffffff; q3[2] = (f >> 24) & 0x3f; c >>= 30;
  72. c += mul32x32_64(modm_mu[3], q1[8]) + mul32x32_64(modm_mu[4], q1[7]) + mul32x32_64(modm_mu[5], q1[6]) + mul32x32_64(modm_mu[6], q1[5]) + mul32x32_64(modm_mu[7], q1[4]) + mul32x32_64(modm_mu[8], q1[3]);
  73. f = (bignum256modm_element_t)c; q3[2] |= (f << 6) & 0x3fffffff; q3[3] = (f >> 24) & 0x3f; c >>= 30;
  74. c += mul32x32_64(modm_mu[4], q1[8]) + mul32x32_64(modm_mu[5], q1[7]) + mul32x32_64(modm_mu[6], q1[6]) + mul32x32_64(modm_mu[7], q1[5]) + mul32x32_64(modm_mu[8], q1[4]);
  75. f = (bignum256modm_element_t)c; q3[3] |= (f << 6) & 0x3fffffff; q3[4] = (f >> 24) & 0x3f; c >>= 30;
  76. c += mul32x32_64(modm_mu[5], q1[8]) + mul32x32_64(modm_mu[6], q1[7]) + mul32x32_64(modm_mu[7], q1[6]) + mul32x32_64(modm_mu[8], q1[5]);
  77. f = (bignum256modm_element_t)c; q3[4] |= (f << 6) & 0x3fffffff; q3[5] = (f >> 24) & 0x3f; c >>= 30;
  78. c += mul32x32_64(modm_mu[6], q1[8]) + mul32x32_64(modm_mu[7], q1[7]) + mul32x32_64(modm_mu[8], q1[6]);
  79. f = (bignum256modm_element_t)c; q3[5] |= (f << 6) & 0x3fffffff; q3[6] = (f >> 24) & 0x3f; c >>= 30;
  80. c += mul32x32_64(modm_mu[7], q1[8]) + mul32x32_64(modm_mu[8], q1[7]);
  81. f = (bignum256modm_element_t)c; q3[6] |= (f << 6) & 0x3fffffff; q3[7] = (f >> 24) & 0x3f; c >>= 30;
  82. c += mul32x32_64(modm_mu[8], q1[8]);
  83. f = (bignum256modm_element_t)c; q3[7] |= (f << 6) & 0x3fffffff; q3[8] = (bignum256modm_element_t)(c >> 24);
  84. /* r1 = (x mod 256^(32+1)) = x mod (2^8)(32+1) = x & ((1 << 264) - 1)
  85. r2 = (q3 * m) mod (256^(32+1)) = (q3 * m) & ((1 << 264) - 1) */
  86. c = mul32x32_64(modm_m[0], q3[0]);
  87. r2[0] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
  88. c += mul32x32_64(modm_m[0], q3[1]) + mul32x32_64(modm_m[1], q3[0]);
  89. r2[1] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
  90. c += mul32x32_64(modm_m[0], q3[2]) + mul32x32_64(modm_m[1], q3[1]) + mul32x32_64(modm_m[2], q3[0]);
  91. r2[2] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
  92. c += mul32x32_64(modm_m[0], q3[3]) + mul32x32_64(modm_m[1], q3[2]) + mul32x32_64(modm_m[2], q3[1]) + mul32x32_64(modm_m[3], q3[0]);
  93. r2[3] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
  94. c += mul32x32_64(modm_m[0], q3[4]) + mul32x32_64(modm_m[1], q3[3]) + mul32x32_64(modm_m[2], q3[2]) + mul32x32_64(modm_m[3], q3[1]) + mul32x32_64(modm_m[4], q3[0]);
  95. r2[4] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
  96. c += mul32x32_64(modm_m[0], q3[5]) + mul32x32_64(modm_m[1], q3[4]) + mul32x32_64(modm_m[2], q3[3]) + mul32x32_64(modm_m[3], q3[2]) + mul32x32_64(modm_m[4], q3[1]) + mul32x32_64(modm_m[5], q3[0]);
  97. r2[5] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
  98. c += mul32x32_64(modm_m[0], q3[6]) + mul32x32_64(modm_m[1], q3[5]) + mul32x32_64(modm_m[2], q3[4]) + mul32x32_64(modm_m[3], q3[3]) + mul32x32_64(modm_m[4], q3[2]) + mul32x32_64(modm_m[5], q3[1]) + mul32x32_64(modm_m[6], q3[0]);
  99. r2[6] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
  100. c += mul32x32_64(modm_m[0], q3[7]) + mul32x32_64(modm_m[1], q3[6]) + mul32x32_64(modm_m[2], q3[5]) + mul32x32_64(modm_m[3], q3[4]) + mul32x32_64(modm_m[4], q3[3]) + mul32x32_64(modm_m[5], q3[2]) + mul32x32_64(modm_m[6], q3[1]) + mul32x32_64(modm_m[7], q3[0]);
  101. r2[7] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
  102. c += mul32x32_64(modm_m[0], q3[8]) + mul32x32_64(modm_m[1], q3[7]) + mul32x32_64(modm_m[2], q3[6]) + mul32x32_64(modm_m[3], q3[5]) + mul32x32_64(modm_m[4], q3[4]) + mul32x32_64(modm_m[5], q3[3]) + mul32x32_64(modm_m[6], q3[2]) + mul32x32_64(modm_m[7], q3[1]) + mul32x32_64(modm_m[8], q3[0]);
  103. r2[8] = (bignum256modm_element_t)(c & 0xffffff);
  104. /* r = r1 - r2
  105. if (r < 0) r += (1 << 264) */
  106. pb = 0;
  107. pb += r2[0]; b = lt_modm(r1[0], pb); r[0] = (r1[0] - pb + (b << 30)); pb = b;
  108. pb += r2[1]; b = lt_modm(r1[1], pb); r[1] = (r1[1] - pb + (b << 30)); pb = b;
  109. pb += r2[2]; b = lt_modm(r1[2], pb); r[2] = (r1[2] - pb + (b << 30)); pb = b;
  110. pb += r2[3]; b = lt_modm(r1[3], pb); r[3] = (r1[3] - pb + (b << 30)); pb = b;
  111. pb += r2[4]; b = lt_modm(r1[4], pb); r[4] = (r1[4] - pb + (b << 30)); pb = b;
  112. pb += r2[5]; b = lt_modm(r1[5], pb); r[5] = (r1[5] - pb + (b << 30)); pb = b;
  113. pb += r2[6]; b = lt_modm(r1[6], pb); r[6] = (r1[6] - pb + (b << 30)); pb = b;
  114. pb += r2[7]; b = lt_modm(r1[7], pb); r[7] = (r1[7] - pb + (b << 30)); pb = b;
  115. pb += r2[8]; b = lt_modm(r1[8], pb); r[8] = (r1[8] - pb + (b << 24));
  116. reduce256_modm(r);
  117. reduce256_modm(r);
  118. }
  119. /* addition modulo m */
  120. void add256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
  121. bignum256modm_element_t c = 0;
  122. c = x[0] + y[0]; r[0] = c & 0x3fffffff; c >>= 30;
  123. c += x[1] + y[1]; r[1] = c & 0x3fffffff; c >>= 30;
  124. c += x[2] + y[2]; r[2] = c & 0x3fffffff; c >>= 30;
  125. c += x[3] + y[3]; r[3] = c & 0x3fffffff; c >>= 30;
  126. c += x[4] + y[4]; r[4] = c & 0x3fffffff; c >>= 30;
  127. c += x[5] + y[5]; r[5] = c & 0x3fffffff; c >>= 30;
  128. c += x[6] + y[6]; r[6] = c & 0x3fffffff; c >>= 30;
  129. c += x[7] + y[7]; r[7] = c & 0x3fffffff; c >>= 30;
  130. c += x[8] + y[8]; r[8] = c;
  131. reduce256_modm(r);
  132. }
  133. /* -x modulo m */
  134. void neg256_modm(bignum256modm r, const bignum256modm x) {
  135. bignum256modm_element_t b = 0, pb = 0;
  136. /* r = m - x */
  137. pb = 0;
  138. pb += x[0]; b = lt_modm(modm_m[0], pb); r[0] = (modm_m[0] - pb + (b << 30)); pb = b;
  139. pb += x[1]; b = lt_modm(modm_m[1], pb); r[1] = (modm_m[1] - pb + (b << 30)); pb = b;
  140. pb += x[2]; b = lt_modm(modm_m[2], pb); r[2] = (modm_m[2] - pb + (b << 30)); pb = b;
  141. pb += x[3]; b = lt_modm(modm_m[3], pb); r[3] = (modm_m[3] - pb + (b << 30)); pb = b;
  142. pb += x[4]; b = lt_modm(modm_m[4], pb); r[4] = (modm_m[4] - pb + (b << 30)); pb = b;
  143. pb += x[5]; b = lt_modm(modm_m[5], pb); r[5] = (modm_m[5] - pb + (b << 30)); pb = b;
  144. pb += x[6]; b = lt_modm(modm_m[6], pb); r[6] = (modm_m[6] - pb + (b << 30)); pb = b;
  145. pb += x[7]; b = lt_modm(modm_m[7], pb); r[7] = (modm_m[7] - pb + (b << 30)); pb = b;
  146. pb += x[8]; b = lt_modm(modm_m[8], pb); r[8] = (modm_m[8] - pb + (b << 16));
  147. // if x==0, reduction is required
  148. reduce256_modm(r);
  149. }
  /*
   * Offset added limb-wise in sub256_modm so that no limb difference goes
   * negative (Emilia Kasper trick,
   * https://www.imperialviolet.org/2010/12/04/ecc.html).
   *
   * Each limb is the matching limb of modm_m with 2^30 borrowed from the
   * limb above it: limb 0 = m[0] + 2^30, limbs 1..7 = m[i] + 2^30 - 1,
   * limb 8 = m[8] - 1. Despite the name, the limbs therefore sum to exactly
   * m, just in a non-canonical form.
   */
  static const uint32_t twoP[] = {
    0x5cf5d3ed,
    0x60498c68,
    0x6f79cd64,
    0x77be77a7,
    0x40000013,
    0x3fffffff,
    0x3fffffff,
    0x3fffffff,
    0xfff
  };
  154. /* subtraction x-y % m */
  155. void sub256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
  156. bignum256modm_element_t c = 0;
  157. c = twoP[0] + x[0] - y[0]; r[0] = c & 0x3fffffff; c >>= 30;
  158. c += twoP[1] + x[1] - y[1]; r[1] = c & 0x3fffffff; c >>= 30;
  159. c += twoP[2] + x[2] - y[2]; r[2] = c & 0x3fffffff; c >>= 30;
  160. c += twoP[3] + x[3] - y[3]; r[3] = c & 0x3fffffff; c >>= 30;
  161. c += twoP[4] + x[4] - y[4]; r[4] = c & 0x3fffffff; c >>= 30;
  162. c += twoP[5] + x[5] - y[5]; r[5] = c & 0x3fffffff; c >>= 30;
  163. c += twoP[6] + x[6] - y[6]; r[6] = c & 0x3fffffff; c >>= 30;
  164. c += twoP[7] + x[7] - y[7]; r[7] = c & 0x3fffffff; c >>= 30;
  165. c += twoP[8] + x[8] - y[8]; r[8] = c;
  166. reduce256_modm(r);
  167. }
  168. /* multiplication modulo m */
  169. void mul256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
  170. bignum256modm r1 = {0}, q1 = {0};
  171. uint64_t c = 0;
  172. bignum256modm_element_t f = 0;
  173. /* r1 = (x mod 256^(32+1)) = x mod (2^8)(31+1) = x & ((1 << 264) - 1)
  174. q1 = x >> 248 = 264 bits = 9 30 bit elements */
  175. c = mul32x32_64(x[0], y[0]);
  176. f = (bignum256modm_element_t)c; r1[0] = (f & 0x3fffffff); c >>= 30;
  177. c += mul32x32_64(x[0], y[1]) + mul32x32_64(x[1], y[0]);
  178. f = (bignum256modm_element_t)c; r1[1] = (f & 0x3fffffff); c >>= 30;
  179. c += mul32x32_64(x[0], y[2]) + mul32x32_64(x[1], y[1]) + mul32x32_64(x[2], y[0]);
  180. f = (bignum256modm_element_t)c; r1[2] = (f & 0x3fffffff); c >>= 30;
  181. c += mul32x32_64(x[0], y[3]) + mul32x32_64(x[1], y[2]) + mul32x32_64(x[2], y[1]) + mul32x32_64(x[3], y[0]);
  182. f = (bignum256modm_element_t)c; r1[3] = (f & 0x3fffffff); c >>= 30;
  183. c += mul32x32_64(x[0], y[4]) + mul32x32_64(x[1], y[3]) + mul32x32_64(x[2], y[2]) + mul32x32_64(x[3], y[1]) + mul32x32_64(x[4], y[0]);
  184. f = (bignum256modm_element_t)c; r1[4] = (f & 0x3fffffff); c >>= 30;
  185. c += mul32x32_64(x[0], y[5]) + mul32x32_64(x[1], y[4]) + mul32x32_64(x[2], y[3]) + mul32x32_64(x[3], y[2]) + mul32x32_64(x[4], y[1]) + mul32x32_64(x[5], y[0]);
  186. f = (bignum256modm_element_t)c; r1[5] = (f & 0x3fffffff); c >>= 30;
  187. c += mul32x32_64(x[0], y[6]) + mul32x32_64(x[1], y[5]) + mul32x32_64(x[2], y[4]) + mul32x32_64(x[3], y[3]) + mul32x32_64(x[4], y[2]) + mul32x32_64(x[5], y[1]) + mul32x32_64(x[6], y[0]);
  188. f = (bignum256modm_element_t)c; r1[6] = (f & 0x3fffffff); c >>= 30;
  189. c += mul32x32_64(x[0], y[7]) + mul32x32_64(x[1], y[6]) + mul32x32_64(x[2], y[5]) + mul32x32_64(x[3], y[4]) + mul32x32_64(x[4], y[3]) + mul32x32_64(x[5], y[2]) + mul32x32_64(x[6], y[1]) + mul32x32_64(x[7], y[0]);
  190. f = (bignum256modm_element_t)c; r1[7] = (f & 0x3fffffff); c >>= 30;
  191. c += mul32x32_64(x[0], y[8]) + mul32x32_64(x[1], y[7]) + mul32x32_64(x[2], y[6]) + mul32x32_64(x[3], y[5]) + mul32x32_64(x[4], y[4]) + mul32x32_64(x[5], y[3]) + mul32x32_64(x[6], y[2]) + mul32x32_64(x[7], y[1]) + mul32x32_64(x[8], y[0]);
  192. f = (bignum256modm_element_t)c; r1[8] = (f & 0x00ffffff); q1[0] = (f >> 8) & 0x3fffff; c >>= 30;
  193. c += mul32x32_64(x[1], y[8]) + mul32x32_64(x[2], y[7]) + mul32x32_64(x[3], y[6]) + mul32x32_64(x[4], y[5]) + mul32x32_64(x[5], y[4]) + mul32x32_64(x[6], y[3]) + mul32x32_64(x[7], y[2]) + mul32x32_64(x[8], y[1]);
  194. f = (bignum256modm_element_t)c; q1[0] = (q1[0] | (f << 22)) & 0x3fffffff; q1[1] = (f >> 8) & 0x3fffff; c >>= 30;
  195. c += mul32x32_64(x[2], y[8]) + mul32x32_64(x[3], y[7]) + mul32x32_64(x[4], y[6]) + mul32x32_64(x[5], y[5]) + mul32x32_64(x[6], y[4]) + mul32x32_64(x[7], y[3]) + mul32x32_64(x[8], y[2]);
  196. f = (bignum256modm_element_t)c; q1[1] = (q1[1] | (f << 22)) & 0x3fffffff; q1[2] = (f >> 8) & 0x3fffff; c >>= 30;
  197. c += mul32x32_64(x[3], y[8]) + mul32x32_64(x[4], y[7]) + mul32x32_64(x[5], y[6]) + mul32x32_64(x[6], y[5]) + mul32x32_64(x[7], y[4]) + mul32x32_64(x[8], y[3]);
  198. f = (bignum256modm_element_t)c; q1[2] = (q1[2] | (f << 22)) & 0x3fffffff; q1[3] = (f >> 8) & 0x3fffff; c >>= 30;
  199. c += mul32x32_64(x[4], y[8]) + mul32x32_64(x[5], y[7]) + mul32x32_64(x[6], y[6]) + mul32x32_64(x[7], y[5]) + mul32x32_64(x[8], y[4]);
  200. f = (bignum256modm_element_t)c; q1[3] = (q1[3] | (f << 22)) & 0x3fffffff; q1[4] = (f >> 8) & 0x3fffff; c >>= 30;
  201. c += mul32x32_64(x[5], y[8]) + mul32x32_64(x[6], y[7]) + mul32x32_64(x[7], y[6]) + mul32x32_64(x[8], y[5]);
  202. f = (bignum256modm_element_t)c; q1[4] = (q1[4] | (f << 22)) & 0x3fffffff; q1[5] = (f >> 8) & 0x3fffff; c >>= 30;
  203. c += mul32x32_64(x[6], y[8]) + mul32x32_64(x[7], y[7]) + mul32x32_64(x[8], y[6]);
  204. f = (bignum256modm_element_t)c; q1[5] = (q1[5] | (f << 22)) & 0x3fffffff; q1[6] = (f >> 8) & 0x3fffff; c >>= 30;
  205. c += mul32x32_64(x[7], y[8]) + mul32x32_64(x[8], y[7]);
  206. f = (bignum256modm_element_t)c; q1[6] = (q1[6] | (f << 22)) & 0x3fffffff; q1[7] = (f >> 8) & 0x3fffff; c >>= 30;
  207. c += mul32x32_64(x[8], y[8]);
  208. f = (bignum256modm_element_t)c; q1[7] = (q1[7] | (f << 22)) & 0x3fffffff; q1[8] = (f >> 8) & 0x3fffff;
  209. barrett_reduce256_modm(r, q1, r1);
  210. }
  211. void expand256_modm(bignum256modm out, const unsigned char *in, size_t len) {
  212. unsigned char work[64] = {0};
  213. bignum256modm_element_t x[16] = {0};
  214. bignum256modm q1 = {0};
  215. memcpy(work, in, len);
  216. x[0] = U8TO32_LE(work + 0);
  217. x[1] = U8TO32_LE(work + 4);
  218. x[2] = U8TO32_LE(work + 8);
  219. x[3] = U8TO32_LE(work + 12);
  220. x[4] = U8TO32_LE(work + 16);
  221. x[5] = U8TO32_LE(work + 20);
  222. x[6] = U8TO32_LE(work + 24);
  223. x[7] = U8TO32_LE(work + 28);
  224. x[8] = U8TO32_LE(work + 32);
  225. x[9] = U8TO32_LE(work + 36);
  226. x[10] = U8TO32_LE(work + 40);
  227. x[11] = U8TO32_LE(work + 44);
  228. x[12] = U8TO32_LE(work + 48);
  229. x[13] = U8TO32_LE(work + 52);
  230. x[14] = U8TO32_LE(work + 56);
  231. x[15] = U8TO32_LE(work + 60);
  232. /* r1 = (x mod 256^(32+1)) = x mod (2^8)(31+1) = x & ((1 << 264) - 1) */
  233. out[0] = ( x[0]) & 0x3fffffff;
  234. out[1] = ((x[ 0] >> 30) | (x[ 1] << 2)) & 0x3fffffff;
  235. out[2] = ((x[ 1] >> 28) | (x[ 2] << 4)) & 0x3fffffff;
  236. out[3] = ((x[ 2] >> 26) | (x[ 3] << 6)) & 0x3fffffff;
  237. out[4] = ((x[ 3] >> 24) | (x[ 4] << 8)) & 0x3fffffff;
  238. out[5] = ((x[ 4] >> 22) | (x[ 5] << 10)) & 0x3fffffff;
  239. out[6] = ((x[ 5] >> 20) | (x[ 6] << 12)) & 0x3fffffff;
  240. out[7] = ((x[ 6] >> 18) | (x[ 7] << 14)) & 0x3fffffff;
  241. out[8] = ((x[ 7] >> 16) | (x[ 8] << 16)) & 0x00ffffff;
  242. /* 8*31 = 248 bits, no need to reduce */
  243. if (len < 32)
  244. return;
  245. /* q1 = x >> 248 = 264 bits = 9 30 bit elements */
  246. q1[0] = ((x[ 7] >> 24) | (x[ 8] << 8)) & 0x3fffffff;
  247. q1[1] = ((x[ 8] >> 22) | (x[ 9] << 10)) & 0x3fffffff;
  248. q1[2] = ((x[ 9] >> 20) | (x[10] << 12)) & 0x3fffffff;
  249. q1[3] = ((x[10] >> 18) | (x[11] << 14)) & 0x3fffffff;
  250. q1[4] = ((x[11] >> 16) | (x[12] << 16)) & 0x3fffffff;
  251. q1[5] = ((x[12] >> 14) | (x[13] << 18)) & 0x3fffffff;
  252. q1[6] = ((x[13] >> 12) | (x[14] << 20)) & 0x3fffffff;
  253. q1[7] = ((x[14] >> 10) | (x[15] << 22)) & 0x3fffffff;
  254. q1[8] = ((x[15] >> 8) );
  255. barrett_reduce256_modm(out, q1, out);
  256. }
  257. void expand_raw256_modm(bignum256modm out, const unsigned char in[32]) {
  258. bignum256modm_element_t x[8] = {0};
  259. x[0] = U8TO32_LE(in + 0);
  260. x[1] = U8TO32_LE(in + 4);
  261. x[2] = U8TO32_LE(in + 8);
  262. x[3] = U8TO32_LE(in + 12);
  263. x[4] = U8TO32_LE(in + 16);
  264. x[5] = U8TO32_LE(in + 20);
  265. x[6] = U8TO32_LE(in + 24);
  266. x[7] = U8TO32_LE(in + 28);
  267. out[0] = ( x[0]) & 0x3fffffff;
  268. out[1] = ((x[ 0] >> 30) | (x[ 1] << 2)) & 0x3fffffff;
  269. out[2] = ((x[ 1] >> 28) | (x[ 2] << 4)) & 0x3fffffff;
  270. out[3] = ((x[ 2] >> 26) | (x[ 3] << 6)) & 0x3fffffff;
  271. out[4] = ((x[ 3] >> 24) | (x[ 4] << 8)) & 0x3fffffff;
  272. out[5] = ((x[ 4] >> 22) | (x[ 5] << 10)) & 0x3fffffff;
  273. out[6] = ((x[ 5] >> 20) | (x[ 6] << 12)) & 0x3fffffff;
  274. out[7] = ((x[ 6] >> 18) | (x[ 7] << 14)) & 0x3fffffff;
  275. out[8] = ((x[ 7] >> 16) ) & 0x0000ffff;
  276. }
  277. int is_reduced256_modm(const bignum256modm in)
  278. {
  279. int i = 0;
  280. uint32_t res1 = 0;
  281. uint32_t res2 = 0;
  282. for (i = 8; i >= 0; i--) {
  283. res1 = (res1 << 1) | (in[i] < modm_m[i]);
  284. res2 = (res2 << 1) | (in[i] > modm_m[i]);
  285. }
  286. return res1 > res2;
  287. }
  288. void contract256_modm(unsigned char out[32], const bignum256modm in) {
  289. U32TO8_LE(out + 0, (in[0] ) | (in[1] << 30));
  290. U32TO8_LE(out + 4, (in[1] >> 2) | (in[2] << 28));
  291. U32TO8_LE(out + 8, (in[2] >> 4) | (in[3] << 26));
  292. U32TO8_LE(out + 12, (in[3] >> 6) | (in[4] << 24));
  293. U32TO8_LE(out + 16, (in[4] >> 8) | (in[5] << 22));
  294. U32TO8_LE(out + 20, (in[5] >> 10) | (in[6] << 20));
  295. U32TO8_LE(out + 24, (in[6] >> 12) | (in[7] << 18));
  296. U32TO8_LE(out + 28, (in[7] >> 14) | (in[8] << 16));
  297. }
  298. void contract256_window4_modm(signed char r[64], const bignum256modm in) {
  299. char carry = 0;
  300. signed char *quads = r;
  301. bignum256modm_element_t i = 0, j = 0, v = 0;
  302. for (i = 0; i < 8; i += 2) {
  303. v = in[i];
  304. for (j = 0; j < 7; j++) {
  305. *quads++ = (v & 15);
  306. v >>= 4;
  307. }
  308. v |= (in[i+1] << 2);
  309. for (j = 0; j < 8; j++) {
  310. *quads++ = (v & 15);
  311. v >>= 4;
  312. }
  313. }
  314. v = in[8];
  315. *quads++ = (v & 15); v >>= 4;
  316. *quads++ = (v & 15); v >>= 4;
  317. *quads++ = (v & 15); v >>= 4;
  318. *quads++ = (v & 15); v >>= 4;
  319. /* making it signed */
  320. carry = 0;
  321. for(i = 0; i < 63; i++) {
  322. r[i] += carry;
  323. r[i+1] += (r[i] >> 4);
  324. r[i] &= 15;
  325. carry = (r[i] >> 3);
  326. r[i] -= (carry << 4);
  327. }
  328. r[63] += carry;
  329. }
  330. void contract256_slidingwindow_modm(signed char r[256], const bignum256modm s, int windowsize) {
  331. int i = 0, j = 0, k = 0, b = 0;
  332. int m = (1 << (windowsize - 1)) - 1, soplen = 256;
  333. signed char *bits = r;
  334. bignum256modm_element_t v = 0;
  335. /* first put the binary expansion into r */
  336. for (i = 0; i < 8; i++) {
  337. v = s[i];
  338. for (j = 0; j < 30; j++, v >>= 1)
  339. *bits++ = (v & 1);
  340. }
  341. v = s[8];
  342. for (j = 0; j < 16; j++, v >>= 1)
  343. *bits++ = (v & 1);
  344. /* Making it sliding window */
  345. for (j = 0; j < soplen; j++) {
  346. if (!r[j])
  347. continue;
  348. for (b = 1; (b < (soplen - j)) && (b <= 6); b++) {
  349. if ((r[j] + (r[j + b] << b)) <= m) {
  350. r[j] += r[j + b] << b;
  351. r[j + b] = 0;
  352. } else if ((r[j] - (r[j + b] << b)) >= -m) {
  353. r[j] -= r[j + b] << b;
  354. for (k = j + b; k < soplen; k++) {
  355. if (!r[k]) {
  356. r[k] = 1;
  357. break;
  358. }
  359. r[k] = 0;
  360. }
  361. } else if (r[j + b]) {
  362. break;
  363. }
  364. }
  365. }
  366. }
  367. void set256_modm(bignum256modm r, uint64_t v) {
  368. r[0] = (bignum256modm_element_t) (v & 0x3fffffff); v >>= 30;
  369. r[1] = (bignum256modm_element_t) (v & 0x3fffffff); v >>= 30;
  370. r[2] = (bignum256modm_element_t) (v & 0x3fffffff);
  371. r[3] = 0;
  372. r[4] = 0;
  373. r[5] = 0;
  374. r[6] = 0;
  375. r[7] = 0;
  376. r[8] = 0;
  377. }
  378. int get256_modm(uint64_t * v, const bignum256modm r){
  379. *v = 0;
  380. int con1 = 0;
  381. #define NONZ(x) ((((((int64_t)(x)) - 1) >> 32) + 1) & 1)
  382. bignum256modm_element_t c = 0;
  383. c = r[0]; *v += (uint64_t)c & 0x3fffffff; c >>= 30; // 30
  384. c += r[1]; *v += ((uint64_t)c & 0x3fffffff) << 30; c >>= 30; // 60
  385. c += r[2]; *v += ((uint64_t)c & 0xf) << 60; con1 |= NONZ(c>>4); c >>= 30; // 64 bits
  386. c += r[3]; con1 |= NONZ(c); c >>= 30;
  387. c += r[4]; con1 |= NONZ(c); c >>= 30;
  388. c += r[5]; con1 |= NONZ(c); c >>= 30;
  389. c += r[6]; con1 |= NONZ(c); c >>= 30;
  390. c += r[7]; con1 |= NONZ(c); c >>= 30;
  391. c += r[8]; con1 |= NONZ(c); c >>= 30;
  392. con1 |= NONZ(c);
  393. #undef NONZ
  394. return con1 ^ 1;
  395. }
  396. int eq256_modm(const bignum256modm x, const bignum256modm y){
  397. size_t differentbits = 0;
  398. int len = bignum256modm_limb_size;
  399. while (len--) {
  400. differentbits |= (*x++ ^ *y++);
  401. }
  402. return (int) (1 & ((differentbits - 1) >> bignum256modm_bits_per_limb));
  403. }
  404. int cmp256_modm(const bignum256modm x, const bignum256modm y){
  405. int len = 2*bignum256modm_limb_size;
  406. uint32_t a_gt = 0;
  407. uint32_t b_gt = 0;
  408. // 16B chunks
  409. while (len--) {
  410. const uint32_t ln = (const uint32_t) len;
  411. const uint32_t a = (x[ln>>1] >> 16*(ln & 1)) & 0xffff;
  412. const uint32_t b = (y[ln>>1] >> 16*(ln & 1)) & 0xffff;
  413. const uint32_t limb_a_gt = ((b - a) >> 16) & 1;
  414. const uint32_t limb_b_gt = ((a - b) >> 16) & 1;
  415. a_gt |= limb_a_gt & ~b_gt;
  416. b_gt |= limb_b_gt & ~a_gt;
  417. }
  418. return a_gt - b_gt;
  419. }
  420. int iszero256_modm(const bignum256modm x){
  421. size_t differentbits = 0;
  422. int len = bignum256modm_limb_size;
  423. while (len--) {
  424. differentbits |= (*x++);
  425. }
  426. return (int) (1 & ((differentbits - 1) >> bignum256modm_bits_per_limb));
  427. }
  428. void copy256_modm(bignum256modm r, const bignum256modm x){
  429. r[0] = x[0];
  430. r[1] = x[1];
  431. r[2] = x[2];
  432. r[3] = x[3];
  433. r[4] = x[4];
  434. r[5] = x[5];
  435. r[6] = x[6];
  436. r[7] = x[7];
  437. r[8] = x[8];
  438. }
  439. int check256_modm(const bignum256modm x){
  440. int ok = 1;
  441. bignum256modm t={0}, z={0};
  442. ok &= iszero256_modm(x) ^ 1;
  443. barrett_reduce256_modm(t, z, x);
  444. ok &= eq256_modm(t, x);
  445. return ok;
  446. }
  447. void mulsub256_modm(bignum256modm r, const bignum256modm a, const bignum256modm b, const bignum256modm c){
  448. //(cc - aa * bb) % l
  449. bignum256modm t={0};
  450. mul256_modm(t, a, b);
  451. sub256_modm(r, c, t);
  452. }
  453. void muladd256_modm(bignum256modm r, const bignum256modm a, const bignum256modm b, const bignum256modm c){
  454. //(cc + aa * bb) % l
  455. bignum256modm t={0};
  456. mul256_modm(t, a, b);
  457. add256_modm(r, c, t);
  458. }