asm_arm_mult_square_umaal.inc 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202
  1. /* Copyright 2015, Kenneth MacKay. Licensed under the BSD 2-clause license. */
  2. #ifndef _UECC_ASM_ARM_MULT_SQUARE_H_
  3. #define _UECC_ASM_ARM_MULT_SQUARE_H_
  /*
   * FAST_MULT_ASM_5 -- inline-asm string fragment: 5-word x 5-word multiply.
   *
   * Register use, as read from the instructions below:
   *   r0  destination pointer; ten 32-bit words are stored through it with
   *       post-increment, so it leaves advanced by 40 bytes.
   *   r1  pointer to one operand; five words are read, leaves advanced by 20.
   *   r2  pointer to the other operand; five words are read by the ldmia.
   *       r2 is then reused as scratch for the current r1 word, and the
   *       advanced pointer (entry value + 20) is restored by the final pop.
   *   r3  saved on entry and restored by the final pop.
   *   r4-r12, r14 are overwritten. Two stack words are used temporarily.
   *
   * "umaal lo, hi, a, b" computes hi:lo = a*b + lo + hi (see the ARM
   * architecture manual). Each row multiplies one r1 word by r3..r7; the
   * finished low word lands in r8 and is stored, while r9,r10,r11,r12,r14
   * carry the five not-yet-final words into the next row.
   *
   * NOTE(review): the FAST_MULT_ASM_5_TO_6 comments call r1 "left" and r2
   * "right"; the same roles presumably apply here -- confirm with the caller.
   */
  4. #define FAST_MULT_ASM_5 \
  5. "push {r3} \n\t" /* r3 is reused below for an operand word */ \
  6. "ldmia r2!, {r3, r4, r5, r6, r7} \n\t" /* r3..r7 = five words at r2; r2 += 20 */ \
  7. "push {r2} \n\t" /* keep the advanced r2; r2 becomes scratch */ \
  8. /* Row 0: umull starts the chain, zeroed registers seed each carry slot */ \
  9. "ldr r2, [r1], #4 \n\t" \
  10. "umull r8, r9, r3, r2 \n\t" \
  11. "str r8, [r0], #4 \n\t" \
  12. "mov r10, #0 \n\t" \
  13. "umaal r9, r10, r4, r2 \n\t" \
  14. "mov r11, #0 \n\t" \
  15. "umaal r10, r11, r5, r2 \n\t" \
  16. "mov r12, #0 \n\t" \
  17. "umaal r11, r12, r6, r2 \n\t" \
  18. "mov r14, #0 \n\t" \
  19. "umaal r12, r14, r7, r2 \n\t" \
  20. /* Row 1: r8 is cleared so it receives only the finished low word */ \
  21. "ldr r2, [r1], #4 \n\t" \
  22. "mov r8, #0 \n\t" \
  23. "umaal r8, r9, r3, r2 \n\t" \
  24. "str r8, [r0], #4 \n\t" \
  25. "umaal r9, r10, r4, r2 \n\t" \
  26. "umaal r10, r11, r5, r2 \n\t" \
  27. "umaal r11, r12, r6, r2 \n\t" \
  28. "umaal r12, r14, r7, r2 \n\t" \
  29. /* Row 2 */ \
  30. "ldr r2, [r1], #4 \n\t" \
  31. "mov r8, #0 \n\t" \
  32. "umaal r8, r9, r3, r2 \n\t" \
  33. "str r8, [r0], #4 \n\t" \
  34. "umaal r9, r10, r4, r2 \n\t" \
  35. "umaal r10, r11, r5, r2 \n\t" \
  36. "umaal r11, r12, r6, r2 \n\t" \
  37. "umaal r12, r14, r7, r2 \n\t" \
  38. /* Row 3 */ \
  39. "ldr r2, [r1], #4 \n\t" \
  40. "mov r8, #0 \n\t" \
  41. "umaal r8, r9, r3, r2 \n\t" \
  42. "str r8, [r0], #4 \n\t" \
  43. "umaal r9, r10, r4, r2 \n\t" \
  44. "umaal r10, r11, r5, r2 \n\t" \
  45. "umaal r11, r12, r6, r2 \n\t" \
  46. "umaal r12, r14, r7, r2 \n\t" \
  47. /* Row 4 (last word read through r1) */ \
  48. "ldr r2, [r1], #4 \n\t" \
  49. "mov r8, #0 \n\t" \
  50. "umaal r8, r9, r3, r2 \n\t" \
  51. "str r8, [r0], #4 \n\t" \
  52. "umaal r9, r10, r4, r2 \n\t" \
  53. "umaal r10, r11, r5, r2 \n\t" \
  54. "umaal r11, r12, r6, r2 \n\t" \
  55. "umaal r12, r14, r7, r2 \n\t" \
  56. /* Flush the five remaining high words */ \
  57. "str r9, [r0], #4 \n\t" \
  58. "str r10, [r0], #4 \n\t" \
  59. "str r11, [r0], #4 \n\t" \
  60. "str r12, [r0], #4 \n\t" \
  61. "str r14, [r0], #4 \n\t" \
  62. \
  63. "pop {r2, r3} \n\t" /* r2 = advanced pointer pushed above, r3 = entry value */
  /*
   * FAST_MULT_ASM_5_TO_6 -- inline-asm string fragment that folds one extra
   * word of each operand into an existing product.
   *
   * Skipped entirely (branch to local label "1", defined outside this macro)
   * when r3 == 5. NOTE(review): r3 is presumably the operand word count --
   * confirm with the caller.
   *
   * The three "sub ..., #20" rewinds imply the expected entry state: r1 and
   * r2 each 20 bytes past the start of their operand, and r0 20 bytes past
   * the first result word that still needs updating. NOTE(review): this looks
   * designed to follow FAST_MULT_ASM_5 directly -- confirm.
   *
   * Net effect on pointers: r0 += 8, r1 += 4, r2 += 4.
   * Overwrites r4-r10, r12, r14.
   *
   * "umaal lo, hi, a, b" computes hi:lo = a*b + lo + hi.
   */
  64. #define FAST_MULT_ASM_5_TO_6 \
  65. "cmp r3, #5 \n\t" \
  66. "beq 1f \n\t" \
  67. \
  68. /* r4 = left high: the word r1 currently points at (no post-increment) */ \
  69. "ldr r4, [r1] \n\t" \
  70. \
  71. "sub r0, #20 \n\t" \
  72. "sub r1, #20 \n\t" \
  73. "sub r2, #20 \n\t" \
  74. \
  75. /* Do right side: add r4 * (five words at r2) onto five stored result words; */ \
  76. "ldr r14, [r2], #4 \n\t" /* the running words are kept in r5..r10, not stored yet */ \
  77. "mov r5, #0 \n\t" \
  78. "ldr r6, [r0], #4 \n\t" \
  79. "umaal r5, r6, r4, r14 \n\t" \
  80. "ldr r14, [r2], #4 \n\t" \
  81. "ldr r7, [r0], #4 \n\t" \
  82. "umaal r6, r7, r4, r14 \n\t" \
  83. "ldr r14, [r2], #4 \n\t" \
  84. "ldr r8, [r0], #4 \n\t" \
  85. "umaal r7, r8, r4, r14 \n\t" \
  86. "ldr r14, [r2], #4 \n\t" \
  87. "ldr r9, [r0], #4 \n\t" \
  88. "umaal r8, r9, r4, r14 \n\t" \
  89. "ldr r14, [r2], #4 \n\t" \
  90. "ldr r10, [r0], #4 \n\t" \
  91. "umaal r9, r10, r4, r14 \n\t" \
  92. "sub r0, #20 \n\t" /* rewind r0 to the first of the words just read */ \
  93. \
  94. /* r4 = right high: the sixth word at r2 (r2 ends 24 bytes past its rewound value) */ \
  95. "ldr r4, [r2], #4 \n\t" \
  96. \
  97. /* Do left side: add r4 * (six words at r1), storing each word as it becomes final */ \
  98. "ldr r14, [r1], #4 \n\t" \
  99. "mov r12, #0 \n\t" \
  100. "umaal r12, r5, r4, r14 \n\t" \
  101. "str r12, [r0], #4 \n\t" \
  102. "ldr r14, [r1], #4 \n\t" \
  103. "umaal r5, r6, r4, r14 \n\t" \
  104. "str r5, [r0], #4 \n\t" \
  105. "ldr r14, [r1], #4 \n\t" \
  106. "umaal r6, r7, r4, r14 \n\t" \
  107. "str r6, [r0], #4 \n\t" \
  108. "ldr r14, [r1], #4 \n\t" \
  109. "umaal r7, r8, r4, r14 \n\t" \
  110. "str r7, [r0], #4 \n\t" \
  111. "ldr r14, [r1], #4 \n\t" \
  112. "umaal r8, r9, r4, r14 \n\t" \
  113. "str r8, [r0], #4 \n\t" \
  114. /* Final term: high word times high word, written as two new top words */ \
  115. "ldr r14, [r1], #4 \n\t" \
  116. "umaal r9, r10, r4, r14 \n\t" \
  117. "stmia r0!, {r9, r10} \n\t"
  /*
   * FAST_MULT_ASM_6 -- inline-asm string fragment: 6-word x 6-word multiply,
   * done as two passes of three r2-words each.
   *
   * Register use, as read from the instructions below:
   *   r0  destination pointer; leaves advanced by 48 bytes (twelve words
   *       written in total: nine by pass 1, then pass 2 rewrites six of them
   *       and appends three).
   *   r1  pointer to the operand read one word per row; rewound by 24 between
   *       passes, leaves advanced by 24.
   *   r2  pointer to the operand read three words per pass via ldmia; leaves
   *       advanced by 24.
   *   r4-r6, r8-r11, r14 are overwritten.
   *
   * "umaal lo, hi, a, b" computes hi:lo = a*b + lo + hi.
   */
  118. #define FAST_MULT_ASM_6 \
  119. "ldmia r2!, {r4, r5, r6} \n\t" /* pass 1 multiplier words */ \
  120. /* Pass 1, row 0: umull starts the chain; zeroed registers seed the carries */ \
  121. "ldr r14, [r1], #4 \n\t" \
  122. "umull r8, r9, r4, r14 \n\t" \
  123. "str r8, [r0], #4 \n\t" \
  124. "mov r10, #0 \n\t" \
  125. "umaal r9, r10, r5, r14 \n\t" \
  126. "mov r11, #0 \n\t" \
  127. "umaal r10, r11, r6, r14 \n\t" \
  128. /* Pass 1, rows 1-5: r8 cleared, receives the finished low word */ \
  129. "ldr r14, [r1], #4 \n\t" \
  130. "mov r8, #0 \n\t" \
  131. "umaal r8, r9, r4, r14 \n\t" \
  132. "str r8, [r0], #4 \n\t" \
  133. "umaal r9, r10, r5, r14 \n\t" \
  134. "umaal r10, r11, r6, r14 \n\t" \
  135. \
  136. "ldr r14, [r1], #4 \n\t" \
  137. "mov r8, #0 \n\t" \
  138. "umaal r8, r9, r4, r14 \n\t" \
  139. "str r8, [r0], #4 \n\t" \
  140. "umaal r9, r10, r5, r14 \n\t" \
  141. "umaal r10, r11, r6, r14 \n\t" \
  142. \
  143. "ldr r14, [r1], #4 \n\t" \
  144. "mov r8, #0 \n\t" \
  145. "umaal r8, r9, r4, r14 \n\t" \
  146. "str r8, [r0], #4 \n\t" \
  147. "umaal r9, r10, r5, r14 \n\t" \
  148. "umaal r10, r11, r6, r14 \n\t" \
  149. \
  150. "ldr r14, [r1], #4 \n\t" \
  151. "mov r8, #0 \n\t" \
  152. "umaal r8, r9, r4, r14 \n\t" \
  153. "str r8, [r0], #4 \n\t" \
  154. "umaal r9, r10, r5, r14 \n\t" \
  155. "umaal r10, r11, r6, r14 \n\t" \
  156. \
  157. "ldr r14, [r1], #4 \n\t" \
  158. "mov r8, #0 \n\t" \
  159. "umaal r8, r9, r4, r14 \n\t" \
  160. "str r8, [r0], #4 \n\t" \
  161. "umaal r9, r10, r5, r14 \n\t" \
  162. "umaal r10, r11, r6, r14 \n\t" \
  163. /* Flush the three high words of pass 1 */ \
  164. "str r9, [r0], #4 \n\t" \
  165. "str r10, [r0], #4 \n\t" \
  166. "str r11, [r0], #4 \n\t" \
  167. /* Pass 2 setup: r0 back to the 4th word written, r1 back to its first word */ \
  168. "sub r0, #24 \n\t" \
  169. "sub r1, #24 \n\t" \
  170. "ldmia r2!, {r4, r5, r6} \n\t" /* pass 2 multiplier words */ \
  171. /* Pass 2, row 0: r8 is loaded from memory so the stored word is accumulated */ \
  172. "ldr r14, [r1], #4 \n\t" \
  173. "ldr r8, [r0] \n\t" \
  174. "mov r9, #0 \n\t" \
  175. "umaal r8, r9, r4, r14 \n\t" \
  176. "str r8, [r0], #4 \n\t" \
  177. "mov r10, #0 \n\t" \
  178. "umaal r9, r10, r5, r14 \n\t" \
  179. "mov r11, #0 \n\t" \
  180. "umaal r10, r11, r6, r14 \n\t" \
  181. /* Pass 2, rows 1-5: read-modify-write of one stored word per row */ \
  182. "ldr r14, [r1], #4 \n\t" \
  183. "ldr r8, [r0] \n\t" \
  184. "umaal r8, r9, r4, r14 \n\t" \
  185. "str r8, [r0], #4 \n\t" \
  186. "umaal r9, r10, r5, r14 \n\t" \
  187. "umaal r10, r11, r6, r14 \n\t" \
  188. \
  189. "ldr r14, [r1], #4 \n\t" \
  190. "ldr r8, [r0] \n\t" \
  191. "umaal r8, r9, r4, r14 \n\t" \
  192. "str r8, [r0], #4 \n\t" \
  193. "umaal r9, r10, r5, r14 \n\t" \
  194. "umaal r10, r11, r6, r14 \n\t" \
  195. \
  196. "ldr r14, [r1], #4 \n\t" \
  197. "ldr r8, [r0] \n\t" \
  198. "umaal r8, r9, r4, r14 \n\t" \
  199. "str r8, [r0], #4 \n\t" \
  200. "umaal r9, r10, r5, r14 \n\t" \
  201. "umaal r10, r11, r6, r14 \n\t" \
  202. \
  203. "ldr r14, [r1], #4 \n\t" \
  204. "ldr r8, [r0] \n\t" \
  205. "umaal r8, r9, r4, r14 \n\t" \
  206. "str r8, [r0], #4 \n\t" \
  207. "umaal r9, r10, r5, r14 \n\t" \
  208. "umaal r10, r11, r6, r14 \n\t" \
  209. \
  210. "ldr r14, [r1], #4 \n\t" \
  211. "ldr r8, [r0] \n\t" \
  212. "umaal r8, r9, r4, r14 \n\t" \
  213. "str r8, [r0], #4 \n\t" \
  214. "umaal r9, r10, r5, r14 \n\t" \
  215. "umaal r10, r11, r6, r14 \n\t" \
  216. /* Flush the three top words */ \
  217. "str r9, [r0], #4 \n\t" \
  218. "str r10, [r0], #4 \n\t" \
  219. "str r11, [r0], #4 \n\t"
  /*
   * FAST_MULT_ASM_6_TO_7 -- inline-asm string fragment that folds one extra
   * word of each operand into an existing product.
   *
   * Skipped entirely (branch to local label "1", defined outside this macro)
   * when r3 == 6. NOTE(review): r3 is presumably the operand word count --
   * confirm with the caller.
   *
   * The three "sub ..., #24" rewinds imply the expected entry state: r1 and
   * r2 each 24 bytes past the start of their operand, and r0 24 bytes past
   * the first result word that still needs updating. NOTE(review): this looks
   * designed to follow FAST_MULT_ASM_6 directly -- confirm.
   *
   * Net effect on pointers: r0 += 8, r1 += 4, r2 += 4.
   * Overwrites r4-r12, r14.
   *
   * "umaal lo, hi, a, b" computes hi:lo = a*b + lo + hi.
   */
  220. #define FAST_MULT_ASM_6_TO_7 \
  221. "cmp r3, #6 \n\t" \
  222. "beq 1f \n\t" \
  223. \
  224. /* r4 = left high: the word r1 currently points at (no post-increment) */ \
  225. "ldr r4, [r1] \n\t" \
  226. \
  227. "sub r0, #24 \n\t" \
  228. "sub r1, #24 \n\t" \
  229. "sub r2, #24 \n\t" \
  230. \
  231. /* Do right side: add r4 * (six words at r2) onto six stored result words; */ \
  232. "ldr r14, [r2], #4 \n\t" /* the running words are kept in r5..r11, not stored yet */ \
  233. "mov r5, #0 \n\t" \
  234. "ldr r6, [r0], #4 \n\t" \
  235. "umaal r5, r6, r4, r14 \n\t" \
  236. "ldr r14, [r2], #4 \n\t" \
  237. "ldr r7, [r0], #4 \n\t" \
  238. "umaal r6, r7, r4, r14 \n\t" \
  239. "ldr r14, [r2], #4 \n\t" \
  240. "ldr r8, [r0], #4 \n\t" \
  241. "umaal r7, r8, r4, r14 \n\t" \
  242. "ldr r14, [r2], #4 \n\t" \
  243. "ldr r9, [r0], #4 \n\t" \
  244. "umaal r8, r9, r4, r14 \n\t" \
  245. "ldr r14, [r2], #4 \n\t" \
  246. "ldr r10, [r0], #4 \n\t" \
  247. "umaal r9, r10, r4, r14 \n\t" \
  248. "ldr r14, [r2], #4 \n\t" \
  249. "ldr r11, [r0], #4 \n\t" \
  250. "umaal r10, r11, r4, r14 \n\t" \
  251. "sub r0, #24 \n\t" /* rewind r0 to the first of the words just read */ \
  252. \
  253. /* r4 = right high: the seventh word at r2 */ \
  254. "ldr r4, [r2], #4 \n\t" \
  255. \
  256. /* Do left side: add r4 * (seven words at r1), storing each word as it becomes final */ \
  257. "ldr r14, [r1], #4 \n\t" \
  258. "mov r12, #0 \n\t" \
  259. "umaal r12, r5, r4, r14 \n\t" \
  260. "str r12, [r0], #4 \n\t" \
  261. "ldr r14, [r1], #4 \n\t" \
  262. "umaal r5, r6, r4, r14 \n\t" \
  263. "str r5, [r0], #4 \n\t" \
  264. "ldr r14, [r1], #4 \n\t" \
  265. "umaal r6, r7, r4, r14 \n\t" \
  266. "str r6, [r0], #4 \n\t" \
  267. "ldr r14, [r1], #4 \n\t" \
  268. "umaal r7, r8, r4, r14 \n\t" \
  269. "str r7, [r0], #4 \n\t" \
  270. "ldr r14, [r1], #4 \n\t" \
  271. "umaal r8, r9, r4, r14 \n\t" \
  272. "str r8, [r0], #4 \n\t" \
  273. "ldr r14, [r1], #4 \n\t" \
  274. "umaal r9, r10, r4, r14 \n\t" \
  275. "str r9, [r0], #4 \n\t" \
  276. /* Final term: high word times high word, written as two new top words */ \
  277. "ldr r14, [r1], #4 \n\t" \
  278. "umaal r10, r11, r4, r14 \n\t" \
  279. "stmia r0!, {r10, r11} \n\t"
  /*
   * FAST_MULT_ASM_7 -- inline-asm string fragment: 7-word x 7-word multiply,
   * done as a four-word pass followed by a three-word pass over r2's operand.
   *
   * Register use, as read from the instructions below:
   *   r0  destination pointer; leaves advanced by 56 bytes (fourteen words
   *       written in total: eleven by pass 1, then pass 2 rewrites seven of
   *       them and appends three).
   *   r1  pointer to the operand read one word per row; rewound by 28 between
   *       passes, leaves advanced by 28.
   *   r2  pointer to the operand read by ldmia (4 words, then 3); leaves
   *       advanced by 28.
   *   r4-r12, r14 are overwritten.
   *
   * "umaal lo, hi, a, b" computes hi:lo = a*b + lo + hi.
   */
  280. #define FAST_MULT_ASM_7 \
  281. "ldmia r2!, {r4, r5, r6, r7} \n\t" /* pass 1 multiplier words */ \
  282. /* Pass 1, row 0: umull starts the chain; zeroed registers seed the carries */ \
  283. "ldr r14, [r1], #4 \n\t" \
  284. "umull r8, r9, r4, r14 \n\t" \
  285. "str r8, [r0], #4 \n\t" \
  286. "mov r10, #0 \n\t" \
  287. "umaal r9, r10, r5, r14 \n\t" \
  288. "mov r11, #0 \n\t" \
  289. "umaal r10, r11, r6, r14 \n\t" \
  290. "mov r12, #0 \n\t" \
  291. "umaal r11, r12, r7, r14 \n\t" \
  292. /* Pass 1, rows 1-6: r8 cleared, receives the finished low word */ \
  293. "ldr r14, [r1], #4 \n\t" \
  294. "mov r8, #0 \n\t" \
  295. "umaal r8, r9, r4, r14 \n\t" \
  296. "str r8, [r0], #4 \n\t" \
  297. "umaal r9, r10, r5, r14 \n\t" \
  298. "umaal r10, r11, r6, r14 \n\t" \
  299. "umaal r11, r12, r7, r14 \n\t" \
  300. \
  301. "ldr r14, [r1], #4 \n\t" \
  302. "mov r8, #0 \n\t" \
  303. "umaal r8, r9, r4, r14 \n\t" \
  304. "str r8, [r0], #4 \n\t" \
  305. "umaal r9, r10, r5, r14 \n\t" \
  306. "umaal r10, r11, r6, r14 \n\t" \
  307. "umaal r11, r12, r7, r14 \n\t" \
  308. \
  309. "ldr r14, [r1], #4 \n\t" \
  310. "mov r8, #0 \n\t" \
  311. "umaal r8, r9, r4, r14 \n\t" \
  312. "str r8, [r0], #4 \n\t" \
  313. "umaal r9, r10, r5, r14 \n\t" \
  314. "umaal r10, r11, r6, r14 \n\t" \
  315. "umaal r11, r12, r7, r14 \n\t" \
  316. \
  317. "ldr r14, [r1], #4 \n\t" \
  318. "mov r8, #0 \n\t" \
  319. "umaal r8, r9, r4, r14 \n\t" \
  320. "str r8, [r0], #4 \n\t" \
  321. "umaal r9, r10, r5, r14 \n\t" \
  322. "umaal r10, r11, r6, r14 \n\t" \
  323. "umaal r11, r12, r7, r14 \n\t" \
  324. \
  325. "ldr r14, [r1], #4 \n\t" \
  326. "mov r8, #0 \n\t" \
  327. "umaal r8, r9, r4, r14 \n\t" \
  328. "str r8, [r0], #4 \n\t" \
  329. "umaal r9, r10, r5, r14 \n\t" \
  330. "umaal r10, r11, r6, r14 \n\t" \
  331. "umaal r11, r12, r7, r14 \n\t" \
  332. \
  333. "ldr r14, [r1], #4 \n\t" \
  334. "mov r8, #0 \n\t" \
  335. "umaal r8, r9, r4, r14 \n\t" \
  336. "str r8, [r0], #4 \n\t" \
  337. "umaal r9, r10, r5, r14 \n\t" \
  338. "umaal r10, r11, r6, r14 \n\t" \
  339. "umaal r11, r12, r7, r14 \n\t" \
  340. /* Flush the four high words of pass 1 */ \
  341. "str r9, [r0], #4 \n\t" \
  342. "str r10, [r0], #4 \n\t" \
  343. "str r11, [r0], #4 \n\t" \
  344. "str r12, [r0], #4 \n\t" \
  345. /* Pass 2 setup: r0 back to the 5th word written, r1 back to its first word */ \
  346. "sub r0, #28 \n\t" \
  347. "sub r1, #28 \n\t" \
  348. "ldmia r2!, {r4, r5, r6} \n\t" /* pass 2 uses only three multiplier words (4 + 3 = 7) */ \
  349. /* Pass 2, row 0: r8 is loaded from memory so the stored word is accumulated */ \
  350. "ldr r14, [r1], #4 \n\t" \
  351. "ldr r8, [r0] \n\t" \
  352. "mov r9, #0 \n\t" \
  353. "umaal r8, r9, r4, r14 \n\t" \
  354. "str r8, [r0], #4 \n\t" \
  355. "mov r10, #0 \n\t" \
  356. "umaal r9, r10, r5, r14 \n\t" \
  357. "mov r11, #0 \n\t" \
  358. "umaal r10, r11, r6, r14 \n\t" \
  359. /* Pass 2, rows 1-6: read-modify-write of one stored word per row */ \
  360. "ldr r14, [r1], #4 \n\t" \
  361. "ldr r8, [r0] \n\t" \
  362. "umaal r8, r9, r4, r14 \n\t" \
  363. "str r8, [r0], #4 \n\t" \
  364. "umaal r9, r10, r5, r14 \n\t" \
  365. "umaal r10, r11, r6, r14 \n\t" \
  366. \
  367. "ldr r14, [r1], #4 \n\t" \
  368. "ldr r8, [r0] \n\t" \
  369. "umaal r8, r9, r4, r14 \n\t" \
  370. "str r8, [r0], #4 \n\t" \
  371. "umaal r9, r10, r5, r14 \n\t" \
  372. "umaal r10, r11, r6, r14 \n\t" \
  373. \
  374. "ldr r14, [r1], #4 \n\t" \
  375. "ldr r8, [r0] \n\t" \
  376. "umaal r8, r9, r4, r14 \n\t" \
  377. "str r8, [r0], #4 \n\t" \
  378. "umaal r9, r10, r5, r14 \n\t" \
  379. "umaal r10, r11, r6, r14 \n\t" \
  380. \
  381. "ldr r14, [r1], #4 \n\t" \
  382. "ldr r8, [r0] \n\t" \
  383. "umaal r8, r9, r4, r14 \n\t" \
  384. "str r8, [r0], #4 \n\t" \
  385. "umaal r9, r10, r5, r14 \n\t" \
  386. "umaal r10, r11, r6, r14 \n\t" \
  387. \
  388. "ldr r14, [r1], #4 \n\t" \
  389. "ldr r8, [r0] \n\t" \
  390. "umaal r8, r9, r4, r14 \n\t" \
  391. "str r8, [r0], #4 \n\t" \
  392. "umaal r9, r10, r5, r14 \n\t" \
  393. "umaal r10, r11, r6, r14 \n\t" \
  394. \
  395. "ldr r14, [r1], #4 \n\t" \
  396. "ldr r8, [r0] \n\t" \
  397. "umaal r8, r9, r4, r14 \n\t" \
  398. "str r8, [r0], #4 \n\t" \
  399. "umaal r9, r10, r5, r14 \n\t" \
  400. "umaal r10, r11, r6, r14 \n\t" \
  401. /* Flush the three top words */ \
  402. "str r9, [r0], #4 \n\t" \
  403. "str r10, [r0], #4 \n\t" \
  404. "str r11, [r0], #4 \n\t"
  /*
   * FAST_MULT_ASM_7_TO_8 -- inline-asm string fragment that folds one extra
   * word of each operand into an existing product.
   *
   * Skipped entirely (branch to local label "1", defined outside this macro)
   * when r3 == 7; the branch is taken before the push, so the stack stays
   * balanced on both paths. NOTE(review): r3 is presumably the operand word
   * count -- confirm with the caller.
   *
   * The three "sub ..., #28" rewinds imply the expected entry state: r1 and
   * r2 each 28 bytes past the start of their operand, and r0 28 bytes past
   * the first result word that still needs updating. NOTE(review): this looks
   * designed to follow FAST_MULT_ASM_7 directly -- confirm.
   *
   * Net effect on pointers: r0 += 8, r1 += 4, r2 += 4.
   * Overwrites r4-r12, r14. r3 is used as scratch but saved/restored on the
   * stack, because r5..r12 are all occupied by the running words.
   *
   * "umaal lo, hi, a, b" computes hi:lo = a*b + lo + hi.
   */
  405. #define FAST_MULT_ASM_7_TO_8 \
  406. "cmp r3, #7 \n\t" \
  407. "beq 1f \n\t" \
  408. "push {r3} \n\t" /* r3 is borrowed as scratch in the left-side chain */ \
  409. \
  410. /* r4 = left high: the word r1 currently points at (no post-increment) */ \
  411. "ldr r4, [r1] \n\t" \
  412. \
  413. "sub r0, #28 \n\t" \
  414. "sub r1, #28 \n\t" \
  415. "sub r2, #28 \n\t" \
  416. \
  417. /* Do right side: add r4 * (seven words at r2) onto seven stored result words; */ \
  418. "ldr r14, [r2], #4 \n\t" /* the running words are kept in r5..r12, not stored yet */ \
  419. "mov r5, #0 \n\t" \
  420. "ldr r6, [r0], #4 \n\t" \
  421. "umaal r5, r6, r4, r14 \n\t" \
  422. "ldr r14, [r2], #4 \n\t" \
  423. "ldr r7, [r0], #4 \n\t" \
  424. "umaal r6, r7, r4, r14 \n\t" \
  425. "ldr r14, [r2], #4 \n\t" \
  426. "ldr r8, [r0], #4 \n\t" \
  427. "umaal r7, r8, r4, r14 \n\t" \
  428. "ldr r14, [r2], #4 \n\t" \
  429. "ldr r9, [r0], #4 \n\t" \
  430. "umaal r8, r9, r4, r14 \n\t" \
  431. "ldr r14, [r2], #4 \n\t" \
  432. "ldr r10, [r0], #4 \n\t" \
  433. "umaal r9, r10, r4, r14 \n\t" \
  434. "ldr r14, [r2], #4 \n\t" \
  435. "ldr r11, [r0], #4 \n\t" \
  436. "umaal r10, r11, r4, r14 \n\t" \
  437. "ldr r14, [r2], #4 \n\t" \
  438. "ldr r12, [r0], #4 \n\t" \
  439. "umaal r11, r12, r4, r14 \n\t" \
  440. "sub r0, #28 \n\t" /* rewind r0 to the first of the words just read */ \
  441. \
  442. /* r4 = right high: the eighth word at r2 */ \
  443. "ldr r4, [r2], #4 \n\t" \
  444. \
  445. /* Do left side: add r4 * (eight words at r1), storing each word as it becomes final */ \
  446. "ldr r14, [r1], #4 \n\t" \
  447. "mov r3, #0 \n\t" \
  448. "umaal r3, r5, r4, r14 \n\t" \
  449. "str r3, [r0], #4 \n\t" \
  450. "ldr r14, [r1], #4 \n\t" \
  451. "umaal r5, r6, r4, r14 \n\t" \
  452. "str r5, [r0], #4 \n\t" \
  453. "ldr r14, [r1], #4 \n\t" \
  454. "umaal r6, r7, r4, r14 \n\t" \
  455. "str r6, [r0], #4 \n\t" \
  456. "ldr r14, [r1], #4 \n\t" \
  457. "umaal r7, r8, r4, r14 \n\t" \
  458. "str r7, [r0], #4 \n\t" \
  459. "ldr r14, [r1], #4 \n\t" \
  460. "umaal r8, r9, r4, r14 \n\t" \
  461. "str r8, [r0], #4 \n\t" \
  462. "ldr r14, [r1], #4 \n\t" \
  463. "umaal r9, r10, r4, r14 \n\t" \
  464. "str r9, [r0], #4 \n\t" \
  465. "ldr r14, [r1], #4 \n\t" \
  466. "umaal r10, r11, r4, r14 \n\t" \
  467. "str r10, [r0], #4 \n\t" \
  468. /* Final term: high word times high word, written as two new top words */ \
  469. "ldr r14, [r1], #4 \n\t" \
  470. "umaal r11, r12, r4, r14 \n\t" \
  471. "stmia r0!, {r11, r12} \n\t" \
  472. "pop {r3} \n\t" /* restore the value compared at the top */
  /*
   * FAST_MULT_ASM_8 -- inline-asm string fragment: 8-word x 8-word multiply,
   * done as two passes of four r2-words each.
   *
   * Register use, as read from the instructions below:
   *   r0  destination pointer; leaves advanced by 64 bytes (sixteen words
   *       written in total: twelve by pass 1, then pass 2 rewrites eight of
   *       them and appends four).
   *   r1  pointer to the operand read one word per row; rewound by 32 between
   *       passes, leaves advanced by 32.
   *   r2  pointer to the operand read four words per pass via ldmia; leaves
   *       advanced by 32.
   *   r4-r12, r14 are overwritten.
   *
   * "umaal lo, hi, a, b" computes hi:lo = a*b + lo + hi.
   */
  473. #define FAST_MULT_ASM_8 \
  474. "ldmia r2!, {r4, r5, r6, r7} \n\t" /* pass 1 multiplier words */ \
  475. /* Pass 1, row 0: umull starts the chain; zeroed registers seed the carries */ \
  476. "ldr r14, [r1], #4 \n\t" \
  477. "umull r8, r9, r4, r14 \n\t" \
  478. "str r8, [r0], #4 \n\t" \
  479. "mov r10, #0 \n\t" \
  480. "umaal r9, r10, r5, r14 \n\t" \
  481. "mov r11, #0 \n\t" \
  482. "umaal r10, r11, r6, r14 \n\t" \
  483. "mov r12, #0 \n\t" \
  484. "umaal r11, r12, r7, r14 \n\t" \
  485. /* Pass 1, rows 1-7: r8 cleared, receives the finished low word */ \
  486. "ldr r14, [r1], #4 \n\t" \
  487. "mov r8, #0 \n\t" \
  488. "umaal r8, r9, r4, r14 \n\t" \
  489. "str r8, [r0], #4 \n\t" \
  490. "umaal r9, r10, r5, r14 \n\t" \
  491. "umaal r10, r11, r6, r14 \n\t" \
  492. "umaal r11, r12, r7, r14 \n\t" \
  493. \
  494. "ldr r14, [r1], #4 \n\t" \
  495. "mov r8, #0 \n\t" \
  496. "umaal r8, r9, r4, r14 \n\t" \
  497. "str r8, [r0], #4 \n\t" \
  498. "umaal r9, r10, r5, r14 \n\t" \
  499. "umaal r10, r11, r6, r14 \n\t" \
  500. "umaal r11, r12, r7, r14 \n\t" \
  501. \
  502. "ldr r14, [r1], #4 \n\t" \
  503. "mov r8, #0 \n\t" \
  504. "umaal r8, r9, r4, r14 \n\t" \
  505. "str r8, [r0], #4 \n\t" \
  506. "umaal r9, r10, r5, r14 \n\t" \
  507. "umaal r10, r11, r6, r14 \n\t" \
  508. "umaal r11, r12, r7, r14 \n\t" \
  509. \
  510. "ldr r14, [r1], #4 \n\t" \
  511. "mov r8, #0 \n\t" \
  512. "umaal r8, r9, r4, r14 \n\t" \
  513. "str r8, [r0], #4 \n\t" \
  514. "umaal r9, r10, r5, r14 \n\t" \
  515. "umaal r10, r11, r6, r14 \n\t" \
  516. "umaal r11, r12, r7, r14 \n\t" \
  517. \
  518. "ldr r14, [r1], #4 \n\t" \
  519. "mov r8, #0 \n\t" \
  520. "umaal r8, r9, r4, r14 \n\t" \
  521. "str r8, [r0], #4 \n\t" \
  522. "umaal r9, r10, r5, r14 \n\t" \
  523. "umaal r10, r11, r6, r14 \n\t" \
  524. "umaal r11, r12, r7, r14 \n\t" \
  525. \
  526. "ldr r14, [r1], #4 \n\t" \
  527. "mov r8, #0 \n\t" \
  528. "umaal r8, r9, r4, r14 \n\t" \
  529. "str r8, [r0], #4 \n\t" \
  530. "umaal r9, r10, r5, r14 \n\t" \
  531. "umaal r10, r11, r6, r14 \n\t" \
  532. "umaal r11, r12, r7, r14 \n\t" \
  533. \
  534. "ldr r14, [r1], #4 \n\t" \
  535. "mov r8, #0 \n\t" \
  536. "umaal r8, r9, r4, r14 \n\t" \
  537. "str r8, [r0], #4 \n\t" \
  538. "umaal r9, r10, r5, r14 \n\t" \
  539. "umaal r10, r11, r6, r14 \n\t" \
  540. "umaal r11, r12, r7, r14 \n\t" \
  541. /* Flush the four high words of pass 1 */ \
  542. "str r9, [r0], #4 \n\t" \
  543. "str r10, [r0], #4 \n\t" \
  544. "str r11, [r0], #4 \n\t" \
  545. "str r12, [r0], #4 \n\t" \
  546. /* Pass 2 setup: r0 back to the 5th word written, r1 back to its first word */ \
  547. "sub r0, #32 \n\t" \
  548. "sub r1, #32 \n\t" \
  549. "ldmia r2!, {r4, r5, r6, r7} \n\t" /* pass 2 multiplier words */ \
  550. /* Pass 2, row 0: r8 is loaded from memory so the stored word is accumulated */ \
  551. "ldr r14, [r1], #4 \n\t" \
  552. "ldr r8, [r0] \n\t" \
  553. "mov r9, #0 \n\t" \
  554. "umaal r8, r9, r4, r14 \n\t" \
  555. "str r8, [r0], #4 \n\t" \
  556. "mov r10, #0 \n\t" \
  557. "umaal r9, r10, r5, r14 \n\t" \
  558. "mov r11, #0 \n\t" \
  559. "umaal r10, r11, r6, r14 \n\t" \
  560. "mov r12, #0 \n\t" \
  561. "umaal r11, r12, r7, r14 \n\t" \
  562. /* Pass 2, rows 1-7: read-modify-write of one stored word per row */ \
  563. "ldr r14, [r1], #4 \n\t" \
  564. "ldr r8, [r0] \n\t" \
  565. "umaal r8, r9, r4, r14 \n\t" \
  566. "str r8, [r0], #4 \n\t" \
  567. "umaal r9, r10, r5, r14 \n\t" \
  568. "umaal r10, r11, r6, r14 \n\t" \
  569. "umaal r11, r12, r7, r14 \n\t" \
  570. \
  571. "ldr r14, [r1], #4 \n\t" \
  572. "ldr r8, [r0] \n\t" \
  573. "umaal r8, r9, r4, r14 \n\t" \
  574. "str r8, [r0], #4 \n\t" \
  575. "umaal r9, r10, r5, r14 \n\t" \
  576. "umaal r10, r11, r6, r14 \n\t" \
  577. "umaal r11, r12, r7, r14 \n\t" \
  578. \
  579. "ldr r14, [r1], #4 \n\t" \
  580. "ldr r8, [r0] \n\t" \
  581. "umaal r8, r9, r4, r14 \n\t" \
  582. "str r8, [r0], #4 \n\t" \
  583. "umaal r9, r10, r5, r14 \n\t" \
  584. "umaal r10, r11, r6, r14 \n\t" \
  585. "umaal r11, r12, r7, r14 \n\t" \
  586. \
  587. "ldr r14, [r1], #4 \n\t" \
  588. "ldr r8, [r0] \n\t" \
  589. "umaal r8, r9, r4, r14 \n\t" \
  590. "str r8, [r0], #4 \n\t" \
  591. "umaal r9, r10, r5, r14 \n\t" \
  592. "umaal r10, r11, r6, r14 \n\t" \
  593. "umaal r11, r12, r7, r14 \n\t" \
  594. \
  595. "ldr r14, [r1], #4 \n\t" \
  596. "ldr r8, [r0] \n\t" \
  597. "umaal r8, r9, r4, r14 \n\t" \
  598. "str r8, [r0], #4 \n\t" \
  599. "umaal r9, r10, r5, r14 \n\t" \
  600. "umaal r10, r11, r6, r14 \n\t" \
  601. "umaal r11, r12, r7, r14 \n\t" \
  602. \
  603. "ldr r14, [r1], #4 \n\t" \
  604. "ldr r8, [r0] \n\t" \
  605. "umaal r8, r9, r4, r14 \n\t" \
  606. "str r8, [r0], #4 \n\t" \
  607. "umaal r9, r10, r5, r14 \n\t" \
  608. "umaal r10, r11, r6, r14 \n\t" \
  609. "umaal r11, r12, r7, r14 \n\t" \
  610. \
  611. "ldr r14, [r1], #4 \n\t" \
  612. "ldr r8, [r0] \n\t" \
  613. "umaal r8, r9, r4, r14 \n\t" \
  614. "str r8, [r0], #4 \n\t" \
  615. "umaal r9, r10, r5, r14 \n\t" \
  616. "umaal r10, r11, r6, r14 \n\t" \
  617. "umaal r11, r12, r7, r14 \n\t" \
  618. /* Flush the four top words */ \
  619. "str r9, [r0], #4 \n\t" \
  620. "str r10, [r0], #4 \n\t" \
  621. "str r11, [r0], #4 \n\t" \
  622. "str r12, [r0], #4 \n\t"
  /*
   * FAST_SQUARE_ASM_5 -- inline-asm string fragment: square of a 5-word value.
   *
   * Register use, as read from the instructions below:
   *   r0  destination pointer; ten words are stored (five stmia pairs), so it
   *       leaves advanced by 40 bytes.
   *   r1  source pointer; the five input words are loaded into
   *       r9,r10,r11,r12,r14 (called a0..a4 in the comments below). r1 is
   *       then pushed and reused as scratch; the final pop restores the
   *       post-load value (entry + 20).
   *   r2  saved and restored around its use as scratch.
   *   r3-r12, r14 and the condition flags are overwritten.
   *
   * Method: each cross product a_i*a_j (i != j) is accumulated once with
   * umaal, the cross words are doubled with an adds/adcs chain, and the
   * squares a_i*a_i are added in while the words are stored. umull/umaal do
   * not touch the flags, which is why they can sit between adcs links; where
   * a carry must survive a flag-setting "adds", it is parked in a register
   * ("Store carry") and later either shifted back into C with lsrs or fed to
   * umaal as an addend.
   *
   * "umaal lo, hi, a, b" computes hi:lo = a*b + lo + hi.
   */
  623. #define FAST_SQUARE_ASM_5 \
  624. "ldmia r1!, {r9,r10,r11,r12,r14} \n\t" /* a0..a4 */ \
  625. "push {r1, r2} \n\t" \
  626. /* Cross products with a0: a1*a0, a2*a0, a3*a0, a4*a0 */ \
  627. "umull r1, r2, r10, r9 \n\t" \
  628. "mov r3, #0 \n\t" \
  629. "umaal r2, r3, r11, r9 \n\t" \
  630. "mov r4, #0 \n\t" \
  631. "umaal r3, r4, r12, r9 \n\t" \
  632. "mov r5, #0 \n\t" \
  633. "umaal r4, r5, r14, r9 \n\t" \
  634. /* Cross products a2*a1 and a3*a1, then double the four lowest cross words */ \
  635. "mov r6, #0 \n\t" \
  636. "umaal r6, r3, r11, r10 \n\t" \
  637. "umaal r3, r4, r12, r10 \n\t" \
  638. "adds r1, r1, r1 \n\t" \
  639. "adcs r2, r2, r2 \n\t" \
  640. "adcs r6, r6, r6 \n\t" \
  641. "adcs r3, r3, r3 \n\t" \
  642. /* a0*a0 supplies the two lowest result words */ \
  643. "umull r7, r8, r9, r9 \n\t" \
  644. /* Store carry in r9 (carry out of the doubling chain; a0 is no longer needed) */ \
  645. "mov r9, #0 \n\t" \
  646. "adc r9, r9, #0 \n\t" \
  647. "adds r8, r8, r1 \n\t" \
  648. "stmia r0!, {r7,r8} \n\t" \
  649. /* a1*a1 plus the next two doubled cross words */ \
  650. "umull r7, r8, r10, r10 \n\t" \
  651. "adcs r7, r7, r2 \n\t" \
  652. "adcs r8, r8, r6 \n\t" \
  653. "stmia r0!, {r7,r8} \n\t" \
  654. /* Last cross product involving a1 (a4*a1) */ \
  655. "umaal r4, r5, r14, r10 \n\t" \
  656. /* Store carry in r10 (carry out of the square-addition chain; a1 is no longer needed) */ \
  657. "mov r10, #0 \n\t" \
  658. "adc r10, r10, #0 \n\t" \
  659. /* Cross products with a2: a3*a2, a4*a2 */ \
  660. "mov r1, #0 \n\t" \
  661. "umaal r1, r4, r12, r11 \n\t" \
  662. "umaal r4, r5, r14, r11 \n\t" \
  663. /* Cross product a4*a3 */ \
  664. "mov r2, #0 \n\t" \
  665. "umaal r2, r5, r14, r12 \n\t" \
  666. /* Load carry from r9: lsrs moves bit 0 into C to resume the doubling chain */ \
  667. "lsrs r9, #1 \n\t" \
  668. "adcs r1, r1, r1 \n\t" \
  669. "adcs r4, r4, r4 \n\t" \
  670. "adcs r2, r2, r2 \n\t" \
  671. "adcs r5, r5, r5 \n\t" \
  672. /* r9 is 0 now (it held only 0 or 1 before the shift); capture the final doubling carry */ \
  673. "adc r9, r9, #0 \n\t" \
  674. \
  675. /* Use carry from r10: umaal adds it in as the "hi" addend alongside a2*a2 */ \
  676. "umaal r3, r10, r11, r11 \n\t" \
  677. "adds r10, r10, r1 \n\t" \
  678. "stmia r0!, {r3,r10} \n\t" \
  679. /* a3*a3 plus doubled cross words */ \
  680. "umull r6, r10, r12, r12 \n\t" \
  681. "adcs r6, r6, r4 \n\t" \
  682. "adcs r10, r10, r2 \n\t" \
  683. "stmia r0!, {r6,r10} \n\t" \
  684. /* a4*a4 plus the last doubled word and the final doubling carry (r9) */ \
  685. "umull r6, r10, r14, r14 \n\t" \
  686. "adcs r6, r6, r5 \n\t" \
  687. "adcs r10, r10, r9 \n\t" \
  688. "stmia r0!, {r6,r10} \n\t" \
  689. "pop {r1, r2} \n\t" /* r1 = source pointer + 20, r2 = entry value */
  /*
   * FAST_SQUARE_ASM_5_TO_6 -- inline-asm string fragment that extends an
   * existing square by one more input word.
   *
   * Skipped entirely (branch to local label "1", defined outside this macro)
   * when r2 == 5. NOTE(review): r2 is presumably the operand word count --
   * confirm with the caller.
   *
   * The two "sub ..., #20" rewinds imply the expected entry state: r1 is 20
   * bytes past the start of the input, and r0 is 20 bytes past the first
   * result word that still needs updating. NOTE(review): this looks designed
   * to follow FAST_SQUARE_ASM_5 directly -- confirm.
   *
   * With a0..a4 the first five input words and a5 the sixth (r14), the code
   * forms 2 * a5 * (a0..a4), adds the five stored result words it overlaps,
   * adds a5*a5 on top, and stores seven words.
   *
   * Net effect on pointers: r0 += 8, r1 += 4.
   * Overwrites r3-r9, r12, r14 and the condition flags.
   *
   * "umaal lo, hi, a, b" computes hi:lo = a*b + lo + hi.
   */
  690. #define FAST_SQUARE_ASM_5_TO_6 \
  691. "cmp r2, #5 \n\t" \
  692. "beq 1f \n\t" \
  693. \
  694. "sub r0, #20 \n\t" \
  695. "sub r1, #20 \n\t" \
  696. \
  697. /* Do off-center multiplication: a5 (r14) times a0..a4, product in r3..r8. */ \
  698. "ldmia r1!, {r5,r6,r7,r8,r9,r14} \n\t" /* Each r5..r9 is zeroed right after its input word is consumed. */ \
  699. "umull r3, r4, r5, r14 \n\t" \
  700. "mov r5, #0 \n\t" \
  701. "umaal r4, r5, r6, r14 \n\t" \
  702. "mov r6, #0 \n\t" \
  703. "umaal r5, r6, r7, r14 \n\t" \
  704. "mov r7, #0 \n\t" \
  705. "umaal r6, r7, r8, r14 \n\t" \
  706. "mov r8, #0 \n\t" \
  707. "umaal r7, r8, r9, r14 \n\t" \
  708. \
  709. /* Multiply by 2: r3..r8 doubled in place, carry-out collected in r9 */ \
  710. "mov r9, #0 \n\t" \
  711. "adds r3, r3, r3 \n\t" \
  712. "adcs r4, r4, r4 \n\t" \
  713. "adcs r5, r5, r5 \n\t" \
  714. "adcs r6, r6, r6 \n\t" \
  715. "adcs r7, r7, r7 \n\t" \
  716. "adcs r8, r8, r8 \n\t" \
  717. "adcs r9, r9, #0 \n\t" \
  718. \
  719. /* Add into previous: five stored result words are added to r3..r7, carry ripples into r8, r9 */ \
  720. "ldr r12, [r0], #4 \n\t" \
  721. "adds r3, r3, r12 \n\t" \
  722. "ldr r12, [r0], #4 \n\t" \
  723. "adcs r4, r4, r12 \n\t" \
  724. "ldr r12, [r0], #4 \n\t" \
  725. "adcs r5, r5, r12 \n\t" \
  726. "ldr r12, [r0], #4 \n\t" \
  727. "adcs r6, r6, r12 \n\t" \
  728. "ldr r12, [r0], #4 \n\t" \
  729. "adcs r7, r7, r12 \n\t" \
  730. "adcs r8, r8, #0 \n\t" \
  731. "adcs r9, r9, #0 \n\t" \
  732. "sub r0, #20 \n\t" /* rewind r0 to the first word just read */ \
  733. \
  734. /* Perform center multiplication: r9:r8 += a5*a5, then write all seven words */ \
  735. "umlal r8, r9, r14, r14 \n\t" \
  736. "stmia r0!, {r3,r4,r5,r6,r7,r8,r9} \n\t"
  /*
   * FAST_SQUARE_ASM_6 -- inline-asm string fragment: square of a 6-word value.
   *
   * Register use, as read from the instructions below:
   *   r0  destination pointer; twelve words are stored (six stmia pairs), so
   *       it leaves advanced by 48 bytes.
   *   r1  source pointer; the six input words are loaded into
   *       r8,r9,r10,r11,r12,r14 (called a0..a5 in the comments below). r1 is
   *       then pushed and reused as scratch; the final pop restores the
   *       post-load value (entry + 24).
   *   r2  saved and restored around its use as scratch.
   *   r3-r12, r14 and the condition flags are overwritten. Up to four stack
   *   words are in use at once (r1, r2, plus a temporary spill of r4, r5).
   *
   * Method: each cross product a_i*a_j (i != j) is accumulated once with
   * umaal, the cross words are doubled with an adds/adcs chain, and the
   * squares a_i*a_i are added in while the words are stored. umull/umaal do
   * not touch the flags, which is why they can sit between adcs links; where
   * a carry must survive a flag-setting "adds", it is parked in a register
   * ("Store carry") and later either shifted back into C with lsrs or fed to
   * umaal as an addend.
   *
   * "umaal lo, hi, a, b" computes hi:lo = a*b + lo + hi.
   */
  737. #define FAST_SQUARE_ASM_6 \
  738. "ldmia r1!, {r8,r9,r10,r11,r12,r14} \n\t" /* a0..a5 */ \
  739. "push {r1, r2} \n\t" \
  740. /* Cross products with a0: a1*a0 .. a5*a0 */ \
  741. "umull r1, r2, r9, r8 \n\t" \
  742. "mov r3, #0 \n\t" \
  743. "umaal r2, r3, r10, r8 \n\t" \
  744. "mov r4, #0 \n\t" \
  745. "umaal r3, r4, r11, r8 \n\t" \
  746. "mov r5, #0 \n\t" \
  747. "umaal r4, r5, r12, r8 \n\t" \
  748. "mov r6, #0 \n\t" \
  749. "umaal r5, r6, r14, r8 \n\t" \
  750. /* Cross products a2*a1, a3*a1, a4*a1; then double the four lowest cross words */ \
  751. "mov r7, #0 \n\t" \
  752. "umaal r7, r3, r10, r9 \n\t" \
  753. "umaal r3, r4, r11, r9 \n\t" \
  754. "umaal r4, r5, r12, r9 \n\t" \
  755. "push {r4, r5} \n\t" /* spill: r4, r5 are needed as the umull destination below */ \
  756. "adds r1, r1, r1 \n\t" \
  757. "adcs r2, r2, r2 \n\t" \
  758. "adcs r7, r7, r7 \n\t" \
  759. "adcs r3, r3, r3 \n\t" \
  760. /* a0*a0 supplies the two lowest result words */ \
  761. "umull r4, r5, r8, r8 \n\t" \
  762. /* Store carry in r8 (carry out of the doubling chain; a0 is no longer needed) */ \
  763. "mov r8, #0 \n\t" \
  764. "adc r8, r8, #0 \n\t" \
  765. "adds r5, r5, r1 \n\t" \
  766. "stmia r0!, {r4,r5} \n\t" \
  767. /* a1*a1 plus the next two doubled cross words */ \
  768. "umull r4, r5, r9, r9 \n\t" \
  769. "adcs r4, r4, r2 \n\t" \
  770. "adcs r5, r5, r7 \n\t" \
  771. "stmia r0!, {r4,r5} \n\t" \
  772. /* Reload the spilled partial sums (pop leaves the flags alone); last a1 cross product a5*a1 */ \
  773. "pop {r4, r5} \n\t" \
  774. "umaal r5, r6, r14, r9 \n\t" \
  775. /* Store carry in r9 (carry out of the square-addition chain; a1 is no longer needed) */ \
  776. "mov r9, #0 \n\t" \
  777. "adc r9, r9, #0 \n\t" \
  778. /* Cross products with a2: a3*a2, a4*a2, a5*a2 */ \
  779. "mov r1, #0 \n\t" \
  780. "umaal r1, r4, r11, r10 \n\t" \
  781. "umaal r4, r5, r12, r10 \n\t" \
  782. "umaal r5, r6, r14, r10 \n\t" \
  783. /* Cross products with a3: a4*a3, a5*a3 */ \
  784. "mov r2, #0 \n\t" \
  785. "umaal r2, r5, r12, r11 \n\t" \
  786. "umaal r5, r6, r14, r11 \n\t" \
  787. /* Cross product a5*a4 */ \
  788. "mov r7, #0 \n\t" \
  789. "umaal r7, r6, r14, r12 \n\t" \
  790. \
  791. /* Load carry from r8: lsrs moves bit 0 into C (leaving r8 = 0) to resume the doubling chain */ \
  792. "lsrs r8, #1 \n\t" \
  793. "adcs r1, r1, r1 \n\t" \
  794. "adcs r4, r4, r4 \n\t" \
  795. "adcs r2, r2, r2 \n\t" \
  796. "adcs r5, r5, r5 \n\t" \
  797. "adcs r7, r7, r7 \n\t" \
  798. "adcs r6, r6, r6 \n\t" \
  799. "adc r8, r8, #0 \n\t" /* r8 = final carry out of the doubling chain */ \
  800. \
  801. /* Use carry from r9: umaal adds it in as the "hi" addend alongside a2*a2 */ \
  802. "umaal r3, r9, r10, r10 \n\t" \
  803. "adds r9, r9, r1 \n\t" \
  804. "stmia r0!, {r3,r9} \n\t" \
  805. /* a3*a3 plus doubled cross words */ \
  806. "umull r9, r10, r11, r11 \n\t" \
  807. "adcs r9, r9, r4 \n\t" \
  808. "adcs r10, r10, r2 \n\t" \
  809. "stmia r0!, {r9,r10} \n\t" \
  810. /* a4*a4 plus doubled cross words */ \
  811. "umull r9, r10, r12, r12 \n\t" \
  812. "adcs r9, r9, r5 \n\t" \
  813. "adcs r10, r10, r7 \n\t" \
  814. "stmia r0!, {r9,r10} \n\t" \
  815. /* a5*a5 plus the last doubled word and the final doubling carry (r8) */ \
  816. "umull r9, r10, r14, r14 \n\t" \
  817. "adcs r9, r9, r6 \n\t" \
  818. "adcs r10, r10, r8 \n\t" \
  819. "stmia r0!, {r9,r10} \n\t" \
  820. "pop {r1, r2} \n\t" /* r1 = source pointer + 24, r2 = entry value */
  /*
   * FAST_SQUARE_ASM_6_TO_7 -- inline-asm string fragment that extends an
   * existing square by one more input word.
   *
   * Skipped entirely (branch to local label "1", defined outside this macro)
   * when r2 == 6. NOTE(review): r2 is presumably the operand word count --
   * confirm with the caller.
   *
   * The two "sub ..., #24" rewinds imply the expected entry state: r1 is 24
   * bytes past the start of the input, and r0 is 24 bytes past the first
   * result word that still needs updating. NOTE(review): this looks designed
   * to follow FAST_SQUARE_ASM_6 directly -- confirm.
   *
   * With a0..a5 the first six input words and a6 the seventh (r14), the code
   * forms 2 * a6 * (a0..a5), adds the six stored result words it overlaps,
   * adds a6*a6 on top, and stores eight words.
   *
   * Net effect on pointers: r0 += 8, r1 += 4.
   * Overwrites r3-r10, r12, r14 and the condition flags.
   *
   * "umaal lo, hi, a, b" computes hi:lo = a*b + lo + hi.
   */
  821. #define FAST_SQUARE_ASM_6_TO_7 \
  822. "cmp r2, #6 \n\t" \
  823. "beq 1f \n\t" \
  824. \
  825. "sub r0, #24 \n\t" \
  826. "sub r1, #24 \n\t" \
  827. \
  828. /* Do off-center multiplication: a6 (r14) times a0..a5, product in r3..r9. */ \
  829. "ldmia r1!, {r5,r6,r7,r8,r9,r10,r14} \n\t" /* Each r5..r9 is zeroed right after its input word is consumed. */ \
  830. "umull r3, r4, r5, r14 \n\t" \
  831. "mov r5, #0 \n\t" \
  832. "umaal r4, r5, r6, r14 \n\t" \
  833. "mov r6, #0 \n\t" \
  834. "umaal r5, r6, r7, r14 \n\t" \
  835. "mov r7, #0 \n\t" \
  836. "umaal r6, r7, r8, r14 \n\t" \
  837. "mov r8, #0 \n\t" \
  838. "umaal r7, r8, r9, r14 \n\t" \
  839. "mov r9, #0 \n\t" \
  840. "umaal r8, r9, r10, r14 \n\t" \
  841. \
  842. /* Multiply by 2: r3..r9 doubled in place, carry-out collected in r10 */ \
  843. "mov r10, #0 \n\t" \
  844. "adds r3, r3, r3 \n\t" \
  845. "adcs r4, r4, r4 \n\t" \
  846. "adcs r5, r5, r5 \n\t" \
  847. "adcs r6, r6, r6 \n\t" \
  848. "adcs r7, r7, r7 \n\t" \
  849. "adcs r8, r8, r8 \n\t" \
  850. "adcs r9, r9, r9 \n\t" \
  851. "adcs r10, r10, #0 \n\t" \
  852. \
  853. /* Add into previous: six stored result words are added to r3..r8, carry ripples into r9, r10 */ \
  854. "ldr r12, [r0], #4 \n\t" \
  855. "adds r3, r3, r12 \n\t" \
  856. "ldr r12, [r0], #4 \n\t" \
  857. "adcs r4, r4, r12 \n\t" \
  858. "ldr r12, [r0], #4 \n\t" \
  859. "adcs r5, r5, r12 \n\t" \
  860. "ldr r12, [r0], #4 \n\t" \
  861. "adcs r6, r6, r12 \n\t" \
  862. "ldr r12, [r0], #4 \n\t" \
  863. "adcs r7, r7, r12 \n\t" \
  864. "ldr r12, [r0], #4 \n\t" \
  865. "adcs r8, r8, r12 \n\t" \
  866. "adcs r9, r9, #0 \n\t" \
  867. "adcs r10, r10, #0 \n\t" \
  868. "sub r0, #24 \n\t" /* rewind r0 to the first word just read */ \
  869. \
  870. /* Perform center multiplication: r10:r9 += a6*a6, then write all eight words */ \
  871. "umlal r9, r10, r14, r14 \n\t" \
  872. "stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10} \n\t"
  873. #define FAST_SQUARE_ASM_7 /* Squares the 7 words a0..a6 read via r1 and writes the 14-word square via r0. */ \
  874. "ldmia r1!, {r9,r10,r11,r12} \n\t" /* r9..r12 = a0..a3 */ \
  875. "push {r2} \n\t" /* r2 is used as scratch below; restored by the final pop */ \
  876. /* Cross products for columns 1..3; each umaal adds one product plus both of its accumulator words */ \
  877. "umull r14, r2, r10, r9 \n\t" /* r2:r14 = a1*a0 */ \
  878. "mov r3, #0 \n\t" \
  879. "umaal r2, r3, r11, r9 \n\t" /* r3:r2 = a2*a0 + r2 */ \
  880. "mov r4, #0 \n\t" \
  881. "umaal r3, r4, r12, r9 \n\t" /* r4:r3 = a3*a0 + r3 */ \
  882. \
  883. "mov r5, #0 \n\t" \
  884. "umaal r5, r3, r11, r10 \n\t" /* r3:r5 = a2*a1 + r3 */ \
  885. "adds r14, r14, r14 \n\t" /* double the cross terms of columns 1..3 (r14, r2, r5) */ \
  886. "adcs r2, r2, r2 \n\t" \
  887. "adcs r5, r5, r5 \n\t" \
  888. /* Store carry in r7 (carry out of the doubling; the mov must leave the flags untouched) */ \
  889. "mov r7, #0 \n\t" \
  890. "adc r7, r7, #0 \n\t" \
  891. /* Add the squares: the square of word i lands on columns 2i and 2i+1 */ \
  892. "umull r6, r8, r9, r9 \n\t" /* r8:r6 = a0^2 */ \
  893. "adds r8, r8, r14 \n\t" \
  894. "stmia r0!, {r6,r8} \n\t" /* result words 0..1 */ \
  895. \
  896. "umull r6, r8, r10, r10 \n\t" /* r8:r6 = a1^2 */ \
  897. "adcs r6, r6, r2 \n\t" \
  898. "adcs r8, r8, r5 \n\t" \
  899. "stmia r0!, {r6,r8} \n\t" /* result words 2..3 */ \
  900. /* Store carry in r8 (carry of the square-accumulation chain) */ \
  901. "mov r8, #0 \n\t" \
  902. "adc r8, r8, #0 \n\t" \
  903. \
  904. "ldmia r1!, {r2, r6, r14} \n\t" /* r2, r6, r14 = a4, a5, a6 */ \
  905. "push {r1} \n\t" /* r1 is reused as a carry word; restored by the final pop */ \
  906. "umaal r3, r4, r2, r9 \n\t" /* row a0: a0 times a4..a6 */ \
  907. "mov r5, #0 \n\t" \
  908. "umaal r4, r5, r6, r9 \n\t" \
  909. "mov r1, #0 \n\t" \
  910. "umaal r5, r1, r14, r9 \n\t" \
  911. /* row a1: a1 times a3..a6 (a0 is no longer needed, so r9 becomes an accumulator) */ \
  912. "mov r9, #0 \n\t" \
  913. "umaal r3, r9, r12, r10 \n\t" \
  914. "umaal r9, r4, r2, r10 \n\t" \
  915. "umaal r4, r5, r6, r10 \n\t" \
  916. "umaal r5, r1, r14, r10 \n\t" \
  917. /* row a2: a2 times a3..a6 (a1 is no longer needed, so r10 becomes an accumulator) */ \
  918. "mov r10, #0 \n\t" \
  919. "umaal r10, r9, r12, r11 \n\t" \
  920. "umaal r9, r4, r2, r11 \n\t" \
  921. "umaal r4, r5, r6, r11 \n\t" \
  922. "umaal r5, r1, r14, r11 \n\t" \
  923. \
  924. /* Load carry from r7 (lsrs shifts bit 0 into the carry flag), then double columns 4..6 (r3, r10, r9) */ \
  925. "lsrs r7, #1 \n\t" \
  926. "adcs r3, r3, r3 \n\t" \
  927. "adcs r10, r10, r10 \n\t" \
  928. "adcs r9, r9, r9 \n\t" \
  929. /* Store carry back in r7 */ \
  930. "adc r7, r7, #0 \n\t" \
  931. \
  932. /* Use carry from r8: umaal adds it together with the square */ \
  933. "umaal r3, r8, r11, r11 \n\t" /* r8:r3 = a2^2 + r3 + r8 */ \
  934. "adds r8, r8, r10 \n\t" \
  935. "stmia r0!, {r3,r8} \n\t" /* result words 4..5 */ \
  936. /* Store carry back in r8 */ \
  937. "mov r8, #0 \n\t" \
  938. "adc r8, r8, #0 \n\t" \
  939. /* rows a3, a4 and a5: the remaining cross products (columns 7..12) */ \
  940. "mov r3, #0 \n\t" \
  941. "umaal r3, r4, r2, r12 \n\t" /* row a3: a3 times a4..a6 */ \
  942. "umaal r4, r5, r6, r12 \n\t" \
  943. "umaal r5, r1, r14, r12 \n\t" \
  944. \
  945. "mov r10, #0 \n\t" \
  946. "umaal r10, r5, r6, r2 \n\t" /* row a4: a4 times a5..a6 */ \
  947. "umaal r5, r1, r14, r2 \n\t" \
  948. \
  949. "mov r11, #0 \n\t" \
  950. "umaal r11, r1, r14, r6 \n\t" /* row a5: a5 times a6 */ \
  951. \
  952. /* Load carry from r7, double columns 7..12 (r3, r4, r10, r5, r11, r1); the carry out goes back into r7 */ \
  953. "lsrs r7, #1 \n\t" \
  954. "adcs r3, r3, r3 \n\t" \
  955. "adcs r4, r4, r4 \n\t" \
  956. "adcs r10, r10, r10 \n\t" \
  957. "adcs r5, r5, r5 \n\t" \
  958. "adcs r11, r11, r11 \n\t" \
  959. "adcs r1, r1, r1 \n\t" \
  960. "adc r7, r7, #0 \n\t" \
  961. \
  962. /* Use carry from r8; r9 still holds the doubled column 6 */ \
  963. "umaal r8, r9, r12, r12 \n\t" /* r9:r8 = a3^2 + r8 + r9 */ \
  964. "adds r9, r9, r3 \n\t" \
  965. "stmia r0!, {r8,r9} \n\t" /* result words 6..7 */ \
  966. \
  967. "umull r8, r9, r2, r2 \n\t" /* r9:r8 = a4^2 */ \
  968. "adcs r8, r8, r4 \n\t" \
  969. "adcs r9, r9, r10 \n\t" \
  970. "stmia r0!, {r8,r9} \n\t" /* result words 8..9 */ \
  971. \
  972. "umull r8, r9, r6, r6 \n\t" /* r9:r8 = a5^2 */ \
  973. "adcs r8, r8, r5 \n\t" \
  974. "adcs r9, r9, r11 \n\t" \
  975. "stmia r0!, {r8,r9} \n\t" /* result words 10..11 */ \
  976. \
  977. "umull r8, r9, r14, r14 \n\t" /* r9:r8 = a6^2 */ \
  978. "adcs r8, r8, r1 \n\t" \
  979. "adcs r9, r9, r7 \n\t" \
  980. "stmia r0!, {r8,r9} \n\t" /* result words 12..13 */ \
  981. "pop {r1, r2} \n\t" /* restore the r1 and r2 values pushed above */
  982. #define FAST_SQUARE_ASM_7_TO_8 /* Extends a 7-word square to 8 words by adding 2*a7*(a0..a6) and a7^2; skipped when r2 == 7. */ \
  983. "cmp r2, #7 \n\t" /* r2 presumably holds the word count - confirm at the call site */ \
  984. "beq 1f \n\t" /* local label 1 is defined outside this macro */ \
  985. \
  986. "sub r0, #28 \n\t" /* rewind r0 by 7 words (to result word 7 if this directly follows FAST_SQUARE_ASM_7) */ \
  987. "sub r1, #28 \n\t" /* rewind r1 by 7 words (back to a0 under the same assumption) */ \
  988. \
  989. /* Do off-center multiplication: a7 times each of the 7 lower words */ \
  990. "ldmia r1!, {r5,r6,r7,r8,r9,r10,r11,r14} \n\t" /* r5..r11 = 7 lower words, r14 = 8th word (a7) */ \
  991. "umull r3, r4, r5, r14 \n\t" /* r4:r3 = a0*a7 */ \
  992. "mov r5, #0 \n\t" /* each operand register is cleared after use and becomes the next high word */ \
  993. "umaal r4, r5, r6, r14 \n\t" \
  994. "mov r6, #0 \n\t" \
  995. "umaal r5, r6, r7, r14 \n\t" \
  996. "mov r7, #0 \n\t" \
  997. "umaal r6, r7, r8, r14 \n\t" \
  998. "mov r8, #0 \n\t" \
  999. "umaal r7, r8, r9, r14 \n\t" \
  1000. "mov r9, #0 \n\t" \
  1001. "umaal r8, r9, r10, r14 \n\t" \
  1002. "mov r10, #0 \n\t" \
  1003. "umaal r9, r10, r11, r14 \n\t" /* the 8-word product is now in r3..r10, low word first */ \
  1004. \
  1005. /* Multiply by 2 (r11 collects the bit shifted out of r10) */ \
  1006. "mov r11, #0 \n\t" \
  1007. "adds r3, r3, r3 \n\t" \
  1008. "adcs r4, r4, r4 \n\t" \
  1009. "adcs r5, r5, r5 \n\t" \
  1010. "adcs r6, r6, r6 \n\t" \
  1011. "adcs r7, r7, r7 \n\t" \
  1012. "adcs r8, r8, r8 \n\t" \
  1013. "adcs r9, r9, r9 \n\t" \
  1014. "adcs r10, r10, r10 \n\t" \
  1015. "adcs r11, r11, #0 \n\t" \
  1016. \
  1017. /* Add into previous: the 7 words read back through r0, one per ldr */ \
  1018. "ldr r12, [r0], #4 \n\t" \
  1019. "adds r3, r3, r12 \n\t" \
  1020. "ldr r12, [r0], #4 \n\t" \
  1021. "adcs r4, r4, r12 \n\t" \
  1022. "ldr r12, [r0], #4 \n\t" \
  1023. "adcs r5, r5, r12 \n\t" \
  1024. "ldr r12, [r0], #4 \n\t" \
  1025. "adcs r6, r6, r12 \n\t" \
  1026. "ldr r12, [r0], #4 \n\t" \
  1027. "adcs r7, r7, r12 \n\t" \
  1028. "ldr r12, [r0], #4 \n\t" \
  1029. "adcs r8, r8, r12 \n\t" \
  1030. "ldr r12, [r0], #4 \n\t" \
  1031. "adcs r9, r9, r12 \n\t" \
  1032. "adcs r10, r10, #0 \n\t" /* propagate the carry into the two top words */ \
  1033. "adcs r11, r11, #0 \n\t" \
  1034. "sub r0, #28 \n\t" /* undo the 7 post-increments so the store below overwrites the words just read */ \
  1035. \
  1036. /* Perform center multiplication: r11:r10 += a7^2 */ \
  1037. "umlal r10, r11, r14, r14 \n\t" \
  1038. "stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10,r11} \n\t" /* writes 9 words: the 7 updated ones plus 2 new top words */
  1039. #define FAST_SQUARE_ASM_8 /* Squares the 8 words a0..a7 read via r1 and writes the 16-word square via r0. */ \
  1040. "ldmia r1!, {r10,r11,r12,r14} \n\t" /* r10, r11, r12, r14 = a0..a3 */ \
  1041. "push {r2} \n\t" /* r2 is used as scratch below; restored by the final pop */ \
  1042. /* Cross products for columns 1..3; each umaal adds one product plus both of its accumulator words */ \
  1043. "umull r2, r3, r11, r10 \n\t" /* r3:r2 = a1*a0 */ \
  1044. "mov r4, #0 \n\t" \
  1045. "umaal r3, r4, r12, r10 \n\t" /* r4:r3 = a2*a0 + r3 */ \
  1046. "mov r5, #0 \n\t" \
  1047. "umaal r4, r5, r14, r10 \n\t" /* r5:r4 = a3*a0 + r4 */ \
  1048. \
  1049. "mov r6, #0 \n\t" \
  1050. "umaal r6, r4, r12, r11 \n\t" /* r4:r6 = a2*a1 + r4 */ \
  1051. "adds r2, r2, r2 \n\t" /* double the cross terms of columns 1..3 (r2, r3, r6) */ \
  1052. "adcs r3, r3, r3 \n\t" \
  1053. "adcs r6, r6, r6 \n\t" \
  1054. /* Store carry in r7 (carry out of the doubling; the mov must leave the flags untouched) */ \
  1055. "mov r7, #0 \n\t" \
  1056. "adc r7, r7, #0 \n\t" \
  1057. /* Add the squares: the square of word i lands on columns 2i and 2i+1 */ \
  1058. "umull r8, r9, r10, r10 \n\t" /* r9:r8 = a0^2 */ \
  1059. "adds r9, r9, r2 \n\t" \
  1060. "stmia r0!, {r8,r9} \n\t" /* result words 0..1 */ \
  1061. \
  1062. "umull r8, r9, r11, r11 \n\t" /* r9:r8 = a1^2 */ \
  1063. "adcs r8, r8, r3 \n\t" \
  1064. "adcs r9, r9, r6 \n\t" \
  1065. "stmia r0!, {r8,r9} \n\t" /* result words 2..3 */ \
  1066. /* Store carry in r8 (carry of the square-accumulation chain) */ \
  1067. "mov r8, #0 \n\t" \
  1068. "adc r8, r8, #0 \n\t" \
  1069. \
  1070. "ldmia r1!, {r2, r3} \n\t" /* r2, r3 = a4, a5 */ \
  1071. "push {r1} \n\t" /* save the advanced pointer; r1 is scratch until the pop below */ \
  1072. "umaal r4, r5, r2, r10 \n\t" /* row a0: a0 times a4..a5 */ \
  1073. "mov r6, #0 \n\t" \
  1074. "umaal r5, r6, r3, r10 \n\t" \
  1075. \
  1076. "mov r9, #0 \n\t" \
  1077. "umaal r9, r4, r14, r11 \n\t" /* row a1: a1 times a3..a4; r9 = finished column 4 */ \
  1078. "umaal r4, r5, r2, r11 \n\t" \
  1079. \
  1080. "mov r1, #0 \n\t" \
  1081. "umaal r1, r4, r14, r12 \n\t" /* row a2: a2*a3; r1 = finished column 5 */ \
  1082. \
  1083. /* Load carry from r7 (lsrs shifts bit 0 into the carry flag), then double columns 4..5 (r9, r1) */ \
  1084. "lsrs r7, #1 \n\t" \
  1085. "adcs r9, r9, r9 \n\t" \
  1086. "adcs r1, r1, r1 \n\t" \
  1087. /* Store carry back in r7 */ \
  1088. "adc r7, r7, #0 \n\t" \
  1089. \
  1090. /* Use carry from r8: umaal adds it together with the square */ \
  1091. "umaal r8, r9, r12, r12 \n\t" /* r9:r8 = a2^2 + r8 + r9 */ \
  1092. "adds r9, r9, r1 \n\t" \
  1093. "stmia r0!, {r8,r9} \n\t" /* result words 4..5 */ \
  1094. /* Store carry back in r8 */ \
  1095. "mov r8, #0 \n\t" \
  1096. "adc r8, r8, #0 \n\t" \
  1097. \
  1098. "pop {r1} \n\t" \
  1099. /* TODO could fix up r1 value on stack here */ \
  1100. /* and leave the value on the stack (rather */ \
  1101. /* than popping) if supporting curves > 256 bits */ \
  1102. "ldr r9, [r1], #4 \n\t" /* r9 = a6 */ \
  1103. "ldr r1, [r1] \n\t" /* r1 = a7; the pointer value is overwritten here */ \
  1104. \
  1105. "push {r7} \n\t" /* park the doubling carry on the stack; r7 is needed as an accumulator */ \
  1106. "umaal r5, r6, r9, r10 \n\t" /* row a0 continued: a0 times a6..a7 */ \
  1107. "mov r7, #0 \n\t" \
  1108. "umaal r6, r7, r1, r10 \n\t" \
  1109. /* Carry now stored in r10 (a0 is not read again after this point) */ \
  1110. "pop {r10} \n\t" \
  1111. \
  1112. "umaal r4, r5, r3, r11 \n\t" /* row a1 continued: a1 times a5..a7 */ \
  1113. "umaal r5, r6, r9, r11 \n\t" \
  1114. "umaal r6, r7, r1, r11 \n\t" \
  1115. \
  1116. "mov r11, #0 \n\t" \
  1117. "umaal r11, r4, r2, r12 \n\t" /* row a2 continued: a2 times a4..a7; r11 = finished column 6 */ \
  1118. "umaal r4, r5, r3, r12 \n\t" \
  1119. "umaal r5, r6, r9, r12 \n\t" \
  1120. "umaal r6, r7, r1, r12 \n\t" \
  1121. \
  1122. "mov r12, #0 \n\t" \
  1123. "umaal r12, r4, r2, r14 \n\t" /* row a3: a3 times a4..a7; r12 = finished column 7 */ \
  1124. "umaal r4, r5, r3, r14 \n\t" \
  1125. "umaal r5, r6, r9, r14 \n\t" \
  1126. "umaal r6, r7, r1, r14 \n\t" \
  1127. \
  1128. /* Load carry from r10, double columns 6..7 (r11, r12), store the carry back in r10 */ \
  1129. "lsrs r10, #1 \n\t" \
  1130. "adcs r11, r11, r11 \n\t" \
  1131. "adcs r12, r12, r12 \n\t" \
  1132. "adc r10, r10, #0 \n\t" \
  1133. \
  1134. /* Use carry from r8 */ \
  1135. "umaal r8, r11, r14, r14 \n\t" /* r11:r8 = a3^2 + r8 + r11 */ \
  1136. "adds r11, r11, r12 \n\t" \
  1137. "stmia r0!, {r8,r11} \n\t" /* result words 6..7 */ \
  1138. /* Store carry back in r8 */ \
  1139. "mov r8, #0 \n\t" \
  1140. "adc r8, r8, #0 \n\t" \
  1141. /* rows a4, a5 and a6: the remaining cross products (columns 9..14) */ \
  1142. "mov r11, #0 \n\t" \
  1143. "umaal r11, r5, r3, r2 \n\t" /* row a4: a4 times a5..a7 */ \
  1144. "umaal r5, r6, r9, r2 \n\t" \
  1145. "umaal r6, r7, r1, r2 \n\t" \
  1146. \
  1147. "mov r12, #0 \n\t" \
  1148. "umaal r12, r6, r9, r3 \n\t" /* row a5: a5 times a6..a7 */ \
  1149. "umaal r6, r7, r1, r3 \n\t" \
  1150. \
  1151. "mov r14, #0 \n\t" \
  1152. "umaal r14, r7, r1, r9 \n\t" /* row a6: a6*a7 */ \
  1153. \
  1154. /* Load carry from r10, double columns 8..14 (r4, r11, r5, r12, r6, r14, r7); the carry out goes back into r10 */ \
  1155. "lsrs r10, #1 \n\t" \
  1156. "adcs r4, r4, r4 \n\t" \
  1157. "adcs r11, r11, r11 \n\t" \
  1158. "adcs r5, r5, r5 \n\t" \
  1159. "adcs r12, r12, r12 \n\t" \
  1160. "adcs r6, r6, r6 \n\t" \
  1161. "adcs r14, r14, r14 \n\t" \
  1162. "adcs r7, r7, r7 \n\t" \
  1163. "adc r10, r10, #0 \n\t" \
  1164. \
  1165. /* Use carry from r8; r4 holds the doubled column 8 */ \
  1166. "umaal r4, r8, r2, r2 \n\t" /* r8:r4 = a4^2 + r4 + r8 */ \
  1167. "adds r8, r8, r11 \n\t" \
  1168. "stmia r0!, {r4,r8} \n\t" /* result words 8..9 */ \
  1169. \
  1170. "umull r4, r8, r3, r3 \n\t" /* r8:r4 = a5^2 */ \
  1171. "adcs r4, r4, r5 \n\t" \
  1172. "adcs r8, r8, r12 \n\t" \
  1173. "stmia r0!, {r4,r8} \n\t" /* result words 10..11 */ \
  1174. \
  1175. "umull r4, r8, r9, r9 \n\t" /* r8:r4 = a6^2 */ \
  1176. "adcs r4, r4, r6 \n\t" \
  1177. "adcs r8, r8, r14 \n\t" \
  1178. "stmia r0!, {r4,r8} \n\t" /* result words 12..13 */ \
  1179. \
  1180. "umull r4, r8, r1, r1 \n\t" /* r8:r4 = a7^2 */ \
  1181. "adcs r4, r4, r7 \n\t" \
  1182. "adcs r8, r8, r10 \n\t" \
  1183. "stmia r0!, {r4,r8} \n\t" /* result words 14..15 */ \
  1184. /* TODO pop {r1, r2} if supporting curves > 256 bits */ \
  1185. "pop {r2} \n\t" /* only r2 is restored; r1 is left holding the value of a7 */
  1186. #endif /* _UECC_ASM_ARM_MULT_SQUARE_H_ */